corems.molecular_id.search.database_interfaces

   1import os
   2import re
   3import warnings
   4from abc import ABC
   5from io import StringIO
   6from pathlib import Path
   7import sqlite3
   8
   9import numpy as np
  10import requests
  11import pandas as pd
  12from ms_entropy import FlashEntropySearch
  13
  14from corems.molecular_id.factory.EI_SQL import (
  15    EI_LowRes_SQLite,
  16    Metadatar,
  17    MetaboliteMetadata,
  18)
  19from corems.molecular_id.factory.lipid_molecular_metadata import LipidMetadata
  20from corems.mass_spectra.calc.lc_calc import find_closest
  21
  22
  23class SpectralDatabaseInterface(ABC):
  24    """
  25    Base class that facilitates connection to spectral reference databases,
  26    such as EMSL's Metabolomics Reference Database (MetabRef).
  27
  28    """
  29
  30    def __init__(self, key=None):
  31        """
  32        Initialize instance.
  33
  34        Parameters
  35        ----------
  36        key : str
  37            Token key.
  38
  39        """
  40
  41        self.key = key
  42
  43    def set_token(self, path):
  44        """
  45        Set environment variable for MetabRef database token.
  46
  47        Parameters
  48        ----------
  49        path : str
  50            Path to token.
  51
  52        """
  53
  54        # Read token from file
  55        with open(path, "r", encoding="utf-8") as f:
  56            token = f.readline().strip()
  57
  58        # Set environment variable
  59        os.environ[self.key] = token
  60
  61    def get_token(self):
  62        """
  63        Get environment variable for database token.
  64
  65        Returns
  66        -------
  67        str
  68            Token string.
  69
  70        """
  71
  72        # Check for token
  73        if self.key not in os.environ:
  74            raise ValueError("Must set {} environment variable.".format(self.key))
  75
  76        # Get token from environment variables
  77        return os.environ.get(self.key)
  78
  79    def get_header(self):
  80        """
  81        Access stored database token and prepare as header.
  82
  83        Returns
  84        -------
  85        str
  86            Header string.
  87
  88        """
  89
  90        # Get token
  91        token = self.get_token()
  92
  93        # Pad header information
  94        header = {"Authorization": f"Bearer {token}", "Content-Type": "text/plain"}
  95
  96        return header
  97
  98    def get_query(self, url, use_header=True):
  99        """
 100        Request payload from URL according to `get` protocol.
 101
 102        Parameters
 103        ----------
 104        url : str
 105            URL for request.
 106        use_header: bool
 107            Whether or not the query should include the header
 108
 109        Returns
 110        -------
 111        dict
 112            Response as JSON.
 113
 114        """
 115
 116        # Query URL via `get`
 117        if use_header:
 118            response = requests.get(url, headers=self.get_header())
 119        else:
 120            response = requests.get(url)
 121
 122        # Check response
 123        response.raise_for_status()
 124
 125        # Return as JSON
 126        return response.json()
 127
 128    def post_query(self, url, variable, values, tolerance):
 129        """
 130        Request payload from URL according to `post` protocol.
 131
 132        Parameters
 133        ----------
 134        url : str
 135            URL for request.
 136        variable : str
 137            Variable to query.
 138        values : str
 139            Specific values of `variable` to query.
 140        tolerance : str
 141            Query tolerance relative to `values`.
 142
 143        Returns
 144        -------
 145        dict
 146            Response as JSON.
 147
 148        """
 149
 150        # Coerce to string
 151        if not isinstance(variable, str):
 152            variable = str(variable).replace(" ", "")
 153
 154        if not isinstance(values, str):
 155            values = str(values).replace(" ", "")
 156
 157        if not isinstance(tolerance, str):
 158            tolerance = str(tolerance).replace(" ", "")
 159
 160        # Query URL via `post`
 161        response = requests.post(
 162            os.path.join(url, variable, tolerance),
 163            data=values,
 164            headers=self.get_header(),
 165        )
 166
 167        # Check response
 168        response.raise_for_status()
 169
 170        # Return as JSON
 171        return response.json()
 172
 173    def _check_flash_entropy_kwargs(self, fe_kwargs):
 174        """
 175        Check FlashEntropy keyword arguments.
 176
 177        Parameters
 178        ----------
 179        fe_kwargs : dict
 180            Keyword arguments for FlashEntropy search.
 181
 182
 183        Raises
 184        ------
 185        ValueError
 186            If "min_ms2_difference_in_da" or "max_ms2_tolerance_in_da" are present in `fe_kwargs` and they
 187            are not equal.
 188
 189        """
 190        # If "min_ms2_difference_in_da" in fe_kwargs, check that "max_ms2_tolerance_in_da" is also present and that min_ms2_difference_in_da = 2xmax_ms2_tolerance_in_da
 191        if (
 192            "min_ms2_difference_in_da" in fe_kwargs
 193            or "max_ms2_tolerance_in_da" in fe_kwargs
 194        ):
 195            if (
 196                "min_ms2_difference_in_da" not in fe_kwargs
 197                or "max_ms2_tolerance_in_da" not in fe_kwargs
 198            ):
 199                raise ValueError(
 200                    "Both 'min_ms2_difference_in_da' and 'max_ms2_tolerance_in_da' must be specified."
 201                )
 202            if (
 203                fe_kwargs["min_ms2_difference_in_da"]
 204                != 2 * fe_kwargs["max_ms2_tolerance_in_da"]
 205            ):
 206                raise ValueError(
 207                    "The values of 'min_ms2_difference_in_da' must be exactly 2x 'max_ms2_tolerance_in_da'."
 208                )
 209
 210    def _get_format_func(self, format):
 211        """
 212        Obtain format function by key.
 213
 214        Returns
 215        -------
 216        func
 217            Formatting function.
 218        """
 219
 220        if format.lower() in self.format_map.keys():
 221            return self.format_map[format.lower()]
 222
 223        raise ValueError(("{} not a supported format.").format(format))
 224
 225    def _dict_to_dataclass(self, source_dict, data_class):
 226        """
 227        Convert dictionary to dataclass.
 228
 229        Notes
 230        -----
 231        This function will pull the attributes a dataclass and its parent class
 232        and convert the dictionary to a dataclass instance with the appropriate
 233        attributes.
 234
 235        Parameters
 236        ----------
 237        data_class : :obj:`~dataclasses.dataclass`
 238            Dataclass to convert to.
 239        source_dict : dict
 240            Dictionary object to convert to dataclass.
 241
 242        Returns
 243        -------
 244        :obj:`~dataclasses.dataclass`
 245            Dataclass instance.
 246
 247        """
 248
 249        # Get list of expected attributes of data_class
 250        data_class_keys = list(data_class.__annotations__.keys())
 251
 252        # Does the data_class inherit from another class, if so, get the attributes of the parent class as well
 253        if len(data_class.__mro__) > 2:
 254            parent_class_keys = list(data_class.__bases__[0].__annotations__.keys())
 255            data_class_keys = list(set(data_class_keys + parent_class_keys))
 256
 257        # Remove keys that are not in the data_class from the input dictionary
 258        input_dict = {k: v for k, v in source_dict.items() if k in data_class_keys}
 259
 260        # Add keys that are in the data class but not in the input dictionary as None
 261        for key in data_class_keys:
 262            if key not in input_dict.keys():
 263                input_dict[key] = None
 264        return data_class(**input_dict)
 265
 266    def _spectrum_to_array(self, spectrum, normalize=True):
 267        """
 268        Convert a parenthesis-delimited spectrum string to array.
 269
 270        Parameters
 271        ----------
 272        spectrum : str
 273            Spectrum string, i.e. list of (m/z,abundance) pairs.
 274        normalize : bool
 275            Normalize the spectrum by its magnitude.
 276
 277        Returns
 278        -------
 279        :obj:`~numpy.array`
 280            Array of shape (N, 2), with m/z in the first column and abundance in
 281            the second.
 282        """
 283
 284        arr = np.array(
 285            re.findall(r"\(([^,]+),([^)]+)\)", spectrum), dtype=float
 286        ).reshape(-1, 2)
 287
 288        if normalize:
 289            arr = self.normalize_peaks(arr)
 290
 291        return arr
 292
 293    @staticmethod
 294    def normalize_peaks(arr):
 295        """
 296        Normalize peaks in an array.
 297
 298        Parameters
 299        ----------
 300        arr : :obj:`~numpy.array`
 301            Array of shape (N, 2), with m/z in the first column and abundance in
 302            the second.
 303
 304        Returns
 305        -------
 306        :obj:`~numpy.array`
 307            Normalized array of shape (N, 2), with m/z in the first column and
 308            normalized abundance in the second.
 309        """
 310        # Normalize the array
 311        arr[:, -1] = arr[:, -1] / arr[:, -1].sum()
 312
 313        return arr
 314
 315    @staticmethod
 316    def _build_flash_entropy_index(fe_lib, fe_kwargs={}, clean_spectra=True):
 317        """
 318        Build FlashEntropy index.
 319
 320        Parameters
 321        ----------
 322        fe_lib : list
 323            List of spectra to build index from. Can be a list of dictionaries or
 324            a FlashEntropy search instance.
 325        fe_kwargs : dict, optional
 326            Keyword arguments for FlashEntropy search.
 327        clean_spectra : bool, optional
 328            Clean spectra before building index. Default is True.
 329
 330        Returns
 331        -------
 332        :obj:`~ms_entropy.FlashEntropySearch`
 333            FlashEntropy search instance.
 334
 335        """
 336        # Initialize FlashEntropy
 337        fe_init_kws = [
 338            "max_ms2_tolerance_in_da",
 339            "mz_index_step",
 340            "low_memory",
 341            "path_data",
 342        ]
 343        fe_init_kws = {k: v for k, v in fe_kwargs.items() if k in fe_init_kws}
 344        fes = FlashEntropySearch(**fe_init_kws)
 345
 346        # Build FlashEntropy index
 347        fe_index_kws = [
 348            "max_indexed_mz",
 349            "precursor_ions_removal_da",
 350            "noise_threshold",
 351            "min_ms2_difference_in_da",
 352            "max_peak_num",
 353        ]
 354        fe_index_kws = {k: v for k, v in fe_kwargs.items() if k in fe_index_kws}
 355        fes.build_index(fe_lib, **fe_index_kws, clean_spectra=clean_spectra)
 356
 357        return fes
 358
 359
 360class MetabRefInterface(SpectralDatabaseInterface):
 361    """
 362    DEPRECATED interface retained for backward compatibility only.
 363    """
 364
 365    def __init__(self):
 366        """
 367        Initialize instance with deprecation warning.
 368
 369        """
 370
 371        super().__init__(key=None)
 372
 373        if self.__class__ is MetabRefInterface:
 374            warnings.warn(
 375                "MetabRefInterface is deprecated. Instantiate a concrete interface "
 376                "such as GCMSLibraryInterface or LCLipidLibraryInterface instead.",
 377                DeprecationWarning,
 378                stacklevel=2,
 379            )
 380
 381
 382class GCMSLibraryInterface(SpectralDatabaseInterface):
 383    """
 384    Interface to bundled GCMS spectral libraries in MSP format.
 385    
 386    Loads GCMS compound library and FAMES calibration library from local MSP files.
 387    Default files are bundled with CoreMS, but can be overridden via environment variables.
 388    """
 389
 390    def __init__(self):
 391        """
 392        Initialize instance.
 393        """
 394        super().__init__(key=None)
 395        
 396        # Local data file paths
 397        from pathlib import Path
 398        
 399        # Default to bundled data files
 400        data_dir = Path(__file__).parent.parent / "data"
 401        self.gcms_library_file = os.getenv(
 402            "GCMS_LIBRARY_PATH", 
 403            str(data_dir / "PNNLMetV20191015.msp")
 404        )
 405        self.fames_library_file = os.getenv(
 406            "FAMES_LIBRARY_PATH",
 407            str(data_dir / "FAMES_REF.msp")
 408        )
 409
 410        self.__init_format_map__()
 411
 412    def __init_format_map__(self):
 413        """
 414        Initialize database format mapper, enabling multiple format requests.
 415
 416        """
 417
 418        # Define format workflows
 419        self.format_map = {
 420            "json": lambda x, normalize, fe_kwargs: x,
 421            "dict": lambda x,
 422            normalize,
 423            fe_kwargs: self._to_LowResolutionEICompound_dict(x, normalize),
 424            "sql": lambda x,
 425            normalize,
 426            fe_kwargs: self._LowResolutionEICompound_dict_to_sqlite(
 427                self._to_LowResolutionEICompound_dict(x, normalize)
 428            ),
 429        }
 430
 431        # Add aliases
 432        self.format_map["metabref"] = self.format_map["json"]
 433        self.format_map["datadict"] = self.format_map["dict"]
 434        self.format_map["data-dict"] = self.format_map["dict"]
 435        self.format_map["lowreseicompound"] = self.format_map["dict"]
 436        self.format_map["lowres"] = self.format_map["dict"]
 437        self.format_map["lowresgc"] = self.format_map["dict"]
 438        self.format_map["sqlite"] = self.format_map["sql"]
 439
 440    def available_formats(self):
 441        """
 442        View list of available formats.
 443
 444        Returns
 445        -------
 446        list
 447            Format map keys.
 448        """
 449
 450        return list(self.format_map.keys())
 451
 452    def get_library(self, format="json", normalize=False):
 453        """
 454        Load GC/MS library from local MSP file.
 455
 456        Parameters
 457        ----------
 458        format : str
 459            Format of requested library, i.e. "json", "sql", "dict".
 460            See `available_formats` method for aliases.
 461        normalize : bool
 462            Normalize the spectrum by its magnitude.
 463
 464        Returns
 465        -------
 466        Library in requested format.
 467
 468        """
 469        # Load from local MSP file
 470        library_data = self._load_msp_file(self.gcms_library_file, normalize)
 471        
 472        # Init format function
 473        format_func = self._get_format_func(format)
 474        
 475        # Apply format conversion
 476        return format_func(library_data, normalize, {})
 477
 478    def get_fames(self, format="json", normalize=False):
 479        """
 480        Load GC/MS FAMEs library from local MSP file.
 481
 482        Parameters
 483        ----------
 484        format : str
 485            Format of requested library, i.e. "json", "sql", "dict".
 486            See `available_formats` method for aliases.
 487        normalize : bool
 488            Normalize the spectrum by its magnitude.
 489
 490        Returns
 491        -------
 492        Library in requested format.
 493
 494        """
 495        # Load from local MSP file
 496        library_data = self._load_msp_file(self.fames_library_file, normalize)
 497        
 498        # Init format function
 499        format_func = self._get_format_func(format)
 500        
 501        # Apply format conversion
 502        return format_func(library_data, normalize, {})
 503    
 504    def _load_msp_file(self, file_path, normalize=False):
 505        """
 506        Load and parse MSP file into format compatible with existing pipeline.
 507        
 508        Parameters
 509        ----------
 510        file_path : str
 511            Path to MSP file
 512        normalize : bool
 513            Normalize spectra
 514            
 515        Returns
 516        -------
 517        list of dict
 518            Library data in format compatible with _to_LowResolutionEICompound_dict
 519        """
 520        from pathlib import Path
 521        
 522        file_path = Path(file_path)
 523        if not file_path.exists():
 524            raise FileNotFoundError(
 525                f"Library file not found: {file_path}. "
 526                f"Set GCMS_LIBRARY_PATH or FAMES_LIBRARY_PATH environment variable to specify location."
 527            )
 528        
 529        # Parse MSP file
 530        spectra = []
 531        spectrum = {}
 532        peaks = []
 533        
 534        with open(file_path, 'r') as f:
 535            for line in f:
 536                line = line.strip()
 537                
 538                # Empty line marks end of spectrum
 539                if not line:
 540                    if spectrum and peaks:
 541                        # Convert peaks to the format expected by downstream code
 542                        # Format: "(mz,abundance)(mz,abundance)..."
 543                        peak_str = "".join([f"({int(mz)},{int(abun)})" for mz, abun in peaks])
 544                        spectrum['mz'] = peak_str
 545                        spectra.append(spectrum)
 546                    spectrum = {}
 547                    peaks = []
 548                    continue
 549                
 550                # Check if line contains peak data (starts with digit)
 551                if line and line[0].isdigit():
 552                    parts = line.split()
 553                    if len(parts) >= 2:
 554                        peaks.append((float(parts[0]), float(parts[1])))
 555                    continue
 556                
 557                # Handle metadata fields
 558                if ":" in line:
 559                    key, value = line.split(":", 1)
 560                    key = key.strip().lower()
 561                    value = value.strip()
 562                    
 563                    # Map MSP fields to expected format
 564                    field_mapping = {
 565                        "name": "molecule_name",
 566                        "formula": "formula",
 567                        "cas": "casno",
 568                        "retentiontime": "retention_time",
 569                        "ri": "ri",
 570                        "comment": "comments",
 571                        "num peaks": "peak_count",
 572                        "derivative": "derivative"
 573                    }
 574                    
 575                    # Metadata fields that go into the metadata dict
 576                    metadata_fields = {
 577                        "inchikey": "inchikey",
 578                        "inchi": "inchi",
 579                        "smiles": "smiles",
 580                        "pubchem": "pubchem",
 581                        "chebi": "chebi",
 582                        "kegg": "kegg",
 583                        "refmet": "refmet",
 584                        "iupac_name": "iupac_name"
 585                    }
 586                    
 587                    if key in field_mapping:
 588                        mapped_key = field_mapping[key]
 589                        # Convert numeric fields
 590                        if key in ["retentiontime", "ri"]:
 591                            try:
 592                                value = float(value)
 593                            except:
 594                                pass
 595                        elif key == "num peaks":
 596                            try:
 597                                value = int(value)
 598                            except:
 599                                pass
 600                        spectrum[mapped_key] = value
 601                    elif key in metadata_fields:
 602                        # Store in nested metadata dict
 603                        if "metadata" not in spectrum:
 604                            spectrum["metadata"] = {}
 605                        spectrum["metadata"][metadata_fields[key]] = value
 606                    else:
 607                        # Keep unmapped fields
 608                        spectrum[key] = value
 609        
 610        # Add last spectrum if file doesn't end with blank line
 611        if spectrum and peaks:
 612            peak_str = "".join([f"({int(mz)},{int(abun)})" for mz, abun in peaks])
 613            spectrum['mz'] = peak_str
 614            spectra.append(spectrum)
 615        
 616        return spectra
 617
 618    def _to_LowResolutionEICompound_dict(self, metabref_lib, normalize=False):
 619        """
 620        Convert MetabRef-formatted library to CoreMS LowResolutionEICompound-formatted
 621        dictionary for local ingestion.
 622
 623        Parameters
 624        ----------
 625        metabref_lib : dict
 626            MetabRef GC-MS library in JSON format.
 627        normalize : bool
 628            Normalize each spectrum by its magnitude.
 629
 630        Returns
 631        -------
 632        list of dict
 633            List of each spectrum contained in dictionary.
 634
 635        """
 636
 637        # All below key:value lookups are based on CoreMS class definitions
 638        # NOT MetabRef content. For example, MetabRef has keys for PubChem,
 639        # USI, etc. that are not considered below.
 640
 641        # Dictionary to map metabref keys to corems keys
 642        metadatar_cols = {
 643            "casno": "cas",
 644            "inchikey": "inchikey",
 645            "inchi": "inchi",
 646            "chebi": "chebi",
 647            "smiles": "smiles",
 648            "kegg": "kegg",
 649            "iupac_name": "iupac_name",
 650            "traditional_name": "traditional_name",  # Not present in metabref
 651            "common_name": "common_name",  # Not present in metabref
 652        }
 653
 654        # Dictionary to map metabref keys to corems keys
 655        lowres_ei_compound_cols = {
 656            "id": "metabref_id",
 657            "molecule_name": "name",  # Is this correct?
 658            "classify": "classify",  # Not present in metabref
 659            "formula": "formula",
 660            "ri": "ri",
 661            "rt": "retention_time",
 662            "source": "source",  # Not present in metabref
 663            "casno": "casno",
 664            "comments": "comment",
 665            "source_temp_c": "source_temp_c",  # Not present in metabref
 666            "ev": "ev",  # Not present in metabref
 667            "peak_count": "peaks_count",
 668            "mz": "mz",
 669            "abundance": "abundance",
 670        }
 671
 672        # Local result container
 673        corems_lib = []
 674
 675        # Enumerate spectra
 676        for i, source_ in enumerate(metabref_lib):
 677            # Copy source to prevent modification
 678            source = source_.copy()
 679
 680            # Parse target data
 681            target = {
 682                lowres_ei_compound_cols[k]: v
 683                for k, v in source.items()
 684                if k in lowres_ei_compound_cols
 685            }
 686
 687            # Explicitly add this to connect with LowResCompoundRef later
 688            if "retention_time" in source:
 689                target["rt"] = source["retention_time"]
 690            elif "rt" in source:
 691                target["rt"] = source["rt"]
 692
 693            # Parse (mz, abundance)
 694            arr = self._spectrum_to_array(target["mz"], normalize=normalize)
 695            target["mz"] = arr[:, 0]
 696            target["abundance"] = arr[:, 1]
 697
 698            # Parse meta data
 699            target["metadata"] = {
 700                metadatar_cols[k]: v for k, v in source.items() if k in metadatar_cols
 701            }
 702
 703            # Add anything else
 704            for k in source:
 705                if k not in lowres_ei_compound_cols:
 706                    target[k] = source[k]
 707
 708            # Add to CoreMS list
 709            corems_lib.append(target)
 710
 711        return corems_lib
 712
 713    def _LowResolutionEICompound_dict_to_sqlite(
 714        self, lowres_ei_compound_dict, url="sqlite://"
 715    ):
 716        """
 717        Convert CoreMS LowResolutionEICompound-formatted dictionary to SQLite
 718        database for local ingestion.
 719
 720        Parameters
 721        ----------
 722        lowres_ei_compound_dict : dict
 723            CoreMS GC-MS library formatted for LowResolutionEICompound.
 724        url : str
 725            URL to SQLite prefix.
 726
 727        Returns
 728        -------
 729        sqlite database
 730            Spectra contained in SQLite database.
 731
 732        """
 733
 734        # Dictionary to map corems keys to all-caps keys
 735        capped_cols = {
 736            "name": "NAME",
 737            "formula": "FORM",
 738            "ri": "RI",
 739            "retention_time": "RT",
 740            "source": "SOURCE",
 741            "casno": "CASNO",
 742            "comment": "COMMENT",
 743            "peaks_count": "NUM PEAKS",
 744        }
 745
 746        # Initialize SQLite object
 747        sqlite_obj = EI_LowRes_SQLite(url=url)
 748
 749        # Iterate spectra
 750        for _data_dict in lowres_ei_compound_dict:
 751            # Copy source to prevent modification
 752            data_dict = _data_dict.copy()
 753
 754            # Add missing capped values
 755            for k, v in capped_cols.items():
 756                # Key exists
 757                if k in data_dict:
 758                    # # This will replace the key
 759                    # data_dict[v] = data_dict.pop(k)
 760
 761                    # This will keep both keys
 762                    data_dict[v] = data_dict[k]
 763
 764            # Parse number of peaks
 765            if not data_dict.get("NUM PEAKS"):
 766                data_dict["NUM PEAKS"] = len(data_dict.get("mz"))
 767
 768            # Parse CAS number
 769            if not data_dict.get("CASNO"):
 770                data_dict["CASNO"] = data_dict.get("CAS")
 771
 772            if not data_dict["CASNO"]:
 773                data_dict["CASNO"] = 0
 774
 775            # Build linked metadata table
 776            if "metadata" in data_dict:
 777                metadata = data_dict.pop("metadata")
 778                # Only create metadata entry if we have required fields and valid data
 779                # Filter to only include fields that Metadatar model supports
 780                supported_metadata_fields = [
 781                    'cas', 'inchikey', 'inchi', 'chebi', 'smiles', 
 782                    'kegg', 'iupac_name', 'traditional_name', 'common_name'
 783                ]
 784                filtered_metadata = {
 785                    k: v for k, v in metadata.items() 
 786                    if k in supported_metadata_fields and v
 787                }
 788                # Inchikey is required by the database model
 789                if filtered_metadata and filtered_metadata.get("inchikey"):
 790                    data_dict["metadatar"] = Metadatar(**filtered_metadata)
 791
 792            # Attempt addition to sqlite
 793            try:
 794                sqlite_obj.add_compound(data_dict)
 795            except:
 796                print(data_dict["NAME"])
 797
 798        return sqlite_obj
 799
 800
 801class MetabRefGCInterface(GCMSLibraryInterface):
 802    """
 803    DEPRECATED: Use GCMSLibraryInterface instead.
 804    
 805    This interface is maintained for backward compatibility only.
 806    MetabRef API has been discontinued as of 2026.
 807    """
 808
 809    def __init__(self):
 810        """
 811        Initialize instance with deprecation warning.
 812        """
 813        warnings.warn(
 814            "MetabRefGCInterface is deprecated. Use GCMSLibraryInterface instead. "
 815            "MetabRef API has been discontinued; all data now loads from bundled local MSP files.",
 816            DeprecationWarning,
 817            stacklevel=2
 818        )
 819        super().__init__()
 820
 821
 822class LCLipidLibraryInterface(SpectralDatabaseInterface):
 823    """
 824    Interface to a local sqlite lipid library for LC-MS spectral searches.
 825    """
 826
 827    DEFAULT_DOWNLOAD_URL = (
 828        "https://nmdcdemo.emsl.pnnl.gov/minio/lipidomics/parameter_files/"
 829        "202412_lipid_ref.sqlite"
 830    )
 831
 832    def __init__(self, db_location=None):
 833        """
 834        Initialize instance.
 835
 836        Parameters
 837        ----------
 838        db_location : str | Path, optional
 839            Local path to the sqlite lipid library. If omitted, the
 840            COREMS_LIPIDOMICS_SQLITE_PATH environment variable is used.
 841        """
 842
 843        super().__init__(key=None)
 844        self.db_location = db_location
 845        self.__init_format_map__()
 846
 847    def _to_flashentropy(self, spectral_library, normalize=True, fe_kwargs={}):
 848        """
 849        Convert a spectral library to FlashEntropy format.
 850
 851        Parameters
 852        ----------
 853        spectral_library : dict
 854            MS2 library in JSON format or FlashEntropy search instance
 855            (for reformatting at different MS2 separation).
 856        normalize : bool
 857            Normalize each spectrum by its magnitude.
 858        fe_kwargs : dict, optional
 859            Keyword arguments for instantiation of FlashEntropy search and building index for FlashEntropy search;
 860            any keys not recognized will be ignored. By default, all parameters set to defaults.
 861
 862        Returns
 863        -------
 864        :obj:`~ms_entropy.FlashEntropySearch`
 865            MS2 library as FlashEntropy search instance.
 866
 867        Raises
 868        ------
 869        ValueError
 870            If "min_ms2_difference_in_da" or "max_ms2_tolerance_in_da" are present in `fe_kwargs` and they are not equal.
 871
 872        """
 873        self._check_flash_entropy_kwargs(fe_kwargs)
 874
 875        # Initialize empty library
 876        fe_lib = []
 877
 878        # Enumerate spectra
 879        for i, source in enumerate(spectral_library):
 880            if "spectrum_data" in source.keys():
 881                spectrum = source["spectrum_data"]
 882            else:
 883                spectrum = source
 884
 885            if "precursor_mz" not in spectrum.keys():
 886                spectrum["precursor_mz"] = spectrum.pop("precursor_ion")
 887
 888            spectrum["peaks"] = self._spectrum_to_array(
 889                spectrum["mz"], normalize=normalize
 890            )
 891            fe_lib.append(spectrum)
 892
 893        fe_search = self._build_flash_entropy_index(fe_lib, fe_kwargs=fe_kwargs)
 894
 895        return fe_search
 896
 897    def __init_format_map__(self):
 898        """
 899        Initialize database format mapper, enabling multiple format requests.
 900        """
 901
 902        self.format_map = {
 903            "json": lambda x, normalize, fe_kwargs: x,
 904            "flashentropy": lambda x, normalize, fe_kwargs: self._to_flashentropy(
 905                x, normalize, fe_kwargs
 906            ),
 907            "dataframe": lambda x, normalize, fe_kwargs: pd.DataFrame(x),
 908        }
 909
 910        self.format_map["fe"] = self.format_map["flashentropy"]
 911        self.format_map["flash-entropy"] = self.format_map["flashentropy"]
 912        self.format_map["df"] = self.format_map["dataframe"]
 913
 914    def available_formats(self):
 915        """
 916        View list of available formats.
 917
 918        Returns
 919        -------
 920        list
 921            Format map keys.
 922        """
 923
 924        return list(self.format_map.keys())
 925
 926    def _resolve_db_location(self):
 927        """
 928        Resolve and validate sqlite database location.
 929
 930        Returns
 931        -------
 932        Path
 933            Existing sqlite database file path.
 934        """
 935
 936        db_location = self.db_location or os.getenv("COREMS_LIPIDOMICS_SQLITE_PATH")
 937        if not db_location:
 938            raise ValueError(
 939                "A local lipid sqlite library path is required. "
 940                "Set COREMS_LIPIDOMICS_SQLITE_PATH or pass db_location."
 941            )
 942
 943        db_path = Path(db_location).expanduser()
 944        if not db_path.exists():
 945            raise FileNotFoundError(
 946                f"Lipid sqlite library not found at {db_path}. "
 947                f"Download it from {self.DEFAULT_DOWNLOAD_URL} "
 948                "and set COREMS_LIPIDOMICS_SQLITE_PATH."
 949            )
 950
 951        return db_path
 952
 953    def _get_candidate_spectra(self, connection, mz_list, polarity, mz_tol_ppm):
 954        """
 955        Fetch candidate spectra rows by precursor m/z and polarity.
 956
 957        Returns
 958        -------
 959        pandas.DataFrame
 960            Filtered rows from lipidMassSpectrumObject.
 961        """
 962
 963        mz_observed = np.sort(np.asarray(mz_list, dtype=float))
 964        if mz_observed.size == 0:
 965            return pd.DataFrame()
 966
 967        mz_all = pd.read_sql_query(
 968            "SELECT id, polarity, precursor_mz FROM lipidMassSpectrumObject", connection
 969        )
 970        mz_all = mz_all[mz_all["polarity"] == polarity].copy()
 971        if mz_all.empty:
 972            return pd.DataFrame()
 973
 974        mz_all = mz_all.sort_values(by="precursor_mz").reset_index(drop=True)
 975
 976        if mz_observed.size == 1:
 977            mz_all["closest_mz_obs"] = mz_observed[0]
 978        else:
 979            mz_all["closest_mz_obs"] = mz_observed[
 980                find_closest(mz_observed, mz_all.precursor_mz.values)
 981            ]
 982
 983        mz_all["ppm_error"] = (
 984            (mz_all["precursor_mz"] - mz_all["closest_mz_obs"])
 985            / mz_all["precursor_mz"]
 986            * 1e6
 987        )
 988
 989        mz_all = mz_all[np.abs(mz_all["ppm_error"]) <= mz_tol_ppm]
 990        if mz_all.empty:
 991            return pd.DataFrame()
 992
 993        mz_ids = tuple(mz_all["id"].tolist())
 994        return pd.read_sql_query(
 995            f"SELECT * FROM lipidMassSpectrumObject WHERE id IN {mz_ids}",
 996            connection,
 997        )
 998
 999    def get_lipid_library(
1000        self,
1001        mz_list,
1002        polarity,
1003        mz_tol_ppm,
1004        mz_tol_da_api=None,
1005        format="json",
1006        normalize=True,
1007        fe_kwargs={},
1008        api_delay=5,
1009        api_attempts=10,
1010    ):
1011        """
1012        Retrieve lipid spectra and metadata from a local sqlite library.
1013
1014        Parameters
1015        ----------
1016        mz_list : list
1017            List of precursor m/z values.
1018        polarity : str
1019            Ionization polarity, either "positive" or "negative".
1020        mz_tol_ppm : float
1021            Tolerance in ppm for precursor matching.
1022        mz_tol_da_api : float, optional
1023            Unused, kept for backward compatibility.
1024        format : str, optional
1025            Format of requested library, e.g. "json" or "flashentropy".
1026        normalize : bool, optional
1027            Normalize spectrum intensities.
1028        fe_kwargs : dict, optional
1029            Keyword arguments for FlashEntropy search.
1030        api_delay : int, optional
1031            Unused, kept for backward compatibility.
1032        api_attempts : int, optional
1033            Unused, kept for backward compatibility.
1034
1035        Returns
1036        -------
1037        tuple
1038            Library in requested format and lipid metadata dictionary.
1039        """
1040
1041        if not isinstance(mz_list, (list, np.ndarray)):
1042            raise ValueError("mz_list must be a list or numpy array")
1043        if not all(isinstance(mz, (float, int)) for mz in mz_list):
1044            raise ValueError("All elements in mz_list must be float or int")
1045        if polarity not in {"positive", "negative"}:
1046            raise ValueError("polarity must be either 'positive' or 'negative'")
1047        if not isinstance(mz_tol_ppm, (float, int)):
1048            raise ValueError("mz_tol_ppm must be a float or int")
1049
1050        db_path = self._resolve_db_location()
1051        connection = sqlite3.connect(str(db_path))
1052        try:
1053            # Step 1: Get candidate spectra records based on m/z and polarity
1054            spectra_df = self._get_candidate_spectra(
1055                connection=connection,
1056                mz_list=mz_list,
1057                polarity=polarity,
1058                mz_tol_ppm=float(mz_tol_ppm),
1059            )
1060
1061            if spectra_df.empty:
1062                format_func = self._get_format_func(format)
1063                return format_func([], normalize=normalize, fe_kwargs=fe_kwargs), {}
1064
1065            # Step 2: Get corresponding lipid metadata for candidate spectra from lipidTree view
1066            mol_ids = tuple(spectra_df["molecular_data_id"].tolist())
1067            mol_df = pd.read_sql_query(
1068                f"SELECT * FROM lipidTree WHERE id IN {mol_ids}",
1069                connection,
1070            )
1071        finally:
1072            connection.close()
1073
1074        mol_df["id_index"] = mol_df["id"]
1075        mol_df = mol_df.set_index("id_index")
1076        mol_records = mol_df.to_dict(orient="index")
1077        lipid_metadata = {
1078            int(k): self._dict_to_dataclass(v, LipidMetadata)
1079            for k, v in mol_records.items()
1080        }
1081
1082        spectra_records = spectra_df.to_dict(orient="records")
1083        format_func = self._get_format_func(format)
1084        library = format_func(spectra_records, normalize=normalize, fe_kwargs=fe_kwargs)
1085        return library, lipid_metadata
1086
1087
class MSPInterface(SpectralDatabaseInterface):
    """
    Interface to parse NIST MSP files.
    """

    def __init__(self, file_path):
        """
        Initialize instance.

        Parameters
        ----------
        file_path : str
            Path to a local MSP file.

        Raises
        ------
        FileNotFoundError
            If `file_path` does not exist.

        Attributes
        ----------
        file_path : str
            Path to the MSP file.
        _file_content : str
            Raw content of the MSP file.
        _data_frame : :obj:`~pandas.DataFrame`
            DataFrame of spectra from the MSP file with unaltered content.
        """
        super().__init__(key=None)

        self.file_path = file_path
        if not os.path.exists(self.file_path):
            raise FileNotFoundError(
                f"File {self.file_path} does not exist. Please check the file path."
            )
        with open(self.file_path, "r") as f:
            self._file_content = f.read()

        self._data_frame = self._read_msp_file()
        self.__init_format_map__()

    def __init_format_map__(self):
        """
        Initialize database format mapper, enabling multiple format requests.
        """

        # Each formatter receives (x, normalize, fe_kwargs) where x is a
        # pandas DataFrame shaped like self._data_frame.
        # NOTE(review): "_to_msp" is referenced here but not defined in this
        # class — confirm it is provided elsewhere before requesting "msp".
        self.format_map = {
            "msp": lambda x, normalize, fe_kwargs: self._to_msp(x, normalize),
            "flashentropy": lambda x, normalize, fe_kwargs: self._to_flashentropy(
                x, normalize, fe_kwargs
            ),
            "df": lambda x, normalize, fe_kwargs: self._to_df(x, normalize),
        }

        # Aliases resolve to the same formatter callables
        self.format_map["fe"] = self.format_map["flashentropy"]
        self.format_map["flash-entropy"] = self.format_map["flashentropy"]
        self.format_map["dataframe"] = self.format_map["df"]
        self.format_map["data-frame"] = self.format_map["df"]

    def _read_msp_file(self):
        """
        Read the MSP file content into a pandas DataFrame.

        Returns
        -------
        :obj:`~pandas.DataFrame`
            DataFrame of spectra from the MSP file, exactly as it is in the
            file (no sorting, filtering, or normalization).
        """
        spectra = []
        spectrum = {}

        f = StringIO(self._file_content)
        for line in f:
            line = line.strip()
            if not line:
                continue  # Skip empty lines

            # Metadata lines are "key: value" pairs
            if ":" in line:
                key, value = line.split(":", 1)
                key = key.strip().lower()
                value = value.strip()

                if key == "name":
                    # A new "Name:" entry starts a new spectrum record;
                    # save the previous one first
                    if spectrum:
                        spectra.append(spectrum)
                    spectrum = {"name": value, "peaks": []}
                else:
                    spectrum[key] = value

            # Peak data lines start with a number: "m/z intensity"
            elif line[0].isdigit():
                peaks = line.split()
                m_z = float(peaks[0])
                intensity = float(peaks[1])
                spectrum["peaks"].append([m_z, intensity])

        # Save the last spectrum
        if spectrum:
            spectra.append(spectrum)

        df = pd.DataFrame(spectra)
        # Cast metadata columns to numeric where possible; text columns are
        # left unchanged. Narrow except: pd.to_numeric(errors="raise") signals
        # non-numeric content via ValueError/TypeError only.
        for column in df.columns:
            if column != "peaks":  # Skip 'peaks' column
                try:
                    df[column] = pd.to_numeric(df[column], errors="raise")
                except (ValueError, TypeError):
                    pass
        return df

    def _to_df(self, input_dataframe, normalize=True):
        """
        Return the spectra DataFrame, optionally with normalized peaks.

        Parameters
        ----------
        input_dataframe : :obj:`~pandas.DataFrame`
            Input DataFrame containing MSP-formatted spectra.
        normalize : bool, optional
            Normalize each spectrum by its magnitude.
            Default is True.

        Returns
        -------
        :obj:`~pandas.DataFrame`
            DataFrame with the desired normalization.

        Raises
        ------
        KeyError
            If a spectrum record lacks a 'peaks' entry.
        """
        if not normalize:
            return input_dataframe

        # Convert to per-spectrum dictionaries
        db_dict = input_dataframe.to_dict(orient="records")

        lib = []
        for spectrum in db_dict:
            # Check that spectrum["peaks"] exists
            if "peaks" not in spectrum.keys():
                raise KeyError(
                    "MSP not interpretted correctly, 'peaks' key not found in spectrum, check _dataframe attribute."
                )

            # Convert spectrum["peaks"] to numpy array
            if not isinstance(spectrum["peaks"], np.ndarray):
                spectrum["peaks"] = np.array(spectrum["peaks"])

            # Normalize peak intensities and refresh the peak count
            spectrum["peaks"] = self.normalize_peaks(spectrum["peaks"])
            spectrum["num peaks"] = len(spectrum["peaks"])

            lib.append(spectrum)

        return pd.DataFrame(lib)

    def _to_flashentropy(self, input_dataframe, normalize=True, fe_kwargs={}):
        """
        Convert MSP-derived library to FlashEntropy library.

        Parameters
        ----------
        input_dataframe : :obj:`~pandas.DataFrame`
            Input DataFrame containing MSP-formatted spectra.
        normalize : bool
            Normalize each spectrum by its magnitude.
        fe_kwargs : dict, optional
            Keyword arguments for instantiation of FlashEntropy search and building index for FlashEntropy search;
            any keys not recognized will be ignored. By default, all parameters set to defaults.

        Returns
        -------
        :obj:`~ms_entropy.FlashEntropySearch`
            MS2 library as FlashEntropy search instance.

        Raises
        ------
        ValueError
            If "min_ms2_difference_in_da" or "max_ms2_tolerance_in_da" are present in `fe_kwargs`
            and "min_ms2_difference_in_da" is not exactly 2x "max_ms2_tolerance_in_da".
        KeyError
            If a spectrum lacks a precursor m/z or a 'peaks' entry.
        """
        self._check_flash_entropy_kwargs(fe_kwargs)

        # Convert to per-spectrum dictionaries
        db_dict = input_dataframe.to_dict(orient="records")

        # Initialize empty library
        fe_lib = []

        for source in db_dict:
            # Some records nest the spectrum under "spectrum_data"
            if "spectrum_data" in source.keys():
                spectrum = source["spectrum_data"]
            else:
                spectrum = source

            # Rename precursor m/z key for FlashEntropy
            if "precursor_mz" not in spectrum.keys():
                if "precursormz" in spectrum:
                    spectrum["precursor_mz"] = spectrum.pop("precursormz")
                elif "precursor_ion" in spectrum:
                    spectrum["precursor_mz"] = spectrum.pop("precursor_ion")
                else:
                    raise KeyError(
                        "MSP must have either 'precursormz' or 'precursor_ion' key to be converted to FlashEntropy format."
                    )

            # Check that spectrum["peaks"] exists
            if "peaks" not in spectrum.keys():
                raise KeyError(
                    "MSP not interpretted correctly, 'peaks' key not found in spectrum, check _dataframe attribute."
                )

            # Convert spectrum["peaks"] to numpy array
            if not isinstance(spectrum["peaks"], np.ndarray):
                spectrum["peaks"] = np.array(spectrum["peaks"])

            # Normalize peaks, if requested
            if normalize:
                spectrum["peaks"] = self.normalize_peaks(spectrum["peaks"])

            fe_lib.append(spectrum)

        # Build FlashEntropy index
        return self._build_flash_entropy_index(fe_lib, fe_kwargs=fe_kwargs)

    def _check_msp_compatibility(self):
        """
        Check if the MSP file is compatible with the
        get_metabolomics_spectra_library method and provide feedback if it is not.

        Raises
        ------
        ValueError
            If required metadata columns are missing or contain invalid values.
        """
        # A polarity column is required, under either of two accepted names
        if (
            "polarity" not in self._data_frame.columns
            and "ionmode" not in self._data_frame.columns
        ):
            raise ValueError(
                "Neither 'polarity' nor 'ionmode' columns found in the input MSP metadata. Please check the file."
            )
        polarity_column = (
            "polarity" if "polarity" in self._data_frame.columns else "ionmode"
        )

        # Polarity values must be exactly "positive" or "negative"
        if not all(self._data_frame[polarity_column].isin(["positive", "negative"])):
            raise ValueError(
                f"Input field on MSP '{polarity_column}' must contain only 'positive' or 'negative' values."
            )

        # inchikey must be present and non-null
        # NOTE(review): a missing 'inchikey' column raises KeyError here rather
        # than ValueError — confirm whether that is intended.
        if not all(self._data_frame["inchikey"].notnull()):
            raise ValueError(
                "Input field on MSP 'inchikey' must contain only non-null values."
            )
        # Either 'formula' or 'molecular_formula' must be present and non-null
        if (
            "formula" not in self._data_frame.columns
            and "molecular_formula" not in self._data_frame.columns
        ):
            raise ValueError(
                "Input field on MSP must contain either 'formula' or 'molecular_formula' columns."
            )
        molecular_formula_column = (
            "formula" if "formula" in self._data_frame.columns else "molecular_formula"
        )
        if not all(self._data_frame[molecular_formula_column].notnull()):
            raise ValueError(
                f"Input field on MSP '{molecular_formula_column}' must contain only non-null values."
            )

    def get_metabolomics_spectra_library(
        self,
        polarity,
        metabolite_metadata_mapping={},
        format="fe",
        normalize=True,
        fe_kwargs={},
    ):
        """
        Prepare metabolomics spectra library and associated metabolite metadata.

        Note: this uses the inchikey as the index for the metabolite metadata
        dataframe and for connecting to the spectra, so it must be in the input.

        Parameters
        ----------
        polarity : str
            Ionization polarity, either "positive" or "negative".
        metabolite_metadata_mapping : dict, optional
            Mapping of MSP column names to MetaboliteMetadata attribute names.
            If empty, a default mapping is used.
        format : str, optional
            Format of the returned library, e.g. "fe" (FlashEntropy) or "df".
        normalize : bool, optional
            Normalize spectrum intensities. Default is True.
        fe_kwargs : dict, optional
            Keyword arguments for FlashEntropy search.

        Returns
        -------
        tuple
            Library in the requested format and a dict of MetaboliteMetadata
            dataclasses keyed by inchikey.

        Raises
        ------
        ValueError
            If the MSP content is incompatible, the polarity is invalid, or a
            required column is missing.
        """
        # Check if the MSP file is compatible with this method
        self._check_msp_compatibility()

        # Validate the polarity parameter and locate the polarity column
        if polarity not in ["positive", "negative"]:
            raise ValueError("Polarity must be 'positive' or 'negative'")
        polarity_column = (
            "polarity" if "polarity" in self._data_frame.columns else "ionmode"
        )

        # Subset the parsed spectra by polarity
        db_df = self._data_frame[self._data_frame[polarity_column] == polarity].copy()

        # Rename columns to match the MetaboliteMetadata dataclass; fall back
        # to the default mapping when none is provided
        if not metabolite_metadata_mapping:
            metabolite_metadata_mapping = {
                "chebi_id": "chebi",
                "kegg_id": "kegg",
                "refmet_name": "common_name",
                "molecular_formula": "formula",
                "gnps_spectra_id": "id",
                "precursormz": "precursor_mz",
                "precursortype": "ion_type",
            }
        db_df.rename(columns=metabolite_metadata_mapping, inplace=True)
        db_df["molecular_data_id"] = db_df["inchikey"]

        # Columns required downstream for the FlashEntropy search
        required_columns = ["molecular_data_id", "precursor_mz", "ion_type", "id"]
        for col in required_columns:
            if col not in db_df.columns:
                raise ValueError(
                    f"Input field on MSP must contain '{col}' column for FlashEntropy search."
                )

        # Pull the metabolite metadata columns into their own dataframe, using
        # the MetaboliteMetadata dataclass attributes (with "id" replaced by
        # "molecular_data_id") as the column whitelist
        metabolite_metadata_keys = list(MetaboliteMetadata.__annotations__.keys())
        metabolite_metadata_keys = [
            "molecular_data_id" if x == "id" else x for x in metabolite_metadata_keys
        ]
        metabolite_metadata_df = db_df[
            db_df.columns[db_df.columns.isin(metabolite_metadata_keys)]
        ].copy()

        # Make unique and recast the id column for metabolite metadata
        metabolite_metadata_df.drop_duplicates(
            subset=["molecular_data_id"], inplace=True
        )
        metabolite_metadata_df["id"] = metabolite_metadata_df["molecular_data_id"]

        # Convert to a dict of dataclasses keyed by inchikey
        metabolite_metadata_dict = {
            v["id"]: self._dict_to_dataclass(v, MetaboliteMetadata)
            for v in metabolite_metadata_df.to_dict(orient="records")
        }

        # Drop the metadata columns from the spectra dataframe
        for key in metabolite_metadata_keys:
            if key != "molecular_data_id" and key in db_df.columns:
                db_df.drop(columns=key, inplace=True)

        # Format the spectral library
        format_func = self._get_format_func(format)
        lib = format_func(db_df, normalize=normalize, fe_kwargs=fe_kwargs)
        return (lib, metabolite_metadata_dict)
class SpectralDatabaseInterface(abc.ABC):
 24class SpectralDatabaseInterface(ABC):
 25    """
 26    Base class that facilitates connection to spectral reference databases,
 27    such as EMSL's Metabolomics Reference Database (MetabRef).
 28
 29    """
 30
 31    def __init__(self, key=None):
 32        """
 33        Initialize instance.
 34
 35        Parameters
 36        ----------
 37        key : str
 38            Token key.
 39
 40        """
 41
 42        self.key = key
 43
 44    def set_token(self, path):
 45        """
 46        Set environment variable for MetabRef database token.
 47
 48        Parameters
 49        ----------
 50        path : str
 51            Path to token.
 52
 53        """
 54
 55        # Read token from file
 56        with open(path, "r", encoding="utf-8") as f:
 57            token = f.readline().strip()
 58
 59        # Set environment variable
 60        os.environ[self.key] = token
 61
 62    def get_token(self):
 63        """
 64        Get environment variable for database token.
 65
 66        Returns
 67        -------
 68        str
 69            Token string.
 70
 71        """
 72
 73        # Check for token
 74        if self.key not in os.environ:
 75            raise ValueError("Must set {} environment variable.".format(self.key))
 76
 77        # Get token from environment variables
 78        return os.environ.get(self.key)
 79
 80    def get_header(self):
 81        """
 82        Access stored database token and prepare as header.
 83
 84        Returns
 85        -------
 86        str
 87            Header string.
 88
 89        """
 90
 91        # Get token
 92        token = self.get_token()
 93
 94        # Pad header information
 95        header = {"Authorization": f"Bearer {token}", "Content-Type": "text/plain"}
 96
 97        return header
 98
 99    def get_query(self, url, use_header=True):
100        """
101        Request payload from URL according to `get` protocol.
102
103        Parameters
104        ----------
105        url : str
106            URL for request.
107        use_header: bool
108            Whether or not the query should include the header
109
110        Returns
111        -------
112        dict
113            Response as JSON.
114
115        """
116
117        # Query URL via `get`
118        if use_header:
119            response = requests.get(url, headers=self.get_header())
120        else:
121            response = requests.get(url)
122
123        # Check response
124        response.raise_for_status()
125
126        # Return as JSON
127        return response.json()
128
129    def post_query(self, url, variable, values, tolerance):
130        """
131        Request payload from URL according to `post` protocol.
132
133        Parameters
134        ----------
135        url : str
136            URL for request.
137        variable : str
138            Variable to query.
139        values : str
140            Specific values of `variable` to query.
141        tolerance : str
142            Query tolerance relative to `values`.
143
144        Returns
145        -------
146        dict
147            Response as JSON.
148
149        """
150
151        # Coerce to string
152        if not isinstance(variable, str):
153            variable = str(variable).replace(" ", "")
154
155        if not isinstance(values, str):
156            values = str(values).replace(" ", "")
157
158        if not isinstance(tolerance, str):
159            tolerance = str(tolerance).replace(" ", "")
160
161        # Query URL via `post`
162        response = requests.post(
163            os.path.join(url, variable, tolerance),
164            data=values,
165            headers=self.get_header(),
166        )
167
168        # Check response
169        response.raise_for_status()
170
171        # Return as JSON
172        return response.json()
173
174    def _check_flash_entropy_kwargs(self, fe_kwargs):
175        """
176        Check FlashEntropy keyword arguments.
177
178        Parameters
179        ----------
180        fe_kwargs : dict
181            Keyword arguments for FlashEntropy search.
182
183
184        Raises
185        ------
186        ValueError
187            If "min_ms2_difference_in_da" or "max_ms2_tolerance_in_da" are present in `fe_kwargs` and they
188            are not equal.
189
190        """
191        # If "min_ms2_difference_in_da" in fe_kwargs, check that "max_ms2_tolerance_in_da" is also present and that min_ms2_difference_in_da = 2xmax_ms2_tolerance_in_da
192        if (
193            "min_ms2_difference_in_da" in fe_kwargs
194            or "max_ms2_tolerance_in_da" in fe_kwargs
195        ):
196            if (
197                "min_ms2_difference_in_da" not in fe_kwargs
198                or "max_ms2_tolerance_in_da" not in fe_kwargs
199            ):
200                raise ValueError(
201                    "Both 'min_ms2_difference_in_da' and 'max_ms2_tolerance_in_da' must be specified."
202                )
203            if (
204                fe_kwargs["min_ms2_difference_in_da"]
205                != 2 * fe_kwargs["max_ms2_tolerance_in_da"]
206            ):
207                raise ValueError(
208                    "The values of 'min_ms2_difference_in_da' must be exactly 2x 'max_ms2_tolerance_in_da'."
209                )
210
211    def _get_format_func(self, format):
212        """
213        Obtain format function by key.
214
215        Returns
216        -------
217        func
218            Formatting function.
219        """
220
221        if format.lower() in self.format_map.keys():
222            return self.format_map[format.lower()]
223
224        raise ValueError(("{} not a supported format.").format(format))
225
226    def _dict_to_dataclass(self, source_dict, data_class):
227        """
228        Convert dictionary to dataclass.
229
230        Notes
231        -----
232        This function will pull the attributes a dataclass and its parent class
233        and convert the dictionary to a dataclass instance with the appropriate
234        attributes.
235
236        Parameters
237        ----------
238        data_class : :obj:`~dataclasses.dataclass`
239            Dataclass to convert to.
240        source_dict : dict
241            Dictionary object to convert to dataclass.
242
243        Returns
244        -------
245        :obj:`~dataclasses.dataclass`
246            Dataclass instance.
247
248        """
249
250        # Get list of expected attributes of data_class
251        data_class_keys = list(data_class.__annotations__.keys())
252
253        # Does the data_class inherit from another class, if so, get the attributes of the parent class as well
254        if len(data_class.__mro__) > 2:
255            parent_class_keys = list(data_class.__bases__[0].__annotations__.keys())
256            data_class_keys = list(set(data_class_keys + parent_class_keys))
257
258        # Remove keys that are not in the data_class from the input dictionary
259        input_dict = {k: v for k, v in source_dict.items() if k in data_class_keys}
260
261        # Add keys that are in the data class but not in the input dictionary as None
262        for key in data_class_keys:
263            if key not in input_dict.keys():
264                input_dict[key] = None
265        return data_class(**input_dict)
266
267    def _spectrum_to_array(self, spectrum, normalize=True):
268        """
269        Convert a parenthesis-delimited spectrum string to array.
270
271        Parameters
272        ----------
273        spectrum : str
274            Spectrum string, i.e. list of (m/z,abundance) pairs.
275        normalize : bool
276            Normalize the spectrum by its magnitude.
277
278        Returns
279        -------
280        :obj:`~numpy.array`
281            Array of shape (N, 2), with m/z in the first column and abundance in
282            the second.
283        """
284
285        arr = np.array(
286            re.findall(r"\(([^,]+),([^)]+)\)", spectrum), dtype=float
287        ).reshape(-1, 2)
288
289        if normalize:
290            arr = self.normalize_peaks(arr)
291
292        return arr
293
294    @staticmethod
295    def normalize_peaks(arr):
296        """
297        Normalize peaks in an array.
298
299        Parameters
300        ----------
301        arr : :obj:`~numpy.array`
302            Array of shape (N, 2), with m/z in the first column and abundance in
303            the second.
304
305        Returns
306        -------
307        :obj:`~numpy.array`
308            Normalized array of shape (N, 2), with m/z in the first column and
309            normalized abundance in the second.
310        """
311        # Normalize the array
312        arr[:, -1] = arr[:, -1] / arr[:, -1].sum()
313
314        return arr
315
316    @staticmethod
317    def _build_flash_entropy_index(fe_lib, fe_kwargs={}, clean_spectra=True):
318        """
319        Build FlashEntropy index.
320
321        Parameters
322        ----------
323        fe_lib : list
324            List of spectra to build index from. Can be a list of dictionaries or
325            a FlashEntropy search instance.
326        fe_kwargs : dict, optional
327            Keyword arguments for FlashEntropy search.
328        clean_spectra : bool, optional
329            Clean spectra before building index. Default is True.
330
331        Returns
332        -------
333        :obj:`~ms_entropy.FlashEntropySearch`
334            FlashEntropy search instance.
335
336        """
337        # Initialize FlashEntropy
338        fe_init_kws = [
339            "max_ms2_tolerance_in_da",
340            "mz_index_step",
341            "low_memory",
342            "path_data",
343        ]
344        fe_init_kws = {k: v for k, v in fe_kwargs.items() if k in fe_init_kws}
345        fes = FlashEntropySearch(**fe_init_kws)
346
347        # Build FlashEntropy index
348        fe_index_kws = [
349            "max_indexed_mz",
350            "precursor_ions_removal_da",
351            "noise_threshold",
352            "min_ms2_difference_in_da",
353            "max_peak_num",
354        ]
355        fe_index_kws = {k: v for k, v in fe_kwargs.items() if k in fe_index_kws}
356        fes.build_index(fe_lib, **fe_index_kws, clean_spectra=clean_spectra)
357
358        return fes

Base class that facilitates connection to spectral reference databases, such as EMSL's Metabolomics Reference Database (MetabRef).

SpectralDatabaseInterface(key=None)
31    def __init__(self, key=None):
32        """
33        Initialize instance.
34
35        Parameters
36        ----------
37        key : str
38            Token key.
39
40        """
41
42        self.key = key

Initialize instance.

Parameters
  • key (str): Token key.
key
def set_token(self, path):
44    def set_token(self, path):
45        """
46        Set environment variable for MetabRef database token.
47
48        Parameters
49        ----------
50        path : str
51            Path to token.
52
53        """
54
55        # Read token from file
56        with open(path, "r", encoding="utf-8") as f:
57            token = f.readline().strip()
58
59        # Set environment variable
60        os.environ[self.key] = token

Set environment variable for MetabRef database token.

Parameters
  • path (str): Path to token.
def get_token(self):
62    def get_token(self):
63        """
64        Get environment variable for database token.
65
66        Returns
67        -------
68        str
69            Token string.
70
71        """
72
73        # Check for token
74        if self.key not in os.environ:
75            raise ValueError("Must set {} environment variable.".format(self.key))
76
77        # Get token from environment variables
78        return os.environ.get(self.key)

Get environment variable for database token.

Returns
  • str: Token string.
def get_header(self):
    def get_header(self):
        """
        Access stored database token and prepare as a request header.

        Returns
        -------
        dict
            Header dictionary with "Authorization" (bearer token) and
            "Content-Type" keys, suitable for passing to `requests`.

        """

        # Get token
        token = self.get_token()

        # Pad header information
        header = {"Authorization": f"Bearer {token}", "Content-Type": "text/plain"}

        return header

Access stored database token and prepare as header.

Returns
  • dict: Header dictionary containing "Authorization" (bearer token) and "Content-Type" entries.
def get_query(self, url, use_header=True):
 99    def get_query(self, url, use_header=True):
100        """
101        Request payload from URL according to `get` protocol.
102
103        Parameters
104        ----------
105        url : str
106            URL for request.
107        use_header: bool
108            Whether or not the query should include the header
109
110        Returns
111        -------
112        dict
113            Response as JSON.
114
115        """
116
117        # Query URL via `get`
118        if use_header:
119            response = requests.get(url, headers=self.get_header())
120        else:
121            response = requests.get(url)
122
123        # Check response
124        response.raise_for_status()
125
126        # Return as JSON
127        return response.json()

Request payload from URL according to get protocol.

Parameters
  • url (str): URL for request.
  • use_header (bool): Whether or not the query should include the header
Returns
  • dict: Response as JSON.
def post_query(self, url, variable, values, tolerance):
129    def post_query(self, url, variable, values, tolerance):
130        """
131        Request payload from URL according to `post` protocol.
132
133        Parameters
134        ----------
135        url : str
136            URL for request.
137        variable : str
138            Variable to query.
139        values : str
140            Specific values of `variable` to query.
141        tolerance : str
142            Query tolerance relative to `values`.
143
144        Returns
145        -------
146        dict
147            Response as JSON.
148
149        """
150
151        # Coerce to string
152        if not isinstance(variable, str):
153            variable = str(variable).replace(" ", "")
154
155        if not isinstance(values, str):
156            values = str(values).replace(" ", "")
157
158        if not isinstance(tolerance, str):
159            tolerance = str(tolerance).replace(" ", "")
160
161        # Query URL via `post`
162        response = requests.post(
163            os.path.join(url, variable, tolerance),
164            data=values,
165            headers=self.get_header(),
166        )
167
168        # Check response
169        response.raise_for_status()
170
171        # Return as JSON
172        return response.json()

Request payload from URL according to post protocol.

Parameters
  • url (str): URL for request.
  • variable (str): Variable to query.
  • values (str): Specific values of variable to query.
  • tolerance (str): Query tolerance relative to values.
Returns
  • dict: Response as JSON.
@staticmethod
def normalize_peaks(arr):
294    @staticmethod
295    def normalize_peaks(arr):
296        """
297        Normalize peaks in an array.
298
299        Parameters
300        ----------
301        arr : :obj:`~numpy.array`
302            Array of shape (N, 2), with m/z in the first column and abundance in
303            the second.
304
305        Returns
306        -------
307        :obj:`~numpy.array`
308            Normalized array of shape (N, 2), with m/z in the first column and
309            normalized abundance in the second.
310        """
311        # Normalize the array
312        arr[:, -1] = arr[:, -1] / arr[:, -1].sum()
313
314        return arr

Normalize peaks in an array.

Parameters
  • arr (~numpy.array): Array of shape (N, 2), with m/z in the first column and abundance in the second.
Returns
  • ~numpy.array: Normalized array of shape (N, 2), with m/z in the first column and normalized abundance in the second.
class MetabRefInterface(SpectralDatabaseInterface):
class MetabRefInterface(SpectralDatabaseInterface):
    """
    DEPRECATED interface retained for backward compatibility only.
    """

    def __init__(self):
        """
        Initialize instance, warning when this deprecated class is
        instantiated directly (subclasses are exempt).

        """
        super().__init__(key=None)

        # Only direct instantiation is deprecated; subclasses have their
        # own migration paths.
        if type(self) is MetabRefInterface:
            warnings.warn(
                "MetabRefInterface is deprecated. Instantiate a concrete interface "
                "such as GCMSLibraryInterface or LCLipidLibraryInterface instead.",
                DeprecationWarning,
                stacklevel=2,
            )

DEPRECATED interface retained for backward compatibility only.

MetabRefInterface()
366    def __init__(self):
367        """
368        Initialize instance with deprecation warning.
369
370        """
371
372        super().__init__(key=None)
373
374        if self.__class__ is MetabRefInterface:
375            warnings.warn(
376                "MetabRefInterface is deprecated. Instantiate a concrete interface "
377                "such as GCMSLibraryInterface or LCLipidLibraryInterface instead.",
378                DeprecationWarning,
379                stacklevel=2,
380            )

Initialize instance with deprecation warning.

class GCMSLibraryInterface(SpectralDatabaseInterface):
383class GCMSLibraryInterface(SpectralDatabaseInterface):
384    """
385    Interface to bundled GCMS spectral libraries in MSP format.
386    
387    Loads GCMS compound library and FAMES calibration library from local MSP files.
388    Default files are bundled with CoreMS, but can be overridden via environment variables.
389    """
390
391    def __init__(self):
392        """
393        Initialize instance.
394        """
395        super().__init__(key=None)
396        
397        # Local data file paths
398        from pathlib import Path
399        
400        # Default to bundled data files
401        data_dir = Path(__file__).parent.parent / "data"
402        self.gcms_library_file = os.getenv(
403            "GCMS_LIBRARY_PATH", 
404            str(data_dir / "PNNLMetV20191015.msp")
405        )
406        self.fames_library_file = os.getenv(
407            "FAMES_LIBRARY_PATH",
408            str(data_dir / "FAMES_REF.msp")
409        )
410
411        self.__init_format_map__()
412
413    def __init_format_map__(self):
414        """
415        Initialize database format mapper, enabling multiple format requests.
416
417        """
418
419        # Define format workflows
420        self.format_map = {
421            "json": lambda x, normalize, fe_kwargs: x,
422            "dict": lambda x,
423            normalize,
424            fe_kwargs: self._to_LowResolutionEICompound_dict(x, normalize),
425            "sql": lambda x,
426            normalize,
427            fe_kwargs: self._LowResolutionEICompound_dict_to_sqlite(
428                self._to_LowResolutionEICompound_dict(x, normalize)
429            ),
430        }
431
432        # Add aliases
433        self.format_map["metabref"] = self.format_map["json"]
434        self.format_map["datadict"] = self.format_map["dict"]
435        self.format_map["data-dict"] = self.format_map["dict"]
436        self.format_map["lowreseicompound"] = self.format_map["dict"]
437        self.format_map["lowres"] = self.format_map["dict"]
438        self.format_map["lowresgc"] = self.format_map["dict"]
439        self.format_map["sqlite"] = self.format_map["sql"]
440
441    def available_formats(self):
442        """
443        View list of available formats.
444
445        Returns
446        -------
447        list
448            Format map keys.
449        """
450
451        return list(self.format_map.keys())
452
453    def get_library(self, format="json", normalize=False):
454        """
455        Load GC/MS library from local MSP file.
456
457        Parameters
458        ----------
459        format : str
460            Format of requested library, i.e. "json", "sql", "dict".
461            See `available_formats` method for aliases.
462        normalize : bool
463            Normalize the spectrum by its magnitude.
464
465        Returns
466        -------
467        Library in requested format.
468
469        """
470        # Load from local MSP file
471        library_data = self._load_msp_file(self.gcms_library_file, normalize)
472        
473        # Init format function
474        format_func = self._get_format_func(format)
475        
476        # Apply format conversion
477        return format_func(library_data, normalize, {})
478
479    def get_fames(self, format="json", normalize=False):
480        """
481        Load GC/MS FAMEs library from local MSP file.
482
483        Parameters
484        ----------
485        format : str
486            Format of requested library, i.e. "json", "sql", "dict".
487            See `available_formats` method for aliases.
488        normalize : bool
489            Normalize the spectrum by its magnitude.
490
491        Returns
492        -------
493        Library in requested format.
494
495        """
496        # Load from local MSP file
497        library_data = self._load_msp_file(self.fames_library_file, normalize)
498        
499        # Init format function
500        format_func = self._get_format_func(format)
501        
502        # Apply format conversion
503        return format_func(library_data, normalize, {})
504    
505    def _load_msp_file(self, file_path, normalize=False):
506        """
507        Load and parse MSP file into format compatible with existing pipeline.
508        
509        Parameters
510        ----------
511        file_path : str
512            Path to MSP file
513        normalize : bool
514            Normalize spectra
515            
516        Returns
517        -------
518        list of dict
519            Library data in format compatible with _to_LowResolutionEICompound_dict
520        """
521        from pathlib import Path
522        
523        file_path = Path(file_path)
524        if not file_path.exists():
525            raise FileNotFoundError(
526                f"Library file not found: {file_path}. "
527                f"Set GCMS_LIBRARY_PATH or FAMES_LIBRARY_PATH environment variable to specify location."
528            )
529        
530        # Parse MSP file
531        spectra = []
532        spectrum = {}
533        peaks = []
534        
535        with open(file_path, 'r') as f:
536            for line in f:
537                line = line.strip()
538                
539                # Empty line marks end of spectrum
540                if not line:
541                    if spectrum and peaks:
542                        # Convert peaks to the format expected by downstream code
543                        # Format: "(mz,abundance)(mz,abundance)..."
544                        peak_str = "".join([f"({int(mz)},{int(abun)})" for mz, abun in peaks])
545                        spectrum['mz'] = peak_str
546                        spectra.append(spectrum)
547                    spectrum = {}
548                    peaks = []
549                    continue
550                
551                # Check if line contains peak data (starts with digit)
552                if line and line[0].isdigit():
553                    parts = line.split()
554                    if len(parts) >= 2:
555                        peaks.append((float(parts[0]), float(parts[1])))
556                    continue
557                
558                # Handle metadata fields
559                if ":" in line:
560                    key, value = line.split(":", 1)
561                    key = key.strip().lower()
562                    value = value.strip()
563                    
564                    # Map MSP fields to expected format
565                    field_mapping = {
566                        "name": "molecule_name",
567                        "formula": "formula",
568                        "cas": "casno",
569                        "retentiontime": "retention_time",
570                        "ri": "ri",
571                        "comment": "comments",
572                        "num peaks": "peak_count",
573                        "derivative": "derivative"
574                    }
575                    
576                    # Metadata fields that go into the metadata dict
577                    metadata_fields = {
578                        "inchikey": "inchikey",
579                        "inchi": "inchi",
580                        "smiles": "smiles",
581                        "pubchem": "pubchem",
582                        "chebi": "chebi",
583                        "kegg": "kegg",
584                        "refmet": "refmet",
585                        "iupac_name": "iupac_name"
586                    }
587                    
588                    if key in field_mapping:
589                        mapped_key = field_mapping[key]
590                        # Convert numeric fields
591                        if key in ["retentiontime", "ri"]:
592                            try:
593                                value = float(value)
594                            except:
595                                pass
596                        elif key == "num peaks":
597                            try:
598                                value = int(value)
599                            except:
600                                pass
601                        spectrum[mapped_key] = value
602                    elif key in metadata_fields:
603                        # Store in nested metadata dict
604                        if "metadata" not in spectrum:
605                            spectrum["metadata"] = {}
606                        spectrum["metadata"][metadata_fields[key]] = value
607                    else:
608                        # Keep unmapped fields
609                        spectrum[key] = value
610        
611        # Add last spectrum if file doesn't end with blank line
612        if spectrum and peaks:
613            peak_str = "".join([f"({int(mz)},{int(abun)})" for mz, abun in peaks])
614            spectrum['mz'] = peak_str
615            spectra.append(spectrum)
616        
617        return spectra
618
    def _to_LowResolutionEICompound_dict(self, metabref_lib, normalize=False):
        """
        Convert MetabRef-formatted library to CoreMS LowResolutionEICompound-formatted
        dictionary for local ingestion.

        Parameters
        ----------
        metabref_lib : dict
            MetabRef GC-MS library in JSON format.
        normalize : bool
            Normalize each spectrum by its magnitude.

        Returns
        -------
        list of dict
            List of each spectrum contained in dictionary.

        """

        # All below key:value lookups are based on CoreMS class definitions
        # NOT MetabRef content. For example, MetabRef has keys for PubChem,
        # USI, etc. that are not considered below.

        # Dictionary to map metabref keys to corems keys
        metadatar_cols = {
            "casno": "cas",
            "inchikey": "inchikey",
            "inchi": "inchi",
            "chebi": "chebi",
            "smiles": "smiles",
            "kegg": "kegg",
            "iupac_name": "iupac_name",
            "traditional_name": "traditional_name",  # Not present in metabref
            "common_name": "common_name",  # Not present in metabref
        }

        # Dictionary to map metabref keys to corems keys
        lowres_ei_compound_cols = {
            "id": "metabref_id",
            "molecule_name": "name",  # Is this correct?
            "classify": "classify",  # Not present in metabref
            "formula": "formula",
            "ri": "ri",
            "rt": "retention_time",
            "source": "source",  # Not present in metabref
            "casno": "casno",
            "comments": "comment",
            "source_temp_c": "source_temp_c",  # Not present in metabref
            "ev": "ev",  # Not present in metabref
            "peak_count": "peaks_count",
            "mz": "mz",
            "abundance": "abundance",
        }

        # Local result container
        corems_lib = []

        # Enumerate spectra
        for i, source_ in enumerate(metabref_lib):
            # Copy source to prevent modification
            source = source_.copy()

            # Parse target data
            target = {
                lowres_ei_compound_cols[k]: v
                for k, v in source.items()
                if k in lowres_ei_compound_cols
            }

            # Explicitly add this to connect with LowResCompoundRef later
            if "retention_time" in source:
                target["rt"] = source["retention_time"]
            elif "rt" in source:
                target["rt"] = source["rt"]

            # Parse (mz, abundance)
            # NOTE(review): assumes target["mz"] is the "(mz,abun)(mz,abun)..."
            # string form; _spectrum_to_array is defined outside this view —
            # confirm its expected input format.
            arr = self._spectrum_to_array(target["mz"], normalize=normalize)
            target["mz"] = arr[:, 0]
            target["abundance"] = arr[:, 1]

            # Parse meta data
            target["metadata"] = {
                metadatar_cols[k]: v for k, v in source.items() if k in metadatar_cols
            }

            # Add anything else
            # NOTE(review): if the source record already carries a "metadata"
            # key (as _load_msp_file can produce), this loop overwrites the
            # metadatar-derived dict built just above — confirm this is intended.
            for k in source:
                if k not in lowres_ei_compound_cols:
                    target[k] = source[k]

            # Add to CoreMS list
            corems_lib.append(target)

        return corems_lib
713
714    def _LowResolutionEICompound_dict_to_sqlite(
715        self, lowres_ei_compound_dict, url="sqlite://"
716    ):
717        """
718        Convert CoreMS LowResolutionEICompound-formatted dictionary to SQLite
719        database for local ingestion.
720
721        Parameters
722        ----------
723        lowres_ei_compound_dict : dict
724            CoreMS GC-MS library formatted for LowResolutionEICompound.
725        url : str
726            URL to SQLite prefix.
727
728        Returns
729        -------
730        sqlite database
731            Spectra contained in SQLite database.
732
733        """
734
735        # Dictionary to map corems keys to all-caps keys
736        capped_cols = {
737            "name": "NAME",
738            "formula": "FORM",
739            "ri": "RI",
740            "retention_time": "RT",
741            "source": "SOURCE",
742            "casno": "CASNO",
743            "comment": "COMMENT",
744            "peaks_count": "NUM PEAKS",
745        }
746
747        # Initialize SQLite object
748        sqlite_obj = EI_LowRes_SQLite(url=url)
749
750        # Iterate spectra
751        for _data_dict in lowres_ei_compound_dict:
752            # Copy source to prevent modification
753            data_dict = _data_dict.copy()
754
755            # Add missing capped values
756            for k, v in capped_cols.items():
757                # Key exists
758                if k in data_dict:
759                    # # This will replace the key
760                    # data_dict[v] = data_dict.pop(k)
761
762                    # This will keep both keys
763                    data_dict[v] = data_dict[k]
764
765            # Parse number of peaks
766            if not data_dict.get("NUM PEAKS"):
767                data_dict["NUM PEAKS"] = len(data_dict.get("mz"))
768
769            # Parse CAS number
770            if not data_dict.get("CASNO"):
771                data_dict["CASNO"] = data_dict.get("CAS")
772
773            if not data_dict["CASNO"]:
774                data_dict["CASNO"] = 0
775
776            # Build linked metadata table
777            if "metadata" in data_dict:
778                metadata = data_dict.pop("metadata")
779                # Only create metadata entry if we have required fields and valid data
780                # Filter to only include fields that Metadatar model supports
781                supported_metadata_fields = [
782                    'cas', 'inchikey', 'inchi', 'chebi', 'smiles', 
783                    'kegg', 'iupac_name', 'traditional_name', 'common_name'
784                ]
785                filtered_metadata = {
786                    k: v for k, v in metadata.items() 
787                    if k in supported_metadata_fields and v
788                }
789                # Inchikey is required by the database model
790                if filtered_metadata and filtered_metadata.get("inchikey"):
791                    data_dict["metadatar"] = Metadatar(**filtered_metadata)
792
793            # Attempt addition to sqlite
794            try:
795                sqlite_obj.add_compound(data_dict)
796            except:
797                print(data_dict["NAME"])
798
799        return sqlite_obj

Interface to bundled GCMS spectral libraries in MSP format.

Loads GCMS compound library and FAMES calibration library from local MSP files. Default files are bundled with CoreMS, but can be overridden via environment variables.

GCMSLibraryInterface()
391    def __init__(self):
392        """
393        Initialize instance.
394        """
395        super().__init__(key=None)
396        
397        # Local data file paths
398        from pathlib import Path
399        
400        # Default to bundled data files
401        data_dir = Path(__file__).parent.parent / "data"
402        self.gcms_library_file = os.getenv(
403            "GCMS_LIBRARY_PATH", 
404            str(data_dir / "PNNLMetV20191015.msp")
405        )
406        self.fames_library_file = os.getenv(
407            "FAMES_LIBRARY_PATH",
408            str(data_dir / "FAMES_REF.msp")
409        )
410
411        self.__init_format_map__()

Initialize instance.

gcms_library_file
fames_library_file
def available_formats(self):
441    def available_formats(self):
442        """
443        View list of available formats.
444
445        Returns
446        -------
447        list
448            Format map keys.
449        """
450
451        return list(self.format_map.keys())

View list of available formats.

Returns
  • list: Format map keys.
def get_library(self, format='json', normalize=False):
453    def get_library(self, format="json", normalize=False):
454        """
455        Load GC/MS library from local MSP file.
456
457        Parameters
458        ----------
459        format : str
460            Format of requested library, i.e. "json", "sql", "dict".
461            See `available_formats` method for aliases.
462        normalize : bool
463            Normalize the spectrum by its magnitude.
464
465        Returns
466        -------
467        Library in requested format.
468
469        """
470        # Load from local MSP file
471        library_data = self._load_msp_file(self.gcms_library_file, normalize)
472        
473        # Init format function
474        format_func = self._get_format_func(format)
475        
476        # Apply format conversion
477        return format_func(library_data, normalize, {})

Load GC/MS library from local MSP file.

Parameters
  • format (str): Format of requested library, i.e. "json", "sql", "dict". See available_formats method for aliases.
  • normalize (bool): Normalize the spectrum by its magnitude.
Returns
  • Library in requested format.
def get_fames(self, format='json', normalize=False):
479    def get_fames(self, format="json", normalize=False):
480        """
481        Load GC/MS FAMEs library from local MSP file.
482
483        Parameters
484        ----------
485        format : str
486            Format of requested library, i.e. "json", "sql", "dict".
487            See `available_formats` method for aliases.
488        normalize : bool
489            Normalize the spectrum by its magnitude.
490
491        Returns
492        -------
493        Library in requested format.
494
495        """
496        # Load from local MSP file
497        library_data = self._load_msp_file(self.fames_library_file, normalize)
498        
499        # Init format function
500        format_func = self._get_format_func(format)
501        
502        # Apply format conversion
503        return format_func(library_data, normalize, {})

Load GC/MS FAMEs library from local MSP file.

Parameters
  • format (str): Format of requested library, i.e. "json", "sql", "dict". See available_formats method for aliases.
  • normalize (bool): Normalize the spectrum by its magnitude.
Returns
  • Library in requested format.
class MetabRefGCInterface(GCMSLibraryInterface):
class MetabRefGCInterface(GCMSLibraryInterface):
    """
    DEPRECATED: Use GCMSLibraryInterface instead.

    This interface is maintained for backward compatibility only.
    MetabRef API has been discontinued as of 2026.
    """

    def __init__(self):
        """
        Warn about the deprecation, then defer to GCMSLibraryInterface.
        """
        # Emit the warning before parent initialization so callers see it
        # even if parent setup fails.
        warnings.warn(
            "MetabRefGCInterface is deprecated. Use GCMSLibraryInterface instead. "
            "MetabRef API has been discontinued; all data now loads from bundled local MSP files.",
            DeprecationWarning,
            stacklevel=2,
        )
        super().__init__()

DEPRECATED: Use GCMSLibraryInterface instead.

This interface is maintained for backward compatibility only. MetabRef API has been discontinued as of 2026.

MetabRefGCInterface()
810    def __init__(self):
811        """
812        Initialize instance with deprecation warning.
813        """
814        warnings.warn(
815            "MetabRefGCInterface is deprecated. Use GCMSLibraryInterface instead. "
816            "MetabRef API has been discontinued; all data now loads from bundled local MSP files.",
817            DeprecationWarning,
818            stacklevel=2
819        )
820        super().__init__()

Initialize instance with deprecation warning.

class LCLipidLibraryInterface(SpectralDatabaseInterface):
 823class LCLipidLibraryInterface(SpectralDatabaseInterface):
 824    """
 825    Interface to a local sqlite lipid library for LC-MS spectral searches.
 826    """
 827
 828    DEFAULT_DOWNLOAD_URL = (
 829        "https://nmdcdemo.emsl.pnnl.gov/minio/lipidomics/parameter_files/"
 830        "202412_lipid_ref.sqlite"
 831    )
 832
 833    def __init__(self, db_location=None):
 834        """
 835        Initialize instance.
 836
 837        Parameters
 838        ----------
 839        db_location : str | Path, optional
 840            Local path to the sqlite lipid library. If omitted, the
 841            COREMS_LIPIDOMICS_SQLITE_PATH environment variable is used.
 842        """
 843
 844        super().__init__(key=None)
 845        self.db_location = db_location
 846        self.__init_format_map__()
 847
 848    def _to_flashentropy(self, spectral_library, normalize=True, fe_kwargs={}):
 849        """
 850        Convert a spectral library to FlashEntropy format.
 851
 852        Parameters
 853        ----------
 854        spectral_library : dict
 855            MS2 library in JSON format or FlashEntropy search instance
 856            (for reformatting at different MS2 separation).
 857        normalize : bool
 858            Normalize each spectrum by its magnitude.
 859        fe_kwargs : dict, optional
 860            Keyword arguments for instantiation of FlashEntropy search and building index for FlashEntropy search;
 861            any keys not recognized will be ignored. By default, all parameters set to defaults.
 862
 863        Returns
 864        -------
 865        :obj:`~ms_entropy.FlashEntropySearch`
 866            MS2 library as FlashEntropy search instance.
 867
 868        Raises
 869        ------
 870        ValueError
 871            If "min_ms2_difference_in_da" or "max_ms2_tolerance_in_da" are present in `fe_kwargs` and they are not equal.
 872
 873        """
 874        self._check_flash_entropy_kwargs(fe_kwargs)
 875
 876        # Initialize empty library
 877        fe_lib = []
 878
 879        # Enumerate spectra
 880        for i, source in enumerate(spectral_library):
 881            if "spectrum_data" in source.keys():
 882                spectrum = source["spectrum_data"]
 883            else:
 884                spectrum = source
 885
 886            if "precursor_mz" not in spectrum.keys():
 887                spectrum["precursor_mz"] = spectrum.pop("precursor_ion")
 888
 889            spectrum["peaks"] = self._spectrum_to_array(
 890                spectrum["mz"], normalize=normalize
 891            )
 892            fe_lib.append(spectrum)
 893
 894        fe_search = self._build_flash_entropy_index(fe_lib, fe_kwargs=fe_kwargs)
 895
 896        return fe_search
 897
 898    def __init_format_map__(self):
 899        """
 900        Initialize database format mapper, enabling multiple format requests.
 901        """
 902
 903        self.format_map = {
 904            "json": lambda x, normalize, fe_kwargs: x,
 905            "flashentropy": lambda x, normalize, fe_kwargs: self._to_flashentropy(
 906                x, normalize, fe_kwargs
 907            ),
 908            "dataframe": lambda x, normalize, fe_kwargs: pd.DataFrame(x),
 909        }
 910
 911        self.format_map["fe"] = self.format_map["flashentropy"]
 912        self.format_map["flash-entropy"] = self.format_map["flashentropy"]
 913        self.format_map["df"] = self.format_map["dataframe"]
 914
 915    def available_formats(self):
 916        """
 917        View list of available formats.
 918
 919        Returns
 920        -------
 921        list
 922            Format map keys.
 923        """
 924
 925        return list(self.format_map.keys())
 926
 927    def _resolve_db_location(self):
 928        """
 929        Resolve and validate sqlite database location.
 930
 931        Returns
 932        -------
 933        Path
 934            Existing sqlite database file path.
 935        """
 936
 937        db_location = self.db_location or os.getenv("COREMS_LIPIDOMICS_SQLITE_PATH")
 938        if not db_location:
 939            raise ValueError(
 940                "A local lipid sqlite library path is required. "
 941                "Set COREMS_LIPIDOMICS_SQLITE_PATH or pass db_location."
 942            )
 943
 944        db_path = Path(db_location).expanduser()
 945        if not db_path.exists():
 946            raise FileNotFoundError(
 947                f"Lipid sqlite library not found at {db_path}. "
 948                f"Download it from {self.DEFAULT_DOWNLOAD_URL} "
 949                "and set COREMS_LIPIDOMICS_SQLITE_PATH."
 950            )
 951
 952        return db_path
 953
 954    def _get_candidate_spectra(self, connection, mz_list, polarity, mz_tol_ppm):
 955        """
 956        Fetch candidate spectra rows by precursor m/z and polarity.
 957
 958        Returns
 959        -------
 960        pandas.DataFrame
 961            Filtered rows from lipidMassSpectrumObject.
 962        """
 963
 964        mz_observed = np.sort(np.asarray(mz_list, dtype=float))
 965        if mz_observed.size == 0:
 966            return pd.DataFrame()
 967
 968        mz_all = pd.read_sql_query(
 969            "SELECT id, polarity, precursor_mz FROM lipidMassSpectrumObject", connection
 970        )
 971        mz_all = mz_all[mz_all["polarity"] == polarity].copy()
 972        if mz_all.empty:
 973            return pd.DataFrame()
 974
 975        mz_all = mz_all.sort_values(by="precursor_mz").reset_index(drop=True)
 976
 977        if mz_observed.size == 1:
 978            mz_all["closest_mz_obs"] = mz_observed[0]
 979        else:
 980            mz_all["closest_mz_obs"] = mz_observed[
 981                find_closest(mz_observed, mz_all.precursor_mz.values)
 982            ]
 983
 984        mz_all["ppm_error"] = (
 985            (mz_all["precursor_mz"] - mz_all["closest_mz_obs"])
 986            / mz_all["precursor_mz"]
 987            * 1e6
 988        )
 989
 990        mz_all = mz_all[np.abs(mz_all["ppm_error"]) <= mz_tol_ppm]
 991        if mz_all.empty:
 992            return pd.DataFrame()
 993
 994        mz_ids = tuple(mz_all["id"].tolist())
 995        return pd.read_sql_query(
 996            f"SELECT * FROM lipidMassSpectrumObject WHERE id IN {mz_ids}",
 997            connection,
 998        )
 999
1000    def get_lipid_library(
1001        self,
1002        mz_list,
1003        polarity,
1004        mz_tol_ppm,
1005        mz_tol_da_api=None,
1006        format="json",
1007        normalize=True,
1008        fe_kwargs={},
1009        api_delay=5,
1010        api_attempts=10,
1011    ):
1012        """
1013        Retrieve lipid spectra and metadata from a local sqlite library.
1014
1015        Parameters
1016        ----------
1017        mz_list : list
1018            List of precursor m/z values.
1019        polarity : str
1020            Ionization polarity, either "positive" or "negative".
1021        mz_tol_ppm : float
1022            Tolerance in ppm for precursor matching.
1023        mz_tol_da_api : float, optional
1024            Unused, kept for backward compatibility.
1025        format : str, optional
1026            Format of requested library, e.g. "json" or "flashentropy".
1027        normalize : bool, optional
1028            Normalize spectrum intensities.
1029        fe_kwargs : dict, optional
1030            Keyword arguments for FlashEntropy search.
1031        api_delay : int, optional
1032            Unused, kept for backward compatibility.
1033        api_attempts : int, optional
1034            Unused, kept for backward compatibility.
1035
1036        Returns
1037        -------
1038        tuple
1039            Library in requested format and lipid metadata dictionary.
1040        """
1041
1042        if not isinstance(mz_list, (list, np.ndarray)):
1043            raise ValueError("mz_list must be a list or numpy array")
1044        if not all(isinstance(mz, (float, int)) for mz in mz_list):
1045            raise ValueError("All elements in mz_list must be float or int")
1046        if polarity not in {"positive", "negative"}:
1047            raise ValueError("polarity must be either 'positive' or 'negative'")
1048        if not isinstance(mz_tol_ppm, (float, int)):
1049            raise ValueError("mz_tol_ppm must be a float or int")
1050
1051        db_path = self._resolve_db_location()
1052        connection = sqlite3.connect(str(db_path))
1053        try:
1054            # Step 1: Get candidate spectra records based on m/z and polarity
1055            spectra_df = self._get_candidate_spectra(
1056                connection=connection,
1057                mz_list=mz_list,
1058                polarity=polarity,
1059                mz_tol_ppm=float(mz_tol_ppm),
1060            )
1061
1062            if spectra_df.empty:
1063                format_func = self._get_format_func(format)
1064                return format_func([], normalize=normalize, fe_kwargs=fe_kwargs), {}
1065
1066            # Step 2: Get corresponding lipid metadata for candidate spectra from lipidTree view
1067            mol_ids = tuple(spectra_df["molecular_data_id"].tolist())
1068            mol_df = pd.read_sql_query(
1069                f"SELECT * FROM lipidTree WHERE id IN {mol_ids}",
1070                connection,
1071            )
1072        finally:
1073            connection.close()
1074
1075        mol_df["id_index"] = mol_df["id"]
1076        mol_df = mol_df.set_index("id_index")
1077        mol_records = mol_df.to_dict(orient="index")
1078        lipid_metadata = {
1079            int(k): self._dict_to_dataclass(v, LipidMetadata)
1080            for k, v in mol_records.items()
1081        }
1082
1083        spectra_records = spectra_df.to_dict(orient="records")
1084        format_func = self._get_format_func(format)
1085        library = format_func(spectra_records, normalize=normalize, fe_kwargs=fe_kwargs)
1086        return library, lipid_metadata

Interface to a local sqlite lipid library for LC-MS spectral searches.

LCLipidLibraryInterface(db_location=None)
833    def __init__(self, db_location=None):
834        """
835        Initialize instance.
836
837        Parameters
838        ----------
839        db_location : str | Path, optional
840            Local path to the sqlite lipid library. If omitted, the
841            COREMS_LIPIDOMICS_SQLITE_PATH environment variable is used.
842        """
843
844        super().__init__(key=None)
845        self.db_location = db_location
846        self.__init_format_map__()

Initialize instance.

Parameters
  • db_location (str | Path, optional): Local path to the sqlite lipid library. If omitted, the COREMS_LIPIDOMICS_SQLITE_PATH environment variable is used.
DEFAULT_DOWNLOAD_URL = 'https://nmdcdemo.emsl.pnnl.gov/minio/lipidomics/parameter_files/202412_lipid_ref.sqlite'
db_location
def available_formats(self):
915    def available_formats(self):
916        """
917        View list of available formats.
918
919        Returns
920        -------
921        list
922            Format map keys.
923        """
924
925        return list(self.format_map.keys())

View list of available formats.

Returns
  • list: Format map keys.
def get_lipid_library( self, mz_list, polarity, mz_tol_ppm, mz_tol_da_api=None, format='json', normalize=True, fe_kwargs={}, api_delay=5, api_attempts=10):
1000    def get_lipid_library(
1001        self,
1002        mz_list,
1003        polarity,
1004        mz_tol_ppm,
1005        mz_tol_da_api=None,
1006        format="json",
1007        normalize=True,
1008        fe_kwargs={},
1009        api_delay=5,
1010        api_attempts=10,
1011    ):
1012        """
1013        Retrieve lipid spectra and metadata from a local sqlite library.
1014
1015        Parameters
1016        ----------
1017        mz_list : list
1018            List of precursor m/z values.
1019        polarity : str
1020            Ionization polarity, either "positive" or "negative".
1021        mz_tol_ppm : float
1022            Tolerance in ppm for precursor matching.
1023        mz_tol_da_api : float, optional
1024            Unused, kept for backward compatibility.
1025        format : str, optional
1026            Format of requested library, e.g. "json" or "flashentropy".
1027        normalize : bool, optional
1028            Normalize spectrum intensities.
1029        fe_kwargs : dict, optional
1030            Keyword arguments for FlashEntropy search.
1031        api_delay : int, optional
1032            Unused, kept for backward compatibility.
1033        api_attempts : int, optional
1034            Unused, kept for backward compatibility.
1035
1036        Returns
1037        -------
1038        tuple
1039            Library in requested format and lipid metadata dictionary.
1040        """
1041
1042        if not isinstance(mz_list, (list, np.ndarray)):
1043            raise ValueError("mz_list must be a list or numpy array")
1044        if not all(isinstance(mz, (float, int)) for mz in mz_list):
1045            raise ValueError("All elements in mz_list must be float or int")
1046        if polarity not in {"positive", "negative"}:
1047            raise ValueError("polarity must be either 'positive' or 'negative'")
1048        if not isinstance(mz_tol_ppm, (float, int)):
1049            raise ValueError("mz_tol_ppm must be a float or int")
1050
1051        db_path = self._resolve_db_location()
1052        connection = sqlite3.connect(str(db_path))
1053        try:
1054            # Step 1: Get candidate spectra records based on m/z and polarity
1055            spectra_df = self._get_candidate_spectra(
1056                connection=connection,
1057                mz_list=mz_list,
1058                polarity=polarity,
1059                mz_tol_ppm=float(mz_tol_ppm),
1060            )
1061
1062            if spectra_df.empty:
1063                format_func = self._get_format_func(format)
1064                return format_func([], normalize=normalize, fe_kwargs=fe_kwargs), {}
1065
1066            # Step 2: Get corresponding lipid metadata for candidate spectra from lipidTree view
1067            mol_ids = tuple(spectra_df["molecular_data_id"].tolist())
1068            mol_df = pd.read_sql_query(
1069                f"SELECT * FROM lipidTree WHERE id IN {mol_ids}",
1070                connection,
1071            )
1072        finally:
1073            connection.close()
1074
1075        mol_df["id_index"] = mol_df["id"]
1076        mol_df = mol_df.set_index("id_index")
1077        mol_records = mol_df.to_dict(orient="index")
1078        lipid_metadata = {
1079            int(k): self._dict_to_dataclass(v, LipidMetadata)
1080            for k, v in mol_records.items()
1081        }
1082
1083        spectra_records = spectra_df.to_dict(orient="records")
1084        format_func = self._get_format_func(format)
1085        library = format_func(spectra_records, normalize=normalize, fe_kwargs=fe_kwargs)
1086        return library, lipid_metadata

Retrieve lipid spectra and metadata from a local sqlite library.

Parameters
  • mz_list (list): List of precursor m/z values.
  • polarity (str): Ionization polarity, either "positive" or "negative".
  • mz_tol_ppm (float): Tolerance in ppm for precursor matching.
  • mz_tol_da_api (float, optional): Unused, kept for backward compatibility.
  • format (str, optional): Format of requested library, e.g. "json" or "flashentropy".
  • normalize (bool, optional): Normalize spectrum intensities.
  • fe_kwargs (dict, optional): Keyword arguments for FlashEntropy search.
  • api_delay (int, optional): Unused, kept for backward compatibility.
  • api_attempts (int, optional): Unused, kept for backward compatibility.
Returns
  • tuple: Library in requested format and lipid metadata dictionary.
class MSPInterface(SpectralDatabaseInterface):
class MSPInterface(SpectralDatabaseInterface):
    """
    Interface to parse NIST MSP files.
    """

    def __init__(self, file_path):
        """
        Initialize instance.

        Parameters
        ----------
        file_path : str
            Path to a local MSP file.

        Attributes
        ----------
        file_path : str
            Path to the MSP file.
        _file_content : str
            Content of the MSP file.
        _data_frame : :obj:`~pandas.DataFrame`
            DataFrame of spectra from the MSP file with unaltered content.

        Raises
        ------
        FileNotFoundError
            If `file_path` does not exist.
        """
        super().__init__(key=None)

        self.file_path = file_path
        if not os.path.exists(self.file_path):
            raise FileNotFoundError(
                f"File {self.file_path} does not exist. Please check the file path."
            )
        with open(self.file_path, "r") as f:
            self._file_content = f.read()

        self._data_frame = self._read_msp_file()
        self.__init_format_map__()

    def __init_format_map__(self):
        """
        Initialize database format mapper, enabling multiple format requests.
        """
        # x is a pandas dataframe similar to self._data_frame format
        # Define format workflows
        self.format_map = {
            "msp": lambda x, normalize, fe_kwargs: self._to_msp(x, normalize),
            "flashentropy": lambda x, normalize, fe_kwargs: self._to_flashentropy(
                x, normalize, fe_kwargs
            ),
            "df": lambda x, normalize, fe_kwargs: self._to_df(x, normalize),
        }

        # Add aliases
        self.format_map["fe"] = self.format_map["flashentropy"]
        self.format_map["flash-entropy"] = self.format_map["flashentropy"]
        self.format_map["dataframe"] = self.format_map["df"]
        self.format_map["data-frame"] = self.format_map["df"]

    def _read_msp_file(self):
        """
        Read the MSP file content into a pandas DataFrame.

        Returns
        -------
        :obj:`~pandas.DataFrame`
            DataFrame of spectra from the MSP file, exactly as it is in the
            file (no sorting, filtering, etc.).
        """
        spectra = []
        spectrum = {}

        f = StringIO(self._file_content)
        for line in f:
            line = line.strip()
            if not line:
                continue  # Skip empty lines

            # Metadata lines are "key: value" pairs
            if ":" in line:
                key, value = line.split(":", 1)
                key = key.strip().lower()
                value = value.strip()

                if key == "name":
                    # A "Name:" entry saves the current spectrum and starts a new one
                    if spectrum:
                        spectra.append(spectrum)
                    spectrum = {"name": value, "peaks": []}
                else:
                    spectrum[key] = value

            # Peak data lines start with a number: "m/z intensity"
            elif line[0].isdigit():
                peaks = line.split()
                m_z = float(peaks[0])
                intensity = float(peaks[1])
                spectrum["peaks"].append([m_z, intensity])

        # Save the last spectrum
        if spectrum:
            spectra.append(spectrum)

        df = pd.DataFrame(spectra)
        # Cast metadata columns to numeric where possible; columns with any
        # non-numeric content are left as-is.
        for column in df.columns:
            if column != "peaks":  # Skip 'peaks' column
                try:
                    df[column] = pd.to_numeric(df[column], errors="raise")
                except (ValueError, TypeError):
                    pass
        return df

    def _to_df(self, input_dataframe, normalize=True):
        """
        Return the MSP-derived library as a DataFrame, optionally normalized.

        Parameters
        ----------
        input_dataframe : :obj:`~pandas.DataFrame`
            Input DataFrame containing MSP-formatted spectra.
        normalize : bool, optional
            Normalize each spectrum by its magnitude.
            Default is True.

        Returns
        -------
        :obj:`~pandas.DataFrame`
            DataFrame with desired normalization.

        Raises
        ------
        KeyError
            If a spectrum record has no 'peaks' key.
        """
        if not normalize:
            return input_dataframe

        # Convert to dictionary records for per-spectrum processing
        db_dict = input_dataframe.to_dict(orient="records")

        lib = []
        for spectrum in db_dict:
            # Check that spectrum["peaks"] exists
            if "peaks" not in spectrum:
                raise KeyError(
                    "MSP not interpreted correctly, 'peaks' key not found in spectrum, check _dataframe attribute."
                )

            # Convert spectrum["peaks"] to numpy array
            if not isinstance(spectrum["peaks"], np.ndarray):
                spectrum["peaks"] = np.array(spectrum["peaks"])

            # Normalize peaks and refresh the peak count
            spectrum["peaks"] = self.normalize_peaks(spectrum["peaks"])
            spectrum["num peaks"] = len(spectrum["peaks"])

            lib.append(spectrum)

        return pd.DataFrame(lib)

    def _to_flashentropy(self, input_dataframe, normalize=True, fe_kwargs=None):
        """
        Convert MSP-derived library to FlashEntropy library.

        Parameters
        ----------
        input_dataframe : :obj:`~pandas.DataFrame`
            Input DataFrame containing MSP-formatted spectra.
        normalize : bool
            Normalize each spectrum by its magnitude.
        fe_kwargs : dict, optional
            Keyword arguments for instantiation of FlashEntropy search and
            building index for FlashEntropy search; any keys not recognized
            will be ignored. By default, all parameters set to defaults.

        Returns
        -------
        :obj:`~ms_entropy.FlashEntropySearch`
            MS2 library as FlashEntropy search instance.

        Raises
        ------
        ValueError
            If "min_ms2_difference_in_da" or "max_ms2_tolerance_in_da" are
            present in `fe_kwargs` and they are not equal.
        KeyError
            If a spectrum has neither a precursor m/z key nor a 'peaks' key.
        """
        # Avoid a shared mutable default argument
        fe_kwargs = {} if fe_kwargs is None else fe_kwargs
        self._check_flash_entropy_kwargs(fe_kwargs)

        # Convert to dictionary records
        db_dict = input_dataframe.to_dict(orient="records")

        # Initialize empty library
        fe_lib = []

        # Enumerate spectra
        for source in db_dict:
            # Reorganize source dict, if necessary
            if "spectrum_data" in source:
                spectrum = source["spectrum_data"]
            else:
                spectrum = source

            # Rename precursor_mz key for FlashEntropy
            if "precursor_mz" not in spectrum:
                if "precursormz" in spectrum:
                    spectrum["precursor_mz"] = spectrum.pop("precursormz")
                elif "precursor_ion" in spectrum:
                    spectrum["precursor_mz"] = spectrum.pop("precursor_ion")
                else:
                    raise KeyError(
                        "MSP must have either 'precursormz' or 'precursor_ion' key to be converted to FlashEntropy format."
                    )

            # Check that spectrum["peaks"] exists
            if "peaks" not in spectrum:
                raise KeyError(
                    "MSP not interpreted correctly, 'peaks' key not found in spectrum, check _dataframe attribute."
                )

            # Convert spectrum["peaks"] to numpy array
            if not isinstance(spectrum["peaks"], np.ndarray):
                spectrum["peaks"] = np.array(spectrum["peaks"])

            # Normalize peaks, if requested
            if normalize:
                spectrum["peaks"] = self.normalize_peaks(spectrum["peaks"])

            # Add spectrum to library
            fe_lib.append(spectrum)

        # Build FlashEntropy index
        fe_search = self._build_flash_entropy_index(fe_lib, fe_kwargs=fe_kwargs)

        return fe_search

    def _check_msp_compatibility(self):
        """
        Check if the MSP file is compatible with the
        get_metabolomics_spectra_library method and provide feedback if not.

        Raises
        ------
        ValueError
            If polarity, inchikey, or molecular formula metadata is missing
            or malformed.
        """
        # Check polarity
        if (
            "polarity" not in self._data_frame.columns
            and "ionmode" not in self._data_frame.columns
        ):
            raise ValueError(
                "Neither 'polarity' nor 'ionmode' columns found in the input MSP metadata. Please check the file."
            )
        polarity_column = (
            "polarity" if "polarity" in self._data_frame.columns else "ionmode"
        )

        # Check if polarity_column contents is either "positive" or "negative"
        if not all(self._data_frame[polarity_column].isin(["positive", "negative"])):
            raise ValueError(
                f"Input field on MSP '{polarity_column}' must contain only 'positive' or 'negative' values."
            )

        # Check if the MSP file contains the required columns for metabolite metadata:
        # inchikey, by name, not null; either formula or molecular_formula, not null
        if not all(self._data_frame["inchikey"].notnull()):
            raise ValueError(
                "Input field on MSP 'inchikey' must contain only non-null values."
            )
        if (
            "formula" not in self._data_frame.columns
            and "molecular_formula" not in self._data_frame.columns
        ):
            raise ValueError(
                "Input field on MSP must contain either 'formula' or 'molecular_formula' columns."
            )
        molecular_formula_column = (
            "formula" if "formula" in self._data_frame.columns else "molecular_formula"
        )
        if not all(self._data_frame[molecular_formula_column].notnull()):
            raise ValueError(
                f"Input field on MSP '{molecular_formula_column}' must contain only non-null values."
            )

    def get_metabolomics_spectra_library(
        self,
        polarity,
        metabolite_metadata_mapping=None,
        format="fe",
        normalize=True,
        fe_kwargs=None,
    ):
        """
        Prepare metabolomics spectra library and associated metabolite metadata.

        Note: this uses the inchikey as the index for the metabolite metadata
        dataframe and for connecting to the spectra, so it must be in the input.

        Parameters
        ----------
        polarity : str
            Ionization polarity, either "positive" or "negative".
        metabolite_metadata_mapping : dict, optional
            Column-rename mapping applied before extracting metadata; a
            default mapping is used when omitted or empty.
        format : str, optional
            Format of requested library; see available_formats method.
        normalize : bool, optional
            Normalize each spectrum by its magnitude.
        fe_kwargs : dict, optional
            Keyword arguments for FlashEntropy search.

        Returns
        -------
        tuple
            Library in requested format and metabolite metadata dictionary
            keyed by inchikey.
        """
        # Avoid shared mutable default arguments
        fe_kwargs = {} if fe_kwargs is None else fe_kwargs

        # Check if the MSP file is compatible with this method
        self._check_msp_compatibility()

        # Check if the polarity parameter is valid and if a polarity column exists in the dataframe
        if polarity not in ["positive", "negative"]:
            raise ValueError("Polarity must be 'positive' or 'negative'")
        polarity_column = (
            "polarity" if "polarity" in self._data_frame.columns else "ionmode"
        )

        # Get a subset of the initial dataframe by polarity
        db_df = self._data_frame[self._data_frame[polarity_column] == polarity].copy()

        # Rename the columns of the db_df to match the MetaboliteMetadata
        # dataclass using the metabolite_metadata_mapping.
        # If the mapping is not provided, use the default mapping.
        if not metabolite_metadata_mapping:
            metabolite_metadata_mapping = {
                "chebi_id": "chebi",
                "kegg_id": "kegg",
                "refmet_name": "common_name",
                "molecular_formula": "formula",
                "gnps_spectra_id": "id",
                "precursormz": "precursor_mz",
                "precursortype": "ion_type",
            }
        db_df.rename(columns=metabolite_metadata_mapping, inplace=True)
        db_df["molecular_data_id"] = db_df["inchikey"]

        # Check if the resulting dataframe has the required columns for the flash entropy search
        required_columns = ["molecular_data_id", "precursor_mz", "ion_type", "id"]
        for col in required_columns:
            if col not in db_df.columns:
                raise ValueError(
                    f"Input field on MSP must contain '{col}' column for FlashEntropy search."
                )

        # Pull out the metabolite metadata from the dataframe into a separate dataframe.
        # First get a list of the possible attributes of the MetaboliteMetadata dataclass.
        metabolite_metadata_keys = list(MetaboliteMetadata.__annotations__.keys())
        # Replace id with molecular_data_id in metabolite_metadata_keys
        metabolite_metadata_keys = [
            "molecular_data_id" if x == "id" else x for x in metabolite_metadata_keys
        ]
        metabolite_metadata_df = db_df[
            db_df.columns[db_df.columns.isin(metabolite_metadata_keys)]
        ].copy()

        # Make unique and recast the id column for metabolite metadata
        metabolite_metadata_df.drop_duplicates(subset=["molecular_data_id"], inplace=True)
        metabolite_metadata_df["id"] = metabolite_metadata_df["molecular_data_id"]

        # Convert to a dictionary using the inchikey as the key
        metabolite_metadata_dict = metabolite_metadata_df.to_dict(orient="records")
        metabolite_metadata_dict = {
            v["id"]: self._dict_to_dataclass(v, MetaboliteMetadata)
            for v in metabolite_metadata_dict
        }

        # Remove the metabolite metadata columns from the original dataframe
        for key in metabolite_metadata_keys:
            if key != "molecular_data_id":
                if key in db_df.columns:
                    db_df.drop(columns=key, inplace=True)

        # Format the spectral library
        format_func = self._get_format_func(format)
        lib = format_func(db_df, normalize=normalize, fe_kwargs=fe_kwargs)
        return (lib, metabolite_metadata_dict)

Interface to parse NIST MSP files

MSPInterface(file_path)
1094    def __init__(self, file_path):
1095        """
1096        Initialize instance.
1097
1098        Parameters
1099        ----------
1100        file_path : str
1101            Path to a local MSP file.
1102
1103        Attributes
1104        ----------
1105        file_path : str
1106            Path to the MSP file.
1107        _file_content : str
1108            Content of the MSP file.
1109        _data_frame : :obj:`~pandas.DataFrame`
1110            DataFrame of spectra from the MSP file with unaltered content.
1111        """
1112        super().__init__(key=None)
1113
1114        self.file_path = file_path
1115        if not os.path.exists(self.file_path):
1116            raise FileNotFoundError(
1117                f"File {self.file_path} does not exist. Please check the file path."
1118            )
1119        with open(self.file_path, "r") as f:
1120            self._file_content = f.read()
1121
1122        self._data_frame = self._read_msp_file()
1123        self.__init_format_map__()

Initialize instance.

Parameters
  • file_path (str): Path to a local MSP file.
Attributes
  • file_path (str): Path to the MSP file.
  • _file_content (str): Content of the MSP file.
  • _data_frame (~pandas.DataFrame): DataFrame of spectra from the MSP file with unaltered content.
file_path
def get_metabolomics_spectra_library( self, polarity, metabolite_metadata_mapping={}, format='fe', normalize=True, fe_kwargs={}):
1369    def get_metabolomics_spectra_library(
1370        self,
1371        polarity,
1372        metabolite_metadata_mapping={},
1373        format="fe",
1374        normalize=True,
1375        fe_kwargs={},
1376    ):
1377        """
1378        Prepare metabolomics spectra library and associated metabolite metadata
1379
1380        Note: this uses the inchikey as the index for the metabolite metadata dataframe and for connecting to the spectra, so it must be in the input
1381
1382        """
1383        # Check if the MSP file is compatible with the get_metabolomics_spectra_library method
1384        self._check_msp_compatibility()
1385
1386        # Check if the polarity parameter is valid and if a polarity column exists in the dataframe
1387        if polarity not in ["positive", "negative"]:
1388            raise ValueError("Polarity must be 'positive' or 'negative'")
1389        polarity_column = (
1390            "polarity" if "polarity" in self._data_frame.columns else "ionmode"
1391        )
1392
1393        # Get a subset of the initial dataframea by polarity
1394        db_df = self._data_frame[self._data_frame[polarity_column] == polarity].copy()
1395
1396        # Rename the columns of the db_df to match the MetaboliteMetadata dataclass using the metabolite_metadata_mapping
1397        # If the mapping is not provided, use the default mapping
1398        if not metabolite_metadata_mapping:
1399            metabolite_metadata_mapping = {
1400                "chebi_id": "chebi",
1401                "kegg_id": "kegg",
1402                "refmet_name": "common_name",
1403                "molecular_formula": "formula",
1404                "gnps_spectra_id":"id",
1405                "precursormz": "precursor_mz",
1406                "precursortype":"ion_type"
1407            }
1408        db_df.rename(columns=metabolite_metadata_mapping, inplace=True)
1409        db_df["molecular_data_id"] = db_df["inchikey"]
1410
1411
1412
1413        # Check if the resulting dataframe has the required columns for the flash entropy search
1414        required_columns = ["molecular_data_id", "precursor_mz", "ion_type", "id"]
1415        for col in required_columns:
1416            if col not in db_df.columns:
1417                raise ValueError(
1418                    f"Input field on MSP must contain '{col}' column for FlashEntropy search."
1419                )
1420
1421        # Pull out the metabolite metadata from the dataframe and put it into a different dataframe
1422        # First get a list of the possible attributes of the MetaboliteMetadata dataclass
1423        metabolite_metadata_keys = list(MetaboliteMetadata.__annotations__.keys())
1424        # Replace id with molecular_data_id in metabolite_metadata_keys
1425        metabolite_metadata_keys = [
1426            "molecular_data_id" if x == "id" else x for x in metabolite_metadata_keys
1427        ]
1428        metabolite_metadata_df = db_df[
1429            db_df.columns[db_df.columns.isin(metabolite_metadata_keys)]
1430        ].copy()
1431
1432        # Make unique and recast the id column for metabolite metadata
1433        metabolite_metadata_df.drop_duplicates(subset=["molecular_data_id"], inplace=True)
1434        metabolite_metadata_df["id"] = metabolite_metadata_df["molecular_data_id"]
1435
1436        # Convert to a dictionary using the inchikey as the key
1437        metabolite_metadata_dict = metabolite_metadata_df.to_dict(
1438            orient="records"
1439        )
1440        metabolite_metadata_dict = {
1441            v["id"]: self._dict_to_dataclass(v, MetaboliteMetadata)
1442            for v in metabolite_metadata_dict
1443        }
1444
1445        # Remove the metabolite metadata columns from the original dataframe
1446        for key in metabolite_metadata_keys:
1447            if key != "molecular_data_id":
1448                if key in db_df.columns:
1449                    db_df.drop(columns=key, inplace=True)
1450
1451        # Format the spectral library
1452        format_func = self._get_format_func(format)
1453        lib = format_func(db_df, normalize=normalize, fe_kwargs=fe_kwargs)
1454        return (lib, metabolite_metadata_dict)

Prepare metabolomics spectra library and associated metabolite metadata

Note: this uses the inchikey as the index for the metabolite metadata dataframe and for connecting metadata to the spectra, so the inchikey field must be present in the input MSP data.