corems.molecular_id.search.database_interfaces

   1import os
   2import re
   3from abc import ABC
   4from io import StringIO
   5from pathlib import Path
   6import time
   7import json
   8
   9import numpy as np
  10import requests
  11import pandas as pd
  12from ms_entropy import FlashEntropySearch
  13
  14from corems.molecular_id.factory.EI_SQL import (
  15    EI_LowRes_SQLite,
  16    Metadatar,
  17    MetaboliteMetadata,
  18)
  19from corems.molecular_id.factory.lipid_molecular_metadata import LipidMetadata
  20from corems.mass_spectra.calc.lc_calc import find_closest
  21
  22
  23class SpectralDatabaseInterface(ABC):
  24    """
  25    Base class that facilitates connection to spectral reference databases,
  26    such as EMSL's Metabolomics Reference Database (MetabRef).
  27
  28    """
  29
  30    def __init__(self, key=None):
  31        """
  32        Initialize instance.
  33
  34        Parameters
  35        ----------
  36        key : str
  37            Token key.
  38
  39        """
  40
  41        self.key = key
  42
  43    def set_token(self, path):
  44        """
  45        Set environment variable for MetabRef database token.
  46
  47        Parameters
  48        ----------
  49        path : str
  50            Path to token.
  51
  52        """
  53
  54        # Read token from file
  55        with open(path, "r", encoding="utf-8") as f:
  56            token = f.readline().strip()
  57
  58        # Set environment variable
  59        os.environ[self.key] = token
  60
  61    def get_token(self):
  62        """
  63        Get environment variable for database token.
  64
  65        Returns
  66        -------
  67        str
  68            Token string.
  69
  70        """
  71
  72        # Check for token
  73        if self.key not in os.environ:
  74            raise ValueError("Must set {} environment variable.".format(self.key))
  75
  76        # Get token from environment variables
  77        return os.environ.get(self.key)
  78
  79    def get_header(self):
  80        """
  81        Access stored database token and prepare as header.
  82
  83        Returns
  84        -------
  85        dict
  86            Header dictionary containing the authorization token and content type.
  87
  88        """
  89
  90        # Get token
  91        token = self.get_token()
  92
  93        # Pad header information
  94        header = {"Authorization": f"Bearer {token}", "Content-Type": "text/plain"}
  95
  96        return header
  97
  98    def get_query(self, url, use_header=True):
  99        """
 100        Request payload from URL according to `get` protocol.
 101
 102        Parameters
 103        ----------
 104        url : str
 105            URL for request.
 106        use_header: bool
 107            Whether or not the query should include the header
 108
 109        Returns
 110        -------
 111        dict
 112            Response as JSON.
 113
 114        """
 115
 116        # Query URL via `get`
 117        if use_header:
 118            response = requests.get(url, headers=self.get_header())
 119        else:
 120            response = requests.get(url)
 121
 122        # Check response
 123        response.raise_for_status()
 124
 125        # Return as JSON
 126        return response.json()
 127
 128    def post_query(self, url, variable, values, tolerance):
 129        """
 130        Request payload from URL according to `post` protocol.
 131
 132        Parameters
 133        ----------
 134        url : str
 135            URL for request.
 136        variable : str
 137            Variable to query.
 138        values : str
 139            Specific values of `variable` to query.
 140        tolerance : str
 141            Query tolerance relative to `values`.
 142
 143        Returns
 144        -------
 145        dict
 146            Response as JSON.
 147
 148        """
 149
 150        # Coerce to string
 151        if not isinstance(variable, str):
 152            variable = str(variable).replace(" ", "")
 153
 154        if not isinstance(values, str):
 155            values = str(values).replace(" ", "")
 156
 157        if not isinstance(tolerance, str):
 158            tolerance = str(tolerance).replace(" ", "")
 159
 160        # Query URL via `post`
 161        response = requests.post(
 162            os.path.join(url, variable, tolerance),
 163            data=values,
 164            headers=self.get_header(),
 165        )
 166
 167        # Check response
 168        response.raise_for_status()
 169
 170        # Return as JSON
 171        return response.json()
 172
 173    def _check_flash_entropy_kwargs(self, fe_kwargs):
 174        """
 175        Check FlashEntropy keyword arguments.
 176
 177        Parameters
 178        ----------
 179        fe_kwargs : dict
 180            Keyword arguments for FlashEntropy search.
 181
 182
 183        Raises
 184        ------
 185        ValueError
 186            If "min_ms2_difference_in_da" or "max_ms2_tolerance_in_da" are present in `fe_kwargs` and they
 187            are not equal.
 188
 189        """
 190        # If "min_ms2_difference_in_da" in fe_kwargs, check that "max_ms2_tolerance_in_da" is also present and that min_ms2_difference_in_da = 2xmax_ms2_tolerance_in_da
 191        if (
 192            "min_ms2_difference_in_da" in fe_kwargs
 193            or "max_ms2_tolerance_in_da" in fe_kwargs
 194        ):
 195            if (
 196                "min_ms2_difference_in_da" not in fe_kwargs
 197                or "max_ms2_tolerance_in_da" not in fe_kwargs
 198            ):
 199                raise ValueError(
 200                    "Both 'min_ms2_difference_in_da' and 'max_ms2_tolerance_in_da' must be specified."
 201                )
 202            if (
 203                fe_kwargs["min_ms2_difference_in_da"]
 204                != 2 * fe_kwargs["max_ms2_tolerance_in_da"]
 205            ):
 206                raise ValueError(
 207                    "The values of 'min_ms2_difference_in_da' must be exactly 2x 'max_ms2_tolerance_in_da'."
 208                )
 209
 210    def _get_format_func(self, format):
 211        """
 212        Obtain format function by key.
 213
 214        Returns
 215        -------
 216        func
 217            Formatting function.
 218        """
 219
 220        if format.lower() in self.format_map.keys():
 221            return self.format_map[format.lower()]
 222
 223        raise ValueError(("{} not a supported format.").format(format))
 224
 225    def _dict_to_dataclass(self, metabref_lib, data_class):
 226        """
 227        Convert dictionary to dataclass.
 228
 229        Notes
 230        -----
 231        This function will pull the attributes of a dataclass and its parent class
 232        and convert the dictionary to a dataclass instance with the appropriate
 233        attributes.
 234
 235        Parameters
 236        ----------
 237        data_class : :obj:`~dataclasses.dataclass`
 238            Dataclass to convert to.
 239        metabref_lib : dict
 240            Metabref dictionary object to convert to dataclass.
 241
 242        Returns
 243        -------
 244        :obj:`~dataclasses.dataclass`
 245            Dataclass instance.
 246
 247        """
 248
 249        # Get list of expected attributes of data_class
 250        data_class_keys = list(data_class.__annotations__.keys())
 251
 252        # Does the data_class inherit from another class, if so, get the attributes of the parent class as well
 253        if len(data_class.__mro__) > 2:
 254            parent_class_keys = list(data_class.__bases__[0].__annotations__.keys())
 255            data_class_keys = list(set(data_class_keys + parent_class_keys))
 256
 257        # Remove keys that are not in the data_class from the input dictionary
 258        input_dict = {k: v for k, v in metabref_lib.items() if k in data_class_keys}
 259
 260        # Add keys that are in the data class but not in the input dictionary as None
 261        for key in data_class_keys:
 262            if key not in input_dict.keys():
 263                input_dict[key] = None
 264        return data_class(**input_dict)
 265
 266    @staticmethod
 267    def normalize_peaks(arr):
 268        """
 269        Normalize peaks in an array.
 270
 271        Parameters
 272        ----------
 273        arr : :obj:`~numpy.array`
 274            Array of shape (N, 2), with m/z in the first column and abundance in
 275            the second.
 276
 277        Returns
 278        -------
 279        :obj:`~numpy.array`
 280            Normalized array of shape (N, 2), with m/z in the first column and
 281            normalized abundance in the second.
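
        Examples
        --------
        A minimal illustrative call; the peak values below are hypothetical.

        >>> import numpy as np
        >>> peaks = np.array([[100.0, 25.0], [200.0, 75.0]])
        >>> SpectralDatabaseInterface.normalize_peaks(peaks)[:, 1].tolist()
        [0.25, 0.75]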
 282        """
 283        # Normalize the array
 284        arr[:, -1] = arr[:, -1] / arr[:, -1].sum()
 285
 286        return arr
 287
 288    @staticmethod
 289    def _build_flash_entropy_index(fe_lib, fe_kwargs={}, clean_spectra=True):
 290        """
 291        Build FlashEntropy index.
 292
 293        Parameters
 294        ----------
 295        fe_lib : list
 296            List of spectra to build index from. Can be a list of dictionaries or
 297            a FlashEntropy search instance.
 298        fe_kwargs : dict, optional
 299            Keyword arguments for FlashEntropy search.
 300        clean_spectra : bool, optional
 301            Clean spectra before building index. Default is True.
 302
 303        Returns
 304        -------
 305        :obj:`~ms_entropy.FlashEntropySearch`
 306            FlashEntropy search instance.
 307
 308        """
 309        # Initialize FlashEntropy
 310        fe_init_kws = [
 311            "max_ms2_tolerance_in_da",
 312            "mz_index_step",
 313            "low_memory",
 314            "path_data",
 315        ]
 316        fe_init_kws = {k: v for k, v in fe_kwargs.items() if k in fe_init_kws}
 317        fes = FlashEntropySearch(**fe_init_kws)
 318
 319        # Build FlashEntropy index
 320        fe_index_kws = [
 321            "max_indexed_mz",
 322            "precursor_ions_removal_da",
 323            "noise_threshold",
 324            "min_ms2_difference_in_da",
 325            "max_peak_num",
 326        ]
 327        fe_index_kws = {k: v for k, v in fe_kwargs.items() if k in fe_index_kws}
 328        fes.build_index(fe_lib, **fe_index_kws, clean_spectra=clean_spectra)
 329
 330        return fes
 331
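
# --- Illustrative sketch (not part of the original module) --------------------
# The helper below shows how the paired FlashEntropy tolerances are expected to
# relate (per _check_flash_entropy_kwargs, 'min_ms2_difference_in_da' must equal
# 2x 'max_ms2_tolerance_in_da') and how _build_flash_entropy_index splits the
# remaining keyword arguments between the FlashEntropySearch constructor and its
# build_index call. The spectra and the function name are hypothetical.
def _example_build_entropy_index():
    # Two toy spectra in the dictionary form consumed by _build_flash_entropy_index
    library = [
        {"precursor_mz": 200.05, "peaks": np.array([[100.0, 1.0], [150.0, 0.5]])},
        {"precursor_mz": 350.10, "peaks": np.array([[120.3, 0.8], [300.4, 0.2]])},
    ]

    # Paired tolerances: min_ms2_difference_in_da == 2 * max_ms2_tolerance_in_da
    fe_kwargs = {
        "max_ms2_tolerance_in_da": 0.01,
        "min_ms2_difference_in_da": 0.02,
    }

    # Returns an ms_entropy.FlashEntropySearch index built over the toy library
    return SpectralDatabaseInterface._build_flash_entropy_index(
        library, fe_kwargs=fe_kwargs
    )
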
 332
 333class MetabRefInterface(SpectralDatabaseInterface):
 334    """
 335    Interface to the Metabolomics Reference Database.
 336    """
 337
 338    def __init__(self):
 339        """
 340        Initialize instance.
 341
 342        """
 343
 344        super().__init__(key=None)
 345
 346    def spectrum_to_array(self, spectrum, normalize=True):
 347        """
 348        Convert MetabRef-formatted spectrum to array.
 349
 350        Parameters
 351        ----------
 352        spectrum : str
 353            MetabRef spectrum, i.e. list of (m/z,abundance) pairs.
 354        normalize : bool
 355            Normalize the spectrum by its magnitude.
 356
 357        Returns
 358        -------
 359        :obj:`~numpy.array`
 360            Array of shape (N, 2), with m/z in the first column and abundance in
 361            the second.
 362
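        Examples
        --------
        A minimal illustrative call with a hypothetical two-peak spectrum string:

        >>> interface = MetabRefInterface()
        >>> interface.spectrum_to_array("(100.1,30)(200.2,70)", normalize=False).tolist()
        [[100.1, 30.0], [200.2, 70.0]]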
 363        """
 364
 365        # Convert parenthesis-delimited string to array
 366        arr = np.array(
 367            re.findall(r"\(([^,]+),([^)]+)\)", spectrum), dtype=float
 368        ).reshape(-1, 2)
 369
 370        if normalize:
 371            arr = self.normalize_peaks(arr)
 372
 373        return arr
 374
 375    def _to_flashentropy(self, metabref_lib, normalize=True, fe_kwargs={}):
 376        """
 377        Convert metabref-formatted library to FlashEntropy library.
 378
 379        Parameters
 380        ----------
 381        metabref_lib : dict
 382            MetabRef MS2 library in JSON format or FlashEntropy search instance (for reformatting at different MS2 separation).
 383        normalize : bool
 384            Normalize each spectrum by its magnitude.
 385        fe_kwargs : dict, optional
 386            Keyword arguments for instantiation of FlashEntropy search and building index for FlashEntropy search;
 387            any keys not recognized will be ignored. By default, all parameters set to defaults.
 388
 389        Returns
 390        -------
 391        :obj:`~ms_entropy.FlashEntropySearch`
 392            MS2 library as FlashEntropy search instance.
 393
 394        Raises
 395        ------
 396        ValueError
 397            If "min_ms2_difference_in_da" or "max_ms2_tolerance_in_da" are present in `fe_kwargs` and they are not equal.
 398
 399        """
 400        self._check_flash_entropy_kwargs(fe_kwargs)
 401
 402        # Initialize empty library
 403        fe_lib = []
 404
 405        # Enumerate spectra
 406        for i, source in enumerate(metabref_lib):
 407            # Reorganize source dict, if necessary
 408            if "spectrum_data" in source.keys():
 409                spectrum = source["spectrum_data"]
 410            else:
 411                spectrum = source
 412
 413            # Rename precursor_mz key for FlashEntropy
 414            if "precursor_mz" not in spectrum.keys():
 415                spectrum["precursor_mz"] = spectrum.pop("precursor_ion")
 416
 417            # Convert CoreMS spectrum to array and clean, store as `peaks`
 418            spectrum["peaks"] = self.spectrum_to_array(
 419                spectrum["mz"], normalize=normalize
 420            )
 421
 422            # Add spectrum to library
 423            fe_lib.append(spectrum)
 424
 425        # Build FlashEntropy index
 426        fe_search = self._build_flash_entropy_index(fe_lib, fe_kwargs=fe_kwargs)
 427
 428        return fe_search
 429
 430    def get_query(self, url, use_header=False):
 431        """Overwrites the get_query method on the parent class to default to not use a header
 432
 433        Notes
 434        -----
 435        As of January 2025, the metabref database no longer requires a token and therefore no header is needed
 436
 437        """
 438        return super().get_query(url, use_header)
 439
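
# --- Illustrative sketch (not part of the original module) --------------------
# Example of converting hypothetical MetabRef-style records, whose 'mz' field is
# the parenthesis-delimited spectrum string, into a FlashEntropy search index via
# MetabRefInterface._to_flashentropy. The records and function name are made up.
def _example_metabref_records_to_flashentropy():
    interface = MetabRefInterface()

    # 'precursor_ion' is renamed to 'precursor_mz' and 'mz' is parsed into an
    # (N, 2) peak array by _to_flashentropy
    records = [
        {"precursor_ion": 200.05, "mz": "(100.1,30)(150.2,70)"},
        {"precursor_ion": 350.10, "mz": "(120.3,80)(300.4,20)"},
    ]

    # FlashEntropySearch index built with default FlashEntropy parameters
    return interface._to_flashentropy(records, normalize=True)
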
 440
 441class MetabRefGCInterface(MetabRefInterface):
 442    """
 443    Interface to the Metabolomics Reference Database for GC-MS data.
 444    """
 445
 446    def __init__(self):
 447        """
 448        Initialize instance.
 449
 450        """
 451
 452        super().__init__()
 453        self.GCMS_LIBRARY_URL = "https://metabref.emsl.pnnl.gov/api/mslevel/1"
 454        self.FAMES_URL = "https://metabref.emsl.pnnl.gov/api/fames"
 455
 456        self.__init_format_map__()
 457
 458    def __init_format_map__(self):
 459        """
 460        Initialize database format mapper, enabling multiple format requests.
 461
 462        """
 463
 464        # Define format workflows
 465        self.format_map = {
 466            "json": lambda x, normalize, fe_kwargs: x,
 467            "dict": lambda x,
 468            normalize,
 469            fe_kwargs: self._to_LowResolutionEICompound_dict(x, normalize),
 470            "sql": lambda x,
 471            normalize,
 472            fe_kwargs: self._LowResolutionEICompound_dict_to_sqlite(
 473                self._to_LowResolutionEICompound_dict(x, normalize)
 474            ),
 475        }
 476
 477        # Add aliases
 478        self.format_map["metabref"] = self.format_map["json"]
 479        self.format_map["datadict"] = self.format_map["dict"]
 480        self.format_map["data-dict"] = self.format_map["dict"]
 481        self.format_map["lowreseicompound"] = self.format_map["dict"]
 482        self.format_map["lowres"] = self.format_map["dict"]
 483        self.format_map["lowresgc"] = self.format_map["dict"]
 484        self.format_map["sqlite"] = self.format_map["sql"]
 485
 486    def available_formats(self):
 487        """
 488        View list of available formats.
 489
 490        Returns
 491        -------
 492        list
 493            Format map keys.
 494        """
 495
 496        return list(self.format_map.keys())
 497
 498    def get_library(self, format="json", normalize=False):
 499        """
 500        Request MetabRef GC/MS library.
 501
 502        Parameters
 503        ----------
 504        format : str
 505            Format of requested library, e.g. "json", "dict", or "sql".
 506            See `available_formats` method for aliases.
 507        normalize : bool
 508            Normalize the spectrum by its magnitude.
 509
 510        Returns
 511        -------
 512        Library in requested format.
 513
 514        """
 515
 516        # Init format function
 517        format_func = self._get_format_func(format)
 518
 519        return format_func(
 520            self.get_query(self.GCMS_LIBRARY_URL)["GC-MS"], normalize, {}
 521        )
 522
 523    def get_fames(self, format="json", normalize=False):
 524        """
 525        Request MetabRef GC/MS FAMEs library.
 526
 527        Parameters
 528        ----------
 529        format : str
 530            Format of requested library, e.g. "json", "dict", or "sql".
 531            See `available_formats` method for aliases.
 532        normalize : bool
 533            Normalize the spectrum by its magnitude.
 534
 535        Returns
 536        -------
 537        Library in requested format.
 538
 539        """
 540
 541        # Init format function
 542        format_func = self._get_format_func(format)
 543
 544        return format_func(self.get_query(self.FAMES_URL)["GC-MS"], normalize, {})
 545
 546    def _to_LowResolutionEICompound_dict(self, metabref_lib, normalize=False):
 547        """
 548        Convert MetabRef-formatted library to CoreMS LowResolutionEICompound-formatted
 549        dictionary for local ingestion.
 550
 551        Parameters
 552        ----------
 553        metabref_lib : dict
 554            MetabRef GC-MS library in JSON format.
 555        normalize : bool
 556            Normalize each spectrum by its magnitude.
 557
 558        Returns
 559        -------
 560        list of dict
 561            List of each spectrum contained in dictionary.
 562
 563        """
 564
 565        # All below key:value lookups are based on CoreMS class definitions
 566        # NOT MetabRef content. For example, MetabRef has keys for PubChem,
 567        # USI, etc. that are not considered below.
 568
 569        # Dictionary to map metabref keys to corems keys
 570        metadatar_cols = {
 571            "casno": "cas",
 572            "inchikey": "inchikey",
 573            "inchi": "inchi",
 574            "chebi": "chebi",
 575            "smiles": "smiles",
 576            "kegg": "kegg",
 577            "iupac_name": "iupac_name",
 578            "traditional_name": "traditional_name",  # Not present in metabref
 579            "common_name": "common_name",  # Not present in metabref
 580        }
 581
 582        # Dictionary to map metabref keys to corems keys
 583        lowres_ei_compound_cols = {
 584            "id": "metabref_id",
 585            "molecule_name": "name",  # Is this correct?
 586            "classify": "classify",  # Not present in metabref
 587            "formula": "formula",
 588            "ri": "ri",
 589            "rt": "retention_time",
 590            "source": "source",  # Not present in metabref
 591            "casno": "casno",
 592            "comments": "comment",
 593            "source_temp_c": "source_temp_c",  # Not present in metabref
 594            "ev": "ev",  # Not present in metabref
 595            "peak_count": "peaks_count",
 596            "mz": "mz",
 597            "abundance": "abundance",
 598        }
 599
 600        # Local result container
 601        corems_lib = []
 602
 603        # Enumerate spectra
 604        for i, source_ in enumerate(metabref_lib):
 605            # Copy source to prevent modification
 606            source = source_.copy()
 607
 608            # Flatten source dict
 609            source = source.pop("spectrum_data") | source
 610
 611            # Parse target data
 612            target = {
 613                lowres_ei_compound_cols[k]: v
 614                for k, v in source.items()
 615                if k in lowres_ei_compound_cols
 616            }
 617
 618            # Explicitly add this to connect with LowResCompoundRef later
 619            target["rt"] = source["rt"]
 620
 621            # Parse (mz, abundance)
 622            arr = self.spectrum_to_array(target["mz"], normalize=normalize)
 623            target["mz"] = arr[:, 0]
 624            target["abundance"] = arr[:, 1]
 625
 626            # Parse meta data
 627            target["metadata"] = {
 628                metadatar_cols[k]: v for k, v in source.items() if k in metadatar_cols
 629            }
 630
 631            # Add anything else
 632            for k in source:
 633                if k not in lowres_ei_compound_cols:
 634                    target[k] = source[k]
 635
 636            # Add to CoreMS list
 637            corems_lib.append(target)
 638
 639        return corems_lib
 640
 641    def _LowResolutionEICompound_dict_to_sqlite(
 642        self, lowres_ei_compound_dict, url="sqlite://"
 643    ):
 644        """
 645        Convert CoreMS LowResolutionEICompound-formatted dictionary to SQLite
 646        database for local ingestion.
 647
 648        Parameters
 649        ----------
 650        lowres_ei_compound_dict : dict
 651            CoreMS GC-MS library formatted for LowResolutionEICompound.
 652        url : str
 653            URL to SQLite prefix.
 654
 655        Returns
 656        -------
 657        sqlite database
 658            Spectra contained in SQLite database.
 659
 660        """
 661
 662        # Dictionary to map corems keys to all-caps keys
 663        capped_cols = {
 664            "name": "NAME",
 665            "formula": "FORM",
 666            "ri": "RI",
 667            "retention_time": "RT",
 668            "source": "SOURCE",
 669            "casno": "CASNO",
 670            "comment": "COMMENT",
 671            "peaks_count": "NUM PEAKS",
 672        }
 673
 674        # Initialize SQLite object
 675        sqlite_obj = EI_LowRes_SQLite(url=url)
 676
 677        # Iterate spectra
 678        for _data_dict in lowres_ei_compound_dict:
 679            # Copy source to prevent modification
 680            data_dict = _data_dict.copy()
 681
 682            # Add missing capped values
 683            for k, v in capped_cols.items():
 684                # Key exists
 685                if k in data_dict:
 686                    # # This will replace the key
 687                    # data_dict[v] = data_dict.pop(k)
 688
 689                    # This will keep both keys
 690                    data_dict[v] = data_dict[k]
 691
 692            # Parse number of peaks
 693            if not data_dict.get("NUM PEAKS"):
 694                data_dict["NUM PEAKS"] = len(data_dict.get("mz"))
 695
 696            # Parse CAS number
 697            if not data_dict.get("CASNO"):
 698                data_dict["CASNO"] = data_dict.get("CAS")
 699
 700            if not data_dict["CASNO"]:
 701                data_dict["CASNO"] = 0
 702
 703            # Build linked metadata table
 704            if "metadata" in data_dict:
 705                if len(data_dict["metadata"]) > 0:
 706                    data_dict["metadatar"] = Metadatar(**data_dict.pop("metadata"))
 707                else:
 708                    data_dict.pop("metadata")
 709
 710            # Attempt addition to sqlite
 711            try:
 712                sqlite_obj.add_compound(data_dict)
 713            except Exception:
 714                print("Could not add compound to SQLite: {}".format(data_dict["NAME"]))
 715
 716        return sqlite_obj
 717
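
# --- Illustrative sketch (not part of the original module) --------------------
# Example of pulling the MetabRef GC-MS FAMEs library in two formats. This
# requires network access to metabref.emsl.pnnl.gov, and the returned content
# depends on the live database; the function name is made up.
def _example_gcms_fames_library():
    gc_db = MetabRefGCInterface()

    # Supported format keys and their aliases ("json", "dict", "sql", ...)
    formats = gc_db.available_formats()

    # Raw MetabRef JSON records
    fames_json = gc_db.get_fames(format="json")

    # The same records ingested into an in-memory SQLite library of
    # low-resolution EI compounds, with spectra normalized to unit sum
    fames_sql = gc_db.get_fames(format="sql", normalize=True)

    return formats, fames_json, fames_sql
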
 718
 719class MetabRefLCInterface(MetabRefInterface):
 720    """
 721    Interface to the Metabolomics Reference Database for LC-MS data.
 722    """
 723
 724    def __init__(self):
 725        """
 726        Initialize instance.
 727
 728        """
 729
 730        super().__init__()
 731
 732        # API endpoint for precursor m/z search
 733        # inputs = mz, tolerance (in Da), polarity, page_no, per_page
 734        self.PRECURSOR_MZ_URL = "https://metabref.emsl.pnnl.gov/api/precursors/m/{}/t/{}/{}?page={}&per_page={}"
 735
 736        # API endpoint for returning full list of precursor m/z values in database
 737        # inputs = polarity, page_no, per_page
 738        self.PRECURSOR_MZ_ALL_URL = (
 739            "https://metabref.emsl.pnnl.gov/api/precursors/{}?page={}&per_page={}"
 740        )
 741
 742        # API endpoint for lipid data
 743        self.LIPID_LIBRARY_URL = "https://metabref.emsl.pnnl.gov/api/lipid/data"
 744
 745        self.__init_format_map__()
 746
 747    def __init_format_map__(self):
 748        """
 749        Initialize database format mapper, enabling multiple format requests.
 750
 751        """
 752
 753        # Define format workflows
 754        self.format_map = {
 755            "json": lambda x, normalize, fe_kwargs: x,
 756            "flashentropy": lambda x, normalize, fe_kwargs: self._to_flashentropy(
 757                x, normalize, fe_kwargs
 758            ),
 759        }
 760
 761        # Add aliases
 762        self.format_map["metabref"] = self.format_map["json"]
 763        self.format_map["fe"] = self.format_map["flashentropy"]
 764        self.format_map["flash-entropy"] = self.format_map["flashentropy"]
 765    
 766    def query_by_precursor(
 767        self, mz_list, polarity, mz_tol_ppm, mz_tol_da_api=0.2, max_per_page=50
 768    ):
 769        """
 770        Query MetabRef by precursor m/z values.
 771
 772        Parameters
 773        ----------
 774        mz_list : list
 775            List of precursor m/z values.
 776        polarity : str
 777            Ionization polarity, either "positive" or "negative".
 778        mz_tol_ppm : float
 779            Tolerance in ppm for each precursor m/z value.
 780            Used for retrieving potential matches from the database.
 781        mz_tol_da_api : float, optional
 782            Maximum tolerance between precursor m/z values for API search, in daltons.
 783            Used to group similar mzs into a single API query for speed. Default is 0.2.
 784        max_per_page : int, optional
 785            Maximum records to return from MetabRef API query at a time.  Default is 50.
 786
 787        Returns
 788        -------
 789        list
 790            List of library entries in original JSON format.
 791        """
 792        raise DeprecationWarning(
 793            "query_by_precursor is deprecated. Use get_lipid_library instead."
 794        )
 795
 796    def request_all_precursors(self, polarity, per_page=50000):
 797        """
 798        Request all precursor m/z values for MS2 spectra from MetabRef.
 799
 800        Parameters
 801        ----------
 802        polarity : str
 803            Ionization polarity, either "positive" or "negative".
 804        per_page : int, optional
 805            Number of records to fetch per call. Default is 50000
 806
 807        Returns
 808        -------
 809        list
 810            List of all precursor m/z values, sorted.
 811        """
 812        raise DeprecationWarning("request_all_precursors is deprecated.")
 813
 814    def post_lipid_query(self, mz_list, polarity, mz_tol_ppm):
 815        """
 816        Post query to get MetabRef lipid spectra.
 817
 818        Parameters
 819        ----------
 820        mz_list : list
 821            List of precursor m/z values.
 822        polarity : str
 823            Ionization polarity, either "positive" or "negative".
 824        mz_tol_ppm : float
 825            Tolerance in ppm for each precursor m/z value.
 826
 827        Returns
 828        -------
 829        download_id : str
 830            Download ID for the lipid library query.
 831
 832        Raises
 833        ------
 834        ValueError
 835            If any input parameter is invalid.
 836            If no download ID is returned.
 837        """
 838        url = self.LIPID_LIBRARY_URL
 839
 840        headers = {
 841            'accept': '*/*',
 842            'Content-Type': 'application/json'
 843        }
 844        
 845        payload = {
 846            "tolerance_ppm": mz_tol_ppm,
 847            "polarity": polarity,
 848            "mz_list": list(set(np.sort(mz_list))) 
 849        }
 850        
 851        try:
 852            response = requests.post(url, headers=headers, json=payload)
 853            response.raise_for_status()  # Raises an HTTPError for bad responses
 854            text = response.text.strip()
 855            # Drop everything before the final space
 856            if not text:
 857                raise ValueError("Empty response from MetabRef lipid library API.")
 858            if " " in text:
 859                text = text.rsplit(" ", 1)[-1]
 860                return text
 861            else:
 862                raise ValueError("Unexpected response format from MetabRef lipid library API.")
 863        except requests.exceptions.RequestException as e:
 864            raise ValueError(f"Error querying MetabRef lipid library: {e}")
 865
 866    def get_lipid_data(self, job_id, attempts=10, delay=5):
 867        """
 868        Get download content from lipid library query from MetabRef using job ID.
 869
 870        Parameters
 871        ----------
 872        job_id : str
 873            Job ID for the lipid library query.
 874            Retrieved from the post_lipid_query method.
 875        attempts : int, optional
 876            Number of attempts to retrieve the data. Default is 10.
 877        delay : int, optional
 878            Delay in seconds between attempts. Default is 5.
 879
 880        Returns
 881        -------
 882        str
 883            Download content from the lipid library query.
 884
 885        Raises
 886        ------
 887        ValueError
 888            If no download content is returned.
 889        """
 890        url = f"https://metabref.emsl.pnnl.gov/api/lipid/data/download/{job_id}"
 891        
 892        # Check the response, if it's 400, try again in 5 seconds.  Try up to 10 times
 893        for attempt in range(attempts):
 894            try:
 895                response = requests.get(url)
 896                response.raise_for_status()  # Raises an HTTPError for bad responses
 897                if response.status_code == 200:
 898                    if response.content == b"Job still running":
 899                        if attempt == attempts - 1:
 900                            raise ValueError("Lipid library job still running after all attempts.")
 901                        time.sleep(delay)
 902                    else:
 903                        lib = response.content
 904                        return lib.decode('utf-8') if isinstance(lib, bytes) else lib
 905                elif response.status_code == 400:
 906                    if attempt < attempts - 1:
 907                        time.sleep(delay)  # Wait before retrying
 908                        continue
 909                    else:
 910                        raise ValueError("Job ID not found or job is still processing.")
 911            except requests.exceptions.RequestException as e:
 912                if attempt < attempts - 1:
 913                    time.sleep(delay)
 914                    continue
 915                else:
 916                    raise ValueError(f"Error retrieving lipid library job: {e}")
 917    
 918    def get_lipid_library(
 919        self,
 920        mz_list,
 921        polarity,
 922        mz_tol_ppm,
 923        mz_tol_da_api=None,
 924        format="json",
 925        normalize=True,
 926        fe_kwargs={},
 927        api_delay=5,
 928        api_attempts=10,
 929    ):
 930        """
 931        Request MetabRef lipid library.
 932
 933        Parameters
 934        ----------
 935        mz_list : list
 936            List of precursor m/z values.
 937        polarity : str
 938            Ionization polarity, either "positive" or "negative".
 939        mz_tol_ppm : float
 940            Tolerance in ppm for each precursor m/z value.
 941            Used for retrieving potential matches from the database.
 942        mz_tol_da_api : float, optional
 943            DEPRECATED.  No longer used, but kept for backwards compatibility.
 944        format : str, optional
 945            Format of requested library, e.g. "json" or "flashentropy".
 946            The aliases "metabref", "fe", and "flash-entropy" are also accepted. Default is "json".
 947        normalize : bool, optional
 948            Normalize the spectrum by its magnitude. Default is True.
 949        fe_kwargs : dict, optional
 950            Keyword arguments for FlashEntropy search. Default is {}.
 951        api_delay : int, optional
 952            Delay in seconds between API attempts. Default is 5.
 953        api_attempts : int, optional
 954            Number of attempts to retrieve the data from the API. Default is 10.
 955
 956        Returns
 957        -------
 958        tuple
 959            Library in requested format and lipid metadata as a LipidMetadata dataclass.
 960
 961        """
 962        # Check for valid types in mz_list, polarity, and mz_tol_ppm
 963        if not isinstance(mz_list, (list, np.ndarray)):
 964            raise ValueError("mz_list must be a list or numpy array")
 965        if not all(isinstance(mz, (float, int)) for mz in mz_list):
 966            raise ValueError("All elements in mz_list must be float or int")
 967        if not isinstance(polarity, str):
 968            raise ValueError("polarity must be a string")
 969        if not isinstance(mz_tol_ppm, (float, int)):
 970            raise ValueError("mz_tol_ppm must be a float or int")
 971        
 972        job_id = self.post_lipid_query(
 973            mz_list=mz_list,
 974            polarity=polarity,
 975            mz_tol_ppm=mz_tol_ppm,
 976        )
 977        
 978        lib = self.get_lipid_data(
 979            job_id=job_id,
 980            attempts=api_attempts,
 981            delay=api_delay,
 982        )
 983        lib = json.loads(lib)
 984
 985        # Pull out lipid metadata from the metabref library and convert to LipidMetadata dataclass
 986        mol_data_dict = lib['molecular_data']
 987        mol_data_dict = {
 988            int(k): self._dict_to_dataclass(v, LipidMetadata)
 989            for k, v in mol_data_dict.items()
 990        }
 991
 992        # Remove lipid metadata from the metabref library
 993        lib = lib['mass_spectrum_data']
 994        # Unpack the 'Lipid Fragments' and 'MSO Data' keys from each entry
 995        for x in lib:
 996            if "Lipid Fragments" in x.keys():
 997                x.update(x.pop("Lipid Fragments"))
 998            if "MSO Data" in x.keys():
 999                x.update(x.pop("MSO Data"))
1000
1001        # Format the spectral library
1002        format_func = self._get_format_func(format)
1003        lib = format_func(lib, normalize=normalize, fe_kwargs=fe_kwargs)
1004        return (lib, mol_data_dict)
1005
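
# --- Illustrative sketch (not part of the original module) --------------------
# Example of querying the MetabRef lipid library for a few precursor m/z values
# and returning the spectra as a FlashEntropy index alongside LipidMetadata. The
# m/z values are hypothetical and the call needs network access to
# metabref.emsl.pnnl.gov; the function name is made up.
def _example_lipid_library_search():
    lc_db = MetabRefLCInterface()

    fe_search, lipid_metadata = lc_db.get_lipid_library(
        mz_list=[760.5851, 496.3398],  # hypothetical precursor m/z values
        polarity="positive",
        mz_tol_ppm=5,
        format="flashentropy",
        normalize=True,
    )

    # fe_search is an ms_entropy.FlashEntropySearch index of the returned spectra;
    # lipid_metadata maps MetabRef molecular ids to LipidMetadata dataclasses
    return fe_search, lipid_metadata
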
1006
1007class MSPInterface(SpectralDatabaseInterface):
1008    """
1009    Interface to parse NIST MSP files
1010    """
1011
1012    def __init__(self, file_path):
1013        """
1014        Initialize instance.
1015
1016        Parameters
1017        ----------
1018        file_path : str
1019            Path to a local MSP file.
1020
1021        Attributes
1022        ----------
1023        file_path : str
1024            Path to the MSP file.
1025        _file_content : str
1026            Content of the MSP file.
1027        _data_frame : :obj:`~pandas.DataFrame`
1028            DataFrame of spectra from the MSP file with unaltered content.
1029        """
1030        super().__init__(key=None)
1031
1032        self.file_path = file_path
1033        if not os.path.exists(self.file_path):
1034            raise FileNotFoundError(
1035                f"File {self.file_path} does not exist. Please check the file path."
1036            )
1037        with open(self.file_path, "r") as f:
1038            self._file_content = f.read()
1039
1040        self._data_frame = self._read_msp_file()
1041        self.__init_format_map__()
1042
1043    def __init_format_map__(self):
1044        """
1045        Initialize database format mapper, enabling multiple format requests.
1046
1047        """
1048
1049        # x is a pandas dataframe similar to self._data_frame format
1050        # Define format workflows
1051        self.format_map = {
1052            "msp": lambda x, normalize, fe_kwargs: self._to_msp(x, normalize),
1053            "flashentropy": lambda x, normalize, fe_kwargs: self._to_flashentropy(
1054                x, normalize, fe_kwargs
1055            ),
1056            "df": lambda x, normalize, fe_kwargs: self._to_df(x, normalize),
1057        }
1058
1059        # Add aliases
1060        self.format_map["fe"] = self.format_map["flashentropy"]
1061        self.format_map["flash-entropy"] = self.format_map["flashentropy"]
1062        self.format_map["dataframe"] = self.format_map["df"]
1063        self.format_map["data-frame"] = self.format_map["df"]
1064
1065    def _read_msp_file(self):
1066        """
1067        Read the MSP file into a pandas DataFrame.
1068
1069        Returns
1070        -------
1071        :obj:`~pandas.DataFrame`
1072            DataFrame of spectra from the MSP file, exactly as it appears in the file (no sorting or filtering).
1073        """
1074        # Parse the MSP file content line by line into a list of spectrum dicts
1075        spectra = []
1076        spectrum = {}
1077
1078        f = StringIO(self._file_content)
1079        for line in f:
1080            line = line.strip()
1081            if not line:
1082                continue  # Skip empty lines
1083
1084            # Handle metadata
1085            if ":" in line:
1086                key, value = line.split(":", 1)
1087                key = key.strip().lower()
1088                value = value.strip()
1089
1090                if key == "name":
1091                    # Save current spectrum and start a new one
1092                    if spectrum:
1093                        spectra.append(spectrum)
1094                    spectrum = {"name": value, "peaks": []}
1095                else:
1096                    spectrum[key] = value
1097
1098            # Handle peak data (assumed to start with a number)
1099            elif line[0].isdigit():
1100                peaks = line.split()
1101                m_z = float(peaks[0])
1102                intensity = float(peaks[1])
1103                spectrum["peaks"].append(([m_z, intensity]))
1104        # Save the last spectrum
1105        if spectrum:
1106            spectra.append(spectrum)
1107
1108        df = pd.DataFrame(spectra)
1109        for column in df.columns:
1110            if column != "peaks":  # Skip 'peaks' column
1111                try:
1112                    df[column] = pd.to_numeric(df[column], errors="raise")
1113                except (ValueError, TypeError):
1114                    pass
1115        return df
1116
1117    def _to_df(self, input_dataframe, normalize=True):
1118        """
1119        Return the MSP-derived library as a DataFrame, optionally normalizing each spectrum.
1120
1121        Parameters
1122        ----------
1123        input_dataframe : :obj:`~pandas.DataFrame`
1124            Input DataFrame containing MSP-formatted spectra.
1125        normalize : bool, optional
1126            Normalize each spectrum by its magnitude.
1127            Default is True.
1128
1129        Returns
1130        -------
1131        :obj:`~pandas.DataFrame`
1132            DataFrame with the desired normalization applied.
1133        """
1134        if not normalize:
1135            return input_dataframe
1136        else:
1137            # Convert to dictionary
1138            db_dict = input_dataframe.to_dict(orient="records")
1139
1140            # Initialize empty library
1141            lib = []
1142
1143            # Enumerate spectra
1144            for i, source in enumerate(db_dict):
1145                spectrum = source
1146                # Check that spectrum["peaks"] exists
1147                if "peaks" not in spectrum.keys():
1148                    raise KeyError(
1149                        "MSP not interpretted correctly, 'peaks' key not found in spectrum, check _dataframe attribute."
1150                    )
1151
1152                # Convert spectrum["peaks"] to numpy array
1153                if not isinstance(spectrum["peaks"], np.ndarray):
1154                    spectrum["peaks"] = np.array(spectrum["peaks"])
1155
1156                # Normalize peaks, if requested
1157                if normalize:
1158                    spectrum["peaks"] = self.normalize_peaks(spectrum["peaks"])
1159                    spectrum["num peaks"] = len(spectrum["peaks"])
1160
1161                # Add spectrum to library
1162                lib.append(spectrum)
1163            
1164            # Convert to DataFrame
1165            df = pd.DataFrame(lib)
1166            return df
1167    
1168    def _to_flashentropy(self, input_dataframe, normalize=True, fe_kwargs={}):
1169        """
1170        Convert MSP-derived library to FlashEntropy library.
1171
1172        Parameters
1173        ----------
1174        input_dataframe : :obj:`~pandas.DataFrame`
1175            Input DataFrame containing MSP-formatted spectra.
1176        normalize : bool
1177            Normalize each spectrum by its magnitude.
1178        fe_kwargs : dict, optional
1179            Keyword arguments for instantiation of FlashEntropy search and building index for FlashEntropy search;
1180            any keys not recognized will be ignored. By default, all parameters set to defaults.
1181
1182        Returns
1183        -------
1184        :obj:`~ms_entropy.FlashEntropySearch`
1185            MS2 library as FlashEntropy search instance.
1186
1187        Raises
1188        ------
1189        ValueError
1190            If "min_ms2_difference_in_da" or "max_ms2_tolerance_in_da" are present in `fe_kwargs` and they
1191        """
1192        self._check_flash_entropy_kwargs(fe_kwargs)
1193
1194        db_df = input_dataframe
1195
1196        # Convert to dictionary
1197        db_dict = db_df.to_dict(orient="records")
1198
1199        # Initialize empty library
1200        fe_lib = []
1201
1202        # Enumerate spectra
1203        for i, source in enumerate(db_dict):
1204            # Reorganize source dict, if necessary
1205            if "spectrum_data" in source.keys():
1206                spectrum = source["spectrum_data"]
1207            else:
1208                spectrum = source
1209
1210            # Rename precursor_mz key for FlashEntropy
1211            if "precursor_mz" not in spectrum.keys():
1212                if "precursormz" in spectrum:
1213                    spectrum["precursor_mz"] = spectrum.pop("precursormz")
1214                elif "precursor_ion" in spectrum:
1215                    spectrum["precursor_mz"] = spectrum.pop("precursor_ion")
1216                else:
1217                    raise KeyError(
1218                        "MSP must have either 'precursormz' or 'precursor_ion' key to be converted to FlashEntropy format."
1219                    )
1220
1221            # Check that spectrum["peaks"] exists
1222            if "peaks" not in spectrum.keys():
1223                raise KeyError(
1224                    "MSP not interpretted correctly, 'peaks' key not found in spectrum, check _dataframe attribute."
1225                )
1226
1227            # Convert spectrum["peaks"] to numpy array
1228            if not isinstance(spectrum["peaks"], np.ndarray):
1229                spectrum["peaks"] = np.array(spectrum["peaks"])
1230
1231            # Normalize peaks, if requested
1232            if normalize:
1233                spectrum["peaks"] = self.normalize_peaks(spectrum["peaks"])
1234
1235            # Add spectrum to library
1236            fe_lib.append(spectrum)
1237
1238        # Build FlashEntropy index
1239        fe_search = self._build_flash_entropy_index(fe_lib, fe_kwargs=fe_kwargs)
1240
1241        return fe_search
1242    
1243    def _check_msp_compatibility(self):
1244        """
1245        Check if the MSP file is compatible with the get_metabolomics_spectra_library method and provide feedback if it is not.
1246        """
1247        # Check polarity
1248        if (
1249            "polarity" not in self._data_frame.columns
1250            and "ionmode" not in self._data_frame.columns
1251        ):
1252            raise ValueError(
1253                "Neither 'polarity' nor 'ionmode' columns found in the input MSP metadata. Please check the file."
1254            )
1255        polarity_column = (
1256            "polarity" if "polarity" in self._data_frame.columns else "ionmode"
1257        )
1258
1259        # Check if polarity_column contents is either "positive" or "negative"
1260        if not all(self._data_frame[polarity_column].isin(["positive", "negative"])):
1261            raise ValueError(
1262                f"Input field on MSP '{polarity_column}' must contain only 'positive' or 'negative' values."
1263            )
1264
1265        # Check if the MSP file contains the required columns for metabolite metadata
1266        # inchikey, by name, not null
1267        # either formula or molecular_formula, not null
1268        if not all(self._data_frame["inchikey"].notnull()):
1269            raise ValueError(
1270                "Input field on MSP 'inchikey' must contain only non-null values."
1271            )
1272        if (
1273            "formula" not in self._data_frame.columns
1274            and "molecular_formula" not in self._data_frame.columns
1275        ):
1276            raise ValueError(
1277                "Input field on MSP must contain either 'formula' or 'molecular_formula' columns."
1278            )
1279        molecular_formula_column = (
1280            "formula" if "formula" in self._data_frame.columns else "molecular_formula"
1281        )
1282        if not all(self._data_frame[molecular_formula_column].notnull()):
1283            raise ValueError(
1284                f"Input field on MSP '{molecular_formula_column}' must contain only non-null values."
1285            )
1286
1287    def get_metabolomics_spectra_library(
1288        self,
1289        polarity,
1290        metabolite_metadata_mapping={},
1291        format="fe",
1292        normalize=True,
1293        fe_kwargs={},
1294    ):
1295        """
1296        Prepare a metabolomics spectral library and the associated metabolite metadata.
1297
1298        Note: the inchikey is used as the index for the metabolite metadata and for connecting metadata to the spectra, so it must be present in the input MSP.
1299
1300        """
1301        # Check if the MSP file is compatible with the get_metabolomics_spectra_library method
1302        self._check_msp_compatibility()
1303
1304        # Check if the polarity parameter is valid and if a polarity column exists in the dataframe
1305        if polarity not in ["positive", "negative"]:
1306            raise ValueError("Polarity must be 'positive' or 'negative'")
1307        polarity_column = (
1308            "polarity" if "polarity" in self._data_frame.columns else "ionmode"
1309        )
1310
1311        # Get a subset of the initial dataframe by polarity
1312        db_df = self._data_frame[self._data_frame[polarity_column] == polarity].copy()
1313
1314        # Rename the columns of the db_df to match the MetaboliteMetadata dataclass using the metabolite_metadata_mapping
1315        # If the mapping is not provided, use the default mapping
1316        if not metabolite_metadata_mapping:
1317            metabolite_metadata_mapping = {
1318                "chebi_id": "chebi",
1319                "kegg_id": "kegg",
1320                "refmet_name": "common_name",
1321                "molecular_formula": "formula",
1322                "gnps_spectra_id":"id",
1323                "precursormz": "precursor_mz",
1324                "precursortype":"ion_type"
1325            }
1326        db_df.rename(columns=metabolite_metadata_mapping, inplace=True)
1327        db_df["molecular_data_id"] = db_df["inchikey"]
1328
1329
1330
1331        # Check if the resulting dataframe has the required columns for the flash entropy search
1332        required_columns = ["molecular_data_id", "precursor_mz", "ion_type", "id"]
1333        for col in required_columns:
1334            if col not in db_df.columns:
1335                raise ValueError(
1336                    f"Input field on MSP must contain '{col}' column for FlashEntropy search."
1337                )
1338
1339        # Pull out the metabolite metadata from the dataframe and put it into a different dataframe
1340        # First get a list of the possible attributes of the MetaboliteMetadata dataclass
1341        metabolite_metadata_keys = list(MetaboliteMetadata.__annotations__.keys())
1342        # Replace id with molecular_data_id in metabolite_metadata_keys
1343        metabolite_metadata_keys = [
1344            "molecular_data_id" if x == "id" else x for x in metabolite_metadata_keys
1345        ]
1346        metabolite_metadata_df = db_df[
1347            db_df.columns[db_df.columns.isin(metabolite_metadata_keys)]
1348        ].copy()
1349
1350        # Make unique and recast the id column for metabolite metadata
1351        metabolite_metadata_df.drop_duplicates(subset=["molecular_data_id"], inplace=True)
1352        metabolite_metadata_df["id"] = metabolite_metadata_df["molecular_data_id"]
1353
1354        # Convert to a dictionary using the inchikey as the key
1355        metabolite_metadata_dict = metabolite_metadata_df.to_dict(
1356            orient="records"
1357        )
1358        metabolite_metadata_dict = {
1359            v["id"]: self._dict_to_dataclass(v, MetaboliteMetadata)
1360            for v in metabolite_metadata_dict
1361        }
1362
1363        # Remove the metabolite metadata columns from the original dataframe
1364        for key in metabolite_metadata_keys:
1365            if key != "molecular_data_id":
1366                if key in db_df.columns:
1367                    db_df.drop(columns=key, inplace=True)
1368
1369        # Format the spectral library
1370        format_func = self._get_format_func(format)
1371        lib = format_func(db_df, normalize=normalize, fe_kwargs=fe_kwargs)
1372        return (lib, metabolite_metadata_dict)
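

# --- Illustrative sketch (not part of the original module) --------------------
# End-to-end example of writing a tiny, hypothetical MSP record to disk, parsing
# it with MSPInterface, and building a FlashEntropy library plus metabolite
# metadata from it. The record content, file path, and function name are
# illustrative only.
def _example_msp_to_flashentropy(out_dir="."):
    msp_text = (
        "NAME: Example metabolite\n"
        "PRECURSORMZ: 180.0634\n"
        "PRECURSORTYPE: [M+H]+\n"
        "IONMODE: positive\n"
        "INCHIKEY: WQZGKKKJIJFFOK-GASJEMHNSA-N\n"
        "FORMULA: C6H12O6\n"
        "GNPS_SPECTRA_ID: EXAMPLE0001\n"
        "Num Peaks: 2\n"
        "60.0211 20\n"
        "120.0423 100\n"
    )
    msp_path = os.path.join(out_dir, "example_library.msp")
    with open(msp_path, "w", encoding="utf-8") as f:
        f.write(msp_text)

    msp_db = MSPInterface(msp_path)

    # Returns an ms_entropy.FlashEntropySearch index and an
    # {inchikey: MetaboliteMetadata} dictionary
    return msp_db.get_metabolomics_spectra_library(
        polarity="positive", format="fe", normalize=True
    )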
31    def __init__(self, key=None):
32        """
33        Initialize instance.
34
35        Parameters
36        ----------
37        key : str
38            Token key.
39
40        """
41
42        self.key = key

Initialize instance.

Parameters
  • key (str): Token key.
key
def set_token(self, path):
44    def set_token(self, path):
45        """
46        Set environment variable for MetabRef database token.
47
48        Parameters
49        ----------
50        path : str
51            Path to token.
52
53        """
54
55        # Read token from file
56        with open(path, "r", encoding="utf-8") as f:
57            token = f.readline().strip()
58
59        # Set environment variable
60        os.environ[self.key] = token

Set environment variable for MetabRef database token.

Parameters
  • path (str): Path to token.
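
A minimal usage sketch of the token workflow (the subclass name, environment-variable key, and token file name below are hypothetical, not part of CoreMS):

    import os

    # Hypothetical subclass that names the environment variable holding its token
    class MyDatabaseInterface(SpectralDatabaseInterface):
        def __init__(self):
            super().__init__(key="MYDB_TOKEN")

    db = MyDatabaseInterface()
    db.set_token("mydb_token.txt")      # file whose first line is the token
    assert "MYDB_TOKEN" in os.environ   # set_token stores the token in the environment
    print(db.get_token())               # get_token reads it back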
def get_token(self):
62    def get_token(self):
63        """
64        Get environment variable for database token.
65
66        Returns
67        -------
68        str
69            Token string.
70
71        """
72
73        # Check for token
74        if self.key not in os.environ:
75            raise ValueError("Must set {} environment variable.".format(self.key))
76
77        # Get token from environment variables
78        return os.environ.get(self.key)

Get environment variable for database token.

Returns
  • str: Token string.
def get_header(self):
80    def get_header(self):
81        """
82        Access stored database token and prepare as header.
83
84        Returns
85        -------
86        str
87            Header string.
88
89        """
90
91        # Get token
92        token = self.get_token()
93
94        # Pad header information
95        header = {"Authorization": f"Bearer {token}", "Content-Type": "text/plain"}
96
97        return header

Access stored database token and prepare as header.

Returns
  • str: Header string.
def get_query(self, url, use_header=True):
 99    def get_query(self, url, use_header=True):
100        """
101        Request payload from URL according to `get` protocol.
102
103        Parameters
104        ----------
105        url : str
106            URL for request.
107        use_header: bool
108            Whether or not the query should include the header
109
110        Returns
111        -------
112        dict
113            Response as JSON.
114
115        """
116
117        # Query URL via `get`
118        if use_header:
119            response = requests.get(url, headers=self.get_header())
120        else:
121            response = requests.get(url)
122
123        # Check response
124        response.raise_for_status()
125
126        # Return as JSON
127        return response.json()

Request payload from URL according to get protocol.

Parameters
  • url (str): URL for request.
  • use_header (bool): Whether or not the query should include the header
Returns
  • dict: Response as JSON.
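
For example, reusing the hypothetical subclass from the sketch above (the URL is a placeholder; a real endpoint, and a stored token when use_header=True, are required):

    db = MyDatabaseInterface()

    # GET without an Authorization header
    payload = db.get_query("https://example.org/api/records", use_header=False)

    # GET with the stored token attached as a Bearer header
    payload = db.get_query("https://example.org/api/records")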
def post_query(self, url, variable, values, tolerance):
129    def post_query(self, url, variable, values, tolerance):
130        """
131        Request payload from URL according to `post` protocol.
132
133        Parameters
134        ----------
135        url : str
136            URL for request.
137        variable : str
138            Variable to query.
139        values : str
140            Specific values of `variable` to query.
141        tolerance : str
142            Query tolerance relative to `values`.
143
144        Returns
145        -------
146        dict
147            Response as JSON.
148
149        """
150
151        # Coerce to string
152        if not isinstance(variable, str):
153            variable = str(variable).replace(" ", "")
154
155        if not isinstance(values, str):
156            values = str(values).replace(" ", "")
157
158        if not isinstance(tolerance, str):
159            tolerance = str(tolerance).replace(" ", "")
160
161        # Query URL via `post`
162        response = requests.post(
163            os.path.join(url, variable, tolerance),
164            data=values,
165            headers=self.get_header(),
166        )
167
168        # Check response
169        response.raise_for_status()
170
171        # Return as JSON
172        return response.json()

Request payload from URL according to post protocol.

Parameters
  • url (str): URL for request.
  • variable (str): Variable to query.
  • values (str): Specific values of variable to query.
  • tolerance (str): Query tolerance relative to values.
Returns
  • dict: Response as JSON.
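
A sketch of how the request is assembled, again with the hypothetical subclass and a placeholder endpoint: non-string arguments are coerced to strings with spaces removed, variable and tolerance are appended to the URL, and values is sent as the request body.

    db = MyDatabaseInterface()
    payload = db.post_query(
        "https://example.org/api/search",   # url
        "mz",                               # variable, appended to the URL path
        [100.1, 200.2],                     # values, coerced to "[100.1,200.2]" and posted as the body
        0.01,                               # tolerance, appended to the URL path
    )
    # Resulting request (on a POSIX system):
    # POST https://example.org/api/search/mz/0.01 with body "[100.1,200.2]"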
@staticmethod
def normalize_peaks(arr):
267    @staticmethod
268    def normalize_peaks(arr):
269        """
270        Normalize peaks in an array.
271
272        Parameters
273        ----------
274        arr : :obj:`~numpy.array`
275            Array of shape (N, 2), with m/z in the first column and abundance in
276            the second.
277
278        Returns
279        -------
280        :obj:`~numpy.array`
281            Normalized array of shape (N, 2), with m/z in the first column and
282            normalized abundance in the second.
283        """
284        # Normalize the array
285        arr[:, -1] = arr[:, -1] / arr[:, -1].sum()
286
287        return arr

Normalize peaks in an array.

Parameters
  • arr (~numpy.array): Array of shape (N, 2), with m/z in the first column and abundance in the second.
Returns
  • ~numpy.array: Normalized array of shape (N, 2), with m/z in the first column and normalized abundance in the second.
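
A quick worked example: the abundance column is divided by its sum, so it sums to 1 after the call. Note that the input array is modified in place.

    import numpy as np

    peaks = np.array([[100.0, 30.0],
                      [150.0, 70.0]])
    normalized = SpectralDatabaseInterface.normalize_peaks(peaks)
    # normalized[:, 1] is now [0.3, 0.7]; `peaks` itself is modified as well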
class MetabRefInterface(SpectralDatabaseInterface):
334class MetabRefInterface(SpectralDatabaseInterface):
335    """
336    Interface to the Metabolomics Reference Database.
337    """
338
339    def __init__(self):
340        """
341        Initialize instance.
342
343        """
344
345        super().__init__(key=None)
346
347    def spectrum_to_array(self, spectrum, normalize=True):
348        """
349        Convert MetabRef-formatted spectrum to array.
350
351        Parameters
352        ----------
353        spectrum : str
354            MetabRef spectrum, i.e. list of (m/z,abundance) pairs.
355        normalize : bool
356            Normalize the spectrum by its magnitude.
357
358        Returns
359        -------
360        :obj:`~numpy.array`
361            Array of shape (N, 2), with m/z in the first column and abundance in
362            the second.
363
364        """
365
366        # Convert parenthesis-delimited string to array
367        arr = np.array(
368            re.findall(r"\(([^,]+),([^)]+)\)", spectrum), dtype=float
369        ).reshape(-1, 2)
370
371        if normalize:
372            arr = self.normalize_peaks(arr)
373
374        return arr
375
376    def _to_flashentropy(self, metabref_lib, normalize=True, fe_kwargs={}):
377        """
378        Convert metabref-formatted library to FlashEntropy library.
379
380        Parameters
381        ----------
382        metabref_lib : dict
383            MetabRef MS2 library in JSON format or FlashEntropy search instance (for reformatting at different MS2 separation).
384        normalize : bool
385            Normalize each spectrum by its magnitude.
386        fe_kwargs : dict, optional
387            Keyword arguments for instantiation of FlashEntropy search and building index for FlashEntropy search;
388            any keys not recognized will be ignored. By default, all parameters set to defaults.
389
390        Returns
391        -------
392        :obj:`~ms_entropy.FlashEntropySearch`
393            MS2 library as FlashEntropy search instance.
394
395        Raises
396        ------
397        ValueError
398            If only one of "min_ms2_difference_in_da" and "max_ms2_tolerance_in_da" is present in `fe_kwargs`, or if "min_ms2_difference_in_da" is not exactly 2x "max_ms2_tolerance_in_da".
399
400        """
401        self._check_flash_entropy_kwargs(fe_kwargs)
402
403        # Initialize empty library
404        fe_lib = []
405
406        # Enumerate spectra
407        for i, source in enumerate(metabref_lib):
408            # Reorganize source dict, if necessary
409            if "spectrum_data" in source.keys():
410                spectrum = source["spectrum_data"]
411            else:
412                spectrum = source
413
414            # Rename precursor_mz key for FlashEntropy
415            if "precursor_mz" not in spectrum.keys():
416                spectrum["precursor_mz"] = spectrum.pop("precursor_ion")
417
418            # Convert CoreMS spectrum to array and clean, store as `peaks`
419            spectrum["peaks"] = self.spectrum_to_array(
420                spectrum["mz"], normalize=normalize
421            )
422
423            # Add spectrum to library
424            fe_lib.append(spectrum)
425
426        # Build FlashEntropy index
427        fe_search = self._build_flash_entropy_index(fe_lib, fe_kwargs=fe_kwargs)
428
429        return fe_search
430
431    def get_query(self, url, use_header=False):
432        """Overrides the parent class `get_query` method to default to not using a header.
433
434        Notes
435        -----
436        As of January 2025, the MetabRef database no longer requires a token, so no header is needed.
437
438        """
439        return super().get_query(url, use_header)

Interface to the Metabolomics Reference Database.

MetabRefInterface()
339    def __init__(self):
340        """
341        Initialize instance.
342
343        """
344
345        super().__init__(key=None)

Initialize instance.

def spectrum_to_array(self, spectrum, normalize=True):
347    def spectrum_to_array(self, spectrum, normalize=True):
348        """
349        Convert MetabRef-formatted spectrum to array.
350
351        Parameters
352        ----------
353        spectrum : str
354            MetabRef spectrum, i.e. list of (m/z,abundance) pairs.
355        normalize : bool
356            Normalize the spectrum by its magnitude.
357
358        Returns
359        -------
360        :obj:`~numpy.array`
361            Array of shape (N, 2), with m/z in the first column and abundance in
362            the second.
363
364        """
365
366        # Convert parenthesis-delimited string to array
367        arr = np.array(
368            re.findall(r"\(([^,]+),([^)]+)\)", spectrum), dtype=float
369        ).reshape(-1, 2)
370
371        if normalize:
372            arr = self.normalize_peaks(arr)
373
374        return arr

Convert MetabRef-formatted spectrum to array.

Parameters
  • spectrum (str): MetabRef spectrum, i.e. list of (m/z,abundance) pairs.
  • normalize (bool): Normalize the spectrum by its magnitude.
Returns
  • ~numpy.array: Array of shape (N, 2), with m/z in the first column and abundance in the second.
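
For example, a MetabRef spectrum string of (m/z,abundance) pairs parses as follows:

    iface = MetabRefInterface()
    arr = iface.spectrum_to_array("(100.0,30)(150.0,70)", normalize=True)
    # arr == [[100.0, 0.3], [150.0, 0.7]]: m/z in the first column,
    # abundance normalized to sum to 1 in the second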
def get_query(self, url, use_header=False):
431    def get_query(self, url, use_header=False):
432        """Overrides the parent class `get_query` method to default to not using a header.
433
434        Notes
435        -----
436        As of January 2025, the MetabRef database no longer requires a token, so no header is needed.
437
438        """
439        return super().get_query(url, use_header)

Overrides the parent class get_query method to default to not using a header.

Notes

As of January 2025, the MetabRef database no longer requires a token, so no header is needed.
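
The class also provides the `_to_flashentropy` helper shown in the source listing above, which forwards `fe_kwargs` to FlashEntropy. A sketch of a valid keyword set (values are illustrative), keeping the required relationship min_ms2_difference_in_da = 2 x max_ms2_tolerance_in_da:

    fe_kwargs = {
        "max_ms2_tolerance_in_da": 0.02,
        "min_ms2_difference_in_da": 0.04,  # must be exactly 2x max_ms2_tolerance_in_da
        "noise_threshold": 0.01,
    }
    # Passing only one of the two tolerance keys, or values that break the 2x rule,
    # raises ValueError in _check_flash_entropy_kwargs.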

class MetabRefGCInterface(MetabRefInterface):
442class MetabRefGCInterface(MetabRefInterface):
443    """
444    Interface to the Metabolomics Reference Database for GC-MS data.
445    """
446
447    def __init__(self):
448        """
449        Initialize instance.
450
451        """
452
453        super().__init__()
454        self.GCMS_LIBRARY_URL = "https://metabref.emsl.pnnl.gov/api/mslevel/1"
455        self.FAMES_URL = "https://metabref.emsl.pnnl.gov/api/fames"
456
457        self.__init_format_map__()
458
459    def __init_format_map__(self):
460        """
461        Initialize database format mapper, enabling multiple format requests.
462
463        """
464
465        # Define format workflows
466        self.format_map = {
467            "json": lambda x, normalize, fe_kwargs: x,
468            "dict": lambda x,
469            normalize,
470            fe_kwargs: self._to_LowResolutionEICompound_dict(x, normalize),
471            "sql": lambda x,
472            normalize,
473            fe_kwargs: self._LowResolutionEICompound_dict_to_sqlite(
474                self._to_LowResolutionEICompound_dict(x, normalize)
475            ),
476        }
477
478        # Add aliases
479        self.format_map["metabref"] = self.format_map["json"]
480        self.format_map["datadict"] = self.format_map["dict"]
481        self.format_map["data-dict"] = self.format_map["dict"]
482        self.format_map["lowreseicompound"] = self.format_map["dict"]
483        self.format_map["lowres"] = self.format_map["dict"]
484        self.format_map["lowresgc"] = self.format_map["dict"]
485        self.format_map["sqlite"] = self.format_map["sql"]
486
487    def available_formats(self):
488        """
489        View list of available formats.
490
491        Returns
492        -------
493        list
494            Format map keys.
495        """
496
497        return list(self.format_map.keys())
498
499    def get_library(self, format="json", normalize=False):
500        """
501        Request MetabRef GC/MS library.
502
503        Parameters
504        ----------
505        format : str
506            Format of requested library, e.g. "json", "dict", "sql".
507            See `available_formats` method for aliases.
508        normalize : bool
509            Normalize the spectrum by its magnitude.
510
511        Returns
512        -------
513        Library in requested format.
514
515        """
516
517        # Init format function
518        format_func = self._get_format_func(format)
519
520        return format_func(
521            self.get_query(self.GCMS_LIBRARY_URL)["GC-MS"], normalize, {}
522        )
523
524    def get_fames(self, format="json", normalize=False):
525        """
526        Request MetabRef GC/MS FAMEs library.
527
528        Parameters
529        ----------
530        format : str
531            Format of requested library, e.g. "json", "dict", "sql".
532            See `available_formats` method for aliases.
533        normalize : bool
534            Normalize the spectrum by its magnitude.
535
536        Returns
537        -------
538        Library in requested format.
539
540        """
541
542        # Init format function
543        format_func = self._get_format_func(format)
544
545        return format_func(self.get_query(self.FAMES_URL)["GC-MS"], normalize, {})
546
547    def _to_LowResolutionEICompound_dict(self, metabref_lib, normalize=False):
548        """
549        Convert MetabRef-formatted library to CoreMS LowResolutionEICompound-formatted
550        dictionary for local ingestion.
551
552        Parameters
553        ----------
554        metabref_lib : dict
555            MetabRef GC-MS library in JSON format.
556        normalize : bool
557            Normalize each spectrum by its magnitude.
558
559        Returns
560        -------
561        list of dict
562            List of each spectrum contained in dictionary.
563
564        """
565
566        # All below key:value lookups are based on CoreMS class definitions
567        # NOT MetabRef content. For example, MetabRef has keys for PubChem,
568        # USI, etc. that are not considered below.
569
570        # Dictionary to map metabref keys to corems keys
571        metadatar_cols = {
572            "casno": "cas",
573            "inchikey": "inchikey",
574            "inchi": "inchi",
575            "chebi": "chebi",
576            "smiles": "smiles",
577            "kegg": "kegg",
578            "iupac_name": "iupac_name",
579            "traditional_name": "traditional_name",  # Not present in metabref
580            "common_name": "common_name",  # Not present in metabref
581        }
582
583        # Dictionary to map metabref keys to corems keys
584        lowres_ei_compound_cols = {
585            "id": "metabref_id",
586            "molecule_name": "name",  # Is this correct?
587            "classify": "classify",  # Not present in metabref
588            "formula": "formula",
589            "ri": "ri",
590            "rt": "retention_time",
591            "source": "source",  # Not present in metabref
592            "casno": "casno",
593            "comments": "comment",
594            "source_temp_c": "source_temp_c",  # Not present in metabref
595            "ev": "ev",  # Not present in metabref
596            "peak_count": "peaks_count",
597            "mz": "mz",
598            "abundance": "abundance",
599        }
600
601        # Local result container
602        corems_lib = []
603
604        # Enumerate spectra
605        for i, source_ in enumerate(metabref_lib):
606            # Copy source to prevent modification
607            source = source_.copy()
608
609            # Flatten source dict
610            source = source.pop("spectrum_data") | source
611
612            # Parse target data
613            target = {
614                lowres_ei_compound_cols[k]: v
615                for k, v in source.items()
616                if k in lowres_ei_compound_cols
617            }
618
619            # Explicitly add this to connect with LowResCompoundRef later
620            target["rt"] = source["rt"]
621
622            # Parse (mz, abundance)
623            arr = self.spectrum_to_array(target["mz"], normalize=normalize)
624            target["mz"] = arr[:, 0]
625            target["abundance"] = arr[:, 1]
626
627            # Parse meta data
628            target["metadata"] = {
629                metadatar_cols[k]: v for k, v in source.items() if k in metadatar_cols
630            }
631
632            # Add anything else
633            for k in source:
634                if k not in lowres_ei_compound_cols:
635                    target[k] = source[k]
636
637            # Add to CoreMS list
638            corems_lib.append(target)
639
640        return corems_lib
641
642    def _LowResolutionEICompound_dict_to_sqlite(
643        self, lowres_ei_compound_dict, url="sqlite://"
644    ):
645        """
646        Convert CoreMS LowResolutionEICompound-formatted dictionary to SQLite
647        database for local ingestion.
648
649        Parameters
650        ----------
651        lowres_ei_compound_dict : dict
652            CoreMS GC-MS library formatted for LowResolutionEICompound.
653        url : str
654            URL to SQLite prefix.
655
656        Returns
657        -------
658        sqlite database
659            Spectra contained in SQLite database.
660
661        """
662
663        # Dictionary to map corems keys to all-caps keys
664        capped_cols = {
665            "name": "NAME",
666            "formula": "FORM",
667            "ri": "RI",
668            "retention_time": "RT",
669            "source": "SOURCE",
670            "casno": "CASNO",
671            "comment": "COMMENT",
672            "peaks_count": "NUM PEAKS",
673        }
674
675        # Initialize SQLite object
676        sqlite_obj = EI_LowRes_SQLite(url=url)
677
678        # Iterate spectra
679        for _data_dict in lowres_ei_compound_dict:
680            # Copy source to prevent modification
681            data_dict = _data_dict.copy()
682
683            # Add missing capped values
684            for k, v in capped_cols.items():
685                # Key exists
686                if k in data_dict:
687                    # # This will replace the key
688                    # data_dict[v] = data_dict.pop(k)
689
690                    # This will keep both keys
691                    data_dict[v] = data_dict[k]
692
693            # Parse number of peaks
694            if not data_dict.get("NUM PEAKS"):
695                data_dict["NUM PEAKS"] = len(data_dict.get("mz"))
696
697            # Parse CAS number
698            if not data_dict.get("CASNO"):
699                data_dict["CASNO"] = data_dict.get("CAS")
700
701            if not data_dict["CASNO"]:
702                data_dict["CASNO"] = 0
703
704            # Build linked metadata table
705            if "metadata" in data_dict:
706                if len(data_dict["metadata"]) > 0:
707                    data_dict["metadatar"] = Metadatar(**data_dict.pop("metadata"))
708                else:
709                    data_dict.pop("metadata")
710
711            # Attempt addition to sqlite
712            try:
713                sqlite_obj.add_compound(data_dict)
714            except:
715                print(data_dict["NAME"])
716
717        return sqlite_obj

Interface to the Metabolomics Reference Database for GC-MS data.

MetabRefGCInterface()
447    def __init__(self):
448        """
449        Initialize instance.
450
451        """
452
453        super().__init__()
454        self.GCMS_LIBRARY_URL = "https://metabref.emsl.pnnl.gov/api/mslevel/1"
455        self.FAMES_URL = "https://metabref.emsl.pnnl.gov/api/fames"
456
457        self.__init_format_map__()

Initialize instance.

GCMS_LIBRARY_URL
FAMES_URL
def available_formats(self):
487    def available_formats(self):
488        """
489        View list of available formats.
490
491        Returns
492        -------
493        list
494            Format map keys.
495        """
496
497        return list(self.format_map.keys())

View list of available formats.

Returns
  • list: Format map keys.
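
For example, the keys and aliases defined in __init_format_map__ are returned directly:

    gc = MetabRefGCInterface()
    print(gc.available_formats())
    # ['json', 'dict', 'sql', 'metabref', 'datadict', 'data-dict',
    #  'lowreseicompound', 'lowres', 'lowresgc', 'sqlite']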
def get_library(self, format='json', normalize=False):
499    def get_library(self, format="json", normalize=False):
500        """
501        Request MetabRef GC/MS library.
502
503        Parameters
504        ----------
505        format : str
506            Format of requested library, e.g. "json", "dict", "sql".
507            See `available_formats` method for aliases.
508        normalize : bool
509            Normalize the spectrum by its magnitude.
510
511        Returns
512        -------
513        Library in requested format.
514
515        """
516
517        # Init format function
518        format_func = self._get_format_func(format)
519
520        return format_func(
521            self.get_query(self.GCMS_LIBRARY_URL)["GC-MS"], normalize, {}
522        )

Request MetabRef GC/MS library.

Parameters
  • format (str): Format of requested library, e.g. "json", "dict", "sql". See available_formats method for aliases.
  • normalize (bool): Normalize the spectrum by its magnitude.
Returns
  • Library in requested format.
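
A usage sketch (requires network access to the MetabRef API):

    gc = MetabRefGCInterface()

    # Raw MetabRef JSON
    lib_json = gc.get_library(format="json")

    # Same library as an in-memory SQLite database of LowResolutionEICompound records
    lib_sql = gc.get_library(format="sql", normalize=True)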
def get_fames(self, format='json', normalize=False):
524    def get_fames(self, format="json", normalize=False):
525        """
526        Request MetabRef GC/MS FAMEs library.
527
528        Parameters
529        ----------
530        format : str
531            Format of requested library, e.g. "json", "dict", "sql".
532            See `available_formats` method for aliases.
533        normalize : bool
534            Normalize the spectrum by its magnitude.
535
536        Returns
537        -------
538        Library in requested format.
539
540        """
541
542        # Init format function
543        format_func = self._get_format_func(format)
544
545        return format_func(self.get_query(self.FAMES_URL)["GC-MS"], normalize, {})

Request MetabRef GC/MS FAMEs library.

Parameters
  • format (str): Format of requested library, e.g. "json", "dict", "sql". See available_formats method for aliases.
  • normalize (bool): Normalize the spectrum by its magnitude.
Returns
  • Library in requested format.
class MetabRefLCInterface(MetabRefInterface):
 720class MetabRefLCInterface(MetabRefInterface):
 721    """
 722    Interface to the Metabolomics Reference Database for LC-MS data.
 723    """
 724
 725    def __init__(self):
 726        """
 727        Initialize instance.
 728
 729        """
 730
 731        super().__init__()
 732
 733        # API endpoint for precursor m/z search
 734        # inputs = mz, tolerance (in Da), polarity, page_no, per_page
 735        self.PRECURSOR_MZ_URL = "https://metabref.emsl.pnnl.gov/api/precursors/m/{}/t/{}/{}?page={}&per_page={}"
 736
 737        # API endpoint for returning full list of precursor m/z values in database
 738        # inputs = polarity, page_no, per_page
 739        self.PRECURSOR_MZ_ALL_URL = (
 740            "https://metabref.emsl.pnnl.gov/api/precursors/{}?page={}&per_page={}"
 741        )
 742
 743        # API endpoint for lipid data
 744        self.LIPID_LIBRARY_URL = "https://metabref.emsl.pnnl.gov/api/lipid/data"
 745
 746        self.__init_format_map__()
 747
 748    def __init_format_map__(self):
 749        """
 750        Initialize database format mapper, enabling multiple format requests.
 751
 752        """
 753
 754        # Define format workflows
 755        self.format_map = {
 756            "json": lambda x, normalize, fe_kwargs: x,
 757            "flashentropy": lambda x, normalize, fe_kwargs: self._to_flashentropy(
 758                x, normalize, fe_kwargs
 759            ),
 760        }
 761
 762        # Add aliases
 763        self.format_map["metabref"] = self.format_map["json"]
 764        self.format_map["fe"] = self.format_map["flashentropy"]
 765        self.format_map["flash-entropy"] = self.format_map["flashentropy"]
 766    
 767    def query_by_precursor(
 768        self, mz_list, polarity, mz_tol_ppm, mz_tol_da_api=0.2, max_per_page=50
 769    ):
 770        """
 771        Query MetabRef by precursor m/z values.
 772
 773        Parameters
 774        ----------
 775        mz_list : list
 776            List of precursor m/z values.
 777        polarity : str
 778            Ionization polarity, either "positive" or "negative".
 779        mz_tol_ppm : float
 780            Tolerance in ppm for each precursor m/z value.
 781            Used for retrieving a potential match from the database.
 782        mz_tol_da_api : float, optional
 783            Maximum tolerance between precursor m/z values for API search, in daltons.
 784            Used to group similar mzs into a single API query for speed. Default is 0.2.
 785        max_per_page : int, optional
 786            Maximum records to return from MetabRef API query at a time.  Default is 50.
 787
 788        Returns
 789        -------
 790        list
 791            List of library entries in original JSON format.
 792        """
 793        raise DeprecationWarning(
 794            "query_by_precursor is deprecated. Use get_lipid_library instead."
 795        )
 796
 797    def request_all_precursors(self, polarity, per_page=50000):
 798        """
 799        Request all precursor m/z values for MS2 spectra from MetabRef.
 800
 801        Parameters
 802        ----------
 803        polarity : str
 804            Ionization polarity, either "positive" or "negative".
 805        per_page : int, optional
 806            Number of records to fetch per call. Default is 50000
 807
 808        Returns
 809        -------
 810        list
 811            List of all precursor m/z values, sorted.
 812        """
 813        raise DeprecationWarning("request_all_precursors is deprecated.")
 814
 815    def post_lipid_query(self, mz_list, polarity, mz_tol_ppm):
 816        """
 817        Post query to get MetabRef lipid spectra.
 818
 819        Parameters
 820        ----------
 821        mz_list : list
 822            List of precursor m/z values.
 823        polarity : str
 824            Ionization polarity, either "positive" or "negative".
 825        mz_tol_ppm : float
 826            Tolerance in ppm for each precursor m/z value.
 827
 828        Returns
 829        -------
 830        download_id : str
 831            Download ID for the lipid library query.
 832
 833        Raises
 834        ------
 835        ValueError
 836            If any input parameter is invalid.
 837            If no download ID is returned.
 838        """
 839        url = self.LIPID_LIBRARY_URL
 840
 841        headers = {
 842            'accept': '*/*',
 843            'Content-Type': 'application/json'
 844        }
 845        
 846        payload = {
 847            "tolerance_ppm": mz_tol_ppm,
 848            "polarity": polarity,
 849            "mz_list": list(set(np.sort(mz_list))) 
 850        }
 851        
 852        try:
 853            response = requests.post(url, headers=headers, json=payload)
 854            response.raise_for_status()  # Raises an HTTPError for bad responses
 855            text = response.text.strip()
 856            # Drop everything before the final space
 857            if not text:
 858                raise ValueError("Empty response from MetabRef lipid library API.")
 859            if " " in text:
 860                text = text.rsplit(" ", 1)[-1]
 861                return text
 862            else:
 863                raise ValueError("Unexpected response format from MetabRef lipid library API.")
 864        except requests.exceptions.RequestException as e:
 865            raise ValueError(f"Error querying MetabRef lipid library: {e}")
 866
 867    def get_lipid_data(self, job_id, attempts=10, delay=5):
 868        """
 869        Get download content from lipid library query from MetabRef using job ID.
 870
 871        Parameters
 872        ----------
 873        job_id : str
 874            Job ID for the lipid library query.
 875            Retrieved from the post_lipid_query method.
 876        attempts : int, optional
 877            Number of attempts to retrieve the data. Default is 10.
 878        delay : int, optional
 879            Delay in seconds between attempts. Default is 5.
 880
 881        Returns
 882        -------
 883        str
 884            Download content from the lipid library query.
 885
 886        Raises
 887        ------
 888        ValueError
 889            If no download content is returned.
 890        """
 891        url = f"https://metabref.emsl.pnnl.gov/api/lipid/data/download/{job_id}"
 892        
 893        # Check the response; if the job is still running or the server returns 400, retry after `delay` seconds, up to `attempts` times
 894        for attempt in range(attempts):
 895            try:
 896                response = requests.get(url)
 897                response.raise_for_status()  # Raises an HTTPError for bad responses
 898                if response.status_code == 200:
 899                    if response.content == b"Job still running":
 900                        if attempt < attempts - 1:
 901                            time.sleep(delay)
 902                            continue
 903                    else:
 904                        lib = response.content
 905                        return lib.decode('utf-8') if isinstance(lib, bytes) else lib
 906                elif response.status_code == 400:
 907                    if attempt < attempts - 1:
 908                        time.sleep(delay)  # Wait before retrying
 909                        continue
 910                    else:
 911                        raise ValueError("Job ID not found or job is still processing.")
 912            except requests.exceptions.RequestException as e:
 913                if attempt < attempts - 1:
 914                    time.sleep(delay)
 915                    continue
 916                else:
 917                    raise ValueError(f"Error retrieving lipid library job: {e}")
 918    
 919    def get_lipid_library(
 920        self,
 921        mz_list,
 922        polarity,
 923        mz_tol_ppm,
 924        mz_tol_da_api=None,
 925        format="json",
 926        normalize=True,
 927        fe_kwargs={},
 928        api_delay=5,
 929        api_attempts=10,
 930    ):
 931        """
 932        Request MetabRef lipid library.
 933
 934        Parameters
 935        ----------
 936        mz_list : list
 937            List of precursor m/z values.
 938        polarity : str
 939            Ionization polarity, either "positive" or "negative".
 940        mz_tol_ppm : float
 941            Tolerance in ppm for each precursor m/z value.
 942            Used for retrieving a potential match from the database.
 943        mz_tol_da_api : float, optional
 944            DEPRECATED.  No longer used, but kept for backwards compatibility.
 945        format : str, optional
 946            Format of requested library, e.g. "json", "flashentropy".
 947            See `available_formats` method for aliases. Default is "json".
 948        normalize : bool, optional
 949            Normalize the spectrum by its magnitude. Default is True.
 950        fe_kwargs : dict, optional
 951            Keyword arguments for FlashEntropy search. Default is {}.
 952        api_delay : int, optional
 953            Delay in seconds between API attempts. Default is 5.
 954        api_attempts : int, optional
 955            Number of attempts to retrieve the data from the API. Default is 10.
 956
 957        Returns
 958        -------
 959        tuple
 960            Library in requested format and lipid metadata as a LipidMetadata dataclass.
 961
 962        """
 963        # Check for valid types in mz_list, polarity, and mz_tol_ppm
 964        if not isinstance(mz_list, (list, np.ndarray)):
 965            raise ValueError("mz_list must be a list or numpy array")
 966        if not all(isinstance(mz, (float, int)) for mz in mz_list):
 967            raise ValueError("All elements in mz_list must be float or int")
 968        if not isinstance(polarity, str):
 969            raise ValueError("polarity must be a string")
 970        if not isinstance(mz_tol_ppm, (float, int)):
 971            raise ValueError("mz_tol_ppm must be a float or int")
 972        
 973        job_id = self.post_lipid_query(
 974            mz_list=mz_list,
 975            polarity=polarity,
 976            mz_tol_ppm=mz_tol_ppm,
 977        )
 978        
 979        lib = self.get_lipid_data(
 980            job_id=job_id,
 981            attempts=api_attempts,
 982            delay=api_delay,
 983        )
 984        lib = json.loads(lib)
 985
 986        # Pull out lipid metadata from the metabref library and convert to LipidMetadata dataclass
 987        mol_data_dict = lib['molecular_data']
 988        mol_data_dict = {
 989            int(k): self._dict_to_dataclass(v, LipidMetadata)
 990            for k, v in mol_data_dict.items()
 991        }
 992
 993        # Remove lipid metadata from the metabref library
 994        lib = lib['mass_spectrum_data']
 995        # Unpack the 'Lipid Fragments' and 'MSO Data' keys from each entry
 996        for x in lib:
 997            if "Lipid Fragments" in x.keys():
 998                x.update(x.pop("Lipid Fragments"))
 999            if "MSO Data" in x.keys():
1000                x.update(x.pop("MSO Data"))
1001
1002        # Format the spectral library
1003        format_func = self._get_format_func(format)
1004        lib = format_func(lib, normalize=normalize, fe_kwargs=fe_kwargs)
1005        return (lib, mol_data_dict)

Interface to the Metabolomics Reference Database for LC-MS data.

MetabRefLCInterface()
725    def __init__(self):
726        """
727        Initialize instance.
728
729        """
730
731        super().__init__()
732
733        # API endpoint for precursor m/z search
734        # inputs = mz, tolerance (in Da), polarity, page_no, per_page
735        self.PRECURSOR_MZ_URL = "https://metabref.emsl.pnnl.gov/api/precursors/m/{}/t/{}/{}?page={}&per_page={}"
736
737        # API endpoint for returning full list of precursor m/z values in database
738        # inputs = polarity, page_no, per_page
739        self.PRECURSOR_MZ_ALL_URL = (
740            "https://metabref.emsl.pnnl.gov/api/precursors/{}?page={}&per_page={}"
741        )
742
743        # API endpoint for lipid data
744        self.LIPID_LIBRARY_URL = "https://metabref.emsl.pnnl.gov/api/lipid/data"
745
746        self.__init_format_map__()

Initialize instance.

PRECURSOR_MZ_URL
PRECURSOR_MZ_ALL_URL
LIPID_LIBRARY_URL
def query_by_precursor( self, mz_list, polarity, mz_tol_ppm, mz_tol_da_api=0.2, max_per_page=50):
767    def query_by_precursor(
768        self, mz_list, polarity, mz_tol_ppm, mz_tol_da_api=0.2, max_per_page=50
769    ):
770        """
771        Query MetabRef by precursor m/z values.
772
773        Parameters
774        ----------
775        mz_list : list
776            List of precursor m/z values.
777        polarity : str
778            Ionization polarity, either "positive" or "negative".
779        mz_tol_ppm : float
780            Tolerance in ppm for each precursor m/z value.
781            Used for retrieving a potential match from the database.
782        mz_tol_da_api : float, optional
783            Maximum tolerance between precursor m/z values for API search, in daltons.
784            Used to group similar mzs into a single API query for speed. Default is 0.2.
785        max_per_page : int, optional
786            Maximum records to return from MetabRef API query at a time.  Default is 50.
787
788        Returns
789        -------
790        list
791            List of library entries in original JSON format.
792        """
793        raise DeprecationWarning(
794            "query_by_precursor is deprecated. Use get_lipid_library instead."
795        )

Query MetabRef by precursor m/z values.

Parameters
  • mz_list (list): List of precursor m/z values.
  • polarity (str): Ionization polarity, either "positive" or "negative".
  • mz_tol_ppm (float): Tolerance in ppm for each precursor m/z value. Used for retrieving a potential match from the database.
  • mz_tol_da_api (float, optional): Maximum tolerance between precursor m/z values for API search, in daltons. Used to group similar mzs into a single API query for speed. Default is 0.2.
  • max_per_page (int, optional): Maximum records to return from MetabRef API query at a time. Default is 50.
Returns
  • list: List of library entries in original JSON format.
def request_all_precursors(self, polarity, per_page=50000):
797    def request_all_precursors(self, polarity, per_page=50000):
798        """
799        Request all precursor m/z values for MS2 spectra from MetabRef.
800
801        Parameters
802        ----------
803        polarity : str
804            Ionization polarity, either "positive" or "negative".
805        per_page : int, optional
806            Number of records to fetch per call. Default is 50000
807
808        Returns
809        -------
810        list
811            List of all precursor m/z values, sorted.
812        """
813        raise DeprecationWarning("request_all_precursors is deprecated.")

Request all precursor m/z values for MS2 spectra from MetabRef.

Parameters
  • polarity (str): Ionization polarity, either "positive" or "negative".
  • per_page (int, optional): Number of records to fetch per call. Default is 50000
Returns
  • list: List of all precursor m/z values, sorted.
def post_lipid_query(self, mz_list, polarity, mz_tol_ppm):
815    def post_lipid_query(self, mz_list, polarity, mz_tol_ppm):
816        """
817        Post query to get MetabRef lipid spectra.
818
819        Parameters
820        ----------
821        mz_list : list
822            List of precursor m/z values.
823        polarity : str
824            Ionization polarity, either "positive" or "negative".
825        mz_tol_ppm : float
826            Tolerance in ppm for each precursor m/z value.
827
828        Returns
829        -------
830        download_id : str
831            Download ID for the lipid library query.
832
833        Raises
834        ------
835        ValueError
836            If any input parameter is invalid.
837            If no download ID is returned.
838        """
839        url = self.LIPID_LIBRARY_URL
840
841        headers = {
842            'accept': '*/*',
843            'Content-Type': 'application/json'
844        }
845        
846        payload = {
847            "tolerance_ppm": mz_tol_ppm,
848            "polarity": polarity,
849            "mz_list": list(set(np.sort(mz_list))) 
850        }
851        
852        try:
853            response = requests.post(url, headers=headers, json=payload)
854            response.raise_for_status()  # Raises an HTTPError for bad responses
855            text = response.text.strip()
856            # Drop everything before the final space
857            if not text:
858                raise ValueError("Empty response from MetabRef lipid library API.")
859            if " " in text:
860                text = text.rsplit(" ", 1)[-1]
861                return text
862            else:
863                raise ValueError("Unexpected response format from MetabRef lipid library API.")
864        except requests.exceptions.RequestException as e:
865            raise ValueError(f"Error querying MetabRef lipid library: {e}")

Post query to get MetabRef lipid spectra.

Parameters
  • mz_list (list): List of precursor m/z values.
  • polarity (str): Ionization polarity, either "positive" or "negative".
  • mz_tol_ppm (float): Tolerance in ppm for each precursor m/z value.
Returns
  • download_id (str): Download ID for the lipid library query.
Raises
  • ValueError: If any input parameter is invalid. If no download ID is returned.
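
A usage sketch (requires network access; the m/z values are illustrative):

    lc = MetabRefLCInterface()
    job_id = lc.post_lipid_query(
        mz_list=[760.5851, 496.3398],   # precursor m/z values of interest
        polarity="positive",
        mz_tol_ppm=5,
    )
    # job_id identifies the server-side job and is later passed to get_lipid_data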
def get_lipid_data(self, job_id, attempts=10, delay=5):
867    def get_lipid_data(self, job_id, attempts=10, delay=5):
868        """
869        Get download content from lipid library query from MetabRef using job ID.
870
871        Parameters
872        ----------
873        job_id : str
874            Job ID for the lipid library query.
875            Retrieved from the post_lipid_query method.
876        attempts : int, optional
877            Number of attempts to retrieve the data. Default is 10.
878        delay : int, optional
879            Delay in seconds between attempts. Default is 5.
880
881        Returns
882        -------
883        str
884            Download content from the lipid library query.
885
886        Raises
887        ------
888        ValueError
889            If no download content is returned.
890        """
891        url = f"https://metabref.emsl.pnnl.gov/api/lipid/data/download/{job_id}"
892        
893        # Check the response; if the job is still running or the server returns 400, retry after `delay` seconds, up to `attempts` times
894        for attempt in range(attempts):
895            try:
896                response = requests.get(url)
897                response.raise_for_status()  # Raises an HTTPError for bad responses
898                if response.status_code == 200:
899                    if response.content == b"Job still running":
900                        if attempt < attempts - 1:
901                            time.sleep(delay)
902                            continue
903                    else:
904                        lib = response.content
905                        return lib.decode('utf-8') if isinstance(lib, bytes) else lib
906                elif response.status_code == 400:
907                    if attempt < attempts - 1:
908                        time.sleep(delay)  # Wait before retrying
909                        continue
910                    else:
911                        raise ValueError("Job ID not found or job is still processing.")
912            except requests.exceptions.RequestException as e:
913                if attempt < attempts - 1:
914                    time.sleep(delay)
915                    continue
916                else:
917                    raise ValueError(f"Error retrieving lipid library job: {e}")

Get download content from lipid library query from MetabRef using job ID.

Parameters
  • job_id (str): Job ID for the lipid library query. Retrieved from the post_lipid_query method.
  • attempts (int, optional): Number of attempts to retrieve the data. Default is 10.
  • delay (int, optional): Delay in seconds between attempts. Default is 5.
Returns
  • str: Download content from the lipid library query.
Raises
  • ValueError: If no download content is returned.
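
The method polls the download endpoint until the job finishes or the attempts are exhausted. Continuing from the job submitted in the sketch above:

    import json

    raw = lc.get_lipid_data(job_id=job_id, attempts=10, delay=5)
    lib = json.loads(raw)   # JSON with 'molecular_data' and 'mass_spectrum_data' keys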
def get_lipid_library( self, mz_list, polarity, mz_tol_ppm, mz_tol_da_api=None, format='json', normalize=True, fe_kwargs={}, api_delay=5, api_attempts=10):
 919    def get_lipid_library(
 920        self,
 921        mz_list,
 922        polarity,
 923        mz_tol_ppm,
 924        mz_tol_da_api=None,
 925        format="json",
 926        normalize=True,
 927        fe_kwargs={},
 928        api_delay=5,
 929        api_attempts=10,
 930    ):
 931        """
 932        Request MetabRef lipid library.
 933
 934        Parameters
 935        ----------
 936        mz_list : list
 937            List of precursor m/z values.
 938        polarity : str
 939            Ionization polarity, either "positive" or "negative".
 940        mz_tol_ppm : float
 941            Tolerance in ppm for each precursor m/z value.
 942            Used for retrieving a potential match from the database.
 943        mz_tol_da_api : float, optional
 944            DEPRECATED.  No longer used, but kept for backwards compatibility.
 945        format : str, optional
 946            Format of requested library, e.g. "json", "flashentropy".
 947            See `available_formats` method for aliases. Default is "json".
 948        normalize : bool, optional
 949            Normalize the spectrum by its magnitude. Default is True.
 950        fe_kwargs : dict, optional
 951            Keyword arguments for FlashEntropy search. Default is {}.
 952        api_delay : int, optional
 953            Delay in seconds between API attempts. Default is 5.
 954        api_attempts : int, optional
 955            Number of attempts to retrieve the data from the API. Default is 10.
 956
 957        Returns
 958        -------
 959        tuple
 960            Library in requested format and lipid metadata as a LipidMetadata dataclass.
 961
 962        """
 963        # Check for valid types in mz_list, polarity, and mz_tol_ppm
 964        if not isinstance(mz_list, (list, np.ndarray)):
 965            raise ValueError("mz_list must be a list or numpy array")
 966        if not all(isinstance(mz, (float, int)) for mz in mz_list):
 967            raise ValueError("All elements in mz_list must be float or int")
 968        if not isinstance(polarity, str):
 969            raise ValueError("polarity must be a string")
 970        if not isinstance(mz_tol_ppm, (float, int)):
 971            raise ValueError("mz_tol_ppm must be a float or int")
 972        
 973        job_id = self.post_lipid_query(
 974            mz_list=mz_list,
 975            polarity=polarity,
 976            mz_tol_ppm=mz_tol_ppm,
 977        )
 978        
 979        lib = self.get_lipid_data(
 980            job_id=job_id,
 981            attempts=api_attempts,
 982            delay=api_delay,
 983        )
 984        lib = json.loads(lib)
 985
 986        # Pull out lipid metadata from the metabref library and convert to LipidMetadata dataclass
 987        mol_data_dict = lib['molecular_data']
 988        mol_data_dict = {
 989            int(k): self._dict_to_dataclass(v, LipidMetadata)
 990            for k, v in mol_data_dict.items()
 991        }
 992
 993        # Remove lipid metadata from the metabref library
 994        lib = lib['mass_spectrum_data']
 995        # Unpack the 'Lipid Fragments' and 'MSO Data' keys from each entry
 996        for x in lib:
 997            if "Lipid Fragments" in x.keys():
 998                x.update(x.pop("Lipid Fragments"))
 999            if "MSO Data" in x.keys():
1000                x.update(x.pop("MSO Data"))
1001
1002        # Format the spectral library
1003        format_func = self._get_format_func(format)
1004        lib = format_func(lib, normalize=normalize, fe_kwargs=fe_kwargs)
1005        return (lib, mol_data_dict)

Request MetabRef lipid library.

Parameters
  • mz_list (list): List of precursor m/z values.
  • polarity (str): Ionization polarity, either "positive" or "negative".
  • mz_tol_ppm (float): Tolerance in ppm for each precursor m/z value. Used for retrieving a potential match from the database.
  • mz_tol_da_api (float, optional): DEPRECATED. No longer used, but kept for backwards compatibility.
  • format (str, optional): Format of requested library, e.g. "json", "flashentropy". See available_formats method for aliases. Default is "json".
  • normalize (bool, optional): Normalize the spectrum by its magnitude. Default is True.
  • fe_kwargs (dict, optional): Keyword arguments for FlashEntropy search. Default is {}.
  • api_delay (int, optional): Delay in seconds between API attempts. Default is 5.
  • api_attempts (int, optional): Number of attempts to retrieve the data from the API. Default is 10.
Returns
  • tuple: Library in requested format and lipid metadata as a LipidMetadata dataclass.
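
A typical end-to-end call (requires network access; m/z values and tolerance are illustrative):

    lc = MetabRefLCInterface()
    fe_search, lipid_metadata = lc.get_lipid_library(
        mz_list=[760.5851, 496.3398],
        polarity="positive",
        mz_tol_ppm=5,
        format="flashentropy",   # return an ms_entropy FlashEntropySearch index
        normalize=True,
    )
    # fe_search can then be queried against experimental MS2 spectra;
    # lipid_metadata maps molecular data keys to LipidMetadata instances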
class MSPInterface(SpectralDatabaseInterface):
1008class MSPInterface(SpectralDatabaseInterface):
1009    """
1010    Interface to parse NIST MSP files
1011    """
1012
1013    def __init__(self, file_path):
1014        """
1015        Initialize instance.
1016
1017        Parameters
1018        ----------
1019        file_path : str
1020            Path to a local MSP file.
1021
1022        Attributes
1023        ----------
1024        file_path : str
1025            Path to the MSP file.
1026        _file_content : str
1027            Content of the MSP file.
1028        _data_frame : :obj:`~pandas.DataFrame`
1029            DataFrame of spectra from the MSP file with unaltered content.
1030        """
1031        super().__init__(key=None)
1032
1033        self.file_path = file_path
1034        if not os.path.exists(self.file_path):
1035            raise FileNotFoundError(
1036                f"File {self.file_path} does not exist. Please check the file path."
1037            )
1038        with open(self.file_path, "r") as f:
1039            self._file_content = f.read()
1040
1041        self._data_frame = self._read_msp_file()
1042        self.__init_format_map__()
1043
1044    def __init_format_map__(self):
1045        """
1046        Initialize database format mapper, enabling multiple format requests.
1047
1048        """
1049
1050        # x is a pandas dataframe similar to self._data_frame format
1051        # Define format workflows
1052        self.format_map = {
1053            "msp": lambda x, normalize, fe_kwargs: self._to_msp(x, normalize),
1054            "flashentropy": lambda x, normalize, fe_kwargs: self._to_flashentropy(
1055                x, normalize, fe_kwargs
1056            ),
1057            "df": lambda x, normalize, fe_kwargs: self._to_df(x, normalize),
1058        }
1059
1060        # Add aliases
1061        self.format_map["fe"] = self.format_map["flashentropy"]
1062        self.format_map["flash-entropy"] = self.format_map["flashentropy"]
1063        self.format_map["dataframe"] = self.format_map["df"]
1064        self.format_map["data-frame"] = self.format_map["df"]
1065
1066    def _read_msp_file(self):
1067        """
1068        Read the MSP file into a pandas DataFrame.
1069
1070        Returns
1071        -------
1072        :obj:`~pandas.DataFrame`
1073            DataFrame of spectra from the MSP file, exactly as it is in the file (no sorting, filtering, etc.)
1074        """
1075        # Containers for parsed spectra
1076        spectra = []
1077        spectrum = {}
1078
1079        f = StringIO(self._file_content)
1080        for line in f:
1081            line = line.strip()
1082            if not line:
1083                continue  # Skip empty lines
1084
1085            # Handle metadata
1086            if ":" in line:
1087                key, value = line.split(":", 1)
1088                key = key.strip().lower()
1089                value = value.strip()
1090
1091                if key == "name":
1092                    # Save current spectrum and start a new one
1093                    if spectrum:
1094                        spectra.append(spectrum)
1095                    spectrum = {"name": value, "peaks": []}
1096                else:
1097                    spectrum[key] = value
1098
1099            # Handle peak data (assumed to start with a number)
1100            elif line[0].isdigit():
1101                peaks = line.split()
1102                m_z = float(peaks[0])
1103                intensity = float(peaks[1])
1104                spectrum["peaks"].append(([m_z, intensity]))
1105        # Save the last spectrum
1106        if spectrum:
1107            spectra.append(spectrum)
1108
1109        df = pd.DataFrame(spectra)
1110        for column in df.columns:
1111            if column != "peaks":  # Skip 'peaks' column
1112                try:
1113                    df[column] = pd.to_numeric(df[column], errors="raise")
1114                except:
1115                    pass
1116        return df
1117
1118    def _to_df(self, input_dataframe, normalize=True):
1119        """
1120        Convert MSP-derived library to a pandas DataFrame with the requested normalization.
1121
1122        Parameters
1123        ----------
1124        input_dataframe : :obj:`~pandas.DataFrame`
1125            Input DataFrame containing MSP-formatted spectra.
1126        normalize : bool, optional
1127            Normalize each spectrum by its magnitude.
1128            Default is True.
1129
1130        Returns
1131        -------
1132        :obj:`~pandas.DataFrame`
1133            DataFrame of spectra with the requested normalization applied.
1134        """
1135        if not normalize:
1136            return input_dataframe
1137        else:
1138            # Convert to dictionary
1139            db_dict = input_dataframe.to_dict(orient="records")
1140
1141            # Initialize empty library
1142            lib = []
1143
1144            # Enumerate spectra
1145            for i, source in enumerate(db_dict):
1146                spectrum = source
1147                # Check that spectrum["peaks"] exists
1148                if "peaks" not in spectrum.keys():
1149                    raise KeyError(
1150                        "MSP not interpretted correctly, 'peaks' key not found in spectrum, check _dataframe attribute."
1151                    )
1152
1153                # Convert spectrum["peaks"] to numpy array
1154                if not isinstance(spectrum["peaks"], np.ndarray):
1155                    spectrum["peaks"] = np.array(spectrum["peaks"])
1156
1157                # Normalize peaks, if requested
1158                if normalize:
1159                    spectrum["peaks"] = self.normalize_peaks(spectrum["peaks"])
1160                    spectrum["num peaks"] = len(spectrum["peaks"])
1161
1162                # Add spectrum to library
1163                lib.append(spectrum)
1164            
1165            # Convert to DataFrame
1166            df = pd.DataFrame(lib)
1167            return df
1168    
1169    def _to_flashentropy(self, input_dataframe, normalize=True, fe_kwargs={}):
1170        """
1171        Convert MSP-derived library to FlashEntropy library.
1172
1173        Parameters
1174        ----------
1175        input_dataframe : :obj:`~pandas.DataFrame`
1176            Input DataFrame containing MSP-formatted spectra.
1177        normalize : bool
1178            Normalize each spectrum by its magnitude.
1179        fe_kwargs : dict, optional
1180            Keyword arguments for instantiating the FlashEntropy search and building its index;
1181            any unrecognized keys are ignored. By default, FlashEntropy's own defaults are used.
1182
1183        Returns
1184        -------
1185        :obj:`~ms_entropy.FlashEntropySearch`
1186            MS2 library as FlashEntropy search instance.
1187
1188        Raises
1189        ------
1190        ValueError
1191            If "min_ms2_difference_in_da" or "max_ms2_tolerance_in_da" are present in `fe_kwargs` and they
1192        """
1193        self._check_flash_entropy_kwargs(fe_kwargs)
1194
1195        db_df = input_dataframe
1196
1197        # Convert to dictionary
1198        db_dict = db_df.to_dict(orient="records")
1199
1200        # Initialize empty library
1201        fe_lib = []
1202
1203        # Enumerate spectra
1204        for i, source in enumerate(db_dict):
1205            # Reorganize source dict, if necessary
1206            if "spectrum_data" in source.keys():
1207                spectrum = source["spectrum_data"]
1208            else:
1209                spectrum = source
1210
1211            # Rename precursor_mz key for FlashEntropy
1212            if "precursor_mz" not in spectrum.keys():
1213                if "precursormz" in spectrum:
1214                    spectrum["precursor_mz"] = spectrum.pop("precursormz")
1215                elif "precursor_ion" in spectrum:
1216                    spectrum["precursor_mz"] = spectrum.pop("precursor_ion")
1217                else:
1218                    raise KeyError(
1219                        "MSP must have either 'precursormz' or 'precursor_ion' key to be converted to FlashEntropy format."
1220                    )
1221
1222            # Check that spectrum["peaks"] exists
1223            if "peaks" not in spectrum.keys():
1224                raise KeyError(
1225                    "MSP not interpretted correctly, 'peaks' key not found in spectrum, check _dataframe attribute."
1226                )
1227
1228            # Convert spectrum["peaks"] to numpy array
1229            if not isinstance(spectrum["peaks"], np.ndarray):
1230                spectrum["peaks"] = np.array(spectrum["peaks"])
1231
1232            # Normalize peaks, if requested
1233            if normalize:
1234                spectrum["peaks"] = self.normalize_peaks(spectrum["peaks"])
1235
1236            # Add spectrum to library
1237            fe_lib.append(spectrum)
1238
1239        # Build FlashEntropy index
1240        fe_search = self._build_flash_entropy_index(fe_lib, fe_kwargs=fe_kwargs)
1241
1242        return fe_search
1243    
1244    def _check_msp_compatibility(self):
1245        """
1246        Check if the MSP file is compatible with the get_metabolomics_spectra_library method and provide feedback if it is not.
1247        """
1248        # Check polarity
1249        if (
1250            "polarity" not in self._data_frame.columns
1251            and "ionmode" not in self._data_frame.columns
1252        ):
1253            raise ValueError(
1254                "Neither 'polarity' nor 'ionmode' columns found in the input MSP metadata. Please check the file."
1255            )
1256        polarity_column = (
1257            "polarity" if "polarity" in self._data_frame.columns else "ionmode"
1258        )
1259
1260        # Check that the polarity column contains only "positive" or "negative" values
1261        if not all(self._data_frame[polarity_column].isin(["positive", "negative"])):
1262            raise ValueError(
1263                f"Input field on MSP '{polarity_column}' must contain only 'positive' or 'negative' values."
1264            )
1265
1266        # Check if the MSP file contains the required columns for metabolite metadata
1267        # inchikey, by name, not null
1268        # either formula or molecular_formula, not null
1269        if not all(self._data_frame["inchikey"].notnull()):
1270            raise ValueError(
1271                "Input field on MSP 'inchikey' must contain only non-null values."
1272            )
1273        if (
1274            "formula" not in self._data_frame.columns
1275            and "molecular_formula" not in self._data_frame.columns
1276        ):
1277            raise ValueError(
1278                "Input field on MSP must contain either 'formula' or 'molecular_formula' columns."
1279            )
1280        molecular_formula_column = (
1281            "formula" if "formula" in self._data_frame.columns else "molecular_formula"
1282        )
1283        if not all(self._data_frame[molecular_formula_column].notnull()):
1284            raise ValueError(
1285                f"Input field on MSP '{molecular_formula_column}' must contain only non-null values."
1286            )
1287
1288    def get_metabolomics_spectra_library(
1289        self,
1290        polarity,
1291        metabolite_metadata_mapping={},
1292        format="fe",
1293        normalize=True,
1294        fe_kwargs={},
1295    ):
1296        """
1297        Prepare the metabolomics spectra library and the associated metabolite metadata.
1298
1299        Note: the InChIKey is used as the index of the metabolite metadata DataFrame and to link metadata to spectra, so it must be present in the input.
1300
1301        """
1302        # Check if the MSP file is compatible with the get_metabolomics_spectra_library method
1303        self._check_msp_compatibility()
1304
1305        # Check if the polarity parameter is valid and if a polarity column exists in the dataframe
1306        if polarity not in ["positive", "negative"]:
1307            raise ValueError("Polarity must be 'positive' or 'negative'")
1308        polarity_column = (
1309            "polarity" if "polarity" in self._data_frame.columns else "ionmode"
1310        )
1311
1312        # Get a subset of the initial dataframe by polarity
1313        db_df = self._data_frame[self._data_frame[polarity_column] == polarity].copy()
1314
1315        # Rename the columns of the db_df to match the MetaboliteMetadata dataclass using the metabolite_metadata_mapping
1316        # If the mapping is not provided, use the default mapping
1317        if not metabolite_metadata_mapping:
1318            metabolite_metadata_mapping = {
1319                "chebi_id": "chebi",
1320                "kegg_id": "kegg",
1321                "refmet_name": "common_name",
1322                "molecular_formula": "formula",
1323                "gnps_spectra_id":"id",
1324                "precursormz": "precursor_mz",
1325                "precursortype":"ion_type"
1326            }
1327        db_df.rename(columns=metabolite_metadata_mapping, inplace=True)
1328        db_df["molecular_data_id"] = db_df["inchikey"]
1329
1330
1331
1332        # Check if the resulting dataframe has the required columns for the flash entropy search
1333        required_columns = ["molecular_data_id", "precursor_mz", "ion_type", "id"]
1334        for col in required_columns:
1335            if col not in db_df.columns:
1336                raise ValueError(
1337                    f"Input field on MSP must contain '{col}' column for FlashEntropy search."
1338                )
1339
1340        # Pull out the metabolite metadata from the dataframe and put it into a different dataframe
1341        # First get a list of the possible attributes of the MetaboliteMetadata dataclass
1342        metabolite_metadata_keys = list(MetaboliteMetadata.__annotations__.keys())
1343        # Replace id with molecular_data_id in metabolite_metadata_keys
1344        metabolite_metadata_keys = [
1345            "molecular_data_id" if x == "id" else x for x in metabolite_metadata_keys
1346        ]
1347        metabolite_metadata_df = db_df[
1348            db_df.columns[db_df.columns.isin(metabolite_metadata_keys)]
1349        ].copy()
1350
1351        # Make unique and recast the id column for metabolite metadata
1352        metabolite_metadata_df.drop_duplicates(subset=["molecular_data_id"], inplace=True)
1353        metabolite_metadata_df["id"] = metabolite_metadata_df["molecular_data_id"]
1354
1355        # Convert to a dictionary using the inchikey as the key
1356        metabolite_metadata_dict = metabolite_metadata_df.to_dict(
1357            orient="records"
1358        )
1359        metabolite_metadata_dict = {
1360            v["id"]: self._dict_to_dataclass(v, MetaboliteMetadata)
1361            for v in metabolite_metadata_dict
1362        }
1363
1364        # Remove the metabolite metadata columns from the original dataframe
1365        for key in metabolite_metadata_keys:
1366            if key != "molecular_data_id":
1367                if key in db_df.columns:
1368                    db_df.drop(columns=key, inplace=True)
1369
1370        # Format the spectral library
1371        format_func = self._get_format_func(format)
1372        lib = format_func(db_df, normalize=normalize, fe_kwargs=fe_kwargs)
1373        return (lib, metabolite_metadata_dict)
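For orientation, below is a minimal sketch of the plain-text layout `_read_msp_file` expects: `key: value` metadata lines, a `Name:` line opening each record, and whitespace-separated m/z and intensity rows. The compound, formula, and peak values are invented for illustration only.

    # Illustrative MSP record (invented values) in the layout _read_msp_file parses.
    # Metadata keys are lower-cased ("PrecursorMZ" -> "precursormz");
    # peak rows become [m/z, intensity] pairs under the "peaks" column.
    example_record = (
        "Name: Example metabolite\n"
        "PrecursorMZ: 180.0634\n"
        "Polarity: positive\n"
        "Formula: C6H12O6\n"
        "Num Peaks: 3\n"
        "60.0444 12.5\n"
        "89.0233 100.0\n"
        "163.0601 45.2\n"
    )

    # Parsed, this record yields one DataFrame row roughly like:
    #   name         "Example metabolite"
    #   peaks        [[60.0444, 12.5], [89.0233, 100.0], [163.0601, 45.2]]
    #   precursormz  180.0634
    #   polarity     "positive"
    #   formula      "C6H12O6"
    #   num peaks    3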

Interface to parse NIST MSP files

MSPInterface(file_path)

Initialize instance.

Parameters
  • file_path (str): Path to a local MSP file.
Attributes
  • file_path (str): Path to the MSP file.
  • _file_content (str): Content of the MSP file.
  • _data_frame (~pandas.DataFrame): DataFrame of spectra from the MSP file with unaltered content.
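A minimal usage sketch for the constructor; "my_library.msp" is a placeholder path to a local NIST-style MSP file, and the import assumes this module's package path.

    from corems.molecular_id.search.database_interfaces import MSPInterface

    # Placeholder path; any readable local MSP file will do.
    msp_db = MSPInterface(file_path="my_library.msp")

    # Parsed spectra are held in a DataFrame, one row per MSP record,
    # with peaks stored as [m/z, intensity] pairs in the "peaks" column.
    print(msp_db.file_path)
    print(msp_db._data_frame.head())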
get_metabolomics_spectra_library(self, polarity, metabolite_metadata_mapping={}, format='fe', normalize=True, fe_kwargs={})

Prepare the metabolomics spectra library and the associated metabolite metadata.

Note: the InChIKey is used as the index of the metabolite metadata DataFrame and to link metadata to spectra, so it must be present in the input.
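A hedged end-to-end sketch: for an MSP file that passes `_check_msp_compatibility` (non-null `inchikey`, a `formula` or `molecular_formula` field, and a `polarity`/`ionmode` field limited to 'positive' or 'negative'), the call below returns the spectral library in the requested format together with a dictionary of `MetaboliteMetadata` objects keyed by InChIKey. The file path is a placeholder.

    from corems.molecular_id.search.database_interfaces import MSPInterface

    msp_db = MSPInterface(file_path="my_library.msp")  # placeholder path

    # "fe" (alias of "flashentropy") builds an ms_entropy FlashEntropySearch
    # index from the positive-mode spectra; format="df" would instead return
    # a normalized pandas DataFrame.
    fe_search, metabolite_metadata = msp_db.get_metabolomics_spectra_library(
        polarity="positive",
        format="fe",
        normalize=True,
        fe_kwargs={},  # optional FlashEntropy constructor / index kwargs
    )

    # The metadata dictionary is keyed by InChIKey (used as molecular_data_id).
    first_key = next(iter(metabolite_metadata))
    print(first_key, metabolite_metadata[first_key])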