corems.molecular_id.search.database_interfaces

   1import os
   2import re
   3from abc import ABC
   4from io import StringIO
   5from pathlib import Path
   6
   7import numpy as np
   8import requests
   9import pandas as pd
  10from ms_entropy import FlashEntropySearch
  11
  12from corems.molecular_id.factory.EI_SQL import (
  13    EI_LowRes_SQLite,
  14    Metadatar,
  15    MetaboliteMetadata,
  16)
  17from corems.molecular_id.factory.lipid_molecular_metadata import LipidMetadata
  18from corems.mass_spectra.calc.lc_calc import find_closest
  19
  20
  21class SpectralDatabaseInterface(ABC):
  22    """
  23    Base class that facilitates connection to spectral reference databases,
  24    such as EMSL's Metabolomics Reference Database (MetabRef).
  25
  26    """
  27
  28    def __init__(self, key=None):
  29        """
  30        Initialize instance.
  31
  32        Parameters
  33        ----------
  34        key : str
  35            Token key.
  36
  37        """
  38
  39        self.key = key
  40
  41    def set_token(self, path):
  42        """
  43        Set environment variable for MetabRef database token.
  44
  45        Parameters
  46        ----------
  47        path : str
  48            Path to token.
  49
  50        """
  51
  52        # Read token from file
  53        with open(path, "r", encoding="utf-8") as f:
  54            token = f.readline().strip()
  55
  56        # Set environment variable
  57        os.environ[self.key] = token
  58
  59    def get_token(self):
  60        """
  61        Get environment variable for database token.
  62
  63        Returns
  64        -------
  65        str
  66            Token string.
  67
  68        """
  69
  70        # Check for token
  71        if self.key not in os.environ:
  72            raise ValueError("Must set {} environment variable.".format(self.key))
  73
  74        # Get token from environment variables
  75        return os.environ.get(self.key)
  76
  77    def get_header(self):
  78        """
  79        Access stored database token and prepare as header.
  80
  81        Returns
  82        -------
  83        str
  84            Header string.
  85
  86        """
  87
  88        # Get token
  89        token = self.get_token()
  90
  91        # Pad header information
  92        header = {"Authorization": f"Bearer {token}", "Content-Type": "text/plain"}
  93
  94        return header
  95
  96    def get_query(self, url, use_header=True):
  97        """
  98        Request payload from URL according to `get` protocol.
  99
 100        Parameters
 101        ----------
 102        url : str
 103            URL for request.
 104        use_header: bool
 105            Whether or not the query should include the header
 106
 107        Returns
 108        -------
 109        dict
 110            Response as JSON.
 111
 112        """
 113
 114        # Query URL via `get`
 115        if use_header:
 116            response = requests.get(url, headers=self.get_header())
 117        else:
 118            response = requests.get(url)
 119
 120        # Check response
 121        response.raise_for_status()
 122
 123        # Return as JSON
 124        return response.json()
 125
 126    def post_query(self, url, variable, values, tolerance):
 127        """
 128        Request payload from URL according to `post` protocol.
 129
 130        Parameters
 131        ----------
 132        url : str
 133            URL for request.
 134        variable : str
 135            Variable to query.
 136        values : str
 137            Specific values of `variable` to query.
 138        tolerance : str
 139            Query tolerance relative to `values`.
 140
 141        Returns
 142        -------
 143        dict
 144            Response as JSON.
 145
 146        """
 147
 148        # Coerce to string
 149        if not isinstance(variable, str):
 150            variable = str(variable).replace(" ", "")
 151
 152        if not isinstance(values, str):
 153            values = str(values).replace(" ", "")
 154
 155        if not isinstance(tolerance, str):
 156            tolerance = str(tolerance).replace(" ", "")
 157
 158        # Query URL via `post`
 159        response = requests.post(
 160            os.path.join(url, variable, tolerance),
 161            data=values,
 162            headers=self.get_header(),
 163        )
 164
 165        # Check response
 166        response.raise_for_status()
 167
 168        # Return as JSON
 169        return response.json()
 170
 171    def _check_flash_entropy_kwargs(self, fe_kwargs):
 172        """
 173        Check FlashEntropy keyword arguments.
 174
 175        Parameters
 176        ----------
 177        fe_kwargs : dict
 178            Keyword arguments for FlashEntropy search.
 179
 180
 181        Raises
 182        ------
 183        ValueError
 184            If "min_ms2_difference_in_da" or "max_ms2_tolerance_in_da" are present in `fe_kwargs` and they
 185            are not equal.
 186
 187        """
 188        # If "min_ms2_difference_in_da" in fe_kwargs, check that "max_ms2_tolerance_in_da" is also present and that min_ms2_difference_in_da = 2xmax_ms2_tolerance_in_da
 189        if (
 190            "min_ms2_difference_in_da" in fe_kwargs
 191            or "max_ms2_tolerance_in_da" in fe_kwargs
 192        ):
 193            if (
 194                "min_ms2_difference_in_da" not in fe_kwargs
 195                or "max_ms2_tolerance_in_da" not in fe_kwargs
 196            ):
 197                raise ValueError(
 198                    "Both 'min_ms2_difference_in_da' and 'max_ms2_tolerance_in_da' must be specified."
 199                )
 200            if (
 201                fe_kwargs["min_ms2_difference_in_da"]
 202                != 2 * fe_kwargs["max_ms2_tolerance_in_da"]
 203            ):
 204                raise ValueError(
 205                    "The values of 'min_ms2_difference_in_da' must be exactly 2x 'max_ms2_tolerance_in_da'."
 206                )
 207
 208    def _get_format_func(self, format):
 209        """
 210        Obtain format function by key.
 211
 212        Returns
 213        -------
 214        func
 215            Formatting function.
 216        """
 217
 218        if format.lower() in self.format_map.keys():
 219            return self.format_map[format.lower()]
 220
 221        raise ValueError(("{} not a supported format.").format(format))
 222
 223    def _dict_to_dataclass(self, metabref_lib, data_class):
 224        """
 225        Convert dictionary to dataclass.
 226
 227        Notes
 228        -----
 229        This function will pull the attributes a dataclass and its parent class
 230        and convert the dictionary to a dataclass instance with the appropriate
 231        attributes.
 232
 233        Parameters
 234        ----------
 235        data_class : :obj:`~dataclasses.dataclass`
 236            Dataclass to convert to.
 237        metabref_lib : dict
 238            Metabref dictionary object to convert to dataclass.
 239
 240        Returns
 241        -------
 242        :obj:`~dataclasses.dataclass`
 243            Dataclass instance.
 244
 245        """
 246
 247        # Get list of expected attributes of data_class
 248        data_class_keys = list(data_class.__annotations__.keys())
 249
 250        # Does the data_class inherit from another class, if so, get the attributes of the parent class as well
 251        if len(data_class.__mro__) > 2:
 252            parent_class_keys = list(data_class.__bases__[0].__annotations__.keys())
 253            data_class_keys = list(set(data_class_keys + parent_class_keys))
 254
 255        # Remove keys that are not in the data_class from the input dictionary
 256        input_dict = {k: v for k, v in metabref_lib.items() if k in data_class_keys}
 257
 258        # Add keys that are in the data class but not in the input dictionary as None
 259        for key in data_class_keys:
 260            if key not in input_dict.keys():
 261                input_dict[key] = None
 262        return data_class(**input_dict)
 263
 264    @staticmethod
 265    def normalize_peaks(arr):
 266        """
 267        Normalize peaks in an array.
 268
 269        Parameters
 270        ----------
 271        arr : :obj:`~numpy.array`
 272            Array of shape (N, 2), with m/z in the first column and abundance in
 273            the second.
 274
 275        Returns
 276        -------
 277        :obj:`~numpy.array`
 278            Normalized array of shape (N, 2), with m/z in the first column and
 279            normalized abundance in the second.
 280        """
 281        # Normalize the array
 282        arr[:, -1] = arr[:, -1] / arr[:, -1].sum()
 283
 284        return arr
 285
 286    @staticmethod
 287    def _build_flash_entropy_index(fe_lib, fe_kwargs={}, clean_spectra=True):
 288        """
 289        Build FlashEntropy index.
 290
 291        Parameters
 292        ----------
 293        fe_lib : list
 294            List of spectra to build index from. Can be a list of dictionaries or
 295            a FlashEntropy search instance.
 296        fe_kwargs : dict, optional
 297            Keyword arguments for FlashEntropy search.
 298        clean_spectra : bool, optional
 299            Clean spectra before building index. Default is True.
 300
 301        Returns
 302        -------
 303        :obj:`~ms_entropy.FlashEntropySearch`
 304            FlashEntropy search instance.
 305
 306        """
 307        # Initialize FlashEntropy
 308        fe_init_kws = [
 309            "max_ms2_tolerance_in_da",
 310            "mz_index_step",
 311            "low_memory",
 312            "path_data",
 313        ]
 314        fe_init_kws = {k: v for k, v in fe_kwargs.items() if k in fe_init_kws}
 315        fes = FlashEntropySearch(**fe_init_kws)
 316
 317        # Build FlashEntropy index
 318        fe_index_kws = [
 319            "max_indexed_mz",
 320            "precursor_ions_removal_da",
 321            "noise_threshold",
 322            "min_ms2_difference_in_da",
 323            "max_peak_num",
 324        ]
 325        fe_index_kws = {k: v for k, v in fe_kwargs.items() if k in fe_index_kws}
 326        fes.build_index(fe_lib, **fe_index_kws, clean_spectra=clean_spectra)
 327
 328        return fes
 329
 330
 331class MetabRefInterface(SpectralDatabaseInterface):
 332    """
 333    Interface to the Metabolomics Reference Database.
 334    """
 335
 336    def __init__(self):
 337        """
 338        Initialize instance.
 339
 340        """
 341
 342        super().__init__(key=None)
 343
 344    def spectrum_to_array(self, spectrum, normalize=True):
 345        """
 346        Convert MetabRef-formatted spectrum to array.
 347
 348        Parameters
 349        ----------
 350        spectrum : str
 351            MetabRef spectrum, i.e. list of (m/z,abundance) pairs.
 352        normalize : bool
 353            Normalize the spectrum by its magnitude.
 354
 355        Returns
 356        -------
 357        :obj:`~numpy.array`
 358            Array of shape (N, 2), with m/z in the first column and abundance in
 359            the second.
 360
 361        """
 362
 363        # Convert parenthesis-delimited string to array
 364        arr = np.array(
 365            re.findall(r"\(([^,]+),([^)]+)\)", spectrum), dtype=float
 366        ).reshape(-1, 2)
 367
 368        if normalize:
 369            arr = self.normalize_peaks(arr)
 370
 371        return arr
 372
 373    def _to_flashentropy(self, metabref_lib, normalize=True, fe_kwargs={}):
 374        """
 375        Convert metabref-formatted library to FlashEntropy library.
 376
 377        Parameters
 378        ----------
 379        metabref_lib : dict
 380            MetabRef MS2 library in JSON format or FlashEntropy search instance (for reformatting at different MS2 separation).
 381        normalize : bool
 382            Normalize each spectrum by its magnitude.
 383        fe_kwargs : dict, optional
 384            Keyword arguments for instantiation of FlashEntropy search and building index for FlashEntropy search;
 385            any keys not recognized will be ignored. By default, all parameters set to defaults.
 386
 387        Returns
 388        -------
 389        :obj:`~ms_entropy.FlashEntropySearch`
 390            MS2 library as FlashEntropy search instance.
 391
 392        Raises
 393        ------
 394        ValueError
 395            If "min_ms2_difference_in_da" or "max_ms2_tolerance_in_da" are present in `fe_kwargs` and they are not equal.
 396
 397        """
 398        self._check_flash_entropy_kwargs(fe_kwargs)
 399
 400        # Initialize empty library
 401        fe_lib = []
 402
 403        # Enumerate spectra
 404        for i, source in enumerate(metabref_lib):
 405            # Reorganize source dict, if necessary
 406            if "spectrum_data" in source.keys():
 407                spectrum = source["spectrum_data"]
 408            else:
 409                spectrum = source
 410
 411            # Rename precursor_mz key for FlashEntropy
 412            if "precursor_mz" not in spectrum.keys():
 413                spectrum["precursor_mz"] = spectrum.pop("precursor_ion")
 414
 415            # Convert CoreMS spectrum to array and clean, store as `peaks`
 416            spectrum["peaks"] = self.spectrum_to_array(
 417                spectrum["mz"], normalize=normalize
 418            )
 419
 420            # Add spectrum to library
 421            fe_lib.append(spectrum)
 422
 423        # Build FlashEntropy index
 424        fe_search = self._build_flash_entropy_index(fe_lib, fe_kwargs=fe_kwargs)
 425
 426        return fe_search
 427
 428    def get_query(self, url, use_header=False):
 429        """Overwrites the get_query method on the parent class to default to not use a header
 430
 431        Notes
 432        -----
 433        As of January 2025, the metabref database no longer requires a token and therefore no header is needed
 434
 435        """
 436        return super().get_query(url, use_header)
 437
 438
 439class MetabRefGCInterface(MetabRefInterface):
 440    """
 441    Interface to the Metabolomics Reference Database.
 442    """
 443
 444    def __init__(self):
 445        """
 446        Initialize instance.
 447
 448        """
 449
 450        super().__init__()
 451        self.GCMS_LIBRARY_URL = "https://metabref.emsl.pnnl.gov/api/mslevel/1"
 452        self.FAMES_URL = "https://metabref.emsl.pnnl.gov/api/fames"
 453
 454        self.__init_format_map__()
 455
 456    def __init_format_map__(self):
 457        """
 458        Initialize database format mapper, enabling multiple format requests.
 459
 460        """
 461
 462        # Define format workflows
 463        self.format_map = {
 464            "json": lambda x, normalize, fe_kwargs: x,
 465            "dict": lambda x,
 466            normalize,
 467            fe_kwargs: self._to_LowResolutionEICompound_dict(x, normalize),
 468            "sql": lambda x,
 469            normalize,
 470            fe_kwargs: self._LowResolutionEICompound_dict_to_sqlite(
 471                self._to_LowResolutionEICompound_dict(x, normalize)
 472            ),
 473        }
 474
 475        # Add aliases
 476        self.format_map["metabref"] = self.format_map["json"]
 477        self.format_map["datadict"] = self.format_map["dict"]
 478        self.format_map["data-dict"] = self.format_map["dict"]
 479        self.format_map["lowreseicompound"] = self.format_map["dict"]
 480        self.format_map["lowres"] = self.format_map["dict"]
 481        self.format_map["lowresgc"] = self.format_map["dict"]
 482        self.format_map["sqlite"] = self.format_map["sql"]
 483
 484    def available_formats(self):
 485        """
 486        View list of available formats.
 487
 488        Returns
 489        -------
 490        list
 491            Format map keys.
 492        """
 493
 494        return list(self.format_map.keys())
 495
 496    def get_library(self, format="json", normalize=False):
 497        """
 498        Request MetabRef GC/MS library.
 499
 500        Parameters
 501        ----------
 502        format : str
 503            Format of requested library, i.e. "json", "sql", "flashentropy".
 504            See `available_formats` method for aliases.
 505        normalize : bool
 506            Normalize the spectrum by its magnitude.
 507
 508        Returns
 509        -------
 510        Library in requested format.
 511
 512        """
 513
 514        # Init format function
 515        format_func = self._get_format_func(format)
 516
 517        return format_func(
 518            self.get_query(self.GCMS_LIBRARY_URL)["GC-MS"], normalize, {}
 519        )
 520
 521    def get_fames(self, format="json", normalize=False):
 522        """
 523        Request MetabRef GC/MS FAMEs library.
 524
 525        Parameters
 526        ----------
 527        format : str
 528            Format of requested library, i.e. "json", "sql", "flashentropy".
 529            See `available_formats` method for aliases.
 530        normalize : bool
 531            Normalize the spectrum by its magnitude.
 532
 533        Returns
 534        -------
 535        Library in requested format.
 536
 537        """
 538
 539        # Init format function
 540        format_func = self._get_format_func(format)
 541
 542        return format_func(self.get_query(self.FAMES_URL)["GC-MS"], normalize, {})
 543
 544    def _to_LowResolutionEICompound_dict(self, metabref_lib, normalize=False):
 545        """
 546        Convert MetabRef-formatted library to CoreMS LowResolutionEICompound-formatted
 547        dictionary for local ingestion.
 548
 549        Parameters
 550        ----------
 551        metabref_lib : dict
 552            MetabRef GC-MS library in JSON format.
 553        normalize : bool
 554            Normalize each spectrum by its magnitude.
 555
 556        Returns
 557        -------
 558        list of dict
 559            List of each spectrum contained in dictionary.
 560
 561        """
 562
 563        # All below key:value lookups are based on CoreMS class definitions
 564        # NOT MetabRef content. For example, MetabRef has keys for PubChem,
 565        # USI, etc. that are not considered below.
 566
 567        # Dictionary to map metabref keys to corems keys
 568        metadatar_cols = {
 569            "casno": "cas",
 570            "inchikey": "inchikey",
 571            "inchi": "inchi",
 572            "chebi": "chebi",
 573            "smiles": "smiles",
 574            "kegg": "kegg",
 575            "iupac_name": "iupac_name",
 576            "traditional_name": "traditional_name",  # Not present in metabref
 577            "common_name": "common_name",  # Not present in metabref
 578        }
 579
 580        # Dictionary to map metabref keys to corems keys
 581        lowres_ei_compound_cols = {
 582            "id": "metabref_id",
 583            "molecule_name": "name",  # Is this correct?
 584            "classify": "classify",  # Not present in metabref
 585            "formula": "formula",
 586            "ri": "ri",
 587            "rt": "retention_time",
 588            "source": "source",  # Not present in metabref
 589            "casno": "casno",
 590            "comments": "comment",
 591            "source_temp_c": "source_temp_c",  # Not present in metabref
 592            "ev": "ev",  # Not present in metabref
 593            "peak_count": "peaks_count",
 594            "mz": "mz",
 595            "abundance": "abundance",
 596        }
 597
 598        # Local result container
 599        corems_lib = []
 600
 601        # Enumerate spectra
 602        for i, source_ in enumerate(metabref_lib):
 603            # Copy source to prevent modification
 604            source = source_.copy()
 605
 606            # Flatten source dict
 607            source = source.pop("spectrum_data") | source
 608
 609            # Parse target data
 610            target = {
 611                lowres_ei_compound_cols[k]: v
 612                for k, v in source.items()
 613                if k in lowres_ei_compound_cols
 614            }
 615
 616            # Explicitly add this to connect with LowResCompoundRef later
 617            target["rt"] = source["rt"]
 618
 619            # Parse (mz, abundance)
 620            arr = self.spectrum_to_array(target["mz"], normalize=normalize)
 621            target["mz"] = arr[:, 0]
 622            target["abundance"] = arr[:, 1]
 623
 624            # Parse meta data
 625            target["metadata"] = {
 626                metadatar_cols[k]: v for k, v in source.items() if k in metadatar_cols
 627            }
 628
 629            # Add anything else
 630            for k in source:
 631                if k not in lowres_ei_compound_cols:
 632                    target[k] = source[k]
 633
 634            # Add to CoreMS list
 635            corems_lib.append(target)
 636
 637        return corems_lib
 638
 639    def _LowResolutionEICompound_dict_to_sqlite(
 640        self, lowres_ei_compound_dict, url="sqlite://"
 641    ):
 642        """
 643        Convert CoreMS LowResolutionEICompound-formatted dictionary to SQLite
 644        database for local ingestion.
 645
 646        Parameters
 647        ----------
 648        lowres_ei_compound_dict : dict
 649            CoreMS GC-MS library formatted for LowResolutionEICompound.
 650        url : str
 651            URL to SQLite prefix.
 652
 653        Returns
 654        -------
 655        sqlite database
 656            Spectra contained in SQLite database.
 657
 658        """
 659
 660        # Dictionary to map corems keys to all-caps keys
 661        capped_cols = {
 662            "name": "NAME",
 663            "formula": "FORM",
 664            "ri": "RI",
 665            "retention_time": "RT",
 666            "source": "SOURCE",
 667            "casno": "CASNO",
 668            "comment": "COMMENT",
 669            "peaks_count": "NUM PEAKS",
 670        }
 671
 672        # Initialize SQLite object
 673        sqlite_obj = EI_LowRes_SQLite(url=url)
 674
 675        # Iterate spectra
 676        for _data_dict in lowres_ei_compound_dict:
 677            # Copy source to prevent modification
 678            data_dict = _data_dict.copy()
 679
 680            # Add missing capped values
 681            for k, v in capped_cols.items():
 682                # Key exists
 683                if k in data_dict:
 684                    # # This will replace the key
 685                    # data_dict[v] = data_dict.pop(k)
 686
 687                    # This will keep both keys
 688                    data_dict[v] = data_dict[k]
 689
 690            # Parse number of peaks
 691            if not data_dict.get("NUM PEAKS"):
 692                data_dict["NUM PEAKS"] = len(data_dict.get("mz"))
 693
 694            # Parse CAS number
 695            if not data_dict.get("CASNO"):
 696                data_dict["CASNO"] = data_dict.get("CAS")
 697
 698            if not data_dict["CASNO"]:
 699                data_dict["CASNO"] = 0
 700
 701            # Build linked metadata table
 702            if "metadata" in data_dict:
 703                if len(data_dict["metadata"]) > 0:
 704                    data_dict["metadatar"] = Metadatar(**data_dict.pop("metadata"))
 705                else:
 706                    data_dict.pop("metadata")
 707
 708            # Attempt addition to sqlite
 709            try:
 710                sqlite_obj.add_compound(data_dict)
 711            except:
 712                print(data_dict["NAME"])
 713
 714        return sqlite_obj
 715
 716
 717class MetabRefLCInterface(MetabRefInterface):
 718    """
 719    Interface to the Metabolomics Reference Database for LC-MS data.
 720    """
 721
 722    def __init__(self):
 723        """
 724        Initialize instance.
 725
 726        """
 727
 728        super().__init__()
 729
 730        # API endpoint for precursor m/z search
 731        # inputs = mz, tolerance (in Da), polarity, page_no, per_page
 732        self.PRECURSOR_MZ_URL = "https://metabref.emsl.pnnl.gov/api/precursors/m/{}/t/{}/{}?page={}&per_page={}"
 733
 734        # API endpoint for returning full list of precursor m/z values in database
 735        # inputs = polarity, page_no, per_page
 736        self.PRECURSOR_MZ_ALL_URL = (
 737            "https://metabref.emsl.pnnl.gov/api/precursors/{}?page={}&per_page={}"
 738        )
 739
 740        self.__init_format_map__()
 741
 742    def __init_format_map__(self):
 743        """
 744        Initialize database format mapper, enabling multiple format requests.
 745
 746        """
 747
 748        # Define format workflows
 749        self.format_map = {
 750            "json": lambda x, normalize, fe_kwargs: x,
 751            "flashentropy": lambda x, normalize, fe_kwargs: self._to_flashentropy(
 752                x, normalize, fe_kwargs
 753            ),
 754        }
 755
 756        # Add aliases
 757        self.format_map["metabref"] = self.format_map["json"]
 758        self.format_map["fe"] = self.format_map["flashentropy"]
 759        self.format_map["flash-entropy"] = self.format_map["flashentropy"]
 760
 761    def query_by_precursor(
 762        self, mz_list, polarity, mz_tol_ppm, mz_tol_da_api=0.2, max_per_page=50
 763    ):
 764        """
 765        Query MetabRef by precursor m/z values.
 766
 767        Parameters
 768        ----------
 769        mz_list : list
 770            List of precursor m/z values.
 771        polarity : str
 772            Ionization polarity, either "positive" or "negative".
 773        mz_tol_ppm : float
 774            Tolerance in ppm for each precursor m/z value.
 775            Used for retrieving from a potential match from database.
 776        mz_tol_da_api : float, optional
 777            Maximum tolerance between precursor m/z values for API search, in daltons.
 778            Used to group similar mzs into a single API query for speed. Default is 0.2.
 779        max_per_page : int, optional
 780            Maximum records to return from MetabRef API query at a time.  Default is 50.
 781
 782        Returns
 783        -------
 784        list
 785            List of library entries in original JSON format.
 786        """
 787
 788        # If polarity is anything other than positive or negative, raise error
 789        if polarity not in ["positive", "negative"]:
 790            raise ValueError("Polarity must be 'positive' or 'negative'")
 791
 792        # Cluster groups of mz according to mz_tol_da_api for precursor query
 793        mz_list.sort()
 794        mz_groups = [[mz_list[0]]]
 795        for x in mz_list[1:]:
 796            if abs(x - mz_groups[-1][0]) <= mz_tol_da_api:
 797                mz_groups[-1].append(x)
 798            else:
 799                mz_groups.append([x])
 800
 801        # Query MetabRef for each mz group
 802        lib = []
 803        for mz_group in mz_groups:
 804            mz = np.mean(mz_group)
 805            if len(mz_group) == 1:
 806                mz = mz_group[0]
 807                tol = mz_tol_ppm * 10**-6 * mz
 808            else:
 809                mz = (max(mz_group) - min(mz_group)) / 2 + min(mz_group)
 810                tol = (max(mz_group) - min(mz_group)) / 2 + mz_tol_ppm**-6 * max(
 811                    mz_group
 812                )
 813
 814            # Get first page of results
 815            response = self.get_query(
 816                self.PRECURSOR_MZ_URL.format(
 817                    str(mz), str(tol), polarity, 1, max_per_page
 818                )
 819            )
 820            lib = lib + response["results"]
 821
 822            # If there are more pages of results, get them
 823            if response["total_pages"] > 1:
 824                for i in np.arange(2, response["total_pages"] + 1):
 825                    lib = (
 826                        lib
 827                        + self.get_query(
 828                            self.PRECURSOR_MZ_URL.format(
 829                                str(mz), str(tol), polarity, i, max_per_page
 830                            )
 831                        )["results"]
 832                    )
 833
 834        return lib
 835
 836    def request_all_precursors(self, polarity, per_page=50000):
 837        """
 838        Request all precursor m/z values for MS2 spectra from MetabRef.
 839
 840        Parameters
 841        ----------
 842        polarity : str
 843            Ionization polarity, either "positive" or "negative".
 844        per_page : int, optional
 845            Number of records to fetch per call. Default is 50000
 846
 847        Returns
 848        -------
 849        list
 850            List of all precursor m/z values, sorted.
 851        """
 852        # If polarity is anything other than positive or negative, raise error
 853        if polarity not in ["positive", "negative"]:
 854            raise ValueError("Polarity must be 'positive' or 'negative'")
 855
 856        precursors = []
 857
 858        # Get first page of results and total number of pages of results
 859        response = self.get_query(
 860            self.PRECURSOR_MZ_ALL_URL.format(polarity, str(1), str(per_page))
 861        )
 862        total_pages = response["total_pages"]
 863        precursors.extend([x["precursor_ion"] for x in response["results"]])
 864
 865        # Go through remaining pages of results
 866        for i in np.arange(2, total_pages + 1):
 867            response = self.get_query(
 868                self.PRECURSOR_MZ_ALL_URL.format(polarity, str(i), str(per_page))
 869            )
 870            precursors.extend([x["precursor_ion"] for x in response["results"]])
 871
 872        # Sort precursors from smallest to largest and remove duplicates
 873        precursors = list(set(precursors))
 874        precursors.sort()
 875
 876        return precursors
 877
 878    def get_lipid_library(
 879        self,
 880        mz_list,
 881        polarity,
 882        mz_tol_ppm,
 883        mz_tol_da_api=0.2,
 884        format="json",
 885        normalize=True,
 886        fe_kwargs={},
 887    ):
 888        """
 889        Request MetabRef lipid library.
 890
 891        Parameters
 892        ----------
 893        mz_list : list
 894            List of precursor m/z values.
 895        polarity : str
 896            Ionization polarity, either "positive" or "negative".
 897        mz_tol_ppm : float
 898            Tolerance in ppm for each precursor m/z value.
 899            Used for retrieving from a potential match from database.
 900        mz_tol_da_api : float, optional
 901            Maximum tolerance between precursor m/z values for API search, in daltons.
 902            Used to group similar mzs into a single API query for speed. Default is 0.2.
 903        format : str, optional
 904            Format of requested library, i.e. "json", "sql", "flashentropy".
 905            See `available_formats` method for aliases. Default is "json".
 906        normalize : bool, optional
 907            Normalize the spectrum by its magnitude. Default is True.
 908        fe_kwargs : dict, optional
 909            Keyword arguments for FlashEntropy search. Default is {}.
 910
 911        Returns
 912        -------
 913        tuple
 914            Library in requested format and lipid metadata as a LipidMetadata dataclass.
 915
 916        """
 917        mz_list.sort()
 918        mz_list = np.array(mz_list)
 919
 920        # Get all precursors in the library matching the polarity
 921        precusors_in_lib = self.request_all_precursors(polarity=polarity)
 922        precusors_in_lib = np.array(precusors_in_lib)
 923
 924        # Compare the mz_list with the precursors in the library, keep any mzs that are within mz_tol of any precursor in the library
 925        lib_mz_df = pd.DataFrame(precusors_in_lib, columns=["lib_mz"])
 926        lib_mz_df["closest_obs_mz"] = mz_list[
 927            find_closest(mz_list, lib_mz_df.lib_mz.values)
 928        ]
 929        lib_mz_df["mz_diff_ppm"] = np.abs(
 930            (lib_mz_df["lib_mz"] - lib_mz_df["closest_obs_mz"])
 931            / lib_mz_df["lib_mz"]
 932            * 1e6
 933        )
 934        lib_mz_sub = lib_mz_df[lib_mz_df["mz_diff_ppm"] <= mz_tol_ppm]
 935
 936        # Do the same in the opposite direction
 937        mz_df = pd.DataFrame(mz_list, columns=["mass_feature_mz"])
 938        mz_df["closest_lib_pre_mz"] = precusors_in_lib[
 939            find_closest(precusors_in_lib, mz_df.mass_feature_mz.values)
 940        ]
 941        mz_df["mz_diff_ppm"] = np.abs(
 942            (mz_df["mass_feature_mz"] - mz_df["closest_lib_pre_mz"])
 943            / mz_df["mass_feature_mz"]
 944            * 1e6
 945        )
 946        mz_df_sub = mz_df[mz_df["mz_diff_ppm"] <= mz_tol_ppm]
 947
 948        # Evaluate which is fewer mzs - lib_mz_sub or mz_df_sub and use that as the input for next step
 949        if len(lib_mz_sub) < len(mz_df_sub):
 950            mzs_to_query = lib_mz_sub.lib_mz.values
 951        else:
 952            mzs_to_query = mz_df_sub.mass_feature_mz.values
 953
 954        # Query the library for the precursors in the mz_list that are in the library to retrieve the spectra and metadata
 955        lib = self.query_by_precursor(
 956            mz_list=mzs_to_query,
 957            polarity=polarity,
 958            mz_tol_ppm=mz_tol_ppm,
 959            mz_tol_da_api=mz_tol_da_api,
 960        )
 961
 962        # Pull out lipid metadata from the metabref library and convert to LipidMetadata dataclass
 963        mol_data_dict = {x["id"]: x["Molecular Data"] for x in lib}
 964        lipid_lib = {x["id"]: x["Lipid Tree"] for x in lib if "Lipid Tree" in x.keys()}
 965        mol_data_dict = {k: {**v, **lipid_lib[k]} for k, v in mol_data_dict.items()}
 966        mol_data_dict = {
 967            k: self._dict_to_dataclass(v, LipidMetadata)
 968            for k, v in mol_data_dict.items()
 969        }
 970
 971        # Remove lipid metadata from the metabref library
 972        lib = [
 973            {k: v for k, v in x.items() if k not in ["Molecular Data", "Lipid Tree"]}
 974            for x in lib
 975        ]
 976        # Unpack the 'Lipid Fragments' key and the 'MSO Data" key from each entry
 977        for x in lib:
 978            if "Lipid Fragments" in x.keys():
 979                x.update(x.pop("Lipid Fragments"))
 980            if "MSO Data" in x.keys():
 981                x.update(x.pop("MSO Data"))
 982
 983        # Format the spectral library
 984        format_func = self._get_format_func(format)
 985        lib = format_func(lib, normalize=normalize, fe_kwargs=fe_kwargs)
 986        return (lib, mol_data_dict)
 987
 988
 989class MSPInterface(SpectralDatabaseInterface):
 990    """
 991    Interface to parse NIST MSP files
 992    """
 993
 994    def __init__(self, file_path):
 995        """
 996        Initialize instance.
 997
 998        Parameters
 999        ----------
1000        file_path : str
1001            Path to a local MSP file.
1002
1003        Attributes
1004        ----------
1005        file_path : str
1006            Path to the MSP file.
1007        _file_content : str
1008            Content of the MSP file.
1009        _data_frame : :obj:`~pandas.DataFrame`
1010            DataFrame of spectra from the MSP file with unaltered content.
1011        """
1012        super().__init__(key=None)
1013
1014        self.file_path = file_path
1015        if not os.path.exists(self.file_path):
1016            raise FileNotFoundError(
1017                f"File {self.file_path} does not exist. Please check the file path."
1018            )
1019        with open(self.file_path, "r") as f:
1020            self._file_content = f.read()
1021
1022        self._data_frame = self._read_msp_file()
1023        self.__init_format_map__()
1024
1025    def __init_format_map__(self):
1026        """
1027        Initialize database format mapper, enabling multiple format requests.
1028
1029        """
1030
1031        # x is a pandas dataframe similar to self._data_frame format
1032        # Define format workflows
1033        self.format_map = {
1034            "msp": lambda x, normalize, fe_kwargs: self._to_msp(x, normalize),
1035            "flashentropy": lambda x, normalize, fe_kwargs: self._to_flashentropy(
1036                x, normalize, fe_kwargs
1037            ),
1038            "df": lambda x, normalize, fe_kwargs: self._to_df(x, normalize),
1039        }
1040
1041        # Add aliases
1042        self.format_map["fe"] = self.format_map["flashentropy"]
1043        self.format_map["flash-entropy"] = self.format_map["flashentropy"]
1044        self.format_map["dataframe"] = self.format_map["df"]
1045        self.format_map["data-frame"] = self.format_map["df"]
1046
1047    def _read_msp_file(self):
1048        """
1049        Reads the MSP files into the pandas dataframe, and sort/remove zero intensity ions in MS/MS spectra.
1050
1051        Returns
1052        -------
1053        :obj:`~pandas.DataFrame`
1054            DataFrame of spectra from the MSP file, exacly as it is in the file (no sorting, filtering etc)
1055        """
1056        # If input_dataframe is provided, return it it
1057        spectra = []
1058        spectrum = {}
1059
1060        f = StringIO(self._file_content)
1061        for line in f:
1062            line = line.strip()
1063            if not line:
1064                continue  # Skip empty lines
1065
1066            # Handle metadata
1067            if ":" in line:
1068                key, value = line.split(":", 1)
1069                key = key.strip().lower()
1070                value = value.strip()
1071
1072                if key == "name":
1073                    # Save current spectrum and start a new one
1074                    if spectrum:
1075                        spectra.append(spectrum)
1076                    spectrum = {"name": value, "peaks": []}
1077                else:
1078                    spectrum[key] = value
1079
1080            # Handle peak data (assumed to start with a number)
1081            elif line[0].isdigit():
1082                peaks = line.split()
1083                m_z = float(peaks[0])
1084                intensity = float(peaks[1])
1085                spectrum["peaks"].append(([m_z, intensity]))
1086        # Save the last spectrum
1087        if spectrum:
1088            spectra.append(spectrum)
1089
1090        df = pd.DataFrame(spectra)
1091        for column in df.columns:
1092            if column != "peaks":  # Skip 'peaks' column
1093                try:
1094                    df[column] = pd.to_numeric(df[column], errors="raise")
1095                except:
1096                    pass
1097        return df
1098
1099    def _to_df(self, input_dataframe, normalize=True):
1100        """
1101        Convert MSP-derived library to FlashEntropy library. 
1102
1103        Parameters
1104        ----------
1105        input_dataframe : :obj:`~pandas.DataFrame`
1106            Input DataFrame containing MSP-formatted spectra.
1107        normalize : bool, optional
1108            Normalize each spectrum by its magnitude.
1109            Default is True.
1110
1111        Returns
1112        -------
1113        :obj:`~pandas.DataFrame`
1114            DataFrame of with desired normalization
1115        """
1116        if not normalize:
1117            return input_dataframe
1118        else:
1119            # Convert to dictionary
1120            db_dict = input_dataframe.to_dict(orient="records")
1121
1122            # Initialize empty library
1123            lib = []
1124
1125            # Enumerate spectra
1126            for i, source in enumerate(db_dict):
1127                spectrum = source
1128                # Check that spectrum["peaks"] exists
1129                if "peaks" not in spectrum.keys():
1130                    raise KeyError(
1131                        "MSP not interpretted correctly, 'peaks' key not found in spectrum, check _dataframe attribute."
1132                    )
1133
1134                # Convert spectrum["peaks"] to numpy array
1135                if not isinstance(spectrum["peaks"], np.ndarray):
1136                    spectrum["peaks"] = np.array(spectrum["peaks"])
1137
1138                # Normalize peaks, if requested
1139                if normalize:
1140                    spectrum["peaks"] = self.normalize_peaks(spectrum["peaks"])
1141                    spectrum["num peaks"] = len(spectrum["peaks"])
1142
1143                # Add spectrum to library
1144                lib.append(spectrum)
1145            
1146            # Convert to DataFrame
1147            df = pd.DataFrame(lib)
1148            return df
1149    
1150    def _to_flashentropy(self, input_dataframe, normalize=True, fe_kwargs={}):
1151        """
1152        Convert MSP-derived library to FlashEntropy library.
1153
1154        Parameters
1155        ----------
1156        input_dataframe : :obj:`~pandas.DataFrame`
1157            Input DataFrame containing MSP-formatted spectra.
1158        normalize : bool
1159            Normalize each spectrum by its magnitude.
1160        fe_kwargs : dict, optional
1161            Keyword arguments for instantiation of FlashEntropy search and building index for FlashEntropy search;
1162            any keys not recognized will be ignored. By default, all parameters set to defaults.
1163
1164        Returns
1165        -------
1166        :obj:`~ms_entropy.FlashEntropySearch`
1167            MS2 library as FlashEntropy search instance.
1168
1169        Raises
1170        ------
1171        ValueError
1172            If "min_ms2_difference_in_da" or "max_ms2_tolerance_in_da" are present in `fe_kwargs` and they
1173        """
1174        self._check_flash_entropy_kwargs(fe_kwargs)
1175
1176        db_df = input_dataframe
1177
1178        # Convert to dictionary
1179        db_dict = db_df.to_dict(orient="records")
1180
1181        # Initialize empty library
1182        fe_lib = []
1183
1184        # Enumerate spectra
1185        for i, source in enumerate(db_dict):
1186            # Reorganize source dict, if necessary
1187            if "spectrum_data" in source.keys():
1188                spectrum = source["spectrum_data"]
1189            else:
1190                spectrum = source
1191
1192            # Rename precursor_mz key for FlashEntropy
1193            if "precursor_mz" not in spectrum.keys():
1194                if "precursormz" in spectrum:
1195                    spectrum["precursor_mz"] = spectrum.pop("precursormz")
1196                elif "precursor_ion" in spectrum:
1197                    spectrum["precursor_mz"] = spectrum.pop("precursor_ion")
1198                else:
1199                    raise KeyError(
1200                        "MSP must have either 'precursormz' or 'precursor_ion' key to be converted to FlashEntropy format."
1201                    )
1202
1203            # Check that spectrum["peaks"] exists
1204            if "peaks" not in spectrum.keys():
1205                raise KeyError(
1206                    "MSP not interpretted correctly, 'peaks' key not found in spectrum, check _dataframe attribute."
1207                )
1208
1209            # Convert spectrum["peaks"] to numpy array
1210            if not isinstance(spectrum["peaks"], np.ndarray):
1211                spectrum["peaks"] = np.array(spectrum["peaks"])
1212
1213            # Normalize peaks, if requested
1214            if normalize:
1215                spectrum["peaks"] = self.normalize_peaks(spectrum["peaks"])
1216
1217            # Add spectrum to library
1218            fe_lib.append(spectrum)
1219
1220        # Build FlashEntropy index
1221        fe_search = self._build_flash_entropy_index(fe_lib, fe_kwargs=fe_kwargs)
1222
1223        return fe_search
1224    
1225    def _check_msp_compatibility(self):
1226        """
1227        Check if the MSP file is compatible with the get_metabolomics_spectra_library method and provide feedback if it is not.
1228        """
1229        # Check polarity
1230        if (
1231            "polarity" not in self._data_frame.columns
1232            and "ionmode" not in self._data_frame.columns
1233        ):
1234            raise ValueError(
1235                "Neither 'polarity' nor 'ionmode' columns found in the input MSP metadata. Please check the file."
1236            )
1237        polarity_column = (
1238            "polarity" if "polarity" in self._data_frame.columns else "ionmode"
1239        )
1240
1241        # Check if polarity_column contents is either "positive" or "negative"
1242        if not all(self._data_frame[polarity_column].isin(["positive", "negative"])):
1243            raise ValueError(
1244                f"Input field on MSP '{polarity_column}' must contain only 'positive' or 'negative' values."
1245            )
1246
1247        # Check if the MSP file contains the required columns for metabolite metadata
1248        # inchikey, by name, not null
1249        # either formula or molecular_formula, not null
1250        if not all(self._data_frame["inchikey"].notnull()):
1251            raise ValueError(
1252                "Input field on MSP 'inchikey' must contain only non-null values."
1253            )
1254        if (
1255            "formula" not in self._data_frame.columns
1256            and "molecular_formula" not in self._data_frame.columns
1257        ):
1258            raise ValueError(
1259                "Input field on MSP must contain either 'formula' or 'molecular_formula' columns."
1260            )
1261        molecular_formula_column = (
1262            "formula" if "formula" in self._data_frame.columns else "molecular_formula"
1263        )
1264        if not all(self._data_frame[molecular_formula_column].notnull()):
1265            raise ValueError(
1266                f"Input field on MSP '{molecular_formula_column}' must contain only non-null values."
1267            )
1268
1269    def get_metabolomics_spectra_library(
1270        self,
1271        polarity,
1272        metabolite_metadata_mapping={},
1273        format="fe",
1274        normalize=True,
1275        fe_kwargs={},
1276    ):
1277        """
1278        Prepare metabolomics spectra library and associated metabolite metadata
1279
1280        Note: this uses the inchikey as the index for the metabolite metadata dataframe and for connecting to the spectra, so it must be in the input
1281
1282        """
1283        # Check if the MSP file is compatible with the get_metabolomics_spectra_library method
1284        self._check_msp_compatibility()
1285
1286        # Check if the polarity parameter is valid and if a polarity column exists in the dataframe
1287        if polarity not in ["positive", "negative"]:
1288            raise ValueError("Polarity must be 'positive' or 'negative'")
1289        polarity_column = (
1290            "polarity" if "polarity" in self._data_frame.columns else "ionmode"
1291        )
1292
1293        # Get a subset of the initial dataframea by polarity
1294        db_df = self._data_frame[self._data_frame[polarity_column] == polarity].copy()
1295
1296        # Rename the columns of the db_df to match the MetaboliteMetadata dataclass using the metabolite_metadata_mapping
1297        # If the mapping is not provided, use the default mapping
1298        if not metabolite_metadata_mapping:
1299            metabolite_metadata_mapping = {
1300                "chebi_id": "chebi",
1301                "kegg_id": "kegg",
1302                "refmet_name": "common_name",
1303                "molecular_formula": "formula",
1304                "gnps_spectra_id":"id",
1305                "precursormz": "precursor_mz",
1306                "precursortype":"ion_type"
1307            }
1308        db_df.rename(columns=metabolite_metadata_mapping, inplace=True)
1309        db_df["molecular_data_id"] = db_df["inchikey"]
1310
1311
1312
1313        # Check if the resulting dataframe has the required columns for the flash entropy search
1314        required_columns = ["molecular_data_id", "precursor_mz", "ion_type", "id"]
1315        for col in required_columns:
1316            if col not in db_df.columns:
1317                raise ValueError(
1318                    f"Input field on MSP must contain '{col}' column for FlashEntropy search."
1319                )
1320
1321        # Pull out the metabolite metadata from the dataframe and put it into a different dataframe
1322        # First get a list of the possible attributes of the MetaboliteMetadata dataclass
1323        metabolite_metadata_keys = list(MetaboliteMetadata.__annotations__.keys())
1324        # Replace id with molecular_data_id in metabolite_metadata_keys
1325        metabolite_metadata_keys = [
1326            "molecular_data_id" if x == "id" else x for x in metabolite_metadata_keys
1327        ]
1328        metabolite_metadata_df = db_df[
1329            db_df.columns[db_df.columns.isin(metabolite_metadata_keys)]
1330        ].copy()
1331
1332        # Make unique and recast the id column for metabolite metadata
1333        metabolite_metadata_df.drop_duplicates(subset=["molecular_data_id"], inplace=True)
1334        metabolite_metadata_df["id"] = metabolite_metadata_df["molecular_data_id"]
1335
1336        # Convert to a dictionary using the inchikey as the key
1337        metabolite_metadata_dict = metabolite_metadata_df.to_dict(
1338            orient="records"
1339        )
1340        metabolite_metadata_dict = {
1341            v["id"]: self._dict_to_dataclass(v, MetaboliteMetadata)
1342            for v in metabolite_metadata_dict
1343        }
1344
1345        # Remove the metabolite metadata columns from the original dataframe
1346        for key in metabolite_metadata_keys:
1347            if key != "molecular_data_id":
1348                if key in db_df.columns:
1349                    db_df.drop(columns=key, inplace=True)
1350
1351        # Format the spectral library
1352        format_func = self._get_format_func(format)
1353        lib = format_func(db_df, normalize=normalize, fe_kwargs=fe_kwargs)
1354        return (lib, metabolite_metadata_dict)
class SpectralDatabaseInterface(abc.ABC):
 22class SpectralDatabaseInterface(ABC):
 23    """
 24    Base class that facilitates connection to spectral reference databases,
 25    such as EMSL's Metabolomics Reference Database (MetabRef).
 26
 27    """
 28
 29    def __init__(self, key=None):
 30        """
 31        Initialize instance.
 32
 33        Parameters
 34        ----------
 35        key : str
 36            Token key.
 37
 38        """
 39
 40        self.key = key
 41
 42    def set_token(self, path):
 43        """
 44        Set environment variable for MetabRef database token.
 45
 46        Parameters
 47        ----------
 48        path : str
 49            Path to token.
 50
 51        """
 52
 53        # Read token from file
 54        with open(path, "r", encoding="utf-8") as f:
 55            token = f.readline().strip()
 56
 57        # Set environment variable
 58        os.environ[self.key] = token
 59
 60    def get_token(self):
 61        """
 62        Get environment variable for database token.
 63
 64        Returns
 65        -------
 66        str
 67            Token string.
 68
 69        """
 70
 71        # Check for token
 72        if self.key not in os.environ:
 73            raise ValueError("Must set {} environment variable.".format(self.key))
 74
 75        # Get token from environment variables
 76        return os.environ.get(self.key)
 77
 78    def get_header(self):
 79        """
 80        Access stored database token and prepare as header.
 81
 82        Returns
 83        -------
 84        str
 85            Header string.
 86
 87        """
 88
 89        # Get token
 90        token = self.get_token()
 91
 92        # Pad header information
 93        header = {"Authorization": f"Bearer {token}", "Content-Type": "text/plain"}
 94
 95        return header
 96
 97    def get_query(self, url, use_header=True):
 98        """
 99        Request payload from URL according to `get` protocol.
100
101        Parameters
102        ----------
103        url : str
104            URL for request.
105        use_header: bool
106            Whether or not the query should include the header
107
108        Returns
109        -------
110        dict
111            Response as JSON.
112
113        """
114
115        # Query URL via `get`
116        if use_header:
117            response = requests.get(url, headers=self.get_header())
118        else:
119            response = requests.get(url)
120
121        # Check response
122        response.raise_for_status()
123
124        # Return as JSON
125        return response.json()
126
127    def post_query(self, url, variable, values, tolerance):
128        """
129        Request payload from URL according to `post` protocol.
130
131        Parameters
132        ----------
133        url : str
134            URL for request.
135        variable : str
136            Variable to query.
137        values : str
138            Specific values of `variable` to query.
139        tolerance : str
140            Query tolerance relative to `values`.
141
142        Returns
143        -------
144        dict
145            Response as JSON.
146
147        """
148
149        # Coerce to string
150        if not isinstance(variable, str):
151            variable = str(variable).replace(" ", "")
152
153        if not isinstance(values, str):
154            values = str(values).replace(" ", "")
155
156        if not isinstance(tolerance, str):
157            tolerance = str(tolerance).replace(" ", "")
158
159        # Query URL via `post`
160        response = requests.post(
161            os.path.join(url, variable, tolerance),
162            data=values,
163            headers=self.get_header(),
164        )
165
166        # Check response
167        response.raise_for_status()
168
169        # Return as JSON
170        return response.json()
171
172    def _check_flash_entropy_kwargs(self, fe_kwargs):
173        """
174        Check FlashEntropy keyword arguments.
175
176        Parameters
177        ----------
178        fe_kwargs : dict
179            Keyword arguments for FlashEntropy search.
180
181
182        Raises
183        ------
184        ValueError
185            If "min_ms2_difference_in_da" or "max_ms2_tolerance_in_da" are present in `fe_kwargs` and they
186            are not equal.
187
188        """
189        # If "min_ms2_difference_in_da" in fe_kwargs, check that "max_ms2_tolerance_in_da" is also present and that min_ms2_difference_in_da = 2xmax_ms2_tolerance_in_da
190        if (
191            "min_ms2_difference_in_da" in fe_kwargs
192            or "max_ms2_tolerance_in_da" in fe_kwargs
193        ):
194            if (
195                "min_ms2_difference_in_da" not in fe_kwargs
196                or "max_ms2_tolerance_in_da" not in fe_kwargs
197            ):
198                raise ValueError(
199                    "Both 'min_ms2_difference_in_da' and 'max_ms2_tolerance_in_da' must be specified."
200                )
201            if (
202                fe_kwargs["min_ms2_difference_in_da"]
203                != 2 * fe_kwargs["max_ms2_tolerance_in_da"]
204            ):
205                raise ValueError(
206                    "The values of 'min_ms2_difference_in_da' must be exactly 2x 'max_ms2_tolerance_in_da'."
207                )
208
209    def _get_format_func(self, format):
210        """
211        Obtain format function by key.
212
213        Returns
214        -------
215        func
216            Formatting function.
217        """
218
219        if format.lower() in self.format_map.keys():
220            return self.format_map[format.lower()]
221
222        raise ValueError(("{} not a supported format.").format(format))
223
224    def _dict_to_dataclass(self, metabref_lib, data_class):
225        """
226        Convert dictionary to dataclass.
227
228        Notes
229        -----
230        This function will pull the attributes a dataclass and its parent class
231        and convert the dictionary to a dataclass instance with the appropriate
232        attributes.
233
234        Parameters
235        ----------
236        data_class : :obj:`~dataclasses.dataclass`
237            Dataclass to convert to.
238        metabref_lib : dict
239            Metabref dictionary object to convert to dataclass.
240
241        Returns
242        -------
243        :obj:`~dataclasses.dataclass`
244            Dataclass instance.
245
246        """
247
248        # Get list of expected attributes of data_class
249        data_class_keys = list(data_class.__annotations__.keys())
250
251        # Does the data_class inherit from another class, if so, get the attributes of the parent class as well
252        if len(data_class.__mro__) > 2:
253            parent_class_keys = list(data_class.__bases__[0].__annotations__.keys())
254            data_class_keys = list(set(data_class_keys + parent_class_keys))
255
256        # Remove keys that are not in the data_class from the input dictionary
257        input_dict = {k: v for k, v in metabref_lib.items() if k in data_class_keys}
258
259        # Add keys that are in the data class but not in the input dictionary as None
260        for key in data_class_keys:
261            if key not in input_dict.keys():
262                input_dict[key] = None
263        return data_class(**input_dict)
264
265    @staticmethod
266    def normalize_peaks(arr):
267        """
268        Normalize peaks in an array.
269
270        Parameters
271        ----------
272        arr : :obj:`~numpy.array`
273            Array of shape (N, 2), with m/z in the first column and abundance in
274            the second.
275
276        Returns
277        -------
278        :obj:`~numpy.array`
279            Normalized array of shape (N, 2), with m/z in the first column and
280            normalized abundance in the second.
281        """
282        # Normalize the array
283        arr[:, -1] = arr[:, -1] / arr[:, -1].sum()
284
285        return arr
286
287    @staticmethod
288    def _build_flash_entropy_index(fe_lib, fe_kwargs={}, clean_spectra=True):
289        """
290        Build FlashEntropy index.
291
292        Parameters
293        ----------
294        fe_lib : list
295            List of spectra to build index from. Can be a list of dictionaries or
296            a FlashEntropy search instance.
297        fe_kwargs : dict, optional
298            Keyword arguments for FlashEntropy search.
299        clean_spectra : bool, optional
300            Clean spectra before building index. Default is True.
301
302        Returns
303        -------
304        :obj:`~ms_entropy.FlashEntropySearch`
305            FlashEntropy search instance.
306
307        """
308        # Initialize FlashEntropy
309        fe_init_kws = [
310            "max_ms2_tolerance_in_da",
311            "mz_index_step",
312            "low_memory",
313            "path_data",
314        ]
315        fe_init_kws = {k: v for k, v in fe_kwargs.items() if k in fe_init_kws}
316        fes = FlashEntropySearch(**fe_init_kws)
317
318        # Build FlashEntropy index
319        fe_index_kws = [
320            "max_indexed_mz",
321            "precursor_ions_removal_da",
322            "noise_threshold",
323            "min_ms2_difference_in_da",
324            "max_peak_num",
325        ]
326        fe_index_kws = {k: v for k, v in fe_kwargs.items() if k in fe_index_kws}
327        fes.build_index(fe_lib, **fe_index_kws, clean_spectra=clean_spectra)
328
329        return fes

Base class that facilitates connection to spectral reference databases, such as EMSL's Metabolomics Reference Database (MetabRef).

SpectralDatabaseInterface(key=None)
29    def __init__(self, key=None):
30        """
31        Initialize instance.
32
33        Parameters
34        ----------
35        key : str
36            Token key.
37
38        """
39
40        self.key = key

Initialize instance.

Parameters
  • key (str): Token key.
key
def set_token(self, path):
42    def set_token(self, path):
43        """
44        Set environment variable for MetabRef database token.
45
46        Parameters
47        ----------
48        path : str
49            Path to token.
50
51        """
52
53        # Read token from file
54        with open(path, "r", encoding="utf-8") as f:
55            token = f.readline().strip()
56
57        # Set environment variable
58        os.environ[self.key] = token

Set environment variable for MetabRef database token.

Parameters
  • path (str): Path to token.
def get_token(self):
60    def get_token(self):
61        """
62        Get environment variable for database token.
63
64        Returns
65        -------
66        str
67            Token string.
68
69        """
70
71        # Check for token
72        if self.key not in os.environ:
73            raise ValueError("Must set {} environment variable.".format(self.key))
74
75        # Get token from environment variables
76        return os.environ.get(self.key)

Get environment variable for database token.

Returns
  • str: Token string.
def get_header(self):
78    def get_header(self):
79        """
80        Access stored database token and prepare as header.
81
82        Returns
83        -------
84        str
85            Header string.
86
87        """
88
89        # Get token
90        token = self.get_token()
91
92        # Pad header information
93        header = {"Authorization": f"Bearer {token}", "Content-Type": "text/plain"}
94
95        return header

Access stored database token and prepare as header.

Returns
  • str: Header string.
def get_query(self, url, use_header=True):
 97    def get_query(self, url, use_header=True):
 98        """
 99        Request payload from URL according to `get` protocol.
100
101        Parameters
102        ----------
103        url : str
104            URL for request.
105        use_header: bool
106            Whether or not the query should include the header
107
108        Returns
109        -------
110        dict
111            Response as JSON.
112
113        """
114
115        # Query URL via `get`
116        if use_header:
117            response = requests.get(url, headers=self.get_header())
118        else:
119            response = requests.get(url)
120
121        # Check response
122        response.raise_for_status()
123
124        # Return as JSON
125        return response.json()

Request payload from URL according to get protocol.

Parameters
  • url (str): URL for request.
  • use_header (bool): Whether or not the query should include the header
Returns
  • dict: Response as JSON.
def post_query(self, url, variable, values, tolerance):
127    def post_query(self, url, variable, values, tolerance):
128        """
129        Request payload from URL according to `post` protocol.
130
131        Parameters
132        ----------
133        url : str
134            URL for request.
135        variable : str
136            Variable to query.
137        values : str
138            Specific values of `variable` to query.
139        tolerance : str
140            Query tolerance relative to `values`.
141
142        Returns
143        -------
144        dict
145            Response as JSON.
146
147        """
148
149        # Coerce to string
150        if not isinstance(variable, str):
151            variable = str(variable).replace(" ", "")
152
153        if not isinstance(values, str):
154            values = str(values).replace(" ", "")
155
156        if not isinstance(tolerance, str):
157            tolerance = str(tolerance).replace(" ", "")
158
159        # Query URL via `post`
160        response = requests.post(
161            os.path.join(url, variable, tolerance),
162            data=values,
163            headers=self.get_header(),
164        )
165
166        # Check response
167        response.raise_for_status()
168
169        # Return as JSON
170        return response.json()

Request payload from URL according to post protocol.

Parameters
  • url (str): URL for request.
  • variable (str): Variable to query.
  • values (str): Specific values of variable to query.
  • tolerance (str): Query tolerance relative to values.
Returns
  • dict: Response as JSON.
@staticmethod
def normalize_peaks(arr):
265    @staticmethod
266    def normalize_peaks(arr):
267        """
268        Normalize peaks in an array.
269
270        Parameters
271        ----------
272        arr : :obj:`~numpy.array`
273            Array of shape (N, 2), with m/z in the first column and abundance in
274            the second.
275
276        Returns
277        -------
278        :obj:`~numpy.array`
279            Normalized array of shape (N, 2), with m/z in the first column and
280            normalized abundance in the second.
281        """
282        # Normalize the array
283        arr[:, -1] = arr[:, -1] / arr[:, -1].sum()
284
285        return arr

Normalize peaks in an array.

Parameters
  • arr (~numpy.array): Array of shape (N, 2), with m/z in the first column and abundance in the second.
Returns
  • ~numpy.array: Normalized array of shape (N, 2), with m/z in the first column and normalized abundance in the second.
class MetabRefInterface(SpectralDatabaseInterface):
    """
    Interface to the Metabolomics Reference Database.
    """

    def __init__(self):
        """
        Initialize instance.

        """

        # No token is required for MetabRef (see `get_query` note below)
        super().__init__(key=None)

    def spectrum_to_array(self, spectrum, normalize=True):
        """
        Convert MetabRef-formatted spectrum to array.

        Parameters
        ----------
        spectrum : str
            MetabRef spectrum, i.e. list of (m/z,abundance) pairs.
        normalize : bool
            Normalize the spectrum by its magnitude.

        Returns
        -------
        :obj:`~numpy.array`
            Array of shape (N, 2), with m/z in the first column and abundance in
            the second.

        """

        # Convert parenthesis-delimited string, e.g. "(mz1,ab1)(mz2,ab2)",
        # to a float array of (m/z, abundance) rows
        arr = np.array(
            re.findall(r"\(([^,]+),([^)]+)\)", spectrum), dtype=float
        ).reshape(-1, 2)

        if normalize:
            arr = self.normalize_peaks(arr)

        return arr

    def _to_flashentropy(self, metabref_lib, normalize=True, fe_kwargs=None):
        """
        Convert metabref-formatted library to FlashEntropy library.

        Parameters
        ----------
        metabref_lib : dict
            MetabRef MS2 library in JSON format or FlashEntropy search instance (for reformatting at different MS2 separation).
        normalize : bool
            Normalize each spectrum by its magnitude.
        fe_kwargs : dict, optional
            Keyword arguments for instantiation of FlashEntropy search and building index for FlashEntropy search;
            any keys not recognized will be ignored. By default None, in which
            case all parameters are set to their defaults.

        Returns
        -------
        :obj:`~ms_entropy.FlashEntropySearch`
            MS2 library as FlashEntropy search instance.

        Raises
        ------
        ValueError
            If "min_ms2_difference_in_da" or "max_ms2_tolerance_in_da" are present in `fe_kwargs` and they are not equal.

        Notes
        -----
        The spectrum dictionaries in `metabref_lib` are modified in place
        (keys renamed/added) while the index is built.

        """
        # Avoid a shared mutable default argument
        if fe_kwargs is None:
            fe_kwargs = {}

        self._check_flash_entropy_kwargs(fe_kwargs)

        # Initialize empty library
        fe_lib = []

        # Enumerate spectra
        for source in metabref_lib:
            # Reorganize source dict, if necessary
            if "spectrum_data" in source.keys():
                spectrum = source["spectrum_data"]
            else:
                spectrum = source

            # Rename precursor_mz key for FlashEntropy
            if "precursor_mz" not in spectrum.keys():
                spectrum["precursor_mz"] = spectrum.pop("precursor_ion")

            # Convert CoreMS spectrum to array and clean, store as `peaks`
            spectrum["peaks"] = self.spectrum_to_array(
                spectrum["mz"], normalize=normalize
            )

            # Add spectrum to library
            fe_lib.append(spectrum)

        # Build FlashEntropy index
        fe_search = self._build_flash_entropy_index(fe_lib, fe_kwargs=fe_kwargs)

        return fe_search

    def get_query(self, url, use_header=False):
        """Overrides the get_query method on the parent class to default to not use a header

        Notes
        -----
        As of January 2025, the metabref database no longer requires a token and therefore no header is needed

        """
        return super().get_query(url, use_header)

Interface to the Metabolomics Reference Database.

MetabRefInterface()
337    def __init__(self):
338        """
339        Initialize instance.
340
341        """
342
343        super().__init__(key=None)

Initialize instance.

def spectrum_to_array(self, spectrum, normalize=True):
345    def spectrum_to_array(self, spectrum, normalize=True):
346        """
347        Convert MetabRef-formatted spectrum to array.
348
349        Parameters
350        ----------
351        spectrum : str
352            MetabRef spectrum, i.e. list of (m/z,abundance) pairs.
353        normalize : bool
354            Normalize the spectrum by its magnitude.
355
356        Returns
357        -------
358        :obj:`~numpy.array`
359            Array of shape (N, 2), with m/z in the first column and abundance in
360            the second.
361
362        """
363
364        # Convert parenthesis-delimited string to array
365        arr = np.array(
366            re.findall(r"\(([^,]+),([^)]+)\)", spectrum), dtype=float
367        ).reshape(-1, 2)
368
369        if normalize:
370            arr = self.normalize_peaks(arr)
371
372        return arr

Convert MetabRef-formatted spectrum to array.

Parameters
  • spectrum (str): MetabRef spectrum, i.e. list of (m/z,abundance) pairs.
  • normalize (bool): Normalize the spectrum by its magnitude.
Returns
  • ~numpy.array: Array of shape (N, 2), with m/z in the first column and abundance in the second.
def get_query(self, url, use_header=False):
429    def get_query(self, url, use_header=False):
430        """Overwrites the get_query method on the parent class to default to not use a header
431
432        Notes
433        -----
434        As of January 2025, the metabref database no longer requires a token and therefore no header is needed
435
436        """
437        return super().get_query(url, use_header)

Overrides the `get_query` method on the parent class to default to not using a header.

Notes

As of January 2025, the MetabRef database no longer requires a token, and therefore no header is needed.

class MetabRefGCInterface(MetabRefInterface):
    """
    Interface to the Metabolomics Reference Database for GC-MS data.
    """

    def __init__(self):
        """
        Initialize instance.

        """

        super().__init__()
        # Endpoints for the full GC-MS library and the FAMEs subset
        self.GCMS_LIBRARY_URL = "https://metabref.emsl.pnnl.gov/api/mslevel/1"
        self.FAMES_URL = "https://metabref.emsl.pnnl.gov/api/fames"

        self.__init_format_map__()

    def __init_format_map__(self):
        """
        Initialize database format mapper, enabling multiple format requests.

        """

        # Define format workflows; every entry takes (library, normalize,
        # fe_kwargs) so all formats share one calling convention
        self.format_map = {
            "json": lambda x, normalize, fe_kwargs: x,
            "dict": lambda x, normalize, fe_kwargs: (
                self._to_LowResolutionEICompound_dict(x, normalize)
            ),
            "sql": lambda x, normalize, fe_kwargs: (
                self._LowResolutionEICompound_dict_to_sqlite(
                    self._to_LowResolutionEICompound_dict(x, normalize)
                )
            ),
        }

        # Add aliases
        self.format_map["metabref"] = self.format_map["json"]
        self.format_map["datadict"] = self.format_map["dict"]
        self.format_map["data-dict"] = self.format_map["dict"]
        self.format_map["lowreseicompound"] = self.format_map["dict"]
        self.format_map["lowres"] = self.format_map["dict"]
        self.format_map["lowresgc"] = self.format_map["dict"]
        self.format_map["sqlite"] = self.format_map["sql"]

    def available_formats(self):
        """
        View list of available formats.

        Returns
        -------
        list
            Format map keys.
        """

        return list(self.format_map.keys())

    def get_library(self, format="json", normalize=False):
        """
        Request MetabRef GC/MS library.

        Parameters
        ----------
        format : str
            Format of requested library, i.e. "json", "sql", "flashentropy".
            See `available_formats` method for aliases.
        normalize : bool
            Normalize the spectrum by its magnitude.

        Returns
        -------
        Library in requested format.

        """

        # Init format function
        format_func = self._get_format_func(format)

        return format_func(
            self.get_query(self.GCMS_LIBRARY_URL)["GC-MS"], normalize, {}
        )

    def get_fames(self, format="json", normalize=False):
        """
        Request MetabRef GC/MS FAMEs library.

        Parameters
        ----------
        format : str
            Format of requested library, i.e. "json", "sql", "flashentropy".
            See `available_formats` method for aliases.
        normalize : bool
            Normalize the spectrum by its magnitude.

        Returns
        -------
        Library in requested format.

        """

        # Init format function
        format_func = self._get_format_func(format)

        return format_func(self.get_query(self.FAMES_URL)["GC-MS"], normalize, {})

    def _to_LowResolutionEICompound_dict(self, metabref_lib, normalize=False):
        """
        Convert MetabRef-formatted library to CoreMS LowResolutionEICompound-formatted
        dictionary for local ingestion.

        Parameters
        ----------
        metabref_lib : dict
            MetabRef GC-MS library in JSON format.
        normalize : bool
            Normalize each spectrum by its magnitude.

        Returns
        -------
        list of dict
            List of each spectrum contained in dictionary.

        """

        # All below key:value lookups are based on CoreMS class definitions
        # NOT MetabRef content. For example, MetabRef has keys for PubChem,
        # USI, etc. that are not considered below.

        # Dictionary to map metabref keys to corems keys
        metadatar_cols = {
            "casno": "cas",
            "inchikey": "inchikey",
            "inchi": "inchi",
            "chebi": "chebi",
            "smiles": "smiles",
            "kegg": "kegg",
            "iupac_name": "iupac_name",
            "traditional_name": "traditional_name",  # Not present in metabref
            "common_name": "common_name",  # Not present in metabref
        }

        # Dictionary to map metabref keys to corems keys
        lowres_ei_compound_cols = {
            "id": "metabref_id",
            "molecule_name": "name",  # Is this correct?
            "classify": "classify",  # Not present in metabref
            "formula": "formula",
            "ri": "ri",
            "rt": "retention_time",
            "source": "source",  # Not present in metabref
            "casno": "casno",
            "comments": "comment",
            "source_temp_c": "source_temp_c",  # Not present in metabref
            "ev": "ev",  # Not present in metabref
            "peak_count": "peaks_count",
            "mz": "mz",
            "abundance": "abundance",
        }

        # Local result container
        corems_lib = []

        # Enumerate spectra
        for source_ in metabref_lib:
            # Copy source to prevent modification
            source = source_.copy()

            # Flatten source dict: merge "spectrum_data" into the top level,
            # with top-level keys taking precedence on conflict
            source = source.pop("spectrum_data") | source

            # Parse target data
            target = {
                lowres_ei_compound_cols[k]: v
                for k, v in source.items()
                if k in lowres_ei_compound_cols
            }

            # Explicitly add this to connect with LowResCompoundRef later
            target["rt"] = source["rt"]

            # Parse (mz, abundance)
            arr = self.spectrum_to_array(target["mz"], normalize=normalize)
            target["mz"] = arr[:, 0]
            target["abundance"] = arr[:, 1]

            # Parse meta data
            target["metadata"] = {
                metadatar_cols[k]: v for k, v in source.items() if k in metadatar_cols
            }

            # Add anything else
            for k in source:
                if k not in lowres_ei_compound_cols:
                    target[k] = source[k]

            # Add to CoreMS list
            corems_lib.append(target)

        return corems_lib

    def _LowResolutionEICompound_dict_to_sqlite(
        self, lowres_ei_compound_dict, url="sqlite://"
    ):
        """
        Convert CoreMS LowResolutionEICompound-formatted dictionary to SQLite
        database for local ingestion.

        Parameters
        ----------
        lowres_ei_compound_dict : dict
            CoreMS GC-MS library formatted for LowResolutionEICompound.
        url : str
            URL to SQLite prefix.

        Returns
        -------
        sqlite database
            Spectra contained in SQLite database.

        """

        # Dictionary to map corems keys to all-caps keys
        capped_cols = {
            "name": "NAME",
            "formula": "FORM",
            "ri": "RI",
            "retention_time": "RT",
            "source": "SOURCE",
            "casno": "CASNO",
            "comment": "COMMENT",
            "peaks_count": "NUM PEAKS",
        }

        # Initialize SQLite object
        sqlite_obj = EI_LowRes_SQLite(url=url)

        # Iterate spectra
        for _data_dict in lowres_ei_compound_dict:
            # Copy source to prevent modification
            data_dict = _data_dict.copy()

            # Add missing capped values
            for k, v in capped_cols.items():
                # Key exists; keep both the original and the capped key
                if k in data_dict:
                    data_dict[v] = data_dict[k]

            # Parse number of peaks
            if not data_dict.get("NUM PEAKS"):
                data_dict["NUM PEAKS"] = len(data_dict.get("mz"))

            # Parse CAS number; fall back to "CAS", then to 0
            if not data_dict.get("CASNO"):
                data_dict["CASNO"] = data_dict.get("CAS")

            if not data_dict["CASNO"]:
                data_dict["CASNO"] = 0

            # Build linked metadata table
            if "metadata" in data_dict:
                if len(data_dict["metadata"]) > 0:
                    data_dict["metadatar"] = Metadatar(**data_dict.pop("metadata"))
                else:
                    data_dict.pop("metadata")

            # Attempt addition to sqlite; report (but do not re-raise) so
            # one bad record does not abort the whole library load. The
            # previous bare `except:` also swallowed KeyboardInterrupt and
            # SystemExit.
            try:
                sqlite_obj.add_compound(data_dict)
            except Exception:
                print(data_dict["NAME"])

        return sqlite_obj

Interface to the Metabolomics Reference Database for GC-MS data.

MetabRefGCInterface()
445    def __init__(self):
446        """
447        Initialize instance.
448
449        """
450
451        super().__init__()
452        self.GCMS_LIBRARY_URL = "https://metabref.emsl.pnnl.gov/api/mslevel/1"
453        self.FAMES_URL = "https://metabref.emsl.pnnl.gov/api/fames"
454
455        self.__init_format_map__()

Initialize instance.

GCMS_LIBRARY_URL
FAMES_URL
def available_formats(self):
485    def available_formats(self):
486        """
487        View list of available formats.
488
489        Returns
490        -------
491        list
492            Format map keys.
493        """
494
495        return list(self.format_map.keys())

View list of available formats.

Returns
  • list: Format map keys.
def get_library(self, format='json', normalize=False):
497    def get_library(self, format="json", normalize=False):
498        """
499        Request MetabRef GC/MS library.
500
501        Parameters
502        ----------
503        format : str
504            Format of requested library, i.e. "json", "sql", "flashentropy".
505            See `available_formats` method for aliases.
506        normalize : bool
507            Normalize the spectrum by its magnitude.
508
509        Returns
510        -------
511        Library in requested format.
512
513        """
514
515        # Init format function
516        format_func = self._get_format_func(format)
517
518        return format_func(
519            self.get_query(self.GCMS_LIBRARY_URL)["GC-MS"], normalize, {}
520        )

Request MetabRef GC/MS library.

Parameters
  • format (str): Format of requested library, i.e. "json", "sql", "flashentropy". See available_formats method for aliases.
  • normalize (bool): Normalize the spectrum by its magnitude.
Returns
  • Library in requested format.
def get_fames(self, format='json', normalize=False):
522    def get_fames(self, format="json", normalize=False):
523        """
524        Request MetabRef GC/MS FAMEs library.
525
526        Parameters
527        ----------
528        format : str
529            Format of requested library, i.e. "json", "sql", "flashentropy".
530            See `available_formats` method for aliases.
531        normalize : bool
532            Normalize the spectrum by its magnitude.
533
534        Returns
535        -------
536        Library in requested format.
537
538        """
539
540        # Init format function
541        format_func = self._get_format_func(format)
542
543        return format_func(self.get_query(self.FAMES_URL)["GC-MS"], normalize, {})

Request MetabRef GC/MS FAMEs library.

Parameters
  • format (str): Format of requested library, i.e. "json", "sql", "flashentropy". See available_formats method for aliases.
  • normalize (bool): Normalize the spectrum by its magnitude.
Returns
  • Library in requested format.
class MetabRefLCInterface(MetabRefInterface):
    """
    Interface to the Metabolomics Reference Database for LC-MS data.
    """

    def __init__(self):
        """
        Initialize instance.

        """

        super().__init__()

        # API endpoint for precursor m/z search
        # inputs = mz, tolerance (in Da), polarity, page_no, per_page
        self.PRECURSOR_MZ_URL = "https://metabref.emsl.pnnl.gov/api/precursors/m/{}/t/{}/{}?page={}&per_page={}"

        # API endpoint for returning full list of precursor m/z values in database
        # inputs = polarity, page_no, per_page
        self.PRECURSOR_MZ_ALL_URL = (
            "https://metabref.emsl.pnnl.gov/api/precursors/{}?page={}&per_page={}"
        )

        self.__init_format_map__()

    def __init_format_map__(self):
        """
        Initialize database format mapper, enabling multiple format requests.

        """

        # Define format workflows; every entry takes (library, normalize,
        # fe_kwargs) so all formats share one calling convention
        self.format_map = {
            "json": lambda x, normalize, fe_kwargs: x,
            "flashentropy": lambda x, normalize, fe_kwargs: self._to_flashentropy(
                x, normalize, fe_kwargs
            ),
        }

        # Add aliases
        self.format_map["metabref"] = self.format_map["json"]
        self.format_map["fe"] = self.format_map["flashentropy"]
        self.format_map["flash-entropy"] = self.format_map["flashentropy"]

    def query_by_precursor(
        self, mz_list, polarity, mz_tol_ppm, mz_tol_da_api=0.2, max_per_page=50
    ):
        """
        Query MetabRef by precursor m/z values.

        Parameters
        ----------
        mz_list : list
            List of precursor m/z values.
        polarity : str
            Ionization polarity, either "positive" or "negative".
        mz_tol_ppm : float
            Tolerance in ppm for each precursor m/z value.
            Used for retrieving from a potential match from database.
        mz_tol_da_api : float, optional
            Maximum tolerance between precursor m/z values for API search, in daltons.
            Used to group similar mzs into a single API query for speed. Default is 0.2.
        max_per_page : int, optional
            Maximum records to return from MetabRef API query at a time.  Default is 50.

        Returns
        -------
        list
            List of library entries in original JSON format.

        Raises
        ------
        ValueError
            If `polarity` is not "positive" or "negative".
        """

        # If polarity is anything other than positive or negative, raise error
        if polarity not in ["positive", "negative"]:
            raise ValueError("Polarity must be 'positive' or 'negative'")

        # Cluster groups of mz according to mz_tol_da_api for precursor query;
        # each new mz joins the current group if it is within tolerance of the
        # group's first (smallest) member
        mz_list.sort()
        mz_groups = [[mz_list[0]]]
        for x in mz_list[1:]:
            if abs(x - mz_groups[-1][0]) <= mz_tol_da_api:
                mz_groups[-1].append(x)
            else:
                mz_groups.append([x])

        # Query MetabRef for each mz group
        lib = []
        for mz_group in mz_groups:
            if len(mz_group) == 1:
                mz = mz_group[0]
                tol = mz_tol_ppm * 10**-6 * mz
            else:
                # Search at the midpoint of the group, with a tolerance wide
                # enough to cover half the group span plus the ppm tolerance
                # at the largest m/z. Bug fix: this previously computed
                # `mz_tol_ppm**-6` (ppm raised to the -6 power) instead of
                # `mz_tol_ppm * 10**-6`.
                half_span = (max(mz_group) - min(mz_group)) / 2
                mz = min(mz_group) + half_span
                tol = half_span + mz_tol_ppm * 10**-6 * max(mz_group)

            # Get first page of results
            response = self.get_query(
                self.PRECURSOR_MZ_URL.format(
                    str(mz), str(tol), polarity, 1, max_per_page
                )
            )
            lib = lib + response["results"]

            # If there are more pages of results, get them
            # (range is empty when total_pages <= 1)
            for page in range(2, response["total_pages"] + 1):
                lib = (
                    lib
                    + self.get_query(
                        self.PRECURSOR_MZ_URL.format(
                            str(mz), str(tol), polarity, page, max_per_page
                        )
                    )["results"]
                )

        return lib

    def request_all_precursors(self, polarity, per_page=50000):
        """
        Request all precursor m/z values for MS2 spectra from MetabRef.

        Parameters
        ----------
        polarity : str
            Ionization polarity, either "positive" or "negative".
        per_page : int, optional
            Number of records to fetch per call. Default is 50000

        Returns
        -------
        list
            List of all precursor m/z values, sorted and deduplicated.

        Raises
        ------
        ValueError
            If `polarity` is not "positive" or "negative".
        """
        # If polarity is anything other than positive or negative, raise error
        if polarity not in ["positive", "negative"]:
            raise ValueError("Polarity must be 'positive' or 'negative'")

        precursors = []

        # Get first page of results and total number of pages of results
        response = self.get_query(
            self.PRECURSOR_MZ_ALL_URL.format(polarity, str(1), str(per_page))
        )
        total_pages = response["total_pages"]
        precursors.extend([x["precursor_ion"] for x in response["results"]])

        # Go through remaining pages of results
        for page in range(2, total_pages + 1):
            response = self.get_query(
                self.PRECURSOR_MZ_ALL_URL.format(polarity, str(page), str(per_page))
            )
            precursors.extend([x["precursor_ion"] for x in response["results"]])

        # Sort precursors from smallest to largest and remove duplicates
        precursors = list(set(precursors))
        precursors.sort()

        return precursors

    def get_lipid_library(
        self,
        mz_list,
        polarity,
        mz_tol_ppm,
        mz_tol_da_api=0.2,
        format="json",
        normalize=True,
        fe_kwargs=None,
    ):
        """
        Request MetabRef lipid library.

        Parameters
        ----------
        mz_list : list
            List of precursor m/z values.
        polarity : str
            Ionization polarity, either "positive" or "negative".
        mz_tol_ppm : float
            Tolerance in ppm for each precursor m/z value.
            Used for retrieving from a potential match from database.
        mz_tol_da_api : float, optional
            Maximum tolerance between precursor m/z values for API search, in daltons.
            Used to group similar mzs into a single API query for speed. Default is 0.2.
        format : str, optional
            Format of requested library, i.e. "json", "sql", "flashentropy".
            See `available_formats` method for aliases. Default is "json".
        normalize : bool, optional
            Normalize the spectrum by its magnitude. Default is True.
        fe_kwargs : dict, optional
            Keyword arguments for FlashEntropy search. Default is None
            (treated as an empty dict).

        Returns
        -------
        tuple
            Library in requested format and lipid metadata as a LipidMetadata dataclass.

        """
        # Avoid a shared mutable default argument
        if fe_kwargs is None:
            fe_kwargs = {}

        mz_list.sort()
        mz_list = np.array(mz_list)

        # Get all precursors in the library matching the polarity
        precursors_in_lib = np.array(self.request_all_precursors(polarity=polarity))

        # Compare the mz_list with the precursors in the library, keep any mzs that are within mz_tol of any precursor in the library
        lib_mz_df = pd.DataFrame(precursors_in_lib, columns=["lib_mz"])
        lib_mz_df["closest_obs_mz"] = mz_list[
            find_closest(mz_list, lib_mz_df.lib_mz.values)
        ]
        lib_mz_df["mz_diff_ppm"] = np.abs(
            (lib_mz_df["lib_mz"] - lib_mz_df["closest_obs_mz"])
            / lib_mz_df["lib_mz"]
            * 1e6
        )
        lib_mz_sub = lib_mz_df[lib_mz_df["mz_diff_ppm"] <= mz_tol_ppm]

        # Do the same in the opposite direction
        mz_df = pd.DataFrame(mz_list, columns=["mass_feature_mz"])
        mz_df["closest_lib_pre_mz"] = precursors_in_lib[
            find_closest(precursors_in_lib, mz_df.mass_feature_mz.values)
        ]
        mz_df["mz_diff_ppm"] = np.abs(
            (mz_df["mass_feature_mz"] - mz_df["closest_lib_pre_mz"])
            / mz_df["mass_feature_mz"]
            * 1e6
        )
        mz_df_sub = mz_df[mz_df["mz_diff_ppm"] <= mz_tol_ppm]

        # Evaluate which is fewer mzs - lib_mz_sub or mz_df_sub and use that as the input for next step
        if len(lib_mz_sub) < len(mz_df_sub):
            mzs_to_query = lib_mz_sub.lib_mz.values
        else:
            mzs_to_query = mz_df_sub.mass_feature_mz.values

        # Query the library for the precursors in the mz_list that are in the library to retrieve the spectra and metadata
        lib = self.query_by_precursor(
            mz_list=mzs_to_query,
            polarity=polarity,
            mz_tol_ppm=mz_tol_ppm,
            mz_tol_da_api=mz_tol_da_api,
        )

        # Pull out lipid metadata from the metabref library and convert to LipidMetadata dataclass
        mol_data_dict = {x["id"]: x["Molecular Data"] for x in lib}
        lipid_lib = {x["id"]: x["Lipid Tree"] for x in lib if "Lipid Tree" in x.keys()}
        mol_data_dict = {k: {**v, **lipid_lib[k]} for k, v in mol_data_dict.items()}
        mol_data_dict = {
            k: self._dict_to_dataclass(v, LipidMetadata)
            for k, v in mol_data_dict.items()
        }

        # Remove lipid metadata from the metabref library
        lib = [
            {k: v for k, v in x.items() if k not in ["Molecular Data", "Lipid Tree"]}
            for x in lib
        ]
        # Unpack the 'Lipid Fragments' key and the 'MSO Data" key from each entry
        for x in lib:
            if "Lipid Fragments" in x.keys():
                x.update(x.pop("Lipid Fragments"))
            if "MSO Data" in x.keys():
                x.update(x.pop("MSO Data"))

        # Format the spectral library
        format_func = self._get_format_func(format)
        lib = format_func(lib, normalize=normalize, fe_kwargs=fe_kwargs)
        return (lib, mol_data_dict)

Interface to the Metabolomics Reference Database for LC-MS data.

MetabRefLCInterface()
723    def __init__(self):
724        """
725        Initialize instance.
726
727        """
728
729        super().__init__()
730
731        # API endpoint for precursor m/z search
732        # inputs = mz, tolerance (in Da), polarity, page_no, per_page
733        self.PRECURSOR_MZ_URL = "https://metabref.emsl.pnnl.gov/api/precursors/m/{}/t/{}/{}?page={}&per_page={}"
734
735        # API endpoint for returning full list of precursor m/z values in database
736        # inputs = polarity, page_no, per_page
737        self.PRECURSOR_MZ_ALL_URL = (
738            "https://metabref.emsl.pnnl.gov/api/precursors/{}?page={}&per_page={}"
739        )
740
741        self.__init_format_map__()

Initialize instance.

PRECURSOR_MZ_URL
PRECURSOR_MZ_ALL_URL
def query_by_precursor( self, mz_list, polarity, mz_tol_ppm, mz_tol_da_api=0.2, max_per_page=50):
762    def query_by_precursor(
763        self, mz_list, polarity, mz_tol_ppm, mz_tol_da_api=0.2, max_per_page=50
764    ):
765        """
766        Query MetabRef by precursor m/z values.
767
768        Parameters
769        ----------
770        mz_list : list
771            List of precursor m/z values.
772        polarity : str
773            Ionization polarity, either "positive" or "negative".
774        mz_tol_ppm : float
775            Tolerance in ppm for each precursor m/z value.
776            Used for retrieving from a potential match from database.
777        mz_tol_da_api : float, optional
778            Maximum tolerance between precursor m/z values for API search, in daltons.
779            Used to group similar mzs into a single API query for speed. Default is 0.2.
780        max_per_page : int, optional
781            Maximum records to return from MetabRef API query at a time.  Default is 50.
782
783        Returns
784        -------
785        list
786            List of library entries in original JSON format.
787        """
788
789        # If polarity is anything other than positive or negative, raise error
790        if polarity not in ["positive", "negative"]:
791            raise ValueError("Polarity must be 'positive' or 'negative'")
792
793        # Cluster groups of mz according to mz_tol_da_api for precursor query
794        mz_list.sort()
795        mz_groups = [[mz_list[0]]]
796        for x in mz_list[1:]:
797            if abs(x - mz_groups[-1][0]) <= mz_tol_da_api:
798                mz_groups[-1].append(x)
799            else:
800                mz_groups.append([x])
801
802        # Query MetabRef for each mz group
803        lib = []
804        for mz_group in mz_groups:
805            mz = np.mean(mz_group)
806            if len(mz_group) == 1:
807                mz = mz_group[0]
808                tol = mz_tol_ppm * 10**-6 * mz
809            else:
810                mz = (max(mz_group) - min(mz_group)) / 2 + min(mz_group)
811                tol = (max(mz_group) - min(mz_group)) / 2 + mz_tol_ppm**-6 * max(
812                    mz_group
813                )
814
815            # Get first page of results
816            response = self.get_query(
817                self.PRECURSOR_MZ_URL.format(
818                    str(mz), str(tol), polarity, 1, max_per_page
819                )
820            )
821            lib = lib + response["results"]
822
823            # If there are more pages of results, get them
824            if response["total_pages"] > 1:
825                for i in np.arange(2, response["total_pages"] + 1):
826                    lib = (
827                        lib
828                        + self.get_query(
829                            self.PRECURSOR_MZ_URL.format(
830                                str(mz), str(tol), polarity, i, max_per_page
831                            )
832                        )["results"]
833                    )
834
835        return lib

Query MetabRef by precursor m/z values.

Parameters
  • mz_list (list): List of precursor m/z values.
  • polarity (str): Ionization polarity, either "positive" or "negative".
  • mz_tol_ppm (float): Tolerance in ppm for each precursor m/z value. Used to retrieve potential matches from the database.
  • mz_tol_da_api (float, optional): Maximum tolerance between precursor m/z values for API search, in daltons. Used to group similar mzs into a single API query for speed. Default is 0.2.
  • max_per_page (int, optional): Maximum records to return from MetabRef API query at a time. Default is 50.
Returns
  • list: List of library entries in original JSON format.
def request_all_precursors(self, polarity, per_page=50000):
837    def request_all_precursors(self, polarity, per_page=50000):
838        """
839        Request all precursor m/z values for MS2 spectra from MetabRef.
840
841        Parameters
842        ----------
843        polarity : str
844            Ionization polarity, either "positive" or "negative".
845        per_page : int, optional
846            Number of records to fetch per call. Default is 50000
847
848        Returns
849        -------
850        list
851            List of all precursor m/z values, sorted.
852        """
853        # If polarity is anything other than positive or negative, raise error
854        if polarity not in ["positive", "negative"]:
855            raise ValueError("Polarity must be 'positive' or 'negative'")
856
857        precursors = []
858
859        # Get first page of results and total number of pages of results
860        response = self.get_query(
861            self.PRECURSOR_MZ_ALL_URL.format(polarity, str(1), str(per_page))
862        )
863        total_pages = response["total_pages"]
864        precursors.extend([x["precursor_ion"] for x in response["results"]])
865
866        # Go through remaining pages of results
867        for i in np.arange(2, total_pages + 1):
868            response = self.get_query(
869                self.PRECURSOR_MZ_ALL_URL.format(polarity, str(i), str(per_page))
870            )
871            precursors.extend([x["precursor_ion"] for x in response["results"]])
872
873        # Sort precursors from smallest to largest and remove duplicates
874        precursors = list(set(precursors))
875        precursors.sort()
876
877        return precursors

Request all precursor m/z values for MS2 spectra from MetabRef.

Parameters
  • polarity (str): Ionization polarity, either "positive" or "negative".
  • per_page (int, optional): Number of records to fetch per call. Default is 50000
Returns
  • list: List of all precursor m/z values, sorted.
def get_lipid_library( self, mz_list, polarity, mz_tol_ppm, mz_tol_da_api=0.2, format='json', normalize=True, fe_kwargs={}):
879    def get_lipid_library(
880        self,
881        mz_list,
882        polarity,
883        mz_tol_ppm,
884        mz_tol_da_api=0.2,
885        format="json",
886        normalize=True,
887        fe_kwargs={},
888    ):
889        """
890        Request MetabRef lipid library.
891
892        Parameters
893        ----------
894        mz_list : list
895            List of precursor m/z values.
896        polarity : str
897            Ionization polarity, either "positive" or "negative".
898        mz_tol_ppm : float
899            Tolerance in ppm for each precursor m/z value.
900            Used for retrieving from a potential match from database.
901        mz_tol_da_api : float, optional
902            Maximum tolerance between precursor m/z values for API search, in daltons.
903            Used to group similar mzs into a single API query for speed. Default is 0.2.
904        format : str, optional
905            Format of requested library, i.e. "json", "sql", "flashentropy".
906            See `available_formats` method for aliases. Default is "json".
907        normalize : bool, optional
908            Normalize the spectrum by its magnitude. Default is True.
909        fe_kwargs : dict, optional
910            Keyword arguments for FlashEntropy search. Default is {}.
911
912        Returns
913        -------
914        tuple
915            Library in requested format and lipid metadata as a LipidMetadata dataclass.
916
917        """
918        mz_list.sort()
919        mz_list = np.array(mz_list)
920
921        # Get all precursors in the library matching the polarity
922        precusors_in_lib = self.request_all_precursors(polarity=polarity)
923        precusors_in_lib = np.array(precusors_in_lib)
924
925        # Compare the mz_list with the precursors in the library, keep any mzs that are within mz_tol of any precursor in the library
926        lib_mz_df = pd.DataFrame(precusors_in_lib, columns=["lib_mz"])
927        lib_mz_df["closest_obs_mz"] = mz_list[
928            find_closest(mz_list, lib_mz_df.lib_mz.values)
929        ]
930        lib_mz_df["mz_diff_ppm"] = np.abs(
931            (lib_mz_df["lib_mz"] - lib_mz_df["closest_obs_mz"])
932            / lib_mz_df["lib_mz"]
933            * 1e6
934        )
935        lib_mz_sub = lib_mz_df[lib_mz_df["mz_diff_ppm"] <= mz_tol_ppm]
936
937        # Do the same in the opposite direction
938        mz_df = pd.DataFrame(mz_list, columns=["mass_feature_mz"])
939        mz_df["closest_lib_pre_mz"] = precusors_in_lib[
940            find_closest(precusors_in_lib, mz_df.mass_feature_mz.values)
941        ]
942        mz_df["mz_diff_ppm"] = np.abs(
943            (mz_df["mass_feature_mz"] - mz_df["closest_lib_pre_mz"])
944            / mz_df["mass_feature_mz"]
945            * 1e6
946        )
947        mz_df_sub = mz_df[mz_df["mz_diff_ppm"] <= mz_tol_ppm]
948
949        # Evaluate which is fewer mzs - lib_mz_sub or mz_df_sub and use that as the input for next step
950        if len(lib_mz_sub) < len(mz_df_sub):
951            mzs_to_query = lib_mz_sub.lib_mz.values
952        else:
953            mzs_to_query = mz_df_sub.mass_feature_mz.values
954
955        # Query the library for the precursors in the mz_list that are in the library to retrieve the spectra and metadata
956        lib = self.query_by_precursor(
957            mz_list=mzs_to_query,
958            polarity=polarity,
959            mz_tol_ppm=mz_tol_ppm,
960            mz_tol_da_api=mz_tol_da_api,
961        )
962
963        # Pull out lipid metadata from the metabref library and convert to LipidMetadata dataclass
964        mol_data_dict = {x["id"]: x["Molecular Data"] for x in lib}
965        lipid_lib = {x["id"]: x["Lipid Tree"] for x in lib if "Lipid Tree" in x.keys()}
966        mol_data_dict = {k: {**v, **lipid_lib[k]} for k, v in mol_data_dict.items()}
967        mol_data_dict = {
968            k: self._dict_to_dataclass(v, LipidMetadata)
969            for k, v in mol_data_dict.items()
970        }
971
972        # Remove lipid metadata from the metabref library
973        lib = [
974            {k: v for k, v in x.items() if k not in ["Molecular Data", "Lipid Tree"]}
975            for x in lib
976        ]
977        # Unpack the 'Lipid Fragments' key and the 'MSO Data" key from each entry
978        for x in lib:
979            if "Lipid Fragments" in x.keys():
980                x.update(x.pop("Lipid Fragments"))
981            if "MSO Data" in x.keys():
982                x.update(x.pop("MSO Data"))
983
984        # Format the spectral library
985        format_func = self._get_format_func(format)
986        lib = format_func(lib, normalize=normalize, fe_kwargs=fe_kwargs)
987        return (lib, mol_data_dict)

Request MetabRef lipid library.

Parameters
  • mz_list (list): List of precursor m/z values.
  • polarity (str): Ionization polarity, either "positive" or "negative".
  • mz_tol_ppm (float): Tolerance in ppm for each precursor m/z value. Used to retrieve potential matches from the database.
  • mz_tol_da_api (float, optional): Maximum tolerance between precursor m/z values for API search, in daltons. Used to group similar mzs into a single API query for speed. Default is 0.2.
  • format (str, optional): Format of requested library, i.e. "json", "sql", "flashentropy". See available_formats method for aliases. Default is "json".
  • normalize (bool, optional): Normalize the spectrum by its magnitude. Default is True.
  • fe_kwargs (dict, optional): Keyword arguments for FlashEntropy search. Default is {}.
Returns
  • tuple: Library in requested format and lipid metadata as a LipidMetadata dataclass.
class MSPInterface(SpectralDatabaseInterface):
 990class MSPInterface(SpectralDatabaseInterface):
 991    """
 992    Interface to parse NIST MSP files
 993    """
 994
 995    def __init__(self, file_path):
 996        """
 997        Initialize instance.
 998
 999        Parameters
1000        ----------
1001        file_path : str
1002            Path to a local MSP file.
1003
1004        Attributes
1005        ----------
1006        file_path : str
1007            Path to the MSP file.
1008        _file_content : str
1009            Content of the MSP file.
1010        _data_frame : :obj:`~pandas.DataFrame`
1011            DataFrame of spectra from the MSP file with unaltered content.
1012        """
1013        super().__init__(key=None)
1014
1015        self.file_path = file_path
1016        if not os.path.exists(self.file_path):
1017            raise FileNotFoundError(
1018                f"File {self.file_path} does not exist. Please check the file path."
1019            )
1020        with open(self.file_path, "r") as f:
1021            self._file_content = f.read()
1022
1023        self._data_frame = self._read_msp_file()
1024        self.__init_format_map__()
1025
1026    def __init_format_map__(self):
1027        """
1028        Initialize database format mapper, enabling multiple format requests.
1029
1030        """
1031
1032        # x is a pandas dataframe similar to self._data_frame format
1033        # Define format workflows
1034        self.format_map = {
1035            "msp": lambda x, normalize, fe_kwargs: self._to_msp(x, normalize),
1036            "flashentropy": lambda x, normalize, fe_kwargs: self._to_flashentropy(
1037                x, normalize, fe_kwargs
1038            ),
1039            "df": lambda x, normalize, fe_kwargs: self._to_df(x, normalize),
1040        }
1041
1042        # Add aliases
1043        self.format_map["fe"] = self.format_map["flashentropy"]
1044        self.format_map["flash-entropy"] = self.format_map["flashentropy"]
1045        self.format_map["dataframe"] = self.format_map["df"]
1046        self.format_map["data-frame"] = self.format_map["df"]
1047
1048    def _read_msp_file(self):
1049        """
1050        Reads the MSP files into the pandas dataframe, and sort/remove zero intensity ions in MS/MS spectra.
1051
1052        Returns
1053        -------
1054        :obj:`~pandas.DataFrame`
1055            DataFrame of spectra from the MSP file, exacly as it is in the file (no sorting, filtering etc)
1056        """
1057        # If input_dataframe is provided, return it it
1058        spectra = []
1059        spectrum = {}
1060
1061        f = StringIO(self._file_content)
1062        for line in f:
1063            line = line.strip()
1064            if not line:
1065                continue  # Skip empty lines
1066
1067            # Handle metadata
1068            if ":" in line:
1069                key, value = line.split(":", 1)
1070                key = key.strip().lower()
1071                value = value.strip()
1072
1073                if key == "name":
1074                    # Save current spectrum and start a new one
1075                    if spectrum:
1076                        spectra.append(spectrum)
1077                    spectrum = {"name": value, "peaks": []}
1078                else:
1079                    spectrum[key] = value
1080
1081            # Handle peak data (assumed to start with a number)
1082            elif line[0].isdigit():
1083                peaks = line.split()
1084                m_z = float(peaks[0])
1085                intensity = float(peaks[1])
1086                spectrum["peaks"].append(([m_z, intensity]))
1087        # Save the last spectrum
1088        if spectrum:
1089            spectra.append(spectrum)
1090
1091        df = pd.DataFrame(spectra)
1092        for column in df.columns:
1093            if column != "peaks":  # Skip 'peaks' column
1094                try:
1095                    df[column] = pd.to_numeric(df[column], errors="raise")
1096                except:
1097                    pass
1098        return df
1099
1100    def _to_df(self, input_dataframe, normalize=True):
1101        """
1102        Convert MSP-derived library to FlashEntropy library. 
1103
1104        Parameters
1105        ----------
1106        input_dataframe : :obj:`~pandas.DataFrame`
1107            Input DataFrame containing MSP-formatted spectra.
1108        normalize : bool, optional
1109            Normalize each spectrum by its magnitude.
1110            Default is True.
1111
1112        Returns
1113        -------
1114        :obj:`~pandas.DataFrame`
1115            DataFrame of with desired normalization
1116        """
1117        if not normalize:
1118            return input_dataframe
1119        else:
1120            # Convert to dictionary
1121            db_dict = input_dataframe.to_dict(orient="records")
1122
1123            # Initialize empty library
1124            lib = []
1125
1126            # Enumerate spectra
1127            for i, source in enumerate(db_dict):
1128                spectrum = source
1129                # Check that spectrum["peaks"] exists
1130                if "peaks" not in spectrum.keys():
1131                    raise KeyError(
1132                        "MSP not interpretted correctly, 'peaks' key not found in spectrum, check _dataframe attribute."
1133                    )
1134
1135                # Convert spectrum["peaks"] to numpy array
1136                if not isinstance(spectrum["peaks"], np.ndarray):
1137                    spectrum["peaks"] = np.array(spectrum["peaks"])
1138
1139                # Normalize peaks, if requested
1140                if normalize:
1141                    spectrum["peaks"] = self.normalize_peaks(spectrum["peaks"])
1142                    spectrum["num peaks"] = len(spectrum["peaks"])
1143
1144                # Add spectrum to library
1145                lib.append(spectrum)
1146            
1147            # Convert to DataFrame
1148            df = pd.DataFrame(lib)
1149            return df
1150    
1151    def _to_flashentropy(self, input_dataframe, normalize=True, fe_kwargs={}):
1152        """
1153        Convert MSP-derived library to FlashEntropy library.
1154
1155        Parameters
1156        ----------
1157        input_dataframe : :obj:`~pandas.DataFrame`
1158            Input DataFrame containing MSP-formatted spectra.
1159        normalize : bool
1160            Normalize each spectrum by its magnitude.
1161        fe_kwargs : dict, optional
1162            Keyword arguments for instantiation of FlashEntropy search and building index for FlashEntropy search;
1163            any keys not recognized will be ignored. By default, all parameters set to defaults.
1164
1165        Returns
1166        -------
1167        :obj:`~ms_entropy.FlashEntropySearch`
1168            MS2 library as FlashEntropy search instance.
1169
1170        Raises
1171        ------
1172        ValueError
1173            If "min_ms2_difference_in_da" or "max_ms2_tolerance_in_da" are present in `fe_kwargs` and they
1174        """
1175        self._check_flash_entropy_kwargs(fe_kwargs)
1176
1177        db_df = input_dataframe
1178
1179        # Convert to dictionary
1180        db_dict = db_df.to_dict(orient="records")
1181
1182        # Initialize empty library
1183        fe_lib = []
1184
1185        # Enumerate spectra
1186        for i, source in enumerate(db_dict):
1187            # Reorganize source dict, if necessary
1188            if "spectrum_data" in source.keys():
1189                spectrum = source["spectrum_data"]
1190            else:
1191                spectrum = source
1192
1193            # Rename precursor_mz key for FlashEntropy
1194            if "precursor_mz" not in spectrum.keys():
1195                if "precursormz" in spectrum:
1196                    spectrum["precursor_mz"] = spectrum.pop("precursormz")
1197                elif "precursor_ion" in spectrum:
1198                    spectrum["precursor_mz"] = spectrum.pop("precursor_ion")
1199                else:
1200                    raise KeyError(
1201                        "MSP must have either 'precursormz' or 'precursor_ion' key to be converted to FlashEntropy format."
1202                    )
1203
1204            # Check that spectrum["peaks"] exists
1205            if "peaks" not in spectrum.keys():
1206                raise KeyError(
1207                    "MSP not interpretted correctly, 'peaks' key not found in spectrum, check _dataframe attribute."
1208                )
1209
1210            # Convert spectrum["peaks"] to numpy array
1211            if not isinstance(spectrum["peaks"], np.ndarray):
1212                spectrum["peaks"] = np.array(spectrum["peaks"])
1213
1214            # Normalize peaks, if requested
1215            if normalize:
1216                spectrum["peaks"] = self.normalize_peaks(spectrum["peaks"])
1217
1218            # Add spectrum to library
1219            fe_lib.append(spectrum)
1220
1221        # Build FlashEntropy index
1222        fe_search = self._build_flash_entropy_index(fe_lib, fe_kwargs=fe_kwargs)
1223
1224        return fe_search
1225    
1226    def _check_msp_compatibility(self):
1227        """
1228        Check if the MSP file is compatible with the get_metabolomics_spectra_library method and provide feedback if it is not.
1229        """
1230        # Check polarity
1231        if (
1232            "polarity" not in self._data_frame.columns
1233            and "ionmode" not in self._data_frame.columns
1234        ):
1235            raise ValueError(
1236                "Neither 'polarity' nor 'ionmode' columns found in the input MSP metadata. Please check the file."
1237            )
1238        polarity_column = (
1239            "polarity" if "polarity" in self._data_frame.columns else "ionmode"
1240        )
1241
1242        # Check if polarity_column contents is either "positive" or "negative"
1243        if not all(self._data_frame[polarity_column].isin(["positive", "negative"])):
1244            raise ValueError(
1245                f"Input field on MSP '{polarity_column}' must contain only 'positive' or 'negative' values."
1246            )
1247
1248        # Check if the MSP file contains the required columns for metabolite metadata
1249        # inchikey, by name, not null
1250        # either formula or molecular_formula, not null
1251        if not all(self._data_frame["inchikey"].notnull()):
1252            raise ValueError(
1253                "Input field on MSP 'inchikey' must contain only non-null values."
1254            )
1255        if (
1256            "formula" not in self._data_frame.columns
1257            and "molecular_formula" not in self._data_frame.columns
1258        ):
1259            raise ValueError(
1260                "Input field on MSP must contain either 'formula' or 'molecular_formula' columns."
1261            )
1262        molecular_formula_column = (
1263            "formula" if "formula" in self._data_frame.columns else "molecular_formula"
1264        )
1265        if not all(self._data_frame[molecular_formula_column].notnull()):
1266            raise ValueError(
1267                f"Input field on MSP '{molecular_formula_column}' must contain only non-null values."
1268            )
1269
1270    def get_metabolomics_spectra_library(
1271        self,
1272        polarity,
1273        metabolite_metadata_mapping={},
1274        format="fe",
1275        normalize=True,
1276        fe_kwargs={},
1277    ):
1278        """
1279        Prepare metabolomics spectra library and associated metabolite metadata
1280
1281        Note: this uses the inchikey as the index for the metabolite metadata dataframe and for connecting to the spectra, so it must be in the input
1282
1283        """
1284        # Check if the MSP file is compatible with the get_metabolomics_spectra_library method
1285        self._check_msp_compatibility()
1286
1287        # Check if the polarity parameter is valid and if a polarity column exists in the dataframe
1288        if polarity not in ["positive", "negative"]:
1289            raise ValueError("Polarity must be 'positive' or 'negative'")
1290        polarity_column = (
1291            "polarity" if "polarity" in self._data_frame.columns else "ionmode"
1292        )
1293
1294        # Get a subset of the initial dataframea by polarity
1295        db_df = self._data_frame[self._data_frame[polarity_column] == polarity].copy()
1296
1297        # Rename the columns of the db_df to match the MetaboliteMetadata dataclass using the metabolite_metadata_mapping
1298        # If the mapping is not provided, use the default mapping
1299        if not metabolite_metadata_mapping:
1300            metabolite_metadata_mapping = {
1301                "chebi_id": "chebi",
1302                "kegg_id": "kegg",
1303                "refmet_name": "common_name",
1304                "molecular_formula": "formula",
1305                "gnps_spectra_id":"id",
1306                "precursormz": "precursor_mz",
1307                "precursortype":"ion_type"
1308            }
1309        db_df.rename(columns=metabolite_metadata_mapping, inplace=True)
1310        db_df["molecular_data_id"] = db_df["inchikey"]
1311
1312
1313
1314        # Check if the resulting dataframe has the required columns for the flash entropy search
1315        required_columns = ["molecular_data_id", "precursor_mz", "ion_type", "id"]
1316        for col in required_columns:
1317            if col not in db_df.columns:
1318                raise ValueError(
1319                    f"Input field on MSP must contain '{col}' column for FlashEntropy search."
1320                )
1321
1322        # Pull out the metabolite metadata from the dataframe and put it into a different dataframe
1323        # First get a list of the possible attributes of the MetaboliteMetadata dataclass
1324        metabolite_metadata_keys = list(MetaboliteMetadata.__annotations__.keys())
1325        # Replace id with molecular_data_id in metabolite_metadata_keys
1326        metabolite_metadata_keys = [
1327            "molecular_data_id" if x == "id" else x for x in metabolite_metadata_keys
1328        ]
1329        metabolite_metadata_df = db_df[
1330            db_df.columns[db_df.columns.isin(metabolite_metadata_keys)]
1331        ].copy()
1332
1333        # Make unique and recast the id column for metabolite metadata
1334        metabolite_metadata_df.drop_duplicates(subset=["molecular_data_id"], inplace=True)
1335        metabolite_metadata_df["id"] = metabolite_metadata_df["molecular_data_id"]
1336
1337        # Convert to a dictionary using the inchikey as the key
1338        metabolite_metadata_dict = metabolite_metadata_df.to_dict(
1339            orient="records"
1340        )
1341        metabolite_metadata_dict = {
1342            v["id"]: self._dict_to_dataclass(v, MetaboliteMetadata)
1343            for v in metabolite_metadata_dict
1344        }
1345
1346        # Remove the metabolite metadata columns from the original dataframe
1347        for key in metabolite_metadata_keys:
1348            if key != "molecular_data_id":
1349                if key in db_df.columns:
1350                    db_df.drop(columns=key, inplace=True)
1351
1352        # Format the spectral library
1353        format_func = self._get_format_func(format)
1354        lib = format_func(db_df, normalize=normalize, fe_kwargs=fe_kwargs)
1355        return (lib, metabolite_metadata_dict)

Interface to parse NIST MSP files

MSPInterface(file_path)
 995    def __init__(self, file_path):
 996        """
 997        Initialize instance.
 998
 999        Parameters
1000        ----------
1001        file_path : str
1002            Path to a local MSP file.
1003
1004        Attributes
1005        ----------
1006        file_path : str
1007            Path to the MSP file.
1008        _file_content : str
1009            Content of the MSP file.
1010        _data_frame : :obj:`~pandas.DataFrame`
1011            DataFrame of spectra from the MSP file with unaltered content.
1012        """
1013        super().__init__(key=None)
1014
1015        self.file_path = file_path
1016        if not os.path.exists(self.file_path):
1017            raise FileNotFoundError(
1018                f"File {self.file_path} does not exist. Please check the file path."
1019            )
1020        with open(self.file_path, "r") as f:
1021            self._file_content = f.read()
1022
1023        self._data_frame = self._read_msp_file()
1024        self.__init_format_map__()

Initialize instance.

Parameters
  • file_path (str): Path to a local MSP file.
Attributes
  • file_path (str): Path to the MSP file.
  • _file_content (str): Content of the MSP file.
  • _data_frame (~pandas.DataFrame): DataFrame of spectra from the MSP file with unaltered content.
file_path
def get_metabolomics_spectra_library( self, polarity, metabolite_metadata_mapping={}, format='fe', normalize=True, fe_kwargs={}):
1270    def get_metabolomics_spectra_library(
1271        self,
1272        polarity,
1273        metabolite_metadata_mapping={},
1274        format="fe",
1275        normalize=True,
1276        fe_kwargs={},
1277    ):
1278        """
1279        Prepare metabolomics spectra library and associated metabolite metadata
1280
1281        Note: this uses the inchikey as the index for the metabolite metadata dataframe and for connecting to the spectra, so it must be in the input
1282
1283        """
1284        # Check if the MSP file is compatible with the get_metabolomics_spectra_library method
1285        self._check_msp_compatibility()
1286
1287        # Check if the polarity parameter is valid and if a polarity column exists in the dataframe
1288        if polarity not in ["positive", "negative"]:
1289            raise ValueError("Polarity must be 'positive' or 'negative'")
1290        polarity_column = (
1291            "polarity" if "polarity" in self._data_frame.columns else "ionmode"
1292        )
1293
1294        # Get a subset of the initial dataframea by polarity
1295        db_df = self._data_frame[self._data_frame[polarity_column] == polarity].copy()
1296
1297        # Rename the columns of the db_df to match the MetaboliteMetadata dataclass using the metabolite_metadata_mapping
1298        # If the mapping is not provided, use the default mapping
1299        if not metabolite_metadata_mapping:
1300            metabolite_metadata_mapping = {
1301                "chebi_id": "chebi",
1302                "kegg_id": "kegg",
1303                "refmet_name": "common_name",
1304                "molecular_formula": "formula",
1305                "gnps_spectra_id":"id",
1306                "precursormz": "precursor_mz",
1307                "precursortype":"ion_type"
1308            }
1309        db_df.rename(columns=metabolite_metadata_mapping, inplace=True)
1310        db_df["molecular_data_id"] = db_df["inchikey"]
1311
1312
1313
1314        # Check if the resulting dataframe has the required columns for the flash entropy search
1315        required_columns = ["molecular_data_id", "precursor_mz", "ion_type", "id"]
1316        for col in required_columns:
1317            if col not in db_df.columns:
1318                raise ValueError(
1319                    f"Input field on MSP must contain '{col}' column for FlashEntropy search."
1320                )
1321
1322        # Pull out the metabolite metadata from the dataframe and put it into a different dataframe
1323        # First get a list of the possible attributes of the MetaboliteMetadata dataclass
1324        metabolite_metadata_keys = list(MetaboliteMetadata.__annotations__.keys())
1325        # Replace id with molecular_data_id in metabolite_metadata_keys
1326        metabolite_metadata_keys = [
1327            "molecular_data_id" if x == "id" else x for x in metabolite_metadata_keys
1328        ]
1329        metabolite_metadata_df = db_df[
1330            db_df.columns[db_df.columns.isin(metabolite_metadata_keys)]
1331        ].copy()
1332
1333        # Make unique and recast the id column for metabolite metadata
1334        metabolite_metadata_df.drop_duplicates(subset=["molecular_data_id"], inplace=True)
1335        metabolite_metadata_df["id"] = metabolite_metadata_df["molecular_data_id"]
1336
1337        # Convert to a dictionary using the inchikey as the key
1338        metabolite_metadata_dict = metabolite_metadata_df.to_dict(
1339            orient="records"
1340        )
1341        metabolite_metadata_dict = {
1342            v["id"]: self._dict_to_dataclass(v, MetaboliteMetadata)
1343            for v in metabolite_metadata_dict
1344        }
1345
1346        # Remove the metabolite metadata columns from the original dataframe
1347        for key in metabolite_metadata_keys:
1348            if key != "molecular_data_id":
1349                if key in db_df.columns:
1350                    db_df.drop(columns=key, inplace=True)
1351
1352        # Format the spectral library
1353        format_func = self._get_format_func(format)
1354        lib = format_func(db_df, normalize=normalize, fe_kwargs=fe_kwargs)
1355        return (lib, metabolite_metadata_dict)

Prepare metabolomics spectra library and associated metabolite metadata

Note: this uses the inchikey as the index for the metabolite metadata dataframe and for connecting to the spectra, so it must be in the input