corems.molecular_id.search.database_interfaces

  1import os
  2import re
  3from abc import ABC
  4
  5import numpy as np
  6import requests
  7import pandas as pd
  8from ms_entropy import FlashEntropySearch
  9
 10from corems.molecular_id.factory.EI_SQL import EI_LowRes_SQLite, Metadatar
 11from corems.molecular_id.factory.lipid_molecular_metadata import LipidMetadata
 12from corems.mass_spectra.calc.lc_calc import find_closest
 13
 14
 15class SpectralDatabaseInterface(ABC):
 16    """
 17    Base class that facilitates connection to spectral reference databases,
 18    such as EMSL's Metabolomics Reference Database (MetabRef).
 19
 20    """
 21
 22    def __init__(self, key=None):
 23        """
 24        Initialize instance.
 25
 26        Parameters
 27        ----------
 28        key : str
 29            Token key.
 30
 31        """
 32
 33        self.key = key
 34
 35        if self.key is None:
 36            raise ValueError(
 37                "Must specify environment variable key for token associatedwith this database interface."
 38            )
 39
 40    def set_token(self, path):
 41        """
 42        Set environment variable for MetabRef database token.
 43
 44        Parameters
 45        ----------
 46        path : str
 47            Path to token.
 48
 49        """
 50
 51        # Read token from file
 52        with open(path, "r", encoding="utf-8") as f:
 53            token = f.readline().strip()
 54
 55        # Set environment variable
 56        os.environ[self.key] = token
 57
 58    def get_token(self):
 59        """
 60        Get environment variable for database token.
 61
 62        Returns
 63        -------
 64        str
 65            Token string.
 66
 67        """
 68
 69        # Check for token
 70        if self.key not in os.environ:
 71            raise ValueError("Must set {} environment variable.".format(self.key))
 72
 73        # Get token from environment variables
 74        return os.environ.get(self.key)
 75
 76    def get_header(self):
 77        """
 78        Access stored database token and prepare as header.
 79
 80        Returns
 81        -------
 82        str
 83            Header string.
 84
 85        """
 86
 87        # Get token
 88        token = self.get_token()
 89
 90        # Pad header information
 91        header = {"Authorization": f"Bearer {token}", "Content-Type": "text/plain"}
 92
 93        return header
 94
 95    def get_query(self, url, use_header=True):
 96        """
 97        Request payload from URL according to `get` protocol.
 98
 99        Parameters
100        ----------
101        url : str
102            URL for request.
103        use_header: bool
104            Whether or not the query should include the header
105
106        Returns
107        -------
108        dict
109            Response as JSON.
110
111        """
112
113        # Query URL via `get`
114        if use_header:
115            response = requests.get(url, headers=self.get_header())
116        else:
117            response = requests.get(url)
118
119        # Check response
120        response.raise_for_status()
121
122        # Return as JSON
123        return response.json()
124
125    def post_query(self, url, variable, values, tolerance):
126        """
127        Request payload from URL according to `post` protocol.
128
129        Parameters
130        ----------
131        url : str
132            URL for request.
133        variable : str
134            Variable to query.
135        values : str
136            Specific values of `variable` to query.
137        tolerance : str
138            Query tolerance relative to `values`.
139
140        Returns
141        -------
142        dict
143            Response as JSON.
144
145        """
146
147        # Coerce to string
148        if not isinstance(variable, str):
149            variable = str(variable).replace(" ", "")
150
151        if not isinstance(values, str):
152            values = str(values).replace(" ", "")
153
154        if not isinstance(tolerance, str):
155            tolerance = str(tolerance).replace(" ", "")
156
157        # Query URL via `post`
158        response = requests.post(
159            os.path.join(url, variable, tolerance),
160            data=values,
161            headers=self.get_header(),
162        )
163
164        # Check response
165        response.raise_for_status()
166
167        # Return as JSON
168        return response.json()
169
170
171class MetabRefInterface(SpectralDatabaseInterface):
172    """
173    Interface to the Metabolomics Reference Database.
174    """
175
176    def __init__(self):
177        """
178        Initialize instance.
179
180        """
181
182        super().__init__(key="METABREF_TOKEN")
183
184    def _get_format_func(self, format):
185        """
186        Obtain format function by key.
187
188        Returns
189        -------
190        func
191            Formatting function.
192        """
193
194        if format.lower() in self.format_map.keys():
195            return self.format_map[format.lower()]
196
197        raise ValueError(("{} not a supported format.").format(format))
198
199    def spectrum_to_array(self, spectrum, normalize=True):
200        """
201        Convert MetabRef-formatted spectrum to array.
202
203        Parameters
204        ----------
205        spectrum : str
206            MetabRef spectrum, i.e. list of (m/z,abundance) pairs.
207        normalize : bool
208            Normalize the spectrum by its magnitude.
209
210        Returns
211        -------
212        :obj:`~numpy.array`
213            Array of shape (N, 2), with m/z in the first column and abundance in
214            the second.
215
216        """
217
218        # Convert parenthesis-delimited string to array
219        arr = np.array(
220            re.findall(r"\(([^,]+),([^)]+)\)", spectrum), dtype=float
221        ).reshape(-1, 2)
222
223        # Normalize the array
224        if normalize:
225            arr[:, -1] = arr[:, -1] / arr[:, -1].sum()
226
227        return arr
228
229    def _to_flashentropy(self, metabref_lib, normalize=True, fe_kwargs={}):
230        """
231        Convert metabref-formatted library to FlashEntropy library.
232
233        Parameters
234        ----------
235        metabref_lib : dict
236            MetabRef MS2 library in JSON format or FlashEntropy search instance (for reformatting at different MS2 separation).
237        normalize : bool
238            Normalize each spectrum by its magnitude.
239        fe_kwargs : dict, optional
240            Keyword arguments for instantiation of FlashEntropy search and building index for FlashEntropy search;
241            any keys not recognized will be ignored. By default, all parameters set to defaults.
242
243        Returns
244        -------
245        :obj:`~ms_entropy.FlashEntropySearch`
246            MS2 library as FlashEntropy search instance.
247
248        Raises
249        ------
250        ValueError
251            If "min_ms2_difference_in_da" or "max_ms2_tolerance_in_da" are present in `fe_kwargs` and they are not equal.
252
253        """
254        # If "min_ms2_difference_in_da" in fe_kwargs, check that "max_ms2_tolerance_in_da" is also present and that min_ms2_difference_in_da = 2xmax_ms2_tolerance_in_da
255        if (
256            "min_ms2_difference_in_da" in fe_kwargs
257            or "max_ms2_tolerance_in_da" in fe_kwargs
258        ):
259            if (
260                "min_ms2_difference_in_da" not in fe_kwargs
261                or "max_ms2_tolerance_in_da" not in fe_kwargs
262            ):
263                raise ValueError(
264                    "Both 'min_ms2_difference_in_da' and 'max_ms2_tolerance_in_da' must be specified."
265                )
266            if (
267                fe_kwargs["min_ms2_difference_in_da"]
268                != 2 * fe_kwargs["max_ms2_tolerance_in_da"]
269            ):
270                raise ValueError(
271                    "The values of 'min_ms2_difference_in_da' must be exactly 2x 'max_ms2_tolerance_in_da'."
272                )
273
274        # Initialize empty library
275        fe_lib = []
276
277        # Enumerate spectra
278        for i, source in enumerate(metabref_lib):
279            # Reorganize source dict, if necessary
280            if "spectrum_data" in source.keys():
281                spectrum = source["spectrum_data"]
282            else:
283                spectrum = source
284
285            # Rename precursor_mz key for FlashEntropy
286            if "precursor_mz" not in spectrum.keys():
287                spectrum["precursor_mz"] = spectrum.pop("precursor_ion")
288
289            # Convert CoreMS spectrum to array and clean, store as `peaks`
290            spectrum["peaks"] = self.spectrum_to_array(
291                spectrum["mz"], normalize=normalize
292            )
293
294            # Add spectrum to library
295            fe_lib.append(spectrum)
296
297        # Initialize FlashEntropy
298        fe_init_kws = [
299            "max_ms2_tolerance_in_da",
300            "mz_index_step",
301            "low_memory",
302            "path_data",
303        ]
304        fe_init_kws = {k: v for k, v in fe_kwargs.items() if k in fe_init_kws}
305        fes = FlashEntropySearch(**fe_init_kws)
306
307        # Build FlashEntropy index
308        fe_index_kws = [
309            "max_indexed_mz",
310            "precursor_ions_removal_da",
311            "noise_threshold",
312            "min_ms2_difference_in_da",
313            "max_peak_num",
314        ]
315        fe_index_kws = {k: v for k, v in fe_kwargs.items() if k in fe_index_kws}
316        fes.build_index(fe_lib, **fe_index_kws, clean_spectra=True)
317
318        return fes
319
320    def _dict_to_dataclass(self, metabref_lib, data_class):
321        """
322        Convert dictionary to dataclass.
323
324        Notes
325        -----
326        This function will pull the attributes a dataclass and its parent class
327        and convert the dictionary to a dataclass instance with the appropriate
328        attributes.
329
330        Parameters
331        ----------
332        data_class : :obj:`~dataclasses.dataclass`
333            Dataclass to convert to.
334        metabref_lib : dict
335            Metabref dictionary object to convert to dataclass.
336
337        Returns
338        -------
339        :obj:`~dataclasses.dataclass`
340            Dataclass instance.
341
342        """
343
344        # Get list of expected attributes of data_class
345        data_class_keys = list(data_class.__annotations__.keys())
346
347        # Does the data_class inherit from another class, if so, get the attributes of the parent class as well
348        if len(data_class.__mro__) > 2:
349            parent_class_keys = list(data_class.__bases__[0].__annotations__.keys())
350            data_class_keys = list(set(data_class_keys + parent_class_keys))
351
352        # Remove keys that are not in the data_class from the input dictionary
353        input_dict = {k: v for k, v in metabref_lib.items() if k in data_class_keys}
354
355        # Add keys that are in the data class but not in the input dictionary as None
356        for key in data_class_keys:
357            if key not in input_dict.keys():
358                input_dict[key] = None
359        return data_class(**input_dict)
360    
361    def get_query(self, url, use_header=False):
362        """Overwrites the get_query method on the parent class to default to not use a header
363        
364        Notes
365        -----
366        As of January 2025, the metabref database no longer requires a token and therefore no header is needed
367
368        """
369        return super().get_query(url, use_header)
370
371
372class MetabRefGCInterface(MetabRefInterface):
373    """
374    Interface to the Metabolomics Reference Database.
375    """
376
377    def __init__(self):
378        """
379        Initialize instance.
380
381        """
382
383        super().__init__()
384        self.GCMS_LIBRARY_URL = "https://metabref.emsl.pnnl.gov/api/mslevel/1"
385        self.FAMES_URL = "https://metabref.emsl.pnnl.gov/api/fames"
386
387        self.__init_format_map__()
388
389    def __init_format_map__(self):
390        """
391        Initialize database format mapper, enabling multiple format requests.
392
393        """
394
395        # Define format workflows
396        self.format_map = {
397            "json": lambda x, normalize, fe_kwargs: x,
398            "dict": lambda x,
399            normalize,
400            fe_kwargs: self._to_LowResolutionEICompound_dict(x, normalize),
401            "sql": lambda x,
402            normalize,
403            fe_kwargs: self._LowResolutionEICompound_dict_to_sqlite(
404                self._to_LowResolutionEICompound_dict(x, normalize)
405            ),
406        }
407
408        # Add aliases
409        self.format_map["metabref"] = self.format_map["json"]
410        self.format_map["datadict"] = self.format_map["dict"]
411        self.format_map["data-dict"] = self.format_map["dict"]
412        self.format_map["lowreseicompound"] = self.format_map["dict"]
413        self.format_map["lowres"] = self.format_map["dict"]
414        self.format_map["lowresgc"] = self.format_map["dict"]
415        self.format_map["sqlite"] = self.format_map["sql"]
416
417    def available_formats(self):
418        """
419        View list of available formats.
420
421        Returns
422        -------
423        list
424            Format map keys.
425        """
426
427        return list(self.format_map.keys())
428
429    def get_library(self, format="json", normalize=False):
430        """
431        Request MetabRef GC/MS library.
432
433        Parameters
434        ----------
435        format : str
436            Format of requested library, i.e. "json", "sql", "flashentropy".
437            See `available_formats` method for aliases.
438        normalize : bool
439            Normalize the spectrum by its magnitude.
440
441        Returns
442        -------
443        Library in requested format.
444
445        """
446
447        # Init format function
448        format_func = self._get_format_func(format)
449
450        return format_func(
451            self.get_query(self.GCMS_LIBRARY_URL)["GC-MS"], normalize, {}
452        )
453
454    def get_fames(self, format="json", normalize=False):
455        """
456        Request MetabRef GC/MS FAMEs library.
457
458        Parameters
459        ----------
460        format : str
461            Format of requested library, i.e. "json", "sql", "flashentropy".
462            See `available_formats` method for aliases.
463        normalize : bool
464            Normalize the spectrum by its magnitude.
465
466        Returns
467        -------
468        Library in requested format.
469
470        """
471
472        # Init format function
473        format_func = self._get_format_func(format)
474
475        return format_func(self.get_query(self.FAMES_URL)["GC-MS"], normalize, {})
476
477    def _to_LowResolutionEICompound_dict(self, metabref_lib, normalize=False):
478        """
479        Convert MetabRef-formatted library to CoreMS LowResolutionEICompound-formatted
480        dictionary for local ingestion.
481
482        Parameters
483        ----------
484        metabref_lib : dict
485            MetabRef GC-MS library in JSON format.
486        normalize : bool
487            Normalize each spectrum by its magnitude.
488
489        Returns
490        -------
491        list of dict
492            List of each spectrum contained in dictionary.
493
494        """
495
496        # All below key:value lookups are based on CoreMS class definitions
497        # NOT MetabRef content. For example, MetabRef has keys for PubChem,
498        # USI, etc. that are not considered below.
499
500        # Dictionary to map metabref keys to corems keys
501        metadatar_cols = {
502            "casno": "cas",
503            "inchikey": "inchikey",
504            "inchi": "inchi",
505            "chebi": "chebi",
506            "smiles": "smiles",
507            "kegg": "kegg",
508            "iupac_name": "iupac_name",
509            "traditional_name": "traditional_name",  # Not present in metabref
510            "common_name": "common_name",  # Not present in metabref
511        }
512
513        # Dictionary to map metabref keys to corems keys
514        lowres_ei_compound_cols = {
515            "id": "metabref_id",
516            "molecule_name": "name",  # Is this correct?
517            "classify": "classify",  # Not present in metabref
518            "formula": "formula",
519            "ri": "ri",
520            "rt": "retention_time",
521            "source": "source",  # Not present in metabref
522            "casno": "casno",
523            "comments": "comment",
524            "source_temp_c": "source_temp_c",  # Not present in metabref
525            "ev": "ev",  # Not present in metabref
526            "peak_count": "peaks_count",
527            "mz": "mz",
528            "abundance": "abundance",
529        }
530
531        # Local result container
532        corems_lib = []
533
534        # Enumerate spectra
535        for i, source_ in enumerate(metabref_lib):
536            # Copy source to prevent modification
537            source = source_.copy()
538
539            # Flatten source dict
540            source = source.pop("spectrum_data") | source
541
542            # Parse target data
543            target = {
544                lowres_ei_compound_cols[k]: v
545                for k, v in source.items()
546                if k in lowres_ei_compound_cols
547            }
548
549            # Explicitly add this to connect with LowResCompoundRef later
550            target["rt"] = source["rt"]
551
552            # Parse (mz, abundance)
553            arr = self.spectrum_to_array(target["mz"], normalize=normalize)
554            target["mz"] = arr[:, 0]
555            target["abundance"] = arr[:, 1]
556
557            # Parse meta data
558            target["metadata"] = {
559                metadatar_cols[k]: v for k, v in source.items() if k in metadatar_cols
560            }
561
562            # Add anything else
563            for k in source:
564                if k not in lowres_ei_compound_cols:
565                    target[k] = source[k]
566
567            # Add to CoreMS list
568            corems_lib.append(target)
569
570        return corems_lib
571
572    def _LowResolutionEICompound_dict_to_sqlite(
573        self, lowres_ei_compound_dict, url="sqlite://"
574    ):
575        """
576        Convert CoreMS LowResolutionEICompound-formatted dictionary to SQLite
577        database for local ingestion.
578
579        Parameters
580        ----------
581        lowres_ei_compound_dict : dict
582            CoreMS GC-MS library formatted for LowResolutionEICompound.
583        url : str
584            URL to SQLite prefix.
585
586        Returns
587        -------
588        sqlite database
589            Spectra contained in SQLite database.
590
591        """
592
593        # Dictionary to map corems keys to all-caps keys
594        capped_cols = {
595            "name": "NAME",
596            "formula": "FORM",
597            "ri": "RI",
598            "retention_time": "RT",
599            "source": "SOURCE",
600            "casno": "CASNO",
601            "comment": "COMMENT",
602            "peaks_count": "NUM PEAKS",
603        }
604
605        # Initialize SQLite object
606        sqlite_obj = EI_LowRes_SQLite(url=url)
607
608        # Iterate spectra
609        for _data_dict in lowres_ei_compound_dict:
610            # Copy source to prevent modification
611            data_dict = _data_dict.copy()
612
613            # Add missing capped values
614            for k, v in capped_cols.items():
615                # Key exists
616                if k in data_dict:
617                    # # This will replace the key
618                    # data_dict[v] = data_dict.pop(k)
619
620                    # This will keep both keys
621                    data_dict[v] = data_dict[k]
622
623            # Parse number of peaks
624            if not data_dict.get("NUM PEAKS"):
625                data_dict["NUM PEAKS"] = len(data_dict.get("mz"))
626
627            # Parse CAS number
628            if not data_dict.get("CASNO"):
629                data_dict["CASNO"] = data_dict.get("CAS")
630
631            if not data_dict["CASNO"]:
632                data_dict["CASNO"] = 0
633
634            # Build linked metadata table
635            if "metadata" in data_dict:
636                if len(data_dict["metadata"]) > 0:
637                    data_dict["metadatar"] = Metadatar(**data_dict.pop("metadata"))
638                else:
639                    data_dict.pop("metadata")
640
641            # Attempt addition to sqlite
642            try:
643                sqlite_obj.add_compound(data_dict)
644            except:
645                print(data_dict["NAME"])
646
647        return sqlite_obj
648
649
650class MetabRefLCInterface(MetabRefInterface):
651    """
652    Interface to the Metabolomics Reference Database for LC-MS data.
653    """
654
655    def __init__(self):
656        """
657        Initialize instance.
658
659        """
660
661        super().__init__()
662
663        # API endpoint for precursor m/z search
664        # inputs = mz, tolerance (in Da), polarity, page_no, per_page
665        self.PRECURSOR_MZ_URL = (
666            "https://metabref.emsl.pnnl.gov/api/precursors/m/{}/t/{}/{}?page={}&per_page={}"
667        )
668
669        # API endpoint for returning full list of precursor m/z values in database
670        # inputs = polarity, page_no, per_page
671        self.PRECURSOR_MZ_ALL_URL = "https://metabref.emsl.pnnl.gov/api/precursors/{}?page={}&per_page={}"
672
673        self.__init_format_map__()
674
675    def __init_format_map__(self):
676        """
677        Initialize database format mapper, enabling multiple format requests.
678
679        """
680
681        # Define format workflows
682        self.format_map = {
683            "json": lambda x, normalize, fe_kwargs: x,
684            "flashentropy": lambda x, normalize, fe_kwargs: self._to_flashentropy(
685                x, normalize, fe_kwargs
686            ),
687        }
688
689        # Add aliases
690        self.format_map["metabref"] = self.format_map["json"]
691        self.format_map["fe"] = self.format_map["flashentropy"]
692        self.format_map["flash-entropy"] = self.format_map["flashentropy"]
693
694    def query_by_precursor(self, mz_list, polarity, mz_tol_ppm, mz_tol_da_api=0.2, max_per_page=50):
695        """
696        Query MetabRef by precursor m/z values.
697
698        Parameters
699        ----------
700        mz_list : list
701            List of precursor m/z values.
702        polarity : str
703            Ionization polarity, either "positive" or "negative".
704        mz_tol_ppm : float
705            Tolerance in ppm for each precursor m/z value.
706            Used for retrieving from a potential match from database.
707        mz_tol_da_api : float, optional
708            Maximum tolerance between precursor m/z values for API search, in daltons.
709            Used to group similar mzs into a single API query for speed. Default is 0.2.
710        max_per_page : int, optional
711            Maximum records to return from MetabRef API query at a time.  Default is 50.
712
713        Returns
714        -------
715        list
716            List of library entries in original JSON format.
717        """
718
719        # If polarity is anything other than positive or negative, raise error
720        if polarity not in ["positive", "negative"]:
721            raise ValueError("Polarity must be 'positive' or 'negative'")
722
723        # Cluster groups of mz according to mz_tol_da_api for precursor query
724        mz_list.sort()
725        mz_groups = [[mz_list[0]]]
726        for x in mz_list[1:]:
727            if abs(x - mz_groups[-1][0]) <= mz_tol_da_api:
728                mz_groups[-1].append(x)
729            else:
730                mz_groups.append([x])
731
732        # Query MetabRef for each mz group
733        lib = []
734        for mz_group in mz_groups:
735            mz = np.mean(mz_group)
736            if len(mz_group) == 1:
737                mz = mz_group[0]
738                tol = mz_tol_ppm * 10**-6 * mz
739            else:
740                mz = (max(mz_group) - min(mz_group)) / 2 + min(mz_group)
741                tol = (max(mz_group) - min(mz_group)) / 2 + mz_tol_ppm**-6 * max(
742                    mz_group
743                )
744            
745            # Get first page of results
746            response = self.get_query(
747                self.PRECURSOR_MZ_URL.format(str(mz), str(tol), polarity, 1, max_per_page)
748            )
749            lib = lib + response['results']
750
751            # If there are more pages of results, get them
752            if response['total_pages'] > 1: 
753                for i in np.arange(2, response['total_pages']+1):
754                    lib = lib + self.get_query(
755                        self.PRECURSOR_MZ_URL.format(str(mz), str(tol), polarity, i, max_per_page)
756                        )['results']
757
758        return lib
759
760    def request_all_precursors(self, polarity, per_page = 50000):
761        """
762        Request all precursor m/z values for MS2 spectra from MetabRef.
763
764        Parameters
765        ----------
766        polarity : str
767            Ionization polarity, either "positive" or "negative".
768        per_page : int, optional
769            Number of records to fetch per call. Default is 50000
770
771        Returns
772        -------
773        list
774            List of all precursor m/z values, sorted.
775        """
776        # If polarity is anything other than positive or negative, raise error
777        if polarity not in ["positive", "negative"]:
778            raise ValueError("Polarity must be 'positive' or 'negative'")
779
780        precursors = []    
781
782        # Get first page of results and total number of pages of results
783        response = self.get_query(self.PRECURSOR_MZ_ALL_URL.format(polarity, str(1), str(per_page)))
784        total_pages = response['total_pages']
785        precursors.extend([x['precursor_ion'] for x in response['results']])
786
787        # Go through remaining pages of results
788        for i in np.arange(2, total_pages + 1):
789            response = self.get_query(self.PRECURSOR_MZ_ALL_URL.format(polarity, str(i), str(per_page)))
790            precursors.extend([x['precursor_ion'] for x in response['results']])
791        
792        # Sort precursors from smallest to largest and remove duplicates
793        precursors = list(set(precursors))
794        precursors.sort()
795
796        return precursors
797
798    def get_lipid_library(
799        self,
800        mz_list,
801        polarity,
802        mz_tol_ppm,
803        mz_tol_da_api=0.2,
804        format="json",
805        normalize=True,
806        fe_kwargs={},
807    ):
808        """
809        Request MetabRef lipid library.
810
811        Parameters
812        ----------
813        mz_list : list
814            List of precursor m/z values.
815        polarity : str
816            Ionization polarity, either "positive" or "negative".
817        mz_tol_ppm : float
818            Tolerance in ppm for each precursor m/z value.
819            Used for retrieving from a potential match from database.
820        mz_tol_da_api : float, optional
821            Maximum tolerance between precursor m/z values for API search, in daltons.
822            Used to group similar mzs into a single API query for speed. Default is 0.2.
823        format : str, optional
824            Format of requested library, i.e. "json", "sql", "flashentropy".
825            See `available_formats` method for aliases. Default is "json".
826        normalize : bool, optional
827            Normalize the spectrum by its magnitude. Default is True.
828        fe_kwargs : dict, optional
829            Keyword arguments for FlashEntropy search. Default is {}.
830
831        Returns
832        -------
833        tuple
834            Library in requested format and lipid metadata as a LipidMetadata dataclass.
835
836        """
837        mz_list.sort()
838        mz_list = np.array(mz_list)
839
840        # Get all precursors in the library matching the polarity
841        precusors_in_lib = self.request_all_precursors(polarity=polarity)
842        precusors_in_lib = np.array(precusors_in_lib)
843
844        # Compare the mz_list with the precursors in the library, keep any mzs that are within mz_tol of any precursor in the library
845        lib_mz_df = pd.DataFrame(precusors_in_lib, columns=["lib_mz"])
846        lib_mz_df["closest_obs_mz"] = mz_list[
847            find_closest(mz_list, lib_mz_df.lib_mz.values)
848        ]
849        lib_mz_df["mz_diff_ppm"] = np.abs(
850            (lib_mz_df["lib_mz"] - lib_mz_df["closest_obs_mz"])
851            / lib_mz_df["lib_mz"]
852            * 1e6
853        )
854        lib_mz_sub = lib_mz_df[lib_mz_df["mz_diff_ppm"] <= mz_tol_ppm]
855
856        # Do the same in the opposite direction
857        mz_df = pd.DataFrame(mz_list, columns=["mass_feature_mz"])
858        mz_df["closest_lib_pre_mz"] = precusors_in_lib[
859            find_closest(precusors_in_lib, mz_df.mass_feature_mz.values)
860        ]
861        mz_df["mz_diff_ppm"] = np.abs(
862            (mz_df["mass_feature_mz"] - mz_df["closest_lib_pre_mz"])
863            / mz_df["mass_feature_mz"]
864            * 1e6
865        )
866        mz_df_sub = mz_df[mz_df["mz_diff_ppm"] <= mz_tol_ppm]
867
868        # Evaluate which is fewer mzs - lib_mz_sub or mz_df_sub and use that as the input for next step
869        if len(lib_mz_sub) < len(mz_df_sub):
870            mzs_to_query = lib_mz_sub.lib_mz.values
871        else:
872            mzs_to_query = mz_df_sub.mass_feature_mz.values
873
874        # Query the library for the precursors in the mz_list that are in the library to retrieve the spectra and metadata
875        lib = self.query_by_precursor(
876            mz_list=mzs_to_query,
877            polarity=polarity,
878            mz_tol_ppm=mz_tol_ppm,
879            mz_tol_da_api=mz_tol_da_api,
880        )
881
882        # Pull out lipid metadata from the metabref library and convert to LipidMetadata dataclass
883        mol_data_dict = {x["id"]: x["Molecular Data"] for x in lib}
884        lipid_lib = {x["id"]: x["Lipid Tree"] for x in lib if "Lipid Tree" in x.keys()}
885        mol_data_dict = {k: {**v, **lipid_lib[k]} for k, v in mol_data_dict.items()}
886        mol_data_dict = {
887            k: self._dict_to_dataclass(v, LipidMetadata)
888            for k, v in mol_data_dict.items()
889        }
890
891        # Remove lipid metadata from the metabref library
892        lib = [
893            {k: v for k, v in x.items() if k not in ["Molecular Data", "Lipid Tree"]}
894            for x in lib
895        ]
896        # Unpack the 'Lipid Fragments' key and the 'MSO Data" key from each entry
897        for x in lib:
898            if "Lipid Fragments" in x.keys():
899                x.update(x.pop("Lipid Fragments"))
900            if "MSO Data" in x.keys():
901                x.update(x.pop("MSO Data"))
902
903        # Format the spectral library
904        format_func = self._get_format_func(format)
905        lib = format_func(lib, normalize=normalize, fe_kwargs=fe_kwargs)
906        return (lib, mol_data_dict)
class SpectralDatabaseInterface(abc.ABC):
 16class SpectralDatabaseInterface(ABC):
 17    """
 18    Base class that facilitates connection to spectral reference databases,
 19    such as EMSL's Metabolomics Reference Database (MetabRef).
 20
 21    """
 22
 23    def __init__(self, key=None):
 24        """
 25        Initialize instance.
 26
 27        Parameters
 28        ----------
 29        key : str
 30            Token key.
 31
 32        """
 33
 34        self.key = key
 35
 36        if self.key is None:
 37            raise ValueError(
 38                "Must specify environment variable key for token associatedwith this database interface."
 39            )
 40
 41    def set_token(self, path):
 42        """
 43        Set environment variable for MetabRef database token.
 44
 45        Parameters
 46        ----------
 47        path : str
 48            Path to token.
 49
 50        """
 51
 52        # Read token from file
 53        with open(path, "r", encoding="utf-8") as f:
 54            token = f.readline().strip()
 55
 56        # Set environment variable
 57        os.environ[self.key] = token
 58
 59    def get_token(self):
 60        """
 61        Get environment variable for database token.
 62
 63        Returns
 64        -------
 65        str
 66            Token string.
 67
 68        """
 69
 70        # Check for token
 71        if self.key not in os.environ:
 72            raise ValueError("Must set {} environment variable.".format(self.key))
 73
 74        # Get token from environment variables
 75        return os.environ.get(self.key)
 76
 77    def get_header(self):
 78        """
 79        Access stored database token and prepare as header.
 80
 81        Returns
 82        -------
 83        str
 84            Header string.
 85
 86        """
 87
 88        # Get token
 89        token = self.get_token()
 90
 91        # Pad header information
 92        header = {"Authorization": f"Bearer {token}", "Content-Type": "text/plain"}
 93
 94        return header
 95
 96    def get_query(self, url, use_header=True):
 97        """
 98        Request payload from URL according to `get` protocol.
 99
100        Parameters
101        ----------
102        url : str
103            URL for request.
104        use_header: bool
105            Whether or not the query should include the header
106
107        Returns
108        -------
109        dict
110            Response as JSON.
111
112        """
113
114        # Query URL via `get`
115        if use_header:
116            response = requests.get(url, headers=self.get_header())
117        else:
118            response = requests.get(url)
119
120        # Check response
121        response.raise_for_status()
122
123        # Return as JSON
124        return response.json()
125
126    def post_query(self, url, variable, values, tolerance):
127        """
128        Request payload from URL according to `post` protocol.
129
130        Parameters
131        ----------
132        url : str
133            URL for request.
134        variable : str
135            Variable to query.
136        values : str
137            Specific values of `variable` to query.
138        tolerance : str
139            Query tolerance relative to `values`.
140
141        Returns
142        -------
143        dict
144            Response as JSON.
145
146        """
147
148        # Coerce to string
149        if not isinstance(variable, str):
150            variable = str(variable).replace(" ", "")
151
152        if not isinstance(values, str):
153            values = str(values).replace(" ", "")
154
155        if not isinstance(tolerance, str):
156            tolerance = str(tolerance).replace(" ", "")
157
158        # Query URL via `post`
159        response = requests.post(
160            os.path.join(url, variable, tolerance),
161            data=values,
162            headers=self.get_header(),
163        )
164
165        # Check response
166        response.raise_for_status()
167
168        # Return as JSON
169        return response.json()

Base class that facilitates connection to spectral reference databases, such as EMSL's Metabolomics Reference Database (MetabRef).

SpectralDatabaseInterface(key=None)
23    def __init__(self, key=None):
24        """
25        Initialize instance.
26
27        Parameters
28        ----------
29        key : str
30            Token key.
31
32        """
33
34        self.key = key
35
36        if self.key is None:
37            raise ValueError(
38                "Must specify environment variable key for token associatedwith this database interface."
39            )

Initialize instance.

Parameters
  • key (str): Token key.
key
def set_token(self, path):
41    def set_token(self, path):
42        """
43        Set environment variable for MetabRef database token.
44
45        Parameters
46        ----------
47        path : str
48            Path to token.
49
50        """
51
52        # Read token from file
53        with open(path, "r", encoding="utf-8") as f:
54            token = f.readline().strip()
55
56        # Set environment variable
57        os.environ[self.key] = token

Set environment variable for MetabRef database token.

Parameters
  • path (str): Path to token.
def get_token(self):
59    def get_token(self):
60        """
61        Get environment variable for database token.
62
63        Returns
64        -------
65        str
66            Token string.
67
68        """
69
70        # Check for token
71        if self.key not in os.environ:
72            raise ValueError("Must set {} environment variable.".format(self.key))
73
74        # Get token from environment variables
75        return os.environ.get(self.key)

Get environment variable for database token.

Returns
  • str: Token string.
def get_header(self):
77    def get_header(self):
78        """
79        Access stored database token and prepare as header.
80
81        Returns
82        -------
83        str
84            Header string.
85
86        """
87
88        # Get token
89        token = self.get_token()
90
91        # Pad header information
92        header = {"Authorization": f"Bearer {token}", "Content-Type": "text/plain"}
93
94        return header

Access stored database token and prepare as header.

Returns
  • dict: Request headers (bearer Authorization and plain-text Content-Type).
def get_query(self, url, use_header=True):
 96    def get_query(self, url, use_header=True):
 97        """
 98        Request payload from URL according to `get` protocol.
 99
100        Parameters
101        ----------
102        url : str
103            URL for request.
104        use_header: bool
105            Whether or not the query should include the header
106
107        Returns
108        -------
109        dict
110            Response as JSON.
111
112        """
113
114        # Query URL via `get`
115        if use_header:
116            response = requests.get(url, headers=self.get_header())
117        else:
118            response = requests.get(url)
119
120        # Check response
121        response.raise_for_status()
122
123        # Return as JSON
124        return response.json()

Request payload from URL according to get protocol.

Parameters
  • url (str): URL for request.
  • use_header (bool): Whether or not the query should include the header
Returns
  • dict: Response as JSON.
def post_query(self, url, variable, values, tolerance):
126    def post_query(self, url, variable, values, tolerance):
127        """
128        Request payload from URL according to `post` protocol.
129
130        Parameters
131        ----------
132        url : str
133            URL for request.
134        variable : str
135            Variable to query.
136        values : str
137            Specific values of `variable` to query.
138        tolerance : str
139            Query tolerance relative to `values`.
140
141        Returns
142        -------
143        dict
144            Response as JSON.
145
146        """
147
148        # Coerce to string
149        if not isinstance(variable, str):
150            variable = str(variable).replace(" ", "")
151
152        if not isinstance(values, str):
153            values = str(values).replace(" ", "")
154
155        if not isinstance(tolerance, str):
156            tolerance = str(tolerance).replace(" ", "")
157
158        # Query URL via `post`
159        response = requests.post(
160            os.path.join(url, variable, tolerance),
161            data=values,
162            headers=self.get_header(),
163        )
164
165        # Check response
166        response.raise_for_status()
167
168        # Return as JSON
169        return response.json()

Request payload from URL according to post protocol.

Parameters
  • url (str): URL for request.
  • variable (str): Variable to query.
  • values (str): Specific values of variable to query.
  • tolerance (str): Query tolerance relative to values.
Returns
  • dict: Response as JSON.
class MetabRefInterface(SpectralDatabaseInterface):
class MetabRefInterface(SpectralDatabaseInterface):
    """
    Interface to the Metabolomics Reference Database.
    """

    def __init__(self):
        """
        Initialize instance.

        """

        super().__init__(key="METABREF_TOKEN")

    def _get_format_func(self, format):
        """
        Obtain format function by key.

        Parameters
        ----------
        format : str
            Format name or alias; matched case-insensitively.

        Returns
        -------
        func
            Formatting function.

        Raises
        ------
        ValueError
            If `format` is not a key of `self.format_map`.
        """

        # NOTE(review): `format_map` is not assigned in this class; it is
        # presumably set by subclasses before this method is called.
        key = format.lower()
        if key in self.format_map:
            return self.format_map[key]

        raise ValueError(("{} not a supported format.").format(format))

    def spectrum_to_array(self, spectrum, normalize=True):
        """
        Convert MetabRef-formatted spectrum to array.

        Parameters
        ----------
        spectrum : str
            MetabRef spectrum, i.e. list of (m/z,abundance) pairs.
        normalize : bool
            Scale abundances so that they sum to 1.

        Returns
        -------
        :obj:`~numpy.array`
            Array of shape (N, 2), with m/z in the first column and abundance in
            the second.

        """

        # Convert parenthesis-delimited string to array
        arr = np.array(
            re.findall(r"\(([^,]+),([^)]+)\)", spectrum), dtype=float
        ).reshape(-1, 2)

        # Normalize the array
        if normalize:
            total = arr[:, -1].sum()

            # A zero total (all-zero abundances) would turn every abundance
            # into NaN, so such spectra are returned unscaled.
            if total != 0:
                arr[:, -1] = arr[:, -1] / total

        return arr

    def _to_flashentropy(self, metabref_lib, normalize=True, fe_kwargs=None):
        """
        Convert metabref-formatted library to FlashEntropy library.

        Notes
        -----
        The spectrum dictionaries in `metabref_lib` are modified in place: a
        "precursor_ion" key is renamed to "precursor_mz" and a "peaks" array
        is added.

        Parameters
        ----------
        metabref_lib : dict
            MetabRef MS2 library in JSON format or FlashEntropy search instance (for reformatting at different MS2 separation).
        normalize : bool
            Normalize each spectrum so that its abundances sum to 1.
        fe_kwargs : dict, optional
            Keyword arguments for instantiation of FlashEntropy search and building index for FlashEntropy search;
            any keys not recognized will be ignored. By default, all parameters set to defaults.

        Returns
        -------
        :obj:`~ms_entropy.FlashEntropySearch`
            MS2 library as FlashEntropy search instance.

        Raises
        ------
        ValueError
            If only one of "min_ms2_difference_in_da" and "max_ms2_tolerance_in_da" is present in `fe_kwargs`,
            or if "min_ms2_difference_in_da" is not exactly twice "max_ms2_tolerance_in_da".

        """
        # A None default avoids sharing one dict object across calls
        if fe_kwargs is None:
            fe_kwargs = {}

        # The two MS2 tolerance settings must be given together, and
        # min_ms2_difference_in_da must equal 2 x max_ms2_tolerance_in_da
        tolerance_keys = ("min_ms2_difference_in_da", "max_ms2_tolerance_in_da")
        supplied = [k for k in tolerance_keys if k in fe_kwargs]
        if supplied:
            if len(supplied) != len(tolerance_keys):
                raise ValueError(
                    "Both 'min_ms2_difference_in_da' and 'max_ms2_tolerance_in_da' must be specified."
                )
            if (
                fe_kwargs["min_ms2_difference_in_da"]
                != 2 * fe_kwargs["max_ms2_tolerance_in_da"]
            ):
                raise ValueError(
                    "The values of 'min_ms2_difference_in_da' must be exactly 2x 'max_ms2_tolerance_in_da'."
                )

        # Initialize empty library
        fe_lib = []

        # Iterate spectra
        for source in metabref_lib:
            # Reorganize source dict, if necessary
            if "spectrum_data" in source.keys():
                spectrum = source["spectrum_data"]
            else:
                spectrum = source

            # Rename precursor_mz key for FlashEntropy
            if "precursor_mz" not in spectrum.keys():
                spectrum["precursor_mz"] = spectrum.pop("precursor_ion")

            # Convert CoreMS spectrum to array and clean, store as `peaks`
            spectrum["peaks"] = self.spectrum_to_array(
                spectrum["mz"], normalize=normalize
            )

            # Add spectrum to library
            fe_lib.append(spectrum)

        # Initialize FlashEntropy with only the keyword arguments it accepts
        fe_init_kws = [
            "max_ms2_tolerance_in_da",
            "mz_index_step",
            "low_memory",
            "path_data",
        ]
        fe_init_kws = {k: v for k, v in fe_kwargs.items() if k in fe_init_kws}
        fes = FlashEntropySearch(**fe_init_kws)

        # Build FlashEntropy index with only the keyword arguments it accepts
        fe_index_kws = [
            "max_indexed_mz",
            "precursor_ions_removal_da",
            "noise_threshold",
            "min_ms2_difference_in_da",
            "max_peak_num",
        ]
        fe_index_kws = {k: v for k, v in fe_kwargs.items() if k in fe_index_kws}
        fes.build_index(fe_lib, **fe_index_kws, clean_spectra=True)

        return fes

    def _dict_to_dataclass(self, metabref_lib, data_class):
        """
        Convert dictionary to dataclass.

        Notes
        -----
        This function will pull the attributes of a dataclass and its first
        parent class and convert the dictionary to a dataclass instance with
        the appropriate attributes.

        Parameters
        ----------
        data_class : :obj:`~dataclasses.dataclass`
            Dataclass to convert to.
        metabref_lib : dict
            Metabref dictionary object to convert to dataclass.

        Returns
        -------
        :obj:`~dataclasses.dataclass`
            Dataclass instance.

        """

        # Get list of expected attributes of data_class
        data_class_keys = list(data_class.__annotations__.keys())

        # Does the data_class inherit from another class, if so, get the attributes of the parent class as well
        if len(data_class.__mro__) > 2:
            parent_class_keys = list(data_class.__bases__[0].__annotations__.keys())
            data_class_keys = list(set(data_class_keys + parent_class_keys))

        # Remove keys that are not in the data_class from the input dictionary
        input_dict = {k: v for k, v in metabref_lib.items() if k in data_class_keys}

        # Add keys that are in the data class but not in the input dictionary as None
        for key in data_class_keys:
            if key not in input_dict:
                input_dict[key] = None
        return data_class(**input_dict)

    def get_query(self, url, use_header=False):
        """Overrides the get_query method on the parent class to default to not use a header

        Notes
        -----
        As of January 2025, the metabref database no longer requires a token and therefore no header is needed

        """
        return super().get_query(url, use_header)

Interface to the Metabolomics Reference Database.

MetabRefInterface()
177    def __init__(self):
178        """
179        Initialize instance.
180
181        """
182
183        super().__init__(key="METABREF_TOKEN")

Initialize instance.

def spectrum_to_array(self, spectrum, normalize=True):
200    def spectrum_to_array(self, spectrum, normalize=True):
201        """
202        Convert MetabRef-formatted spectrum to array.
203
204        Parameters
205        ----------
206        spectrum : str
207            MetabRef spectrum, i.e. list of (m/z,abundance) pairs.
208        normalize : bool
209            Normalize the spectrum by its magnitude.
210
211        Returns
212        -------
213        :obj:`~numpy.array`
214            Array of shape (N, 2), with m/z in the first column and abundance in
215            the second.
216
217        """
218
219        # Convert parenthesis-delimited string to array
220        arr = np.array(
221            re.findall(r"\(([^,]+),([^)]+)\)", spectrum), dtype=float
222        ).reshape(-1, 2)
223
224        # Normalize the array
225        if normalize:
226            arr[:, -1] = arr[:, -1] / arr[:, -1].sum()
227
228        return arr

Convert MetabRef-formatted spectrum to array.

Parameters
  • spectrum (str): MetabRef spectrum, i.e. list of (m/z,abundance) pairs.
  • normalize (bool): Scale the abundances so that they sum to 1.
Returns
  • ~numpy.array: Array of shape (N, 2), with m/z in the first column and abundance in the second.
def get_query(self, url, use_header=False):
362    def get_query(self, url, use_header=False):
363        """Overwrites the get_query method on the parent class to default to not use a header
364        
365        Notes
366        -----
367        As of January 2025, the metabref database no longer requires a token and therefore no header is needed
368
369        """
370        return super().get_query(url, use_header)

Overrides the get_query method on the parent class so that, by default, no header is used.

Notes

As of January 2025, the metabref database no longer requires a token and therefore no header is needed

class MetabRefGCInterface(MetabRefInterface):
class MetabRefGCInterface(MetabRefInterface):
    """
    Interface to the Metabolomics Reference Database for GC-MS data.
    """

    def __init__(self):
        """
        Initialize instance.

        Sets the GC-MS library and FAMEs endpoints and builds the format map.

        """

        super().__init__()
        self.GCMS_LIBRARY_URL = "https://metabref.emsl.pnnl.gov/api/mslevel/1"
        self.FAMES_URL = "https://metabref.emsl.pnnl.gov/api/fames"

        self.__init_format_map__()

    def __init_format_map__(self):
        """
        Initialize database format mapper, enabling multiple format requests.

        """

        # Define format workflows. Every entry takes (x, normalize, fe_kwargs)
        # so that all formats share one call signature; `fe_kwargs` is
        # ignored by the GC-MS formats.
        self.format_map = {
            "json": lambda x, normalize, fe_kwargs: x,
            "dict": lambda x, normalize, fe_kwargs: (
                self._to_LowResolutionEICompound_dict(x, normalize)
            ),
            "sql": lambda x, normalize, fe_kwargs: (
                self._LowResolutionEICompound_dict_to_sqlite(
                    self._to_LowResolutionEICompound_dict(x, normalize)
                )
            ),
        }

        # Add aliases
        self.format_map["metabref"] = self.format_map["json"]
        self.format_map["datadict"] = self.format_map["dict"]
        self.format_map["data-dict"] = self.format_map["dict"]
        self.format_map["lowreseicompound"] = self.format_map["dict"]
        self.format_map["lowres"] = self.format_map["dict"]
        self.format_map["lowresgc"] = self.format_map["dict"]
        self.format_map["sqlite"] = self.format_map["sql"]

    def available_formats(self):
        """
        View list of available formats.

        Returns
        -------
        list
            Format map keys.
        """

        return list(self.format_map.keys())

    def get_library(self, format="json", normalize=False):
        """
        Request MetabRef GC/MS library.

        Parameters
        ----------
        format : str
            Format of requested library, e.g. "json", "dict", "sql".
            See `available_formats` method for aliases.
        normalize : bool
            Normalize each spectrum so that its abundances sum to 1.

        Returns
        -------
        Library in requested format.

        """

        # Init format function (fails before any request if unsupported)
        format_func = self._get_format_func(format)

        return format_func(
            self.get_query(self.GCMS_LIBRARY_URL)["GC-MS"], normalize, {}
        )

    def get_fames(self, format="json", normalize=False):
        """
        Request MetabRef GC/MS FAMEs library.

        Parameters
        ----------
        format : str
            Format of requested library, e.g. "json", "dict", "sql".
            See `available_formats` method for aliases.
        normalize : bool
            Normalize each spectrum so that its abundances sum to 1.

        Returns
        -------
        Library in requested format.

        """

        # Init format function (fails before any request if unsupported)
        format_func = self._get_format_func(format)

        return format_func(self.get_query(self.FAMES_URL)["GC-MS"], normalize, {})

    def _to_LowResolutionEICompound_dict(self, metabref_lib, normalize=False):
        """
        Convert MetabRef-formatted library to CoreMS LowResolutionEICompound-formatted
        dictionary for local ingestion.

        Parameters
        ----------
        metabref_lib : dict
            MetabRef GC-MS library in JSON format.
        normalize : bool
            Normalize each spectrum so that its abundances sum to 1.

        Returns
        -------
        list of dict
            List of each spectrum contained in dictionary.

        """

        # All below key:value lookups are based on CoreMS class definitions
        # NOT MetabRef content. For example, MetabRef has keys for PubChem,
        # USI, etc. that are not considered below.

        # Dictionary to map metabref keys to corems metadata keys
        metadatar_cols = {
            "casno": "cas",
            "inchikey": "inchikey",
            "inchi": "inchi",
            "chebi": "chebi",
            "smiles": "smiles",
            "kegg": "kegg",
            "iupac_name": "iupac_name",
            "traditional_name": "traditional_name",  # Not present in metabref
            "common_name": "common_name",  # Not present in metabref
        }

        # Dictionary to map metabref keys to corems compound keys
        lowres_ei_compound_cols = {
            "id": "metabref_id",
            "molecule_name": "name",  # Is this correct?
            "classify": "classify",  # Not present in metabref
            "formula": "formula",
            "ri": "ri",
            "rt": "retention_time",
            "source": "source",  # Not present in metabref
            "casno": "casno",
            "comments": "comment",
            "source_temp_c": "source_temp_c",  # Not present in metabref
            "ev": "ev",  # Not present in metabref
            "peak_count": "peaks_count",
            "mz": "mz",
            "abundance": "abundance",
        }

        # Local result container
        corems_lib = []

        # Iterate spectra
        for source_ in metabref_lib:
            # Copy source to prevent modification
            source = source_.copy()

            # Flatten source dict; top-level keys win over "spectrum_data" keys
            spectrum_data = source.pop("spectrum_data")
            source = spectrum_data | source

            # Parse target data
            target = {
                lowres_ei_compound_cols[k]: v
                for k, v in source.items()
                if k in lowres_ei_compound_cols
            }

            # Explicitly add this to connect with LowResCompoundRef later
            target["rt"] = source["rt"]

            # Parse (mz, abundance)
            arr = self.spectrum_to_array(target["mz"], normalize=normalize)
            target["mz"] = arr[:, 0]
            target["abundance"] = arr[:, 1]

            # Parse meta data
            target["metadata"] = {
                metadatar_cols[k]: v for k, v in source.items() if k in metadatar_cols
            }

            # Add anything else
            for k in source:
                if k not in lowres_ei_compound_cols:
                    target[k] = source[k]

            # Add to CoreMS list
            corems_lib.append(target)

        return corems_lib

    def _LowResolutionEICompound_dict_to_sqlite(
        self, lowres_ei_compound_dict, url="sqlite://"
    ):
        """
        Convert CoreMS LowResolutionEICompound-formatted dictionary to SQLite
        database for local ingestion.

        Compounds that fail to insert are reported on stdout and skipped.

        Parameters
        ----------
        lowres_ei_compound_dict : dict
            CoreMS GC-MS library formatted for LowResolutionEICompound.
        url : str
            URL to SQLite prefix.

        Returns
        -------
        sqlite database
            Spectra contained in SQLite database.

        """

        # Dictionary to map corems keys to all-caps keys
        capped_cols = {
            "name": "NAME",
            "formula": "FORM",
            "ri": "RI",
            "retention_time": "RT",
            "source": "SOURCE",
            "casno": "CASNO",
            "comment": "COMMENT",
            "peaks_count": "NUM PEAKS",
        }

        # Initialize SQLite object
        sqlite_obj = EI_LowRes_SQLite(url=url)

        # Iterate spectra
        for _data_dict in lowres_ei_compound_dict:
            # Copy source to prevent modification
            data_dict = _data_dict.copy()

            # Add missing capped values, keeping the lowercase keys as well
            for k, v in capped_cols.items():
                if k in data_dict:
                    data_dict[v] = data_dict[k]

            # Parse number of peaks
            if not data_dict.get("NUM PEAKS"):
                data_dict["NUM PEAKS"] = len(data_dict.get("mz"))

            # Parse CAS number
            if not data_dict.get("CASNO"):
                data_dict["CASNO"] = data_dict.get("CAS")

            if not data_dict["CASNO"]:
                data_dict["CASNO"] = 0

            # Build linked metadata table
            if "metadata" in data_dict:
                if len(data_dict["metadata"]) > 0:
                    data_dict["metadatar"] = Metadatar(**data_dict.pop("metadata"))
                else:
                    data_dict.pop("metadata")

            # Attempt addition to sqlite. Ingestion is best-effort: a failing
            # compound is reported and skipped. Catching `Exception` (rather
            # than a bare `except`) lets KeyboardInterrupt/SystemExit through.
            try:
                sqlite_obj.add_compound(data_dict)
            except Exception as exc:
                print(data_dict.get("NAME"), exc)

        return sqlite_obj

Interface to the Metabolomics Reference Database.

MetabRefGCInterface()
378    def __init__(self):
379        """
380        Initialize instance.
381
382        """
383
384        super().__init__()
385        self.GCMS_LIBRARY_URL = "https://metabref.emsl.pnnl.gov/api/mslevel/1"
386        self.FAMES_URL = "https://metabref.emsl.pnnl.gov/api/fames"
387
388        self.__init_format_map__()

Initialize instance.

GCMS_LIBRARY_URL
FAMES_URL
def available_formats(self):
418    def available_formats(self):
419        """
420        View list of available formats.
421
422        Returns
423        -------
424        list
425            Format map keys.
426        """
427
428        return list(self.format_map.keys())

View list of available formats.

Returns
  • list: Format map keys.
def get_library(self, format='json', normalize=False):
430    def get_library(self, format="json", normalize=False):
431        """
432        Request MetabRef GC/MS library.
433
434        Parameters
435        ----------
436        format : str
437            Format of requested library, i.e. "json", "sql", "flashentropy".
438            See `available_formats` method for aliases.
439        normalize : bool
440            Normalize the spectrum by its magnitude.
441
442        Returns
443        -------
444        Library in requested format.
445
446        """
447
448        # Init format function
449        format_func = self._get_format_func(format)
450
451        return format_func(
452            self.get_query(self.GCMS_LIBRARY_URL)["GC-MS"], normalize, {}
453        )

Request MetabRef GC/MS library.

Parameters
  • format (str): Format of requested library, e.g. "json", "dict", "sql". See available_formats method for aliases.
  • normalize (bool): Normalize the spectrum by its magnitude.
Returns
  • Library in requested format.
def get_fames(self, format='json', normalize=False):
455    def get_fames(self, format="json", normalize=False):
456        """
457        Request MetabRef GC/MS FAMEs library.
458
459        Parameters
460        ----------
461        format : str
462            Format of requested library, i.e. "json", "sql", "flashentropy".
463            See `available_formats` method for aliases.
464        normalize : bool
465            Normalize the spectrum by its magnitude.
466
467        Returns
468        -------
469        Library in requested format.
470
471        """
472
473        # Init format function
474        format_func = self._get_format_func(format)
475
476        return format_func(self.get_query(self.FAMES_URL)["GC-MS"], normalize, {})

Request MetabRef GC/MS FAMEs library.

Parameters
  • format (str): Format of requested library, e.g. "json", "dict", "sql". See available_formats method for aliases.
  • normalize (bool): Normalize the spectrum by its magnitude.
Returns
  • Library in requested format.
class MetabRefLCInterface(MetabRefInterface):
class MetabRefLCInterface(MetabRefInterface):
    """
    Interface to the Metabolomics Reference Database for LC-MS data.
    """

    def __init__(self):
        """
        Initialize instance.

        """

        super().__init__()

        # API endpoint for precursor m/z search
        # inputs = mz, tolerance (in Da), polarity, page_no, per_page
        self.PRECURSOR_MZ_URL = (
            "https://metabref.emsl.pnnl.gov/api/precursors/m/{}/t/{}/{}?page={}&per_page={}"
        )

        # API endpoint for returning full list of precursor m/z values in database
        # inputs = polarity, page_no, per_page
        self.PRECURSOR_MZ_ALL_URL = "https://metabref.emsl.pnnl.gov/api/precursors/{}?page={}&per_page={}"

        self.__init_format_map__()

    def __init_format_map__(self):
        """
        Initialize database format mapper, enabling multiple format requests.

        """

        # Define format workflows; every entry takes (library, normalize, fe_kwargs)
        self.format_map = {
            "json": lambda x, normalize, fe_kwargs: x,
            "flashentropy": lambda x, normalize, fe_kwargs: self._to_flashentropy(
                x, normalize, fe_kwargs
            ),
        }

        # Add aliases
        self.format_map["metabref"] = self.format_map["json"]
        self.format_map["fe"] = self.format_map["flashentropy"]
        self.format_map["flash-entropy"] = self.format_map["flashentropy"]

    def query_by_precursor(self, mz_list, polarity, mz_tol_ppm, mz_tol_da_api=0.2, max_per_page=50):
        """
        Query MetabRef by precursor m/z values.

        Nearby m/z values are clustered so that each cluster is retrieved
        with a single (paginated) API query.

        Parameters
        ----------
        mz_list : list
            List of precursor m/z values. It is not modified.
        polarity : str
            Ionization polarity, either "positive" or "negative".
        mz_tol_ppm : float
            Tolerance in ppm for each precursor m/z value.
            Used for retrieving a potential match from the database.
        mz_tol_da_api : float, optional
            Maximum tolerance between precursor m/z values for API search, in daltons.
            Used to group similar mzs into a single API query for speed. Default is 0.2.
        max_per_page : int, optional
            Maximum records to return from MetabRef API query at a time.  Default is 50.

        Returns
        -------
        list
            List of library entries in original JSON format.
            Empty if `mz_list` is empty.

        Raises
        ------
        ValueError
            If `polarity` is neither "positive" nor "negative".
        """

        # If polarity is anything other than positive or negative, raise error
        if polarity not in ("positive", "negative"):
            raise ValueError("Polarity must be 'positive' or 'negative'")

        # Work on a sorted copy so the caller's sequence is left untouched
        sorted_mz = sorted(mz_list)
        if not sorted_mz:
            return []

        # Cluster groups of mz according to mz_tol_da_api for precursor query;
        # a value joins the current group while it lies within mz_tol_da_api
        # of that group's first (smallest) member
        mz_groups = [[sorted_mz[0]]]
        for x in sorted_mz[1:]:
            if x - mz_groups[-1][0] <= mz_tol_da_api:
                mz_groups[-1].append(x)
            else:
                mz_groups.append([x])

        # Query MetabRef for each mz group
        lib = []
        for mz_group in mz_groups:
            # Groups are built from sorted values, so the ends are min and max
            lo, hi = mz_group[0], mz_group[-1]
            if len(mz_group) == 1:
                mz = lo
                tol = mz_tol_ppm * 1e-6 * mz
            else:
                # Query the group's midpoint with a window that covers the
                # whole group plus the ppm tolerance (converted to Da) at
                # its largest member
                half_width = (hi - lo) / 2
                mz = lo + half_width
                tol = half_width + mz_tol_ppm * 1e-6 * hi

            # Get first page of results
            response = self.get_query(
                self.PRECURSOR_MZ_URL.format(str(mz), str(tol), polarity, 1, max_per_page)
            )
            lib.extend(response["results"])

            # If there are more pages of results, get them
            for page in range(2, response["total_pages"] + 1):
                lib.extend(
                    self.get_query(
                        self.PRECURSOR_MZ_URL.format(
                            str(mz), str(tol), polarity, page, max_per_page
                        )
                    )["results"]
                )

        return lib

    def request_all_precursors(self, polarity, per_page=50000):
        """
        Request all precursor m/z values for MS2 spectra from MetabRef.

        Parameters
        ----------
        polarity : str
            Ionization polarity, either "positive" or "negative".
        per_page : int, optional
            Number of records to fetch per call. Default is 50000.

        Returns
        -------
        list
            List of all precursor m/z values, sorted and without duplicates.

        Raises
        ------
        ValueError
            If `polarity` is neither "positive" nor "negative".
        """
        # If polarity is anything other than positive or negative, raise error
        if polarity not in ("positive", "negative"):
            raise ValueError("Polarity must be 'positive' or 'negative'")

        # Get first page of results and total number of pages of results
        response = self.get_query(
            self.PRECURSOR_MZ_ALL_URL.format(polarity, str(1), str(per_page))
        )
        total_pages = response["total_pages"]
        precursors = {x["precursor_ion"] for x in response["results"]}

        # Go through remaining pages of results
        for page in range(2, total_pages + 1):
            response = self.get_query(
                self.PRECURSOR_MZ_ALL_URL.format(polarity, str(page), str(per_page))
            )
            precursors.update(x["precursor_ion"] for x in response["results"])

        # Duplicates are already removed by the set; sort smallest to largest
        return sorted(precursors)

    def get_lipid_library(
        self,
        mz_list,
        polarity,
        mz_tol_ppm,
        mz_tol_da_api=0.2,
        format="json",
        normalize=True,
        fe_kwargs=None,
    ):
        """
        Request MetabRef lipid library.

        Parameters
        ----------
        mz_list : list
            List of precursor m/z values. It is not modified.
        polarity : str
            Ionization polarity, either "positive" or "negative".
        mz_tol_ppm : float
            Tolerance in ppm for each precursor m/z value.
            Used for retrieving a potential match from the database.
        mz_tol_da_api : float, optional
            Maximum tolerance between precursor m/z values for API search, in daltons.
            Used to group similar mzs into a single API query for speed. Default is 0.2.
        format : str, optional
            Format of requested library, e.g. "json" or "flashentropy"
            (see `__init_format_map__` for the registered names and aliases).
            Default is "json".
        normalize : bool, optional
            Normalize the spectrum by its magnitude. Default is True.
        fe_kwargs : dict, optional
            Keyword arguments for FlashEntropy search. Default is None,
            which is treated as an empty dict.

        Returns
        -------
        tuple
            Library in requested format, and a dict mapping each entry's "id"
            to its metadata converted with `_dict_to_dataclass(..., LipidMetadata)`.

        """
        # A fresh dict per call avoids sharing one mutable default across calls
        fe_kwargs = {} if fe_kwargs is None else fe_kwargs

        # Resolve the formatter before any network request is made
        format_func = self._get_format_func(format)

        # Sorted copy of the observed m/z values (caller's list is left untouched)
        obs_mz = np.sort(np.asarray(mz_list, dtype=float))

        # Get all precursors in the library matching the polarity (already sorted)
        lib_mz = np.asarray(self.request_all_precursors(polarity=polarity), dtype=float)

        if obs_mz.size and lib_mz.size:
            # Library precursors that are within mz_tol_ppm of their closest observed mz
            closest_obs = obs_mz[find_closest(obs_mz, lib_mz)]
            lib_diff_ppm = np.abs((lib_mz - closest_obs) / lib_mz * 1e6)
            lib_hits = lib_mz[lib_diff_ppm <= mz_tol_ppm]

            # Do the same in the opposite direction
            closest_lib = lib_mz[find_closest(lib_mz, obs_mz)]
            obs_diff_ppm = np.abs((obs_mz - closest_lib) / obs_mz * 1e6)
            obs_hits = obs_mz[obs_diff_ppm <= mz_tol_ppm]

            # Use whichever side has fewer mzs as the input for the query
            mzs_to_query = lib_hits if len(lib_hits) < len(obs_hits) else obs_hits
        else:
            # Nothing observed or nothing in the library: nothing can match
            mzs_to_query = np.array([], dtype=float)

        # Query the library for the matched precursors to retrieve the spectra
        # and metadata; skip the query entirely when nothing matched
        if len(mzs_to_query) > 0:
            lib = self.query_by_precursor(
                mz_list=mzs_to_query,
                polarity=polarity,
                mz_tol_ppm=mz_tol_ppm,
                mz_tol_da_api=mz_tol_da_api,
            )
        else:
            lib = []

        # Pull out lipid metadata from the metabref library and convert to
        # LipidMetadata dataclass. "Lipid Tree" is optional per entry, so a
        # missing tree contributes no extra fields instead of raising KeyError.
        mol_data_dict = {}
        for entry in lib:
            merged = {**entry["Molecular Data"], **entry.get("Lipid Tree", {})}
            mol_data_dict[entry["id"]] = self._dict_to_dataclass(merged, LipidMetadata)

        # Remove lipid metadata from the metabref library and unpack the
        # 'Lipid Fragments' and 'MSO Data' sub-dictionaries into each entry
        spectra = []
        for entry in lib:
            record = {
                k: v
                for k, v in entry.items()
                if k not in ("Molecular Data", "Lipid Tree")
            }
            for nested_key in ("Lipid Fragments", "MSO Data"):
                if nested_key in record:
                    record.update(record.pop(nested_key))
            spectra.append(record)

        # Format the spectral library
        lib = format_func(spectra, normalize=normalize, fe_kwargs=fe_kwargs)
        return (lib, mol_data_dict)

Interface to the Metabolomics Reference Database for LC-MS data.

MetabRefLCInterface()
656    def __init__(self):
657        """
658        Initialize instance.
659
660        """
661
662        super().__init__()
663
664        # API endpoint for precursor m/z search
665        # inputs = mz, tolerance (in Da), polarity, page_no, per_page
666        self.PRECURSOR_MZ_URL = (
667            "https://metabref.emsl.pnnl.gov/api/precursors/m/{}/t/{}/{}?page={}&per_page={}"
668        )
669
670        # API endpoint for returning full list of precursor m/z values in database
671        # inputs = polarity, page_no, per_page
672        self.PRECURSOR_MZ_ALL_URL = "https://metabref.emsl.pnnl.gov/api/precursors/{}?page={}&per_page={}"
673
674        self.__init_format_map__()

Initialize instance.

PRECURSOR_MZ_URL
PRECURSOR_MZ_ALL_URL
def query_by_precursor( self, mz_list, polarity, mz_tol_ppm, mz_tol_da_api=0.2, max_per_page=50):
695    def query_by_precursor(self, mz_list, polarity, mz_tol_ppm, mz_tol_da_api=0.2, max_per_page=50):
696        """
697        Query MetabRef by precursor m/z values.
698
699        Parameters
700        ----------
701        mz_list : list
702            List of precursor m/z values.
703        polarity : str
704            Ionization polarity, either "positive" or "negative".
705        mz_tol_ppm : float
706            Tolerance in ppm for each precursor m/z value.
707            Used for retrieving from a potential match from database.
708        mz_tol_da_api : float, optional
709            Maximum tolerance between precursor m/z values for API search, in daltons.
710            Used to group similar mzs into a single API query for speed. Default is 0.2.
711        max_per_page : int, optional
712            Maximum records to return from MetabRef API query at a time.  Default is 50.
713
714        Returns
715        -------
716        list
717            List of library entries in original JSON format.
718        """
719
720        # If polarity is anything other than positive or negative, raise error
721        if polarity not in ["positive", "negative"]:
722            raise ValueError("Polarity must be 'positive' or 'negative'")
723
724        # Cluster groups of mz according to mz_tol_da_api for precursor query
725        mz_list.sort()
726        mz_groups = [[mz_list[0]]]
727        for x in mz_list[1:]:
728            if abs(x - mz_groups[-1][0]) <= mz_tol_da_api:
729                mz_groups[-1].append(x)
730            else:
731                mz_groups.append([x])
732
733        # Query MetabRef for each mz group
734        lib = []
735        for mz_group in mz_groups:
736            mz = np.mean(mz_group)
737            if len(mz_group) == 1:
738                mz = mz_group[0]
739                tol = mz_tol_ppm * 10**-6 * mz
740            else:
741                mz = (max(mz_group) - min(mz_group)) / 2 + min(mz_group)
742                tol = (max(mz_group) - min(mz_group)) / 2 + mz_tol_ppm**-6 * max(
743                    mz_group
744                )
745            
746            # Get first page of results
747            response = self.get_query(
748                self.PRECURSOR_MZ_URL.format(str(mz), str(tol), polarity, 1, max_per_page)
749            )
750            lib = lib + response['results']
751
752            # If there are more pages of results, get them
753            if response['total_pages'] > 1: 
754                for i in np.arange(2, response['total_pages']+1):
755                    lib = lib + self.get_query(
756                        self.PRECURSOR_MZ_URL.format(str(mz), str(tol), polarity, i, max_per_page)
757                        )['results']
758
759        return lib

Query MetabRef by precursor m/z values.

Parameters
  • mz_list (list): List of precursor m/z values.
  • polarity (str): Ionization polarity, either "positive" or "negative".
  • mz_tol_ppm (float): Tolerance in ppm for each precursor m/z value. Used for retrieving a potential match from the database.
  • mz_tol_da_api (float, optional): Maximum tolerance between precursor m/z values for API search, in daltons. Used to group similar mzs into a single API query for speed. Default is 0.2.
  • max_per_page (int, optional): Maximum records to return from MetabRef API query at a time. Default is 50.
Returns
  • list: List of library entries in original JSON format.
def request_all_precursors(self, polarity, per_page=50000):
761    def request_all_precursors(self, polarity, per_page = 50000):
762        """
763        Request all precursor m/z values for MS2 spectra from MetabRef.
764
765        Parameters
766        ----------
767        polarity : str
768            Ionization polarity, either "positive" or "negative".
769        per_page : int, optional
770            Number of records to fetch per call. Default is 50000
771
772        Returns
773        -------
774        list
775            List of all precursor m/z values, sorted.
776        """
777        # If polarity is anything other than positive or negative, raise error
778        if polarity not in ["positive", "negative"]:
779            raise ValueError("Polarity must be 'positive' or 'negative'")
780
781        precursors = []    
782
783        # Get first page of results and total number of pages of results
784        response = self.get_query(self.PRECURSOR_MZ_ALL_URL.format(polarity, str(1), str(per_page)))
785        total_pages = response['total_pages']
786        precursors.extend([x['precursor_ion'] for x in response['results']])
787
788        # Go through remaining pages of results
789        for i in np.arange(2, total_pages + 1):
790            response = self.get_query(self.PRECURSOR_MZ_ALL_URL.format(polarity, str(i), str(per_page)))
791            precursors.extend([x['precursor_ion'] for x in response['results']])
792        
793        # Sort precursors from smallest to largest and remove duplicates
794        precursors = list(set(precursors))
795        precursors.sort()
796
797        return precursors

Request all precursor m/z values for MS2 spectra from MetabRef.

Parameters
  • polarity (str): Ionization polarity, either "positive" or "negative".
  • per_page (int, optional): Number of records to fetch per call. Default is 50000.
Returns
  • list: List of all precursor m/z values, sorted.
def get_lipid_library( self, mz_list, polarity, mz_tol_ppm, mz_tol_da_api=0.2, format='json', normalize=True, fe_kwargs={}):
799    def get_lipid_library(
800        self,
801        mz_list,
802        polarity,
803        mz_tol_ppm,
804        mz_tol_da_api=0.2,
805        format="json",
806        normalize=True,
807        fe_kwargs={},
808    ):
809        """
810        Request MetabRef lipid library.
811
812        Parameters
813        ----------
814        mz_list : list
815            List of precursor m/z values.
816        polarity : str
817            Ionization polarity, either "positive" or "negative".
818        mz_tol_ppm : float
819            Tolerance in ppm for each precursor m/z value.
820            Used for retrieving from a potential match from database.
821        mz_tol_da_api : float, optional
822            Maximum tolerance between precursor m/z values for API search, in daltons.
823            Used to group similar mzs into a single API query for speed. Default is 0.2.
824        format : str, optional
825            Format of requested library, i.e. "json", "sql", "flashentropy".
826            See `available_formats` method for aliases. Default is "json".
827        normalize : bool, optional
828            Normalize the spectrum by its magnitude. Default is True.
829        fe_kwargs : dict, optional
830            Keyword arguments for FlashEntropy search. Default is {}.
831
832        Returns
833        -------
834        tuple
835            Library in requested format and lipid metadata as a LipidMetadata dataclass.
836
837        """
838        mz_list.sort()
839        mz_list = np.array(mz_list)
840
841        # Get all precursors in the library matching the polarity
842        precusors_in_lib = self.request_all_precursors(polarity=polarity)
843        precusors_in_lib = np.array(precusors_in_lib)
844
845        # Compare the mz_list with the precursors in the library, keep any mzs that are within mz_tol of any precursor in the library
846        lib_mz_df = pd.DataFrame(precusors_in_lib, columns=["lib_mz"])
847        lib_mz_df["closest_obs_mz"] = mz_list[
848            find_closest(mz_list, lib_mz_df.lib_mz.values)
849        ]
850        lib_mz_df["mz_diff_ppm"] = np.abs(
851            (lib_mz_df["lib_mz"] - lib_mz_df["closest_obs_mz"])
852            / lib_mz_df["lib_mz"]
853            * 1e6
854        )
855        lib_mz_sub = lib_mz_df[lib_mz_df["mz_diff_ppm"] <= mz_tol_ppm]
856
857        # Do the same in the opposite direction
858        mz_df = pd.DataFrame(mz_list, columns=["mass_feature_mz"])
859        mz_df["closest_lib_pre_mz"] = precusors_in_lib[
860            find_closest(precusors_in_lib, mz_df.mass_feature_mz.values)
861        ]
862        mz_df["mz_diff_ppm"] = np.abs(
863            (mz_df["mass_feature_mz"] - mz_df["closest_lib_pre_mz"])
864            / mz_df["mass_feature_mz"]
865            * 1e6
866        )
867        mz_df_sub = mz_df[mz_df["mz_diff_ppm"] <= mz_tol_ppm]
868
869        # Evaluate which is fewer mzs - lib_mz_sub or mz_df_sub and use that as the input for next step
870        if len(lib_mz_sub) < len(mz_df_sub):
871            mzs_to_query = lib_mz_sub.lib_mz.values
872        else:
873            mzs_to_query = mz_df_sub.mass_feature_mz.values
874
875        # Query the library for the precursors in the mz_list that are in the library to retrieve the spectra and metadata
876        lib = self.query_by_precursor(
877            mz_list=mzs_to_query,
878            polarity=polarity,
879            mz_tol_ppm=mz_tol_ppm,
880            mz_tol_da_api=mz_tol_da_api,
881        )
882
883        # Pull out lipid metadata from the metabref library and convert to LipidMetadata dataclass
884        mol_data_dict = {x["id"]: x["Molecular Data"] for x in lib}
885        lipid_lib = {x["id"]: x["Lipid Tree"] for x in lib if "Lipid Tree" in x.keys()}
886        mol_data_dict = {k: {**v, **lipid_lib[k]} for k, v in mol_data_dict.items()}
887        mol_data_dict = {
888            k: self._dict_to_dataclass(v, LipidMetadata)
889            for k, v in mol_data_dict.items()
890        }
891
892        # Remove lipid metadata from the metabref library
893        lib = [
894            {k: v for k, v in x.items() if k not in ["Molecular Data", "Lipid Tree"]}
895            for x in lib
896        ]
897        # Unpack the 'Lipid Fragments' key and the 'MSO Data" key from each entry
898        for x in lib:
899            if "Lipid Fragments" in x.keys():
900                x.update(x.pop("Lipid Fragments"))
901            if "MSO Data" in x.keys():
902                x.update(x.pop("MSO Data"))
903
904        # Format the spectral library
905        format_func = self._get_format_func(format)
906        lib = format_func(lib, normalize=normalize, fe_kwargs=fe_kwargs)
907        return (lib, mol_data_dict)

Request MetabRef lipid library.

Parameters
  • mz_list (list): List of precursor m/z values.
  • polarity (str): Ionization polarity, either "positive" or "negative".
  • mz_tol_ppm (float): Tolerance in ppm for each precursor m/z value. Used for retrieving a potential match from the database.
  • mz_tol_da_api (float, optional): Maximum tolerance between precursor m/z values for API search, in daltons. Used to group similar mzs into a single API query for speed. Default is 0.2.
  • format (str, optional): Format of requested library, i.e. "json", "sql", "flashentropy". See available_formats method for aliases. Default is "json".
  • normalize (bool, optional): Normalize the spectrum by its magnitude. Default is True.
  • fe_kwargs (dict, optional): Keyword arguments for FlashEntropy search. Default is {}.
Returns
  • tuple: Library in requested format and lipid metadata as a LipidMetadata dataclass.