corems.molecular_id.search.database_interfaces
import os
import re
from abc import ABC

import numpy as np
import requests
import pandas as pd
from ms_entropy import FlashEntropySearch

from corems.molecular_id.factory.EI_SQL import EI_LowRes_SQLite, Metadatar
from corems.molecular_id.factory.lipid_molecular_metadata import LipidMetadata
from corems.mass_spectra.calc.lc_calc import find_closest


class SpectralDatabaseInterface(ABC):
    """
    Base class that facilitates connection to spectral reference databases,
    such as EMSL's Metabolomics Reference Database (MetabRef).

    """

    def __init__(self, key=None):
        """
        Initialize instance.

        Parameters
        ----------
        key : str
            Name of the environment variable that stores the token for this
            database interface.

        Raises
        ------
        ValueError
            If `key` is None.

        """

        self.key = key

        if self.key is None:
            raise ValueError(
                "Must specify environment variable key for token associated "
                "with this database interface."
            )

    def set_token(self, path):
        """
        Set environment variable for the database token.

        Parameters
        ----------
        path : str
            Path to a text file whose first line is the token.

        """

        # Read token from file (first line only, surrounding whitespace removed)
        with open(path, "r", encoding="utf-8") as f:
            token = f.readline().strip()

        # Set environment variable
        os.environ[self.key] = token

    def get_token(self):
        """
        Get environment variable for database token.

        Returns
        -------
        str
            Token string.

        Raises
        ------
        ValueError
            If the environment variable named by `self.key` is not set.

        """

        # Check for token
        if self.key not in os.environ:
            raise ValueError("Must set {} environment variable.".format(self.key))

        # Get token from environment variables
        return os.environ.get(self.key)

    def get_header(self):
        """
        Access stored database token and prepare as header.

        Returns
        -------
        dict
            Request headers holding the bearer `Authorization` token and a
            `Content-Type` of "text/plain".

        """

        # Get token
        token = self.get_token()

        # Pad header information
        header = {"Authorization": f"Bearer {token}", "Content-Type": "text/plain"}

        return header

    def get_query(self, url, use_header=True):
        """
        Request payload from URL according to `get` protocol.

        Parameters
        ----------
        url : str
            URL for request.
        use_header: bool
            Whether or not the query should include the header

        Returns
        -------
        dict
            Response as JSON.

        """

        # Query URL via `get`
        if use_header:
            response = requests.get(url, headers=self.get_header())
        else:
            response = requests.get(url)

        # Check response
        response.raise_for_status()

        # Return as JSON
        return response.json()

    def post_query(self, url, variable, values, tolerance):
        """
        Request payload from URL according to `post` protocol.

        Parameters
        ----------
        url : str
            URL for request.
        variable : str
            Variable to query.
        values : str
            Specific values of `variable` to query.
        tolerance : str
            Query tolerance relative to `values`.

        Returns
        -------
        dict
            Response as JSON.

        """

        # Coerce to string
        if not isinstance(variable, str):
            variable = str(variable).replace(" ", "")

        if not isinstance(values, str):
            values = str(values).replace(" ", "")

        if not isinstance(tolerance, str):
            tolerance = str(tolerance).replace(" ", "")

        # Join URL segments with "/" explicitly: os.path.join uses the host
        # platform's separator and drops earlier parts on "absolute" segments
        endpoint = "/".join(
            (url.rstrip("/"), variable.strip("/"), tolerance.strip("/"))
        )

        # Query URL via `post`
        response = requests.post(
            endpoint,
            data=values,
            headers=self.get_header(),
        )

        # Check response
        response.raise_for_status()

        # Return as JSON
        return response.json()


class MetabRefInterface(SpectralDatabaseInterface):
    """
    Interface to the Metabolomics Reference Database.
    """

    def __init__(self):
        """
        Initialize instance.

        """

        super().__init__(key="METABREF_TOKEN")

    def _get_format_func(self, format):
        """
        Obtain format function by key.

        Parameters
        ----------
        format : str
            Case-insensitive key into `self.format_map`.

        Returns
        -------
        func
            Formatting function.

        Raises
        ------
        ValueError
            If `format` is not a key of `self.format_map`.
        """

        format_key = format.lower()
        if format_key in self.format_map:
            return self.format_map[format_key]

        raise ValueError(("{} not a supported format.").format(format))

    def available_formats(self):
        """
        View list of available formats.

        Returns
        -------
        list
            Format map keys.
        """

        return list(self.format_map.keys())

    def spectrum_to_array(self, spectrum, normalize=True):
        """
        Convert MetabRef-formatted spectrum to array.

        Parameters
        ----------
        spectrum : str
            MetabRef spectrum, i.e. list of (m/z,abundance) pairs.
        normalize : bool
            Normalize the spectrum by its magnitude.

        Returns
        -------
        :obj:`~numpy.array`
            Array of shape (N, 2), with m/z in the first column and abundance in
            the second.

        """

        # Convert parenthesis-delimited string to array
        arr = np.array(
            re.findall(r"\(([^,]+),([^)]+)\)", spectrum), dtype=float
        ).reshape(-1, 2)

        # Normalize the array; skip when the total is zero to avoid NaNs
        if normalize:
            total = arr[:, -1].sum()
            if total != 0:
                arr[:, -1] = arr[:, -1] / total

        return arr

    def _to_flashentropy(self, metabref_lib, normalize=True, fe_kwargs=None):
        """
        Convert metabref-formatted library to FlashEntropy library.

        Parameters
        ----------
        metabref_lib : list of dict
            MetabRef MS2 library in JSON format. Entries may carry the spectrum
            either directly or nested under a "spectrum_data" key.
        normalize : bool
            Normalize each spectrum by its magnitude.
        fe_kwargs : dict, optional
            Keyword arguments for instantiation of FlashEntropy search and building index for FlashEntropy search;
            any keys not recognized will be ignored. By default, all parameters set to defaults.

        Returns
        -------
        :obj:`~ms_entropy.FlashEntropySearch`
            MS2 library as FlashEntropy search instance.

        Raises
        ------
        ValueError
            If only one of "min_ms2_difference_in_da" and "max_ms2_tolerance_in_da" is present in
            `fe_kwargs`, or if "min_ms2_difference_in_da" is not exactly twice "max_ms2_tolerance_in_da".

        """
        fe_kwargs = {} if fe_kwargs is None else fe_kwargs

        # The two MS2 tolerance settings must be supplied together and be consistent
        has_min_diff = "min_ms2_difference_in_da" in fe_kwargs
        has_max_tol = "max_ms2_tolerance_in_da" in fe_kwargs
        if has_min_diff or has_max_tol:
            if not (has_min_diff and has_max_tol):
                raise ValueError(
                    "Both 'min_ms2_difference_in_da' and 'max_ms2_tolerance_in_da' must be specified."
                )
            if (
                fe_kwargs["min_ms2_difference_in_da"]
                != 2 * fe_kwargs["max_ms2_tolerance_in_da"]
            ):
                raise ValueError(
                    "The values of 'min_ms2_difference_in_da' must be exactly 2x 'max_ms2_tolerance_in_da'."
                )

        # Initialize empty library
        fe_lib = []

        # Enumerate spectra
        for source in metabref_lib:
            # Reorganize source dict, if necessary; copy so the caller's
            # library entries are not modified
            if "spectrum_data" in source.keys():
                spectrum = dict(source["spectrum_data"])
            else:
                spectrum = dict(source)

            # Rename precursor_mz key for FlashEntropy
            if "precursor_mz" not in spectrum.keys():
                spectrum["precursor_mz"] = spectrum.pop("precursor_ion")

            # Convert CoreMS spectrum to array and clean, store as `peaks`
            spectrum["peaks"] = self.spectrum_to_array(
                spectrum["mz"], normalize=normalize
            )

            # Add spectrum to library
            fe_lib.append(spectrum)

        # Initialize FlashEntropy
        fe_init_kws = [
            "max_ms2_tolerance_in_da",
            "mz_index_step",
            "low_memory",
            "path_data",
        ]
        fe_init_kws = {k: v for k, v in fe_kwargs.items() if k in fe_init_kws}
        fes = FlashEntropySearch(**fe_init_kws)

        # Build FlashEntropy index
        fe_index_kws = [
            "max_indexed_mz",
            "precursor_ions_removal_da",
            "noise_threshold",
            "min_ms2_difference_in_da",
            "max_peak_num",
        ]
        fe_index_kws = {k: v for k, v in fe_kwargs.items() if k in fe_index_kws}
        fes.build_index(fe_lib, **fe_index_kws, clean_spectra=True)

        return fes

    def _dict_to_dataclass(self, metabref_lib, data_class):
        """
        Convert dictionary to dataclass.

        Notes
        -----
        This function will pull the attributes of a dataclass and its first
        parent class and convert the dictionary to a dataclass instance with
        the appropriate attributes. Keys absent from the dictionary are passed
        as None; keys unknown to the dataclass are dropped.

        Parameters
        ----------
        metabref_lib : dict
            Metabref dictionary object to convert to dataclass.
        data_class : :obj:`~dataclasses.dataclass`
            Dataclass to convert to.

        Returns
        -------
        :obj:`~dataclasses.dataclass`
            Dataclass instance.

        """

        # Get expected attributes of data_class
        data_class_keys = set(data_class.__annotations__)

        # If data_class inherits from another class, include the attributes
        # annotated on its first parent class as well
        if len(data_class.__mro__) > 2:
            data_class_keys |= set(data_class.__bases__[0].__annotations__)

        # Keep only expected keys, filling missing ones with None
        input_dict = {key: metabref_lib.get(key) for key in data_class_keys}
        return data_class(**input_dict)

    def get_query(self, url, use_header=False):
        """Overwrites the get_query method on the parent class to default to not use a header

        Notes
        -----
        As of January 2025, the metabref database no longer requires a token and therefore no header is needed

        """
        return super().get_query(url, use_header)


class MetabRefGCInterface(MetabRefInterface):
    """
    Interface to the Metabolomics Reference Database for GC-MS data.
    """

    def __init__(self):
        """
        Initialize instance.

        """

        super().__init__()
        self.GCMS_LIBRARY_URL = "https://metabref.emsl.pnnl.gov/api/mslevel/1"
        self.FAMES_URL = "https://metabref.emsl.pnnl.gov/api/fames"

        self.__init_format_map__()

    def __init_format_map__(self):
        """
        Initialize database format mapper, enabling multiple format requests.

        """

        # Define format workflows; every entry shares the signature
        # (library, normalize, fe_kwargs) even when an argument is unused
        self.format_map = {
            "json": lambda x, normalize, fe_kwargs: x,
            "dict": lambda x,
            normalize,
            fe_kwargs: self._to_LowResolutionEICompound_dict(x, normalize),
            "sql": lambda x,
            normalize,
            fe_kwargs: self._LowResolutionEICompound_dict_to_sqlite(
                self._to_LowResolutionEICompound_dict(x, normalize)
            ),
        }

        # Add aliases
        self.format_map["metabref"] = self.format_map["json"]
        self.format_map["datadict"] = self.format_map["dict"]
        self.format_map["data-dict"] = self.format_map["dict"]
        self.format_map["lowreseicompound"] = self.format_map["dict"]
        self.format_map["lowres"] = self.format_map["dict"]
        self.format_map["lowresgc"] = self.format_map["dict"]
        self.format_map["sqlite"] = self.format_map["sql"]

    def get_library(self, format="json", normalize=False):
        """
        Request MetabRef GC/MS library.

        Parameters
        ----------
        format : str
            Format of requested library, i.e. "json", "dict", "sql".
            See `available_formats` method for aliases.
        normalize : bool
            Normalize the spectrum by its magnitude.

        Returns
        -------
        Library in requested format.

        """

        # Init format function
        format_func = self._get_format_func(format)

        return format_func(
            self.get_query(self.GCMS_LIBRARY_URL)["GC-MS"], normalize, {}
        )

    def get_fames(self, format="json", normalize=False):
        """
        Request MetabRef GC/MS FAMEs library.

        Parameters
        ----------
        format : str
            Format of requested library, i.e. "json", "dict", "sql".
            See `available_formats` method for aliases.
        normalize : bool
            Normalize the spectrum by its magnitude.

        Returns
        -------
        Library in requested format.

        """

        # Init format function
        format_func = self._get_format_func(format)

        return format_func(self.get_query(self.FAMES_URL)["GC-MS"], normalize, {})

    def _to_LowResolutionEICompound_dict(self, metabref_lib, normalize=False):
        """
        Convert MetabRef-formatted library to CoreMS LowResolutionEICompound-formatted
        dictionary for local ingestion.

        Parameters
        ----------
        metabref_lib : list of dict
            MetabRef GC-MS library in JSON format. Each entry must contain a
            "spectrum_data" dictionary.
        normalize : bool
            Normalize each spectrum by its magnitude.

        Returns
        -------
        list of dict
            List of each spectrum contained in dictionary.

        """

        # All below key:value lookups are based on CoreMS class definitions
        # NOT MetabRef content. For example, MetabRef has keys for PubChem,
        # USI, etc. that are not considered below.

        # Dictionary to map metabref keys to corems keys
        metadatar_cols = {
            "casno": "cas",
            "inchikey": "inchikey",
            "inchi": "inchi",
            "chebi": "chebi",
            "smiles": "smiles",
            "kegg": "kegg",
            "iupac_name": "iupac_name",
            "traditional_name": "traditional_name",  # Not present in metabref
            "common_name": "common_name",  # Not present in metabref
        }

        # Dictionary to map metabref keys to corems keys
        lowres_ei_compound_cols = {
            "id": "metabref_id",
            "molecule_name": "name",  # Is this correct?
            "classify": "classify",  # Not present in metabref
            "formula": "formula",
            "ri": "ri",
            "rt": "retention_time",
            "source": "source",  # Not present in metabref
            "casno": "casno",
            "comments": "comment",
            "source_temp_c": "source_temp_c",  # Not present in metabref
            "ev": "ev",  # Not present in metabref
            "peak_count": "peaks_count",
            "mz": "mz",
            "abundance": "abundance",
        }

        # Local result container
        corems_lib = []

        # Enumerate spectra
        for source_ in metabref_lib:
            # Copy source to prevent modification
            source = source_.copy()

            # Flatten source dict; top-level keys win over "spectrum_data" keys
            source = source.pop("spectrum_data") | source

            # Parse target data
            target = {
                lowres_ei_compound_cols[k]: v
                for k, v in source.items()
                if k in lowres_ei_compound_cols
            }

            # Explicitly add this to connect with LowResCompoundRef later
            target["rt"] = source["rt"]

            # Parse (mz, abundance)
            arr = self.spectrum_to_array(target["mz"], normalize=normalize)
            target["mz"] = arr[:, 0]
            target["abundance"] = arr[:, 1]

            # Parse meta data
            target["metadata"] = {
                metadatar_cols[k]: v for k, v in source.items() if k in metadatar_cols
            }

            # Add anything else
            for k in source:
                if k not in lowres_ei_compound_cols:
                    target[k] = source[k]

            # Add to CoreMS list
            corems_lib.append(target)

        return corems_lib

    def _LowResolutionEICompound_dict_to_sqlite(
        self, lowres_ei_compound_dict, url="sqlite://"
    ):
        """
        Convert CoreMS LowResolutionEICompound-formatted dictionary to SQLite
        database for local ingestion.

        Parameters
        ----------
        lowres_ei_compound_dict : list of dict
            CoreMS GC-MS library formatted for LowResolutionEICompound.
        url : str
            URL to SQLite prefix.

        Returns
        -------
        sqlite database
            Spectra contained in SQLite database.

        Notes
        -----
        Ingestion is best-effort: a compound that cannot be added is reported
        via `print` and skipped.

        """

        # Dictionary to map corems keys to all-caps keys
        capped_cols = {
            "name": "NAME",
            "formula": "FORM",
            "ri": "RI",
            "retention_time": "RT",
            "source": "SOURCE",
            "casno": "CASNO",
            "comment": "COMMENT",
            "peaks_count": "NUM PEAKS",
        }

        # Initialize SQLite object
        sqlite_obj = EI_LowRes_SQLite(url=url)

        # Iterate spectra
        for _data_dict in lowres_ei_compound_dict:
            # Copy source to prevent modification
            data_dict = _data_dict.copy()

            # Add missing capped values, keeping both the original and capped keys
            for k, v in capped_cols.items():
                if k in data_dict:
                    data_dict[v] = data_dict[k]

            # Parse number of peaks
            if not data_dict.get("NUM PEAKS"):
                data_dict["NUM PEAKS"] = len(data_dict.get("mz"))

            # Parse CAS number, falling back to "CAS" and finally to 0
            data_dict["CASNO"] = data_dict.get("CASNO") or data_dict.get("CAS") or 0

            # Build linked metadata table
            if "metadata" in data_dict:
                metadata = data_dict.pop("metadata")
                if len(metadata) > 0:
                    data_dict["metadatar"] = Metadatar(**metadata)

            # Attempt addition to sqlite
            try:
                sqlite_obj.add_compound(data_dict)
            except Exception as err:
                # Report and continue; .get avoids a KeyError inside the handler
                print(
                    "Failed to add compound {}: {}".format(data_dict.get("NAME"), err)
                )

        return sqlite_obj


class MetabRefLCInterface(MetabRefInterface):
    """
    Interface to the Metabolomics Reference Database for LC-MS data.
    """

    def __init__(self):
        """
        Initialize instance.

        """

        super().__init__()

        # API endpoint for precursor m/z search
        # inputs = mz, tolerance (in Da), polarity, page_no, per_page
        self.PRECURSOR_MZ_URL = (
            "https://metabref.emsl.pnnl.gov/api/precursors/m/{}/t/{}/{}?page={}&per_page={}"
        )

        # API endpoint for returning full list of precursor m/z values in database
        # inputs = polarity, page_no, per_page
        self.PRECURSOR_MZ_ALL_URL = "https://metabref.emsl.pnnl.gov/api/precursors/{}?page={}&per_page={}"

        self.__init_format_map__()

    def __init_format_map__(self):
        """
        Initialize database format mapper, enabling multiple format requests.

        """

        # Define format workflows
        self.format_map = {
            "json": lambda x, normalize, fe_kwargs: x,
            "flashentropy": lambda x, normalize, fe_kwargs: self._to_flashentropy(
                x, normalize, fe_kwargs
            ),
        }

        # Add aliases
        self.format_map["metabref"] = self.format_map["json"]
        self.format_map["fe"] = self.format_map["flashentropy"]
        self.format_map["flash-entropy"] = self.format_map["flashentropy"]

    def query_by_precursor(
        self, mz_list, polarity, mz_tol_ppm, mz_tol_da_api=0.2, max_per_page=50
    ):
        """
        Query MetabRef by precursor m/z values.

        Parameters
        ----------
        mz_list : list
            List of precursor m/z values. Not modified.
        polarity : str
            Ionization polarity, either "positive" or "negative".
        mz_tol_ppm : float
            Tolerance in ppm for each precursor m/z value.
            Used for retrieving from a potential match from database.
        mz_tol_da_api : float, optional
            Maximum tolerance between precursor m/z values for API search, in daltons.
            Used to group similar mzs into a single API query for speed. Default is 0.2.
        max_per_page : int, optional
            Maximum records to return from MetabRef API query at a time.  Default is 50.

        Returns
        -------
        list
            List of library entries in original JSON format. Empty if
            `mz_list` is empty.

        Raises
        ------
        ValueError
            If `polarity` is neither "positive" nor "negative".
        """

        # If polarity is anything other than positive or negative, raise error
        if polarity not in ["positive", "negative"]:
            raise ValueError("Polarity must be 'positive' or 'negative'")

        # Sort a copy so the caller's list/array is left untouched
        sorted_mzs = sorted(float(mz) for mz in mz_list)
        if not sorted_mzs:
            return []

        # Cluster groups of mz according to mz_tol_da_api for precursor query;
        # a value joins the current group while it is within the tolerance of
        # that group's first (smallest) member
        mz_groups = [[sorted_mzs[0]]]
        for x in sorted_mzs[1:]:
            if abs(x - mz_groups[-1][0]) <= mz_tol_da_api:
                mz_groups[-1].append(x)
            else:
                mz_groups.append([x])

        # Query MetabRef for each mz group
        lib = []
        for mz_group in mz_groups:
            low, high = mz_group[0], mz_group[-1]
            if len(mz_group) == 1:
                mz = low
                tol = mz_tol_ppm * 1e-6 * mz
            else:
                # Query the group's midpoint with a window covering the whole
                # group plus the ppm tolerance at its largest member
                half_width = (high - low) / 2
                mz = half_width + low
                tol = half_width + mz_tol_ppm * 1e-6 * high

            # Get first page of results
            response = self.get_query(
                self.PRECURSOR_MZ_URL.format(
                    str(mz), str(tol), polarity, 1, max_per_page
                )
            )
            lib.extend(response["results"])

            # If there are more pages of results, get them
            for page in range(2, response["total_pages"] + 1):
                lib.extend(
                    self.get_query(
                        self.PRECURSOR_MZ_URL.format(
                            str(mz), str(tol), polarity, page, max_per_page
                        )
                    )["results"]
                )

        return lib

    def request_all_precursors(self, polarity, per_page=50000):
        """
        Request all precursor m/z values for MS2 spectra from MetabRef.

        Parameters
        ----------
        polarity : str
            Ionization polarity, either "positive" or "negative".
        per_page : int, optional
            Number of records to fetch per call. Default is 50000

        Returns
        -------
        list
            List of all precursor m/z values, sorted and without duplicates.

        Raises
        ------
        ValueError
            If `polarity` is neither "positive" nor "negative".
        """
        # If polarity is anything other than positive or negative, raise error
        if polarity not in ["positive", "negative"]:
            raise ValueError("Polarity must be 'positive' or 'negative'")

        precursors = []

        # Get first page of results and total number of pages of results
        response = self.get_query(
            self.PRECURSOR_MZ_ALL_URL.format(polarity, str(1), str(per_page))
        )
        total_pages = response["total_pages"]
        precursors.extend([x["precursor_ion"] for x in response["results"]])

        # Go through remaining pages of results
        for i in np.arange(2, total_pages + 1):
            response = self.get_query(
                self.PRECURSOR_MZ_ALL_URL.format(polarity, str(i), str(per_page))
            )
            precursors.extend([x["precursor_ion"] for x in response["results"]])

        # Sort precursors from smallest to largest and remove duplicates
        precursors = list(set(precursors))
        precursors.sort()

        return precursors

    def get_lipid_library(
        self,
        mz_list,
        polarity,
        mz_tol_ppm,
        mz_tol_da_api=0.2,
        format="json",
        normalize=True,
        fe_kwargs=None,
    ):
        """
        Request MetabRef lipid library.

        Parameters
        ----------
        mz_list : list
            List of precursor m/z values. Not modified.
        polarity : str
            Ionization polarity, either "positive" or "negative".
        mz_tol_ppm : float
            Tolerance in ppm for each precursor m/z value.
            Used for retrieving from a potential match from database.
        mz_tol_da_api : float, optional
            Maximum tolerance between precursor m/z values for API search, in daltons.
            Used to group similar mzs into a single API query for speed. Default is 0.2.
        format : str, optional
            Format of requested library, i.e. "json", "flashentropy".
            See `available_formats` method for aliases. Default is "json".
        normalize : bool, optional
            Normalize the spectrum by its magnitude. Default is True.
        fe_kwargs : dict, optional
            Keyword arguments for FlashEntropy search. Default is None,
            which is treated as an empty dictionary.

        Returns
        -------
        tuple
            Library in requested format and lipid metadata as a LipidMetadata dataclass.

        """
        fe_kwargs = {} if fe_kwargs is None else fe_kwargs

        # Work on a sorted copy so the caller's list is left untouched
        mz_list = np.sort(np.asarray(mz_list))

        # Get all precursors in the library matching the polarity
        precursors_in_lib = np.array(self.request_all_precursors(polarity=polarity))

        # Compare the mz_list with the precursors in the library, keep any mzs that are within mz_tol of any precursor in the library
        lib_mz_df = pd.DataFrame(precursors_in_lib, columns=["lib_mz"])
        lib_mz_df["closest_obs_mz"] = mz_list[
            find_closest(mz_list, lib_mz_df.lib_mz.values)
        ]
        lib_mz_df["mz_diff_ppm"] = np.abs(
            (lib_mz_df["lib_mz"] - lib_mz_df["closest_obs_mz"])
            / lib_mz_df["lib_mz"]
            * 1e6
        )
        lib_mz_sub = lib_mz_df[lib_mz_df["mz_diff_ppm"] <= mz_tol_ppm]

        # Do the same in the opposite direction
        mz_df = pd.DataFrame(mz_list, columns=["mass_feature_mz"])
        mz_df["closest_lib_pre_mz"] = precursors_in_lib[
            find_closest(precursors_in_lib, mz_df.mass_feature_mz.values)
        ]
        mz_df["mz_diff_ppm"] = np.abs(
            (mz_df["mass_feature_mz"] - mz_df["closest_lib_pre_mz"])
            / mz_df["mass_feature_mz"]
            * 1e6
        )
        mz_df_sub = mz_df[mz_df["mz_diff_ppm"] <= mz_tol_ppm]

        # Evaluate which is fewer mzs - lib_mz_sub or mz_df_sub and use that as the input for next step
        if len(lib_mz_sub) < len(mz_df_sub):
            mzs_to_query = lib_mz_sub.lib_mz.values
        else:
            mzs_to_query = mz_df_sub.mass_feature_mz.values

        # Query the library for the precursors in the mz_list that are in the library to retrieve the spectra and metadata
        lib = self.query_by_precursor(
            mz_list=mzs_to_query,
            polarity=polarity,
            mz_tol_ppm=mz_tol_ppm,
            mz_tol_da_api=mz_tol_da_api,
        )

        # Pull out lipid metadata from the metabref library and convert to LipidMetadata dataclass
        mol_data_dict = {x["id"]: x["Molecular Data"] for x in lib}
        lipid_lib = {x["id"]: x["Lipid Tree"] for x in lib if "Lipid Tree" in x.keys()}
        # Entries without a "Lipid Tree" contribute molecular data only
        mol_data_dict = {
            k: {**v, **lipid_lib.get(k, {})} for k, v in mol_data_dict.items()
        }
        mol_data_dict = {
            k: self._dict_to_dataclass(v, LipidMetadata)
            for k, v in mol_data_dict.items()
        }

        # Remove lipid metadata from the metabref library
        lib = [
            {k: v for k, v in x.items() if k not in ["Molecular Data", "Lipid Tree"]}
            for x in lib
        ]
        # Unpack the 'Lipid Fragments' key and the 'MSO Data' key from each entry
        for x in lib:
            if "Lipid Fragments" in x.keys():
                x.update(x.pop("Lipid Fragments"))
            if "MSO Data" in x.keys():
                x.update(x.pop("MSO Data"))

        # Format the spectral library
        format_func = self._get_format_func(format)
        lib = format_func(lib, normalize=normalize, fe_kwargs=fe_kwargs)
        return (lib, mol_data_dict)
class SpectralDatabaseInterface(ABC):
    """
    Base class that facilitates connection to spectral reference databases,
    such as EMSL's Metabolomics Reference Database (MetabRef).

    The access token is kept in an environment variable whose name is given
    by `key`.

    """

    def __init__(self, key=None):
        """
        Initialize instance.

        Parameters
        ----------
        key : str
            Name of the environment variable that stores the token for this
            database interface.

        Raises
        ------
        ValueError
            If `key` is None.

        """

        self.key = key

        if self.key is None:
            raise ValueError(
                "Must specify environment variable key for token associated "
                "with this database interface."
            )

    def set_token(self, path):
        """
        Set environment variable for the database token.

        Parameters
        ----------
        path : str
            Path to a text file whose first line is the token.

        """

        # Read token from file (first line only, surrounding whitespace removed)
        with open(path, "r", encoding="utf-8") as f:
            token = f.readline().strip()

        # Set environment variable
        os.environ[self.key] = token

    def get_token(self):
        """
        Get environment variable for database token.

        Returns
        -------
        str
            Token string.

        Raises
        ------
        ValueError
            If the environment variable named by `self.key` is not set.

        """

        # Check for token
        if self.key not in os.environ:
            raise ValueError("Must set {} environment variable.".format(self.key))

        # Get token from environment variables
        return os.environ.get(self.key)

    def get_header(self):
        """
        Access stored database token and prepare as header.

        Returns
        -------
        dict
            Request headers holding the bearer `Authorization` token and a
            `Content-Type` of "text/plain".

        """

        # Get token
        token = self.get_token()

        # Pad header information
        header = {"Authorization": f"Bearer {token}", "Content-Type": "text/plain"}

        return header

    def get_query(self, url, use_header=True):
        """
        Request payload from URL according to `get` protocol.

        Parameters
        ----------
        url : str
            URL for request.
        use_header: bool
            Whether or not the query should include the header

        Returns
        -------
        dict
            Response as JSON.

        """

        # Query URL via `get`
        if use_header:
            response = requests.get(url, headers=self.get_header())
        else:
            response = requests.get(url)

        # Check response
        response.raise_for_status()

        # Return as JSON
        return response.json()

    def post_query(self, url, variable, values, tolerance):
        """
        Request payload from URL according to `post` protocol.

        Parameters
        ----------
        url : str
            URL for request.
        variable : str
            Variable to query.
        values : str
            Specific values of `variable` to query.
        tolerance : str
            Query tolerance relative to `values`.

        Returns
        -------
        dict
            Response as JSON.

        """

        # Coerce to string
        if not isinstance(variable, str):
            variable = str(variable).replace(" ", "")

        if not isinstance(values, str):
            values = str(values).replace(" ", "")

        if not isinstance(tolerance, str):
            tolerance = str(tolerance).replace(" ", "")

        # Join URL segments with "/" explicitly: os.path.join uses the host
        # platform's separator and drops earlier parts on "absolute" segments
        endpoint = "/".join(
            (url.rstrip("/"), variable.strip("/"), tolerance.strip("/"))
        )

        # Query URL via `post`
        response = requests.post(
            endpoint,
            data=values,
            headers=self.get_header(),
        )

        # Check response
        response.raise_for_status()

        # Return as JSON
        return response.json()
Base class that facilitates connection to spectral reference databases, such as EMSL's Metabolomics Reference Database (MetabRef).
def __init__(self, key=None):
    """
    Initialize instance.

    Parameters
    ----------
    key : str
        Name of the environment variable that stores the token for this
        database interface.

    Raises
    ------
    ValueError
        If `key` is None.

    """

    self.key = key

    if self.key is None:
        # Adjacent literals keep the space that was missing between
        # "associated" and "with" in the message
        raise ValueError(
            "Must specify environment variable key for token associated "
            "with this database interface."
        )
Initialize instance.
Parameters
- key (str): Token key.
def set_token(self, path):
    """
    Load a database token from disk and export it as an environment variable.

    Parameters
    ----------
    path : str
        Path to a text file whose first line holds the token.

    """

    # Only the first line of the file is treated as the token
    with open(path, mode="r", encoding="utf-8") as token_file:
        first_line = token_file.readline()

    # Publish the whitespace-stripped token under this interface's key
    os.environ[self.key] = first_line.strip()
Set environment variable for MetabRef database token.
Parameters
- path (str): Path to token.
def get_token(self):
    """
    Look up the database token in the process environment.

    Returns
    -------
    str
        Token string stored under the environment variable `self.key`.

    Raises
    ------
    ValueError
        If that environment variable is not set.

    """

    env_name = self.key

    # Happy path first: the variable exists, hand back its value
    if env_name in os.environ:
        return os.environ.get(env_name)

    raise ValueError("Must set {} environment variable.".format(env_name))
Get environment variable for database token.
Returns
- str: Token string.
def get_header(self):
    """
    Build the request headers that carry the stored database token.

    Returns
    -------
    dict
        Headers with a bearer `Authorization` entry built from
        `self.get_token()` and a `Content-Type` of "text/plain".

    """

    return {
        "Authorization": f"Bearer {self.get_token()}",
        "Content-Type": "text/plain",
    }
Access stored database token and prepare as header.
Returns
- dict: Request headers (bearer `Authorization` token and `Content-Type`).
def get_query(self, url, use_header=True):
    """
    Fetch a JSON payload from a URL with an HTTP `get` request.

    Parameters
    ----------
    url : str
        URL for request.
    use_header : bool
        Whether or not the query should include the header built by
        `self.get_header()`.

    Returns
    -------
    dict
        Response as JSON.

    """

    # Attach the authorization header only when requested
    request_kwargs = {"headers": self.get_header()} if use_header else {}
    reply = requests.get(url, **request_kwargs)

    # Surface HTTP error statuses as exceptions before decoding
    reply.raise_for_status()

    return reply.json()
Request payload from URL according to `get` protocol.
Parameters
- url (str): URL for request.
- use_header (bool): Whether or not the query should include the header
Returns
- dict: Response as JSON.
def post_query(self, url, variable, values, tolerance):
    """
    Request payload from URL according to `post` protocol.

    The request is sent to `<url>/<variable>/<tolerance>` with `values` as
    the request body.

    Parameters
    ----------
    url : str
        URL for request.
    variable : str
        Variable to query.
    values : str
        Specific values of `variable` to query.
    tolerance : str
        Query tolerance relative to `values`.

    Returns
    -------
    dict
        Response as JSON.

    """

    def _as_text(item):
        # Non-string inputs are stringified with all spaces removed
        return item if isinstance(item, str) else str(item).replace(" ", "")

    variable = _as_text(variable)
    values = _as_text(values)
    tolerance = _as_text(tolerance)

    # Join URL segments with "/" explicitly: os.path.join uses the host
    # platform's separator and drops earlier parts on "absolute" segments
    endpoint = "/".join((url.rstrip("/"), variable.strip("/"), tolerance.strip("/")))

    # Query URL via `post`
    response = requests.post(
        endpoint,
        data=values,
        headers=self.get_header(),
    )

    # Check response
    response.raise_for_status()

    # Return as JSON
    return response.json()
Request payload from URL according to the `post` protocol.
Parameters
- url (str): URL for request.
- variable (str): Variable to query.
- values (str): Specific values of `variable` to query.
- tolerance (str): Query tolerance relative to `values`.
Returns
- dict: Response as JSON.
class MetabRefInterface(SpectralDatabaseInterface):
    """
    Interface to the Metabolomics Reference Database.

    Notes
    -----
    `_get_format_func` reads ``self.format_map``, which is not created by
    this class; subclasses are expected to define it.
    """

    def __init__(self):
        """
        Initialize instance, registering ``"METABREF_TOKEN"`` as the key of
        the environment variable that holds the token.

        """

        super().__init__(key="METABREF_TOKEN")

    def _get_format_func(self, format):
        """
        Obtain format function by key.

        Parameters
        ----------
        format : str
            Format name; matched case-insensitively against the keys of
            ``self.format_map``.

        Returns
        -------
        func
            Formatting function.

        Raises
        ------
        ValueError
            If `format` is not a key of ``self.format_map``.
        """

        key = format.lower()
        if key in self.format_map:
            return self.format_map[key]

        raise ValueError(("{} not a supported format.").format(format))

    def spectrum_to_array(self, spectrum, normalize=True):
        """
        Convert MetabRef-formatted spectrum to array.

        Parameters
        ----------
        spectrum : str
            MetabRef spectrum, i.e. list of (m/z,abundance) pairs.
        normalize : bool
            Divide the abundances by their sum. Skipped when the sum is
            zero, so an all-zero (or empty) spectrum is returned unchanged
            instead of being filled with NaN.

        Returns
        -------
        :obj:`~numpy.ndarray`
            Array of shape (N, 2), with m/z in the first column and abundance in
            the second.

        """

        # Convert parenthesis-delimited string to array
        pairs = re.findall(r"\(([^,]+),([^)]+)\)", spectrum)
        arr = np.array(pairs, dtype=float).reshape(-1, 2)

        # Normalize the abundance column
        if normalize:
            total = arr[:, -1].sum()
            # Guard against 0/0 -> NaN for spectra with no signal
            if total != 0:
                arr[:, -1] = arr[:, -1] / total

        return arr

    def _to_flashentropy(self, metabref_lib, normalize=True, fe_kwargs=None):
        """
        Convert metabref-formatted library to FlashEntropy library.

        Parameters
        ----------
        metabref_lib : dict
            MetabRef MS2 library in JSON format or FlashEntropy search instance (for reformatting at different MS2 separation).
        normalize : bool
            Normalize each spectrum by its magnitude.
        fe_kwargs : dict, optional
            Keyword arguments for instantiation of FlashEntropy search and building index for FlashEntropy search;
            any keys not recognized will be ignored. By default (None), all parameters set to defaults.

        Returns
        -------
        :obj:`~ms_entropy.FlashEntropySearch`
            MS2 library as FlashEntropy search instance.

        Raises
        ------
        ValueError
            If only one of "min_ms2_difference_in_da" and "max_ms2_tolerance_in_da" is present in `fe_kwargs`,
            or if "min_ms2_difference_in_da" is not exactly twice "max_ms2_tolerance_in_da".

        Notes
        -----
        The entries of `metabref_lib` are modified in place: a
        "precursor_mz" key replaces "precursor_ion" where needed and a
        "peaks" array is added.

        """
        # None instead of a shared mutable default
        fe_kwargs = {} if fe_kwargs is None else fe_kwargs

        # The two MS2 separation settings must be given together, with
        # min_ms2_difference_in_da = 2 x max_ms2_tolerance_in_da
        has_min_diff = "min_ms2_difference_in_da" in fe_kwargs
        has_max_tol = "max_ms2_tolerance_in_da" in fe_kwargs
        if has_min_diff or has_max_tol:
            if not (has_min_diff and has_max_tol):
                raise ValueError(
                    "Both 'min_ms2_difference_in_da' and 'max_ms2_tolerance_in_da' must be specified."
                )
            if (
                fe_kwargs["min_ms2_difference_in_da"]
                != 2 * fe_kwargs["max_ms2_tolerance_in_da"]
            ):
                raise ValueError(
                    "The values of 'min_ms2_difference_in_da' must be exactly 2x 'max_ms2_tolerance_in_da'."
                )

        # Initialize empty library
        fe_lib = []

        for source in metabref_lib:
            # Reorganize source dict, if necessary
            spectrum = source["spectrum_data"] if "spectrum_data" in source else source

            # Rename precursor_mz key for FlashEntropy
            if "precursor_mz" not in spectrum:
                spectrum["precursor_mz"] = spectrum.pop("precursor_ion")

            # Convert CoreMS spectrum to array, store as `peaks`
            spectrum["peaks"] = self.spectrum_to_array(
                spectrum["mz"], normalize=normalize
            )

            fe_lib.append(spectrum)

        # Initialize FlashEntropy with the constructor-level options only
        fe_init_kws = [
            "max_ms2_tolerance_in_da",
            "mz_index_step",
            "low_memory",
            "path_data",
        ]
        fe_init_kws = {k: v for k, v in fe_kwargs.items() if k in fe_init_kws}
        fes = FlashEntropySearch(**fe_init_kws)

        # Build FlashEntropy index with the index-level options only
        fe_index_kws = [
            "max_indexed_mz",
            "precursor_ions_removal_da",
            "noise_threshold",
            "min_ms2_difference_in_da",
            "max_peak_num",
        ]
        fe_index_kws = {k: v for k, v in fe_kwargs.items() if k in fe_index_kws}
        fes.build_index(fe_lib, **fe_index_kws, clean_spectra=True)

        return fes

    def _dict_to_dataclass(self, metabref_lib, data_class):
        """
        Convert dictionary to dataclass.

        Notes
        -----
        This function will pull the annotated attributes of a dataclass and
        of its first base class, and convert the dictionary to a dataclass
        instance with the appropriate attributes. Keys of the dictionary
        that are not attributes are dropped; attributes missing from the
        dictionary are passed as None.

        Parameters
        ----------
        metabref_lib : dict
            Metabref dictionary object to convert to dataclass.
        data_class : :obj:`~dataclasses.dataclass`
            Dataclass to convert to.

        Returns
        -------
        :obj:`~dataclasses.dataclass`
            Dataclass instance.

        """

        # Get list of expected attributes of data_class
        data_class_keys = list(data_class.__annotations__.keys())

        # An MRO longer than (class, object) means data_class inherits from
        # another class; include that parent's attributes as well
        if len(data_class.__mro__) > 2:
            parent_class_keys = list(data_class.__bases__[0].__annotations__.keys())
            data_class_keys = list(set(data_class_keys + parent_class_keys))

        # Keep only expected keys; absent ones default to None
        input_dict = {key: metabref_lib.get(key) for key in data_class_keys}
        return data_class(**input_dict)

    def get_query(self, url, use_header=False):
        """Overwrites the get_query method on the parent class to default to not use a header

        Notes
        -----
        As of January 2025, the metabref database no longer requires a token and therefore no header is needed

        """
        return super().get_query(url, use_header)
Interface to the Metabolomics Reference Database.
def __init__(self):
    """
    Initialize instance.

    Delegates to the parent initializer with ``"METABREF_TOKEN"`` as the
    key, i.e. the name of the environment variable under which the parent
    class stores and looks up the database token.

    """

    super().__init__(key="METABREF_TOKEN")
Initialize instance.
def spectrum_to_array(self, spectrum, normalize=True):
    """
    Convert MetabRef-formatted spectrum to array.

    Parameters
    ----------
    spectrum : str
        MetabRef spectrum, i.e. list of (m/z,abundance) pairs.
    normalize : bool
        Divide the abundances by their sum. Skipped when the sum is zero,
        so an all-zero (or empty) spectrum is returned unchanged instead
        of being filled with NaN.

    Returns
    -------
    :obj:`~numpy.ndarray`
        Array of shape (N, 2), with m/z in the first column and abundance in
        the second.

    """

    # Convert parenthesis-delimited string to array; the reshape keeps the
    # (N, 2) layout even when no pair is found (N == 0)
    pairs = re.findall(r"\(([^,]+),([^)]+)\)", spectrum)
    arr = np.array(pairs, dtype=float).reshape(-1, 2)

    # Normalize the abundance column
    if normalize:
        total = arr[:, -1].sum()
        # Guard against 0/0 -> NaN for spectra with no signal
        if total != 0:
            arr[:, -1] = arr[:, -1] / total

    return arr
Convert MetabRef-formatted spectrum to array.
Parameters
- spectrum (str): MetabRef spectrum, i.e. list of (m/z,abundance) pairs.
- normalize (bool): Normalize the spectrum by its magnitude.
Returns
- `numpy.ndarray`: Array of shape (N, 2), with m/z in the first column and abundance in the second.
def get_query(self, url, use_header=False):
    """Overwrites the get_query method on the parent class to default to not use a header

    Parameters
    ----------
    url : str
        URL for request.
    use_header : bool
        Forwarded unchanged to the parent implementation. Defaults to
        False here.

    Returns
    -------
    Whatever the parent class's `get_query` returns for `url`.

    Notes
    -----
    As of January 2025, the metabref database no longer requires a token and therefore no header is needed

    """
    return super().get_query(url, use_header)
Overwrites the get_query method on the parent class to default to not use a header
Notes
As of January 2025, the MetabRef database no longer requires a token; therefore, no header is needed.
Inherited Members
class MetabRefGCInterface(MetabRefInterface):
    """
    Interface to the Metabolomics Reference Database for GC-MS data.
    """

    def __init__(self):
        """
        Initialize instance: store the GC-MS library and FAMEs endpoints
        and build the format map.

        """

        super().__init__()
        self.GCMS_LIBRARY_URL = "https://metabref.emsl.pnnl.gov/api/mslevel/1"
        self.FAMES_URL = "https://metabref.emsl.pnnl.gov/api/fames"

        self.__init_format_map__()

    def __init_format_map__(self):
        """
        Initialize database format mapper, enabling multiple format requests.

        Every entry is called as ``func(library, normalize, fe_kwargs)``;
        `fe_kwargs` is accepted for signature compatibility and ignored by
        all GC-MS formats.

        """

        # Define format workflows
        self.format_map = {
            "json": lambda x, normalize, fe_kwargs: x,
            "dict": lambda x, normalize, fe_kwargs: (
                self._to_LowResolutionEICompound_dict(x, normalize)
            ),
            "sql": lambda x, normalize, fe_kwargs: (
                self._LowResolutionEICompound_dict_to_sqlite(
                    self._to_LowResolutionEICompound_dict(x, normalize)
                )
            ),
        }

        # Add aliases
        self.format_map["metabref"] = self.format_map["json"]
        self.format_map["datadict"] = self.format_map["dict"]
        self.format_map["data-dict"] = self.format_map["dict"]
        self.format_map["lowreseicompound"] = self.format_map["dict"]
        self.format_map["lowres"] = self.format_map["dict"]
        self.format_map["lowresgc"] = self.format_map["dict"]
        self.format_map["sqlite"] = self.format_map["sql"]

    def available_formats(self):
        """
        View list of available formats.

        Returns
        -------
        list
            Format map keys.
        """

        return list(self.format_map.keys())

    def get_library(self, format="json", normalize=False):
        """
        Request MetabRef GC/MS library.

        Parameters
        ----------
        format : str
            Format of requested library, e.g. "json", "dict", "sql".
            See `available_formats` method for aliases.
        normalize : bool
            Normalize the spectrum by its magnitude.

        Returns
        -------
        Library in requested format.

        Raises
        ------
        ValueError
            If `format` is not supported (raised before any request is made).

        """

        # Init format function
        format_func = self._get_format_func(format)

        return format_func(
            self.get_query(self.GCMS_LIBRARY_URL)["GC-MS"], normalize, {}
        )

    def get_fames(self, format="json", normalize=False):
        """
        Request MetabRef GC/MS FAMEs library.

        Parameters
        ----------
        format : str
            Format of requested library, e.g. "json", "dict", "sql".
            See `available_formats` method for aliases.
        normalize : bool
            Normalize the spectrum by its magnitude.

        Returns
        -------
        Library in requested format.

        Raises
        ------
        ValueError
            If `format` is not supported (raised before any request is made).

        """

        # Init format function
        format_func = self._get_format_func(format)

        return format_func(self.get_query(self.FAMES_URL)["GC-MS"], normalize, {})

    def _to_LowResolutionEICompound_dict(self, metabref_lib, normalize=False):
        """
        Convert MetabRef-formatted library to CoreMS LowResolutionEICompound-formatted
        dictionary for local ingestion.

        Parameters
        ----------
        metabref_lib : dict
            MetabRef GC-MS library in JSON format. Each entry must hold a
            "spectrum_data" mapping and, once flattened, "rt" and "mz" keys.
        normalize : bool
            Normalize each spectrum by its magnitude.

        Returns
        -------
        list of dict
            List of each spectrum contained in dictionary.

        """

        # All below key:value lookups are based on CoreMS class definitions
        # NOT MetabRef content. For example, MetabRef has keys for PubChem,
        # USI, etc. that are not considered below.

        # Dictionary to map metabref keys to corems keys
        metadatar_cols = {
            "casno": "cas",
            "inchikey": "inchikey",
            "inchi": "inchi",
            "chebi": "chebi",
            "smiles": "smiles",
            "kegg": "kegg",
            "iupac_name": "iupac_name",
            "traditional_name": "traditional_name",  # Not present in metabref
            "common_name": "common_name",  # Not present in metabref
        }

        # Dictionary to map metabref keys to corems keys
        lowres_ei_compound_cols = {
            "id": "metabref_id",
            "molecule_name": "name",  # Is this correct?
            "classify": "classify",  # Not present in metabref
            "formula": "formula",
            "ri": "ri",
            "rt": "retention_time",
            "source": "source",  # Not present in metabref
            "casno": "casno",
            "comments": "comment",
            "source_temp_c": "source_temp_c",  # Not present in metabref
            "ev": "ev",  # Not present in metabref
            "peak_count": "peaks_count",
            "mz": "mz",
            "abundance": "abundance",
        }

        # Local result container
        corems_lib = []

        for source_ in metabref_lib:
            # Shallow copy so popping "spectrum_data" leaves the input intact
            source = source_.copy()

            # Flatten source dict; on key clashes the top-level value wins
            source = source.pop("spectrum_data") | source

            # Parse target data
            target = {
                lowres_ei_compound_cols[k]: v
                for k, v in source.items()
                if k in lowres_ei_compound_cols
            }

            # Explicitly add this to connect with LowResCompoundRef later
            target["rt"] = source["rt"]

            # Parse (mz, abundance)
            arr = self.spectrum_to_array(target["mz"], normalize=normalize)
            target["mz"] = arr[:, 0]
            target["abundance"] = arr[:, 1]

            # Parse meta data
            target["metadata"] = {
                metadatar_cols[k]: v for k, v in source.items() if k in metadatar_cols
            }

            # Carry over every source key that was not remapped above
            for k in source:
                if k not in lowres_ei_compound_cols:
                    target[k] = source[k]

            # Add to CoreMS list
            corems_lib.append(target)

        return corems_lib

    def _LowResolutionEICompound_dict_to_sqlite(
        self, lowres_ei_compound_dict, url="sqlite://"
    ):
        """
        Convert CoreMS LowResolutionEICompound-formatted dictionary to SQLite
        database for local ingestion.

        Parameters
        ----------
        lowres_ei_compound_dict : dict
            CoreMS GC-MS library formatted for LowResolutionEICompound.
        url : str
            URL to SQLite prefix.

        Returns
        -------
        sqlite database
            Spectra contained in SQLite database.

        Notes
        -----
        Insertion is best-effort: a compound that cannot be added is
        reported on stdout and skipped, and processing continues.

        """

        # Dictionary to map corems keys to all-caps keys
        capped_cols = {
            "name": "NAME",
            "formula": "FORM",
            "ri": "RI",
            "retention_time": "RT",
            "source": "SOURCE",
            "casno": "CASNO",
            "comment": "COMMENT",
            "peaks_count": "NUM PEAKS",
        }

        # Initialize SQLite object
        sqlite_obj = EI_LowRes_SQLite(url=url)

        for _data_dict in lowres_ei_compound_dict:
            # Copy source to prevent modification
            data_dict = _data_dict.copy()

            # Add capped aliases, keeping the lower-case keys as well
            for k, v in capped_cols.items():
                if k in data_dict:
                    data_dict[v] = data_dict[k]

            # Parse number of peaks
            if not data_dict.get("NUM PEAKS"):
                data_dict["NUM PEAKS"] = len(data_dict.get("mz"))

            # Parse CAS number: existing CASNO, else CAS, else 0
            data_dict["CASNO"] = (
                data_dict.get("CASNO") or data_dict.get("CAS") or 0
            )

            # Build linked metadata table; empty metadata is simply dropped
            if "metadata" in data_dict:
                metadata = data_dict.pop("metadata")
                if len(metadata) > 0:
                    data_dict["metadatar"] = Metadatar(**metadata)

            # Attempt addition to sqlite. `Exception` (not a bare except)
            # so KeyboardInterrupt/SystemExit still propagate; `.get` so
            # the report itself cannot fail on a missing name.
            try:
                sqlite_obj.add_compound(data_dict)
            except Exception as exc:
                print(
                    "Could not add compound {!r} to SQLite library: {}".format(
                        data_dict.get("NAME"), exc
                    )
                )

        return sqlite_obj
Interface to the Metabolomics Reference Database.
def __init__(self):
    """
    Initialize instance.

    Runs the parent initializer, stores the two MetabRef endpoints used by
    this class (GC-MS library and FAMEs) and builds the format map.

    """

    super().__init__()
    # Endpoint queried for the GC-MS library
    self.GCMS_LIBRARY_URL = "https://metabref.emsl.pnnl.gov/api/mslevel/1"
    # Endpoint queried for the FAMEs library
    self.FAMES_URL = "https://metabref.emsl.pnnl.gov/api/fames"

    self.__init_format_map__()
Initialize instance.
def available_formats(self):
    """
    List the names accepted as a `format` argument.

    Returns
    -------
    list
        Keys of the format map, aliases included, in insertion order.
    """

    return [*self.format_map]
View list of available formats.
Returns
- list: Format map keys.
def get_library(self, format="json", normalize=False):
    """
    Download the MetabRef GC/MS library and return it in the chosen format.

    Parameters
    ----------
    format : str
        Name of the output format, resolved through `_get_format_func`.
        See `available_formats` method for the accepted names.
    normalize : bool
        Normalize the spectrum by its magnitude.

    Returns
    -------
    Library in requested format.

    """

    # Resolve the converter first so an unsupported format fails before
    # any request is sent
    converter = self._get_format_func(format)

    # The spectra live under the "GC-MS" key of the response
    payload = self.get_query(self.GCMS_LIBRARY_URL)
    return converter(payload["GC-MS"], normalize, {})
Request MetabRef GC/MS library.
Parameters
- format (str): Format of requested library, e.g. "json", "dict", "sql". See the `available_formats` method for aliases.
- normalize (bool): Normalize the spectrum by its magnitude.
Returns
- Library in requested format.
def get_fames(self, format="json", normalize=False):
    """
    Download the MetabRef GC/MS FAMEs library and return it in the chosen
    format.

    Parameters
    ----------
    format : str
        Name of the output format, resolved through `_get_format_func`.
        See `available_formats` method for the accepted names.
    normalize : bool
        Normalize the spectrum by its magnitude.

    Returns
    -------
    Library in requested format.

    """

    # Resolve the converter first so an unsupported format fails before
    # any request is sent
    converter = self._get_format_func(format)

    # The spectra live under the "GC-MS" key of the response
    payload = self.get_query(self.FAMES_URL)
    return converter(payload["GC-MS"], normalize, {})
Request MetabRef GC/MS FAMEs library.
Parameters
- format (str): Format of requested library, e.g. "json", "dict", "sql". See the `available_formats` method for aliases.
- normalize (bool): Normalize the spectrum by its magnitude.
Returns
- Library in requested format.
class MetabRefLCInterface(MetabRefInterface):
    """
    Interface to the Metabolomics Reference Database for LC-MS data.
    """

    def __init__(self):
        """
        Initialize instance: store the precursor endpoints and build the
        format map.

        """

        super().__init__()

        # API endpoint for precursor m/z search
        # inputs = mz, tolerance (in Da), polarity, page_no, per_page
        self.PRECURSOR_MZ_URL = (
            "https://metabref.emsl.pnnl.gov/api/precursors/m/{}/t/{}/{}?page={}&per_page={}"
        )

        # API endpoint for returning full list of precursor m/z values in database
        # inputs = polarity, page_no, per_page
        self.PRECURSOR_MZ_ALL_URL = (
            "https://metabref.emsl.pnnl.gov/api/precursors/{}?page={}&per_page={}"
        )

        self.__init_format_map__()

    def __init_format_map__(self):
        """
        Initialize database format mapper, enabling multiple format requests.

        Every entry is called as ``func(library, normalize, fe_kwargs)``.

        """

        # Define format workflows
        self.format_map = {
            "json": lambda x, normalize, fe_kwargs: x,
            "flashentropy": lambda x, normalize, fe_kwargs: self._to_flashentropy(
                x, normalize, fe_kwargs
            ),
        }

        # Add aliases
        self.format_map["metabref"] = self.format_map["json"]
        self.format_map["fe"] = self.format_map["flashentropy"]
        self.format_map["flash-entropy"] = self.format_map["flashentropy"]

    def query_by_precursor(
        self, mz_list, polarity, mz_tol_ppm, mz_tol_da_api=0.2, max_per_page=50
    ):
        """
        Query MetabRef by precursor m/z values.

        Parameters
        ----------
        mz_list : list
            List of precursor m/z values. Not modified.
        polarity : str
            Ionization polarity, either "positive" or "negative".
        mz_tol_ppm : float
            Tolerance in ppm for each precursor m/z value.
            Used for retrieving from a potential match from database.
        mz_tol_da_api : float, optional
            Maximum tolerance between precursor m/z values for API search, in daltons.
            Used to group similar mzs into a single API query for speed. Default is 0.2.
        max_per_page : int, optional
            Maximum records to return from MetabRef API query at a time. Default is 50.

        Returns
        -------
        list
            List of library entries in original JSON format. Empty when
            `mz_list` is empty.

        Raises
        ------
        ValueError
            If `polarity` is neither "positive" nor "negative".
        """

        # If polarity is anything other than positive or negative, raise error
        if polarity not in ["positive", "negative"]:
            raise ValueError("Polarity must be 'positive' or 'negative'")

        # Work on a sorted copy so the caller's sequence is left untouched
        sorted_mzs = sorted(float(mz) for mz in mz_list)
        if not sorted_mzs:
            return []

        # Cluster mzs: a value joins the current group while it lies within
        # mz_tol_da_api of that group's first (smallest) member
        mz_groups = [[sorted_mzs[0]]]
        for x in sorted_mzs[1:]:
            if abs(x - mz_groups[-1][0]) <= mz_tol_da_api:
                mz_groups[-1].append(x)
            else:
                mz_groups.append([x])

        # Query MetabRef for each mz group
        lib = []
        for mz_group in mz_groups:
            # Groups are sorted, so the ends are the min and max
            low, high = mz_group[0], mz_group[-1]
            if len(mz_group) == 1:
                mz = low
                tol = mz_tol_ppm * 10**-6 * mz
            else:
                # Query the group's midpoint with a window covering the
                # whole group plus the ppm tolerance at its upper end
                half_width = (high - low) / 2
                mz = half_width + low
                tol = half_width + mz_tol_ppm * 10**-6 * high

            # Get first page of results
            response = self.get_query(
                self.PRECURSOR_MZ_URL.format(
                    str(mz), str(tol), polarity, 1, max_per_page
                )
            )
            lib.extend(response["results"])

            # If there are more pages of results, get them
            for page in range(2, int(response["total_pages"]) + 1):
                lib.extend(
                    self.get_query(
                        self.PRECURSOR_MZ_URL.format(
                            str(mz), str(tol), polarity, page, max_per_page
                        )
                    )["results"]
                )

        return lib

    def request_all_precursors(self, polarity, per_page=50000):
        """
        Request all precursor m/z values for MS2 spectra from MetabRef.

        Parameters
        ----------
        polarity : str
            Ionization polarity, either "positive" or "negative".
        per_page : int, optional
            Number of records to fetch per call. Default is 50000

        Returns
        -------
        list
            List of all precursor m/z values, sorted and without duplicates.

        Raises
        ------
        ValueError
            If `polarity` is neither "positive" nor "negative".
        """
        # If polarity is anything other than positive or negative, raise error
        if polarity not in ["positive", "negative"]:
            raise ValueError("Polarity must be 'positive' or 'negative'")

        precursors = []

        # Get first page of results and total number of pages of results
        response = self.get_query(
            self.PRECURSOR_MZ_ALL_URL.format(polarity, str(1), str(per_page))
        )
        total_pages = response["total_pages"]
        precursors.extend(x["precursor_ion"] for x in response["results"])

        # Go through remaining pages of results
        for page in range(2, int(total_pages) + 1):
            response = self.get_query(
                self.PRECURSOR_MZ_ALL_URL.format(polarity, str(page), str(per_page))
            )
            precursors.extend(x["precursor_ion"] for x in response["results"])

        # Remove duplicates and sort from smallest to largest
        return sorted(set(precursors))

    def get_lipid_library(
        self,
        mz_list,
        polarity,
        mz_tol_ppm,
        mz_tol_da_api=0.2,
        format="json",
        normalize=True,
        fe_kwargs=None,
    ):
        """
        Request MetabRef lipid library.

        Parameters
        ----------
        mz_list : list
            List of precursor m/z values. Not modified.
        polarity : str
            Ionization polarity, either "positive" or "negative".
        mz_tol_ppm : float
            Tolerance in ppm for each precursor m/z value.
            Used for retrieving from a potential match from database.
        mz_tol_da_api : float, optional
            Maximum tolerance between precursor m/z values for API search, in daltons.
            Used to group similar mzs into a single API query for speed. Default is 0.2.
        format : str, optional
            Format of requested library, e.g. "json" or "flashentropy"
            (any key of the format map). Default is "json".
        normalize : bool, optional
            Normalize the spectrum by its magnitude. Default is True.
        fe_kwargs : dict, optional
            Keyword arguments for FlashEntropy search. Default is None,
            which is treated as an empty dict.

        Returns
        -------
        tuple
            Library in requested format, and a dict mapping each entry id
            to its lipid metadata as a LipidMetadata dataclass.

        """
        # None instead of a shared mutable default
        fe_kwargs = {} if fe_kwargs is None else fe_kwargs

        # Sorted array copy; the caller's list is left untouched
        mz_list = np.sort(np.array(mz_list))

        # Get all precursors in the library matching the polarity
        precusors_in_lib = self.request_all_precursors(polarity=polarity)
        precusors_in_lib = np.array(precusors_in_lib)

        # Compare the mz_list with the precursors in the library, keep any mzs that are within mz_tol of any precursor in the library
        lib_mz_df = pd.DataFrame(precusors_in_lib, columns=["lib_mz"])
        lib_mz_df["closest_obs_mz"] = mz_list[
            find_closest(mz_list, lib_mz_df.lib_mz.values)
        ]
        lib_mz_df["mz_diff_ppm"] = np.abs(
            (lib_mz_df["lib_mz"] - lib_mz_df["closest_obs_mz"])
            / lib_mz_df["lib_mz"]
            * 1e6
        )
        lib_mz_sub = lib_mz_df[lib_mz_df["mz_diff_ppm"] <= mz_tol_ppm]

        # Do the same in the opposite direction
        mz_df = pd.DataFrame(mz_list, columns=["mass_feature_mz"])
        mz_df["closest_lib_pre_mz"] = precusors_in_lib[
            find_closest(precusors_in_lib, mz_df.mass_feature_mz.values)
        ]
        mz_df["mz_diff_ppm"] = np.abs(
            (mz_df["mass_feature_mz"] - mz_df["closest_lib_pre_mz"])
            / mz_df["mass_feature_mz"]
            * 1e6
        )
        mz_df_sub = mz_df[mz_df["mz_diff_ppm"] <= mz_tol_ppm]

        # Evaluate which is fewer mzs - lib_mz_sub or mz_df_sub and use that as the input for next step
        if len(lib_mz_sub) < len(mz_df_sub):
            mzs_to_query = lib_mz_sub.lib_mz.values
        else:
            mzs_to_query = mz_df_sub.mass_feature_mz.values

        # Query the library for the matched precursors to retrieve the
        # spectra and metadata
        lib = self.query_by_precursor(
            mz_list=mzs_to_query,
            polarity=polarity,
            mz_tol_ppm=mz_tol_ppm,
            mz_tol_da_api=mz_tol_da_api,
        )

        # Merge molecular data with the lipid tree (when an entry has one)
        # and convert to LipidMetadata dataclass
        mol_data_dict = {
            x["id"]: self._dict_to_dataclass(
                {**x["Molecular Data"], **x.get("Lipid Tree", {})}, LipidMetadata
            )
            for x in lib
        }

        # Remove lipid metadata from the metabref library
        lib = [
            {k: v for k, v in x.items() if k not in ["Molecular Data", "Lipid Tree"]}
            for x in lib
        ]
        # Unpack the 'Lipid Fragments' key and the 'MSO Data' key from each entry
        for x in lib:
            if "Lipid Fragments" in x:
                x.update(x.pop("Lipid Fragments"))
            if "MSO Data" in x:
                x.update(x.pop("MSO Data"))

        # Format the spectral library
        format_func = self._get_format_func(format)
        lib = format_func(lib, normalize=normalize, fe_kwargs=fe_kwargs)
        return (lib, mol_data_dict)
Interface to the Metabolomics Reference Database for LC-MS data.
def __init__(self):
    """
    Initialize instance.

    Runs the parent initializer, stores the two URL templates used by this
    class (their ``{}`` placeholders are filled with `str.format`) and
    builds the format map.

    """

    super().__init__()

    # API endpoint for precursor m/z search
    # inputs = mz, tolerance (in Da), polarity, page_no, per_page
    self.PRECURSOR_MZ_URL = (
        "https://metabref.emsl.pnnl.gov/api/precursors/m/{}/t/{}/{}?page={}&per_page={}"
    )

    # API endpoint for returning full list of precursor m/z values in database
    # inputs = polarity, page_no, per_page
    self.PRECURSOR_MZ_ALL_URL = "https://metabref.emsl.pnnl.gov/api/precursors/{}?page={}&per_page={}"

    self.__init_format_map__()
Initialize instance.
def query_by_precursor(self, mz_list, polarity, mz_tol_ppm, mz_tol_da_api=0.2, max_per_page=50):
    """
    Query MetabRef by precursor m/z values.

    Nearby m/z values are grouped so that each group needs only one
    (paginated) API query.

    Parameters
    ----------
    mz_list : list
        List of precursor m/z values. Not modified.
    polarity : str
        Ionization polarity, either "positive" or "negative".
    mz_tol_ppm : float
        Tolerance in ppm for each precursor m/z value.
        Used for retrieving from a potential match from database.
    mz_tol_da_api : float, optional
        Maximum tolerance between precursor m/z values for API search, in daltons.
        Used to group similar mzs into a single API query for speed. Default is 0.2.
    max_per_page : int, optional
        Maximum records to return from MetabRef API query at a time. Default is 50.

    Returns
    -------
    list
        List of library entries in original JSON format. Empty when
        `mz_list` is empty.

    Raises
    ------
    ValueError
        If `polarity` is neither "positive" nor "negative".
    """

    # If polarity is anything other than positive or negative, raise error
    if polarity not in ["positive", "negative"]:
        raise ValueError("Polarity must be 'positive' or 'negative'")

    # Work on a sorted copy so the caller's sequence is left untouched
    sorted_mzs = sorted(float(mz) for mz in mz_list)
    if not sorted_mzs:
        return []

    # Cluster mzs: a value joins the current group while it lies within
    # mz_tol_da_api of that group's first (smallest) member
    mz_groups = [[sorted_mzs[0]]]
    for x in sorted_mzs[1:]:
        if abs(x - mz_groups[-1][0]) <= mz_tol_da_api:
            mz_groups[-1].append(x)
        else:
            mz_groups.append([x])

    # Query MetabRef for each mz group
    lib = []
    for mz_group in mz_groups:
        # Groups are sorted, so the ends are the min and max
        low, high = mz_group[0], mz_group[-1]
        if len(mz_group) == 1:
            mz = low
            tol = mz_tol_ppm * 10**-6 * mz
        else:
            # Query the group's midpoint with a window covering the whole
            # group plus the ppm tolerance at its upper end. The ppm term
            # is mz_tol_ppm * 1e-6, not mz_tol_ppm ** -6.
            half_width = (high - low) / 2
            mz = half_width + low
            tol = half_width + mz_tol_ppm * 10**-6 * high

        # Get first page of results
        response = self.get_query(
            self.PRECURSOR_MZ_URL.format(str(mz), str(tol), polarity, 1, max_per_page)
        )
        lib.extend(response["results"])

        # If there are more pages of results, get them
        for page in range(2, int(response["total_pages"]) + 1):
            lib.extend(
                self.get_query(
                    self.PRECURSOR_MZ_URL.format(
                        str(mz), str(tol), polarity, page, max_per_page
                    )
                )["results"]
            )

    return lib
Query MetabRef by precursor m/z values.
Parameters
- mz_list (list): List of precursor m/z values.
- polarity (str): Ionization polarity, either "positive" or "negative".
- mz_tol_ppm (float): Tolerance in ppm for each precursor m/z value. Used for retrieving from a potential match from database.
- mz_tol_da_api (float, optional): Maximum tolerance between precursor m/z values for API search, in daltons. Used to group similar mzs into a single API query for speed. Default is 0.2.
- max_per_page (int, optional): Maximum records to return from MetabRef API query at a time. Default is 50.
Returns
- list: List of library entries in original JSON format.
def request_all_precursors(self, polarity, per_page = 50000):
    """
    Collect every precursor m/z value of the MS2 spectra held by MetabRef.

    The first page is fetched to learn how many pages exist; the remaining
    pages are then fetched in order.

    Parameters
    ----------
    polarity : str
        Ionization polarity, either "positive" or "negative".
    per_page : int, optional
        Number of records to fetch per call. Default is 50000

    Returns
    -------
    list
        Distinct precursor m/z values in ascending order.

    Raises
    ------
    ValueError
        If `polarity` is neither "positive" nor "negative".
    """
    if polarity not in ["positive", "negative"]:
        raise ValueError("Polarity must be 'positive' or 'negative'")

    def fetch_page(page_no):
        # One paginated request against the "all precursors" endpoint
        return self.get_query(
            self.PRECURSOR_MZ_ALL_URL.format(polarity, str(page_no), str(per_page))
        )

    # A set removes duplicates as the pages come in
    first_page = fetch_page(1)
    last_page = first_page['total_pages']
    unique_mzs = {entry['precursor_ion'] for entry in first_page['results']}

    for page_no in np.arange(2, last_page + 1):
        unique_mzs.update(
            entry['precursor_ion'] for entry in fetch_page(page_no)['results']
        )

    return sorted(unique_mzs)
Request all precursor m/z values for MS2 spectra from MetabRef.
Parameters
- polarity (str): Ionization polarity, either "positive" or "negative".
- per_page (int, optional): Number of records to fetch per call. Default is 50000
Returns
- list: List of all precursor m/z values, sorted.
def get_lipid_library(
    self,
    mz_list,
    polarity,
    mz_tol_ppm,
    mz_tol_da_api=0.2,
    format="json",
    normalize=True,
    fe_kwargs={},
):
    """
    Request MetabRef lipid library.

    The observed m/z values are first compared against every precursor
    m/z reported by `request_all_precursors` for the given polarity, so
    that only values with a library counterpart within `mz_tol_ppm` are
    sent to `query_by_precursor`.

    Parameters
    ----------
    mz_list : list
        List of precursor m/z values. The input is not modified; a sorted
        copy is used internally.
    polarity : str
        Ionization polarity, either "positive" or "negative".
    mz_tol_ppm : float
        Tolerance in ppm for each precursor m/z value.
        Used to retrieve potential matches from the database.
    mz_tol_da_api : float, optional
        Maximum tolerance between precursor m/z values for API search, in daltons.
        Used to group similar mzs into a single API query for speed. Default is 0.2.
    format : str, optional
        Format of requested library, i.e. "json", "sql", "flashentropy".
        See `available_formats` method for aliases. Default is "json".
    normalize : bool, optional
        Normalize the spectrum by its magnitude. Default is True.
    fe_kwargs : dict, optional
        Keyword arguments for FlashEntropy search. Default is {}.

    Returns
    -------
    tuple
        Library in requested format, and a dict keyed by library entry
        "id" whose values are built by `_dict_to_dataclass` with
        `LipidMetadata`.

    """
    # The signature default is a shared dict; hand a per-call copy downstream
    # so neither it nor a caller-supplied dict can be mutated by the formatter.
    fe_kwargs = dict(fe_kwargs or {})

    # Sort a copy rather than calling mz_list.sort(): leaves the caller's
    # data untouched and accepts any sequence (tuple, ndarray, ...).
    mz_list = np.sort(np.asarray(mz_list))

    # Get all precursors in the library matching the polarity
    precursors_in_lib = np.array(self.request_all_precursors(polarity=polarity))

    def _within_tolerance(targets, reference):
        # Keep the targets whose closest value in `reference` lies within
        # mz_tol_ppm (ppm is relative to the target value).
        # NOTE(review): find_closest is assumed to return, per target, the
        # index of the nearest element of the sorted `reference` - confirm.
        closest = reference[find_closest(reference, targets)]
        with np.errstate(divide="ignore", invalid="ignore"):
            diff_ppm = np.abs((targets - closest) / targets * 1e6)
            return targets[diff_ppm <= mz_tol_ppm]

    # Library precursors that have an observed m/z nearby, and vice versa
    lib_mz_sub = _within_tolerance(precursors_in_lib, mz_list)
    obs_mz_sub = _within_tolerance(mz_list, precursors_in_lib)

    # Query with whichever side yields fewer m/z values
    if len(lib_mz_sub) < len(obs_mz_sub):
        mzs_to_query = lib_mz_sub
    else:
        mzs_to_query = obs_mz_sub

    # Retrieve spectra and metadata for the selected precursors
    lib = self.query_by_precursor(
        mz_list=mzs_to_query,
        polarity=polarity,
        mz_tol_ppm=mz_tol_ppm,
        mz_tol_da_api=mz_tol_da_api,
    )

    # Pull out lipid metadata and convert it via _dict_to_dataclass
    mol_data = {entry["id"]: entry["Molecular Data"] for entry in lib}
    lipid_trees = {
        entry["id"]: entry["Lipid Tree"] for entry in lib if "Lipid Tree" in entry
    }
    # Entries without a "Lipid Tree" contribute only their molecular data
    # instead of raising KeyError.
    # NOTE(review): presumes _dict_to_dataclass tolerates the missing
    # lipid-tree fields - confirm.
    mol_data_dict = {
        entry_id: self._dict_to_dataclass(
            {**data, **lipid_trees.get(entry_id, {})}, LipidMetadata
        )
        for entry_id, data in mol_data.items()
    }

    # Build spectral records without the metadata sections, flattening the
    # nested "Lipid Fragments" and "MSO Data" dicts into each record.
    metadata_keys = ("Molecular Data", "Lipid Tree")
    nested_keys = ("Lipid Fragments", "MSO Data")
    records = []
    for entry in lib:
        record = {k: v for k, v in entry.items() if k not in metadata_keys}
        for nested in nested_keys:
            if nested in record:
                record.update(record.pop(nested))
        records.append(record)

    # Format the spectral library
    format_func = self._get_format_func(format)
    formatted_lib = format_func(records, normalize=normalize, fe_kwargs=fe_kwargs)
    return (formatted_lib, mol_data_dict)
Request MetabRef lipid library.
Parameters
- mz_list (list): List of precursor m/z values.
- polarity (str): Ionization polarity, either "positive" or "negative".
- mz_tol_ppm (float): Tolerance in ppm for each precursor m/z value. Used to retrieve potential matches from the database.
- mz_tol_da_api (float, optional): Maximum tolerance between precursor m/z values for API search, in daltons. Used to group similar mzs into a single API query for speed. Default is 0.2.
- format (str, optional): Format of requested library, i.e. "json", "sql", "flashentropy". See the `available_formats` method for aliases. Default is "json".
- normalize (bool, optional): Normalize the spectrum by its magnitude. Default is True.
- fe_kwargs (dict, optional): Keyword arguments for FlashEntropy search. Default is {}.
Returns
- tuple: Library in requested format and lipid metadata as a LipidMetadata dataclass.