corems.molecular_id.search.database_interfaces
1import os 2import re 3from abc import ABC 4from io import StringIO 5from pathlib import Path 6import time 7import json 8 9import numpy as np 10import requests 11import pandas as pd 12from ms_entropy import FlashEntropySearch 13 14from corems.molecular_id.factory.EI_SQL import ( 15 EI_LowRes_SQLite, 16 Metadatar, 17 MetaboliteMetadata, 18) 19from corems.molecular_id.factory.lipid_molecular_metadata import LipidMetadata 20from corems.mass_spectra.calc.lc_calc import find_closest 21 22 23class SpectralDatabaseInterface(ABC): 24 """ 25 Base class that facilitates connection to spectral reference databases, 26 such as EMSL's Metabolomics Reference Database (MetabRef). 27 28 """ 29 30 def __init__(self, key=None): 31 """ 32 Initialize instance. 33 34 Parameters 35 ---------- 36 key : str 37 Token key. 38 39 """ 40 41 self.key = key 42 43 def set_token(self, path): 44 """ 45 Set environment variable for MetabRef database token. 46 47 Parameters 48 ---------- 49 path : str 50 Path to token. 51 52 """ 53 54 # Read token from file 55 with open(path, "r", encoding="utf-8") as f: 56 token = f.readline().strip() 57 58 # Set environment variable 59 os.environ[self.key] = token 60 61 def get_token(self): 62 """ 63 Get environment variable for database token. 64 65 Returns 66 ------- 67 str 68 Token string. 69 70 """ 71 72 # Check for token 73 if self.key not in os.environ: 74 raise ValueError("Must set {} environment variable.".format(self.key)) 75 76 # Get token from environment variables 77 return os.environ.get(self.key) 78 79 def get_header(self): 80 """ 81 Access stored database token and prepare as header. 82 83 Returns 84 ------- 85 str 86 Header string. 87 88 """ 89 90 # Get token 91 token = self.get_token() 92 93 # Pad header information 94 header = {"Authorization": f"Bearer {token}", "Content-Type": "text/plain"} 95 96 return header 97 98 def get_query(self, url, use_header=True): 99 """ 100 Request payload from URL according to `get` protocol. 101 102 Parameters 103 ---------- 104 url : str 105 URL for request. 106 use_header: bool 107 Whether or not the query should include the header 108 109 Returns 110 ------- 111 dict 112 Response as JSON. 113 114 """ 115 116 # Query URL via `get` 117 if use_header: 118 response = requests.get(url, headers=self.get_header()) 119 else: 120 response = requests.get(url) 121 122 # Check response 123 response.raise_for_status() 124 125 # Return as JSON 126 return response.json() 127 128 def post_query(self, url, variable, values, tolerance): 129 """ 130 Request payload from URL according to `post` protocol. 131 132 Parameters 133 ---------- 134 url : str 135 URL for request. 136 variable : str 137 Variable to query. 138 values : str 139 Specific values of `variable` to query. 140 tolerance : str 141 Query tolerance relative to `values`. 142 143 Returns 144 ------- 145 dict 146 Response as JSON. 147 148 """ 149 150 # Coerce to string 151 if not isinstance(variable, str): 152 variable = str(variable).replace(" ", "") 153 154 if not isinstance(values, str): 155 values = str(values).replace(" ", "") 156 157 if not isinstance(tolerance, str): 158 tolerance = str(tolerance).replace(" ", "") 159 160 # Query URL via `post` 161 response = requests.post( 162 os.path.join(url, variable, tolerance), 163 data=values, 164 headers=self.get_header(), 165 ) 166 167 # Check response 168 response.raise_for_status() 169 170 # Return as JSON 171 return response.json() 172 173 def _check_flash_entropy_kwargs(self, fe_kwargs): 174 """ 175 Check FlashEntropy keyword arguments. 
176 177 Parameters 178 ---------- 179 fe_kwargs : dict 180 Keyword arguments for FlashEntropy search. 181 182 183 Raises 184 ------ 185 ValueError 186 If "min_ms2_difference_in_da" or "max_ms2_tolerance_in_da" are present in `fe_kwargs` and they 187 are not equal. 188 189 """ 190 # If "min_ms2_difference_in_da" in fe_kwargs, check that "max_ms2_tolerance_in_da" is also present and that min_ms2_difference_in_da = 2xmax_ms2_tolerance_in_da 191 if ( 192 "min_ms2_difference_in_da" in fe_kwargs 193 or "max_ms2_tolerance_in_da" in fe_kwargs 194 ): 195 if ( 196 "min_ms2_difference_in_da" not in fe_kwargs 197 or "max_ms2_tolerance_in_da" not in fe_kwargs 198 ): 199 raise ValueError( 200 "Both 'min_ms2_difference_in_da' and 'max_ms2_tolerance_in_da' must be specified." 201 ) 202 if ( 203 fe_kwargs["min_ms2_difference_in_da"] 204 != 2 * fe_kwargs["max_ms2_tolerance_in_da"] 205 ): 206 raise ValueError( 207 "The values of 'min_ms2_difference_in_da' must be exactly 2x 'max_ms2_tolerance_in_da'." 208 ) 209 210 def _get_format_func(self, format): 211 """ 212 Obtain format function by key. 213 214 Returns 215 ------- 216 func 217 Formatting function. 218 """ 219 220 if format.lower() in self.format_map.keys(): 221 return self.format_map[format.lower()] 222 223 raise ValueError(("{} not a supported format.").format(format)) 224 225 def _dict_to_dataclass(self, metabref_lib, data_class): 226 """ 227 Convert dictionary to dataclass. 228 229 Notes 230 ----- 231 This function will pull the attributes a dataclass and its parent class 232 and convert the dictionary to a dataclass instance with the appropriate 233 attributes. 234 235 Parameters 236 ---------- 237 data_class : :obj:`~dataclasses.dataclass` 238 Dataclass to convert to. 239 metabref_lib : dict 240 Metabref dictionary object to convert to dataclass. 241 242 Returns 243 ------- 244 :obj:`~dataclasses.dataclass` 245 Dataclass instance. 246 247 """ 248 249 # Get list of expected attributes of data_class 250 data_class_keys = list(data_class.__annotations__.keys()) 251 252 # Does the data_class inherit from another class, if so, get the attributes of the parent class as well 253 if len(data_class.__mro__) > 2: 254 parent_class_keys = list(data_class.__bases__[0].__annotations__.keys()) 255 data_class_keys = list(set(data_class_keys + parent_class_keys)) 256 257 # Remove keys that are not in the data_class from the input dictionary 258 input_dict = {k: v for k, v in metabref_lib.items() if k in data_class_keys} 259 260 # Add keys that are in the data class but not in the input dictionary as None 261 for key in data_class_keys: 262 if key not in input_dict.keys(): 263 input_dict[key] = None 264 return data_class(**input_dict) 265 266 @staticmethod 267 def normalize_peaks(arr): 268 """ 269 Normalize peaks in an array. 270 271 Parameters 272 ---------- 273 arr : :obj:`~numpy.array` 274 Array of shape (N, 2), with m/z in the first column and abundance in 275 the second. 276 277 Returns 278 ------- 279 :obj:`~numpy.array` 280 Normalized array of shape (N, 2), with m/z in the first column and 281 normalized abundance in the second. 282 """ 283 # Normalize the array 284 arr[:, -1] = arr[:, -1] / arr[:, -1].sum() 285 286 return arr 287 288 @staticmethod 289 def _build_flash_entropy_index(fe_lib, fe_kwargs={}, clean_spectra=True): 290 """ 291 Build FlashEntropy index. 292 293 Parameters 294 ---------- 295 fe_lib : list 296 List of spectra to build index from. Can be a list of dictionaries or 297 a FlashEntropy search instance. 
298 fe_kwargs : dict, optional 299 Keyword arguments for FlashEntropy search. 300 clean_spectra : bool, optional 301 Clean spectra before building index. Default is True. 302 303 Returns 304 ------- 305 :obj:`~ms_entropy.FlashEntropySearch` 306 FlashEntropy search instance. 307 308 """ 309 # Initialize FlashEntropy 310 fe_init_kws = [ 311 "max_ms2_tolerance_in_da", 312 "mz_index_step", 313 "low_memory", 314 "path_data", 315 ] 316 fe_init_kws = {k: v for k, v in fe_kwargs.items() if k in fe_init_kws} 317 fes = FlashEntropySearch(**fe_init_kws) 318 319 # Build FlashEntropy index 320 fe_index_kws = [ 321 "max_indexed_mz", 322 "precursor_ions_removal_da", 323 "noise_threshold", 324 "min_ms2_difference_in_da", 325 "max_peak_num", 326 ] 327 fe_index_kws = {k: v for k, v in fe_kwargs.items() if k in fe_index_kws} 328 fes.build_index(fe_lib, **fe_index_kws, clean_spectra=clean_spectra) 329 330 return fes 331 332 333class MetabRefInterface(SpectralDatabaseInterface): 334 """ 335 Interface to the Metabolomics Reference Database. 336 """ 337 338 def __init__(self): 339 """ 340 Initialize instance. 341 342 """ 343 344 super().__init__(key=None) 345 346 def spectrum_to_array(self, spectrum, normalize=True): 347 """ 348 Convert MetabRef-formatted spectrum to array. 349 350 Parameters 351 ---------- 352 spectrum : str 353 MetabRef spectrum, i.e. list of (m/z,abundance) pairs. 354 normalize : bool 355 Normalize the spectrum by its magnitude. 356 357 Returns 358 ------- 359 :obj:`~numpy.array` 360 Array of shape (N, 2), with m/z in the first column and abundance in 361 the second. 362 363 """ 364 365 # Convert parenthesis-delimited string to array 366 arr = np.array( 367 re.findall(r"\(([^,]+),([^)]+)\)", spectrum), dtype=float 368 ).reshape(-1, 2) 369 370 if normalize: 371 arr = self.normalize_peaks(arr) 372 373 return arr 374 375 def _to_flashentropy(self, metabref_lib, normalize=True, fe_kwargs={}): 376 """ 377 Convert metabref-formatted library to FlashEntropy library. 378 379 Parameters 380 ---------- 381 metabref_lib : dict 382 MetabRef MS2 library in JSON format or FlashEntropy search instance (for reformatting at different MS2 separation). 383 normalize : bool 384 Normalize each spectrum by its magnitude. 385 fe_kwargs : dict, optional 386 Keyword arguments for instantiation of FlashEntropy search and building index for FlashEntropy search; 387 any keys not recognized will be ignored. By default, all parameters set to defaults. 388 389 Returns 390 ------- 391 :obj:`~ms_entropy.FlashEntropySearch` 392 MS2 library as FlashEntropy search instance. 393 394 Raises 395 ------ 396 ValueError 397 If "min_ms2_difference_in_da" or "max_ms2_tolerance_in_da" are present in `fe_kwargs` and they are not equal. 
398 399 """ 400 self._check_flash_entropy_kwargs(fe_kwargs) 401 402 # Initialize empty library 403 fe_lib = [] 404 405 # Enumerate spectra 406 for i, source in enumerate(metabref_lib): 407 # Reorganize source dict, if necessary 408 if "spectrum_data" in source.keys(): 409 spectrum = source["spectrum_data"] 410 else: 411 spectrum = source 412 413 # Rename precursor_mz key for FlashEntropy 414 if "precursor_mz" not in spectrum.keys(): 415 spectrum["precursor_mz"] = spectrum.pop("precursor_ion") 416 417 # Convert CoreMS spectrum to array and clean, store as `peaks` 418 spectrum["peaks"] = self.spectrum_to_array( 419 spectrum["mz"], normalize=normalize 420 ) 421 422 # Add spectrum to library 423 fe_lib.append(spectrum) 424 425 # Build FlashEntropy index 426 fe_search = self._build_flash_entropy_index(fe_lib, fe_kwargs=fe_kwargs) 427 428 return fe_search 429 430 def get_query(self, url, use_header=False): 431 """Overwrites the get_query method on the parent class to default to not use a header 432 433 Notes 434 ----- 435 As of January 2025, the metabref database no longer requires a token and therefore no header is needed 436 437 """ 438 return super().get_query(url, use_header) 439 440 441class MetabRefGCInterface(MetabRefInterface): 442 """ 443 Interface to the Metabolomics Reference Database. 444 """ 445 446 def __init__(self): 447 """ 448 Initialize instance. 449 450 """ 451 452 super().__init__() 453 self.GCMS_LIBRARY_URL = "https://metabref.emsl.pnnl.gov/api/mslevel/1" 454 self.FAMES_URL = "https://metabref.emsl.pnnl.gov/api/fames" 455 456 self.__init_format_map__() 457 458 def __init_format_map__(self): 459 """ 460 Initialize database format mapper, enabling multiple format requests. 461 462 """ 463 464 # Define format workflows 465 self.format_map = { 466 "json": lambda x, normalize, fe_kwargs: x, 467 "dict": lambda x, 468 normalize, 469 fe_kwargs: self._to_LowResolutionEICompound_dict(x, normalize), 470 "sql": lambda x, 471 normalize, 472 fe_kwargs: self._LowResolutionEICompound_dict_to_sqlite( 473 self._to_LowResolutionEICompound_dict(x, normalize) 474 ), 475 } 476 477 # Add aliases 478 self.format_map["metabref"] = self.format_map["json"] 479 self.format_map["datadict"] = self.format_map["dict"] 480 self.format_map["data-dict"] = self.format_map["dict"] 481 self.format_map["lowreseicompound"] = self.format_map["dict"] 482 self.format_map["lowres"] = self.format_map["dict"] 483 self.format_map["lowresgc"] = self.format_map["dict"] 484 self.format_map["sqlite"] = self.format_map["sql"] 485 486 def available_formats(self): 487 """ 488 View list of available formats. 489 490 Returns 491 ------- 492 list 493 Format map keys. 494 """ 495 496 return list(self.format_map.keys()) 497 498 def get_library(self, format="json", normalize=False): 499 """ 500 Request MetabRef GC/MS library. 501 502 Parameters 503 ---------- 504 format : str 505 Format of requested library, i.e. "json", "sql", "flashentropy". 506 See `available_formats` method for aliases. 507 normalize : bool 508 Normalize the spectrum by its magnitude. 509 510 Returns 511 ------- 512 Library in requested format. 513 514 """ 515 516 # Init format function 517 format_func = self._get_format_func(format) 518 519 return format_func( 520 self.get_query(self.GCMS_LIBRARY_URL)["GC-MS"], normalize, {} 521 ) 522 523 def get_fames(self, format="json", normalize=False): 524 """ 525 Request MetabRef GC/MS FAMEs library. 526 527 Parameters 528 ---------- 529 format : str 530 Format of requested library, i.e. "json", "sql", "flashentropy". 
531 See `available_formats` method for aliases. 532 normalize : bool 533 Normalize the spectrum by its magnitude. 534 535 Returns 536 ------- 537 Library in requested format. 538 539 """ 540 541 # Init format function 542 format_func = self._get_format_func(format) 543 544 return format_func(self.get_query(self.FAMES_URL)["GC-MS"], normalize, {}) 545 546 def _to_LowResolutionEICompound_dict(self, metabref_lib, normalize=False): 547 """ 548 Convert MetabRef-formatted library to CoreMS LowResolutionEICompound-formatted 549 dictionary for local ingestion. 550 551 Parameters 552 ---------- 553 metabref_lib : dict 554 MetabRef GC-MS library in JSON format. 555 normalize : bool 556 Normalize each spectrum by its magnitude. 557 558 Returns 559 ------- 560 list of dict 561 List of each spectrum contained in dictionary. 562 563 """ 564 565 # All below key:value lookups are based on CoreMS class definitions 566 # NOT MetabRef content. For example, MetabRef has keys for PubChem, 567 # USI, etc. that are not considered below. 568 569 # Dictionary to map metabref keys to corems keys 570 metadatar_cols = { 571 "casno": "cas", 572 "inchikey": "inchikey", 573 "inchi": "inchi", 574 "chebi": "chebi", 575 "smiles": "smiles", 576 "kegg": "kegg", 577 "iupac_name": "iupac_name", 578 "traditional_name": "traditional_name", # Not present in metabref 579 "common_name": "common_name", # Not present in metabref 580 } 581 582 # Dictionary to map metabref keys to corems keys 583 lowres_ei_compound_cols = { 584 "id": "metabref_id", 585 "molecule_name": "name", # Is this correct? 586 "classify": "classify", # Not present in metabref 587 "formula": "formula", 588 "ri": "ri", 589 "rt": "retention_time", 590 "source": "source", # Not present in metabref 591 "casno": "casno", 592 "comments": "comment", 593 "source_temp_c": "source_temp_c", # Not present in metabref 594 "ev": "ev", # Not present in metabref 595 "peak_count": "peaks_count", 596 "mz": "mz", 597 "abundance": "abundance", 598 } 599 600 # Local result container 601 corems_lib = [] 602 603 # Enumerate spectra 604 for i, source_ in enumerate(metabref_lib): 605 # Copy source to prevent modification 606 source = source_.copy() 607 608 # Flatten source dict 609 source = source.pop("spectrum_data") | source 610 611 # Parse target data 612 target = { 613 lowres_ei_compound_cols[k]: v 614 for k, v in source.items() 615 if k in lowres_ei_compound_cols 616 } 617 618 # Explicitly add this to connect with LowResCompoundRef later 619 target["rt"] = source["rt"] 620 621 # Parse (mz, abundance) 622 arr = self.spectrum_to_array(target["mz"], normalize=normalize) 623 target["mz"] = arr[:, 0] 624 target["abundance"] = arr[:, 1] 625 626 # Parse meta data 627 target["metadata"] = { 628 metadatar_cols[k]: v for k, v in source.items() if k in metadatar_cols 629 } 630 631 # Add anything else 632 for k in source: 633 if k not in lowres_ei_compound_cols: 634 target[k] = source[k] 635 636 # Add to CoreMS list 637 corems_lib.append(target) 638 639 return corems_lib 640 641 def _LowResolutionEICompound_dict_to_sqlite( 642 self, lowres_ei_compound_dict, url="sqlite://" 643 ): 644 """ 645 Convert CoreMS LowResolutionEICompound-formatted dictionary to SQLite 646 database for local ingestion. 647 648 Parameters 649 ---------- 650 lowres_ei_compound_dict : dict 651 CoreMS GC-MS library formatted for LowResolutionEICompound. 652 url : str 653 URL to SQLite prefix. 654 655 Returns 656 ------- 657 sqlite database 658 Spectra contained in SQLite database. 
659 660 """ 661 662 # Dictionary to map corems keys to all-caps keys 663 capped_cols = { 664 "name": "NAME", 665 "formula": "FORM", 666 "ri": "RI", 667 "retention_time": "RT", 668 "source": "SOURCE", 669 "casno": "CASNO", 670 "comment": "COMMENT", 671 "peaks_count": "NUM PEAKS", 672 } 673 674 # Initialize SQLite object 675 sqlite_obj = EI_LowRes_SQLite(url=url) 676 677 # Iterate spectra 678 for _data_dict in lowres_ei_compound_dict: 679 # Copy source to prevent modification 680 data_dict = _data_dict.copy() 681 682 # Add missing capped values 683 for k, v in capped_cols.items(): 684 # Key exists 685 if k in data_dict: 686 # # This will replace the key 687 # data_dict[v] = data_dict.pop(k) 688 689 # This will keep both keys 690 data_dict[v] = data_dict[k] 691 692 # Parse number of peaks 693 if not data_dict.get("NUM PEAKS"): 694 data_dict["NUM PEAKS"] = len(data_dict.get("mz")) 695 696 # Parse CAS number 697 if not data_dict.get("CASNO"): 698 data_dict["CASNO"] = data_dict.get("CAS") 699 700 if not data_dict["CASNO"]: 701 data_dict["CASNO"] = 0 702 703 # Build linked metadata table 704 if "metadata" in data_dict: 705 if len(data_dict["metadata"]) > 0: 706 data_dict["metadatar"] = Metadatar(**data_dict.pop("metadata")) 707 else: 708 data_dict.pop("metadata") 709 710 # Attempt addition to sqlite 711 try: 712 sqlite_obj.add_compound(data_dict) 713 except: 714 print(data_dict["NAME"]) 715 716 return sqlite_obj 717 718 719class MetabRefLCInterface(MetabRefInterface): 720 """ 721 Interface to the Metabolomics Reference Database for LC-MS data. 722 """ 723 724 def __init__(self): 725 """ 726 Initialize instance. 727 728 """ 729 730 super().__init__() 731 732 # API endpoint for precursor m/z search 733 # inputs = mz, tolerance (in Da), polarity, page_no, per_page 734 self.PRECURSOR_MZ_URL = "https://metabref.emsl.pnnl.gov/api/precursors/m/{}/t/{}/{}?page={}&per_page={}" 735 736 # API endpoint for returning full list of precursor m/z values in database 737 # inputs = polarity, page_no, per_page 738 self.PRECURSOR_MZ_ALL_URL = ( 739 "https://metabref.emsl.pnnl.gov/api/precursors/{}?page={}&per_page={}" 740 ) 741 742 # API endpoint for lipid data 743 self.LIPID_LIBRARY_URL = "https://metabref.emsl.pnnl.gov/api/lipid/data" 744 745 self.__init_format_map__() 746 747 def __init_format_map__(self): 748 """ 749 Initialize database format mapper, enabling multiple format requests. 750 751 """ 752 753 # Define format workflows 754 self.format_map = { 755 "json": lambda x, normalize, fe_kwargs: x, 756 "flashentropy": lambda x, normalize, fe_kwargs: self._to_flashentropy( 757 x, normalize, fe_kwargs 758 ), 759 } 760 761 # Add aliases 762 self.format_map["metabref"] = self.format_map["json"] 763 self.format_map["fe"] = self.format_map["flashentropy"] 764 self.format_map["flash-entropy"] = self.format_map["flashentropy"] 765 766 def query_by_precursor( 767 self, mz_list, polarity, mz_tol_ppm, mz_tol_da_api=0.2, max_per_page=50 768 ): 769 """ 770 Query MetabRef by precursor m/z values. 771 772 Parameters 773 ---------- 774 mz_list : list 775 List of precursor m/z values. 776 polarity : str 777 Ionization polarity, either "positive" or "negative". 778 mz_tol_ppm : float 779 Tolerance in ppm for each precursor m/z value. 780 Used for retrieving from a potential match from database. 781 mz_tol_da_api : float, optional 782 Maximum tolerance between precursor m/z values for API search, in daltons. 783 Used to group similar mzs into a single API query for speed. Default is 0.2. 
784 max_per_page : int, optional 785 Maximum records to return from MetabRef API query at a time. Default is 50. 786 787 Returns 788 ------- 789 list 790 List of library entries in original JSON format. 791 """ 792 raise DeprecationWarning( 793 "query_by_precursor is deprecated. Use get_lipid_library instead." 794 ) 795 796 def request_all_precursors(self, polarity, per_page=50000): 797 """ 798 Request all precursor m/z values for MS2 spectra from MetabRef. 799 800 Parameters 801 ---------- 802 polarity : str 803 Ionization polarity, either "positive" or "negative". 804 per_page : int, optional 805 Number of records to fetch per call. Default is 50000 806 807 Returns 808 ------- 809 list 810 List of all precursor m/z values, sorted. 811 """ 812 raise DeprecationWarning("request_all_precursors is deprecated.") 813 814 def post_lipid_query(self, mz_list, polarity, mz_tol_ppm): 815 """ 816 Post query to get MetabRef lipid spectra. 817 818 Parameters 819 ---------- 820 mz_list : list 821 List of precursor m/z values. 822 polarity : str 823 Ionization polarity, either "positive" or "negative". 824 mz_tol_ppm : float 825 Tolerance in ppm for each precursor m/z value. 826 827 Returns 828 ------- 829 download_id : str 830 Download ID for the lipid library query. 831 832 Raises 833 ------ 834 ValueError 835 If any input parameter is invalid. 836 If no download ID is returned. 837 """ 838 url = self.LIPID_LIBRARY_URL 839 840 headers = { 841 'accept': '*/*', 842 'Content-Type': 'application/json' 843 } 844 845 payload = { 846 "tolerance_ppm": mz_tol_ppm, 847 "polarity": polarity, 848 "mz_list": list(set(np.sort(mz_list))) 849 } 850 851 try: 852 response = requests.post(url, headers=headers, json=payload) 853 response.raise_for_status() # Raises an HTTPError for bad responses 854 text = response.text.strip() 855 # Drop everything before the final space 856 if not text: 857 raise ValueError("Empty response from MetabRef lipid library API.") 858 if " " in text: 859 text = text.rsplit(" ", 1)[-1] 860 return text 861 else: 862 raise ValueError("Unexpected response format from MetabRef lipid library API.") 863 except requests.exceptions.RequestException as e: 864 raise ValueError(f"Error querying MetabRef lipid library: {e}") 865 866 def get_lipid_data(self, job_id, attempts=10, delay=5): 867 """ 868 Get download content from lipid library query from MetabRef using job ID. 869 870 Parameters 871 ---------- 872 job_id : str 873 Job ID for the lipid library query. 874 Retrieved from the post_lipid_query method. 875 attempts : int, optional 876 Number of attempts to retrieve the data. Default is 10. 877 delay : int, optional 878 Delay in seconds between attempts. Default is 5. 879 880 Returns 881 ------- 882 str 883 Download content from the lipid library query. 884 885 Raises 886 ------ 887 ValueError 888 If no download content is returned. 889 """ 890 url = f"https://metabref.emsl.pnnl.gov/api/lipid/data/download/{job_id}" 891 892 # Check the response, if it's 400, try again in 5 seconds. 
Try up to 10 times 893 for attempt in range(attempts): 894 try: 895 response = requests.get(url) 896 response.raise_for_status() # Raises an HTTPError for bad responses 897 if response.status_code == 200: 898 if response.content == b"Job still running": 899 if attempt < attempts - 1: 900 time.sleep(delay) 901 continue 902 else: 903 lib = response.content 904 return lib.decode('utf-8') if isinstance(lib, bytes) else lib 905 elif response.status_code == 400: 906 if attempt < attempts - 1: 907 time.sleep(delay) # Wait before retrying 908 continue 909 else: 910 raise ValueError("Job ID not found or job is still processing.") 911 except requests.exceptions.RequestException as e: 912 if attempt < attempts - 1: 913 time.sleep(delay) 914 continue 915 else: 916 raise ValueError(f"Error retrieving lipid library job: {e}") 917 918 def get_lipid_library( 919 self, 920 mz_list, 921 polarity, 922 mz_tol_ppm, 923 mz_tol_da_api=None, 924 format="json", 925 normalize=True, 926 fe_kwargs={}, 927 api_delay=5, 928 api_attempts=10, 929 ): 930 """ 931 Request MetabRef lipid library. 932 933 Parameters 934 ---------- 935 mz_list : list 936 List of precursor m/z values. 937 polarity : str 938 Ionization polarity, either "positive" or "negative". 939 mz_tol_ppm : float 940 Tolerance in ppm for each precursor m/z value. 941 Used for retrieving from a potential match from database. 942 mz_tol_da_api : float, optional 943 DEPRECATED. No longer used, but kept for backwards compatibility. 944 format : str, optional 945 Format of requested library, i.e. "json", "sql", "flashentropy". 946 See `available_formats` method for aliases. Default is "json". 947 normalize : bool, optional 948 Normalize the spectrum by its magnitude. Default is True. 949 fe_kwargs : dict, optional 950 Keyword arguments for FlashEntropy search. Default is {}. 951 api_delay : int, optional 952 Delay in seconds between API attempts. Default is 5. 953 api_attempts : int, optional 954 Number of attempts to retrieve the data from the API. Default is 10. 955 956 Returns 957 ------- 958 tuple 959 Library in requested format and lipid metadata as a LipidMetadata dataclass. 
960 961 """ 962 # Check for valid types in mz_list, polarity, and mz_tol_ppm 963 if not isinstance(mz_list, (list, np.ndarray)): 964 raise ValueError("mz_list must be a list or numpy array") 965 if not all(isinstance(mz, (float, int)) for mz in mz_list): 966 raise ValueError("All elements in mz_list must be float or int") 967 if not isinstance(polarity, str): 968 raise ValueError("polarity must be a string") 969 if not isinstance(mz_tol_ppm, (float, int)): 970 raise ValueError("mz_tol_ppm must be a float or int") 971 972 job_id = self.post_lipid_query( 973 mz_list=mz_list, 974 polarity=polarity, 975 mz_tol_ppm=mz_tol_ppm, 976 ) 977 978 lib = self.get_lipid_data( 979 job_id=job_id, 980 attempts=api_attempts, 981 delay=api_delay, 982 ) 983 lib = json.loads(lib) 984 985 # Pull out lipid metadata from the metabref library and convert to LipidMetadata dataclass 986 mol_data_dict = lib['molecular_data'] 987 mol_data_dict = { 988 int(k): self._dict_to_dataclass(v, LipidMetadata) 989 for k, v in mol_data_dict.items() 990 } 991 992 # Remove lipid metadata from the metabref library 993 lib = lib['mass_spectrum_data'] 994 # Unpack the 'Lipid Fragments' key and the 'MSO Data" key from each entry 995 for x in lib: 996 if "Lipid Fragments" in x.keys(): 997 x.update(x.pop("Lipid Fragments")) 998 if "MSO Data" in x.keys(): 999 x.update(x.pop("MSO Data")) 1000 1001 # Format the spectral library 1002 format_func = self._get_format_func(format) 1003 lib = format_func(lib, normalize=normalize, fe_kwargs=fe_kwargs) 1004 return (lib, mol_data_dict) 1005 1006 1007class MSPInterface(SpectralDatabaseInterface): 1008 """ 1009 Interface to parse NIST MSP files 1010 """ 1011 1012 def __init__(self, file_path): 1013 """ 1014 Initialize instance. 1015 1016 Parameters 1017 ---------- 1018 file_path : str 1019 Path to a local MSP file. 1020 1021 Attributes 1022 ---------- 1023 file_path : str 1024 Path to the MSP file. 1025 _file_content : str 1026 Content of the MSP file. 1027 _data_frame : :obj:`~pandas.DataFrame` 1028 DataFrame of spectra from the MSP file with unaltered content. 1029 """ 1030 super().__init__(key=None) 1031 1032 self.file_path = file_path 1033 if not os.path.exists(self.file_path): 1034 raise FileNotFoundError( 1035 f"File {self.file_path} does not exist. Please check the file path." 1036 ) 1037 with open(self.file_path, "r") as f: 1038 self._file_content = f.read() 1039 1040 self._data_frame = self._read_msp_file() 1041 self.__init_format_map__() 1042 1043 def __init_format_map__(self): 1044 """ 1045 Initialize database format mapper, enabling multiple format requests. 1046 1047 """ 1048 1049 # x is a pandas dataframe similar to self._data_frame format 1050 # Define format workflows 1051 self.format_map = { 1052 "msp": lambda x, normalize, fe_kwargs: self._to_msp(x, normalize), 1053 "flashentropy": lambda x, normalize, fe_kwargs: self._to_flashentropy( 1054 x, normalize, fe_kwargs 1055 ), 1056 "df": lambda x, normalize, fe_kwargs: self._to_df(x, normalize), 1057 } 1058 1059 # Add aliases 1060 self.format_map["fe"] = self.format_map["flashentropy"] 1061 self.format_map["flash-entropy"] = self.format_map["flashentropy"] 1062 self.format_map["dataframe"] = self.format_map["df"] 1063 self.format_map["data-frame"] = self.format_map["df"] 1064 1065 def _read_msp_file(self): 1066 """ 1067 Reads the MSP files into the pandas dataframe, and sort/remove zero intensity ions in MS/MS spectra. 
1068 1069 Returns 1070 ------- 1071 :obj:`~pandas.DataFrame` 1072 DataFrame of spectra from the MSP file, exacly as it is in the file (no sorting, filtering etc) 1073 """ 1074 # If input_dataframe is provided, return it it 1075 spectra = [] 1076 spectrum = {} 1077 1078 f = StringIO(self._file_content) 1079 for line in f: 1080 line = line.strip() 1081 if not line: 1082 continue # Skip empty lines 1083 1084 # Handle metadata 1085 if ":" in line: 1086 key, value = line.split(":", 1) 1087 key = key.strip().lower() 1088 value = value.strip() 1089 1090 if key == "name": 1091 # Save current spectrum and start a new one 1092 if spectrum: 1093 spectra.append(spectrum) 1094 spectrum = {"name": value, "peaks": []} 1095 else: 1096 spectrum[key] = value 1097 1098 # Handle peak data (assumed to start with a number) 1099 elif line[0].isdigit(): 1100 peaks = line.split() 1101 m_z = float(peaks[0]) 1102 intensity = float(peaks[1]) 1103 spectrum["peaks"].append(([m_z, intensity])) 1104 # Save the last spectrum 1105 if spectrum: 1106 spectra.append(spectrum) 1107 1108 df = pd.DataFrame(spectra) 1109 for column in df.columns: 1110 if column != "peaks": # Skip 'peaks' column 1111 try: 1112 df[column] = pd.to_numeric(df[column], errors="raise") 1113 except: 1114 pass 1115 return df 1116 1117 def _to_df(self, input_dataframe, normalize=True): 1118 """ 1119 Convert MSP-derived library to FlashEntropy library. 1120 1121 Parameters 1122 ---------- 1123 input_dataframe : :obj:`~pandas.DataFrame` 1124 Input DataFrame containing MSP-formatted spectra. 1125 normalize : bool, optional 1126 Normalize each spectrum by its magnitude. 1127 Default is True. 1128 1129 Returns 1130 ------- 1131 :obj:`~pandas.DataFrame` 1132 DataFrame of with desired normalization 1133 """ 1134 if not normalize: 1135 return input_dataframe 1136 else: 1137 # Convert to dictionary 1138 db_dict = input_dataframe.to_dict(orient="records") 1139 1140 # Initialize empty library 1141 lib = [] 1142 1143 # Enumerate spectra 1144 for i, source in enumerate(db_dict): 1145 spectrum = source 1146 # Check that spectrum["peaks"] exists 1147 if "peaks" not in spectrum.keys(): 1148 raise KeyError( 1149 "MSP not interpretted correctly, 'peaks' key not found in spectrum, check _dataframe attribute." 1150 ) 1151 1152 # Convert spectrum["peaks"] to numpy array 1153 if not isinstance(spectrum["peaks"], np.ndarray): 1154 spectrum["peaks"] = np.array(spectrum["peaks"]) 1155 1156 # Normalize peaks, if requested 1157 if normalize: 1158 spectrum["peaks"] = self.normalize_peaks(spectrum["peaks"]) 1159 spectrum["num peaks"] = len(spectrum["peaks"]) 1160 1161 # Add spectrum to library 1162 lib.append(spectrum) 1163 1164 # Convert to DataFrame 1165 df = pd.DataFrame(lib) 1166 return df 1167 1168 def _to_flashentropy(self, input_dataframe, normalize=True, fe_kwargs={}): 1169 """ 1170 Convert MSP-derived library to FlashEntropy library. 1171 1172 Parameters 1173 ---------- 1174 input_dataframe : :obj:`~pandas.DataFrame` 1175 Input DataFrame containing MSP-formatted spectra. 1176 normalize : bool 1177 Normalize each spectrum by its magnitude. 1178 fe_kwargs : dict, optional 1179 Keyword arguments for instantiation of FlashEntropy search and building index for FlashEntropy search; 1180 any keys not recognized will be ignored. By default, all parameters set to defaults. 1181 1182 Returns 1183 ------- 1184 :obj:`~ms_entropy.FlashEntropySearch` 1185 MS2 library as FlashEntropy search instance. 
1186 1187 Raises 1188 ------ 1189 ValueError 1190 If "min_ms2_difference_in_da" or "max_ms2_tolerance_in_da" are present in `fe_kwargs` and they 1191 """ 1192 self._check_flash_entropy_kwargs(fe_kwargs) 1193 1194 db_df = input_dataframe 1195 1196 # Convert to dictionary 1197 db_dict = db_df.to_dict(orient="records") 1198 1199 # Initialize empty library 1200 fe_lib = [] 1201 1202 # Enumerate spectra 1203 for i, source in enumerate(db_dict): 1204 # Reorganize source dict, if necessary 1205 if "spectrum_data" in source.keys(): 1206 spectrum = source["spectrum_data"] 1207 else: 1208 spectrum = source 1209 1210 # Rename precursor_mz key for FlashEntropy 1211 if "precursor_mz" not in spectrum.keys(): 1212 if "precursormz" in spectrum: 1213 spectrum["precursor_mz"] = spectrum.pop("precursormz") 1214 elif "precursor_ion" in spectrum: 1215 spectrum["precursor_mz"] = spectrum.pop("precursor_ion") 1216 else: 1217 raise KeyError( 1218 "MSP must have either 'precursormz' or 'precursor_ion' key to be converted to FlashEntropy format." 1219 ) 1220 1221 # Check that spectrum["peaks"] exists 1222 if "peaks" not in spectrum.keys(): 1223 raise KeyError( 1224 "MSP not interpretted correctly, 'peaks' key not found in spectrum, check _dataframe attribute." 1225 ) 1226 1227 # Convert spectrum["peaks"] to numpy array 1228 if not isinstance(spectrum["peaks"], np.ndarray): 1229 spectrum["peaks"] = np.array(spectrum["peaks"]) 1230 1231 # Normalize peaks, if requested 1232 if normalize: 1233 spectrum["peaks"] = self.normalize_peaks(spectrum["peaks"]) 1234 1235 # Add spectrum to library 1236 fe_lib.append(spectrum) 1237 1238 # Build FlashEntropy index 1239 fe_search = self._build_flash_entropy_index(fe_lib, fe_kwargs=fe_kwargs) 1240 1241 return fe_search 1242 1243 def _check_msp_compatibility(self): 1244 """ 1245 Check if the MSP file is compatible with the get_metabolomics_spectra_library method and provide feedback if it is not. 1246 """ 1247 # Check polarity 1248 if ( 1249 "polarity" not in self._data_frame.columns 1250 and "ionmode" not in self._data_frame.columns 1251 ): 1252 raise ValueError( 1253 "Neither 'polarity' nor 'ionmode' columns found in the input MSP metadata. Please check the file." 1254 ) 1255 polarity_column = ( 1256 "polarity" if "polarity" in self._data_frame.columns else "ionmode" 1257 ) 1258 1259 # Check if polarity_column contents is either "positive" or "negative" 1260 if not all(self._data_frame[polarity_column].isin(["positive", "negative"])): 1261 raise ValueError( 1262 f"Input field on MSP '{polarity_column}' must contain only 'positive' or 'negative' values." 1263 ) 1264 1265 # Check if the MSP file contains the required columns for metabolite metadata 1266 # inchikey, by name, not null 1267 # either formula or molecular_formula, not null 1268 if not all(self._data_frame["inchikey"].notnull()): 1269 raise ValueError( 1270 "Input field on MSP 'inchikey' must contain only non-null values." 1271 ) 1272 if ( 1273 "formula" not in self._data_frame.columns 1274 and "molecular_formula" not in self._data_frame.columns 1275 ): 1276 raise ValueError( 1277 "Input field on MSP must contain either 'formula' or 'molecular_formula' columns." 1278 ) 1279 molecular_formula_column = ( 1280 "formula" if "formula" in self._data_frame.columns else "molecular_formula" 1281 ) 1282 if not all(self._data_frame[molecular_formula_column].notnull()): 1283 raise ValueError( 1284 f"Input field on MSP '{molecular_formula_column}' must contain only non-null values." 
1285 ) 1286 1287 def get_metabolomics_spectra_library( 1288 self, 1289 polarity, 1290 metabolite_metadata_mapping={}, 1291 format="fe", 1292 normalize=True, 1293 fe_kwargs={}, 1294 ): 1295 """ 1296 Prepare metabolomics spectra library and associated metabolite metadata 1297 1298 Note: this uses the inchikey as the index for the metabolite metadata dataframe and for connecting to the spectra, so it must be in the input 1299 1300 """ 1301 # Check if the MSP file is compatible with the get_metabolomics_spectra_library method 1302 self._check_msp_compatibility() 1303 1304 # Check if the polarity parameter is valid and if a polarity column exists in the dataframe 1305 if polarity not in ["positive", "negative"]: 1306 raise ValueError("Polarity must be 'positive' or 'negative'") 1307 polarity_column = ( 1308 "polarity" if "polarity" in self._data_frame.columns else "ionmode" 1309 ) 1310 1311 # Get a subset of the initial dataframea by polarity 1312 db_df = self._data_frame[self._data_frame[polarity_column] == polarity].copy() 1313 1314 # Rename the columns of the db_df to match the MetaboliteMetadata dataclass using the metabolite_metadata_mapping 1315 # If the mapping is not provided, use the default mapping 1316 if not metabolite_metadata_mapping: 1317 metabolite_metadata_mapping = { 1318 "chebi_id": "chebi", 1319 "kegg_id": "kegg", 1320 "refmet_name": "common_name", 1321 "molecular_formula": "formula", 1322 "gnps_spectra_id":"id", 1323 "precursormz": "precursor_mz", 1324 "precursortype":"ion_type" 1325 } 1326 db_df.rename(columns=metabolite_metadata_mapping, inplace=True) 1327 db_df["molecular_data_id"] = db_df["inchikey"] 1328 1329 1330 1331 # Check if the resulting dataframe has the required columns for the flash entropy search 1332 required_columns = ["molecular_data_id", "precursor_mz", "ion_type", "id"] 1333 for col in required_columns: 1334 if col not in db_df.columns: 1335 raise ValueError( 1336 f"Input field on MSP must contain '{col}' column for FlashEntropy search." 
1337 ) 1338 1339 # Pull out the metabolite metadata from the dataframe and put it into a different dataframe 1340 # First get a list of the possible attributes of the MetaboliteMetadata dataclass 1341 metabolite_metadata_keys = list(MetaboliteMetadata.__annotations__.keys()) 1342 # Replace id with molecular_data_id in metabolite_metadata_keys 1343 metabolite_metadata_keys = [ 1344 "molecular_data_id" if x == "id" else x for x in metabolite_metadata_keys 1345 ] 1346 metabolite_metadata_df = db_df[ 1347 db_df.columns[db_df.columns.isin(metabolite_metadata_keys)] 1348 ].copy() 1349 1350 # Make unique and recast the id column for metabolite metadata 1351 metabolite_metadata_df.drop_duplicates(subset=["molecular_data_id"], inplace=True) 1352 metabolite_metadata_df["id"] = metabolite_metadata_df["molecular_data_id"] 1353 1354 # Convert to a dictionary using the inchikey as the key 1355 metabolite_metadata_dict = metabolite_metadata_df.to_dict( 1356 orient="records" 1357 ) 1358 metabolite_metadata_dict = { 1359 v["id"]: self._dict_to_dataclass(v, MetaboliteMetadata) 1360 for v in metabolite_metadata_dict 1361 } 1362 1363 # Remove the metabolite metadata columns from the original dataframe 1364 for key in metabolite_metadata_keys: 1365 if key != "molecular_data_id": 1366 if key in db_df.columns: 1367 db_df.drop(columns=key, inplace=True) 1368 1369 # Format the spectral library 1370 format_func = self._get_format_func(format) 1371 lib = format_func(db_df, normalize=normalize, fe_kwargs=fe_kwargs) 1372 return (lib, metabolite_metadata_dict)
24class SpectralDatabaseInterface(ABC): 25 """ 26 Base class that facilitates connection to spectral reference databases, 27 such as EMSL's Metabolomics Reference Database (MetabRef). 28 29 """ 30 31 def __init__(self, key=None): 32 """ 33 Initialize instance. 34 35 Parameters 36 ---------- 37 key : str 38 Token key. 39 40 """ 41 42 self.key = key 43 44 def set_token(self, path): 45 """ 46 Set environment variable for MetabRef database token. 47 48 Parameters 49 ---------- 50 path : str 51 Path to token. 52 53 """ 54 55 # Read token from file 56 with open(path, "r", encoding="utf-8") as f: 57 token = f.readline().strip() 58 59 # Set environment variable 60 os.environ[self.key] = token 61 62 def get_token(self): 63 """ 64 Get environment variable for database token. 65 66 Returns 67 ------- 68 str 69 Token string. 70 71 """ 72 73 # Check for token 74 if self.key not in os.environ: 75 raise ValueError("Must set {} environment variable.".format(self.key)) 76 77 # Get token from environment variables 78 return os.environ.get(self.key) 79 80 def get_header(self): 81 """ 82 Access stored database token and prepare as header. 83 84 Returns 85 ------- 86 str 87 Header string. 88 89 """ 90 91 # Get token 92 token = self.get_token() 93 94 # Pad header information 95 header = {"Authorization": f"Bearer {token}", "Content-Type": "text/plain"} 96 97 return header 98 99 def get_query(self, url, use_header=True): 100 """ 101 Request payload from URL according to `get` protocol. 102 103 Parameters 104 ---------- 105 url : str 106 URL for request. 107 use_header: bool 108 Whether or not the query should include the header 109 110 Returns 111 ------- 112 dict 113 Response as JSON. 114 115 """ 116 117 # Query URL via `get` 118 if use_header: 119 response = requests.get(url, headers=self.get_header()) 120 else: 121 response = requests.get(url) 122 123 # Check response 124 response.raise_for_status() 125 126 # Return as JSON 127 return response.json() 128 129 def post_query(self, url, variable, values, tolerance): 130 """ 131 Request payload from URL according to `post` protocol. 132 133 Parameters 134 ---------- 135 url : str 136 URL for request. 137 variable : str 138 Variable to query. 139 values : str 140 Specific values of `variable` to query. 141 tolerance : str 142 Query tolerance relative to `values`. 143 144 Returns 145 ------- 146 dict 147 Response as JSON. 148 149 """ 150 151 # Coerce to string 152 if not isinstance(variable, str): 153 variable = str(variable).replace(" ", "") 154 155 if not isinstance(values, str): 156 values = str(values).replace(" ", "") 157 158 if not isinstance(tolerance, str): 159 tolerance = str(tolerance).replace(" ", "") 160 161 # Query URL via `post` 162 response = requests.post( 163 os.path.join(url, variable, tolerance), 164 data=values, 165 headers=self.get_header(), 166 ) 167 168 # Check response 169 response.raise_for_status() 170 171 # Return as JSON 172 return response.json() 173 174 def _check_flash_entropy_kwargs(self, fe_kwargs): 175 """ 176 Check FlashEntropy keyword arguments. 177 178 Parameters 179 ---------- 180 fe_kwargs : dict 181 Keyword arguments for FlashEntropy search. 182 183 184 Raises 185 ------ 186 ValueError 187 If "min_ms2_difference_in_da" or "max_ms2_tolerance_in_da" are present in `fe_kwargs` and they 188 are not equal. 
189 190 """ 191 # If "min_ms2_difference_in_da" in fe_kwargs, check that "max_ms2_tolerance_in_da" is also present and that min_ms2_difference_in_da = 2xmax_ms2_tolerance_in_da 192 if ( 193 "min_ms2_difference_in_da" in fe_kwargs 194 or "max_ms2_tolerance_in_da" in fe_kwargs 195 ): 196 if ( 197 "min_ms2_difference_in_da" not in fe_kwargs 198 or "max_ms2_tolerance_in_da" not in fe_kwargs 199 ): 200 raise ValueError( 201 "Both 'min_ms2_difference_in_da' and 'max_ms2_tolerance_in_da' must be specified." 202 ) 203 if ( 204 fe_kwargs["min_ms2_difference_in_da"] 205 != 2 * fe_kwargs["max_ms2_tolerance_in_da"] 206 ): 207 raise ValueError( 208 "The values of 'min_ms2_difference_in_da' must be exactly 2x 'max_ms2_tolerance_in_da'." 209 ) 210 211 def _get_format_func(self, format): 212 """ 213 Obtain format function by key. 214 215 Returns 216 ------- 217 func 218 Formatting function. 219 """ 220 221 if format.lower() in self.format_map.keys(): 222 return self.format_map[format.lower()] 223 224 raise ValueError(("{} not a supported format.").format(format)) 225 226 def _dict_to_dataclass(self, metabref_lib, data_class): 227 """ 228 Convert dictionary to dataclass. 229 230 Notes 231 ----- 232 This function will pull the attributes a dataclass and its parent class 233 and convert the dictionary to a dataclass instance with the appropriate 234 attributes. 235 236 Parameters 237 ---------- 238 data_class : :obj:`~dataclasses.dataclass` 239 Dataclass to convert to. 240 metabref_lib : dict 241 Metabref dictionary object to convert to dataclass. 242 243 Returns 244 ------- 245 :obj:`~dataclasses.dataclass` 246 Dataclass instance. 247 248 """ 249 250 # Get list of expected attributes of data_class 251 data_class_keys = list(data_class.__annotations__.keys()) 252 253 # Does the data_class inherit from another class, if so, get the attributes of the parent class as well 254 if len(data_class.__mro__) > 2: 255 parent_class_keys = list(data_class.__bases__[0].__annotations__.keys()) 256 data_class_keys = list(set(data_class_keys + parent_class_keys)) 257 258 # Remove keys that are not in the data_class from the input dictionary 259 input_dict = {k: v for k, v in metabref_lib.items() if k in data_class_keys} 260 261 # Add keys that are in the data class but not in the input dictionary as None 262 for key in data_class_keys: 263 if key not in input_dict.keys(): 264 input_dict[key] = None 265 return data_class(**input_dict) 266 267 @staticmethod 268 def normalize_peaks(arr): 269 """ 270 Normalize peaks in an array. 271 272 Parameters 273 ---------- 274 arr : :obj:`~numpy.array` 275 Array of shape (N, 2), with m/z in the first column and abundance in 276 the second. 277 278 Returns 279 ------- 280 :obj:`~numpy.array` 281 Normalized array of shape (N, 2), with m/z in the first column and 282 normalized abundance in the second. 283 """ 284 # Normalize the array 285 arr[:, -1] = arr[:, -1] / arr[:, -1].sum() 286 287 return arr 288 289 @staticmethod 290 def _build_flash_entropy_index(fe_lib, fe_kwargs={}, clean_spectra=True): 291 """ 292 Build FlashEntropy index. 293 294 Parameters 295 ---------- 296 fe_lib : list 297 List of spectra to build index from. Can be a list of dictionaries or 298 a FlashEntropy search instance. 299 fe_kwargs : dict, optional 300 Keyword arguments for FlashEntropy search. 301 clean_spectra : bool, optional 302 Clean spectra before building index. Default is True. 303 304 Returns 305 ------- 306 :obj:`~ms_entropy.FlashEntropySearch` 307 FlashEntropy search instance. 
308 309 """ 310 # Initialize FlashEntropy 311 fe_init_kws = [ 312 "max_ms2_tolerance_in_da", 313 "mz_index_step", 314 "low_memory", 315 "path_data", 316 ] 317 fe_init_kws = {k: v for k, v in fe_kwargs.items() if k in fe_init_kws} 318 fes = FlashEntropySearch(**fe_init_kws) 319 320 # Build FlashEntropy index 321 fe_index_kws = [ 322 "max_indexed_mz", 323 "precursor_ions_removal_da", 324 "noise_threshold", 325 "min_ms2_difference_in_da", 326 "max_peak_num", 327 ] 328 fe_index_kws = {k: v for k, v in fe_kwargs.items() if k in fe_index_kws} 329 fes.build_index(fe_lib, **fe_index_kws, clean_spectra=clean_spectra) 330 331 return fes
Base class that facilitates connection to spectral reference databases, such as EMSL's Metabolomics Reference Database (MetabRef).
31 def __init__(self, key=None): 32 """ 33 Initialize instance. 34 35 Parameters 36 ---------- 37 key : str 38 Token key. 39 40 """ 41 42 self.key = key
Initialize instance.
Parameters
- key (str): Token key.
44 def set_token(self, path): 45 """ 46 Set environment variable for MetabRef database token. 47 48 Parameters 49 ---------- 50 path : str 51 Path to token. 52 53 """ 54 55 # Read token from file 56 with open(path, "r", encoding="utf-8") as f: 57 token = f.readline().strip() 58 59 # Set environment variable 60 os.environ[self.key] = token
Set environment variable for MetabRef database token.
Parameters
- path (str): Path to token.
62 def get_token(self): 63 """ 64 Get environment variable for database token. 65 66 Returns 67 ------- 68 str 69 Token string. 70 71 """ 72 73 # Check for token 74 if self.key not in os.environ: 75 raise ValueError("Must set {} environment variable.".format(self.key)) 76 77 # Get token from environment variables 78 return os.environ.get(self.key)
Get environment variable for database token.
Returns
- str: Token string.
80 def get_header(self): 81 """ 82 Access stored database token and prepare as header. 83 84 Returns 85 ------- 86 str 87 Header string. 88 89 """ 90 91 # Get token 92 token = self.get_token() 93 94 # Pad header information 95 header = {"Authorization": f"Bearer {token}", "Content-Type": "text/plain"} 96 97 return header
Access stored database token and prepare as header.
Returns
- str: Header string.
99 def get_query(self, url, use_header=True): 100 """ 101 Request payload from URL according to `get` protocol. 102 103 Parameters 104 ---------- 105 url : str 106 URL for request. 107 use_header: bool 108 Whether or not the query should include the header 109 110 Returns 111 ------- 112 dict 113 Response as JSON. 114 115 """ 116 117 # Query URL via `get` 118 if use_header: 119 response = requests.get(url, headers=self.get_header()) 120 else: 121 response = requests.get(url) 122 123 # Check response 124 response.raise_for_status() 125 126 # Return as JSON 127 return response.json()
Request payload from URL according to get
protocol.
Parameters
- url (str): URL for request.
- use_header (bool): Whether or not the query should include the header
Returns
- dict: Response as JSON.
129 def post_query(self, url, variable, values, tolerance): 130 """ 131 Request payload from URL according to `post` protocol. 132 133 Parameters 134 ---------- 135 url : str 136 URL for request. 137 variable : str 138 Variable to query. 139 values : str 140 Specific values of `variable` to query. 141 tolerance : str 142 Query tolerance relative to `values`. 143 144 Returns 145 ------- 146 dict 147 Response as JSON. 148 149 """ 150 151 # Coerce to string 152 if not isinstance(variable, str): 153 variable = str(variable).replace(" ", "") 154 155 if not isinstance(values, str): 156 values = str(values).replace(" ", "") 157 158 if not isinstance(tolerance, str): 159 tolerance = str(tolerance).replace(" ", "") 160 161 # Query URL via `post` 162 response = requests.post( 163 os.path.join(url, variable, tolerance), 164 data=values, 165 headers=self.get_header(), 166 ) 167 168 # Check response 169 response.raise_for_status() 170 171 # Return as JSON 172 return response.json()
Request payload from URL according to post
protocol.
Parameters
- url (str): URL for request.
- variable (str): Variable to query.
- values (str):
Specific values of
variable
to query. - tolerance (str):
Query tolerance relative to
values
.
Returns
- dict: Response as JSON.
267 @staticmethod 268 def normalize_peaks(arr): 269 """ 270 Normalize peaks in an array. 271 272 Parameters 273 ---------- 274 arr : :obj:`~numpy.array` 275 Array of shape (N, 2), with m/z in the first column and abundance in 276 the second. 277 278 Returns 279 ------- 280 :obj:`~numpy.array` 281 Normalized array of shape (N, 2), with m/z in the first column and 282 normalized abundance in the second. 283 """ 284 # Normalize the array 285 arr[:, -1] = arr[:, -1] / arr[:, -1].sum() 286 287 return arr
Normalize peaks in an array.
Parameters
- arr (
~numpy.array
): Array of shape (N, 2), with m/z in the first column and abundance in the second.
Returns
~numpy.array
: Normalized array of shape (N, 2), with m/z in the first column and normalized abundance in the second.
334class MetabRefInterface(SpectralDatabaseInterface): 335 """ 336 Interface to the Metabolomics Reference Database. 337 """ 338 339 def __init__(self): 340 """ 341 Initialize instance. 342 343 """ 344 345 super().__init__(key=None) 346 347 def spectrum_to_array(self, spectrum, normalize=True): 348 """ 349 Convert MetabRef-formatted spectrum to array. 350 351 Parameters 352 ---------- 353 spectrum : str 354 MetabRef spectrum, i.e. list of (m/z,abundance) pairs. 355 normalize : bool 356 Normalize the spectrum by its magnitude. 357 358 Returns 359 ------- 360 :obj:`~numpy.array` 361 Array of shape (N, 2), with m/z in the first column and abundance in 362 the second. 363 364 """ 365 366 # Convert parenthesis-delimited string to array 367 arr = np.array( 368 re.findall(r"\(([^,]+),([^)]+)\)", spectrum), dtype=float 369 ).reshape(-1, 2) 370 371 if normalize: 372 arr = self.normalize_peaks(arr) 373 374 return arr 375 376 def _to_flashentropy(self, metabref_lib, normalize=True, fe_kwargs={}): 377 """ 378 Convert metabref-formatted library to FlashEntropy library. 379 380 Parameters 381 ---------- 382 metabref_lib : dict 383 MetabRef MS2 library in JSON format or FlashEntropy search instance (for reformatting at different MS2 separation). 384 normalize : bool 385 Normalize each spectrum by its magnitude. 386 fe_kwargs : dict, optional 387 Keyword arguments for instantiation of FlashEntropy search and building index for FlashEntropy search; 388 any keys not recognized will be ignored. By default, all parameters set to defaults. 389 390 Returns 391 ------- 392 :obj:`~ms_entropy.FlashEntropySearch` 393 MS2 library as FlashEntropy search instance. 394 395 Raises 396 ------ 397 ValueError 398 If "min_ms2_difference_in_da" or "max_ms2_tolerance_in_da" are present in `fe_kwargs` and they are not equal. 399 400 """ 401 self._check_flash_entropy_kwargs(fe_kwargs) 402 403 # Initialize empty library 404 fe_lib = [] 405 406 # Enumerate spectra 407 for i, source in enumerate(metabref_lib): 408 # Reorganize source dict, if necessary 409 if "spectrum_data" in source.keys(): 410 spectrum = source["spectrum_data"] 411 else: 412 spectrum = source 413 414 # Rename precursor_mz key for FlashEntropy 415 if "precursor_mz" not in spectrum.keys(): 416 spectrum["precursor_mz"] = spectrum.pop("precursor_ion") 417 418 # Convert CoreMS spectrum to array and clean, store as `peaks` 419 spectrum["peaks"] = self.spectrum_to_array( 420 spectrum["mz"], normalize=normalize 421 ) 422 423 # Add spectrum to library 424 fe_lib.append(spectrum) 425 426 # Build FlashEntropy index 427 fe_search = self._build_flash_entropy_index(fe_lib, fe_kwargs=fe_kwargs) 428 429 return fe_search 430 431 def get_query(self, url, use_header=False): 432 """Overwrites the get_query method on the parent class to default to not use a header 433 434 Notes 435 ----- 436 As of January 2025, the metabref database no longer requires a token and therefore no header is needed 437 438 """ 439 return super().get_query(url, use_header)
Interface to the Metabolomics Reference Database.
339 def __init__(self): 340 """ 341 Initialize instance. 342 343 """ 344 345 super().__init__(key=None)
Initialize instance.
347 def spectrum_to_array(self, spectrum, normalize=True): 348 """ 349 Convert MetabRef-formatted spectrum to array. 350 351 Parameters 352 ---------- 353 spectrum : str 354 MetabRef spectrum, i.e. list of (m/z,abundance) pairs. 355 normalize : bool 356 Normalize the spectrum by its magnitude. 357 358 Returns 359 ------- 360 :obj:`~numpy.array` 361 Array of shape (N, 2), with m/z in the first column and abundance in 362 the second. 363 364 """ 365 366 # Convert parenthesis-delimited string to array 367 arr = np.array( 368 re.findall(r"\(([^,]+),([^)]+)\)", spectrum), dtype=float 369 ).reshape(-1, 2) 370 371 if normalize: 372 arr = self.normalize_peaks(arr) 373 374 return arr
Convert MetabRef-formatted spectrum to array.
Parameters
- spectrum (str): MetabRef spectrum, i.e. list of (m/z,abundance) pairs.
- normalize (bool): Normalize the spectrum by its magnitude.
Returns
~numpy.array
: Array of shape (N, 2), with m/z in the first column and abundance in the second.
431 def get_query(self, url, use_header=False): 432 """Overwrites the get_query method on the parent class to default to not use a header 433 434 Notes 435 ----- 436 As of January 2025, the metabref database no longer requires a token and therefore no header is needed 437 438 """ 439 return super().get_query(url, use_header)
Overwrites the get_query method on the parent class to default to not use a header
Notes
As of January 2025, the metabref database no longer requires a token and therefore no header is needed
Inherited Members
442class MetabRefGCInterface(MetabRefInterface): 443 """ 444 Interface to the Metabolomics Reference Database. 445 """ 446 447 def __init__(self): 448 """ 449 Initialize instance. 450 451 """ 452 453 super().__init__() 454 self.GCMS_LIBRARY_URL = "https://metabref.emsl.pnnl.gov/api/mslevel/1" 455 self.FAMES_URL = "https://metabref.emsl.pnnl.gov/api/fames" 456 457 self.__init_format_map__() 458 459 def __init_format_map__(self): 460 """ 461 Initialize database format mapper, enabling multiple format requests. 462 463 """ 464 465 # Define format workflows 466 self.format_map = { 467 "json": lambda x, normalize, fe_kwargs: x, 468 "dict": lambda x, 469 normalize, 470 fe_kwargs: self._to_LowResolutionEICompound_dict(x, normalize), 471 "sql": lambda x, 472 normalize, 473 fe_kwargs: self._LowResolutionEICompound_dict_to_sqlite( 474 self._to_LowResolutionEICompound_dict(x, normalize) 475 ), 476 } 477 478 # Add aliases 479 self.format_map["metabref"] = self.format_map["json"] 480 self.format_map["datadict"] = self.format_map["dict"] 481 self.format_map["data-dict"] = self.format_map["dict"] 482 self.format_map["lowreseicompound"] = self.format_map["dict"] 483 self.format_map["lowres"] = self.format_map["dict"] 484 self.format_map["lowresgc"] = self.format_map["dict"] 485 self.format_map["sqlite"] = self.format_map["sql"] 486 487 def available_formats(self): 488 """ 489 View list of available formats. 490 491 Returns 492 ------- 493 list 494 Format map keys. 495 """ 496 497 return list(self.format_map.keys()) 498 499 def get_library(self, format="json", normalize=False): 500 """ 501 Request MetabRef GC/MS library. 502 503 Parameters 504 ---------- 505 format : str 506 Format of requested library, i.e. "json", "sql", "flashentropy". 507 See `available_formats` method for aliases. 508 normalize : bool 509 Normalize the spectrum by its magnitude. 510 511 Returns 512 ------- 513 Library in requested format. 514 515 """ 516 517 # Init format function 518 format_func = self._get_format_func(format) 519 520 return format_func( 521 self.get_query(self.GCMS_LIBRARY_URL)["GC-MS"], normalize, {} 522 ) 523 524 def get_fames(self, format="json", normalize=False): 525 """ 526 Request MetabRef GC/MS FAMEs library. 527 528 Parameters 529 ---------- 530 format : str 531 Format of requested library, i.e. "json", "sql", "flashentropy". 532 See `available_formats` method for aliases. 533 normalize : bool 534 Normalize the spectrum by its magnitude. 535 536 Returns 537 ------- 538 Library in requested format. 539 540 """ 541 542 # Init format function 543 format_func = self._get_format_func(format) 544 545 return format_func(self.get_query(self.FAMES_URL)["GC-MS"], normalize, {}) 546 547 def _to_LowResolutionEICompound_dict(self, metabref_lib, normalize=False): 548 """ 549 Convert MetabRef-formatted library to CoreMS LowResolutionEICompound-formatted 550 dictionary for local ingestion. 551 552 Parameters 553 ---------- 554 metabref_lib : dict 555 MetabRef GC-MS library in JSON format. 556 normalize : bool 557 Normalize each spectrum by its magnitude. 558 559 Returns 560 ------- 561 list of dict 562 List of each spectrum contained in dictionary. 563 564 """ 565 566 # All below key:value lookups are based on CoreMS class definitions 567 # NOT MetabRef content. For example, MetabRef has keys for PubChem, 568 # USI, etc. that are not considered below. 
569 570 # Dictionary to map metabref keys to corems keys 571 metadatar_cols = { 572 "casno": "cas", 573 "inchikey": "inchikey", 574 "inchi": "inchi", 575 "chebi": "chebi", 576 "smiles": "smiles", 577 "kegg": "kegg", 578 "iupac_name": "iupac_name", 579 "traditional_name": "traditional_name", # Not present in metabref 580 "common_name": "common_name", # Not present in metabref 581 } 582 583 # Dictionary to map metabref keys to corems keys 584 lowres_ei_compound_cols = { 585 "id": "metabref_id", 586 "molecule_name": "name", # Is this correct? 587 "classify": "classify", # Not present in metabref 588 "formula": "formula", 589 "ri": "ri", 590 "rt": "retention_time", 591 "source": "source", # Not present in metabref 592 "casno": "casno", 593 "comments": "comment", 594 "source_temp_c": "source_temp_c", # Not present in metabref 595 "ev": "ev", # Not present in metabref 596 "peak_count": "peaks_count", 597 "mz": "mz", 598 "abundance": "abundance", 599 } 600 601 # Local result container 602 corems_lib = [] 603 604 # Enumerate spectra 605 for i, source_ in enumerate(metabref_lib): 606 # Copy source to prevent modification 607 source = source_.copy() 608 609 # Flatten source dict 610 source = source.pop("spectrum_data") | source 611 612 # Parse target data 613 target = { 614 lowres_ei_compound_cols[k]: v 615 for k, v in source.items() 616 if k in lowres_ei_compound_cols 617 } 618 619 # Explicitly add this to connect with LowResCompoundRef later 620 target["rt"] = source["rt"] 621 622 # Parse (mz, abundance) 623 arr = self.spectrum_to_array(target["mz"], normalize=normalize) 624 target["mz"] = arr[:, 0] 625 target["abundance"] = arr[:, 1] 626 627 # Parse meta data 628 target["metadata"] = { 629 metadatar_cols[k]: v for k, v in source.items() if k in metadatar_cols 630 } 631 632 # Add anything else 633 for k in source: 634 if k not in lowres_ei_compound_cols: 635 target[k] = source[k] 636 637 # Add to CoreMS list 638 corems_lib.append(target) 639 640 return corems_lib 641 642 def _LowResolutionEICompound_dict_to_sqlite( 643 self, lowres_ei_compound_dict, url="sqlite://" 644 ): 645 """ 646 Convert CoreMS LowResolutionEICompound-formatted dictionary to SQLite 647 database for local ingestion. 648 649 Parameters 650 ---------- 651 lowres_ei_compound_dict : dict 652 CoreMS GC-MS library formatted for LowResolutionEICompound. 653 url : str 654 URL to SQLite prefix. 655 656 Returns 657 ------- 658 sqlite database 659 Spectra contained in SQLite database. 
660 661 """ 662 663 # Dictionary to map corems keys to all-caps keys 664 capped_cols = { 665 "name": "NAME", 666 "formula": "FORM", 667 "ri": "RI", 668 "retention_time": "RT", 669 "source": "SOURCE", 670 "casno": "CASNO", 671 "comment": "COMMENT", 672 "peaks_count": "NUM PEAKS", 673 } 674 675 # Initialize SQLite object 676 sqlite_obj = EI_LowRes_SQLite(url=url) 677 678 # Iterate spectra 679 for _data_dict in lowres_ei_compound_dict: 680 # Copy source to prevent modification 681 data_dict = _data_dict.copy() 682 683 # Add missing capped values 684 for k, v in capped_cols.items(): 685 # Key exists 686 if k in data_dict: 687 # # This will replace the key 688 # data_dict[v] = data_dict.pop(k) 689 690 # This will keep both keys 691 data_dict[v] = data_dict[k] 692 693 # Parse number of peaks 694 if not data_dict.get("NUM PEAKS"): 695 data_dict["NUM PEAKS"] = len(data_dict.get("mz")) 696 697 # Parse CAS number 698 if not data_dict.get("CASNO"): 699 data_dict["CASNO"] = data_dict.get("CAS") 700 701 if not data_dict["CASNO"]: 702 data_dict["CASNO"] = 0 703 704 # Build linked metadata table 705 if "metadata" in data_dict: 706 if len(data_dict["metadata"]) > 0: 707 data_dict["metadatar"] = Metadatar(**data_dict.pop("metadata")) 708 else: 709 data_dict.pop("metadata") 710 711 # Attempt addition to sqlite 712 try: 713 sqlite_obj.add_compound(data_dict) 714 except: 715 print(data_dict["NAME"]) 716 717 return sqlite_obj
Interface to the Metabolomics Reference Database.
447 def __init__(self): 448 """ 449 Initialize instance. 450 451 """ 452 453 super().__init__() 454 self.GCMS_LIBRARY_URL = "https://metabref.emsl.pnnl.gov/api/mslevel/1" 455 self.FAMES_URL = "https://metabref.emsl.pnnl.gov/api/fames" 456 457 self.__init_format_map__()
Initialize instance.
487 def available_formats(self): 488 """ 489 View list of available formats. 490 491 Returns 492 ------- 493 list 494 Format map keys. 495 """ 496 497 return list(self.format_map.keys())
View list of available formats.
Returns
- list: Format map keys.
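For illustration, a brief sketch of inspecting the accepted format strings before requesting the library; the printed list mirrors the format map and aliases defined in the source listing above:

from corems.molecular_id.search.database_interfaces import MetabRefGCInterface

metabref = MetabRefGCInterface()
print(metabref.available_formats())
# ['json', 'dict', 'sql', 'metabref', 'datadict', 'data-dict',
#  'lowreseicompound', 'lowres', 'lowresgc', 'sqlite']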
499 def get_library(self, format="json", normalize=False): 500 """ 501 Request MetabRef GC/MS library. 502 503 Parameters 504 ---------- 505 format : str 506 Format of requested library, i.e. "json", "sql", "flashentropy". 507 See `available_formats` method for aliases. 508 normalize : bool 509 Normalize the spectrum by its magnitude. 510 511 Returns 512 ------- 513 Library in requested format. 514 515 """ 516 517 # Init format function 518 format_func = self._get_format_func(format) 519 520 return format_func( 521 self.get_query(self.GCMS_LIBRARY_URL)["GC-MS"], normalize, {} 522 )
Request MetabRef GC/MS library.
Parameters
- format (str): Format of requested library, e.g. "json", "dict", "sql". See the available_formats method for aliases.
- normalize (bool): Normalize the spectrum by its magnitude.
Returns
- Library in requested format.
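A usage sketch (network access to metabref.emsl.pnnl.gov assumed): the library can be pulled as the raw JSON payload or converted to an in-memory SQLite database for local searching in CoreMS.

from corems.molecular_id.search.database_interfaces import MetabRefGCInterface

metabref = MetabRefGCInterface()
raw_records = metabref.get_library(format="json")               # records as returned by the API
sqlite_db = metabref.get_library(format="sql", normalize=True)  # EI_LowRes_SQLite object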
524 def get_fames(self, format="json", normalize=False): 525 """ 526 Request MetabRef GC/MS FAMEs library. 527 528 Parameters 529 ---------- 530 format : str 531 Format of requested library, i.e. "json", "sql", "flashentropy". 532 See `available_formats` method for aliases. 533 normalize : bool 534 Normalize the spectrum by its magnitude. 535 536 Returns 537 ------- 538 Library in requested format. 539 540 """ 541 542 # Init format function 543 format_func = self._get_format_func(format) 544 545 return format_func(self.get_query(self.FAMES_URL)["GC-MS"], normalize, {})
Request MetabRef GC/MS FAMEs library.
Parameters
- format (str): Format of requested library, e.g. "json", "dict", "sql". See the available_formats method for aliases.
- normalize (bool): Normalize the spectrum by its magnitude.
Returns
- Library in requested format.
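A similar sketch for the FAMEs library, which is typically used for retention-index calibration in GC-MS workflows (same assumptions as above):

from corems.molecular_id.search.database_interfaces import MetabRefGCInterface

metabref = MetabRefGCInterface()
fames_db = metabref.get_fames(format="sqlite", normalize=True)  # "sqlite" is an alias for "sql"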
720class MetabRefLCInterface(MetabRefInterface): 721 """ 722 Interface to the Metabolomics Reference Database for LC-MS data. 723 """ 724 725 def __init__(self): 726 """ 727 Initialize instance. 728 729 """ 730 731 super().__init__() 732 733 # API endpoint for precursor m/z search 734 # inputs = mz, tolerance (in Da), polarity, page_no, per_page 735 self.PRECURSOR_MZ_URL = "https://metabref.emsl.pnnl.gov/api/precursors/m/{}/t/{}/{}?page={}&per_page={}" 736 737 # API endpoint for returning full list of precursor m/z values in database 738 # inputs = polarity, page_no, per_page 739 self.PRECURSOR_MZ_ALL_URL = ( 740 "https://metabref.emsl.pnnl.gov/api/precursors/{}?page={}&per_page={}" 741 ) 742 743 # API endpoint for lipid data 744 self.LIPID_LIBRARY_URL = "https://metabref.emsl.pnnl.gov/api/lipid/data" 745 746 self.__init_format_map__() 747 748 def __init_format_map__(self): 749 """ 750 Initialize database format mapper, enabling multiple format requests. 751 752 """ 753 754 # Define format workflows 755 self.format_map = { 756 "json": lambda x, normalize, fe_kwargs: x, 757 "flashentropy": lambda x, normalize, fe_kwargs: self._to_flashentropy( 758 x, normalize, fe_kwargs 759 ), 760 } 761 762 # Add aliases 763 self.format_map["metabref"] = self.format_map["json"] 764 self.format_map["fe"] = self.format_map["flashentropy"] 765 self.format_map["flash-entropy"] = self.format_map["flashentropy"] 766 767 def query_by_precursor( 768 self, mz_list, polarity, mz_tol_ppm, mz_tol_da_api=0.2, max_per_page=50 769 ): 770 """ 771 Query MetabRef by precursor m/z values. 772 773 Parameters 774 ---------- 775 mz_list : list 776 List of precursor m/z values. 777 polarity : str 778 Ionization polarity, either "positive" or "negative". 779 mz_tol_ppm : float 780 Tolerance in ppm for each precursor m/z value. 781 Used for retrieving from a potential match from database. 782 mz_tol_da_api : float, optional 783 Maximum tolerance between precursor m/z values for API search, in daltons. 784 Used to group similar mzs into a single API query for speed. Default is 0.2. 785 max_per_page : int, optional 786 Maximum records to return from MetabRef API query at a time. Default is 50. 787 788 Returns 789 ------- 790 list 791 List of library entries in original JSON format. 792 """ 793 raise DeprecationWarning( 794 "query_by_precursor is deprecated. Use get_lipid_library instead." 795 ) 796 797 def request_all_precursors(self, polarity, per_page=50000): 798 """ 799 Request all precursor m/z values for MS2 spectra from MetabRef. 800 801 Parameters 802 ---------- 803 polarity : str 804 Ionization polarity, either "positive" or "negative". 805 per_page : int, optional 806 Number of records to fetch per call. Default is 50000 807 808 Returns 809 ------- 810 list 811 List of all precursor m/z values, sorted. 812 """ 813 raise DeprecationWarning("request_all_precursors is deprecated.") 814 815 def post_lipid_query(self, mz_list, polarity, mz_tol_ppm): 816 """ 817 Post query to get MetabRef lipid spectra. 818 819 Parameters 820 ---------- 821 mz_list : list 822 List of precursor m/z values. 823 polarity : str 824 Ionization polarity, either "positive" or "negative". 825 mz_tol_ppm : float 826 Tolerance in ppm for each precursor m/z value. 827 828 Returns 829 ------- 830 download_id : str 831 Download ID for the lipid library query. 832 833 Raises 834 ------ 835 ValueError 836 If any input parameter is invalid. 837 If no download ID is returned. 
838 """ 839 url = self.LIPID_LIBRARY_URL 840 841 headers = { 842 'accept': '*/*', 843 'Content-Type': 'application/json' 844 } 845 846 payload = { 847 "tolerance_ppm": mz_tol_ppm, 848 "polarity": polarity, 849 "mz_list": list(set(np.sort(mz_list))) 850 } 851 852 try: 853 response = requests.post(url, headers=headers, json=payload) 854 response.raise_for_status() # Raises an HTTPError for bad responses 855 text = response.text.strip() 856 # Drop everything before the final space 857 if not text: 858 raise ValueError("Empty response from MetabRef lipid library API.") 859 if " " in text: 860 text = text.rsplit(" ", 1)[-1] 861 return text 862 else: 863 raise ValueError("Unexpected response format from MetabRef lipid library API.") 864 except requests.exceptions.RequestException as e: 865 raise ValueError(f"Error querying MetabRef lipid library: {e}") 866 867 def get_lipid_data(self, job_id, attempts=10, delay=5): 868 """ 869 Get download content from lipid library query from MetabRef using job ID. 870 871 Parameters 872 ---------- 873 job_id : str 874 Job ID for the lipid library query. 875 Retrieved from the post_lipid_query method. 876 attempts : int, optional 877 Number of attempts to retrieve the data. Default is 10. 878 delay : int, optional 879 Delay in seconds between attempts. Default is 5. 880 881 Returns 882 ------- 883 str 884 Download content from the lipid library query. 885 886 Raises 887 ------ 888 ValueError 889 If no download content is returned. 890 """ 891 url = f"https://metabref.emsl.pnnl.gov/api/lipid/data/download/{job_id}" 892 893 # Check the response, if it's 400, try again in 5 seconds. Try up to 10 times 894 for attempt in range(attempts): 895 try: 896 response = requests.get(url) 897 response.raise_for_status() # Raises an HTTPError for bad responses 898 if response.status_code == 200: 899 if response.content == b"Job still running": 900 if attempt < attempts - 1: 901 time.sleep(delay) 902 continue 903 else: 904 lib = response.content 905 return lib.decode('utf-8') if isinstance(lib, bytes) else lib 906 elif response.status_code == 400: 907 if attempt < attempts - 1: 908 time.sleep(delay) # Wait before retrying 909 continue 910 else: 911 raise ValueError("Job ID not found or job is still processing.") 912 except requests.exceptions.RequestException as e: 913 if attempt < attempts - 1: 914 time.sleep(delay) 915 continue 916 else: 917 raise ValueError(f"Error retrieving lipid library job: {e}") 918 919 def get_lipid_library( 920 self, 921 mz_list, 922 polarity, 923 mz_tol_ppm, 924 mz_tol_da_api=None, 925 format="json", 926 normalize=True, 927 fe_kwargs={}, 928 api_delay=5, 929 api_attempts=10, 930 ): 931 """ 932 Request MetabRef lipid library. 933 934 Parameters 935 ---------- 936 mz_list : list 937 List of precursor m/z values. 938 polarity : str 939 Ionization polarity, either "positive" or "negative". 940 mz_tol_ppm : float 941 Tolerance in ppm for each precursor m/z value. 942 Used for retrieving from a potential match from database. 943 mz_tol_da_api : float, optional 944 DEPRECATED. No longer used, but kept for backwards compatibility. 945 format : str, optional 946 Format of requested library, i.e. "json", "sql", "flashentropy". 947 See `available_formats` method for aliases. Default is "json". 948 normalize : bool, optional 949 Normalize the spectrum by its magnitude. Default is True. 950 fe_kwargs : dict, optional 951 Keyword arguments for FlashEntropy search. Default is {}. 952 api_delay : int, optional 953 Delay in seconds between API attempts. 
Default is 5. 954 api_attempts : int, optional 955 Number of attempts to retrieve the data from the API. Default is 10. 956 957 Returns 958 ------- 959 tuple 960 Library in requested format and lipid metadata as a LipidMetadata dataclass. 961 962 """ 963 # Check for valid types in mz_list, polarity, and mz_tol_ppm 964 if not isinstance(mz_list, (list, np.ndarray)): 965 raise ValueError("mz_list must be a list or numpy array") 966 if not all(isinstance(mz, (float, int)) for mz in mz_list): 967 raise ValueError("All elements in mz_list must be float or int") 968 if not isinstance(polarity, str): 969 raise ValueError("polarity must be a string") 970 if not isinstance(mz_tol_ppm, (float, int)): 971 raise ValueError("mz_tol_ppm must be a float or int") 972 973 job_id = self.post_lipid_query( 974 mz_list=mz_list, 975 polarity=polarity, 976 mz_tol_ppm=mz_tol_ppm, 977 ) 978 979 lib = self.get_lipid_data( 980 job_id=job_id, 981 attempts=api_attempts, 982 delay=api_delay, 983 ) 984 lib = json.loads(lib) 985 986 # Pull out lipid metadata from the metabref library and convert to LipidMetadata dataclass 987 mol_data_dict = lib['molecular_data'] 988 mol_data_dict = { 989 int(k): self._dict_to_dataclass(v, LipidMetadata) 990 for k, v in mol_data_dict.items() 991 } 992 993 # Remove lipid metadata from the metabref library 994 lib = lib['mass_spectrum_data'] 995 # Unpack the 'Lipid Fragments' key and the 'MSO Data" key from each entry 996 for x in lib: 997 if "Lipid Fragments" in x.keys(): 998 x.update(x.pop("Lipid Fragments")) 999 if "MSO Data" in x.keys(): 1000 x.update(x.pop("MSO Data")) 1001 1002 # Format the spectral library 1003 format_func = self._get_format_func(format) 1004 lib = format_func(lib, normalize=normalize, fe_kwargs=fe_kwargs) 1005 return (lib, mol_data_dict)
Interface to the Metabolomics Reference Database for LC-MS data.
725 def __init__(self): 726 """ 727 Initialize instance. 728 729 """ 730 731 super().__init__() 732 733 # API endpoint for precursor m/z search 734 # inputs = mz, tolerance (in Da), polarity, page_no, per_page 735 self.PRECURSOR_MZ_URL = "https://metabref.emsl.pnnl.gov/api/precursors/m/{}/t/{}/{}?page={}&per_page={}" 736 737 # API endpoint for returning full list of precursor m/z values in database 738 # inputs = polarity, page_no, per_page 739 self.PRECURSOR_MZ_ALL_URL = ( 740 "https://metabref.emsl.pnnl.gov/api/precursors/{}?page={}&per_page={}" 741 ) 742 743 # API endpoint for lipid data 744 self.LIPID_LIBRARY_URL = "https://metabref.emsl.pnnl.gov/api/lipid/data" 745 746 self.__init_format_map__()
Initialize instance.
767 def query_by_precursor( 768 self, mz_list, polarity, mz_tol_ppm, mz_tol_da_api=0.2, max_per_page=50 769 ): 770 """ 771 Query MetabRef by precursor m/z values. 772 773 Parameters 774 ---------- 775 mz_list : list 776 List of precursor m/z values. 777 polarity : str 778 Ionization polarity, either "positive" or "negative". 779 mz_tol_ppm : float 780 Tolerance in ppm for each precursor m/z value. 781 Used for retrieving from a potential match from database. 782 mz_tol_da_api : float, optional 783 Maximum tolerance between precursor m/z values for API search, in daltons. 784 Used to group similar mzs into a single API query for speed. Default is 0.2. 785 max_per_page : int, optional 786 Maximum records to return from MetabRef API query at a time. Default is 50. 787 788 Returns 789 ------- 790 list 791 List of library entries in original JSON format. 792 """ 793 raise DeprecationWarning( 794 "query_by_precursor is deprecated. Use get_lipid_library instead." 795 )
Query MetabRef by precursor m/z values. Deprecated: use get_lipid_library instead.
Parameters
- mz_list (list): List of precursor m/z values.
- polarity (str): Ionization polarity, either "positive" or "negative".
- mz_tol_ppm (float): Tolerance in ppm for each precursor m/z value, used when retrieving potential matches from the database.
- mz_tol_da_api (float, optional): Maximum tolerance between precursor m/z values for API search, in daltons. Used to group similar mzs into a single API query for speed. Default is 0.2.
- max_per_page (int, optional): Maximum records to return from MetabRef API query at a time. Default is 50.
Returns
- list: List of library entries in original JSON format.
797 def request_all_precursors(self, polarity, per_page=50000): 798 """ 799 Request all precursor m/z values for MS2 spectra from MetabRef. 800 801 Parameters 802 ---------- 803 polarity : str 804 Ionization polarity, either "positive" or "negative". 805 per_page : int, optional 806 Number of records to fetch per call. Default is 50000 807 808 Returns 809 ------- 810 list 811 List of all precursor m/z values, sorted. 812 """ 813 raise DeprecationWarning("request_all_precursors is deprecated.")
Request all precursor m/z values for MS2 spectra from MetabRef. Deprecated.
Parameters
- polarity (str): Ionization polarity, either "positive" or "negative".
- per_page (int, optional): Number of records to fetch per call. Default is 50000.
Returns
- list: List of all precursor m/z values, sorted.
815 def post_lipid_query(self, mz_list, polarity, mz_tol_ppm): 816 """ 817 Post query to get MetabRef lipid spectra. 818 819 Parameters 820 ---------- 821 mz_list : list 822 List of precursor m/z values. 823 polarity : str 824 Ionization polarity, either "positive" or "negative". 825 mz_tol_ppm : float 826 Tolerance in ppm for each precursor m/z value. 827 828 Returns 829 ------- 830 download_id : str 831 Download ID for the lipid library query. 832 833 Raises 834 ------ 835 ValueError 836 If any input parameter is invalid. 837 If no download ID is returned. 838 """ 839 url = self.LIPID_LIBRARY_URL 840 841 headers = { 842 'accept': '*/*', 843 'Content-Type': 'application/json' 844 } 845 846 payload = { 847 "tolerance_ppm": mz_tol_ppm, 848 "polarity": polarity, 849 "mz_list": list(set(np.sort(mz_list))) 850 } 851 852 try: 853 response = requests.post(url, headers=headers, json=payload) 854 response.raise_for_status() # Raises an HTTPError for bad responses 855 text = response.text.strip() 856 # Drop everything before the final space 857 if not text: 858 raise ValueError("Empty response from MetabRef lipid library API.") 859 if " " in text: 860 text = text.rsplit(" ", 1)[-1] 861 return text 862 else: 863 raise ValueError("Unexpected response format from MetabRef lipid library API.") 864 except requests.exceptions.RequestException as e: 865 raise ValueError(f"Error querying MetabRef lipid library: {e}")
Post query to get MetabRef lipid spectra.
Parameters
- mz_list (list): List of precursor m/z values.
- polarity (str): Ionization polarity, either "positive" or "negative".
- mz_tol_ppm (float): Tolerance in ppm for each precursor m/z value.
Returns
- download_id (str): Download ID for the lipid library query.
Raises
- ValueError: If any input parameter is invalid. If no download ID is returned.
867 def get_lipid_data(self, job_id, attempts=10, delay=5): 868 """ 869 Get download content from lipid library query from MetabRef using job ID. 870 871 Parameters 872 ---------- 873 job_id : str 874 Job ID for the lipid library query. 875 Retrieved from the post_lipid_query method. 876 attempts : int, optional 877 Number of attempts to retrieve the data. Default is 10. 878 delay : int, optional 879 Delay in seconds between attempts. Default is 5. 880 881 Returns 882 ------- 883 str 884 Download content from the lipid library query. 885 886 Raises 887 ------ 888 ValueError 889 If no download content is returned. 890 """ 891 url = f"https://metabref.emsl.pnnl.gov/api/lipid/data/download/{job_id}" 892 893 # Check the response, if it's 400, try again in 5 seconds. Try up to 10 times 894 for attempt in range(attempts): 895 try: 896 response = requests.get(url) 897 response.raise_for_status() # Raises an HTTPError for bad responses 898 if response.status_code == 200: 899 if response.content == b"Job still running": 900 if attempt < attempts - 1: 901 time.sleep(delay) 902 continue 903 else: 904 lib = response.content 905 return lib.decode('utf-8') if isinstance(lib, bytes) else lib 906 elif response.status_code == 400: 907 if attempt < attempts - 1: 908 time.sleep(delay) # Wait before retrying 909 continue 910 else: 911 raise ValueError("Job ID not found or job is still processing.") 912 except requests.exceptions.RequestException as e: 913 if attempt < attempts - 1: 914 time.sleep(delay) 915 continue 916 else: 917 raise ValueError(f"Error retrieving lipid library job: {e}")
Retrieve the download content of a lipid library query from MetabRef using its job ID.
Parameters
- job_id (str): Job ID for the lipid library query. Retrieved from the post_lipid_query method.
- attempts (int, optional): Number of attempts to retrieve the data. Default is 10.
- delay (int, optional): Delay in seconds between attempts. Default is 5.
Returns
- str: Download content from the lipid library query.
Raises
- ValueError: If no download content is returned.
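A hedged two-step sketch of the query-and-download workflow implemented by post_lipid_query and get_lipid_data; the m/z values are illustrative, and get_lipid_library (documented below) wraps both calls.

import json
from corems.molecular_id.search.database_interfaces import MetabRefLCInterface

lc_db = MetabRefLCInterface()
job_id = lc_db.post_lipid_query(
    mz_list=[760.5851, 786.6007],  # hypothetical precursor m/z values
    polarity="positive",
    mz_tol_ppm=5,
)
lib_text = lc_db.get_lipid_data(job_id, attempts=10, delay=5)
lib = json.loads(lib_text)
spectra = lib["mass_spectrum_data"]     # list of spectrum records
lipid_metadata = lib["molecular_data"]  # per-lipid molecular metadata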
919 def get_lipid_library( 920 self, 921 mz_list, 922 polarity, 923 mz_tol_ppm, 924 mz_tol_da_api=None, 925 format="json", 926 normalize=True, 927 fe_kwargs={}, 928 api_delay=5, 929 api_attempts=10, 930 ): 931 """ 932 Request MetabRef lipid library. 933 934 Parameters 935 ---------- 936 mz_list : list 937 List of precursor m/z values. 938 polarity : str 939 Ionization polarity, either "positive" or "negative". 940 mz_tol_ppm : float 941 Tolerance in ppm for each precursor m/z value. 942 Used for retrieving from a potential match from database. 943 mz_tol_da_api : float, optional 944 DEPRECATED. No longer used, but kept for backwards compatibility. 945 format : str, optional 946 Format of requested library, i.e. "json", "sql", "flashentropy". 947 See `available_formats` method for aliases. Default is "json". 948 normalize : bool, optional 949 Normalize the spectrum by its magnitude. Default is True. 950 fe_kwargs : dict, optional 951 Keyword arguments for FlashEntropy search. Default is {}. 952 api_delay : int, optional 953 Delay in seconds between API attempts. Default is 5. 954 api_attempts : int, optional 955 Number of attempts to retrieve the data from the API. Default is 10. 956 957 Returns 958 ------- 959 tuple 960 Library in requested format and lipid metadata as a LipidMetadata dataclass. 961 962 """ 963 # Check for valid types in mz_list, polarity, and mz_tol_ppm 964 if not isinstance(mz_list, (list, np.ndarray)): 965 raise ValueError("mz_list must be a list or numpy array") 966 if not all(isinstance(mz, (float, int)) for mz in mz_list): 967 raise ValueError("All elements in mz_list must be float or int") 968 if not isinstance(polarity, str): 969 raise ValueError("polarity must be a string") 970 if not isinstance(mz_tol_ppm, (float, int)): 971 raise ValueError("mz_tol_ppm must be a float or int") 972 973 job_id = self.post_lipid_query( 974 mz_list=mz_list, 975 polarity=polarity, 976 mz_tol_ppm=mz_tol_ppm, 977 ) 978 979 lib = self.get_lipid_data( 980 job_id=job_id, 981 attempts=api_attempts, 982 delay=api_delay, 983 ) 984 lib = json.loads(lib) 985 986 # Pull out lipid metadata from the metabref library and convert to LipidMetadata dataclass 987 mol_data_dict = lib['molecular_data'] 988 mol_data_dict = { 989 int(k): self._dict_to_dataclass(v, LipidMetadata) 990 for k, v in mol_data_dict.items() 991 } 992 993 # Remove lipid metadata from the metabref library 994 lib = lib['mass_spectrum_data'] 995 # Unpack the 'Lipid Fragments' key and the 'MSO Data" key from each entry 996 for x in lib: 997 if "Lipid Fragments" in x.keys(): 998 x.update(x.pop("Lipid Fragments")) 999 if "MSO Data" in x.keys(): 1000 x.update(x.pop("MSO Data")) 1001 1002 # Format the spectral library 1003 format_func = self._get_format_func(format) 1004 lib = format_func(lib, normalize=normalize, fe_kwargs=fe_kwargs) 1005 return (lib, mol_data_dict)
Request MetabRef lipid library.
Parameters
- mz_list (list): List of precursor m/z values.
- polarity (str): Ionization polarity, either "positive" or "negative".
- mz_tol_ppm (float): Tolerance in ppm for each precursor m/z value, used when retrieving potential matches from the database.
- mz_tol_da_api (float, optional): DEPRECATED. No longer used, but kept for backwards compatibility.
- format (str, optional): Format of requested library, e.g. "json", "flashentropy". See the available_formats method for aliases. Default is "json".
- normalize (bool, optional): Normalize the spectrum by its magnitude. Default is True.
- fe_kwargs (dict, optional): Keyword arguments for FlashEntropy search. Default is {}.
- api_delay (int, optional): Delay in seconds between API attempts. Default is 5.
- api_attempts (int, optional): Number of attempts to retrieve the data from the API. Default is 10.
Returns
- tuple: Library in requested format and lipid metadata as a LipidMetadata dataclass.
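An end-to-end sketch using this single call: with format="flashentropy" the spectra come back as a FlashEntropySearch index, paired with a dict of LipidMetadata entries (m/z values are illustrative; network access assumed).

from corems.molecular_id.search.database_interfaces import MetabRefLCInterface

lc_db = MetabRefLCInterface()
fe_search, lipid_metadata = lc_db.get_lipid_library(
    mz_list=[760.5851, 786.6007],  # hypothetical precursor m/z values
    polarity="positive",
    mz_tol_ppm=5,
    format="flashentropy",
    normalize=True,
)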
1008class MSPInterface(SpectralDatabaseInterface): 1009 """ 1010 Interface to parse NIST MSP files 1011 """ 1012 1013 def __init__(self, file_path): 1014 """ 1015 Initialize instance. 1016 1017 Parameters 1018 ---------- 1019 file_path : str 1020 Path to a local MSP file. 1021 1022 Attributes 1023 ---------- 1024 file_path : str 1025 Path to the MSP file. 1026 _file_content : str 1027 Content of the MSP file. 1028 _data_frame : :obj:`~pandas.DataFrame` 1029 DataFrame of spectra from the MSP file with unaltered content. 1030 """ 1031 super().__init__(key=None) 1032 1033 self.file_path = file_path 1034 if not os.path.exists(self.file_path): 1035 raise FileNotFoundError( 1036 f"File {self.file_path} does not exist. Please check the file path." 1037 ) 1038 with open(self.file_path, "r") as f: 1039 self._file_content = f.read() 1040 1041 self._data_frame = self._read_msp_file() 1042 self.__init_format_map__() 1043 1044 def __init_format_map__(self): 1045 """ 1046 Initialize database format mapper, enabling multiple format requests. 1047 1048 """ 1049 1050 # x is a pandas dataframe similar to self._data_frame format 1051 # Define format workflows 1052 self.format_map = { 1053 "msp": lambda x, normalize, fe_kwargs: self._to_msp(x, normalize), 1054 "flashentropy": lambda x, normalize, fe_kwargs: self._to_flashentropy( 1055 x, normalize, fe_kwargs 1056 ), 1057 "df": lambda x, normalize, fe_kwargs: self._to_df(x, normalize), 1058 } 1059 1060 # Add aliases 1061 self.format_map["fe"] = self.format_map["flashentropy"] 1062 self.format_map["flash-entropy"] = self.format_map["flashentropy"] 1063 self.format_map["dataframe"] = self.format_map["df"] 1064 self.format_map["data-frame"] = self.format_map["df"] 1065 1066 def _read_msp_file(self): 1067 """ 1068 Reads the MSP files into the pandas dataframe, and sort/remove zero intensity ions in MS/MS spectra. 1069 1070 Returns 1071 ------- 1072 :obj:`~pandas.DataFrame` 1073 DataFrame of spectra from the MSP file, exacly as it is in the file (no sorting, filtering etc) 1074 """ 1075 # If input_dataframe is provided, return it it 1076 spectra = [] 1077 spectrum = {} 1078 1079 f = StringIO(self._file_content) 1080 for line in f: 1081 line = line.strip() 1082 if not line: 1083 continue # Skip empty lines 1084 1085 # Handle metadata 1086 if ":" in line: 1087 key, value = line.split(":", 1) 1088 key = key.strip().lower() 1089 value = value.strip() 1090 1091 if key == "name": 1092 # Save current spectrum and start a new one 1093 if spectrum: 1094 spectra.append(spectrum) 1095 spectrum = {"name": value, "peaks": []} 1096 else: 1097 spectrum[key] = value 1098 1099 # Handle peak data (assumed to start with a number) 1100 elif line[0].isdigit(): 1101 peaks = line.split() 1102 m_z = float(peaks[0]) 1103 intensity = float(peaks[1]) 1104 spectrum["peaks"].append(([m_z, intensity])) 1105 # Save the last spectrum 1106 if spectrum: 1107 spectra.append(spectrum) 1108 1109 df = pd.DataFrame(spectra) 1110 for column in df.columns: 1111 if column != "peaks": # Skip 'peaks' column 1112 try: 1113 df[column] = pd.to_numeric(df[column], errors="raise") 1114 except: 1115 pass 1116 return df 1117 1118 def _to_df(self, input_dataframe, normalize=True): 1119 """ 1120 Convert MSP-derived library to FlashEntropy library. 1121 1122 Parameters 1123 ---------- 1124 input_dataframe : :obj:`~pandas.DataFrame` 1125 Input DataFrame containing MSP-formatted spectra. 1126 normalize : bool, optional 1127 Normalize each spectrum by its magnitude. 1128 Default is True. 
1129 1130 Returns 1131 ------- 1132 :obj:`~pandas.DataFrame` 1133 DataFrame of with desired normalization 1134 """ 1135 if not normalize: 1136 return input_dataframe 1137 else: 1138 # Convert to dictionary 1139 db_dict = input_dataframe.to_dict(orient="records") 1140 1141 # Initialize empty library 1142 lib = [] 1143 1144 # Enumerate spectra 1145 for i, source in enumerate(db_dict): 1146 spectrum = source 1147 # Check that spectrum["peaks"] exists 1148 if "peaks" not in spectrum.keys(): 1149 raise KeyError( 1150 "MSP not interpretted correctly, 'peaks' key not found in spectrum, check _dataframe attribute." 1151 ) 1152 1153 # Convert spectrum["peaks"] to numpy array 1154 if not isinstance(spectrum["peaks"], np.ndarray): 1155 spectrum["peaks"] = np.array(spectrum["peaks"]) 1156 1157 # Normalize peaks, if requested 1158 if normalize: 1159 spectrum["peaks"] = self.normalize_peaks(spectrum["peaks"]) 1160 spectrum["num peaks"] = len(spectrum["peaks"]) 1161 1162 # Add spectrum to library 1163 lib.append(spectrum) 1164 1165 # Convert to DataFrame 1166 df = pd.DataFrame(lib) 1167 return df 1168 1169 def _to_flashentropy(self, input_dataframe, normalize=True, fe_kwargs={}): 1170 """ 1171 Convert MSP-derived library to FlashEntropy library. 1172 1173 Parameters 1174 ---------- 1175 input_dataframe : :obj:`~pandas.DataFrame` 1176 Input DataFrame containing MSP-formatted spectra. 1177 normalize : bool 1178 Normalize each spectrum by its magnitude. 1179 fe_kwargs : dict, optional 1180 Keyword arguments for instantiation of FlashEntropy search and building index for FlashEntropy search; 1181 any keys not recognized will be ignored. By default, all parameters set to defaults. 1182 1183 Returns 1184 ------- 1185 :obj:`~ms_entropy.FlashEntropySearch` 1186 MS2 library as FlashEntropy search instance. 1187 1188 Raises 1189 ------ 1190 ValueError 1191 If "min_ms2_difference_in_da" or "max_ms2_tolerance_in_da" are present in `fe_kwargs` and they 1192 """ 1193 self._check_flash_entropy_kwargs(fe_kwargs) 1194 1195 db_df = input_dataframe 1196 1197 # Convert to dictionary 1198 db_dict = db_df.to_dict(orient="records") 1199 1200 # Initialize empty library 1201 fe_lib = [] 1202 1203 # Enumerate spectra 1204 for i, source in enumerate(db_dict): 1205 # Reorganize source dict, if necessary 1206 if "spectrum_data" in source.keys(): 1207 spectrum = source["spectrum_data"] 1208 else: 1209 spectrum = source 1210 1211 # Rename precursor_mz key for FlashEntropy 1212 if "precursor_mz" not in spectrum.keys(): 1213 if "precursormz" in spectrum: 1214 spectrum["precursor_mz"] = spectrum.pop("precursormz") 1215 elif "precursor_ion" in spectrum: 1216 spectrum["precursor_mz"] = spectrum.pop("precursor_ion") 1217 else: 1218 raise KeyError( 1219 "MSP must have either 'precursormz' or 'precursor_ion' key to be converted to FlashEntropy format." 1220 ) 1221 1222 # Check that spectrum["peaks"] exists 1223 if "peaks" not in spectrum.keys(): 1224 raise KeyError( 1225 "MSP not interpretted correctly, 'peaks' key not found in spectrum, check _dataframe attribute." 
1226 ) 1227 1228 # Convert spectrum["peaks"] to numpy array 1229 if not isinstance(spectrum["peaks"], np.ndarray): 1230 spectrum["peaks"] = np.array(spectrum["peaks"]) 1231 1232 # Normalize peaks, if requested 1233 if normalize: 1234 spectrum["peaks"] = self.normalize_peaks(spectrum["peaks"]) 1235 1236 # Add spectrum to library 1237 fe_lib.append(spectrum) 1238 1239 # Build FlashEntropy index 1240 fe_search = self._build_flash_entropy_index(fe_lib, fe_kwargs=fe_kwargs) 1241 1242 return fe_search 1243 1244 def _check_msp_compatibility(self): 1245 """ 1246 Check if the MSP file is compatible with the get_metabolomics_spectra_library method and provide feedback if it is not. 1247 """ 1248 # Check polarity 1249 if ( 1250 "polarity" not in self._data_frame.columns 1251 and "ionmode" not in self._data_frame.columns 1252 ): 1253 raise ValueError( 1254 "Neither 'polarity' nor 'ionmode' columns found in the input MSP metadata. Please check the file." 1255 ) 1256 polarity_column = ( 1257 "polarity" if "polarity" in self._data_frame.columns else "ionmode" 1258 ) 1259 1260 # Check if polarity_column contents is either "positive" or "negative" 1261 if not all(self._data_frame[polarity_column].isin(["positive", "negative"])): 1262 raise ValueError( 1263 f"Input field on MSP '{polarity_column}' must contain only 'positive' or 'negative' values." 1264 ) 1265 1266 # Check if the MSP file contains the required columns for metabolite metadata 1267 # inchikey, by name, not null 1268 # either formula or molecular_formula, not null 1269 if not all(self._data_frame["inchikey"].notnull()): 1270 raise ValueError( 1271 "Input field on MSP 'inchikey' must contain only non-null values." 1272 ) 1273 if ( 1274 "formula" not in self._data_frame.columns 1275 and "molecular_formula" not in self._data_frame.columns 1276 ): 1277 raise ValueError( 1278 "Input field on MSP must contain either 'formula' or 'molecular_formula' columns." 1279 ) 1280 molecular_formula_column = ( 1281 "formula" if "formula" in self._data_frame.columns else "molecular_formula" 1282 ) 1283 if not all(self._data_frame[molecular_formula_column].notnull()): 1284 raise ValueError( 1285 f"Input field on MSP '{molecular_formula_column}' must contain only non-null values." 
1286 ) 1287 1288 def get_metabolomics_spectra_library( 1289 self, 1290 polarity, 1291 metabolite_metadata_mapping={}, 1292 format="fe", 1293 normalize=True, 1294 fe_kwargs={}, 1295 ): 1296 """ 1297 Prepare metabolomics spectra library and associated metabolite metadata 1298 1299 Note: this uses the inchikey as the index for the metabolite metadata dataframe and for connecting to the spectra, so it must be in the input 1300 1301 """ 1302 # Check if the MSP file is compatible with the get_metabolomics_spectra_library method 1303 self._check_msp_compatibility() 1304 1305 # Check if the polarity parameter is valid and if a polarity column exists in the dataframe 1306 if polarity not in ["positive", "negative"]: 1307 raise ValueError("Polarity must be 'positive' or 'negative'") 1308 polarity_column = ( 1309 "polarity" if "polarity" in self._data_frame.columns else "ionmode" 1310 ) 1311 1312 # Get a subset of the initial dataframea by polarity 1313 db_df = self._data_frame[self._data_frame[polarity_column] == polarity].copy() 1314 1315 # Rename the columns of the db_df to match the MetaboliteMetadata dataclass using the metabolite_metadata_mapping 1316 # If the mapping is not provided, use the default mapping 1317 if not metabolite_metadata_mapping: 1318 metabolite_metadata_mapping = { 1319 "chebi_id": "chebi", 1320 "kegg_id": "kegg", 1321 "refmet_name": "common_name", 1322 "molecular_formula": "formula", 1323 "gnps_spectra_id":"id", 1324 "precursormz": "precursor_mz", 1325 "precursortype":"ion_type" 1326 } 1327 db_df.rename(columns=metabolite_metadata_mapping, inplace=True) 1328 db_df["molecular_data_id"] = db_df["inchikey"] 1329 1330 1331 1332 # Check if the resulting dataframe has the required columns for the flash entropy search 1333 required_columns = ["molecular_data_id", "precursor_mz", "ion_type", "id"] 1334 for col in required_columns: 1335 if col not in db_df.columns: 1336 raise ValueError( 1337 f"Input field on MSP must contain '{col}' column for FlashEntropy search." 
1338 ) 1339 1340 # Pull out the metabolite metadata from the dataframe and put it into a different dataframe 1341 # First get a list of the possible attributes of the MetaboliteMetadata dataclass 1342 metabolite_metadata_keys = list(MetaboliteMetadata.__annotations__.keys()) 1343 # Replace id with molecular_data_id in metabolite_metadata_keys 1344 metabolite_metadata_keys = [ 1345 "molecular_data_id" if x == "id" else x for x in metabolite_metadata_keys 1346 ] 1347 metabolite_metadata_df = db_df[ 1348 db_df.columns[db_df.columns.isin(metabolite_metadata_keys)] 1349 ].copy() 1350 1351 # Make unique and recast the id column for metabolite metadata 1352 metabolite_metadata_df.drop_duplicates(subset=["molecular_data_id"], inplace=True) 1353 metabolite_metadata_df["id"] = metabolite_metadata_df["molecular_data_id"] 1354 1355 # Convert to a dictionary using the inchikey as the key 1356 metabolite_metadata_dict = metabolite_metadata_df.to_dict( 1357 orient="records" 1358 ) 1359 metabolite_metadata_dict = { 1360 v["id"]: self._dict_to_dataclass(v, MetaboliteMetadata) 1361 for v in metabolite_metadata_dict 1362 } 1363 1364 # Remove the metabolite metadata columns from the original dataframe 1365 for key in metabolite_metadata_keys: 1366 if key != "molecular_data_id": 1367 if key in db_df.columns: 1368 db_df.drop(columns=key, inplace=True) 1369 1370 # Format the spectral library 1371 format_func = self._get_format_func(format) 1372 lib = format_func(db_df, normalize=normalize, fe_kwargs=fe_kwargs) 1373 return (lib, metabolite_metadata_dict)
Interface to parse NIST MSP files.
1013 def __init__(self, file_path): 1014 """ 1015 Initialize instance. 1016 1017 Parameters 1018 ---------- 1019 file_path : str 1020 Path to a local MSP file. 1021 1022 Attributes 1023 ---------- 1024 file_path : str 1025 Path to the MSP file. 1026 _file_content : str 1027 Content of the MSP file. 1028 _data_frame : :obj:`~pandas.DataFrame` 1029 DataFrame of spectra from the MSP file with unaltered content. 1030 """ 1031 super().__init__(key=None) 1032 1033 self.file_path = file_path 1034 if not os.path.exists(self.file_path): 1035 raise FileNotFoundError( 1036 f"File {self.file_path} does not exist. Please check the file path." 1037 ) 1038 with open(self.file_path, "r") as f: 1039 self._file_content = f.read() 1040 1041 self._data_frame = self._read_msp_file() 1042 self.__init_format_map__()
Initialize instance.
Parameters
- file_path (str): Path to a local MSP file.
Attributes
- file_path (str): Path to the MSP file.
- _file_content (str): Content of the MSP file.
- _data_frame (pandas.DataFrame): DataFrame of spectra from the MSP file with unaltered content.
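A construction sketch (the file path is hypothetical): the parsed but otherwise unaltered spectra are kept on the private _data_frame attribute.

from corems.molecular_id.search.database_interfaces import MSPInterface

msp_db = MSPInterface("reference_spectra.msp")  # hypothetical local MSP file
print(msp_db._data_frame.head())                # one row per spectrum, with a 'peaks' column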
1288 def get_metabolomics_spectra_library( 1289 self, 1290 polarity, 1291 metabolite_metadata_mapping={}, 1292 format="fe", 1293 normalize=True, 1294 fe_kwargs={}, 1295 ): 1296 """ 1297 Prepare metabolomics spectra library and associated metabolite metadata 1298 1299 Note: this uses the inchikey as the index for the metabolite metadata dataframe and for connecting to the spectra, so it must be in the input 1300 1301 """ 1302 # Check if the MSP file is compatible with the get_metabolomics_spectra_library method 1303 self._check_msp_compatibility() 1304 1305 # Check if the polarity parameter is valid and if a polarity column exists in the dataframe 1306 if polarity not in ["positive", "negative"]: 1307 raise ValueError("Polarity must be 'positive' or 'negative'") 1308 polarity_column = ( 1309 "polarity" if "polarity" in self._data_frame.columns else "ionmode" 1310 ) 1311 1312 # Get a subset of the initial dataframea by polarity 1313 db_df = self._data_frame[self._data_frame[polarity_column] == polarity].copy() 1314 1315 # Rename the columns of the db_df to match the MetaboliteMetadata dataclass using the metabolite_metadata_mapping 1316 # If the mapping is not provided, use the default mapping 1317 if not metabolite_metadata_mapping: 1318 metabolite_metadata_mapping = { 1319 "chebi_id": "chebi", 1320 "kegg_id": "kegg", 1321 "refmet_name": "common_name", 1322 "molecular_formula": "formula", 1323 "gnps_spectra_id":"id", 1324 "precursormz": "precursor_mz", 1325 "precursortype":"ion_type" 1326 } 1327 db_df.rename(columns=metabolite_metadata_mapping, inplace=True) 1328 db_df["molecular_data_id"] = db_df["inchikey"] 1329 1330 1331 1332 # Check if the resulting dataframe has the required columns for the flash entropy search 1333 required_columns = ["molecular_data_id", "precursor_mz", "ion_type", "id"] 1334 for col in required_columns: 1335 if col not in db_df.columns: 1336 raise ValueError( 1337 f"Input field on MSP must contain '{col}' column for FlashEntropy search." 1338 ) 1339 1340 # Pull out the metabolite metadata from the dataframe and put it into a different dataframe 1341 # First get a list of the possible attributes of the MetaboliteMetadata dataclass 1342 metabolite_metadata_keys = list(MetaboliteMetadata.__annotations__.keys()) 1343 # Replace id with molecular_data_id in metabolite_metadata_keys 1344 metabolite_metadata_keys = [ 1345 "molecular_data_id" if x == "id" else x for x in metabolite_metadata_keys 1346 ] 1347 metabolite_metadata_df = db_df[ 1348 db_df.columns[db_df.columns.isin(metabolite_metadata_keys)] 1349 ].copy() 1350 1351 # Make unique and recast the id column for metabolite metadata 1352 metabolite_metadata_df.drop_duplicates(subset=["molecular_data_id"], inplace=True) 1353 metabolite_metadata_df["id"] = metabolite_metadata_df["molecular_data_id"] 1354 1355 # Convert to a dictionary using the inchikey as the key 1356 metabolite_metadata_dict = metabolite_metadata_df.to_dict( 1357 orient="records" 1358 ) 1359 metabolite_metadata_dict = { 1360 v["id"]: self._dict_to_dataclass(v, MetaboliteMetadata) 1361 for v in metabolite_metadata_dict 1362 } 1363 1364 # Remove the metabolite metadata columns from the original dataframe 1365 for key in metabolite_metadata_keys: 1366 if key != "molecular_data_id": 1367 if key in db_df.columns: 1368 db_df.drop(columns=key, inplace=True) 1369 1370 # Format the spectral library 1371 format_func = self._get_format_func(format) 1372 lib = format_func(db_df, normalize=normalize, fe_kwargs=fe_kwargs) 1373 return (lib, metabolite_metadata_dict)
Prepare a metabolomics spectra library and the associated metabolite metadata.
Note: the InChIKey is used as the index of the metabolite metadata dataframe and to link that metadata to the spectra, so it must be present in the input MSP file.
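A usage sketch under the same hypothetical file path: the call returns the spectral library in the requested format together with a dict of MetaboliteMetadata entries keyed by InChIKey.

from corems.molecular_id.search.database_interfaces import MSPInterface

msp_db = MSPInterface("reference_spectra.msp")  # hypothetical local MSP file
fe_search, metabolite_metadata = msp_db.get_metabolomics_spectra_library(
    polarity="positive",
    format="fe",       # alias for "flashentropy"
    normalize=True,
)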