"""corems.molecular_id.search.database_interfaces

Interfaces to spectral reference databases: EMSL's Metabolomics Reference
Database (MetabRef, GC-MS and LC-MS endpoints) and local NIST-style MSP files.
"""

import os
import re
from abc import ABC
from io import StringIO
from pathlib import Path

import numpy as np
import requests
import pandas as pd
from ms_entropy import FlashEntropySearch

from corems.molecular_id.factory.EI_SQL import (
    EI_LowRes_SQLite,
    Metadatar,
    MetaboliteMetadata,
)
from corems.molecular_id.factory.lipid_molecular_metadata import LipidMetadata
from corems.mass_spectra.calc.lc_calc import find_closest


class SpectralDatabaseInterface(ABC):
    """
    Base class that facilitates connection to spectral reference databases,
    such as EMSL's Metabolomics Reference Database (MetabRef).
    """

    def __init__(self, key=None):
        """
        Initialize instance.

        Parameters
        ----------
        key : str
            Name of the environment variable holding the token
            (None when the database requires no authentication).
        """
        self.key = key

    def set_token(self, path):
        """
        Set environment variable for MetabRef database token.

        Parameters
        ----------
        path : str
            Path to a file whose first line is the token.
        """
        # Read token from file (first line only)
        with open(path, "r", encoding="utf-8") as f:
            token = f.readline().strip()

        # Set environment variable
        os.environ[self.key] = token

    def get_token(self):
        """
        Get environment variable for database token.

        Returns
        -------
        str
            Token string.

        Raises
        ------
        ValueError
            If the environment variable named by `self.key` is not set.
        """
        # Check for token
        if self.key not in os.environ:
            raise ValueError("Must set {} environment variable.".format(self.key))

        # Get token from environment variables
        return os.environ.get(self.key)

    def get_header(self):
        """
        Access stored database token and prepare as header.

        Returns
        -------
        dict
            Header dictionary with bearer authorization.
        """
        # Get token
        token = self.get_token()

        # Pad header information
        header = {"Authorization": f"Bearer {token}", "Content-Type": "text/plain"}

        return header

    def get_query(self, url, use_header=True):
        """
        Request payload from URL according to `get` protocol.

        Parameters
        ----------
        url : str
            URL for request.
        use_header : bool
            Whether or not the query should include the header.

        Returns
        -------
        dict
            Response as JSON.

        Raises
        ------
        requests.HTTPError
            If the server returns an error status.
        """
        # Query URL via `get`
        if use_header:
            response = requests.get(url, headers=self.get_header())
        else:
            response = requests.get(url)

        # Check response
        response.raise_for_status()

        # Return as JSON
        return response.json()

    def post_query(self, url, variable, values, tolerance):
        """
        Request payload from URL according to `post` protocol.

        Parameters
        ----------
        url : str
            URL for request.
        variable : str
            Variable to query.
        values : str
            Specific values of `variable` to query.
        tolerance : str
            Query tolerance relative to `values`.

        Returns
        -------
        dict
            Response as JSON.
        """
        # Coerce to string
        if not isinstance(variable, str):
            variable = str(variable).replace(" ", "")

        if not isinstance(values, str):
            values = str(values).replace(" ", "")

        if not isinstance(tolerance, str):
            tolerance = str(tolerance).replace(" ", "")

        # Build the endpoint path with explicit "/" separators.
        # NOTE: os.path.join was previously used here; on Windows it inserts
        # "\\" which is not a valid URL path separator.
        full_url = "/".join([url.rstrip("/"), variable, tolerance])

        # Query URL via `post`
        response = requests.post(
            full_url,
            data=values,
            headers=self.get_header(),
        )

        # Check response
        response.raise_for_status()

        # Return as JSON
        return response.json()

    def _check_flash_entropy_kwargs(self, fe_kwargs):
        """
        Check FlashEntropy keyword arguments.

        Parameters
        ----------
        fe_kwargs : dict
            Keyword arguments for FlashEntropy search.

        Raises
        ------
        ValueError
            If only one of "min_ms2_difference_in_da" / "max_ms2_tolerance_in_da"
            is present, or if min_ms2_difference_in_da != 2 * max_ms2_tolerance_in_da.
        """
        # If either key is given, both must be given and satisfy
        # min_ms2_difference_in_da == 2 * max_ms2_tolerance_in_da
        if (
            "min_ms2_difference_in_da" in fe_kwargs
            or "max_ms2_tolerance_in_da" in fe_kwargs
        ):
            if (
                "min_ms2_difference_in_da" not in fe_kwargs
                or "max_ms2_tolerance_in_da" not in fe_kwargs
            ):
                raise ValueError(
                    "Both 'min_ms2_difference_in_da' and 'max_ms2_tolerance_in_da' must be specified."
                )
            if (
                fe_kwargs["min_ms2_difference_in_da"]
                != 2 * fe_kwargs["max_ms2_tolerance_in_da"]
            ):
                raise ValueError(
                    "The values of 'min_ms2_difference_in_da' must be exactly 2x 'max_ms2_tolerance_in_da'."
                )

    def _get_format_func(self, format):
        """
        Obtain format function by key.

        Parameters
        ----------
        format : str
            Key into `self.format_map` (case-insensitive).

        Returns
        -------
        func
            Formatting function.

        Raises
        ------
        ValueError
            If `format` is not a supported format.
        """
        if format.lower() in self.format_map.keys():
            return self.format_map[format.lower()]

        raise ValueError(("{} not a supported format.").format(format))

    def _dict_to_dataclass(self, metabref_lib, data_class):
        """
        Convert dictionary to dataclass.

        Notes
        -----
        This function will pull the attributes of a dataclass and its parent
        class and convert the dictionary to a dataclass instance with the
        appropriate attributes. Keys absent from the input are set to None.

        Parameters
        ----------
        metabref_lib : dict
            Metabref dictionary object to convert to dataclass.
        data_class : :obj:`~dataclasses.dataclass`
            Dataclass to convert to.

        Returns
        -------
        :obj:`~dataclasses.dataclass`
            Dataclass instance.
        """
        # Get list of expected attributes of data_class
        data_class_keys = list(data_class.__annotations__.keys())

        # If data_class inherits from another class, include the parent's
        # annotated attributes as well (mro longer than [cls, object])
        if len(data_class.__mro__) > 2:
            parent_class_keys = list(data_class.__bases__[0].__annotations__.keys())
            data_class_keys = list(set(data_class_keys + parent_class_keys))

        # Remove keys that are not in the data_class from the input dictionary
        input_dict = {k: v for k, v in metabref_lib.items() if k in data_class_keys}

        # Add keys that are in the data class but not in the input dictionary as None
        for key in data_class_keys:
            if key not in input_dict.keys():
                input_dict[key] = None
        return data_class(**input_dict)

    @staticmethod
    def normalize_peaks(arr):
        """
        Normalize peaks in an array.

        Parameters
        ----------
        arr : :obj:`~numpy.array`
            Array of shape (N, 2), with m/z in the first column and abundance
            in the second.

        Returns
        -------
        :obj:`~numpy.array`
            Normalized array of shape (N, 2), with m/z in the first column and
            normalized abundance in the second.

        Notes
        -----
        The input array is modified in place (the abundance column is divided
        by its sum) and also returned.
        """
        # Normalize the abundance column to sum to 1 (in place)
        arr[:, -1] = arr[:, -1] / arr[:, -1].sum()

        return arr

    @staticmethod
    def _build_flash_entropy_index(fe_lib, fe_kwargs=None, clean_spectra=True):
        """
        Build FlashEntropy index.

        Parameters
        ----------
        fe_lib : list
            List of spectra to build index from. Can be a list of dictionaries
            or a FlashEntropy search instance.
        fe_kwargs : dict, optional
            Keyword arguments for FlashEntropy search. Default is None (all
            FlashEntropy defaults). A mutable-default dict is deliberately
            avoided here.
        clean_spectra : bool, optional
            Clean spectra before building index. Default is True.

        Returns
        -------
        :obj:`~ms_entropy.FlashEntropySearch`
            FlashEntropy search instance.
        """
        fe_kwargs = fe_kwargs or {}

        # Keyword arguments consumed by the FlashEntropySearch constructor
        fe_init_kws = [
            "max_ms2_tolerance_in_da",
            "mz_index_step",
            "low_memory",
            "path_data",
        ]
        fe_init_kws = {k: v for k, v in fe_kwargs.items() if k in fe_init_kws}
        fes = FlashEntropySearch(**fe_init_kws)

        # Keyword arguments consumed by build_index
        fe_index_kws = [
            "max_indexed_mz",
            "precursor_ions_removal_da",
            "noise_threshold",
            "min_ms2_difference_in_da",
            "max_peak_num",
        ]
        fe_index_kws = {k: v for k, v in fe_kwargs.items() if k in fe_index_kws}
        fes.build_index(fe_lib, **fe_index_kws, clean_spectra=clean_spectra)

        return fes


class MetabRefInterface(SpectralDatabaseInterface):
    """
    Interface to the Metabolomics Reference Database.
    """

    def __init__(self):
        """
        Initialize instance.
        """
        super().__init__(key=None)

    def spectrum_to_array(self, spectrum, normalize=True):
        """
        Convert MetabRef-formatted spectrum to array.

        Parameters
        ----------
        spectrum : str
            MetabRef spectrum, i.e. list of (m/z,abundance) pairs.
        normalize : bool
            Normalize the spectrum by its magnitude.

        Returns
        -------
        :obj:`~numpy.array`
            Array of shape (N, 2), with m/z in the first column and abundance
            in the second.
        """
        # Convert parenthesis-delimited string to (N, 2) float array
        arr = np.array(
            re.findall(r"\(([^,]+),([^)]+)\)", spectrum), dtype=float
        ).reshape(-1, 2)

        if normalize:
            arr = self.normalize_peaks(arr)

        return arr

    def _to_flashentropy(self, metabref_lib, normalize=True, fe_kwargs=None):
        """
        Convert metabref-formatted library to FlashEntropy library.

        Parameters
        ----------
        metabref_lib : dict
            MetabRef MS2 library in JSON format or FlashEntropy search
            instance (for reformatting at different MS2 separation).
        normalize : bool
            Normalize each spectrum by its magnitude.
        fe_kwargs : dict, optional
            Keyword arguments for instantiation of FlashEntropy search and
            building index for FlashEntropy search; any keys not recognized
            will be ignored. By default, all parameters set to defaults.

        Returns
        -------
        :obj:`~ms_entropy.FlashEntropySearch`
            MS2 library as FlashEntropy search instance.

        Raises
        ------
        ValueError
            If "min_ms2_difference_in_da" or "max_ms2_tolerance_in_da" are
            present in `fe_kwargs` and they are not consistent.
        """
        fe_kwargs = fe_kwargs or {}
        self._check_flash_entropy_kwargs(fe_kwargs)

        # Initialize empty library
        fe_lib = []

        # Enumerate spectra
        for i, source in enumerate(metabref_lib):
            # Reorganize source dict, if necessary
            if "spectrum_data" in source.keys():
                spectrum = source["spectrum_data"]
            else:
                spectrum = source

            # Rename precursor_mz key for FlashEntropy
            if "precursor_mz" not in spectrum.keys():
                spectrum["precursor_mz"] = spectrum.pop("precursor_ion")

            # Convert CoreMS spectrum to array and clean, store as `peaks`
            spectrum["peaks"] = self.spectrum_to_array(
                spectrum["mz"], normalize=normalize
            )

            # Add spectrum to library
            fe_lib.append(spectrum)

        # Build FlashEntropy index
        fe_search = self._build_flash_entropy_index(fe_lib, fe_kwargs=fe_kwargs)

        return fe_search

    def get_query(self, url, use_header=False):
        """Overwrites the get_query method on the parent class to default to not use a header

        Notes
        -----
        As of January 2025, the metabref database no longer requires a token and therefore no header is needed
        """
        return super().get_query(url, use_header)


class MetabRefGCInterface(MetabRefInterface):
    """
    Interface to the Metabolomics Reference Database.
    """

    def __init__(self):
        """
        Initialize instance.
        """
        super().__init__()
        self.GCMS_LIBRARY_URL = "https://metabref.emsl.pnnl.gov/api/mslevel/1"
        self.FAMES_URL = "https://metabref.emsl.pnnl.gov/api/fames"

        self.__init_format_map__()

    def __init_format_map__(self):
        """
        Initialize database format mapper, enabling multiple format requests.
        """
        # Define format workflows
        self.format_map = {
            "json": lambda x, normalize, fe_kwargs: x,
            "dict": lambda x,
            normalize,
            fe_kwargs: self._to_LowResolutionEICompound_dict(x, normalize),
            "sql": lambda x,
            normalize,
            fe_kwargs: self._LowResolutionEICompound_dict_to_sqlite(
                self._to_LowResolutionEICompound_dict(x, normalize)
            ),
        }

        # Add aliases
        self.format_map["metabref"] = self.format_map["json"]
        self.format_map["datadict"] = self.format_map["dict"]
        self.format_map["data-dict"] = self.format_map["dict"]
        self.format_map["lowreseicompound"] = self.format_map["dict"]
        self.format_map["lowres"] = self.format_map["dict"]
        self.format_map["lowresgc"] = self.format_map["dict"]
        self.format_map["sqlite"] = self.format_map["sql"]

    def available_formats(self):
        """
        View list of available formats.

        Returns
        -------
        list
            Format map keys.
        """
        return list(self.format_map.keys())

    def get_library(self, format="json", normalize=False):
        """
        Request MetabRef GC/MS library.

        Parameters
        ----------
        format : str
            Format of requested library, i.e. "json", "sql", "flashentropy".
            See `available_formats` method for aliases.
        normalize : bool
            Normalize the spectrum by its magnitude.

        Returns
        -------
        Library in requested format.
        """
        # Init format function
        format_func = self._get_format_func(format)

        return format_func(
            self.get_query(self.GCMS_LIBRARY_URL)["GC-MS"], normalize, {}
        )

    def get_fames(self, format="json", normalize=False):
        """
        Request MetabRef GC/MS FAMEs library.

        Parameters
        ----------
        format : str
            Format of requested library, i.e. "json", "sql", "flashentropy".
            See `available_formats` method for aliases.
        normalize : bool
            Normalize the spectrum by its magnitude.

        Returns
        -------
        Library in requested format.
        """
        # Init format function
        format_func = self._get_format_func(format)

        return format_func(self.get_query(self.FAMES_URL)["GC-MS"], normalize, {})

    def _to_LowResolutionEICompound_dict(self, metabref_lib, normalize=False):
        """
        Convert MetabRef-formatted library to CoreMS LowResolutionEICompound-formatted
        dictionary for local ingestion.

        Parameters
        ----------
        metabref_lib : dict
            MetabRef GC-MS library in JSON format.
        normalize : bool
            Normalize each spectrum by its magnitude.

        Returns
        -------
        list of dict
            List of each spectrum contained in dictionary.
        """
        # All below key:value lookups are based on CoreMS class definitions
        # NOT MetabRef content. For example, MetabRef has keys for PubChem,
        # USI, etc. that are not considered below.

        # Dictionary to map metabref keys to corems keys
        metadatar_cols = {
            "casno": "cas",
            "inchikey": "inchikey",
            "inchi": "inchi",
            "chebi": "chebi",
            "smiles": "smiles",
            "kegg": "kegg",
            "iupac_name": "iupac_name",
            "traditional_name": "traditional_name",  # Not present in metabref
            "common_name": "common_name",  # Not present in metabref
        }

        # Dictionary to map metabref keys to corems keys
        lowres_ei_compound_cols = {
            "id": "metabref_id",
            "molecule_name": "name",  # Is this correct?
            "classify": "classify",  # Not present in metabref
            "formula": "formula",
            "ri": "ri",
            "rt": "retention_time",
            "source": "source",  # Not present in metabref
            "casno": "casno",
            "comments": "comment",
            "source_temp_c": "source_temp_c",  # Not present in metabref
            "ev": "ev",  # Not present in metabref
            "peak_count": "peaks_count",
            "mz": "mz",
            "abundance": "abundance",
        }

        # Local result container
        corems_lib = []

        # Enumerate spectra
        for i, source_ in enumerate(metabref_lib):
            # Copy source to prevent modification
            source = source_.copy()

            # Flatten source dict (spectrum_data keys merged alongside the rest)
            source = source.pop("spectrum_data") | source

            # Parse target data
            target = {
                lowres_ei_compound_cols[k]: v
                for k, v in source.items()
                if k in lowres_ei_compound_cols
            }

            # Explicitly add this to connect with LowResCompoundRef later
            target["rt"] = source["rt"]

            # Parse (mz, abundance)
            arr = self.spectrum_to_array(target["mz"], normalize=normalize)
            target["mz"] = arr[:, 0]
            target["abundance"] = arr[:, 1]

            # Parse meta data
            target["metadata"] = {
                metadatar_cols[k]: v for k, v in source.items() if k in metadatar_cols
            }

            # Add anything else
            for k in source:
                if k not in lowres_ei_compound_cols:
                    target[k] = source[k]

            # Add to CoreMS list
            corems_lib.append(target)

        return corems_lib

    def _LowResolutionEICompound_dict_to_sqlite(
        self, lowres_ei_compound_dict, url="sqlite://"
    ):
        """
        Convert CoreMS LowResolutionEICompound-formatted dictionary to SQLite
        database for local ingestion.

        Parameters
        ----------
        lowres_ei_compound_dict : dict
            CoreMS GC-MS library formatted for LowResolutionEICompound.
        url : str
            URL to SQLite prefix.

        Returns
        -------
        sqlite database
            Spectra contained in SQLite database.
        """
        # Dictionary to map corems keys to all-caps keys
        capped_cols = {
            "name": "NAME",
            "formula": "FORM",
            "ri": "RI",
            "retention_time": "RT",
            "source": "SOURCE",
            "casno": "CASNO",
            "comment": "COMMENT",
            "peaks_count": "NUM PEAKS",
        }

        # Initialize SQLite object
        sqlite_obj = EI_LowRes_SQLite(url=url)

        # Iterate spectra
        for _data_dict in lowres_ei_compound_dict:
            # Copy source to prevent modification
            data_dict = _data_dict.copy()

            # Add missing capped values (keep both keys)
            for k, v in capped_cols.items():
                if k in data_dict:
                    data_dict[v] = data_dict[k]

            # Parse number of peaks
            if not data_dict.get("NUM PEAKS"):
                data_dict["NUM PEAKS"] = len(data_dict.get("mz"))

            # Parse CAS number
            if not data_dict.get("CASNO"):
                data_dict["CASNO"] = data_dict.get("CAS")

            if not data_dict["CASNO"]:
                data_dict["CASNO"] = 0

            # Build linked metadata table
            if "metadata" in data_dict:
                if len(data_dict["metadata"]) > 0:
                    data_dict["metadatar"] = Metadatar(**data_dict.pop("metadata"))
                else:
                    data_dict.pop("metadata")

            # Attempt addition to sqlite; best-effort — report the failing
            # compound name instead of aborting the whole ingest.
            # (Was a bare `except:`; narrowed so KeyboardInterrupt etc. escape.)
            try:
                sqlite_obj.add_compound(data_dict)
            except Exception:
                print(data_dict["NAME"])

        return sqlite_obj


class MetabRefLCInterface(MetabRefInterface):
    """
    Interface to the Metabolomics Reference Database for LC-MS data.
    """

    def __init__(self):
        """
        Initialize instance.
        """
        super().__init__()

        # API endpoint for precursor m/z search
        # inputs = mz, tolerance (in Da), polarity, page_no, per_page
        self.PRECURSOR_MZ_URL = "https://metabref.emsl.pnnl.gov/api/precursors/m/{}/t/{}/{}?page={}&per_page={}"

        # API endpoint for returning full list of precursor m/z values in database
        # inputs = polarity, page_no, per_page
        self.PRECURSOR_MZ_ALL_URL = (
            "https://metabref.emsl.pnnl.gov/api/precursors/{}?page={}&per_page={}"
        )

        self.__init_format_map__()

    def __init_format_map__(self):
        """
        Initialize database format mapper, enabling multiple format requests.
        """
        # Define format workflows
        self.format_map = {
            "json": lambda x, normalize, fe_kwargs: x,
            "flashentropy": lambda x, normalize, fe_kwargs: self._to_flashentropy(
                x, normalize, fe_kwargs
            ),
        }

        # Add aliases
        self.format_map["metabref"] = self.format_map["json"]
        self.format_map["fe"] = self.format_map["flashentropy"]
        self.format_map["flash-entropy"] = self.format_map["flashentropy"]

    def query_by_precursor(
        self, mz_list, polarity, mz_tol_ppm, mz_tol_da_api=0.2, max_per_page=50
    ):
        """
        Query MetabRef by precursor m/z values.

        Parameters
        ----------
        mz_list : list
            List of precursor m/z values.
        polarity : str
            Ionization polarity, either "positive" or "negative".
        mz_tol_ppm : float
            Tolerance in ppm for each precursor m/z value.
            Used for retrieving from a potential match from database.
        mz_tol_da_api : float, optional
            Maximum tolerance between precursor m/z values for API search, in daltons.
            Used to group similar mzs into a single API query for speed. Default is 0.2.
        max_per_page : int, optional
            Maximum records to return from MetabRef API query at a time. Default is 50.

        Returns
        -------
        list
            List of library entries in original JSON format.

        Raises
        ------
        ValueError
            If `polarity` is not "positive" or "negative".
        """
        # If polarity is anything other than positive or negative, raise error
        if polarity not in ["positive", "negative"]:
            raise ValueError("Polarity must be 'positive' or 'negative'")

        # Cluster groups of mz according to mz_tol_da_api for precursor query
        mz_list.sort()
        mz_groups = [[mz_list[0]]]
        for x in mz_list[1:]:
            if abs(x - mz_groups[-1][0]) <= mz_tol_da_api:
                mz_groups[-1].append(x)
            else:
                mz_groups.append([x])

        # Query MetabRef for each mz group
        lib = []
        for mz_group in mz_groups:
            if len(mz_group) == 1:
                mz = mz_group[0]
                tol = mz_tol_ppm * 10**-6 * mz
            else:
                # Center of the group, plus half its span widened by the ppm
                # tolerance at the largest m/z.
                # BUG FIX: was `mz_tol_ppm**-6` (ppm value raised to the -6th
                # power) instead of `mz_tol_ppm * 10**-6` (ppm -> fraction).
                mz = (max(mz_group) - min(mz_group)) / 2 + min(mz_group)
                tol = (max(mz_group) - min(mz_group)) / 2 + mz_tol_ppm * 10**-6 * max(
                    mz_group
                )

            # Get first page of results
            response = self.get_query(
                self.PRECURSOR_MZ_URL.format(
                    str(mz), str(tol), polarity, 1, max_per_page
                )
            )
            lib = lib + response["results"]

            # If there are more pages of results, get them
            if response["total_pages"] > 1:
                for i in np.arange(2, response["total_pages"] + 1):
                    lib = (
                        lib
                        + self.get_query(
                            self.PRECURSOR_MZ_URL.format(
                                str(mz), str(tol), polarity, i, max_per_page
                            )
                        )["results"]
                    )

        return lib

    def request_all_precursors(self, polarity, per_page=50000):
        """
        Request all precursor m/z values for MS2 spectra from MetabRef.

        Parameters
        ----------
        polarity : str
            Ionization polarity, either "positive" or "negative".
        per_page : int, optional
            Number of records to fetch per call. Default is 50000

        Returns
        -------
        list
            List of all unique precursor m/z values, sorted.

        Raises
        ------
        ValueError
            If `polarity` is not "positive" or "negative".
        """
        # If polarity is anything other than positive or negative, raise error
        if polarity not in ["positive", "negative"]:
            raise ValueError("Polarity must be 'positive' or 'negative'")

        precursors = []

        # Get first page of results and total number of pages of results
        response = self.get_query(
            self.PRECURSOR_MZ_ALL_URL.format(polarity, str(1), str(per_page))
        )
        total_pages = response["total_pages"]
        precursors.extend([x["precursor_ion"] for x in response["results"]])

        # Go through remaining pages of results
        for i in np.arange(2, total_pages + 1):
            response = self.get_query(
                self.PRECURSOR_MZ_ALL_URL.format(polarity, str(i), str(per_page))
            )
            precursors.extend([x["precursor_ion"] for x in response["results"]])

        # Sort precursors from smallest to largest and remove duplicates
        precursors = list(set(precursors))
        precursors.sort()

        return precursors

    def get_lipid_library(
        self,
        mz_list,
        polarity,
        mz_tol_ppm,
        mz_tol_da_api=0.2,
        format="json",
        normalize=True,
        fe_kwargs=None,
    ):
        """
        Request MetabRef lipid library.

        Parameters
        ----------
        mz_list : list
            List of precursor m/z values.
        polarity : str
            Ionization polarity, either "positive" or "negative".
        mz_tol_ppm : float
            Tolerance in ppm for each precursor m/z value.
            Used for retrieving from a potential match from database.
        mz_tol_da_api : float, optional
            Maximum tolerance between precursor m/z values for API search, in daltons.
            Used to group similar mzs into a single API query for speed. Default is 0.2.
        format : str, optional
            Format of requested library, i.e. "json", "sql", "flashentropy".
            See `available_formats` method for aliases. Default is "json".
        normalize : bool, optional
            Normalize the spectrum by its magnitude. Default is True.
        fe_kwargs : dict, optional
            Keyword arguments for FlashEntropy search. Default is None.

        Returns
        -------
        tuple
            Library in requested format and lipid metadata as a LipidMetadata
            dataclass.
        """
        fe_kwargs = fe_kwargs or {}
        mz_list.sort()
        mz_list = np.array(mz_list)

        # Get all precursors in the library matching the polarity
        precusors_in_lib = self.request_all_precursors(polarity=polarity)
        precusors_in_lib = np.array(precusors_in_lib)

        # Compare the mz_list with the precursors in the library, keep any mzs
        # that are within mz_tol of any precursor in the library
        lib_mz_df = pd.DataFrame(precusors_in_lib, columns=["lib_mz"])
        lib_mz_df["closest_obs_mz"] = mz_list[
            find_closest(mz_list, lib_mz_df.lib_mz.values)
        ]
        lib_mz_df["mz_diff_ppm"] = np.abs(
            (lib_mz_df["lib_mz"] - lib_mz_df["closest_obs_mz"])
            / lib_mz_df["lib_mz"]
            * 1e6
        )
        lib_mz_sub = lib_mz_df[lib_mz_df["mz_diff_ppm"] <= mz_tol_ppm]

        # Do the same in the opposite direction
        mz_df = pd.DataFrame(mz_list, columns=["mass_feature_mz"])
        mz_df["closest_lib_pre_mz"] = precusors_in_lib[
            find_closest(precusors_in_lib, mz_df.mass_feature_mz.values)
        ]
        mz_df["mz_diff_ppm"] = np.abs(
            (mz_df["mass_feature_mz"] - mz_df["closest_lib_pre_mz"])
            / mz_df["mass_feature_mz"]
            * 1e6
        )
        mz_df_sub = mz_df[mz_df["mz_diff_ppm"] <= mz_tol_ppm]

        # Query with whichever direction produced fewer candidate m/z values
        if len(lib_mz_sub) < len(mz_df_sub):
            mzs_to_query = lib_mz_sub.lib_mz.values
        else:
            mzs_to_query = mz_df_sub.mass_feature_mz.values

        # Query the library for the precursors in the mz_list that are in the
        # library to retrieve the spectra and metadata
        lib = self.query_by_precursor(
            mz_list=mzs_to_query,
            polarity=polarity,
            mz_tol_ppm=mz_tol_ppm,
            mz_tol_da_api=mz_tol_da_api,
        )

        # Pull out lipid metadata from the metabref library and convert to
        # LipidMetadata dataclass
        mol_data_dict = {x["id"]: x["Molecular Data"] for x in lib}
        lipid_lib = {x["id"]: x["Lipid Tree"] for x in lib if "Lipid Tree" in x.keys()}
        mol_data_dict = {k: {**v, **lipid_lib[k]} for k, v in mol_data_dict.items()}
        mol_data_dict = {
            k: self._dict_to_dataclass(v, LipidMetadata)
            for k, v in mol_data_dict.items()
        }

        # Remove lipid metadata from the metabref library
        lib = [
            {k: v for k, v in x.items() if k not in ["Molecular Data", "Lipid Tree"]}
            for x in lib
        ]
        # Unpack the 'Lipid Fragments' key and the 'MSO Data" key from each entry
        for x in lib:
            if "Lipid Fragments" in x.keys():
                x.update(x.pop("Lipid Fragments"))
            if "MSO Data" in x.keys():
                x.update(x.pop("MSO Data"))

        # Format the spectral library
        format_func = self._get_format_func(format)
        lib = format_func(lib, normalize=normalize, fe_kwargs=fe_kwargs)
        return (lib, mol_data_dict)


class MSPInterface(SpectralDatabaseInterface):
    """
    Interface to parse NIST MSP files
    """

    def __init__(self, file_path):
        """
        Initialize instance.

        Parameters
        ----------
        file_path : str
            Path to a local MSP file.

        Attributes
        ----------
        file_path : str
            Path to the MSP file.
        _file_content : str
            Content of the MSP file.
        _data_frame : :obj:`~pandas.DataFrame`
            DataFrame of spectra from the MSP file with unaltered content.

        Raises
        ------
        FileNotFoundError
            If `file_path` does not exist.
        """
        super().__init__(key=None)

        self.file_path = file_path
        if not os.path.exists(self.file_path):
            raise FileNotFoundError(
                f"File {self.file_path} does not exist. Please check the file path."
            )
        with open(self.file_path, "r") as f:
            self._file_content = f.read()

        self._data_frame = self._read_msp_file()
        self.__init_format_map__()

    def __init_format_map__(self):
        """
        Initialize database format mapper, enabling multiple format requests.
        """
        # x is a pandas dataframe similar to self._data_frame format
        # Define format workflows
        self.format_map = {
            "msp": lambda x, normalize, fe_kwargs: self._to_msp(x, normalize),
            "flashentropy": lambda x, normalize, fe_kwargs: self._to_flashentropy(
                x, normalize, fe_kwargs
            ),
            "df": lambda x, normalize, fe_kwargs: self._to_df(x, normalize),
        }

        # Add aliases
        self.format_map["fe"] = self.format_map["flashentropy"]
        self.format_map["flash-entropy"] = self.format_map["flashentropy"]
        self.format_map["dataframe"] = self.format_map["df"]
        self.format_map["data-frame"] = self.format_map["df"]

    def _read_msp_file(self):
        """
        Read the MSP file content into a pandas DataFrame.

        Returns
        -------
        :obj:`~pandas.DataFrame`
            DataFrame of spectra from the MSP file, exactly as it is in the
            file (no sorting, filtering etc.)
        """
        spectra = []
        spectrum = {}

        f = StringIO(self._file_content)
        for line in f:
            line = line.strip()
            if not line:
                continue  # Skip empty lines

            # Handle metadata ("key: value" lines)
            if ":" in line:
                key, value = line.split(":", 1)
                key = key.strip().lower()
                value = value.strip()

                if key == "name":
                    # Save current spectrum and start a new one
                    if spectrum:
                        spectra.append(spectrum)
                    spectrum = {"name": value, "peaks": []}
                else:
                    spectrum[key] = value

            # Handle peak data (assumed to start with a number)
            elif line[0].isdigit():
                peaks = line.split()
                m_z = float(peaks[0])
                intensity = float(peaks[1])
                spectrum["peaks"].append(([m_z, intensity]))
        # Save the last spectrum
        if spectrum:
            spectra.append(spectrum)

        df = pd.DataFrame(spectra)
        # Coerce metadata columns to numeric where possible, leaving
        # non-numeric columns untouched. (Was a bare `except:`; narrowed.)
        for column in df.columns:
            if column != "peaks":  # Skip 'peaks' column
                try:
                    df[column] = pd.to_numeric(df[column], errors="raise")
                except (ValueError, TypeError):
                    pass
        return df

    def _to_df(self, input_dataframe, normalize=True):
        """
        Return the MSP-derived library as a DataFrame, optionally normalized.

        Parameters
        ----------
        input_dataframe : :obj:`~pandas.DataFrame`
            Input DataFrame containing MSP-formatted spectra.
        normalize : bool, optional
            Normalize each spectrum by its magnitude.
            Default is True.

        Returns
        -------
        :obj:`~pandas.DataFrame`
            DataFrame with desired normalization.

        Raises
        ------
        KeyError
            If a spectrum record has no 'peaks' key.
        """
        if not normalize:
            return input_dataframe

        # Convert to dictionary
        db_dict = input_dataframe.to_dict(orient="records")

        # Initialize empty library
        lib = []

        # Enumerate spectra
        for i, spectrum in enumerate(db_dict):
            # Check that spectrum["peaks"] exists
            if "peaks" not in spectrum.keys():
                raise KeyError(
                    "MSP not interpretted correctly, 'peaks' key not found in spectrum, check _dataframe attribute."
                )

            # Convert spectrum["peaks"] to numpy array
            if not isinstance(spectrum["peaks"], np.ndarray):
                spectrum["peaks"] = np.array(spectrum["peaks"])

            # Normalize peaks and refresh the peak count
            spectrum["peaks"] = self.normalize_peaks(spectrum["peaks"])
            spectrum["num peaks"] = len(spectrum["peaks"])

            # Add spectrum to library
            lib.append(spectrum)

        # Convert to DataFrame
        df = pd.DataFrame(lib)
        return df

    def _to_flashentropy(self, input_dataframe, normalize=True, fe_kwargs=None):
        """
        Convert MSP-derived library to FlashEntropy library.

        Parameters
        ----------
        input_dataframe : :obj:`~pandas.DataFrame`
            Input DataFrame containing MSP-formatted spectra.
        normalize : bool
            Normalize each spectrum by its magnitude.
        fe_kwargs : dict, optional
            Keyword arguments for instantiation of FlashEntropy search and
            building index for FlashEntropy search; any keys not recognized
            will be ignored. By default, all parameters set to defaults.

        Returns
        -------
        :obj:`~ms_entropy.FlashEntropySearch`
            MS2 library as FlashEntropy search instance.

        Raises
        ------
        ValueError
            If "min_ms2_difference_in_da" or "max_ms2_tolerance_in_da" are
            present in `fe_kwargs` and they are inconsistent.
        KeyError
            If a spectrum lacks both a precursor m/z key and a 'peaks' key.
        """
        fe_kwargs = fe_kwargs or {}
        self._check_flash_entropy_kwargs(fe_kwargs)

        db_df = input_dataframe

        # Convert to dictionary
        db_dict = db_df.to_dict(orient="records")

        # Initialize empty library
        fe_lib = []

        # Enumerate spectra
        for i, source in enumerate(db_dict):
            # Reorganize source dict, if necessary
            if "spectrum_data" in source.keys():
                spectrum = source["spectrum_data"]
            else:
                spectrum = source

            # Rename precursor_mz key for FlashEntropy
            if "precursor_mz" not in spectrum.keys():
                if "precursormz" in spectrum:
                    spectrum["precursor_mz"] = spectrum.pop("precursormz")
                elif "precursor_ion" in spectrum:
                    spectrum["precursor_mz"] = spectrum.pop("precursor_ion")
                else:
                    raise KeyError(
                        "MSP must have either 'precursormz' or 'precursor_ion' key to be converted to FlashEntropy format."
                    )

            # Check that spectrum["peaks"] exists
            if "peaks" not in spectrum.keys():
                raise KeyError(
                    "MSP not interpretted correctly, 'peaks' key not found in spectrum, check _dataframe attribute."
                )

            # Convert spectrum["peaks"] to numpy array
            if not isinstance(spectrum["peaks"], np.ndarray):
                spectrum["peaks"] = np.array(spectrum["peaks"])

            # Normalize peaks, if requested
            if normalize:
                spectrum["peaks"] = self.normalize_peaks(spectrum["peaks"])

            # Add spectrum to library
            fe_lib.append(spectrum)

        # Build FlashEntropy index
        fe_search = self._build_flash_entropy_index(fe_lib, fe_kwargs=fe_kwargs)

        return fe_search

    def _check_msp_compatibility(self):
        """
        Check if the MSP file is compatible with the
        get_metabolomics_spectra_library method and provide feedback if it is not.

        Raises
        ------
        ValueError
            If required metadata columns are missing or malformed.
        """
        # Check polarity
        if (
            "polarity" not in self._data_frame.columns
            and "ionmode" not in self._data_frame.columns
        ):
            raise ValueError(
                "Neither 'polarity' nor 'ionmode' columns found in the input MSP metadata. Please check the file."
            )
        polarity_column = (
            "polarity" if "polarity" in self._data_frame.columns else "ionmode"
        )

        # Check if polarity_column contents is either "positive" or "negative"
        if not all(self._data_frame[polarity_column].isin(["positive", "negative"])):
            raise ValueError(
                f"Input field on MSP '{polarity_column}' must contain only 'positive' or 'negative' values."
            )

        # Check if the MSP file contains the required columns for metabolite metadata
        # inchikey, by name, not null
        # either formula or molecular_formula, not null
        if not all(self._data_frame["inchikey"].notnull()):
            raise ValueError(
                "Input field on MSP 'inchikey' must contain only non-null values."
            )
        if (
            "formula" not in self._data_frame.columns
            and "molecular_formula" not in self._data_frame.columns
        ):
            raise ValueError(
                "Input field on MSP must contain either 'formula' or 'molecular_formula' columns."
            )
        # NOTE(review): the source text was truncated at this point; the
        # original method may contain additional checks beyond this raise.
1260 ) 1261 molecular_formula_column = ( 1262 "formula" if "formula" in self._data_frame.columns else "molecular_formula" 1263 ) 1264 if not all(self._data_frame[molecular_formula_column].notnull()): 1265 raise ValueError( 1266 f"Input field on MSP '{molecular_formula_column}' must contain only non-null values." 1267 ) 1268 1269 def get_metabolomics_spectra_library( 1270 self, 1271 polarity, 1272 metabolite_metadata_mapping={}, 1273 format="fe", 1274 normalize=True, 1275 fe_kwargs={}, 1276 ): 1277 """ 1278 Prepare metabolomics spectra library and associated metabolite metadata 1279 1280 Note: this uses the inchikey as the index for the metabolite metadata dataframe and for connecting to the spectra, so it must be in the input 1281 1282 """ 1283 # Check if the MSP file is compatible with the get_metabolomics_spectra_library method 1284 self._check_msp_compatibility() 1285 1286 # Check if the polarity parameter is valid and if a polarity column exists in the dataframe 1287 if polarity not in ["positive", "negative"]: 1288 raise ValueError("Polarity must be 'positive' or 'negative'") 1289 polarity_column = ( 1290 "polarity" if "polarity" in self._data_frame.columns else "ionmode" 1291 ) 1292 1293 # Get a subset of the initial dataframea by polarity 1294 db_df = self._data_frame[self._data_frame[polarity_column] == polarity].copy() 1295 1296 # Rename the columns of the db_df to match the MetaboliteMetadata dataclass using the metabolite_metadata_mapping 1297 # If the mapping is not provided, use the default mapping 1298 if not metabolite_metadata_mapping: 1299 metabolite_metadata_mapping = { 1300 "chebi_id": "chebi", 1301 "kegg_id": "kegg", 1302 "refmet_name": "common_name", 1303 "molecular_formula": "formula", 1304 "gnps_spectra_id":"id", 1305 "precursormz": "precursor_mz", 1306 "precursortype":"ion_type" 1307 } 1308 db_df.rename(columns=metabolite_metadata_mapping, inplace=True) 1309 db_df["molecular_data_id"] = db_df["inchikey"] 1310 1311 1312 1313 # Check if the 
resulting dataframe has the required columns for the flash entropy search 1314 required_columns = ["molecular_data_id", "precursor_mz", "ion_type", "id"] 1315 for col in required_columns: 1316 if col not in db_df.columns: 1317 raise ValueError( 1318 f"Input field on MSP must contain '{col}' column for FlashEntropy search." 1319 ) 1320 1321 # Pull out the metabolite metadata from the dataframe and put it into a different dataframe 1322 # First get a list of the possible attributes of the MetaboliteMetadata dataclass 1323 metabolite_metadata_keys = list(MetaboliteMetadata.__annotations__.keys()) 1324 # Replace id with molecular_data_id in metabolite_metadata_keys 1325 metabolite_metadata_keys = [ 1326 "molecular_data_id" if x == "id" else x for x in metabolite_metadata_keys 1327 ] 1328 metabolite_metadata_df = db_df[ 1329 db_df.columns[db_df.columns.isin(metabolite_metadata_keys)] 1330 ].copy() 1331 1332 # Make unique and recast the id column for metabolite metadata 1333 metabolite_metadata_df.drop_duplicates(subset=["molecular_data_id"], inplace=True) 1334 metabolite_metadata_df["id"] = metabolite_metadata_df["molecular_data_id"] 1335 1336 # Convert to a dictionary using the inchikey as the key 1337 metabolite_metadata_dict = metabolite_metadata_df.to_dict( 1338 orient="records" 1339 ) 1340 metabolite_metadata_dict = { 1341 v["id"]: self._dict_to_dataclass(v, MetaboliteMetadata) 1342 for v in metabolite_metadata_dict 1343 } 1344 1345 # Remove the metabolite metadata columns from the original dataframe 1346 for key in metabolite_metadata_keys: 1347 if key != "molecular_data_id": 1348 if key in db_df.columns: 1349 db_df.drop(columns=key, inplace=True) 1350 1351 # Format the spectral library 1352 format_func = self._get_format_func(format) 1353 lib = format_func(db_df, normalize=normalize, fe_kwargs=fe_kwargs) 1354 return (lib, metabolite_metadata_dict)
class SpectralDatabaseInterface(ABC):
    """
    Base class that facilitates connection to spectral reference databases,
    such as EMSL's Metabolomics Reference Database (MetabRef).
    """

    def __init__(self, key=None):
        """
        Initialize instance.

        Parameters
        ----------
        key : str
            Name of the environment variable that holds the token.
        """
        self.key = key

    def set_token(self, path):
        """
        Set environment variable for MetabRef database token.

        Parameters
        ----------
        path : str
            Path to a file whose first line is the token.
        """
        # Read token from file (first line only)
        with open(path, "r", encoding="utf-8") as f:
            token = f.readline().strip()

        # Set environment variable
        os.environ[self.key] = token

    def get_token(self):
        """
        Get environment variable for database token.

        Returns
        -------
        str
            Token string.

        Raises
        ------
        ValueError
            If the environment variable named by `self.key` is not set.
        """
        # Check for token
        if self.key not in os.environ:
            raise ValueError("Must set {} environment variable.".format(self.key))

        # Get token from environment variables
        return os.environ.get(self.key)

    def get_header(self):
        """
        Access stored database token and prepare as header.

        Returns
        -------
        dict
            Header dictionary with bearer authorization and content type.
        """
        # Get token
        token = self.get_token()

        # Pad header information
        header = {"Authorization": f"Bearer {token}", "Content-Type": "text/plain"}

        return header

    def get_query(self, url, use_header=True):
        """
        Request payload from URL according to `get` protocol.

        Parameters
        ----------
        url : str
            URL for request.
        use_header : bool
            Whether or not the query should include the header.

        Returns
        -------
        dict
            Response as JSON.

        Raises
        ------
        requests.HTTPError
            If the request returns an error status code.
        """
        # Query URL via `get`
        if use_header:
            response = requests.get(url, headers=self.get_header())
        else:
            response = requests.get(url)

        # Check response
        response.raise_for_status()

        # Return as JSON
        return response.json()

    def post_query(self, url, variable, values, tolerance):
        """
        Request payload from URL according to `post` protocol.

        Parameters
        ----------
        url : str
            Base URL for request.
        variable : str
            Variable to query.
        values : str
            Specific values of `variable` to query.
        tolerance : str
            Query tolerance relative to `values`.

        Returns
        -------
        dict
            Response as JSON.

        Raises
        ------
        requests.HTTPError
            If the request returns an error status code.
        """
        # Coerce to string, stripping spaces for URL safety
        if not isinstance(variable, str):
            variable = str(variable).replace(" ", "")

        if not isinstance(values, str):
            values = str(values).replace(" ", "")

        if not isinstance(tolerance, str):
            tolerance = str(tolerance).replace(" ", "")

        # Build the endpoint with explicit URL separators; os.path.join is a
        # filesystem operation and would emit backslashes on Windows.
        endpoint = "/".join([url.rstrip("/"), variable, tolerance])

        # Query URL via `post`
        response = requests.post(
            endpoint,
            data=values,
            headers=self.get_header(),
        )

        # Check response
        response.raise_for_status()

        # Return as JSON
        return response.json()

    def _check_flash_entropy_kwargs(self, fe_kwargs):
        """
        Check FlashEntropy keyword arguments for internal consistency.

        Parameters
        ----------
        fe_kwargs : dict
            Keyword arguments for FlashEntropy search.

        Raises
        ------
        ValueError
            If only one of "min_ms2_difference_in_da" / "max_ms2_tolerance_in_da"
            is present, or if min_ms2_difference_in_da != 2 * max_ms2_tolerance_in_da.
        """
        # If either key is given, both must be given and satisfy
        # min_ms2_difference_in_da == 2 * max_ms2_tolerance_in_da
        if (
            "min_ms2_difference_in_da" in fe_kwargs
            or "max_ms2_tolerance_in_da" in fe_kwargs
        ):
            if (
                "min_ms2_difference_in_da" not in fe_kwargs
                or "max_ms2_tolerance_in_da" not in fe_kwargs
            ):
                raise ValueError(
                    "Both 'min_ms2_difference_in_da' and 'max_ms2_tolerance_in_da' must be specified."
                )
            if (
                fe_kwargs["min_ms2_difference_in_da"]
                != 2 * fe_kwargs["max_ms2_tolerance_in_da"]
            ):
                raise ValueError(
                    "The values of 'min_ms2_difference_in_da' must be exactly 2x 'max_ms2_tolerance_in_da'."
                )

    def _get_format_func(self, format):
        """
        Obtain format function by key (case-insensitive).

        Parameters
        ----------
        format : str
            Name (or alias) of a supported output format.

        Returns
        -------
        func
            Formatting function.

        Raises
        ------
        ValueError
            If `format` is not a supported format.
        """
        # EAFP: a single lookup instead of membership test + lookup
        try:
            return self.format_map[format.lower()]
        except KeyError:
            raise ValueError(("{} not a supported format.").format(format)) from None

    def _dict_to_dataclass(self, metabref_lib, data_class):
        """
        Convert dictionary to dataclass.

        Notes
        -----
        This function will pull the attributes of a dataclass and its parent
        class and convert the dictionary to a dataclass instance with the
        appropriate attributes; missing keys are filled with None, extra keys
        are dropped.

        Parameters
        ----------
        data_class : :obj:`~dataclasses.dataclass`
            Dataclass to convert to.
        metabref_lib : dict
            Metabref dictionary object to convert to dataclass.

        Returns
        -------
        :obj:`~dataclasses.dataclass`
            Dataclass instance.
        """
        # Get list of expected attributes of data_class
        data_class_keys = list(data_class.__annotations__.keys())

        # If data_class inherits from another (non-object) class, include the
        # parent's annotated attributes as well
        if len(data_class.__mro__) > 2:
            parent_class_keys = list(data_class.__bases__[0].__annotations__.keys())
            data_class_keys = list(set(data_class_keys + parent_class_keys))

        # Keep only keys that belong to the data_class
        input_dict = {k: v for k, v in metabref_lib.items() if k in data_class_keys}

        # Fill keys missing from the input dictionary with None
        for key in data_class_keys:
            if key not in input_dict.keys():
                input_dict[key] = None
        return data_class(**input_dict)

    @staticmethod
    def normalize_peaks(arr):
        """
        Normalize peak abundances so that they sum to 1.

        Notes
        -----
        Operates in place: the input array's abundance column is modified;
        the same array is also returned for convenience.

        Parameters
        ----------
        arr : :obj:`~numpy.array`
            Array of shape (N, 2), with m/z in the first column and abundance
            in the second.

        Returns
        -------
        :obj:`~numpy.array`
            Normalized array of shape (N, 2), with m/z in the first column and
            normalized abundance in the second.
        """
        # Normalize the abundance (last) column in place
        arr[:, -1] = arr[:, -1] / arr[:, -1].sum()

        return arr

    @staticmethod
    def _build_flash_entropy_index(fe_lib, fe_kwargs=None, clean_spectra=True):
        """
        Build FlashEntropy index.

        Parameters
        ----------
        fe_lib : list
            List of spectra to build index from. Can be a list of dictionaries
            or a FlashEntropy search instance.
        fe_kwargs : dict, optional
            Keyword arguments for FlashEntropy search; keys not recognized by
            the constructor or `build_index` are ignored.
        clean_spectra : bool, optional
            Clean spectra before building index. Default is True.

        Returns
        -------
        :obj:`~ms_entropy.FlashEntropySearch`
            FlashEntropy search instance.
        """
        # Avoid a shared mutable default argument
        if fe_kwargs is None:
            fe_kwargs = {}

        # Keywords accepted by the FlashEntropySearch constructor
        init_keys = {
            "max_ms2_tolerance_in_da",
            "mz_index_step",
            "low_memory",
            "path_data",
        }
        fes = FlashEntropySearch(
            **{k: v for k, v in fe_kwargs.items() if k in init_keys}
        )

        # Keywords accepted by FlashEntropySearch.build_index
        index_keys = {
            "max_indexed_mz",
            "precursor_ions_removal_da",
            "noise_threshold",
            "min_ms2_difference_in_da",
            "max_peak_num",
        }
        fes.build_index(
            fe_lib,
            **{k: v for k, v in fe_kwargs.items() if k in index_keys},
            clean_spectra=clean_spectra,
        )

        return fes
Base class that facilitates connection to spectral reference databases, such as EMSL's Metabolomics Reference Database (MetabRef).
29 def __init__(self, key=None): 30 """ 31 Initialize instance. 32 33 Parameters 34 ---------- 35 key : str 36 Token key. 37 38 """ 39 40 self.key = key
Initialize instance.
Parameters
- key (str): Token key.
42 def set_token(self, path): 43 """ 44 Set environment variable for MetabRef database token. 45 46 Parameters 47 ---------- 48 path : str 49 Path to token. 50 51 """ 52 53 # Read token from file 54 with open(path, "r", encoding="utf-8") as f: 55 token = f.readline().strip() 56 57 # Set environment variable 58 os.environ[self.key] = token
Set environment variable for MetabRef database token.
Parameters
- path (str): Path to token.
60 def get_token(self): 61 """ 62 Get environment variable for database token. 63 64 Returns 65 ------- 66 str 67 Token string. 68 69 """ 70 71 # Check for token 72 if self.key not in os.environ: 73 raise ValueError("Must set {} environment variable.".format(self.key)) 74 75 # Get token from environment variables 76 return os.environ.get(self.key)
Get environment variable for database token.
Returns
- str: Token string.
78 def get_header(self): 79 """ 80 Access stored database token and prepare as header. 81 82 Returns 83 ------- 84 str 85 Header string. 86 87 """ 88 89 # Get token 90 token = self.get_token() 91 92 # Pad header information 93 header = {"Authorization": f"Bearer {token}", "Content-Type": "text/plain"} 94 95 return header
Access stored database token and prepare as header.
Returns
- str: Header string.
97 def get_query(self, url, use_header=True): 98 """ 99 Request payload from URL according to `get` protocol. 100 101 Parameters 102 ---------- 103 url : str 104 URL for request. 105 use_header: bool 106 Whether or not the query should include the header 107 108 Returns 109 ------- 110 dict 111 Response as JSON. 112 113 """ 114 115 # Query URL via `get` 116 if use_header: 117 response = requests.get(url, headers=self.get_header()) 118 else: 119 response = requests.get(url) 120 121 # Check response 122 response.raise_for_status() 123 124 # Return as JSON 125 return response.json()
Request payload from URL according to the `get` protocol.
Parameters
- url (str): URL for request.
- use_header (bool): Whether or not the query should include the header
Returns
- dict: Response as JSON.
127 def post_query(self, url, variable, values, tolerance): 128 """ 129 Request payload from URL according to `post` protocol. 130 131 Parameters 132 ---------- 133 url : str 134 URL for request. 135 variable : str 136 Variable to query. 137 values : str 138 Specific values of `variable` to query. 139 tolerance : str 140 Query tolerance relative to `values`. 141 142 Returns 143 ------- 144 dict 145 Response as JSON. 146 147 """ 148 149 # Coerce to string 150 if not isinstance(variable, str): 151 variable = str(variable).replace(" ", "") 152 153 if not isinstance(values, str): 154 values = str(values).replace(" ", "") 155 156 if not isinstance(tolerance, str): 157 tolerance = str(tolerance).replace(" ", "") 158 159 # Query URL via `post` 160 response = requests.post( 161 os.path.join(url, variable, tolerance), 162 data=values, 163 headers=self.get_header(), 164 ) 165 166 # Check response 167 response.raise_for_status() 168 169 # Return as JSON 170 return response.json()
Request payload from URL according to the `post` protocol.
Parameters
- url (str): URL for request.
- variable (str): Variable to query.
- values (str): Specific values of `variable` to query.
- tolerance (str): Query tolerance relative to `values`.
Returns
- dict: Response as JSON.
265 @staticmethod 266 def normalize_peaks(arr): 267 """ 268 Normalize peaks in an array. 269 270 Parameters 271 ---------- 272 arr : :obj:`~numpy.array` 273 Array of shape (N, 2), with m/z in the first column and abundance in 274 the second. 275 276 Returns 277 ------- 278 :obj:`~numpy.array` 279 Normalized array of shape (N, 2), with m/z in the first column and 280 normalized abundance in the second. 281 """ 282 # Normalize the array 283 arr[:, -1] = arr[:, -1] / arr[:, -1].sum() 284 285 return arr
Normalize peaks in an array.
Parameters
- arr (numpy.array): Array of shape (N, 2), with m/z in the first column and abundance in the second.

Returns

- numpy.array: Normalized array of shape (N, 2), with m/z in the first column and normalized abundance in the second.
class MetabRefInterface(SpectralDatabaseInterface):
    """
    Interface to the Metabolomics Reference Database.
    """

    def __init__(self):
        """
        Initialize instance.
        """
        super().__init__(key=None)

    def spectrum_to_array(self, spectrum, normalize=True):
        """
        Convert MetabRef-formatted spectrum to array.

        Parameters
        ----------
        spectrum : str
            MetabRef spectrum, i.e. list of (m/z,abundance) pairs.
        normalize : bool
            Normalize the spectrum by its magnitude.

        Returns
        -------
        :obj:`~numpy.array`
            Array of shape (N, 2), with m/z in the first column and abundance
            in the second.
        """
        # Extract every "(mz,abundance)" pair from the string, then cast to
        # float and reshape into an (N, 2) array
        pairs = re.findall(r"\(([^,]+),([^)]+)\)", spectrum)
        arr = np.array(pairs, dtype=float).reshape(-1, 2)

        return self.normalize_peaks(arr) if normalize else arr

    def _to_flashentropy(self, metabref_lib, normalize=True, fe_kwargs={}):
        """
        Convert metabref-formatted library to FlashEntropy library.

        Parameters
        ----------
        metabref_lib : dict
            MetabRef MS2 library in JSON format or FlashEntropy search instance
            (for reformatting at different MS2 separation).
        normalize : bool
            Normalize each spectrum by its magnitude.
        fe_kwargs : dict, optional
            Keyword arguments for instantiation of FlashEntropy search and
            building index for FlashEntropy search; any keys not recognized
            will be ignored. By default, all parameters set to defaults.

        Returns
        -------
        :obj:`~ms_entropy.FlashEntropySearch`
            MS2 library as FlashEntropy search instance.

        Raises
        ------
        ValueError
            If "min_ms2_difference_in_da" or "max_ms2_tolerance_in_da" are
            present in `fe_kwargs` and they are not consistent.
        """
        self._check_flash_entropy_kwargs(fe_kwargs)

        # Accumulate reformatted spectra here
        fe_lib = []

        for source in metabref_lib:
            # Spectrum payload may be nested under "spectrum_data"
            spectrum = source.get("spectrum_data", source)

            # FlashEntropy expects the key "precursor_mz"
            if "precursor_mz" not in spectrum:
                spectrum["precursor_mz"] = spectrum.pop("precursor_ion")

            # Parse the (mz, abundance) string into an array, store as `peaks`
            spectrum["peaks"] = self.spectrum_to_array(
                spectrum["mz"], normalize=normalize
            )

            fe_lib.append(spectrum)

        # Build and return the FlashEntropy index
        return self._build_flash_entropy_index(fe_lib, fe_kwargs=fe_kwargs)

    def get_query(self, url, use_header=False):
        """Overwrites the get_query method on the parent class to default to not use a header

        Notes
        -----
        As of January 2025, the metabref database no longer requires a token and therefore no header is needed

        """
        return super().get_query(url, use_header)
Interface to the Metabolomics Reference Database.
337 def __init__(self): 338 """ 339 Initialize instance. 340 341 """ 342 343 super().__init__(key=None)
Initialize instance.
345 def spectrum_to_array(self, spectrum, normalize=True): 346 """ 347 Convert MetabRef-formatted spectrum to array. 348 349 Parameters 350 ---------- 351 spectrum : str 352 MetabRef spectrum, i.e. list of (m/z,abundance) pairs. 353 normalize : bool 354 Normalize the spectrum by its magnitude. 355 356 Returns 357 ------- 358 :obj:`~numpy.array` 359 Array of shape (N, 2), with m/z in the first column and abundance in 360 the second. 361 362 """ 363 364 # Convert parenthesis-delimited string to array 365 arr = np.array( 366 re.findall(r"\(([^,]+),([^)]+)\)", spectrum), dtype=float 367 ).reshape(-1, 2) 368 369 if normalize: 370 arr = self.normalize_peaks(arr) 371 372 return arr
Convert MetabRef-formatted spectrum to array.
Parameters
- spectrum (str): MetabRef spectrum, i.e. list of (m/z,abundance) pairs.
- normalize (bool): Normalize the spectrum by its magnitude.
Returns
- numpy.array: Array of shape (N, 2), with m/z in the first column and abundance in the second.
429 def get_query(self, url, use_header=False): 430 """Overwrites the get_query method on the parent class to default to not use a header 431 432 Notes 433 ----- 434 As of January 2025, the metabref database no longer requires a token and therefore no header is needed 435 436 """ 437 return super().get_query(url, use_header)
Overwrites the get_query method on the parent class to default to not use a header
Notes
As of January 2025, the metabref database no longer requires a token and therefore no header is needed
Inherited Members
class MetabRefGCInterface(MetabRefInterface):
    """
    Interface to the Metabolomics Reference Database.
    """

    def __init__(self):
        """
        Initialize instance.
        """
        super().__init__()
        # API endpoints for the GC-MS library and the FAMEs retention library
        self.GCMS_LIBRARY_URL = "https://metabref.emsl.pnnl.gov/api/mslevel/1"
        self.FAMES_URL = "https://metabref.emsl.pnnl.gov/api/fames"

        self.__init_format_map__()

    def __init_format_map__(self):
        """
        Initialize database format mapper, enabling multiple format requests.
        """
        # Define format workflows; all take (payload, normalize, fe_kwargs)
        self.format_map = {
            "json": lambda x, normalize, fe_kwargs: x,
            "dict": lambda x, normalize, fe_kwargs: (
                self._to_LowResolutionEICompound_dict(x, normalize)
            ),
            "sql": lambda x, normalize, fe_kwargs: (
                self._LowResolutionEICompound_dict_to_sqlite(
                    self._to_LowResolutionEICompound_dict(x, normalize)
                )
            ),
        }

        # Add aliases
        self.format_map["metabref"] = self.format_map["json"]
        self.format_map["datadict"] = self.format_map["dict"]
        self.format_map["data-dict"] = self.format_map["dict"]
        self.format_map["lowreseicompound"] = self.format_map["dict"]
        self.format_map["lowres"] = self.format_map["dict"]
        self.format_map["lowresgc"] = self.format_map["dict"]
        self.format_map["sqlite"] = self.format_map["sql"]

    def available_formats(self):
        """
        View list of available formats.

        Returns
        -------
        list
            Format map keys.
        """
        return list(self.format_map.keys())

    def get_library(self, format="json", normalize=False):
        """
        Request MetabRef GC/MS library.

        Parameters
        ----------
        format : str
            Format of requested library, i.e. "json", "sql", "flashentropy".
            See `available_formats` method for aliases.
        normalize : bool
            Normalize the spectrum by its magnitude.

        Returns
        -------
        Library in requested format.
        """
        # Init format function
        format_func = self._get_format_func(format)

        return format_func(
            self.get_query(self.GCMS_LIBRARY_URL)["GC-MS"], normalize, {}
        )

    def get_fames(self, format="json", normalize=False):
        """
        Request MetabRef GC/MS FAMEs library.

        Parameters
        ----------
        format : str
            Format of requested library, i.e. "json", "sql", "flashentropy".
            See `available_formats` method for aliases.
        normalize : bool
            Normalize the spectrum by its magnitude.

        Returns
        -------
        Library in requested format.
        """
        # Init format function
        format_func = self._get_format_func(format)

        return format_func(self.get_query(self.FAMES_URL)["GC-MS"], normalize, {})

    def _to_LowResolutionEICompound_dict(self, metabref_lib, normalize=False):
        """
        Convert MetabRef-formatted library to CoreMS LowResolutionEICompound-formatted
        dictionary for local ingestion.

        Parameters
        ----------
        metabref_lib : dict
            MetabRef GC-MS library in JSON format.
        normalize : bool
            Normalize each spectrum by its magnitude.

        Returns
        -------
        list of dict
            List of each spectrum contained in dictionary.
        """
        # All below key:value lookups are based on CoreMS class definitions
        # NOT MetabRef content. For example, MetabRef has keys for PubChem,
        # USI, etc. that are not considered below.

        # Dictionary to map metabref keys to corems metadata keys
        metadatar_cols = {
            "casno": "cas",
            "inchikey": "inchikey",
            "inchi": "inchi",
            "chebi": "chebi",
            "smiles": "smiles",
            "kegg": "kegg",
            "iupac_name": "iupac_name",
            "traditional_name": "traditional_name",  # Not present in metabref
            "common_name": "common_name",  # Not present in metabref
        }

        # Dictionary to map metabref keys to corems compound keys
        lowres_ei_compound_cols = {
            "id": "metabref_id",
            "molecule_name": "name",  # Is this correct?
            "classify": "classify",  # Not present in metabref
            "formula": "formula",
            "ri": "ri",
            "rt": "retention_time",
            "source": "source",  # Not present in metabref
            "casno": "casno",
            "comments": "comment",
            "source_temp_c": "source_temp_c",  # Not present in metabref
            "ev": "ev",  # Not present in metabref
            "peak_count": "peaks_count",
            "mz": "mz",
            "abundance": "abundance",
        }

        # Local result container
        corems_lib = []

        # Enumerate spectra
        for i, source_ in enumerate(metabref_lib):
            # Copy source to prevent modification of caller's data
            source = source_.copy()

            # Flatten source dict: spectrum_data keys merged at top level
            source = source.pop("spectrum_data") | source

            # Parse target data, renaming to CoreMS keys
            target = {
                lowres_ei_compound_cols[k]: v
                for k, v in source.items()
                if k in lowres_ei_compound_cols
            }

            # Explicitly add this to connect with LowResCompoundRef later
            target["rt"] = source["rt"]

            # Parse (mz, abundance) string into separate columns
            arr = self.spectrum_to_array(target["mz"], normalize=normalize)
            target["mz"] = arr[:, 0]
            target["abundance"] = arr[:, 1]

            # Parse metadata into a nested dict
            target["metadata"] = {
                metadatar_cols[k]: v for k, v in source.items() if k in metadatar_cols
            }

            # Carry over any remaining keys verbatim
            for k in source:
                if k not in lowres_ei_compound_cols:
                    target[k] = source[k]

            # Add to CoreMS list
            corems_lib.append(target)

        return corems_lib

    def _LowResolutionEICompound_dict_to_sqlite(
        self, lowres_ei_compound_dict, url="sqlite://"
    ):
        """
        Convert CoreMS LowResolutionEICompound-formatted dictionary to SQLite
        database for local ingestion.

        Parameters
        ----------
        lowres_ei_compound_dict : dict
            CoreMS GC-MS library formatted for LowResolutionEICompound.
        url : str
            URL to SQLite prefix.

        Returns
        -------
        sqlite database
            Spectra contained in SQLite database.
        """
        # Dictionary to map corems keys to all-caps keys
        capped_cols = {
            "name": "NAME",
            "formula": "FORM",
            "ri": "RI",
            "retention_time": "RT",
            "source": "SOURCE",
            "casno": "CASNO",
            "comment": "COMMENT",
            "peaks_count": "NUM PEAKS",
        }

        # Initialize SQLite object
        sqlite_obj = EI_LowRes_SQLite(url=url)

        # Iterate spectra
        for _data_dict in lowres_ei_compound_dict:
            # Copy source to prevent modification of caller's data
            data_dict = _data_dict.copy()

            # Add missing capped values (keeps both lowercase and capped keys)
            for k, v in capped_cols.items():
                if k in data_dict:
                    data_dict[v] = data_dict[k]

            # Parse number of peaks from the mz array when absent
            # NOTE(review): assumes "mz" is present when "NUM PEAKS" is not — confirm
            if not data_dict.get("NUM PEAKS"):
                data_dict["NUM PEAKS"] = len(data_dict.get("mz"))

            # Parse CAS number, defaulting to 0 when unavailable
            if not data_dict.get("CASNO"):
                data_dict["CASNO"] = data_dict.get("CAS")

            if not data_dict["CASNO"]:
                data_dict["CASNO"] = 0

            # Build linked metadata table
            if "metadata" in data_dict:
                if len(data_dict["metadata"]) > 0:
                    data_dict["metadatar"] = Metadatar(**data_dict.pop("metadata"))
                else:
                    data_dict.pop("metadata")

            # Attempt addition to sqlite; report the offending compound name
            # on failure without aborting the whole ingest. Narrowed from a
            # bare `except:` so SystemExit/KeyboardInterrupt still propagate.
            try:
                sqlite_obj.add_compound(data_dict)
            except Exception:
                print(data_dict["NAME"])

        return sqlite_obj
Interface to the Metabolomics Reference Database.
445 def __init__(self): 446 """ 447 Initialize instance. 448 449 """ 450 451 super().__init__() 452 self.GCMS_LIBRARY_URL = "https://metabref.emsl.pnnl.gov/api/mslevel/1" 453 self.FAMES_URL = "https://metabref.emsl.pnnl.gov/api/fames" 454 455 self.__init_format_map__()
Initialize instance.
485 def available_formats(self): 486 """ 487 View list of available formats. 488 489 Returns 490 ------- 491 list 492 Format map keys. 493 """ 494 495 return list(self.format_map.keys())
View list of available formats.
Returns
- list: Format map keys.
497 def get_library(self, format="json", normalize=False): 498 """ 499 Request MetabRef GC/MS library. 500 501 Parameters 502 ---------- 503 format : str 504 Format of requested library, i.e. "json", "sql", "flashentropy". 505 See `available_formats` method for aliases. 506 normalize : bool 507 Normalize the spectrum by its magnitude. 508 509 Returns 510 ------- 511 Library in requested format. 512 513 """ 514 515 # Init format function 516 format_func = self._get_format_func(format) 517 518 return format_func( 519 self.get_query(self.GCMS_LIBRARY_URL)["GC-MS"], normalize, {} 520 )
Request MetabRef GC/MS library.
Parameters
- format (str): Format of requested library, i.e. "json", "sql", "flashentropy". See `available_formats` method for aliases.
- normalize (bool): Normalize the spectrum by its magnitude.
Returns
- Library in requested format.
def get_fames(self, format="json", normalize=False):
    """
    Fetch the MetabRef GC/MS FAMEs library.

    Parameters
    ----------
    format : str
        Format of requested library, i.e. "json", "sql", "flashentropy".
        See `available_formats` method for aliases.
    normalize : bool
        Normalize the spectrum by its magnitude.

    Returns
    -------
    Library in requested format.
    """
    # Resolve the converter before querying so a bad format fails fast.
    converter = self._get_format_func(format)
    raw_library = self.get_query(self.FAMES_URL)["GC-MS"]
    return converter(raw_library, normalize, {})
Request MetabRef GC/MS FAMEs library.
Parameters
- format (str): Format of requested library, i.e. "json", "sql", "flashentropy". See `available_formats` method for aliases.
- normalize (bool): Normalize the spectrum by its magnitude.
Returns
- Library in requested format.
class MetabRefLCInterface(MetabRefInterface):
    """
    Interface to the Metabolomics Reference Database for LC-MS data.
    """

    def __init__(self):
        """
        Initialize instance.
        """
        super().__init__()

        # API endpoint for precursor m/z search
        # inputs = mz, tolerance (in Da), polarity, page_no, per_page
        self.PRECURSOR_MZ_URL = "https://metabref.emsl.pnnl.gov/api/precursors/m/{}/t/{}/{}?page={}&per_page={}"

        # API endpoint for returning full list of precursor m/z values in database
        # inputs = polarity, page_no, per_page
        self.PRECURSOR_MZ_ALL_URL = (
            "https://metabref.emsl.pnnl.gov/api/precursors/{}?page={}&per_page={}"
        )

        self.__init_format_map__()

    def __init_format_map__(self):
        """
        Initialize database format mapper, enabling multiple format requests.
        """
        # Define format workflows; each takes (payload, normalize, fe_kwargs)
        self.format_map = {
            "json": lambda x, normalize, fe_kwargs: x,
            "flashentropy": lambda x, normalize, fe_kwargs: self._to_flashentropy(
                x, normalize, fe_kwargs
            ),
        }

        # Add aliases
        self.format_map["metabref"] = self.format_map["json"]
        self.format_map["fe"] = self.format_map["flashentropy"]
        self.format_map["flash-entropy"] = self.format_map["flashentropy"]

    def query_by_precursor(
        self, mz_list, polarity, mz_tol_ppm, mz_tol_da_api=0.2, max_per_page=50
    ):
        """
        Query MetabRef by precursor m/z values.

        Parameters
        ----------
        mz_list : list
            List of precursor m/z values.
        polarity : str
            Ionization polarity, either "positive" or "negative".
        mz_tol_ppm : float
            Tolerance in ppm for each precursor m/z value.
            Used for retrieving from a potential match from database.
        mz_tol_da_api : float, optional
            Maximum tolerance between precursor m/z values for API search, in daltons.
            Used to group similar mzs into a single API query for speed. Default is 0.2.
        max_per_page : int, optional
            Maximum records to return from MetabRef API query at a time. Default is 50.

        Returns
        -------
        list
            List of library entries in original JSON format.

        Raises
        ------
        ValueError
            If `polarity` is not "positive" or "negative".
        """
        # If polarity is anything other than positive or negative, raise error
        if polarity not in ["positive", "negative"]:
            raise ValueError("Polarity must be 'positive' or 'negative'")

        # Cluster groups of mz according to mz_tol_da_api for precursor query
        mz_list.sort()
        mz_groups = [[mz_list[0]]]
        for x in mz_list[1:]:
            if abs(x - mz_groups[-1][0]) <= mz_tol_da_api:
                mz_groups[-1].append(x)
            else:
                mz_groups.append([x])

        # Query MetabRef for each mz group
        lib = []
        for mz_group in mz_groups:
            if len(mz_group) == 1:
                mz = mz_group[0]
                tol = mz_tol_ppm * 10**-6 * mz
            else:
                # Search at the midpoint of the group, widening the tolerance
                # by the group half-width so every member is covered.
                mz = (max(mz_group) - min(mz_group)) / 2 + min(mz_group)
                # BUGFIX: was `mz_tol_ppm**-6` (ppm raised to the -6th power);
                # the intended scaling is ppm * 1e-6, as in the single-mz branch.
                tol = (max(mz_group) - min(mz_group)) / 2 + mz_tol_ppm * 10**-6 * max(
                    mz_group
                )

            # Get first page of results
            response = self.get_query(
                self.PRECURSOR_MZ_URL.format(
                    str(mz), str(tol), polarity, 1, max_per_page
                )
            )
            lib = lib + response["results"]

            # If there are more pages of results, get them
            if response["total_pages"] > 1:
                for i in np.arange(2, response["total_pages"] + 1):
                    lib = (
                        lib
                        + self.get_query(
                            self.PRECURSOR_MZ_URL.format(
                                str(mz), str(tol), polarity, i, max_per_page
                            )
                        )["results"]
                    )

        return lib

    def request_all_precursors(self, polarity, per_page=50000):
        """
        Request all precursor m/z values for MS2 spectra from MetabRef.

        Parameters
        ----------
        polarity : str
            Ionization polarity, either "positive" or "negative".
        per_page : int, optional
            Number of records to fetch per call. Default is 50000.

        Returns
        -------
        list
            List of all unique precursor m/z values, sorted ascending.

        Raises
        ------
        ValueError
            If `polarity` is not "positive" or "negative".
        """
        # If polarity is anything other than positive or negative, raise error
        if polarity not in ["positive", "negative"]:
            raise ValueError("Polarity must be 'positive' or 'negative'")

        precursors = []

        # Get first page of results and total number of pages of results
        response = self.get_query(
            self.PRECURSOR_MZ_ALL_URL.format(polarity, str(1), str(per_page))
        )
        total_pages = response["total_pages"]
        precursors.extend([x["precursor_ion"] for x in response["results"]])

        # Go through remaining pages of results
        for i in np.arange(2, total_pages + 1):
            response = self.get_query(
                self.PRECURSOR_MZ_ALL_URL.format(polarity, str(i), str(per_page))
            )
            precursors.extend([x["precursor_ion"] for x in response["results"]])

        # Sort precursors from smallest to largest and remove duplicates
        precursors = list(set(precursors))
        precursors.sort()

        return precursors

    def get_lipid_library(
        self,
        mz_list,
        polarity,
        mz_tol_ppm,
        mz_tol_da_api=0.2,
        format="json",
        normalize=True,
        fe_kwargs={},
    ):
        """
        Request MetabRef lipid library.

        Parameters
        ----------
        mz_list : list
            List of precursor m/z values.
        polarity : str
            Ionization polarity, either "positive" or "negative".
        mz_tol_ppm : float
            Tolerance in ppm for each precursor m/z value.
            Used for retrieving from a potential match from database.
        mz_tol_da_api : float, optional
            Maximum tolerance between precursor m/z values for API search, in daltons.
            Used to group similar mzs into a single API query for speed. Default is 0.2.
        format : str, optional
            Format of requested library, i.e. "json", "flashentropy".
            See `available_formats` method for aliases. Default is "json".
        normalize : bool, optional
            Normalize the spectrum by its magnitude. Default is True.
        fe_kwargs : dict, optional
            Keyword arguments for FlashEntropy search. Default is {}.

        Returns
        -------
        tuple
            Library in requested format and lipid metadata as a LipidMetadata dataclass.
        """
        mz_list.sort()
        mz_list = np.array(mz_list)

        # Get all precursors in the library matching the polarity
        precursors_in_lib = np.array(self.request_all_precursors(polarity=polarity))

        # Compare the mz_list with the precursors in the library; keep library
        # precursors whose closest observed m/z is within mz_tol_ppm
        lib_mz_df = pd.DataFrame(precursors_in_lib, columns=["lib_mz"])
        lib_mz_df["closest_obs_mz"] = mz_list[
            find_closest(mz_list, lib_mz_df.lib_mz.values)
        ]
        lib_mz_df["mz_diff_ppm"] = np.abs(
            (lib_mz_df["lib_mz"] - lib_mz_df["closest_obs_mz"])
            / lib_mz_df["lib_mz"]
            * 1e6
        )
        lib_mz_sub = lib_mz_df[lib_mz_df["mz_diff_ppm"] <= mz_tol_ppm]

        # Do the same in the opposite direction
        mz_df = pd.DataFrame(mz_list, columns=["mass_feature_mz"])
        mz_df["closest_lib_pre_mz"] = precursors_in_lib[
            find_closest(precursors_in_lib, mz_df.mass_feature_mz.values)
        ]
        mz_df["mz_diff_ppm"] = np.abs(
            (mz_df["mass_feature_mz"] - mz_df["closest_lib_pre_mz"])
            / mz_df["mass_feature_mz"]
            * 1e6
        )
        mz_df_sub = mz_df[mz_df["mz_diff_ppm"] <= mz_tol_ppm]

        # Query with whichever direction produced the fewer m/z values
        if len(lib_mz_sub) < len(mz_df_sub):
            mzs_to_query = lib_mz_sub.lib_mz.values
        else:
            mzs_to_query = mz_df_sub.mass_feature_mz.values

        # Retrieve the spectra and metadata for the matching precursors
        lib = self.query_by_precursor(
            mz_list=mzs_to_query,
            polarity=polarity,
            mz_tol_ppm=mz_tol_ppm,
            mz_tol_da_api=mz_tol_da_api,
        )

        # Pull out lipid metadata from the metabref library and convert to LipidMetadata dataclass
        mol_data_dict = {x["id"]: x["Molecular Data"] for x in lib}
        lipid_lib = {x["id"]: x["Lipid Tree"] for x in lib if "Lipid Tree" in x.keys()}
        # BUGFIX: use .get() so entries without a "Lipid Tree" key do not raise
        mol_data_dict = {
            k: {**v, **lipid_lib.get(k, {})} for k, v in mol_data_dict.items()
        }
        mol_data_dict = {
            k: self._dict_to_dataclass(v, LipidMetadata)
            for k, v in mol_data_dict.items()
        }

        # Remove lipid metadata from the metabref library
        lib = [
            {k: v for k, v in x.items() if k not in ["Molecular Data", "Lipid Tree"]}
            for x in lib
        ]
        # Unpack the 'Lipid Fragments' and 'MSO Data' keys from each entry
        for x in lib:
            if "Lipid Fragments" in x.keys():
                x.update(x.pop("Lipid Fragments"))
            if "MSO Data" in x.keys():
                x.update(x.pop("MSO Data"))

        # Format the spectral library
        format_func = self._get_format_func(format)
        lib = format_func(lib, normalize=normalize, fe_kwargs=fe_kwargs)
        return (lib, mol_data_dict)
Interface to the Metabolomics Reference Database for LC-MS data.
def __init__(self):
    """Set up LC-MS MetabRef endpoints and the format dispatch table."""
    super().__init__()

    # Precursor m/z search endpoint; placeholders are
    # (mz, tolerance in Da, polarity, page_no, per_page).
    self.PRECURSOR_MZ_URL = "https://metabref.emsl.pnnl.gov/api/precursors/m/{}/t/{}/{}?page={}&per_page={}"

    # Endpoint listing every precursor m/z in the database; placeholders
    # are (polarity, page_no, per_page).
    self.PRECURSOR_MZ_ALL_URL = (
        "https://metabref.emsl.pnnl.gov/api/precursors/{}?page={}&per_page={}"
    )

    # Populate self.format_map with the supported output formats.
    self.__init_format_map__()
Initialize instance.
def query_by_precursor(
    self, mz_list, polarity, mz_tol_ppm, mz_tol_da_api=0.2, max_per_page=50
):
    """
    Query MetabRef by precursor m/z values.

    Parameters
    ----------
    mz_list : list
        List of precursor m/z values.
    polarity : str
        Ionization polarity, either "positive" or "negative".
    mz_tol_ppm : float
        Tolerance in ppm for each precursor m/z value.
        Used for retrieving from a potential match from database.
    mz_tol_da_api : float, optional
        Maximum tolerance between precursor m/z values for API search, in daltons.
        Used to group similar mzs into a single API query for speed. Default is 0.2.
    max_per_page : int, optional
        Maximum records to return from MetabRef API query at a time. Default is 50.

    Returns
    -------
    list
        List of library entries in original JSON format.

    Raises
    ------
    ValueError
        If `polarity` is not "positive" or "negative".
    """
    # If polarity is anything other than positive or negative, raise error
    if polarity not in ["positive", "negative"]:
        raise ValueError("Polarity must be 'positive' or 'negative'")

    # Cluster groups of mz according to mz_tol_da_api for precursor query
    mz_list.sort()
    mz_groups = [[mz_list[0]]]
    for x in mz_list[1:]:
        if abs(x - mz_groups[-1][0]) <= mz_tol_da_api:
            mz_groups[-1].append(x)
        else:
            mz_groups.append([x])

    # Query MetabRef for each mz group
    lib = []
    for mz_group in mz_groups:
        if len(mz_group) == 1:
            mz = mz_group[0]
            tol = mz_tol_ppm * 10**-6 * mz
        else:
            # Search at the midpoint of the group, widening the tolerance
            # by the group half-width so every member is covered.
            mz = (max(mz_group) - min(mz_group)) / 2 + min(mz_group)
            # BUGFIX: was `mz_tol_ppm**-6` (ppm raised to the -6th power);
            # the intended scaling is ppm * 1e-6, as in the single-mz branch.
            tol = (max(mz_group) - min(mz_group)) / 2 + mz_tol_ppm * 10**-6 * max(
                mz_group
            )

        # Get first page of results
        response = self.get_query(
            self.PRECURSOR_MZ_URL.format(
                str(mz), str(tol), polarity, 1, max_per_page
            )
        )
        lib = lib + response["results"]

        # If there are more pages of results, get them
        if response["total_pages"] > 1:
            for i in np.arange(2, response["total_pages"] + 1):
                lib = (
                    lib
                    + self.get_query(
                        self.PRECURSOR_MZ_URL.format(
                            str(mz), str(tol), polarity, i, max_per_page
                        )
                    )["results"]
                )

    return lib
Query MetabRef by precursor m/z values.
Parameters
- mz_list (list): List of precursor m/z values.
- polarity (str): Ionization polarity, either "positive" or "negative".
- mz_tol_ppm (float): Tolerance in ppm for each precursor m/z value. Used for retrieving from a potential match from database.
- mz_tol_da_api (float, optional): Maximum tolerance between precursor m/z values for API search, in daltons. Used to group similar mzs into a single API query for speed. Default is 0.2.
- max_per_page (int, optional): Maximum records to return from MetabRef API query at a time. Default is 50.
Returns
- list: List of library entries in original JSON format.
def request_all_precursors(self, polarity, per_page=50000):
    """
    Fetch every MS2 precursor m/z value stored in MetabRef.

    Parameters
    ----------
    polarity : str
        Ionization polarity, either "positive" or "negative".
    per_page : int, optional
        Number of records to fetch per call. Default is 50000.

    Returns
    -------
    list
        Sorted, de-duplicated precursor m/z values.

    Raises
    ------
    ValueError
        If `polarity` is not "positive" or "negative".
    """
    # Only the two supported ionization modes are valid.
    if polarity not in ["positive", "negative"]:
        raise ValueError("Polarity must be 'positive' or 'negative'")

    # The first call reports how many pages there are in total.
    first_page = self.get_query(
        self.PRECURSOR_MZ_ALL_URL.format(polarity, str(1), str(per_page))
    )
    collected = [entry["precursor_ion"] for entry in first_page["results"]]

    # Walk the remaining pages, accumulating precursor values.
    for page_no in range(2, first_page["total_pages"] + 1):
        page = self.get_query(
            self.PRECURSOR_MZ_ALL_URL.format(polarity, str(page_no), str(per_page))
        )
        collected.extend(entry["precursor_ion"] for entry in page["results"])

    # De-duplicate and return in ascending order.
    return sorted(set(collected))
Request all precursor m/z values for MS2 spectra from MetabRef.
Parameters
- polarity (str): Ionization polarity, either "positive" or "negative".
- per_page (int, optional): Number of records to fetch per call. Default is 50000.
Returns
- list: List of all precursor m/z values, sorted.
def get_lipid_library(
    self,
    mz_list,
    polarity,
    mz_tol_ppm,
    mz_tol_da_api=0.2,
    format="json",
    normalize=True,
    fe_kwargs={},
):
    """
    Request MetabRef lipid library.

    Parameters
    ----------
    mz_list : list
        List of precursor m/z values.
    polarity : str
        Ionization polarity, either "positive" or "negative".
    mz_tol_ppm : float
        Tolerance in ppm for each precursor m/z value.
        Used for retrieving from a potential match from database.
    mz_tol_da_api : float, optional
        Maximum tolerance between precursor m/z values for API search, in daltons.
        Used to group similar mzs into a single API query for speed. Default is 0.2.
    format : str, optional
        Format of requested library, i.e. "json", "flashentropy".
        See `available_formats` method for aliases. Default is "json".
    normalize : bool, optional
        Normalize the spectrum by its magnitude. Default is True.
    fe_kwargs : dict, optional
        Keyword arguments for FlashEntropy search. Default is {}.

    Returns
    -------
    tuple
        Library in requested format and lipid metadata as a LipidMetadata dataclass.
    """
    # Sort ascending; the closest-match comparisons below use sorted arrays.
    mz_list.sort()
    mz_list = np.array(mz_list)

    # Get all precursors in the library matching the polarity
    precusors_in_lib = self.request_all_precursors(polarity=polarity)
    precusors_in_lib = np.array(precusors_in_lib)

    # Compare the mz_list with the precursors in the library, keep any mzs that are within mz_tol of any precursor in the library
    lib_mz_df = pd.DataFrame(precusors_in_lib, columns=["lib_mz"])
    lib_mz_df["closest_obs_mz"] = mz_list[
        find_closest(mz_list, lib_mz_df.lib_mz.values)
    ]
    lib_mz_df["mz_diff_ppm"] = np.abs(
        (lib_mz_df["lib_mz"] - lib_mz_df["closest_obs_mz"])
        / lib_mz_df["lib_mz"]
        * 1e6
    )
    lib_mz_sub = lib_mz_df[lib_mz_df["mz_diff_ppm"] <= mz_tol_ppm]

    # Do the same in the opposite direction
    mz_df = pd.DataFrame(mz_list, columns=["mass_feature_mz"])
    mz_df["closest_lib_pre_mz"] = precusors_in_lib[
        find_closest(precusors_in_lib, mz_df.mass_feature_mz.values)
    ]
    mz_df["mz_diff_ppm"] = np.abs(
        (mz_df["mass_feature_mz"] - mz_df["closest_lib_pre_mz"])
        / mz_df["mass_feature_mz"]
        * 1e6
    )
    mz_df_sub = mz_df[mz_df["mz_diff_ppm"] <= mz_tol_ppm]

    # Evaluate which is fewer mzs - lib_mz_sub or mz_df_sub and use that as the input for next step
    if len(lib_mz_sub) < len(mz_df_sub):
        mzs_to_query = lib_mz_sub.lib_mz.values
    else:
        mzs_to_query = mz_df_sub.mass_feature_mz.values

    # Query the library for the precursors in the mz_list that are in the library to retrieve the spectra and metadata
    lib = self.query_by_precursor(
        mz_list=mzs_to_query,
        polarity=polarity,
        mz_tol_ppm=mz_tol_ppm,
        mz_tol_da_api=mz_tol_da_api,
    )

    # Pull out lipid metadata from the metabref library and convert to LipidMetadata dataclass
    # NOTE(review): lipid_lib[k] below raises KeyError for any entry without
    # a "Lipid Tree" key — confirm all returned entries include it.
    mol_data_dict = {x["id"]: x["Molecular Data"] for x in lib}
    lipid_lib = {x["id"]: x["Lipid Tree"] for x in lib if "Lipid Tree" in x.keys()}
    mol_data_dict = {k: {**v, **lipid_lib[k]} for k, v in mol_data_dict.items()}
    mol_data_dict = {
        k: self._dict_to_dataclass(v, LipidMetadata)
        for k, v in mol_data_dict.items()
    }

    # Remove lipid metadata from the metabref library
    lib = [
        {k: v for k, v in x.items() if k not in ["Molecular Data", "Lipid Tree"]}
        for x in lib
    ]
    # Unpack the 'Lipid Fragments' key and the 'MSO Data" key from each entry
    for x in lib:
        if "Lipid Fragments" in x.keys():
            x.update(x.pop("Lipid Fragments"))
        if "MSO Data" in x.keys():
            x.update(x.pop("MSO Data"))

    # Format the spectral library
    format_func = self._get_format_func(format)
    lib = format_func(lib, normalize=normalize, fe_kwargs=fe_kwargs)
    return (lib, mol_data_dict)
Request MetabRef lipid library.
Parameters
- mz_list (list): List of precursor m/z values.
- polarity (str): Ionization polarity, either "positive" or "negative".
- mz_tol_ppm (float): Tolerance in ppm for each precursor m/z value. Used for retrieving from a potential match from database.
- mz_tol_da_api (float, optional): Maximum tolerance between precursor m/z values for API search, in daltons. Used to group similar mzs into a single API query for speed. Default is 0.2.
- format (str, optional): Format of requested library, i.e. "json", "sql", "flashentropy". See `available_formats` method for aliases. Default is "json".
- normalize (bool, optional): Normalize the spectrum by its magnitude. Default is True.
- fe_kwargs (dict, optional): Keyword arguments for FlashEntropy search. Default is {}.
Returns
- tuple: Library in requested format and lipid metadata as a LipidMetadata dataclass.
class MSPInterface(SpectralDatabaseInterface):
    """
    Interface to parse NIST MSP files
    """

    def __init__(self, file_path):
        """
        Initialize instance.

        Parameters
        ----------
        file_path : str
            Path to a local MSP file.

        Attributes
        ----------
        file_path : str
            Path to the MSP file.
        _file_content : str
            Content of the MSP file.
        _data_frame : :obj:`~pandas.DataFrame`
            DataFrame of spectra from the MSP file with unaltered content.

        Raises
        ------
        FileNotFoundError
            If `file_path` does not exist.
        """
        # No API token is needed for local files.
        super().__init__(key=None)

        self.file_path = file_path
        if not os.path.exists(self.file_path):
            raise FileNotFoundError(
                f"File {self.file_path} does not exist. Please check the file path."
            )
        with open(self.file_path, "r") as f:
            self._file_content = f.read()

        self._data_frame = self._read_msp_file()
        self.__init_format_map__()

    def __init_format_map__(self):
        """
        Initialize database format mapper, enabling multiple format requests.
        """
        # x is a pandas dataframe similar to self._data_frame format
        # NOTE(review): _to_msp is not defined in this chunk — confirm it
        # exists on this class or a parent before using the "msp" format.
        self.format_map = {
            "msp": lambda x, normalize, fe_kwargs: self._to_msp(x, normalize),
            "flashentropy": lambda x, normalize, fe_kwargs: self._to_flashentropy(
                x, normalize, fe_kwargs
            ),
            "df": lambda x, normalize, fe_kwargs: self._to_df(x, normalize),
        }

        # Add aliases
        self.format_map["fe"] = self.format_map["flashentropy"]
        self.format_map["flash-entropy"] = self.format_map["flashentropy"]
        self.format_map["dataframe"] = self.format_map["df"]
        self.format_map["data-frame"] = self.format_map["df"]

    def _read_msp_file(self):
        """
        Parse the MSP file content into a DataFrame, one row per spectrum.

        Returns
        -------
        :obj:`~pandas.DataFrame`
            DataFrame of spectra from the MSP file, exactly as it is in the
            file (no sorting, filtering etc.). Metadata columns that are
            fully numeric are cast to numeric dtypes.
        """
        spectra = []
        spectrum = {}

        f = StringIO(self._file_content)
        for line in f:
            line = line.strip()
            if not line:
                continue  # Skip empty lines

            # Handle metadata ("Key: value" lines)
            if ":" in line:
                key, value = line.split(":", 1)
                key = key.strip().lower()
                value = value.strip()

                if key == "name":
                    # A "Name" field starts a new record; save the previous one
                    if spectrum:
                        spectra.append(spectrum)
                    spectrum = {"name": value, "peaks": []}
                else:
                    spectrum[key] = value

            # Handle peak data (assumed to start with a number)
            elif line[0].isdigit():
                peaks = line.split()
                m_z = float(peaks[0])
                intensity = float(peaks[1])
                spectrum["peaks"].append([m_z, intensity])

        # Save the last spectrum
        if spectrum:
            spectra.append(spectrum)

        df = pd.DataFrame(spectra)
        for column in df.columns:
            if column != "peaks":  # Skip 'peaks' column
                try:
                    df[column] = pd.to_numeric(df[column], errors="raise")
                except (ValueError, TypeError):
                    # Column is not numeric; keep it as parsed strings.
                    pass
        return df

    def _to_df(self, input_dataframe, normalize=True):
        """
        Return spectra as a DataFrame, optionally normalizing peak intensities.

        Parameters
        ----------
        input_dataframe : :obj:`~pandas.DataFrame`
            Input DataFrame containing MSP-formatted spectra.
        normalize : bool, optional
            Normalize each spectrum by its magnitude. Default is True.

        Returns
        -------
        :obj:`~pandas.DataFrame`
            DataFrame with the desired normalization.

        Raises
        ------
        KeyError
            If a spectrum record has no 'peaks' entry.
        """
        if not normalize:
            return input_dataframe

        # Convert to dictionary
        db_dict = input_dataframe.to_dict(orient="records")

        # Initialize empty library
        lib = []

        for spectrum in db_dict:
            # Check that spectrum["peaks"] exists
            if "peaks" not in spectrum.keys():
                raise KeyError(
                    "MSP not interpretted correctly, 'peaks' key not found in spectrum, check _dataframe attribute."
                )

            # Convert spectrum["peaks"] to numpy array
            if not isinstance(spectrum["peaks"], np.ndarray):
                spectrum["peaks"] = np.array(spectrum["peaks"])

            # Normalize peaks and refresh the peak count
            spectrum["peaks"] = self.normalize_peaks(spectrum["peaks"])
            spectrum["num peaks"] = len(spectrum["peaks"])

            # Add spectrum to library
            lib.append(spectrum)

        # Convert to DataFrame
        return pd.DataFrame(lib)

    def _to_flashentropy(self, input_dataframe, normalize=True, fe_kwargs={}):
        """
        Convert MSP-derived library to FlashEntropy library.

        Parameters
        ----------
        input_dataframe : :obj:`~pandas.DataFrame`
            Input DataFrame containing MSP-formatted spectra.
        normalize : bool
            Normalize each spectrum by its magnitude.
        fe_kwargs : dict, optional
            Keyword arguments for instantiation of FlashEntropy search and building index for FlashEntropy search;
            any keys not recognized will be ignored. By default, all parameters set to defaults.

        Returns
        -------
        :obj:`~ms_entropy.FlashEntropySearch`
            MS2 library as FlashEntropy search instance.

        Raises
        ------
        ValueError
            If "min_ms2_difference_in_da" or "max_ms2_tolerance_in_da" are present
            in `fe_kwargs` and they are not equal.
        KeyError
            If a spectrum lacks a precursor m/z field or a 'peaks' entry.
        """
        self._check_flash_entropy_kwargs(fe_kwargs)

        # Convert to dictionary
        db_dict = input_dataframe.to_dict(orient="records")

        # Initialize empty library
        fe_lib = []

        for source in db_dict:
            # Reorganize source dict, if necessary
            if "spectrum_data" in source.keys():
                spectrum = source["spectrum_data"]
            else:
                spectrum = source

            # Rename precursor_mz key for FlashEntropy
            if "precursor_mz" not in spectrum.keys():
                if "precursormz" in spectrum:
                    spectrum["precursor_mz"] = spectrum.pop("precursormz")
                elif "precursor_ion" in spectrum:
                    spectrum["precursor_mz"] = spectrum.pop("precursor_ion")
                else:
                    raise KeyError(
                        "MSP must have either 'precursormz' or 'precursor_ion' key to be converted to FlashEntropy format."
                    )

            # Check that spectrum["peaks"] exists
            if "peaks" not in spectrum.keys():
                raise KeyError(
                    "MSP not interpretted correctly, 'peaks' key not found in spectrum, check _dataframe attribute."
                )

            # Convert spectrum["peaks"] to numpy array
            if not isinstance(spectrum["peaks"], np.ndarray):
                spectrum["peaks"] = np.array(spectrum["peaks"])

            # Normalize peaks, if requested
            if normalize:
                spectrum["peaks"] = self.normalize_peaks(spectrum["peaks"])

            # Add spectrum to library
            fe_lib.append(spectrum)

        # Build FlashEntropy index
        return self._build_flash_entropy_index(fe_lib, fe_kwargs=fe_kwargs)

    def _check_msp_compatibility(self):
        """
        Check if the MSP file is compatible with the get_metabolomics_spectra_library
        method and provide feedback if it is not.

        Raises
        ------
        ValueError
            If polarity / inchikey / formula requirements are not met.
        """
        # Check polarity
        if (
            "polarity" not in self._data_frame.columns
            and "ionmode" not in self._data_frame.columns
        ):
            raise ValueError(
                "Neither 'polarity' nor 'ionmode' columns found in the input MSP metadata. Please check the file."
            )
        polarity_column = (
            "polarity" if "polarity" in self._data_frame.columns else "ionmode"
        )

        # Check if polarity_column contents is either "positive" or "negative"
        if not all(self._data_frame[polarity_column].isin(["positive", "negative"])):
            raise ValueError(
                f"Input field on MSP '{polarity_column}' must contain only 'positive' or 'negative' values."
            )

        # Check if the MSP file contains the required columns for metabolite metadata:
        # inchikey (non-null) and either formula or molecular_formula (non-null)
        if not all(self._data_frame["inchikey"].notnull()):
            raise ValueError(
                "Input field on MSP 'inchikey' must contain only non-null values."
            )
        if (
            "formula" not in self._data_frame.columns
            and "molecular_formula" not in self._data_frame.columns
        ):
            raise ValueError(
                "Input field on MSP must contain either 'formula' or 'molecular_formula' columns."
            )
        molecular_formula_column = (
            "formula" if "formula" in self._data_frame.columns else "molecular_formula"
        )
        if not all(self._data_frame[molecular_formula_column].notnull()):
            raise ValueError(
                f"Input field on MSP '{molecular_formula_column}' must contain only non-null values."
            )

    def get_metabolomics_spectra_library(
        self,
        polarity,
        metabolite_metadata_mapping={},
        format="fe",
        normalize=True,
        fe_kwargs={},
    ):
        """
        Prepare metabolomics spectra library and associated metabolite metadata.

        Note: this uses the inchikey as the index for the metabolite metadata
        dataframe and for connecting to the spectra, so it must be in the input.

        Parameters
        ----------
        polarity : str
            Ionization polarity, either "positive" or "negative".
        metabolite_metadata_mapping : dict, optional
            Mapping of MSP column names to MetaboliteMetadata attribute names.
            If empty, a default mapping is used.
        format : str, optional
            Output format alias, e.g. "fe", "df", "msp". Default is "fe".
        normalize : bool, optional
            Normalize each spectrum by its magnitude. Default is True.
        fe_kwargs : dict, optional
            Keyword arguments for FlashEntropy search. Default is {}.

        Returns
        -------
        tuple
            Library in requested format and a dict mapping inchikey to
            MetaboliteMetadata dataclass instances.

        Raises
        ------
        ValueError
            If `polarity` is invalid or required columns are missing.
        """
        # Check if the MSP file is compatible with this method
        self._check_msp_compatibility()

        # Validate the polarity parameter and locate the polarity column
        if polarity not in ["positive", "negative"]:
            raise ValueError("Polarity must be 'positive' or 'negative'")
        polarity_column = (
            "polarity" if "polarity" in self._data_frame.columns else "ionmode"
        )

        # Subset the parsed spectra by polarity
        db_df = self._data_frame[self._data_frame[polarity_column] == polarity].copy()

        # Rename columns to match the MetaboliteMetadata dataclass;
        # fall back to the default mapping when none is provided
        if not metabolite_metadata_mapping:
            metabolite_metadata_mapping = {
                "chebi_id": "chebi",
                "kegg_id": "kegg",
                "refmet_name": "common_name",
                "molecular_formula": "formula",
                "gnps_spectra_id": "id",
                "precursormz": "precursor_mz",
                "precursortype": "ion_type",
            }
        db_df.rename(columns=metabolite_metadata_mapping, inplace=True)
        # inchikey links each spectrum to its metabolite metadata record
        db_df["molecular_data_id"] = db_df["inchikey"]

        # Check the columns required for the FlashEntropy search
        required_columns = ["molecular_data_id", "precursor_mz", "ion_type", "id"]
        for col in required_columns:
            if col not in db_df.columns:
                raise ValueError(
                    f"Input field on MSP must contain '{col}' column for FlashEntropy search."
                )

        # Collect the metabolite metadata columns into their own dataframe.
        # MetaboliteMetadata's 'id' attribute maps to 'molecular_data_id' here.
        metabolite_metadata_keys = list(MetaboliteMetadata.__annotations__.keys())
        metabolite_metadata_keys = [
            "molecular_data_id" if x == "id" else x for x in metabolite_metadata_keys
        ]
        metabolite_metadata_df = db_df[
            db_df.columns[db_df.columns.isin(metabolite_metadata_keys)]
        ].copy()

        # Make unique and recast the id column for metabolite metadata
        metabolite_metadata_df.drop_duplicates(
            subset=["molecular_data_id"], inplace=True
        )
        metabolite_metadata_df["id"] = metabolite_metadata_df["molecular_data_id"]

        # Convert to a dict keyed by inchikey, one dataclass per metabolite
        metabolite_metadata_dict = {
            v["id"]: self._dict_to_dataclass(v, MetaboliteMetadata)
            for v in metabolite_metadata_df.to_dict(orient="records")
        }

        # Drop metadata columns (except the linking id) from the spectra table
        for key in metabolite_metadata_keys:
            if key != "molecular_data_id" and key in db_df.columns:
                db_df.drop(columns=key, inplace=True)

        # Format the spectral library
        format_func = self._get_format_func(format)
        lib = format_func(db_df, normalize=normalize, fe_kwargs=fe_kwargs)
        return (lib, metabolite_metadata_dict)
Interface to parse NIST MSP files.
995 def __init__(self, file_path): 996 """ 997 Initialize instance. 998 999 Parameters 1000 ---------- 1001 file_path : str 1002 Path to a local MSP file. 1003 1004 Attributes 1005 ---------- 1006 file_path : str 1007 Path to the MSP file. 1008 _file_content : str 1009 Content of the MSP file. 1010 _data_frame : :obj:`~pandas.DataFrame` 1011 DataFrame of spectra from the MSP file with unaltered content. 1012 """ 1013 super().__init__(key=None) 1014 1015 self.file_path = file_path 1016 if not os.path.exists(self.file_path): 1017 raise FileNotFoundError( 1018 f"File {self.file_path} does not exist. Please check the file path." 1019 ) 1020 with open(self.file_path, "r") as f: 1021 self._file_content = f.read() 1022 1023 self._data_frame = self._read_msp_file() 1024 self.__init_format_map__()
Initialize instance.
Parameters
- file_path (str): Path to a local MSP file.
Attributes
- file_path (str): Path to the MSP file.
- _file_content (str): Content of the MSP file.
- _data_frame (pandas.DataFrame): DataFrame of spectra from the MSP file with unaltered content.
1270 def get_metabolomics_spectra_library( 1271 self, 1272 polarity, 1273 metabolite_metadata_mapping={}, 1274 format="fe", 1275 normalize=True, 1276 fe_kwargs={}, 1277 ): 1278 """ 1279 Prepare metabolomics spectra library and associated metabolite metadata 1280 1281 Note: this uses the inchikey as the index for the metabolite metadata dataframe and for connecting to the spectra, so it must be in the input 1282 1283 """ 1284 # Check if the MSP file is compatible with the get_metabolomics_spectra_library method 1285 self._check_msp_compatibility() 1286 1287 # Check if the polarity parameter is valid and if a polarity column exists in the dataframe 1288 if polarity not in ["positive", "negative"]: 1289 raise ValueError("Polarity must be 'positive' or 'negative'") 1290 polarity_column = ( 1291 "polarity" if "polarity" in self._data_frame.columns else "ionmode" 1292 ) 1293 1294 # Get a subset of the initial dataframea by polarity 1295 db_df = self._data_frame[self._data_frame[polarity_column] == polarity].copy() 1296 1297 # Rename the columns of the db_df to match the MetaboliteMetadata dataclass using the metabolite_metadata_mapping 1298 # If the mapping is not provided, use the default mapping 1299 if not metabolite_metadata_mapping: 1300 metabolite_metadata_mapping = { 1301 "chebi_id": "chebi", 1302 "kegg_id": "kegg", 1303 "refmet_name": "common_name", 1304 "molecular_formula": "formula", 1305 "gnps_spectra_id":"id", 1306 "precursormz": "precursor_mz", 1307 "precursortype":"ion_type" 1308 } 1309 db_df.rename(columns=metabolite_metadata_mapping, inplace=True) 1310 db_df["molecular_data_id"] = db_df["inchikey"] 1311 1312 1313 1314 # Check if the resulting dataframe has the required columns for the flash entropy search 1315 required_columns = ["molecular_data_id", "precursor_mz", "ion_type", "id"] 1316 for col in required_columns: 1317 if col not in db_df.columns: 1318 raise ValueError( 1319 f"Input field on MSP must contain '{col}' column for FlashEntropy search." 
1320 ) 1321 1322 # Pull out the metabolite metadata from the dataframe and put it into a different dataframe 1323 # First get a list of the possible attributes of the MetaboliteMetadata dataclass 1324 metabolite_metadata_keys = list(MetaboliteMetadata.__annotations__.keys()) 1325 # Replace id with molecular_data_id in metabolite_metadata_keys 1326 metabolite_metadata_keys = [ 1327 "molecular_data_id" if x == "id" else x for x in metabolite_metadata_keys 1328 ] 1329 metabolite_metadata_df = db_df[ 1330 db_df.columns[db_df.columns.isin(metabolite_metadata_keys)] 1331 ].copy() 1332 1333 # Make unique and recast the id column for metabolite metadata 1334 metabolite_metadata_df.drop_duplicates(subset=["molecular_data_id"], inplace=True) 1335 metabolite_metadata_df["id"] = metabolite_metadata_df["molecular_data_id"] 1336 1337 # Convert to a dictionary using the inchikey as the key 1338 metabolite_metadata_dict = metabolite_metadata_df.to_dict( 1339 orient="records" 1340 ) 1341 metabolite_metadata_dict = { 1342 v["id"]: self._dict_to_dataclass(v, MetaboliteMetadata) 1343 for v in metabolite_metadata_dict 1344 } 1345 1346 # Remove the metabolite metadata columns from the original dataframe 1347 for key in metabolite_metadata_keys: 1348 if key != "molecular_data_id": 1349 if key in db_df.columns: 1350 db_df.drop(columns=key, inplace=True) 1351 1352 # Format the spectral library 1353 format_func = self._get_format_func(format) 1354 lib = format_func(db_df, normalize=normalize, fe_kwargs=fe_kwargs) 1355 return (lib, metabolite_metadata_dict)
Prepare metabolomics spectra library and associated metabolite metadata
Note: this uses the inchikey as the index for the metabolite metadata dataframe and for connecting it to the spectra, so the inchikey must be present in the input.