corems.molecular_id.search.database_interfaces
1import os 2import re 3import warnings 4from abc import ABC 5from io import StringIO 6from pathlib import Path 7import sqlite3 8 9import numpy as np 10import requests 11import pandas as pd 12from ms_entropy import FlashEntropySearch 13 14from corems.molecular_id.factory.EI_SQL import ( 15 EI_LowRes_SQLite, 16 Metadatar, 17 MetaboliteMetadata, 18) 19from corems.molecular_id.factory.lipid_molecular_metadata import LipidMetadata 20from corems.mass_spectra.calc.lc_calc import find_closest 21 22 23class SpectralDatabaseInterface(ABC): 24 """ 25 Base class that facilitates connection to spectral reference databases, 26 such as EMSL's Metabolomics Reference Database (MetabRef). 27 28 """ 29 30 def __init__(self, key=None): 31 """ 32 Initialize instance. 33 34 Parameters 35 ---------- 36 key : str 37 Token key. 38 39 """ 40 41 self.key = key 42 43 def set_token(self, path): 44 """ 45 Set environment variable for MetabRef database token. 46 47 Parameters 48 ---------- 49 path : str 50 Path to token. 51 52 """ 53 54 # Read token from file 55 with open(path, "r", encoding="utf-8") as f: 56 token = f.readline().strip() 57 58 # Set environment variable 59 os.environ[self.key] = token 60 61 def get_token(self): 62 """ 63 Get environment variable for database token. 64 65 Returns 66 ------- 67 str 68 Token string. 69 70 """ 71 72 # Check for token 73 if self.key not in os.environ: 74 raise ValueError("Must set {} environment variable.".format(self.key)) 75 76 # Get token from environment variables 77 return os.environ.get(self.key) 78 79 def get_header(self): 80 """ 81 Access stored database token and prepare as header. 82 83 Returns 84 ------- 85 str 86 Header string. 87 88 """ 89 90 # Get token 91 token = self.get_token() 92 93 # Pad header information 94 header = {"Authorization": f"Bearer {token}", "Content-Type": "text/plain"} 95 96 return header 97 98 def get_query(self, url, use_header=True): 99 """ 100 Request payload from URL according to `get` protocol. 
101 102 Parameters 103 ---------- 104 url : str 105 URL for request. 106 use_header: bool 107 Whether or not the query should include the header 108 109 Returns 110 ------- 111 dict 112 Response as JSON. 113 114 """ 115 116 # Query URL via `get` 117 if use_header: 118 response = requests.get(url, headers=self.get_header()) 119 else: 120 response = requests.get(url) 121 122 # Check response 123 response.raise_for_status() 124 125 # Return as JSON 126 return response.json() 127 128 def post_query(self, url, variable, values, tolerance): 129 """ 130 Request payload from URL according to `post` protocol. 131 132 Parameters 133 ---------- 134 url : str 135 URL for request. 136 variable : str 137 Variable to query. 138 values : str 139 Specific values of `variable` to query. 140 tolerance : str 141 Query tolerance relative to `values`. 142 143 Returns 144 ------- 145 dict 146 Response as JSON. 147 148 """ 149 150 # Coerce to string 151 if not isinstance(variable, str): 152 variable = str(variable).replace(" ", "") 153 154 if not isinstance(values, str): 155 values = str(values).replace(" ", "") 156 157 if not isinstance(tolerance, str): 158 tolerance = str(tolerance).replace(" ", "") 159 160 # Query URL via `post` 161 response = requests.post( 162 os.path.join(url, variable, tolerance), 163 data=values, 164 headers=self.get_header(), 165 ) 166 167 # Check response 168 response.raise_for_status() 169 170 # Return as JSON 171 return response.json() 172 173 def _check_flash_entropy_kwargs(self, fe_kwargs): 174 """ 175 Check FlashEntropy keyword arguments. 176 177 Parameters 178 ---------- 179 fe_kwargs : dict 180 Keyword arguments for FlashEntropy search. 181 182 183 Raises 184 ------ 185 ValueError 186 If "min_ms2_difference_in_da" or "max_ms2_tolerance_in_da" are present in `fe_kwargs` and they 187 are not equal. 

        """
        # If "min_ms2_difference_in_da" in fe_kwargs, check that "max_ms2_tolerance_in_da" is also present and that min_ms2_difference_in_da = 2xmax_ms2_tolerance_in_da
        if (
            "min_ms2_difference_in_da" in fe_kwargs
            or "max_ms2_tolerance_in_da" in fe_kwargs
        ):
            # Both keys must be supplied together
            if (
                "min_ms2_difference_in_da" not in fe_kwargs
                or "max_ms2_tolerance_in_da" not in fe_kwargs
            ):
                raise ValueError(
                    "Both 'min_ms2_difference_in_da' and 'max_ms2_tolerance_in_da' must be specified."
                )
            # The minimum MS2 difference must be exactly twice the tolerance
            if (
                fe_kwargs["min_ms2_difference_in_da"]
                != 2 * fe_kwargs["max_ms2_tolerance_in_da"]
            ):
                raise ValueError(
                    "The values of 'min_ms2_difference_in_da' must be exactly 2x 'max_ms2_tolerance_in_da'."
                )

    def _get_format_func(self, format):
        """
        Obtain format function by key.

        Returns
        -------
        func
            Formatting function.
        """

        # Case-insensitive lookup against the map built by __init_format_map__
        if format.lower() in self.format_map.keys():
            return self.format_map[format.lower()]

        raise ValueError(("{} not a supported format.").format(format))

    def _dict_to_dataclass(self, source_dict, data_class):
        """
        Convert dictionary to dataclass.

        Notes
        -----
        This function will pull the attributes a dataclass and its parent class
        and convert the dictionary to a dataclass instance with the appropriate
        attributes.

        Parameters
        ----------
        data_class : :obj:`~dataclasses.dataclass`
            Dataclass to convert to.
        source_dict : dict
            Dictionary object to convert to dataclass.

        Returns
        -------
        :obj:`~dataclasses.dataclass`
            Dataclass instance.

        """

        # Get list of expected attributes of data_class
        data_class_keys = list(data_class.__annotations__.keys())

        # Does the data_class inherit from another class, if so, get the attributes of the parent class as well
        # (__mro__ longer than [cls, object] means there is a parent dataclass)
        if len(data_class.__mro__) > 2:
            parent_class_keys = list(data_class.__bases__[0].__annotations__.keys())
            data_class_keys = list(set(data_class_keys + parent_class_keys))

        # Remove keys that are not in the data_class from the input dictionary
        input_dict = {k: v for k, v in source_dict.items() if k in data_class_keys}

        # Add keys that are in the data class but not in the input dictionary as None
        for key in data_class_keys:
            if key not in input_dict.keys():
                input_dict[key] = None
        return data_class(**input_dict)

    def _spectrum_to_array(self, spectrum, normalize=True):
        """
        Convert a parenthesis-delimited spectrum string to array.

        Parameters
        ----------
        spectrum : str
            Spectrum string, i.e. list of (m/z,abundance) pairs.
        normalize : bool
            Normalize the spectrum by its magnitude.

        Returns
        -------
        :obj:`~numpy.array`
            Array of shape (N, 2), with m/z in the first column and abundance in
            the second.
        """

        # Each "(mz,abundance)" pair becomes one (mz, abundance) row
        arr = np.array(
            re.findall(r"\(([^,]+),([^)]+)\)", spectrum), dtype=float
        ).reshape(-1, 2)

        if normalize:
            arr = self.normalize_peaks(arr)

        return arr

    @staticmethod
    def normalize_peaks(arr):
        """
        Normalize peaks in an array.

        Parameters
        ----------
        arr : :obj:`~numpy.array`
            Array of shape (N, 2), with m/z in the first column and abundance in
            the second.

        Returns
        -------
        :obj:`~numpy.array`
            Normalized array of shape (N, 2), with m/z in the first column and
            normalized abundance in the second.
309 """ 310 # Normalize the array 311 arr[:, -1] = arr[:, -1] / arr[:, -1].sum() 312 313 return arr 314 315 @staticmethod 316 def _build_flash_entropy_index(fe_lib, fe_kwargs={}, clean_spectra=True): 317 """ 318 Build FlashEntropy index. 319 320 Parameters 321 ---------- 322 fe_lib : list 323 List of spectra to build index from. Can be a list of dictionaries or 324 a FlashEntropy search instance. 325 fe_kwargs : dict, optional 326 Keyword arguments for FlashEntropy search. 327 clean_spectra : bool, optional 328 Clean spectra before building index. Default is True. 329 330 Returns 331 ------- 332 :obj:`~ms_entropy.FlashEntropySearch` 333 FlashEntropy search instance. 334 335 """ 336 # Initialize FlashEntropy 337 fe_init_kws = [ 338 "max_ms2_tolerance_in_da", 339 "mz_index_step", 340 "low_memory", 341 "path_data", 342 ] 343 fe_init_kws = {k: v for k, v in fe_kwargs.items() if k in fe_init_kws} 344 fes = FlashEntropySearch(**fe_init_kws) 345 346 # Build FlashEntropy index 347 fe_index_kws = [ 348 "max_indexed_mz", 349 "precursor_ions_removal_da", 350 "noise_threshold", 351 "min_ms2_difference_in_da", 352 "max_peak_num", 353 ] 354 fe_index_kws = {k: v for k, v in fe_kwargs.items() if k in fe_index_kws} 355 fes.build_index(fe_lib, **fe_index_kws, clean_spectra=clean_spectra) 356 357 return fes 358 359 360class MetabRefInterface(SpectralDatabaseInterface): 361 """ 362 DEPRECATED interface retained for backward compatibility only. 363 """ 364 365 def __init__(self): 366 """ 367 Initialize instance with deprecation warning. 368 369 """ 370 371 super().__init__(key=None) 372 373 if self.__class__ is MetabRefInterface: 374 warnings.warn( 375 "MetabRefInterface is deprecated. Instantiate a concrete interface " 376 "such as GCMSLibraryInterface or LCLipidLibraryInterface instead.", 377 DeprecationWarning, 378 stacklevel=2, 379 ) 380 381 382class GCMSLibraryInterface(SpectralDatabaseInterface): 383 """ 384 Interface to bundled GCMS spectral libraries in MSP format. 
385 386 Loads GCMS compound library and FAMES calibration library from local MSP files. 387 Default files are bundled with CoreMS, but can be overridden via environment variables. 388 """ 389 390 def __init__(self): 391 """ 392 Initialize instance. 393 """ 394 super().__init__(key=None) 395 396 # Local data file paths 397 from pathlib import Path 398 399 # Default to bundled data files 400 data_dir = Path(__file__).parent.parent / "data" 401 self.gcms_library_file = os.getenv( 402 "GCMS_LIBRARY_PATH", 403 str(data_dir / "PNNLMetV20191015.msp") 404 ) 405 self.fames_library_file = os.getenv( 406 "FAMES_LIBRARY_PATH", 407 str(data_dir / "FAMES_REF.msp") 408 ) 409 410 self.__init_format_map__() 411 412 def __init_format_map__(self): 413 """ 414 Initialize database format mapper, enabling multiple format requests. 415 416 """ 417 418 # Define format workflows 419 self.format_map = { 420 "json": lambda x, normalize, fe_kwargs: x, 421 "dict": lambda x, 422 normalize, 423 fe_kwargs: self._to_LowResolutionEICompound_dict(x, normalize), 424 "sql": lambda x, 425 normalize, 426 fe_kwargs: self._LowResolutionEICompound_dict_to_sqlite( 427 self._to_LowResolutionEICompound_dict(x, normalize) 428 ), 429 } 430 431 # Add aliases 432 self.format_map["metabref"] = self.format_map["json"] 433 self.format_map["datadict"] = self.format_map["dict"] 434 self.format_map["data-dict"] = self.format_map["dict"] 435 self.format_map["lowreseicompound"] = self.format_map["dict"] 436 self.format_map["lowres"] = self.format_map["dict"] 437 self.format_map["lowresgc"] = self.format_map["dict"] 438 self.format_map["sqlite"] = self.format_map["sql"] 439 440 def available_formats(self): 441 """ 442 View list of available formats. 443 444 Returns 445 ------- 446 list 447 Format map keys. 448 """ 449 450 return list(self.format_map.keys()) 451 452 def get_library(self, format="json", normalize=False): 453 """ 454 Load GC/MS library from local MSP file. 
455 456 Parameters 457 ---------- 458 format : str 459 Format of requested library, i.e. "json", "sql", "dict". 460 See `available_formats` method for aliases. 461 normalize : bool 462 Normalize the spectrum by its magnitude. 463 464 Returns 465 ------- 466 Library in requested format. 467 468 """ 469 # Load from local MSP file 470 library_data = self._load_msp_file(self.gcms_library_file, normalize) 471 472 # Init format function 473 format_func = self._get_format_func(format) 474 475 # Apply format conversion 476 return format_func(library_data, normalize, {}) 477 478 def get_fames(self, format="json", normalize=False): 479 """ 480 Load GC/MS FAMEs library from local MSP file. 481 482 Parameters 483 ---------- 484 format : str 485 Format of requested library, i.e. "json", "sql", "dict". 486 See `available_formats` method for aliases. 487 normalize : bool 488 Normalize the spectrum by its magnitude. 489 490 Returns 491 ------- 492 Library in requested format. 493 494 """ 495 # Load from local MSP file 496 library_data = self._load_msp_file(self.fames_library_file, normalize) 497 498 # Init format function 499 format_func = self._get_format_func(format) 500 501 # Apply format conversion 502 return format_func(library_data, normalize, {}) 503 504 def _load_msp_file(self, file_path, normalize=False): 505 """ 506 Load and parse MSP file into format compatible with existing pipeline. 507 508 Parameters 509 ---------- 510 file_path : str 511 Path to MSP file 512 normalize : bool 513 Normalize spectra 514 515 Returns 516 ------- 517 list of dict 518 Library data in format compatible with _to_LowResolutionEICompound_dict 519 """ 520 from pathlib import Path 521 522 file_path = Path(file_path) 523 if not file_path.exists(): 524 raise FileNotFoundError( 525 f"Library file not found: {file_path}. " 526 f"Set GCMS_LIBRARY_PATH or FAMES_LIBRARY_PATH environment variable to specify location." 
527 ) 528 529 # Parse MSP file 530 spectra = [] 531 spectrum = {} 532 peaks = [] 533 534 with open(file_path, 'r') as f: 535 for line in f: 536 line = line.strip() 537 538 # Empty line marks end of spectrum 539 if not line: 540 if spectrum and peaks: 541 # Convert peaks to the format expected by downstream code 542 # Format: "(mz,abundance)(mz,abundance)..." 543 peak_str = "".join([f"({int(mz)},{int(abun)})" for mz, abun in peaks]) 544 spectrum['mz'] = peak_str 545 spectra.append(spectrum) 546 spectrum = {} 547 peaks = [] 548 continue 549 550 # Check if line contains peak data (starts with digit) 551 if line and line[0].isdigit(): 552 parts = line.split() 553 if len(parts) >= 2: 554 peaks.append((float(parts[0]), float(parts[1]))) 555 continue 556 557 # Handle metadata fields 558 if ":" in line: 559 key, value = line.split(":", 1) 560 key = key.strip().lower() 561 value = value.strip() 562 563 # Map MSP fields to expected format 564 field_mapping = { 565 "name": "molecule_name", 566 "formula": "formula", 567 "cas": "casno", 568 "retentiontime": "retention_time", 569 "ri": "ri", 570 "comment": "comments", 571 "num peaks": "peak_count", 572 "derivative": "derivative" 573 } 574 575 # Metadata fields that go into the metadata dict 576 metadata_fields = { 577 "inchikey": "inchikey", 578 "inchi": "inchi", 579 "smiles": "smiles", 580 "pubchem": "pubchem", 581 "chebi": "chebi", 582 "kegg": "kegg", 583 "refmet": "refmet", 584 "iupac_name": "iupac_name" 585 } 586 587 if key in field_mapping: 588 mapped_key = field_mapping[key] 589 # Convert numeric fields 590 if key in ["retentiontime", "ri"]: 591 try: 592 value = float(value) 593 except: 594 pass 595 elif key == "num peaks": 596 try: 597 value = int(value) 598 except: 599 pass 600 spectrum[mapped_key] = value 601 elif key in metadata_fields: 602 # Store in nested metadata dict 603 if "metadata" not in spectrum: 604 spectrum["metadata"] = {} 605 spectrum["metadata"][metadata_fields[key]] = value 606 else: 607 # Keep unmapped 
fields 608 spectrum[key] = value 609 610 # Add last spectrum if file doesn't end with blank line 611 if spectrum and peaks: 612 peak_str = "".join([f"({int(mz)},{int(abun)})" for mz, abun in peaks]) 613 spectrum['mz'] = peak_str 614 spectra.append(spectrum) 615 616 return spectra 617 618 def _to_LowResolutionEICompound_dict(self, metabref_lib, normalize=False): 619 """ 620 Convert MetabRef-formatted library to CoreMS LowResolutionEICompound-formatted 621 dictionary for local ingestion. 622 623 Parameters 624 ---------- 625 metabref_lib : dict 626 MetabRef GC-MS library in JSON format. 627 normalize : bool 628 Normalize each spectrum by its magnitude. 629 630 Returns 631 ------- 632 list of dict 633 List of each spectrum contained in dictionary. 634 635 """ 636 637 # All below key:value lookups are based on CoreMS class definitions 638 # NOT MetabRef content. For example, MetabRef has keys for PubChem, 639 # USI, etc. that are not considered below. 640 641 # Dictionary to map metabref keys to corems keys 642 metadatar_cols = { 643 "casno": "cas", 644 "inchikey": "inchikey", 645 "inchi": "inchi", 646 "chebi": "chebi", 647 "smiles": "smiles", 648 "kegg": "kegg", 649 "iupac_name": "iupac_name", 650 "traditional_name": "traditional_name", # Not present in metabref 651 "common_name": "common_name", # Not present in metabref 652 } 653 654 # Dictionary to map metabref keys to corems keys 655 lowres_ei_compound_cols = { 656 "id": "metabref_id", 657 "molecule_name": "name", # Is this correct? 
658 "classify": "classify", # Not present in metabref 659 "formula": "formula", 660 "ri": "ri", 661 "rt": "retention_time", 662 "source": "source", # Not present in metabref 663 "casno": "casno", 664 "comments": "comment", 665 "source_temp_c": "source_temp_c", # Not present in metabref 666 "ev": "ev", # Not present in metabref 667 "peak_count": "peaks_count", 668 "mz": "mz", 669 "abundance": "abundance", 670 } 671 672 # Local result container 673 corems_lib = [] 674 675 # Enumerate spectra 676 for i, source_ in enumerate(metabref_lib): 677 # Copy source to prevent modification 678 source = source_.copy() 679 680 # Parse target data 681 target = { 682 lowres_ei_compound_cols[k]: v 683 for k, v in source.items() 684 if k in lowres_ei_compound_cols 685 } 686 687 # Explicitly add this to connect with LowResCompoundRef later 688 if "retention_time" in source: 689 target["rt"] = source["retention_time"] 690 elif "rt" in source: 691 target["rt"] = source["rt"] 692 693 # Parse (mz, abundance) 694 arr = self._spectrum_to_array(target["mz"], normalize=normalize) 695 target["mz"] = arr[:, 0] 696 target["abundance"] = arr[:, 1] 697 698 # Parse meta data 699 target["metadata"] = { 700 metadatar_cols[k]: v for k, v in source.items() if k in metadatar_cols 701 } 702 703 # Add anything else 704 for k in source: 705 if k not in lowres_ei_compound_cols: 706 target[k] = source[k] 707 708 # Add to CoreMS list 709 corems_lib.append(target) 710 711 return corems_lib 712 713 def _LowResolutionEICompound_dict_to_sqlite( 714 self, lowres_ei_compound_dict, url="sqlite://" 715 ): 716 """ 717 Convert CoreMS LowResolutionEICompound-formatted dictionary to SQLite 718 database for local ingestion. 719 720 Parameters 721 ---------- 722 lowres_ei_compound_dict : dict 723 CoreMS GC-MS library formatted for LowResolutionEICompound. 724 url : str 725 URL to SQLite prefix. 726 727 Returns 728 ------- 729 sqlite database 730 Spectra contained in SQLite database. 
731 732 """ 733 734 # Dictionary to map corems keys to all-caps keys 735 capped_cols = { 736 "name": "NAME", 737 "formula": "FORM", 738 "ri": "RI", 739 "retention_time": "RT", 740 "source": "SOURCE", 741 "casno": "CASNO", 742 "comment": "COMMENT", 743 "peaks_count": "NUM PEAKS", 744 } 745 746 # Initialize SQLite object 747 sqlite_obj = EI_LowRes_SQLite(url=url) 748 749 # Iterate spectra 750 for _data_dict in lowres_ei_compound_dict: 751 # Copy source to prevent modification 752 data_dict = _data_dict.copy() 753 754 # Add missing capped values 755 for k, v in capped_cols.items(): 756 # Key exists 757 if k in data_dict: 758 # # This will replace the key 759 # data_dict[v] = data_dict.pop(k) 760 761 # This will keep both keys 762 data_dict[v] = data_dict[k] 763 764 # Parse number of peaks 765 if not data_dict.get("NUM PEAKS"): 766 data_dict["NUM PEAKS"] = len(data_dict.get("mz")) 767 768 # Parse CAS number 769 if not data_dict.get("CASNO"): 770 data_dict["CASNO"] = data_dict.get("CAS") 771 772 if not data_dict["CASNO"]: 773 data_dict["CASNO"] = 0 774 775 # Build linked metadata table 776 if "metadata" in data_dict: 777 metadata = data_dict.pop("metadata") 778 # Only create metadata entry if we have required fields and valid data 779 # Filter to only include fields that Metadatar model supports 780 supported_metadata_fields = [ 781 'cas', 'inchikey', 'inchi', 'chebi', 'smiles', 782 'kegg', 'iupac_name', 'traditional_name', 'common_name' 783 ] 784 filtered_metadata = { 785 k: v for k, v in metadata.items() 786 if k in supported_metadata_fields and v 787 } 788 # Inchikey is required by the database model 789 if filtered_metadata and filtered_metadata.get("inchikey"): 790 data_dict["metadatar"] = Metadatar(**filtered_metadata) 791 792 # Attempt addition to sqlite 793 try: 794 sqlite_obj.add_compound(data_dict) 795 except: 796 print(data_dict["NAME"]) 797 798 return sqlite_obj 799 800 801class MetabRefGCInterface(GCMSLibraryInterface): 802 """ 803 DEPRECATED: Use 

    This interface is maintained for backward compatibility only.
    MetabRef API has been discontinued as of 2026.
    """

    def __init__(self):
        """
        Initialize instance with deprecation warning.
        """
        warnings.warn(
            "MetabRefGCInterface is deprecated. Use GCMSLibraryInterface instead. "
            "MetabRef API has been discontinued; all data now loads from bundled local MSP files.",
            DeprecationWarning,
            stacklevel=2
        )
        super().__init__()


class LCLipidLibraryInterface(SpectralDatabaseInterface):
    """
    Interface to a local sqlite lipid library for LC-MS spectral searches.
    """

    # Public download location for the reference sqlite library, surfaced in
    # the FileNotFoundError raised by _resolve_db_location
    DEFAULT_DOWNLOAD_URL = (
        "https://nmdcdemo.emsl.pnnl.gov/minio/lipidomics/parameter_files/"
        "202412_lipid_ref.sqlite"
    )

    def __init__(self, db_location=None):
        """
        Initialize instance.

        Parameters
        ----------
        db_location : str | Path, optional
            Local path to the sqlite lipid library. If omitted, the
            COREMS_LIPIDOMICS_SQLITE_PATH environment variable is used.
        """

        super().__init__(key=None)
        self.db_location = db_location
        self.__init_format_map__()

    def _to_flashentropy(self, spectral_library, normalize=True, fe_kwargs={}):
        """
        Convert a spectral library to FlashEntropy format.

        Parameters
        ----------
        spectral_library : dict
            MS2 library in JSON format or FlashEntropy search instance
            (for reformatting at different MS2 separation).
        normalize : bool
            Normalize each spectrum by its magnitude.
        fe_kwargs : dict, optional
            Keyword arguments for instantiation of FlashEntropy search and building index for FlashEntropy search;
            any keys not recognized will be ignored. By default, all parameters set to defaults.

        Returns
        -------
        :obj:`~ms_entropy.FlashEntropySearch`
            MS2 library as FlashEntropy search instance.

        Raises
        ------
        ValueError
            If "min_ms2_difference_in_da" or "max_ms2_tolerance_in_da" are present in `fe_kwargs` and they are not equal.

        """
        self._check_flash_entropy_kwargs(fe_kwargs)

        # Initialize empty library
        fe_lib = []

        # Enumerate spectra
        for i, source in enumerate(spectral_library):
            # Spectrum payload may be nested under "spectrum_data"
            if "spectrum_data" in source.keys():
                spectrum = source["spectrum_data"]
            else:
                spectrum = source

            # FlashEntropy expects a "precursor_mz" key.
            # NOTE(review): raises KeyError if neither "precursor_mz" nor
            # "precursor_ion" is present — presumably guaranteed upstream; verify.
            if "precursor_mz" not in spectrum.keys():
                spectrum["precursor_mz"] = spectrum.pop("precursor_ion")

            # Parse "(mz,abundance)..." string into an (N, 2) peaks array
            spectrum["peaks"] = self._spectrum_to_array(
                spectrum["mz"], normalize=normalize
            )
            fe_lib.append(spectrum)

        # Build the searchable FlashEntropy index
        fe_search = self._build_flash_entropy_index(fe_lib, fe_kwargs=fe_kwargs)

        return fe_search

    def __init_format_map__(self):
        """
        Initialize database format mapper, enabling multiple format requests.
        """

        # Define format workflows
        self.format_map = {
            "json": lambda x, normalize, fe_kwargs: x,
            "flashentropy": lambda x, normalize, fe_kwargs: self._to_flashentropy(
                x, normalize, fe_kwargs
            ),
            "dataframe": lambda x, normalize, fe_kwargs: pd.DataFrame(x),
        }

        # Add aliases
        self.format_map["fe"] = self.format_map["flashentropy"]
        self.format_map["flash-entropy"] = self.format_map["flashentropy"]
        self.format_map["df"] = self.format_map["dataframe"]

    def available_formats(self):
        """
        View list of available formats.

        Returns
        -------
        list
            Format map keys.
        """

        return list(self.format_map.keys())

    def _resolve_db_location(self):
        """
        Resolve and validate sqlite database location.

        Returns
        -------
        Path
            Existing sqlite database file path.
        """

        # Explicit constructor argument wins over the environment variable
        db_location = self.db_location or os.getenv("COREMS_LIPIDOMICS_SQLITE_PATH")
        if not db_location:
            raise ValueError(
                "A local lipid sqlite library path is required. "
                "Set COREMS_LIPIDOMICS_SQLITE_PATH or pass db_location."
941 ) 942 943 db_path = Path(db_location).expanduser() 944 if not db_path.exists(): 945 raise FileNotFoundError( 946 f"Lipid sqlite library not found at {db_path}. " 947 f"Download it from {self.DEFAULT_DOWNLOAD_URL} " 948 "and set COREMS_LIPIDOMICS_SQLITE_PATH." 949 ) 950 951 return db_path 952 953 def _get_candidate_spectra(self, connection, mz_list, polarity, mz_tol_ppm): 954 """ 955 Fetch candidate spectra rows by precursor m/z and polarity. 956 957 Returns 958 ------- 959 pandas.DataFrame 960 Filtered rows from lipidMassSpectrumObject. 961 """ 962 963 mz_observed = np.sort(np.asarray(mz_list, dtype=float)) 964 if mz_observed.size == 0: 965 return pd.DataFrame() 966 967 mz_all = pd.read_sql_query( 968 "SELECT id, polarity, precursor_mz FROM lipidMassSpectrumObject", connection 969 ) 970 mz_all = mz_all[mz_all["polarity"] == polarity].copy() 971 if mz_all.empty: 972 return pd.DataFrame() 973 974 mz_all = mz_all.sort_values(by="precursor_mz").reset_index(drop=True) 975 976 if mz_observed.size == 1: 977 mz_all["closest_mz_obs"] = mz_observed[0] 978 else: 979 mz_all["closest_mz_obs"] = mz_observed[ 980 find_closest(mz_observed, mz_all.precursor_mz.values) 981 ] 982 983 mz_all["ppm_error"] = ( 984 (mz_all["precursor_mz"] - mz_all["closest_mz_obs"]) 985 / mz_all["precursor_mz"] 986 * 1e6 987 ) 988 989 mz_all = mz_all[np.abs(mz_all["ppm_error"]) <= mz_tol_ppm] 990 if mz_all.empty: 991 return pd.DataFrame() 992 993 mz_ids = tuple(mz_all["id"].tolist()) 994 return pd.read_sql_query( 995 f"SELECT * FROM lipidMassSpectrumObject WHERE id IN {mz_ids}", 996 connection, 997 ) 998 999 def get_lipid_library( 1000 self, 1001 mz_list, 1002 polarity, 1003 mz_tol_ppm, 1004 mz_tol_da_api=None, 1005 format="json", 1006 normalize=True, 1007 fe_kwargs={}, 1008 api_delay=5, 1009 api_attempts=10, 1010 ): 1011 """ 1012 Retrieve lipid spectra and metadata from a local sqlite library. 1013 1014 Parameters 1015 ---------- 1016 mz_list : list 1017 List of precursor m/z values. 
1018 polarity : str 1019 Ionization polarity, either "positive" or "negative". 1020 mz_tol_ppm : float 1021 Tolerance in ppm for precursor matching. 1022 mz_tol_da_api : float, optional 1023 Unused, kept for backward compatibility. 1024 format : str, optional 1025 Format of requested library, e.g. "json" or "flashentropy". 1026 normalize : bool, optional 1027 Normalize spectrum intensities. 1028 fe_kwargs : dict, optional 1029 Keyword arguments for FlashEntropy search. 1030 api_delay : int, optional 1031 Unused, kept for backward compatibility. 1032 api_attempts : int, optional 1033 Unused, kept for backward compatibility. 1034 1035 Returns 1036 ------- 1037 tuple 1038 Library in requested format and lipid metadata dictionary. 1039 """ 1040 1041 if not isinstance(mz_list, (list, np.ndarray)): 1042 raise ValueError("mz_list must be a list or numpy array") 1043 if not all(isinstance(mz, (float, int)) for mz in mz_list): 1044 raise ValueError("All elements in mz_list must be float or int") 1045 if polarity not in {"positive", "negative"}: 1046 raise ValueError("polarity must be either 'positive' or 'negative'") 1047 if not isinstance(mz_tol_ppm, (float, int)): 1048 raise ValueError("mz_tol_ppm must be a float or int") 1049 1050 db_path = self._resolve_db_location() 1051 connection = sqlite3.connect(str(db_path)) 1052 try: 1053 # Step 1: Get candidate spectra records based on m/z and polarity 1054 spectra_df = self._get_candidate_spectra( 1055 connection=connection, 1056 mz_list=mz_list, 1057 polarity=polarity, 1058 mz_tol_ppm=float(mz_tol_ppm), 1059 ) 1060 1061 if spectra_df.empty: 1062 format_func = self._get_format_func(format) 1063 return format_func([], normalize=normalize, fe_kwargs=fe_kwargs), {} 1064 1065 # Step 2: Get corresponding lipid metadata for candidate spectra from lipidTree view 1066 mol_ids = tuple(spectra_df["molecular_data_id"].tolist()) 1067 mol_df = pd.read_sql_query( 1068 f"SELECT * FROM lipidTree WHERE id IN {mol_ids}", 1069 connection, 1070 ) 
        finally:
            # Always release the sqlite connection, even if a query fails
            connection.close()

        # Key lipid metadata dataclasses by integer molecular id
        mol_df["id_index"] = mol_df["id"]
        mol_df = mol_df.set_index("id_index")
        mol_records = mol_df.to_dict(orient="index")
        lipid_metadata = {
            int(k): self._dict_to_dataclass(v, LipidMetadata)
            for k, v in mol_records.items()
        }

        # Convert the candidate spectra to the requested output format
        spectra_records = spectra_df.to_dict(orient="records")
        format_func = self._get_format_func(format)
        library = format_func(spectra_records, normalize=normalize, fe_kwargs=fe_kwargs)
        return library, lipid_metadata


class MSPInterface(SpectralDatabaseInterface):
    """
    Interface to parse NIST MSP files
    """

    def __init__(self, file_path):
        """
        Initialize instance.

        Parameters
        ----------
        file_path : str
            Path to a local MSP file.

        Attributes
        ----------
        file_path : str
            Path to the MSP file.
        _file_content : str
            Content of the MSP file.
        _data_frame : :obj:`~pandas.DataFrame`
            DataFrame of spectra from the MSP file with unaltered content.
        """
        super().__init__(key=None)

        self.file_path = file_path
        if not os.path.exists(self.file_path):
            raise FileNotFoundError(
                f"File {self.file_path} does not exist. Please check the file path."
            )
        # Whole file is held in memory and re-parsed from this string
        with open(self.file_path, "r") as f:
            self._file_content = f.read()

        self._data_frame = self._read_msp_file()
        self.__init_format_map__()

    def __init_format_map__(self):
        """
        Initialize database format mapper, enabling multiple format requests.

        """

        # x is a pandas dataframe similar to self._data_frame format
        # Define format workflows
        self.format_map = {
            "msp": lambda x, normalize, fe_kwargs: self._to_msp(x, normalize),
            "flashentropy": lambda x, normalize, fe_kwargs: self._to_flashentropy(
                x, normalize, fe_kwargs
            ),
            "df": lambda x, normalize, fe_kwargs: self._to_df(x, normalize),
        }

        # Add aliases
        self.format_map["fe"] = self.format_map["flashentropy"]
        self.format_map["flash-entropy"] = self.format_map["flashentropy"]
        self.format_map["dataframe"] = self.format_map["df"]
        self.format_map["data-frame"] = self.format_map["df"]

    def _read_msp_file(self):
        """
        Reads the MSP files into the pandas dataframe, and sort/remove zero intensity ions in MS/MS spectra.

        Returns
        -------
        :obj:`~pandas.DataFrame`
            DataFrame of spectra from the MSP file, exacly as it is in the file (no sorting, filtering etc)
        """
        # If input_dataframe is provided, return it it
        spectra = []
        spectrum = {}

        f = StringIO(self._file_content)
        for line in f:
            line = line.strip()
            if not line:
                continue  # Skip empty lines

            # Handle metadata
            if ":" in line:
                key, value = line.split(":", 1)
                key = key.strip().lower()
                value = value.strip()

                if key == "name":
                    # Save current spectrum and start a new one
                    if spectrum:
                        spectra.append(spectrum)
                    spectrum = {"name": value, "peaks": []}
                else:
                    spectrum[key] = value

            # Handle peak data (assumed to start with a number)
            elif line[0].isdigit():
                peaks = line.split()
                m_z = float(peaks[0])
                intensity = float(peaks[1])
                # NOTE(review): a peak line appearing before the first "Name:"
                # entry would raise KeyError("peaks") here — assumes a
                # well-formed MSP file; confirm against expected inputs.
                spectrum["peaks"].append(([m_z, intensity]))
        # Save the last spectrum
        if spectrum:
            spectra.append(spectrum)

        df = pd.DataFrame(spectra)
        for column in df.columns:
            if column != "peaks":  # Skip 'peaks' column
column 1192 try: 1193 df[column] = pd.to_numeric(df[column], errors="raise") 1194 except: 1195 pass 1196 return df 1197 1198 def _to_df(self, input_dataframe, normalize=True): 1199 """ 1200 Convert MSP-derived library to FlashEntropy library. 1201 1202 Parameters 1203 ---------- 1204 input_dataframe : :obj:`~pandas.DataFrame` 1205 Input DataFrame containing MSP-formatted spectra. 1206 normalize : bool, optional 1207 Normalize each spectrum by its magnitude. 1208 Default is True. 1209 1210 Returns 1211 ------- 1212 :obj:`~pandas.DataFrame` 1213 DataFrame of with desired normalization 1214 """ 1215 if not normalize: 1216 return input_dataframe 1217 else: 1218 # Convert to dictionary 1219 db_dict = input_dataframe.to_dict(orient="records") 1220 1221 # Initialize empty library 1222 lib = [] 1223 1224 # Enumerate spectra 1225 for i, source in enumerate(db_dict): 1226 spectrum = source 1227 # Check that spectrum["peaks"] exists 1228 if "peaks" not in spectrum.keys(): 1229 raise KeyError( 1230 "MSP not interpretted correctly, 'peaks' key not found in spectrum, check _dataframe attribute." 1231 ) 1232 1233 # Convert spectrum["peaks"] to numpy array 1234 if not isinstance(spectrum["peaks"], np.ndarray): 1235 spectrum["peaks"] = np.array(spectrum["peaks"]) 1236 1237 # Normalize peaks, if requested 1238 if normalize: 1239 spectrum["peaks"] = self.normalize_peaks(spectrum["peaks"]) 1240 spectrum["num peaks"] = len(spectrum["peaks"]) 1241 1242 # Add spectrum to library 1243 lib.append(spectrum) 1244 1245 # Convert to DataFrame 1246 df = pd.DataFrame(lib) 1247 return df 1248 1249 def _to_flashentropy(self, input_dataframe, normalize=True, fe_kwargs={}): 1250 """ 1251 Convert MSP-derived library to FlashEntropy library. 1252 1253 Parameters 1254 ---------- 1255 input_dataframe : :obj:`~pandas.DataFrame` 1256 Input DataFrame containing MSP-formatted spectra. 1257 normalize : bool 1258 Normalize each spectrum by its magnitude. 
1259 fe_kwargs : dict, optional 1260 Keyword arguments for instantiation of FlashEntropy search and building index for FlashEntropy search; 1261 any keys not recognized will be ignored. By default, all parameters set to defaults. 1262 1263 Returns 1264 ------- 1265 :obj:`~ms_entropy.FlashEntropySearch` 1266 MS2 library as FlashEntropy search instance. 1267 1268 Raises 1269 ------ 1270 ValueError 1271 If "min_ms2_difference_in_da" or "max_ms2_tolerance_in_da" are present in `fe_kwargs` and they 1272 """ 1273 self._check_flash_entropy_kwargs(fe_kwargs) 1274 1275 db_df = input_dataframe 1276 1277 # Convert to dictionary 1278 db_dict = db_df.to_dict(orient="records") 1279 1280 # Initialize empty library 1281 fe_lib = [] 1282 1283 # Enumerate spectra 1284 for i, source in enumerate(db_dict): 1285 # Reorganize source dict, if necessary 1286 if "spectrum_data" in source.keys(): 1287 spectrum = source["spectrum_data"] 1288 else: 1289 spectrum = source 1290 1291 # Rename precursor_mz key for FlashEntropy 1292 if "precursor_mz" not in spectrum.keys(): 1293 if "precursormz" in spectrum: 1294 spectrum["precursor_mz"] = spectrum.pop("precursormz") 1295 elif "precursor_ion" in spectrum: 1296 spectrum["precursor_mz"] = spectrum.pop("precursor_ion") 1297 else: 1298 raise KeyError( 1299 "MSP must have either 'precursormz' or 'precursor_ion' key to be converted to FlashEntropy format." 1300 ) 1301 1302 # Check that spectrum["peaks"] exists 1303 if "peaks" not in spectrum.keys(): 1304 raise KeyError( 1305 "MSP not interpretted correctly, 'peaks' key not found in spectrum, check _dataframe attribute." 
1306 ) 1307 1308 # Convert spectrum["peaks"] to numpy array 1309 if not isinstance(spectrum["peaks"], np.ndarray): 1310 spectrum["peaks"] = np.array(spectrum["peaks"]) 1311 1312 # Normalize peaks, if requested 1313 if normalize: 1314 spectrum["peaks"] = self.normalize_peaks(spectrum["peaks"]) 1315 1316 # Add spectrum to library 1317 fe_lib.append(spectrum) 1318 1319 # Build FlashEntropy index 1320 fe_search = self._build_flash_entropy_index(fe_lib, fe_kwargs=fe_kwargs) 1321 1322 return fe_search 1323 1324 def _check_msp_compatibility(self): 1325 """ 1326 Check if the MSP file is compatible with the get_metabolomics_spectra_library method and provide feedback if it is not. 1327 """ 1328 # Check polarity 1329 if ( 1330 "polarity" not in self._data_frame.columns 1331 and "ionmode" not in self._data_frame.columns 1332 ): 1333 raise ValueError( 1334 "Neither 'polarity' nor 'ionmode' columns found in the input MSP metadata. Please check the file." 1335 ) 1336 polarity_column = ( 1337 "polarity" if "polarity" in self._data_frame.columns else "ionmode" 1338 ) 1339 1340 # Check if polarity_column contents is either "positive" or "negative" 1341 if not all(self._data_frame[polarity_column].isin(["positive", "negative"])): 1342 raise ValueError( 1343 f"Input field on MSP '{polarity_column}' must contain only 'positive' or 'negative' values." 1344 ) 1345 1346 # Check if the MSP file contains the required columns for metabolite metadata 1347 # inchikey, by name, not null 1348 # either formula or molecular_formula, not null 1349 if not all(self._data_frame["inchikey"].notnull()): 1350 raise ValueError( 1351 "Input field on MSP 'inchikey' must contain only non-null values." 1352 ) 1353 if ( 1354 "formula" not in self._data_frame.columns 1355 and "molecular_formula" not in self._data_frame.columns 1356 ): 1357 raise ValueError( 1358 "Input field on MSP must contain either 'formula' or 'molecular_formula' columns." 
1359 ) 1360 molecular_formula_column = ( 1361 "formula" if "formula" in self._data_frame.columns else "molecular_formula" 1362 ) 1363 if not all(self._data_frame[molecular_formula_column].notnull()): 1364 raise ValueError( 1365 f"Input field on MSP '{molecular_formula_column}' must contain only non-null values." 1366 ) 1367 1368 def get_metabolomics_spectra_library( 1369 self, 1370 polarity, 1371 metabolite_metadata_mapping={}, 1372 format="fe", 1373 normalize=True, 1374 fe_kwargs={}, 1375 ): 1376 """ 1377 Prepare metabolomics spectra library and associated metabolite metadata 1378 1379 Note: this uses the inchikey as the index for the metabolite metadata dataframe and for connecting to the spectra, so it must be in the input 1380 1381 """ 1382 # Check if the MSP file is compatible with the get_metabolomics_spectra_library method 1383 self._check_msp_compatibility() 1384 1385 # Check if the polarity parameter is valid and if a polarity column exists in the dataframe 1386 if polarity not in ["positive", "negative"]: 1387 raise ValueError("Polarity must be 'positive' or 'negative'") 1388 polarity_column = ( 1389 "polarity" if "polarity" in self._data_frame.columns else "ionmode" 1390 ) 1391 1392 # Get a subset of the initial dataframea by polarity 1393 db_df = self._data_frame[self._data_frame[polarity_column] == polarity].copy() 1394 1395 # Rename the columns of the db_df to match the MetaboliteMetadata dataclass using the metabolite_metadata_mapping 1396 # If the mapping is not provided, use the default mapping 1397 if not metabolite_metadata_mapping: 1398 metabolite_metadata_mapping = { 1399 "chebi_id": "chebi", 1400 "kegg_id": "kegg", 1401 "refmet_name": "common_name", 1402 "molecular_formula": "formula", 1403 "gnps_spectra_id":"id", 1404 "precursormz": "precursor_mz", 1405 "precursortype":"ion_type" 1406 } 1407 db_df.rename(columns=metabolite_metadata_mapping, inplace=True) 1408 db_df["molecular_data_id"] = db_df["inchikey"] 1409 1410 1411 1412 # Check if the 
resulting dataframe has the required columns for the flash entropy search 1413 required_columns = ["molecular_data_id", "precursor_mz", "ion_type", "id"] 1414 for col in required_columns: 1415 if col not in db_df.columns: 1416 raise ValueError( 1417 f"Input field on MSP must contain '{col}' column for FlashEntropy search." 1418 ) 1419 1420 # Pull out the metabolite metadata from the dataframe and put it into a different dataframe 1421 # First get a list of the possible attributes of the MetaboliteMetadata dataclass 1422 metabolite_metadata_keys = list(MetaboliteMetadata.__annotations__.keys()) 1423 # Replace id with molecular_data_id in metabolite_metadata_keys 1424 metabolite_metadata_keys = [ 1425 "molecular_data_id" if x == "id" else x for x in metabolite_metadata_keys 1426 ] 1427 metabolite_metadata_df = db_df[ 1428 db_df.columns[db_df.columns.isin(metabolite_metadata_keys)] 1429 ].copy() 1430 1431 # Make unique and recast the id column for metabolite metadata 1432 metabolite_metadata_df.drop_duplicates(subset=["molecular_data_id"], inplace=True) 1433 metabolite_metadata_df["id"] = metabolite_metadata_df["molecular_data_id"] 1434 1435 # Convert to a dictionary using the inchikey as the key 1436 metabolite_metadata_dict = metabolite_metadata_df.to_dict( 1437 orient="records" 1438 ) 1439 metabolite_metadata_dict = { 1440 v["id"]: self._dict_to_dataclass(v, MetaboliteMetadata) 1441 for v in metabolite_metadata_dict 1442 } 1443 1444 # Remove the metabolite metadata columns from the original dataframe 1445 for key in metabolite_metadata_keys: 1446 if key != "molecular_data_id": 1447 if key in db_df.columns: 1448 db_df.drop(columns=key, inplace=True) 1449 1450 # Format the spectral library 1451 format_func = self._get_format_func(format) 1452 lib = format_func(db_df, normalize=normalize, fe_kwargs=fe_kwargs) 1453 return (lib, metabolite_metadata_dict)
class SpectralDatabaseInterface(ABC):
    """
    Base class that facilitates connection to spectral reference databases,
    such as EMSL's Metabolomics Reference Database (MetabRef).
    """

    def __init__(self, key=None):
        """
        Initialize instance.

        Parameters
        ----------
        key : str
            Token key (name of the environment variable holding the token).
        """
        self.key = key

    def set_token(self, path):
        """
        Set environment variable for MetabRef database token.

        Parameters
        ----------
        path : str
            Path to token.
        """
        # Read token from file (first line only)
        with open(path, "r", encoding="utf-8") as f:
            token = f.readline().strip()

        # Set environment variable
        os.environ[self.key] = token

    def get_token(self):
        """
        Get environment variable for database token.

        Returns
        -------
        str
            Token string.

        Raises
        ------
        ValueError
            If the environment variable named by `self.key` is not set.
        """
        # Check for token
        if self.key not in os.environ:
            raise ValueError("Must set {} environment variable.".format(self.key))

        # Get token from environment variables
        return os.environ.get(self.key)

    def get_header(self):
        """
        Access stored database token and prepare as header.

        Returns
        -------
        dict
            Header dictionary with bearer authorization and content type.
        """
        # Get token
        token = self.get_token()

        # Pad header information
        header = {"Authorization": f"Bearer {token}", "Content-Type": "text/plain"}

        return header

    def get_query(self, url, use_header=True):
        """
        Request payload from URL according to `get` protocol.

        Parameters
        ----------
        url : str
            URL for request.
        use_header : bool
            Whether or not the query should include the header.

        Returns
        -------
        dict
            Response as JSON.
        """
        # Query URL via `get`
        if use_header:
            response = requests.get(url, headers=self.get_header())
        else:
            response = requests.get(url)

        # Raise on HTTP error status
        response.raise_for_status()

        # Return as JSON
        return response.json()

    def post_query(self, url, variable, values, tolerance):
        """
        Request payload from URL according to `post` protocol.

        Parameters
        ----------
        url : str
            URL for request.
        variable : str
            Variable to query.
        values : str
            Specific values of `variable` to query.
        tolerance : str
            Query tolerance relative to `values`.

        Returns
        -------
        dict
            Response as JSON.
        """
        # Coerce non-string inputs to compact strings
        if not isinstance(variable, str):
            variable = str(variable).replace(" ", "")

        if not isinstance(values, str):
            values = str(values).replace(" ", "")

        if not isinstance(tolerance, str):
            tolerance = str(tolerance).replace(" ", "")

        # Join URL segments with "/" explicitly; os.path.join would produce
        # backslash separators on Windows, yielding an invalid URL.
        query_url = "/".join((url.rstrip("/"), variable, tolerance))

        # Query URL via `post`
        response = requests.post(
            query_url,
            data=values,
            headers=self.get_header(),
        )

        # Check response
        response.raise_for_status()

        # Return as JSON
        return response.json()

    def _check_flash_entropy_kwargs(self, fe_kwargs):
        """
        Check FlashEntropy keyword arguments.

        Parameters
        ----------
        fe_kwargs : dict
            Keyword arguments for FlashEntropy search.

        Raises
        ------
        ValueError
            If only one of "min_ms2_difference_in_da" and
            "max_ms2_tolerance_in_da" is present, or if the former is not
            exactly twice the latter.
        """
        # FlashEntropy requires min_ms2_difference_in_da == 2 * max_ms2_tolerance_in_da
        if (
            "min_ms2_difference_in_da" in fe_kwargs
            or "max_ms2_tolerance_in_da" in fe_kwargs
        ):
            if (
                "min_ms2_difference_in_da" not in fe_kwargs
                or "max_ms2_tolerance_in_da" not in fe_kwargs
            ):
                raise ValueError(
                    "Both 'min_ms2_difference_in_da' and 'max_ms2_tolerance_in_da' must be specified."
                )
            if (
                fe_kwargs["min_ms2_difference_in_da"]
                != 2 * fe_kwargs["max_ms2_tolerance_in_da"]
            ):
                raise ValueError(
                    "The values of 'min_ms2_difference_in_da' must be exactly 2x 'max_ms2_tolerance_in_da'."
                )

    def _get_format_func(self, format):
        """
        Obtain format function by key.

        Parameters
        ----------
        format : str
            Case-insensitive key into `self.format_map`.

        Returns
        -------
        func
            Formatting function.

        Raises
        ------
        ValueError
            If `format` is not a supported key.
        """
        fmt = format.lower()
        if fmt in self.format_map:
            return self.format_map[fmt]

        raise ValueError(("{} not a supported format.").format(format))

    def _dict_to_dataclass(self, source_dict, data_class):
        """
        Convert dictionary to dataclass.

        Notes
        -----
        This function will pull the attributes of a dataclass and its parent
        class and convert the dictionary to a dataclass instance with the
        appropriate attributes; attributes missing from the dictionary are
        filled with None.

        Parameters
        ----------
        source_dict : dict
            Dictionary object to convert to dataclass.
        data_class : :obj:`~dataclasses.dataclass`
            Dataclass to convert to.

        Returns
        -------
        :obj:`~dataclasses.dataclass`
            Dataclass instance.
        """
        # Get list of expected attributes of data_class
        data_class_keys = list(data_class.__annotations__.keys())

        # If data_class inherits from another (non-object) class, include the
        # parent's annotated attributes as well.
        if len(data_class.__mro__) > 2:
            parent_class_keys = list(data_class.__bases__[0].__annotations__.keys())
            data_class_keys = list(set(data_class_keys + parent_class_keys))

        # Keep only keys known to the dataclass
        input_dict = {k: v for k, v in source_dict.items() if k in data_class_keys}

        # Fill attributes absent from the source dict with None
        for key in data_class_keys:
            if key not in input_dict:
                input_dict[key] = None
        return data_class(**input_dict)

    def _spectrum_to_array(self, spectrum, normalize=True):
        """
        Convert a parenthesis-delimited spectrum string to array.

        Parameters
        ----------
        spectrum : str
            Spectrum string, i.e. list of (m/z,abundance) pairs.
        normalize : bool
            Normalize the spectrum by its magnitude.

        Returns
        -------
        :obj:`~numpy.array`
            Array of shape (N, 2), with m/z in the first column and abundance
            in the second.
        """
        arr = np.array(
            re.findall(r"\(([^,]+),([^)]+)\)", spectrum), dtype=float
        ).reshape(-1, 2)

        if normalize:
            arr = self.normalize_peaks(arr)

        return arr

    @staticmethod
    def normalize_peaks(arr):
        """
        Normalize peaks in an array.

        Note: the abundance column is normalized in place; the input array is
        modified and also returned.

        Parameters
        ----------
        arr : :obj:`~numpy.array`
            Array of shape (N, 2), with m/z in the first column and abundance
            in the second.

        Returns
        -------
        :obj:`~numpy.array`
            Normalized array of shape (N, 2), with m/z in the first column and
            normalized abundance in the second.
        """
        # Normalize abundances so they sum to 1 (in place)
        arr[:, -1] = arr[:, -1] / arr[:, -1].sum()

        return arr

    @staticmethod
    def _build_flash_entropy_index(fe_lib, fe_kwargs=None, clean_spectra=True):
        """
        Build FlashEntropy index.

        Parameters
        ----------
        fe_lib : list
            List of spectra to build index from. Can be a list of dictionaries
            or a FlashEntropy search instance.
        fe_kwargs : dict, optional
            Keyword arguments for FlashEntropy search. Default is None
            (all FlashEntropy defaults).
        clean_spectra : bool, optional
            Clean spectra before building index. Default is True.

        Returns
        -------
        :obj:`~ms_entropy.FlashEntropySearch`
            FlashEntropy search instance.
        """
        # Avoid a shared mutable default argument
        if fe_kwargs is None:
            fe_kwargs = {}

        # Keywords accepted by the FlashEntropySearch constructor
        fe_init_kws = [
            "max_ms2_tolerance_in_da",
            "mz_index_step",
            "low_memory",
            "path_data",
        ]
        fe_init_kws = {k: v for k, v in fe_kwargs.items() if k in fe_init_kws}
        fes = FlashEntropySearch(**fe_init_kws)

        # Keywords accepted by FlashEntropySearch.build_index
        fe_index_kws = [
            "max_indexed_mz",
            "precursor_ions_removal_da",
            "noise_threshold",
            "min_ms2_difference_in_da",
            "max_peak_num",
        ]
        fe_index_kws = {k: v for k, v in fe_kwargs.items() if k in fe_index_kws}
        fes.build_index(fe_lib, **fe_index_kws, clean_spectra=clean_spectra)

        return fes
Base class that facilitates connection to spectral reference databases, such as EMSL's Metabolomics Reference Database (MetabRef).
31 def __init__(self, key=None): 32 """ 33 Initialize instance. 34 35 Parameters 36 ---------- 37 key : str 38 Token key. 39 40 """ 41 42 self.key = key
Initialize instance.
Parameters
- key (str): Token key.
44 def set_token(self, path): 45 """ 46 Set environment variable for MetabRef database token. 47 48 Parameters 49 ---------- 50 path : str 51 Path to token. 52 53 """ 54 55 # Read token from file 56 with open(path, "r", encoding="utf-8") as f: 57 token = f.readline().strip() 58 59 # Set environment variable 60 os.environ[self.key] = token
Set environment variable for MetabRef database token.
Parameters
- path (str): Path to token.
62 def get_token(self): 63 """ 64 Get environment variable for database token. 65 66 Returns 67 ------- 68 str 69 Token string. 70 71 """ 72 73 # Check for token 74 if self.key not in os.environ: 75 raise ValueError("Must set {} environment variable.".format(self.key)) 76 77 # Get token from environment variables 78 return os.environ.get(self.key)
Get environment variable for database token.
Returns
- str: Token string.
80 def get_header(self): 81 """ 82 Access stored database token and prepare as header. 83 84 Returns 85 ------- 86 str 87 Header string. 88 89 """ 90 91 # Get token 92 token = self.get_token() 93 94 # Pad header information 95 header = {"Authorization": f"Bearer {token}", "Content-Type": "text/plain"} 96 97 return header
Access stored database token and prepare as header.
Returns
- str: Header string.
99 def get_query(self, url, use_header=True): 100 """ 101 Request payload from URL according to `get` protocol. 102 103 Parameters 104 ---------- 105 url : str 106 URL for request. 107 use_header: bool 108 Whether or not the query should include the header 109 110 Returns 111 ------- 112 dict 113 Response as JSON. 114 115 """ 116 117 # Query URL via `get` 118 if use_header: 119 response = requests.get(url, headers=self.get_header()) 120 else: 121 response = requests.get(url) 122 123 # Check response 124 response.raise_for_status() 125 126 # Return as JSON 127 return response.json()
Request payload from URL according to get protocol.
Parameters
- url (str): URL for request.
- use_header (bool): Whether or not the query should include the header
Returns
- dict: Response as JSON.
129 def post_query(self, url, variable, values, tolerance): 130 """ 131 Request payload from URL according to `post` protocol. 132 133 Parameters 134 ---------- 135 url : str 136 URL for request. 137 variable : str 138 Variable to query. 139 values : str 140 Specific values of `variable` to query. 141 tolerance : str 142 Query tolerance relative to `values`. 143 144 Returns 145 ------- 146 dict 147 Response as JSON. 148 149 """ 150 151 # Coerce to string 152 if not isinstance(variable, str): 153 variable = str(variable).replace(" ", "") 154 155 if not isinstance(values, str): 156 values = str(values).replace(" ", "") 157 158 if not isinstance(tolerance, str): 159 tolerance = str(tolerance).replace(" ", "") 160 161 # Query URL via `post` 162 response = requests.post( 163 os.path.join(url, variable, tolerance), 164 data=values, 165 headers=self.get_header(), 166 ) 167 168 # Check response 169 response.raise_for_status() 170 171 # Return as JSON 172 return response.json()
Request payload from URL according to post protocol.
Parameters
- url (str): URL for request.
- variable (str): Variable to query.
- values (str): Specific values of `variable` to query.
- tolerance (str): Query tolerance relative to `values`.
Returns
- dict: Response as JSON.
294 @staticmethod 295 def normalize_peaks(arr): 296 """ 297 Normalize peaks in an array. 298 299 Parameters 300 ---------- 301 arr : :obj:`~numpy.array` 302 Array of shape (N, 2), with m/z in the first column and abundance in 303 the second. 304 305 Returns 306 ------- 307 :obj:`~numpy.array` 308 Normalized array of shape (N, 2), with m/z in the first column and 309 normalized abundance in the second. 310 """ 311 # Normalize the array 312 arr[:, -1] = arr[:, -1] / arr[:, -1].sum() 313 314 return arr
Normalize peaks in an array.
Parameters
- arr (numpy.ndarray): Array of shape (N, 2), with m/z in the first column and abundance in the second.
Returns
- numpy.ndarray: Normalized array of shape (N, 2), with m/z in the first column and normalized abundance in the second.
361class MetabRefInterface(SpectralDatabaseInterface): 362 """ 363 DEPRECATED interface retained for backward compatibility only. 364 """ 365 366 def __init__(self): 367 """ 368 Initialize instance with deprecation warning. 369 370 """ 371 372 super().__init__(key=None) 373 374 if self.__class__ is MetabRefInterface: 375 warnings.warn( 376 "MetabRefInterface is deprecated. Instantiate a concrete interface " 377 "such as GCMSLibraryInterface or LCLipidLibraryInterface instead.", 378 DeprecationWarning, 379 stacklevel=2, 380 )
DEPRECATED interface retained for backward compatibility only.
366 def __init__(self): 367 """ 368 Initialize instance with deprecation warning. 369 370 """ 371 372 super().__init__(key=None) 373 374 if self.__class__ is MetabRefInterface: 375 warnings.warn( 376 "MetabRefInterface is deprecated. Instantiate a concrete interface " 377 "such as GCMSLibraryInterface or LCLipidLibraryInterface instead.", 378 DeprecationWarning, 379 stacklevel=2, 380 )
Initialize instance with deprecation warning.
Inherited Members
class GCMSLibraryInterface(SpectralDatabaseInterface):
    """
    Interface to bundled GCMS spectral libraries in MSP format.

    Loads GCMS compound library and FAMES calibration library from local MSP
    files. Default files are bundled with CoreMS, but can be overridden via
    the GCMS_LIBRARY_PATH and FAMES_LIBRARY_PATH environment variables.
    """

    def __init__(self):
        """
        Initialize instance, resolving the library file locations.
        """
        super().__init__(key=None)

        # Default to bundled data files; `Path` comes from the module-level
        # pathlib import, so no function-local import is needed.
        data_dir = Path(__file__).parent.parent / "data"
        self.gcms_library_file = os.getenv(
            "GCMS_LIBRARY_PATH", str(data_dir / "PNNLMetV20191015.msp")
        )
        self.fames_library_file = os.getenv(
            "FAMES_LIBRARY_PATH", str(data_dir / "FAMES_REF.msp")
        )

        self.__init_format_map__()

    def __init_format_map__(self):
        """
        Initialize database format mapper, enabling multiple format requests.
        """
        # Define format workflows
        self.format_map = {
            "json": lambda x, normalize, fe_kwargs: x,
            "dict": lambda x, normalize, fe_kwargs: (
                self._to_LowResolutionEICompound_dict(x, normalize)
            ),
            "sql": lambda x, normalize, fe_kwargs: (
                self._LowResolutionEICompound_dict_to_sqlite(
                    self._to_LowResolutionEICompound_dict(x, normalize)
                )
            ),
        }

        # Add aliases
        self.format_map["metabref"] = self.format_map["json"]
        self.format_map["datadict"] = self.format_map["dict"]
        self.format_map["data-dict"] = self.format_map["dict"]
        self.format_map["lowreseicompound"] = self.format_map["dict"]
        self.format_map["lowres"] = self.format_map["dict"]
        self.format_map["lowresgc"] = self.format_map["dict"]
        self.format_map["sqlite"] = self.format_map["sql"]

    def available_formats(self):
        """
        View list of available formats.

        Returns
        -------
        list
            Format map keys.
        """
        return list(self.format_map.keys())

    def get_library(self, format="json", normalize=False):
        """
        Load GC/MS library from local MSP file.

        Parameters
        ----------
        format : str
            Format of requested library, i.e. "json", "sql", "dict".
            See `available_formats` method for aliases.
        normalize : bool
            Normalize the spectrum by its magnitude.

        Returns
        -------
        Library in requested format.
        """
        # Load from local MSP file
        library_data = self._load_msp_file(self.gcms_library_file, normalize)

        # Init format function
        format_func = self._get_format_func(format)

        # Apply format conversion
        return format_func(library_data, normalize, {})

    def get_fames(self, format="json", normalize=False):
        """
        Load GC/MS FAMEs library from local MSP file.

        Parameters
        ----------
        format : str
            Format of requested library, i.e. "json", "sql", "dict".
            See `available_formats` method for aliases.
        normalize : bool
            Normalize the spectrum by its magnitude.

        Returns
        -------
        Library in requested format.
        """
        # Load from local MSP file
        library_data = self._load_msp_file(self.fames_library_file, normalize)

        # Init format function
        format_func = self._get_format_func(format)

        # Apply format conversion
        return format_func(library_data, normalize, {})

    @staticmethod
    def _format_peaks(peaks):
        """
        Serialize (mz, abundance) pairs as "(mz,abundance)(mz,abundance)...",
        casting both to int as downstream parsing expects.
        """
        return "".join(f"({int(mz)},{int(abun)})" for mz, abun in peaks)

    def _load_msp_file(self, file_path, normalize=False):
        """
        Load and parse MSP file into format compatible with existing pipeline.

        Parameters
        ----------
        file_path : str
            Path to MSP file.
        normalize : bool
            Normalize spectra. (Normalization itself is applied downstream;
            the flag is accepted here for interface symmetry.)

        Returns
        -------
        list of dict
            Library data in format compatible with
            _to_LowResolutionEICompound_dict.

        Raises
        ------
        FileNotFoundError
            If `file_path` does not exist.
        """
        file_path = Path(file_path)
        if not file_path.exists():
            raise FileNotFoundError(
                f"Library file not found: {file_path}. "
                f"Set GCMS_LIBRARY_PATH or FAMES_LIBRARY_PATH environment variable to specify location."
            )

        # Map MSP fields to expected format (hoisted out of the line loop)
        field_mapping = {
            "name": "molecule_name",
            "formula": "formula",
            "cas": "casno",
            "retentiontime": "retention_time",
            "ri": "ri",
            "comment": "comments",
            "num peaks": "peak_count",
            "derivative": "derivative",
        }

        # Metadata fields that go into the nested metadata dict
        metadata_fields = {
            "inchikey": "inchikey",
            "inchi": "inchi",
            "smiles": "smiles",
            "pubchem": "pubchem",
            "chebi": "chebi",
            "kegg": "kegg",
            "refmet": "refmet",
            "iupac_name": "iupac_name",
        }

        # Parse MSP file
        spectra = []
        spectrum = {}
        peaks = []

        with open(file_path, "r") as f:
            for line in f:
                line = line.strip()

                # Empty line marks end of spectrum
                if not line:
                    if spectrum and peaks:
                        spectrum["mz"] = self._format_peaks(peaks)
                        spectra.append(spectrum)
                        spectrum = {}
                        peaks = []
                    continue

                # Peak data line (starts with a digit)
                if line[0].isdigit():
                    parts = line.split()
                    if len(parts) >= 2:
                        peaks.append((float(parts[0]), float(parts[1])))
                    continue

                # Metadata field ("key: value")
                if ":" in line:
                    key, value = line.split(":", 1)
                    key = key.strip().lower()
                    value = value.strip()

                    if key in field_mapping:
                        mapped_key = field_mapping[key]
                        # Convert numeric fields, leaving the string untouched
                        # when it is not parseable.
                        if key in ["retentiontime", "ri"]:
                            try:
                                value = float(value)
                            except ValueError:
                                pass
                        elif key == "num peaks":
                            try:
                                value = int(value)
                            except ValueError:
                                pass
                        spectrum[mapped_key] = value
                    elif key in metadata_fields:
                        # Store in nested metadata dict
                        spectrum.setdefault("metadata", {})[
                            metadata_fields[key]
                        ] = value
                    else:
                        # Keep unmapped fields
                        spectrum[key] = value

        # Add last spectrum if file doesn't end with blank line
        if spectrum and peaks:
            spectrum["mz"] = self._format_peaks(peaks)
            spectra.append(spectrum)

        return spectra

    def _to_LowResolutionEICompound_dict(self, metabref_lib, normalize=False):
        """
        Convert MetabRef-formatted library to CoreMS
        LowResolutionEICompound-formatted dictionary for local ingestion.

        Parameters
        ----------
        metabref_lib : dict
            MetabRef GC-MS library in JSON format.
        normalize : bool
            Normalize each spectrum by its magnitude.

        Returns
        -------
        list of dict
            List of each spectrum contained in dictionary.
        """
        # All below key:value lookups are based on CoreMS class definitions
        # NOT MetabRef content. For example, MetabRef has keys for PubChem,
        # USI, etc. that are not considered below.

        # Dictionary to map metabref keys to corems keys
        metadatar_cols = {
            "casno": "cas",
            "inchikey": "inchikey",
            "inchi": "inchi",
            "chebi": "chebi",
            "smiles": "smiles",
            "kegg": "kegg",
            "iupac_name": "iupac_name",
            "traditional_name": "traditional_name",  # Not present in metabref
            "common_name": "common_name",  # Not present in metabref
        }

        # Dictionary to map metabref keys to corems keys
        lowres_ei_compound_cols = {
            "id": "metabref_id",
            "molecule_name": "name",  # Is this correct?
            "classify": "classify",  # Not present in metabref
            "formula": "formula",
            "ri": "ri",
            "rt": "retention_time",
            "source": "source",  # Not present in metabref
            "casno": "casno",
            "comments": "comment",
            "source_temp_c": "source_temp_c",  # Not present in metabref
            "ev": "ev",  # Not present in metabref
            "peak_count": "peaks_count",
            "mz": "mz",
            "abundance": "abundance",
        }

        # Local result container
        corems_lib = []

        # Enumerate spectra
        for source_ in metabref_lib:
            # Copy source to prevent modification
            source = source_.copy()

            # Parse target data
            target = {
                lowres_ei_compound_cols[k]: v
                for k, v in source.items()
                if k in lowres_ei_compound_cols
            }

            # Explicitly add this to connect with LowResCompoundRef later
            if "retention_time" in source:
                target["rt"] = source["retention_time"]
            elif "rt" in source:
                target["rt"] = source["rt"]

            # Parse (mz, abundance)
            arr = self._spectrum_to_array(target["mz"], normalize=normalize)
            target["mz"] = arr[:, 0]
            target["abundance"] = arr[:, 1]

            # Parse meta data
            target["metadata"] = {
                metadatar_cols[k]: v for k, v in source.items() if k in metadatar_cols
            }

            # Add anything else
            for k in source:
                if k not in lowres_ei_compound_cols:
                    target[k] = source[k]

            # Add to CoreMS list
            corems_lib.append(target)

        return corems_lib

    def _LowResolutionEICompound_dict_to_sqlite(
        self, lowres_ei_compound_dict, url="sqlite://"
    ):
        """
        Convert CoreMS LowResolutionEICompound-formatted dictionary to SQLite
        database for local ingestion.

        Parameters
        ----------
        lowres_ei_compound_dict : dict
            CoreMS GC-MS library formatted for LowResolutionEICompound.
        url : str
            URL to SQLite prefix.

        Returns
        -------
        sqlite database
            Spectra contained in SQLite database.
        """
        # Dictionary to map corems keys to all-caps keys
        capped_cols = {
            "name": "NAME",
            "formula": "FORM",
            "ri": "RI",
            "retention_time": "RT",
            "source": "SOURCE",
            "casno": "CASNO",
            "comment": "COMMENT",
            "peaks_count": "NUM PEAKS",
        }

        # Initialize SQLite object
        sqlite_obj = EI_LowRes_SQLite(url=url)

        # Iterate spectra
        for _data_dict in lowres_ei_compound_dict:
            # Copy source to prevent modification
            data_dict = _data_dict.copy()

            # Add missing capped values (both keys are kept)
            for k, v in capped_cols.items():
                if k in data_dict:
                    data_dict[v] = data_dict[k]

            # Parse number of peaks
            if not data_dict.get("NUM PEAKS"):
                data_dict["NUM PEAKS"] = len(data_dict.get("mz"))

            # Parse CAS number
            if not data_dict.get("CASNO"):
                data_dict["CASNO"] = data_dict.get("CAS")

            if not data_dict["CASNO"]:
                data_dict["CASNO"] = 0

            # Build linked metadata table
            if "metadata" in data_dict:
                metadata = data_dict.pop("metadata")
                # Only create metadata entry if we have required fields and
                # valid data; filter to fields the Metadatar model supports.
                supported_metadata_fields = [
                    "cas", "inchikey", "inchi", "chebi", "smiles",
                    "kegg", "iupac_name", "traditional_name", "common_name",
                ]
                filtered_metadata = {
                    k: v for k, v in metadata.items()
                    if k in supported_metadata_fields and v
                }
                # Inchikey is required by the database model
                if filtered_metadata and filtered_metadata.get("inchikey"):
                    data_dict["metadatar"] = Metadatar(**filtered_metadata)

            # Attempt addition to sqlite; best-effort — report the failing
            # compound name and continue with the rest of the library.
            try:
                sqlite_obj.add_compound(data_dict)
            except Exception:
                print(data_dict["NAME"])

        return sqlite_obj
Interface to bundled GCMS spectral libraries in MSP format.
Loads GCMS compound library and FAMES calibration library from local MSP files. Default files are bundled with CoreMS, but can be overridden via environment variables.
391 def __init__(self): 392 """ 393 Initialize instance. 394 """ 395 super().__init__(key=None) 396 397 # Local data file paths 398 from pathlib import Path 399 400 # Default to bundled data files 401 data_dir = Path(__file__).parent.parent / "data" 402 self.gcms_library_file = os.getenv( 403 "GCMS_LIBRARY_PATH", 404 str(data_dir / "PNNLMetV20191015.msp") 405 ) 406 self.fames_library_file = os.getenv( 407 "FAMES_LIBRARY_PATH", 408 str(data_dir / "FAMES_REF.msp") 409 ) 410 411 self.__init_format_map__()
Initialize instance.
441 def available_formats(self): 442 """ 443 View list of available formats. 444 445 Returns 446 ------- 447 list 448 Format map keys. 449 """ 450 451 return list(self.format_map.keys())
View list of available formats.
Returns
- list: Format map keys.
453 def get_library(self, format="json", normalize=False): 454 """ 455 Load GC/MS library from local MSP file. 456 457 Parameters 458 ---------- 459 format : str 460 Format of requested library, i.e. "json", "sql", "dict". 461 See `available_formats` method for aliases. 462 normalize : bool 463 Normalize the spectrum by its magnitude. 464 465 Returns 466 ------- 467 Library in requested format. 468 469 """ 470 # Load from local MSP file 471 library_data = self._load_msp_file(self.gcms_library_file, normalize) 472 473 # Init format function 474 format_func = self._get_format_func(format) 475 476 # Apply format conversion 477 return format_func(library_data, normalize, {})
Load GC/MS library from local MSP file.
Parameters
- format (str):
Format of requested library, i.e. "json", "sql", "dict".
See `available_formats` method for aliases.
- normalize (bool): Normalize the spectrum by its magnitude.
Returns
- Library in requested format.
479 def get_fames(self, format="json", normalize=False): 480 """ 481 Load GC/MS FAMEs library from local MSP file. 482 483 Parameters 484 ---------- 485 format : str 486 Format of requested library, i.e. "json", "sql", "dict". 487 See `available_formats` method for aliases. 488 normalize : bool 489 Normalize the spectrum by its magnitude. 490 491 Returns 492 ------- 493 Library in requested format. 494 495 """ 496 # Load from local MSP file 497 library_data = self._load_msp_file(self.fames_library_file, normalize) 498 499 # Init format function 500 format_func = self._get_format_func(format) 501 502 # Apply format conversion 503 return format_func(library_data, normalize, {})
Load GC/MS FAMEs library from local MSP file.
Parameters
- format (str):
Format of requested library, i.e. "json", "sql", "dict".
See `available_formats` method for aliases.
- normalize (bool): Normalize the spectrum by its magnitude.
Returns
- Library in requested format.
Inherited Members
class MetabRefGCInterface(GCMSLibraryInterface):
    """
    DEPRECATED: Use GCMSLibraryInterface instead.

    This interface is maintained for backward compatibility only.
    MetabRef API has been discontinued as of 2026.
    """

    def __init__(self):
        """
        Initialize instance, emitting a deprecation warning before
        delegating to the parent class.
        """
        deprecation_message = (
            "MetabRefGCInterface is deprecated. Use GCMSLibraryInterface instead. "
            "MetabRef API has been discontinued; all data now loads from bundled local MSP files."
        )
        warnings.warn(deprecation_message, DeprecationWarning, stacklevel=2)
        super().__init__()
DEPRECATED: Use GCMSLibraryInterface instead.
This interface is maintained for backward compatibility only. MetabRef API has been discontinued as of 2026.
810 def __init__(self): 811 """ 812 Initialize instance with deprecation warning. 813 """ 814 warnings.warn( 815 "MetabRefGCInterface is deprecated. Use GCMSLibraryInterface instead. " 816 "MetabRef API has been discontinued; all data now loads from bundled local MSP files.", 817 DeprecationWarning, 818 stacklevel=2 819 ) 820 super().__init__()
Initialize instance with deprecation warning.
class LCLipidLibraryInterface(SpectralDatabaseInterface):
    """
    Interface to a local sqlite lipid library for LC-MS spectral searches.
    """

    # Public location of the reference sqlite library, surfaced in error
    # messages when no local copy can be found.
    DEFAULT_DOWNLOAD_URL = (
        "https://nmdcdemo.emsl.pnnl.gov/minio/lipidomics/parameter_files/"
        "202412_lipid_ref.sqlite"
    )

    def __init__(self, db_location=None):
        """
        Initialize instance.

        Parameters
        ----------
        db_location : str | Path, optional
            Local path to the sqlite lipid library. If omitted, the
            COREMS_LIPIDOMICS_SQLITE_PATH environment variable is used.
        """

        super().__init__(key=None)
        self.db_location = db_location
        self.__init_format_map__()

    def _to_flashentropy(self, spectral_library, normalize=True, fe_kwargs={}):
        """
        Convert a spectral library to FlashEntropy format.

        Parameters
        ----------
        spectral_library : dict
            MS2 library in JSON format or FlashEntropy search instance
            (for reformatting at different MS2 separation).
        normalize : bool
            Normalize each spectrum by its magnitude.
        fe_kwargs : dict, optional
            Keyword arguments for instantiation of FlashEntropy search and building index for FlashEntropy search;
            any keys not recognized will be ignored. By default, all parameters set to defaults.

        Returns
        -------
        :obj:`~ms_entropy.FlashEntropySearch`
            MS2 library as FlashEntropy search instance.

        Raises
        ------
        ValueError
            If "min_ms2_difference_in_da" or "max_ms2_tolerance_in_da" are present in `fe_kwargs` and they are not equal.

        """
        self._check_flash_entropy_kwargs(fe_kwargs)

        # Initialize empty library
        fe_lib = []

        # Enumerate spectra
        for i, source in enumerate(spectral_library):
            if "spectrum_data" in source.keys():
                spectrum = source["spectrum_data"]
            else:
                spectrum = source

            # FlashEntropy expects a "precursor_mz" key
            if "precursor_mz" not in spectrum.keys():
                spectrum["precursor_mz"] = spectrum.pop("precursor_ion")

            spectrum["peaks"] = self._spectrum_to_array(
                spectrum["mz"], normalize=normalize
            )
            fe_lib.append(spectrum)

        fe_search = self._build_flash_entropy_index(fe_lib, fe_kwargs=fe_kwargs)

        return fe_search

    def __init_format_map__(self):
        """
        Initialize database format mapper, enabling multiple format requests.
        """

        self.format_map = {
            "json": lambda x, normalize, fe_kwargs: x,
            "flashentropy": lambda x, normalize, fe_kwargs: self._to_flashentropy(
                x, normalize, fe_kwargs
            ),
            "dataframe": lambda x, normalize, fe_kwargs: pd.DataFrame(x),
        }

        # Aliases for the canonical formats above
        self.format_map["fe"] = self.format_map["flashentropy"]
        self.format_map["flash-entropy"] = self.format_map["flashentropy"]
        self.format_map["df"] = self.format_map["dataframe"]

    def available_formats(self):
        """
        View list of available formats.

        Returns
        -------
        list
            Format map keys.
        """

        return list(self.format_map.keys())

    def _resolve_db_location(self):
        """
        Resolve and validate sqlite database location.

        Returns
        -------
        Path
            Existing sqlite database file path.

        Raises
        ------
        ValueError
            If no path is configured via constructor or environment.
        FileNotFoundError
            If the configured path does not exist.
        """

        db_location = self.db_location or os.getenv("COREMS_LIPIDOMICS_SQLITE_PATH")
        if not db_location:
            raise ValueError(
                "A local lipid sqlite library path is required. "
                "Set COREMS_LIPIDOMICS_SQLITE_PATH or pass db_location."
            )

        db_path = Path(db_location).expanduser()
        if not db_path.exists():
            raise FileNotFoundError(
                f"Lipid sqlite library not found at {db_path}. "
                f"Download it from {self.DEFAULT_DOWNLOAD_URL} "
                "and set COREMS_LIPIDOMICS_SQLITE_PATH."
            )

        return db_path

    def _get_candidate_spectra(self, connection, mz_list, polarity, mz_tol_ppm):
        """
        Fetch candidate spectra rows by precursor m/z and polarity.

        Returns
        -------
        pandas.DataFrame
            Filtered rows from lipidMassSpectrumObject; empty DataFrame
            when no precursor falls within tolerance.
        """

        mz_observed = np.sort(np.asarray(mz_list, dtype=float))
        if mz_observed.size == 0:
            return pd.DataFrame()

        mz_all = pd.read_sql_query(
            "SELECT id, polarity, precursor_mz FROM lipidMassSpectrumObject", connection
        )
        mz_all = mz_all[mz_all["polarity"] == polarity].copy()
        if mz_all.empty:
            return pd.DataFrame()

        # find_closest requires a sorted reference array
        mz_all = mz_all.sort_values(by="precursor_mz").reset_index(drop=True)

        if mz_observed.size == 1:
            mz_all["closest_mz_obs"] = mz_observed[0]
        else:
            mz_all["closest_mz_obs"] = mz_observed[
                find_closest(mz_observed, mz_all.precursor_mz.values)
            ]

        mz_all["ppm_error"] = (
            (mz_all["precursor_mz"] - mz_all["closest_mz_obs"])
            / mz_all["precursor_mz"]
            * 1e6
        )

        mz_all = mz_all[np.abs(mz_all["ppm_error"]) <= mz_tol_ppm]
        if mz_all.empty:
            return pd.DataFrame()

        # Use parameterized placeholders instead of interpolating a Python
        # tuple: str(tuple) yields invalid SQL ("IN (5,)") when exactly one
        # id matches, and parameter binding is safer in general.
        mz_ids = mz_all["id"].tolist()
        placeholders = ", ".join("?" * len(mz_ids))
        return pd.read_sql_query(
            f"SELECT * FROM lipidMassSpectrumObject WHERE id IN ({placeholders})",
            connection,
            params=mz_ids,
        )

    def get_lipid_library(
        self,
        mz_list,
        polarity,
        mz_tol_ppm,
        mz_tol_da_api=None,
        format="json",
        normalize=True,
        fe_kwargs={},
        api_delay=5,
        api_attempts=10,
    ):
        """
        Retrieve lipid spectra and metadata from a local sqlite library.

        Parameters
        ----------
        mz_list : list
            List of precursor m/z values.
        polarity : str
            Ionization polarity, either "positive" or "negative".
        mz_tol_ppm : float
            Tolerance in ppm for precursor matching.
        mz_tol_da_api : float, optional
            Unused, kept for backward compatibility.
        format : str, optional
            Format of requested library, e.g. "json" or "flashentropy".
        normalize : bool, optional
            Normalize spectrum intensities.
        fe_kwargs : dict, optional
            Keyword arguments for FlashEntropy search.
        api_delay : int, optional
            Unused, kept for backward compatibility.
        api_attempts : int, optional
            Unused, kept for backward compatibility.

        Returns
        -------
        tuple
            Library in requested format and lipid metadata dictionary.

        Raises
        ------
        ValueError
            If inputs fail validation (types or polarity value).
        """

        if not isinstance(mz_list, (list, np.ndarray)):
            raise ValueError("mz_list must be a list or numpy array")
        if not all(isinstance(mz, (float, int)) for mz in mz_list):
            raise ValueError("All elements in mz_list must be float or int")
        if polarity not in {"positive", "negative"}:
            raise ValueError("polarity must be either 'positive' or 'negative'")
        if not isinstance(mz_tol_ppm, (float, int)):
            raise ValueError("mz_tol_ppm must be a float or int")

        db_path = self._resolve_db_location()
        connection = sqlite3.connect(str(db_path))
        try:
            # Step 1: Get candidate spectra records based on m/z and polarity
            spectra_df = self._get_candidate_spectra(
                connection=connection,
                mz_list=mz_list,
                polarity=polarity,
                mz_tol_ppm=float(mz_tol_ppm),
            )

            if spectra_df.empty:
                format_func = self._get_format_func(format)
                return format_func([], normalize=normalize, fe_kwargs=fe_kwargs), {}

            # Step 2: Get corresponding lipid metadata for candidate spectra
            # from the lipidTree view. Parameterized placeholders avoid the
            # invalid-SQL trailing comma from a one-element Python tuple.
            mol_ids = spectra_df["molecular_data_id"].tolist()
            placeholders = ", ".join("?" * len(mol_ids))
            mol_df = pd.read_sql_query(
                f"SELECT * FROM lipidTree WHERE id IN ({placeholders})",
                connection,
                params=mol_ids,
            )
        finally:
            # Always release the sqlite connection, even on query failure.
            connection.close()

        # Key metadata by the molecular id for lookup during annotation.
        mol_df["id_index"] = mol_df["id"]
        mol_df = mol_df.set_index("id_index")
        mol_records = mol_df.to_dict(orient="index")
        lipid_metadata = {
            int(k): self._dict_to_dataclass(v, LipidMetadata)
            for k, v in mol_records.items()
        }

        spectra_records = spectra_df.to_dict(orient="records")
        format_func = self._get_format_func(format)
        library = format_func(spectra_records, normalize=normalize, fe_kwargs=fe_kwargs)
        return library, lipid_metadata
Interface to a local sqlite lipid library for LC-MS spectral searches.
833 def __init__(self, db_location=None): 834 """ 835 Initialize instance. 836 837 Parameters 838 ---------- 839 db_location : str | Path, optional 840 Local path to the sqlite lipid library. If omitted, the 841 COREMS_LIPIDOMICS_SQLITE_PATH environment variable is used. 842 """ 843 844 super().__init__(key=None) 845 self.db_location = db_location 846 self.__init_format_map__()
Initialize instance.
Parameters
- db_location (str | Path, optional): Local path to the sqlite lipid library. If omitted, the COREMS_LIPIDOMICS_SQLITE_PATH environment variable is used.
915 def available_formats(self): 916 """ 917 View list of available formats. 918 919 Returns 920 ------- 921 list 922 Format map keys. 923 """ 924 925 return list(self.format_map.keys())
View list of available formats.
Returns
- list: Format map keys.
1000 def get_lipid_library( 1001 self, 1002 mz_list, 1003 polarity, 1004 mz_tol_ppm, 1005 mz_tol_da_api=None, 1006 format="json", 1007 normalize=True, 1008 fe_kwargs={}, 1009 api_delay=5, 1010 api_attempts=10, 1011 ): 1012 """ 1013 Retrieve lipid spectra and metadata from a local sqlite library. 1014 1015 Parameters 1016 ---------- 1017 mz_list : list 1018 List of precursor m/z values. 1019 polarity : str 1020 Ionization polarity, either "positive" or "negative". 1021 mz_tol_ppm : float 1022 Tolerance in ppm for precursor matching. 1023 mz_tol_da_api : float, optional 1024 Unused, kept for backward compatibility. 1025 format : str, optional 1026 Format of requested library, e.g. "json" or "flashentropy". 1027 normalize : bool, optional 1028 Normalize spectrum intensities. 1029 fe_kwargs : dict, optional 1030 Keyword arguments for FlashEntropy search. 1031 api_delay : int, optional 1032 Unused, kept for backward compatibility. 1033 api_attempts : int, optional 1034 Unused, kept for backward compatibility. 1035 1036 Returns 1037 ------- 1038 tuple 1039 Library in requested format and lipid metadata dictionary. 
1040 """ 1041 1042 if not isinstance(mz_list, (list, np.ndarray)): 1043 raise ValueError("mz_list must be a list or numpy array") 1044 if not all(isinstance(mz, (float, int)) for mz in mz_list): 1045 raise ValueError("All elements in mz_list must be float or int") 1046 if polarity not in {"positive", "negative"}: 1047 raise ValueError("polarity must be either 'positive' or 'negative'") 1048 if not isinstance(mz_tol_ppm, (float, int)): 1049 raise ValueError("mz_tol_ppm must be a float or int") 1050 1051 db_path = self._resolve_db_location() 1052 connection = sqlite3.connect(str(db_path)) 1053 try: 1054 # Step 1: Get candidate spectra records based on m/z and polarity 1055 spectra_df = self._get_candidate_spectra( 1056 connection=connection, 1057 mz_list=mz_list, 1058 polarity=polarity, 1059 mz_tol_ppm=float(mz_tol_ppm), 1060 ) 1061 1062 if spectra_df.empty: 1063 format_func = self._get_format_func(format) 1064 return format_func([], normalize=normalize, fe_kwargs=fe_kwargs), {} 1065 1066 # Step 2: Get corresponding lipid metadata for candidate spectra from lipidTree view 1067 mol_ids = tuple(spectra_df["molecular_data_id"].tolist()) 1068 mol_df = pd.read_sql_query( 1069 f"SELECT * FROM lipidTree WHERE id IN {mol_ids}", 1070 connection, 1071 ) 1072 finally: 1073 connection.close() 1074 1075 mol_df["id_index"] = mol_df["id"] 1076 mol_df = mol_df.set_index("id_index") 1077 mol_records = mol_df.to_dict(orient="index") 1078 lipid_metadata = { 1079 int(k): self._dict_to_dataclass(v, LipidMetadata) 1080 for k, v in mol_records.items() 1081 } 1082 1083 spectra_records = spectra_df.to_dict(orient="records") 1084 format_func = self._get_format_func(format) 1085 library = format_func(spectra_records, normalize=normalize, fe_kwargs=fe_kwargs) 1086 return library, lipid_metadata
Retrieve lipid spectra and metadata from a local sqlite library.
Parameters
- mz_list (list): List of precursor m/z values.
- polarity (str): Ionization polarity, either "positive" or "negative".
- mz_tol_ppm (float): Tolerance in ppm for precursor matching.
- mz_tol_da_api (float, optional): Unused, kept for backward compatibility.
- format (str, optional): Format of requested library, e.g. "json" or "flashentropy".
- normalize (bool, optional): Normalize spectrum intensities.
- fe_kwargs (dict, optional): Keyword arguments for FlashEntropy search.
- api_delay (int, optional): Unused, kept for backward compatibility.
- api_attempts (int, optional): Unused, kept for backward compatibility.
Returns
- tuple: Library in requested format and lipid metadata dictionary.
Inherited Members
class MSPInterface(SpectralDatabaseInterface):
    """
    Interface to parse NIST MSP files
    """

    def __init__(self, file_path):
        """
        Initialize instance.

        Parameters
        ----------
        file_path : str
            Path to a local MSP file.

        Attributes
        ----------
        file_path : str
            Path to the MSP file.
        _file_content : str
            Content of the MSP file.
        _data_frame : :obj:`~pandas.DataFrame`
            DataFrame of spectra from the MSP file with unaltered content.

        Raises
        ------
        FileNotFoundError
            If `file_path` does not exist.
        """
        super().__init__(key=None)

        self.file_path = file_path
        if not os.path.exists(self.file_path):
            raise FileNotFoundError(
                f"File {self.file_path} does not exist. Please check the file path."
            )
        with open(self.file_path, "r") as f:
            self._file_content = f.read()

        self._data_frame = self._read_msp_file()
        self.__init_format_map__()

    def __init_format_map__(self):
        """
        Initialize database format mapper, enabling multiple format requests.

        """

        # x is a pandas dataframe similar to self._data_frame format
        # Define format workflows
        self.format_map = {
            "msp": lambda x, normalize, fe_kwargs: self._to_msp(x, normalize),
            "flashentropy": lambda x, normalize, fe_kwargs: self._to_flashentropy(
                x, normalize, fe_kwargs
            ),
            "df": lambda x, normalize, fe_kwargs: self._to_df(x, normalize),
        }

        # Add aliases
        self.format_map["fe"] = self.format_map["flashentropy"]
        self.format_map["flash-entropy"] = self.format_map["flashentropy"]
        self.format_map["dataframe"] = self.format_map["df"]
        self.format_map["data-frame"] = self.format_map["df"]

    def _read_msp_file(self):
        """
        Read the MSP file content into a pandas DataFrame.

        Returns
        -------
        :obj:`~pandas.DataFrame`
            DataFrame of spectra from the MSP file, exactly as it is in the
            file (no sorting, filtering, or normalization).
        """
        spectra = []
        spectrum = {}

        f = StringIO(self._file_content)
        for line in f:
            line = line.strip()
            if not line:
                continue  # Skip empty lines

            # Handle metadata
            if ":" in line:
                key, value = line.split(":", 1)
                key = key.strip().lower()
                value = value.strip()

                if key == "name":
                    # Save current spectrum and start a new one
                    if spectrum:
                        spectra.append(spectrum)
                    spectrum = {"name": value, "peaks": []}
                else:
                    spectrum[key] = value

            # Handle peak data (assumed to start with a number)
            elif line[0].isdigit():
                peaks = line.split()
                m_z = float(peaks[0])
                intensity = float(peaks[1])
                # setdefault tolerates malformed files in which a peak row
                # appears before any "Name:" record (previously a KeyError).
                spectrum.setdefault("peaks", []).append([m_z, intensity])

        # Save the last spectrum
        if spectrum:
            spectra.append(spectrum)

        df = pd.DataFrame(spectra)
        for column in df.columns:
            if column != "peaks":  # Skip 'peaks' column
                try:
                    df[column] = pd.to_numeric(df[column], errors="raise")
                except (ValueError, TypeError):
                    # Column holds non-numeric metadata; keep it as strings.
                    pass
        return df

    def _to_df(self, input_dataframe, normalize=True):
        """
        Return the spectra DataFrame, optionally normalizing each spectrum.

        Parameters
        ----------
        input_dataframe : :obj:`~pandas.DataFrame`
            Input DataFrame containing MSP-formatted spectra.
        normalize : bool, optional
            Normalize each spectrum by its magnitude.
            Default is True.

        Returns
        -------
        :obj:`~pandas.DataFrame`
            DataFrame with desired normalization.

        Raises
        ------
        KeyError
            If any spectrum is missing the 'peaks' key.
        """
        if not normalize:
            return input_dataframe

        # Convert to dictionary
        db_dict = input_dataframe.to_dict(orient="records")

        # Initialize empty library
        lib = []

        # Enumerate spectra
        for spectrum in db_dict:
            # Check that spectrum["peaks"] exists
            if "peaks" not in spectrum.keys():
                raise KeyError(
                    "MSP not interpretted correctly, 'peaks' key not found in spectrum, check _dataframe attribute."
                )

            # Convert spectrum["peaks"] to numpy array
            if not isinstance(spectrum["peaks"], np.ndarray):
                spectrum["peaks"] = np.array(spectrum["peaks"])

            # Normalize peaks and refresh the peak count accordingly
            spectrum["peaks"] = self.normalize_peaks(spectrum["peaks"])
            spectrum["num peaks"] = len(spectrum["peaks"])

            # Add spectrum to library
            lib.append(spectrum)

        # Convert to DataFrame
        return pd.DataFrame(lib)

    def _to_flashentropy(self, input_dataframe, normalize=True, fe_kwargs={}):
        """
        Convert MSP-derived library to FlashEntropy library.

        Parameters
        ----------
        input_dataframe : :obj:`~pandas.DataFrame`
            Input DataFrame containing MSP-formatted spectra.
        normalize : bool
            Normalize each spectrum by its magnitude.
        fe_kwargs : dict, optional
            Keyword arguments for instantiation of FlashEntropy search and building index for FlashEntropy search;
            any keys not recognized will be ignored. By default, all parameters set to defaults.

        Returns
        -------
        :obj:`~ms_entropy.FlashEntropySearch`
            MS2 library as FlashEntropy search instance.

        Raises
        ------
        ValueError
            If "min_ms2_difference_in_da" or "max_ms2_tolerance_in_da" are
            present in `fe_kwargs` and they are not equal.
        KeyError
            If a spectrum has no precursor m/z key or no 'peaks' key.
        """
        self._check_flash_entropy_kwargs(fe_kwargs)

        db_df = input_dataframe

        # Convert to dictionary
        db_dict = db_df.to_dict(orient="records")

        # Initialize empty library
        fe_lib = []

        # Enumerate spectra
        for i, source in enumerate(db_dict):
            # Reorganize source dict, if necessary
            if "spectrum_data" in source.keys():
                spectrum = source["spectrum_data"]
            else:
                spectrum = source

            # Rename precursor_mz key for FlashEntropy
            if "precursor_mz" not in spectrum.keys():
                if "precursormz" in spectrum:
                    spectrum["precursor_mz"] = spectrum.pop("precursormz")
                elif "precursor_ion" in spectrum:
                    spectrum["precursor_mz"] = spectrum.pop("precursor_ion")
                else:
                    raise KeyError(
                        "MSP must have either 'precursormz' or 'precursor_ion' key to be converted to FlashEntropy format."
                    )

            # Check that spectrum["peaks"] exists
            if "peaks" not in spectrum.keys():
                raise KeyError(
                    "MSP not interpretted correctly, 'peaks' key not found in spectrum, check _dataframe attribute."
                )

            # Convert spectrum["peaks"] to numpy array
            if not isinstance(spectrum["peaks"], np.ndarray):
                spectrum["peaks"] = np.array(spectrum["peaks"])

            # Normalize peaks, if requested
            if normalize:
                spectrum["peaks"] = self.normalize_peaks(spectrum["peaks"])

            # Add spectrum to library
            fe_lib.append(spectrum)

        # Build FlashEntropy index
        fe_search = self._build_flash_entropy_index(fe_lib, fe_kwargs=fe_kwargs)

        return fe_search

    def _check_msp_compatibility(self):
        """
        Check if the MSP file is compatible with the get_metabolomics_spectra_library method and provide feedback if it is not.

        Raises
        ------
        ValueError
            If a required column is missing or contains invalid/null values.
        """
        # Check polarity
        if (
            "polarity" not in self._data_frame.columns
            and "ionmode" not in self._data_frame.columns
        ):
            raise ValueError(
                "Neither 'polarity' nor 'ionmode' columns found in the input MSP metadata. Please check the file."
            )
        polarity_column = (
            "polarity" if "polarity" in self._data_frame.columns else "ionmode"
        )

        # Check if polarity_column contents is either "positive" or "negative"
        if not all(self._data_frame[polarity_column].isin(["positive", "negative"])):
            raise ValueError(
                f"Input field on MSP '{polarity_column}' must contain only 'positive' or 'negative' values."
            )

        # Check if the MSP file contains the required columns for metabolite metadata
        # inchikey, by name, not null
        # either formula or molecular_formula, not null
        if not all(self._data_frame["inchikey"].notnull()):
            raise ValueError(
                "Input field on MSP 'inchikey' must contain only non-null values."
            )
        if (
            "formula" not in self._data_frame.columns
            and "molecular_formula" not in self._data_frame.columns
        ):
            raise ValueError(
                "Input field on MSP must contain either 'formula' or 'molecular_formula' columns."
            )
        molecular_formula_column = (
            "formula" if "formula" in self._data_frame.columns else "molecular_formula"
        )
        if not all(self._data_frame[molecular_formula_column].notnull()):
            raise ValueError(
                f"Input field on MSP '{molecular_formula_column}' must contain only non-null values."
            )

    def get_metabolomics_spectra_library(
        self,
        polarity,
        metabolite_metadata_mapping={},
        format="fe",
        normalize=True,
        fe_kwargs={},
    ):
        """
        Prepare metabolomics spectra library and associated metabolite metadata

        Note: this uses the inchikey as the index for the metabolite metadata dataframe and for connecting to the spectra, so it must be in the input

        Parameters
        ----------
        polarity : str
            Ionization polarity, either "positive" or "negative".
        metabolite_metadata_mapping : dict, optional
            Mapping from MSP column names to MetaboliteMetadata field names;
            a default mapping is used when empty.
        format : str, optional
            Requested library format alias, e.g. "fe" (FlashEntropy).
        normalize : bool, optional
            Normalize each spectrum by its magnitude.
        fe_kwargs : dict, optional
            Keyword arguments for FlashEntropy search.

        Returns
        -------
        tuple
            (library in requested format, dict of inchikey -> MetaboliteMetadata).
        """
        # Check if the MSP file is compatible with the get_metabolomics_spectra_library method
        self._check_msp_compatibility()

        # Check if the polarity parameter is valid and if a polarity column exists in the dataframe
        if polarity not in ["positive", "negative"]:
            raise ValueError("Polarity must be 'positive' or 'negative'")
        polarity_column = (
            "polarity" if "polarity" in self._data_frame.columns else "ionmode"
        )

        # Get a subset of the initial dataframe by polarity
        db_df = self._data_frame[self._data_frame[polarity_column] == polarity].copy()

        # Rename the columns of the db_df to match the MetaboliteMetadata dataclass using the metabolite_metadata_mapping
        # If the mapping is not provided, use the default mapping
        if not metabolite_metadata_mapping:
            metabolite_metadata_mapping = {
                "chebi_id": "chebi",
                "kegg_id": "kegg",
                "refmet_name": "common_name",
                "molecular_formula": "formula",
                "gnps_spectra_id": "id",
                "precursormz": "precursor_mz",
                "precursortype": "ion_type",
            }
        db_df.rename(columns=metabolite_metadata_mapping, inplace=True)
        db_df["molecular_data_id"] = db_df["inchikey"]

        # Check if the resulting dataframe has the required columns for the flash entropy search
        required_columns = ["molecular_data_id", "precursor_mz", "ion_type", "id"]
        for col in required_columns:
            if col not in db_df.columns:
                raise ValueError(
                    f"Input field on MSP must contain '{col}' column for FlashEntropy search."
                )

        # Pull out the metabolite metadata from the dataframe and put it into a different dataframe
        # First get a list of the possible attributes of the MetaboliteMetadata dataclass
        metabolite_metadata_keys = list(MetaboliteMetadata.__annotations__.keys())
        # Replace id with molecular_data_id in metabolite_metadata_keys
        metabolite_metadata_keys = [
            "molecular_data_id" if x == "id" else x for x in metabolite_metadata_keys
        ]
        metabolite_metadata_df = db_df[
            db_df.columns[db_df.columns.isin(metabolite_metadata_keys)]
        ].copy()

        # Make unique and recast the id column for metabolite metadata
        metabolite_metadata_df.drop_duplicates(
            subset=["molecular_data_id"], inplace=True
        )
        metabolite_metadata_df["id"] = metabolite_metadata_df["molecular_data_id"]

        # Convert to a dictionary using the inchikey as the key
        metabolite_metadata_dict = metabolite_metadata_df.to_dict(orient="records")
        metabolite_metadata_dict = {
            v["id"]: self._dict_to_dataclass(v, MetaboliteMetadata)
            for v in metabolite_metadata_dict
        }

        # Remove the metabolite metadata columns from the original dataframe
        for key in metabolite_metadata_keys:
            if key != "molecular_data_id":
                if key in db_df.columns:
                    db_df.drop(columns=key, inplace=True)

        # Format the spectral library
        format_func = self._get_format_func(format)
        lib = format_func(db_df, normalize=normalize, fe_kwargs=fe_kwargs)
        return (lib, metabolite_metadata_dict)
Interface to parse NIST MSP files
1094 def __init__(self, file_path): 1095 """ 1096 Initialize instance. 1097 1098 Parameters 1099 ---------- 1100 file_path : str 1101 Path to a local MSP file. 1102 1103 Attributes 1104 ---------- 1105 file_path : str 1106 Path to the MSP file. 1107 _file_content : str 1108 Content of the MSP file. 1109 _data_frame : :obj:`~pandas.DataFrame` 1110 DataFrame of spectra from the MSP file with unaltered content. 1111 """ 1112 super().__init__(key=None) 1113 1114 self.file_path = file_path 1115 if not os.path.exists(self.file_path): 1116 raise FileNotFoundError( 1117 f"File {self.file_path} does not exist. Please check the file path." 1118 ) 1119 with open(self.file_path, "r") as f: 1120 self._file_content = f.read() 1121 1122 self._data_frame = self._read_msp_file() 1123 self.__init_format_map__()
Initialize instance.
Parameters
- file_path (str): Path to a local MSP file.
Attributes
- file_path (str): Path to the MSP file.
- _file_content (str): Content of the MSP file.
- _data_frame (pandas.DataFrame): DataFrame of spectra from the MSP file with unaltered content.
def get_metabolomics_spectra_library(
    self,
    polarity,
    metabolite_metadata_mapping=None,
    format="fe",
    normalize=True,
    fe_kwargs=None,
):
    """
    Prepare metabolomics spectra library and associated metabolite metadata.

    Note: this uses the inchikey as the index for the metabolite metadata
    dataframe and for connecting to the spectra, so it must be in the input.

    Parameters
    ----------
    polarity : str
        Ionization polarity, either "positive" or "negative".
    metabolite_metadata_mapping : dict, optional
        Mapping of input column names to MetaboliteMetadata attribute
        names. If None or empty, a default mapping is used.
    format : str, optional
        Library format key resolved via `_get_format_func`.
        Default is "fe".
    normalize : bool, optional
        Whether to normalize the spectra. Default is True.
    fe_kwargs : dict, optional
        Extra keyword arguments forwarded to the format function.

    Returns
    -------
    tuple
        (spectral library, dict of MetaboliteMetadata dataclasses keyed
        by inchikey).

    Raises
    ------
    ValueError
        If `polarity` is invalid or a required column is missing.
    """
    # Avoid mutable default arguments: bind fresh dicts per call.
    if metabolite_metadata_mapping is None:
        metabolite_metadata_mapping = {}
    if fe_kwargs is None:
        fe_kwargs = {}

    # Check if the MSP file is compatible with the get_metabolomics_spectra_library method
    self._check_msp_compatibility()

    # Check if the polarity parameter is valid and if a polarity column exists in the dataframe
    if polarity not in ("positive", "negative"):
        raise ValueError("Polarity must be 'positive' or 'negative'")
    polarity_column = (
        "polarity" if "polarity" in self._data_frame.columns else "ionmode"
    )

    # Get a subset of the initial dataframe by polarity
    db_df = self._data_frame[self._data_frame[polarity_column] == polarity].copy()

    # Rename the columns of db_df to match the MetaboliteMetadata dataclass
    # using the metabolite_metadata_mapping; fall back to the default mapping
    # when none was supplied.
    if not metabolite_metadata_mapping:
        metabolite_metadata_mapping = {
            "chebi_id": "chebi",
            "kegg_id": "kegg",
            "refmet_name": "common_name",
            "molecular_formula": "formula",
            "gnps_spectra_id": "id",
            "precursormz": "precursor_mz",
            "precursortype": "ion_type",
        }
    db_df.rename(columns=metabolite_metadata_mapping, inplace=True)
    db_df["molecular_data_id"] = db_df["inchikey"]

    # Check if the resulting dataframe has the required columns for the flash entropy search
    required_columns = ["molecular_data_id", "precursor_mz", "ion_type", "id"]
    for col in required_columns:
        if col not in db_df.columns:
            raise ValueError(
                f"Input field on MSP must contain '{col}' column for FlashEntropy search."
            )

    # Pull out the metabolite metadata from the dataframe.
    # Attribute names come from the MetaboliteMetadata dataclass, with
    # "id" replaced by "molecular_data_id".
    metabolite_metadata_keys = [
        "molecular_data_id" if x == "id" else x
        for x in MetaboliteMetadata.__annotations__.keys()
    ]
    metabolite_metadata_df = db_df[
        db_df.columns[db_df.columns.isin(metabolite_metadata_keys)]
    ].copy()

    # Make unique and recast the id column for metabolite metadata
    metabolite_metadata_df.drop_duplicates(
        subset=["molecular_data_id"], inplace=True
    )
    metabolite_metadata_df["id"] = metabolite_metadata_df["molecular_data_id"]

    # Convert to a dictionary of dataclasses keyed by id (the inchikey)
    metabolite_metadata_dict = {
        record["id"]: self._dict_to_dataclass(record, MetaboliteMetadata)
        for record in metabolite_metadata_df.to_dict(orient="records")
    }

    # Remove the metabolite metadata columns (except the join key) from the
    # original dataframe
    for key in metabolite_metadata_keys:
        if key != "molecular_data_id" and key in db_df.columns:
            db_df.drop(columns=key, inplace=True)

    # Format the spectral library
    format_func = self._get_format_func(format)
    lib = format_func(db_df, normalize=normalize, fe_kwargs=fe_kwargs)
    return (lib, metabolite_metadata_dict)
Prepare a metabolomics spectra library and its associated metabolite metadata.
Note: this method uses the InChIKey as the index of the metabolite metadata dataframe and as the link to the spectra, so the InChIKey must be present in the input.