corems.molecular_id.search.molecularFormulaSearch
1__author__ = "Yuri E. Corilo" 2__date__ = "Jul 29, 2019" 3 4 5from typing import List 6 7import tqdm 8 9from corems import chunks, timeit 10from corems.encapsulation.constant import Labels 11from corems.molecular_formula.factory.MolecularFormulaFactory import ( 12 LCMSLibRefMolecularFormula, 13 MolecularFormula, 14) 15from corems.molecular_id.factory.MolecularLookupTable import MolecularCombinations 16from corems.molecular_id.factory.molecularSQL import MolForm_SQL 17from corems.ms_peak.factory.MSPeakClasses import _MSPeak 18 19last_error = 0 20last_dif = 0 21closest_error = 0 22error_average = 0 23nbValues = 0 24 25 26class SearchMolecularFormulas: 27 """Class for searching molecular formulas in a mass spectrum. 28 29 Parameters 30 ---------- 31 mass_spectrum_obj : MassSpectrum 32 The mass spectrum object. 33 sql_db : MolForm_SQL, optional 34 The SQL database object, by default None. 35 first_hit : bool, optional 36 Flag to indicate whether to skip peaks that already have a molecular formula assigned, by default False. 37 find_isotopologues : bool, optional 38 Flag to indicate whether to find isotopologues, by default True. 39 40 Attributes 41 ---------- 42 mass_spectrum_obj : MassSpectrum 43 The mass spectrum object. 44 sql_db : MolForm_SQL 45 The SQL database object. 46 first_hit : bool 47 Flag to indicate whether to skip peaks that already have a molecular formula assigned. 48 find_isotopologues : bool 49 Flag to indicate whether to find isotopologues. 50 51 52 Methods 53 ------- 54 * run_search(). 55 Run the molecular formula search. 56 * run_worker_mass_spectrum(). 57 Run the molecular formula search on the mass spectrum object. 58 * run_worker_ms_peaks(). 59 Run the molecular formula search on the given list of mass spectrum peaks. 60 * database_to_dict(). 61 Convert the database results to a dictionary. 62 * run_molecular_formula(). 63 Run the molecular formula search on the given list of mass spectrum peaks. 64 * search_mol_formulas(). 65 Search for molecular formulas in the mass spectrum. 66 67 """ 68 69 def __init__( 70 self, 71 mass_spectrum_obj, 72 sql_db=None, 73 first_hit: bool = False, 74 find_isotopologues: bool = True, 75 ): 76 self.first_hit = first_hit 77 78 self.find_isotopologues = find_isotopologues 79 80 self.mass_spectrum_obj = mass_spectrum_obj 81 82 if not sql_db: 83 self.sql_db = MolForm_SQL( 84 url=mass_spectrum_obj.molecular_search_settings.url_database 85 ) 86 87 else: 88 self.sql_db = sql_db 89 90 def __enter__(self): 91 """Open the SQL database connection.""" 92 return self 93 94 def __exit__(self, exc_type, exc_val, exc_tb): 95 """Close the SQL database connection.""" 96 self.sql_db.close() 97 98 return False 99 100 def run_search( 101 self, 102 mspeaks: list, 103 query: dict, 104 min_abundance: float, 105 ion_type: str, 106 ion_charge: int, 107 adduct_atom=None, 108 ): 109 """Run the molecular formula search. 110 111 Parameters 112 ---------- 113 mspeaks : list of MSPeak 114 The list of mass spectrum peaks. 115 query : dict 116 The query dictionary containing the possible molecular formulas. 117 min_abundance : float 118 The minimum abundance threshold. 119 ion_type : str 120 The ion type. 121 ion_charge : int 122 The ion charge. 123 adduct_atom : str, optional 124 The adduct atom, by default None. 125 """ 126 127 def get_formulas(nominal_overlay: float = 0.1): 128 """ 129 Get the list of formulas based on the nominal overlay. 130 131 Parameters 132 ---------- 133 nominal_overlay : float, optional 134 The nominal overlay, by default 0.1. 135 136 Returns 137 ------- 138 list 139 The list of formulas. 140 """ 141 nominal_mz = ms_peak.nominal_mz_exp 142 143 defect_mass = ms_peak.mz_exp - nominal_mz 144 nominal_masses = [nominal_mz] 145 146 if (defect_mass) >= 1 - nominal_overlay: 147 nominal_masses.append(nominal_mz + 1) 148 elif (defect_mass) <= nominal_overlay: 149 nominal_masses.append(nominal_mz - 1) 150 151 list_formulas_candidates = [] 152 153 for nominal_mass in nominal_masses: 154 if nominal_mass in query.keys(): 155 list_formulas_candidates.extend(query.get(nominal_mass)) 156 157 return list_formulas_candidates 158 159 all_assigned_indexes = list() 160 161 # molecular_search_settings = self.mass_spectrum_obj.molecular_search_settings 162 163 search_molfrom = SearchMolecularFormulaWorker( 164 find_isotopologues=self.find_isotopologues 165 ) 166 167 for ms_peak in mspeaks: 168 # already assigned a molecular formula 169 if self.first_hit: 170 if ms_peak.is_assigned: 171 continue 172 173 ms_peak_indexes = search_molfrom.find_formulas( 174 get_formulas(), 175 min_abundance, 176 self.mass_spectrum_obj, 177 ms_peak, 178 ion_type, 179 ion_charge, 180 adduct_atom, 181 ) 182 183 all_assigned_indexes.extend(ms_peak_indexes) 184 185 # all_assigned_indexes = MolecularFormulaSearchFilters().filter_isotopologue(all_assigned_indexes, self.mass_spectrum_obj) 186 187 # all_assigned_indexes = MolecularFormulaSearchFilters().filter_kendrick(all_assigned_indexes, self.mass_spectrum_obj) 188 189 # MolecularFormulaSearchFilters().check_min_peaks(all_assigned_indexes, self.mass_spectrum_obj) 190 # filter per min peaks per mono isotopic class 191 192 def run_worker_mass_spectrum(self): 193 """Run the molecular formula search on the mass spectrum object.""" 194 self.run_molecular_formula( 195 self.mass_spectrum_obj.sort_by_abundance(), 196 print_time=self.mass_spectrum_obj.molecular_search_settings.verbose_processing 197 ) 198 199 def run_worker_ms_peaks(self, ms_peaks): 200 """Run the molecular formula search on the given list of mass spectrum peaks. 201 202 Parameters 203 ---------- 204 ms_peaks : list of MSPeak 205 The list of mass spectrum peaks. 206 """ 207 self.run_molecular_formula( 208 ms_peaks, 209 print_time=self.mass_spectrum_obj.molecular_search_settings.verbose_processing 210 ) 211 212 @staticmethod 213 def database_to_dict(classe_str_list, nominal_mzs, mf_search_settings, ion_charge): 214 """Convert the database results to a dictionary. 215 216 Parameters 217 ---------- 218 classe_str_list : list 219 The list of class strings. 220 nominal_mzs : list 221 The list of nominal m/z values. 222 mf_search_settings : MolecularFormulaSearchSettings 223 The molecular formula search settings. 224 ion_charge : int 225 The ion charge. 226 227 Returns 228 ------- 229 dict 230 The dictionary containing the database results. 231 """ 232 sql_db = MolForm_SQL(url=mf_search_settings.url_database) 233 234 dict_res = {} 235 236 if mf_search_settings.isProtonated: 237 dict_res[Labels.protonated_de_ion] = sql_db.get_dict_by_classes( 238 classe_str_list, 239 Labels.protonated_de_ion, 240 nominal_mzs, 241 ion_charge, 242 mf_search_settings, 243 ) 244 245 if mf_search_settings.isRadical: 246 dict_res[Labels.radical_ion] = sql_db.get_dict_by_classes( 247 classe_str_list, 248 Labels.radical_ion, 249 nominal_mzs, 250 ion_charge, 251 mf_search_settings, 252 ) 253 254 if mf_search_settings.isAdduct: 255 adduct_list = ( 256 mf_search_settings.adduct_atoms_neg 257 if ion_charge < 0 258 else mf_search_settings.adduct_atoms_pos 259 ) 260 dict_res[Labels.adduct_ion] = sql_db.get_dict_by_classes( 261 classe_str_list, 262 Labels.adduct_ion, 263 nominal_mzs, 264 ion_charge, 265 mf_search_settings, 266 adducts=adduct_list, 267 ) 268 269 return dict_res 270 271 @timeit(print_time=True) 272 def run_molecular_formula(self, ms_peaks, **kwargs): 273 """Run the molecular formula search on the given list of mass spectrum peaks. 274 275 Parameters 276 ---------- 277 ms_peaks : list of MSPeak 278 The list of mass spectrum peaks. 279 **kwargs 280 Additional keyword arguments. 281 Most notably, print_time, which is a boolean flag to indicate whether to print the time 282 and passed to the timeit decorator. 283 """ 284 ion_charge = self.mass_spectrum_obj.polarity 285 min_abundance = self.mass_spectrum_obj.min_abundance 286 nominal_mzs = self.mass_spectrum_obj.nominal_mz 287 288 verbose = self.mass_spectrum_obj.molecular_search_settings.verbose_processing 289 # reset average error, only relevant is average mass error method is being used 290 SearchMolecularFormulaWorker( 291 find_isotopologues=self.find_isotopologues 292 ).reset_error(self.mass_spectrum_obj) 293 294 # check database for all possible molecular formula combinations based on the setting passed to self.mass_spectrum_obj.molecular_search_settings 295 classes = MolecularCombinations(self.sql_db).runworker( 296 self.mass_spectrum_obj.molecular_search_settings, 297 print_time=self.mass_spectrum_obj.molecular_search_settings.verbose_processing 298 ) 299 300 # split the database load to not blowout the memory 301 # TODO add to the settings 302 for classe_chunk in chunks( 303 classes, self.mass_spectrum_obj.molecular_search_settings.db_chunk_size 304 ): 305 classes_str_list = [class_tuple[0] for class_tuple in classe_chunk] 306 307 # load the molecular formula objs binned by ion type and heteroatoms classes, {ion type:{classe:[list_formula]}} 308 # for adduct ion type a third key is added {atoms:{ion type:{classe:[list_formula]}}} 309 dict_res = self.database_to_dict( 310 classes_str_list, 311 nominal_mzs, 312 self.mass_spectrum_obj.molecular_search_settings, 313 ion_charge, 314 ) 315 pbar = tqdm.tqdm(classe_chunk, disable = not verbose) 316 for classe_tuple in pbar: 317 # class string is a json serialized dict 318 classe_str = classe_tuple[0] 319 classe_dict = classe_tuple[1] 320 321 if self.mass_spectrum_obj.molecular_search_settings.isProtonated: 322 ion_type = Labels.protonated_de_ion 323 if verbose: 324 pbar.set_description_str( 325 desc="Started molecular formula search for class %s, (de)protonated " 326 % classe_str, 327 refresh=True, 328 ) 329 330 candidate_formulas = dict_res.get(ion_type).get(classe_str) 331 332 if candidate_formulas: 333 self.run_search( 334 ms_peaks, 335 candidate_formulas, 336 min_abundance, 337 ion_type, 338 ion_charge, 339 ) 340 341 if self.mass_spectrum_obj.molecular_search_settings.isRadical: 342 if verbose: 343 pbar.set_description_str( 344 desc="Started molecular formula search for class %s, radical " 345 % classe_str, 346 refresh=True, 347 ) 348 349 ion_type = Labels.radical_ion 350 351 candidate_formulas = dict_res.get(ion_type).get(classe_str) 352 353 if candidate_formulas: 354 self.run_search( 355 ms_peaks, 356 candidate_formulas, 357 min_abundance, 358 ion_type, 359 ion_charge, 360 ) 361 # looks for adduct, used_atom_valences should be 0 362 # this code does not support H exchance by halogen atoms 363 if self.mass_spectrum_obj.molecular_search_settings.isAdduct: 364 if verbose: 365 pbar.set_description_str( 366 desc="Started molecular formula search for class %s, adduct " 367 % classe_str, 368 refresh=True, 369 ) 370 371 ion_type = Labels.adduct_ion 372 dict_atoms_formulas = dict_res.get(ion_type) 373 374 for adduct_atom, dict_by_class in dict_atoms_formulas.items(): 375 candidate_formulas = dict_by_class.get(classe_str) 376 377 if candidate_formulas: 378 self.run_search( 379 ms_peaks, 380 candidate_formulas, 381 min_abundance, 382 ion_type, 383 ion_charge, 384 adduct_atom=adduct_atom, 385 ) 386 self.sql_db.close() 387 388 def search_mol_formulas( 389 self, 390 possible_formulas_list: List[MolecularFormula], 391 ion_type: str, 392 neutral_molform=True, 393 find_isotopologues=True, 394 adduct_atom=None, 395 ) -> List[_MSPeak]: 396 """Search for molecular formulas in the mass spectrum. 397 398 Parameters 399 ---------- 400 possible_formulas_list : list of MolecularFormula 401 The list of possible molecular formulas. 402 ion_type : str 403 The ion type. 404 neutral_molform : bool, optional 405 Flag to indicate whether the molecular formulas are neutral, by default True. 406 find_isotopologues : bool, optional 407 Flag to indicate whether to find isotopologues, by default True. 408 adduct_atom : str, optional 409 The adduct atom, by default None. 410 411 Returns 412 ------- 413 list of MSPeak 414 The list of mass spectrum peaks with assigned molecular formulas. 415 """ 416 # neutral_molform: some reference files already present the formula on ion mode, for instance, bruker reference files 417 # if that is the case than turn neutral_molform off 418 419 SearchMolecularFormulaWorker(find_isotopologues=find_isotopologues).reset_error( 420 self.mass_spectrum_obj 421 ) 422 423 initial_min_peak_bool = ( 424 self.mass_spectrum_obj.molecular_search_settings.use_min_peaks_filter 425 ) 426 initial_runtime_kendrick_filter = ( 427 self.mass_spectrum_obj.molecular_search_settings.use_runtime_kendrick_filter 428 ) 429 430 # Are the following 3 lines redundant? 431 self.mass_spectrum_obj.molecular_search_settings.use_min_peaks_filter = False 432 self.mass_spectrum_obj.molecular_search_settings.use_min_peaks_filter = ( 433 False # TODO check this line 434 ) 435 self.mass_spectrum_obj.molecular_search_settings.use_runtime_kendrick_filter = ( 436 False 437 ) 438 439 possible_formulas_dict_nm = {} 440 441 for mf in possible_formulas_list: 442 if neutral_molform: 443 nm = int(mf.protonated_mz) 444 else: 445 nm = int(mf.mz_nominal_calc) 446 447 if nm in possible_formulas_dict_nm.keys(): 448 possible_formulas_dict_nm[nm].append(mf) 449 450 else: 451 possible_formulas_dict_nm[nm] = [mf] 452 453 min_abundance = self.mass_spectrum_obj.min_abundance 454 455 ion_type = ion_type 456 457 self.run_search( 458 self.mass_spectrum_obj, 459 possible_formulas_dict_nm, 460 min_abundance, 461 ion_type, 462 self.mass_spectrum_obj.polarity, 463 adduct_atom=adduct_atom, 464 ) 465 466 self.mass_spectrum_obj.molecular_search_settings.use_min_peaks_filter = ( 467 initial_min_peak_bool 468 ) 469 self.mass_spectrum_obj.molecular_search_settings.use_runtime_kendrick_filter = ( 470 initial_runtime_kendrick_filter 471 ) 472 473 mspeaks = [mspeak for mspeak in self.mass_spectrum_obj if mspeak.is_assigned] 474 475 self.sql_db.close() 476 477 return mspeaks 478 479 480class SearchMolecularFormulaWorker: 481 """Class for searching molecular formulas in a mass spectrum. 482 483 Parameters 484 ---------- 485 find_isotopologues : bool, optional 486 Flag to indicate whether to find isotopologues, by default True. 487 488 Attributes 489 ---------- 490 find_isotopologues : bool 491 Flag to indicate whether to find isotopologues. 492 493 Methods 494 ------- 495 * reset_error(). 496 Reset the error variables. 497 * set_last_error(). 498 Set the last error. 499 * find_formulas(). 500 Find the formulas. 501 * calc_error(). 502 Calculate the error. 503 """ 504 505 # TODO add reset error function 506 # needs this wraper to pass the class to multiprocessing 507 508 def __init__(self, find_isotopologues=True): 509 self.find_isotopologues = find_isotopologues 510 511 def __call__(self, args): 512 """Call the find formulas function. 513 514 Parameters 515 ---------- 516 args : tuple 517 The arguments. 518 519 Returns 520 ------- 521 list 522 The list of mass spectrum peaks with assigned molecular formulas. 523 """ 524 return self.find_formulas(*args) # ,args[1] 525 526 def reset_error(self, mass_spectrum_obj): 527 """Reset the error variables. 528 529 Parameters 530 ---------- 531 mass_spectrum_obj : MassSpectrum 532 The mass spectrum object. 533 534 Notes 535 ----- 536 This function resets the error variables for the given mass spectrum object. 537 """ 538 global last_error, last_dif, closest_error, error_average, nbValues 539 last_error, last_dif, closest_error, nbValues = 0.0, 0.0, 0.0, 0.0 540 541 def set_last_error(self, error, mass_spectrum_obj): 542 """Set the last error. 543 544 Parameters 545 ---------- 546 error : float 547 The error. 548 mass_spectrum_obj : MassSpectrum 549 The mass spectrum object. 550 """ 551 # set the changes to the global variables, not internal ones 552 global last_error, last_dif, closest_error, error_average, nbValues 553 554 if mass_spectrum_obj.molecular_search_settings.error_method == "distance": 555 dif = error - last_error 556 if dif < last_dif: 557 last_dif = dif 558 closest_error = error 559 mass_spectrum_obj.molecular_search_settings.min_ppm_error = ( 560 closest_error 561 - mass_spectrum_obj.molecular_search_settings.mz_error_range 562 ) 563 mass_spectrum_obj.molecular_search_settings.max_ppm_error = ( 564 closest_error 565 + mass_spectrum_obj.molecular_search_settings.mz_error_range 566 ) 567 568 elif mass_spectrum_obj.molecular_search_settings.error_method == "lowest": 569 if error < last_error: 570 mass_spectrum_obj.molecular_search_settings.min_ppm_error = ( 571 error - mass_spectrum_obj.molecular_search_settings.mz_error_range 572 ) 573 mass_spectrum_obj.molecular_search_settings.max_ppm_error = ( 574 error + mass_spectrum_obj.molecular_search_settings.mz_error_range 575 ) 576 last_error = error 577 578 elif mass_spectrum_obj.molecular_search_settings.error_method == "symmetrical": 579 mass_spectrum_obj.molecular_search_settings.min_ppm_error = ( 580 mass_spectrum_obj.molecular_search_settings.mz_error_average 581 - mass_spectrum_obj.molecular_search_settings.mz_error_range 582 ) 583 mass_spectrum_obj.molecular_search_settings.max_ppm_error = ( 584 mass_spectrum_obj.molecular_search_settings.mz_error_average 585 + mass_spectrum_obj.molecular_search_settings.mz_error_range 586 ) 587 588 elif mass_spectrum_obj.molecular_search_settings.error_method == "average": 589 nbValues += 1 590 error_average = error_average + ((error - error_average) / nbValues) 591 mass_spectrum_obj.molecular_search_settings.min_ppm_error = ( 592 error_average 593 - mass_spectrum_obj.molecular_search_settings.mz_error_range 594 ) 595 mass_spectrum_obj.molecular_search_settings.max_ppm_error = ( 596 error_average 597 + mass_spectrum_obj.molecular_search_settings.mz_error_range 598 ) 599 600 else: 601 # using set mass_spectrum_obj.molecular_search_settings.min_ppm_error and max_ppm_error range 602 pass 603 604 # returns the error based on the selected method at mass_spectrum_obj.molecular_search_settings.method 605 606 @staticmethod 607 def calc_error(mz_exp, mz_calc, method="ppm"): 608 """Calculate the error. 609 610 Parameters 611 ---------- 612 mz_exp : float 613 The experimental m/z value. 614 mz_calc : float 615 The calculated m/z value. 616 method : str, optional 617 The method, by default 'ppm'. 618 619 Raises 620 ------- 621 Exception 622 If the method is not ppm or ppb. 623 624 Returns 625 ------- 626 float 627 The error. 628 """ 629 630 if method == "ppm": 631 multi_factor = 1_000_000 632 633 elif method == "ppb": 634 multi_factor = 1_000_000_000 635 636 elif method == "perc": 637 multi_factor = 100 638 639 else: 640 raise Exception( 641 "method needs to be ppm or ppb, you have entered %s" % method 642 ) 643 644 if mz_exp: 645 return ((mz_exp - mz_calc) / mz_calc) * multi_factor 646 647 else: 648 raise Exception("Please set mz_calc first") 649 650 def find_formulas( 651 self, 652 formulas, 653 min_abundance, 654 mass_spectrum_obj, 655 ms_peak, 656 ion_type, 657 ion_charge, 658 adduct_atom=None, 659 ): 660 """Find the formulas. 661 662 Parameters 663 ---------- 664 formulas : list of MolecularFormula 665 The list of molecular formulas. 666 min_abundance : float 667 The minimum abundance threshold. 668 mass_spectrum_obj : MassSpectrum 669 The mass spectrum object. 670 ms_peak : MSPeak 671 The mass spectrum peak. 672 ion_type : str 673 The ion type. 674 ion_charge : int 675 The ion charge. 676 adduct_atom : str, optional 677 The adduct atom, by default None. 678 679 Returns 680 ------- 681 list of MSPeak 682 The list of mass spectrum peaks with assigned molecular formulas. 683 684 Notes 685 ----- 686 Uses the closest error the next search (this is not ideal, it needs to use confidence 687 metric to choose the right candidate then propagate the error using the error from the best candidate). 688 It needs to add s/n to the equation. 689 It need optimization to define the mz_error_range within a m/z unit since it is directly proportional 690 with the mass, and inversely proportional to the rp. It's not linear, i.e., sigma mass. 691 The idea it to correlate sigma to resolving power, signal to noise and sample complexity per mz unit. 692 Method='distance' 693 """ 694 mspeak_assigned_index = list() 695 696 min_ppm_error = mass_spectrum_obj.molecular_search_settings.min_ppm_error 697 max_ppm_error = mass_spectrum_obj.molecular_search_settings.max_ppm_error 698 699 min_abun_error = mass_spectrum_obj.molecular_search_settings.min_abun_error 700 max_abun_error = mass_spectrum_obj.molecular_search_settings.max_abun_error 701 702 # f = open("abundance_error.txt", "a+") 703 ms_peak_mz_exp, ms_peak_abundance = ms_peak.mz_exp, ms_peak.abundance 704 # min_error = min([pmf.mz_error for pmf in possible_formulas]) 705 706 def mass_by_ion_type(possible_formula_obj): 707 if ion_type == Labels.protonated_de_ion: 708 return possible_formula_obj._protonated_mz(ion_charge) 709 710 elif ion_type == Labels.radical_ion: 711 return possible_formula_obj._radical_mz(ion_charge) 712 713 elif ion_type == Labels.adduct_ion and adduct_atom: 714 return possible_formula._adduct_mz(ion_charge, adduct_atom) 715 716 else: 717 # will return externally calculated mz if is set, #use on Bruker Reference list import 718 # if the ion type is known the ion mass based on molecular formula ion type 719 # if ion type is unknow will return neutral mass 720 return possible_formula.mz_calc 721 722 if formulas: 723 if isinstance(formulas[0], LCMSLibRefMolecularFormula): 724 possible_mf_class = True 725 726 else: 727 possible_mf_class = False 728 729 for possible_formula in formulas: 730 if possible_formula: 731 error = self.calc_error( 732 ms_peak_mz_exp, mass_by_ion_type(possible_formula) 733 ) 734 735 # error = possible_formula.mz_error 736 737 if min_ppm_error <= error <= max_ppm_error: 738 # update the error 739 740 self.set_last_error(error, mass_spectrum_obj) 741 742 # add molecular formula match to ms_peak 743 744 # get molecular formula dict from sql obj 745 # formula_dict = pickle.loads(possible_formula.mol_formula) 746 # if possible_mf_class: 747 748 # molecular_formula = deepcopy(possible_formula) 749 750 # else: 751 752 formula_dict = possible_formula.to_dict() 753 # create the molecular formula obj to be stored 754 if possible_mf_class: 755 molecular_formula = LCMSLibRefMolecularFormula( 756 formula_dict, 757 ion_charge, 758 ion_type=ion_type, 759 adduct_atom=adduct_atom, 760 ) 761 762 molecular_formula.name = possible_formula.name 763 molecular_formula.kegg_id = possible_formula.kegg_id 764 molecular_formula.cas = possible_formula.cas 765 766 else: 767 molecular_formula = MolecularFormula( 768 formula_dict, 769 ion_charge, 770 ion_type=ion_type, 771 adduct_atom=adduct_atom, 772 ) 773 # add the molecular formula obj to the mspeak obj 774 # add the mspeak obj and it's index for tracking next assignment step 775 776 if self.find_isotopologues: 777 # calculates isotopologues 778 isotopologues = molecular_formula.isotopologues( 779 min_abundance, 780 ms_peak_abundance, 781 mass_spectrum_obj.dynamic_range, 782 ) 783 784 # search for isotopologues 785 for isotopologue_formula in isotopologues: 786 molecular_formula.expected_isotopologues.append( 787 isotopologue_formula 788 ) 789 # move this outside to improve preformace 790 # we need to increase the search space to -+1 m_z 791 first_index, last_index = ( 792 mass_spectrum_obj.get_nominal_mz_first_last_indexes( 793 isotopologue_formula.mz_nominal_calc 794 ) 795 ) 796 797 for ms_peak_iso in mass_spectrum_obj[ 798 first_index:last_index 799 ]: 800 error = self.calc_error( 801 ms_peak_iso.mz_exp, isotopologue_formula.mz_calc 802 ) 803 804 if min_ppm_error <= error <= max_ppm_error: 805 # need to define error distribution for abundance measurements 806 807 # if mass_spectrum_obj.is_centroid: 808 809 abundance_error = self.calc_error( 810 isotopologue_formula.abundance_calc, 811 ms_peak_iso.abundance, 812 method="perc", 813 ) 814 815 # area_error = self.calc_error(ms_peak.area, ms_peak_iso.area, method='perc') 816 817 # margin of error was set empirically/ needs statistical calculation 818 # of margin of error for the measurement of the abundances 819 if ( 820 min_abun_error 821 <= abundance_error 822 <= max_abun_error 823 ): 824 # update the error 825 826 self.set_last_error(error, mass_spectrum_obj) 827 828 # isotopologue_formula.mz_error = error 829 830 # isotopologue_formula.area_error = area_error 831 832 # isotopologue_formula.abundance_error = abundance_error 833 834 isotopologue_formula.mspeak_index_mono_isotopic = ms_peak.index 835 836 mono_isotopic_formula_index = len(ms_peak) 837 838 isotopologue_formula.mspeak_index_mono_isotopic = ms_peak.index 839 840 isotopologue_formula.mono_isotopic_formula_index = mono_isotopic_formula_index 841 842 # add mspeaks isotopologue index to the mono isotopic MolecularFormula obj and the respective formula position 843 844 # add molecular formula match to ms_peak 845 x = ms_peak_iso.add_molecular_formula( 846 isotopologue_formula 847 ) 848 849 molecular_formula.mspeak_mf_isotopologues_indexes.append( 850 (ms_peak_iso.index, x) 851 ) 852 # add mspeaks mono isotopic index to the isotopologue MolecularFormula obj 853 854 y = ms_peak.add_molecular_formula(molecular_formula) 855 856 mspeak_assigned_index.append((ms_peak.index, y)) 857 858 return mspeak_assigned_index 859 860 861class SearchMolecularFormulasLC: 862 """Class for searching molecular formulas in a LC object. 863 864 Parameters 865 ---------- 866 lcms_obj : LCMSBase 867 The LCMSBase object. 868 sql_db : MolForm_SQL, optional 869 The SQL database object, by default None. 870 first_hit : bool, optional 871 Flag to indicate whether to skip peaks that already have a molecular formula assigned, by default False. 872 find_isotopologues : bool, optional 873 Flag to indicate whether to find isotopologues, by default True. 874 875 Methods 876 ------- 877 878 * search_spectra_against_candidates(). 879 Search a list of mass spectra against a list of candidate formulas with a given ion type and charge. 880 * bulk_run_molecular_formula_search(). 881 Run the molecular formula search on the given list of mass spectra. 882 Pulls the settings from the LCMSBase object to set ion type and charge to search for. 883 * run_mass_feature_search(). 884 Run the molecular formula search on mass features. 885 Calls bulk_run_molecular_formula_search() with specified mass spectra and mass peaks. 886 * run_untargeted_worker_ms1(). 887 Run untargeted molecular formula search on the ms1 mass spectrum. 888 DEPRECATED: use run_mass_feature_search() or bulk_run_molecular_formula_search() instead. 889 * run_target_worker_ms1(). 890 Run targeted molecular formula search on the ms1 mass spectrum. 891 DEPRECATED: use run_mass_feature_search() or bulk_run_molecular_formula_search() instead. 892 """ 893 894 def __init__(self, lcms_obj, sql_db=None, first_hit=False, find_isotopologues=True): 895 self.first_hit = first_hit 896 897 self.find_isotopologues = find_isotopologues 898 899 self.lcms_obj = lcms_obj 900 901 if not sql_db: 902 self.sql_db = MolForm_SQL( 903 url=self.lcms_obj.parameters.mass_spectrum['ms1'].molecular_search.url_database 904 ) 905 906 else: 907 self.sql_db = sql_db 908 909 def search_spectra_against_candidates(self, mass_spectrum_list, ms_peaks_list, candidate_formulas, ion_type, ion_charge): 910 """Search a list of mass spectra against a list of candidate formulas with a given ion type and charge. 911 912 Parameters 913 ---------- 914 mass_spectrum_list : list of MassSpectrum 915 The list of mass spectra to perform the search on. 916 ms_peaks_list : list of lists of MSPeak objects 917 The list of mass spectrum peaks to search within each mass spectrum. 918 candidate_formulas : dict 919 The candidate formulas. 920 ion_type : str 921 The ion type. 922 ion_charge : int 923 The ion charge, either 1 or -1. 924 925 Notes 926 ----- 927 This function is designed to be used with the bulk_run_molecular_formula_search function. 928 """ 929 for mass_spectrum, ms_peaks in zip(mass_spectrum_list, ms_peaks_list): 930 single_ms_search = SearchMolecularFormulas( 931 mass_spectrum, 932 sql_db=self.sql_db, 933 first_hit=self.first_hit, 934 find_isotopologues=self.find_isotopologues, 935 ) 936 single_ms_search.run_search( 937 ms_peaks, 938 candidate_formulas, 939 mass_spectrum.min_abundance, 940 ion_type, 941 ion_charge, 942 ) 943 944 def bulk_run_molecular_formula_search(self, mass_spectrum_list, ms_peaks_list, mass_spectrum_setting_key='ms1'): 945 """Run the molecular formula search on the given list of mass spectra 946 947 Parameters 948 ---------- 949 mass_spectrum_list : list of MassSpectrum 950 The list of mass spectra to search. 951 ms_peaks_list : list of lists of MSPeak objects 952 The mass peaks to perform molecular formula search within each mass spectrum 953 mass_spectrum_setting_key : str, optional 954 The mass spectrum setting key, by default 'ms1'. 955 This is used to get the appropriate molecular search settings from the LCMSBase object 956 """ 957 # Set min_abundance and nominal_mzs 958 if self.lcms_obj.polarity == "positive": 959 ion_charge = 1 960 elif self.lcms_obj.polarity == "negative": 961 ion_charge = -1 962 else: 963 raise ValueError("Polarity must be either 'positive' or 'negative'") 964 965 # Check that the length of the mass spectrum list and the ms_peaks list are the same 966 if len(mass_spectrum_list) != len(ms_peaks_list): 967 raise ValueError("The length of the mass spectrum list and the ms_peaks list must be the same") 968 969 nominal_mzs = [x.nominal_mz for x in mass_spectrum_list] 970 nominal_mzs = list(set([item for sublist in nominal_mzs for item in sublist])) 971 verbose = self.lcms_obj.parameters.mass_spectrum[mass_spectrum_setting_key].molecular_search.verbose_processing 972 973 # reset average error, only relevant if average mass error method is being used 974 SearchMolecularFormulaWorker( 975 find_isotopologues=self.find_isotopologues 976 ).reset_error(mass_spectrum_list[0]) 977 978 # check database for all possible molecular formula combinations based on the setting passed to self.mass_spectrum_obj.molecular_search_settings 979 classes = MolecularCombinations(self.sql_db).runworker( 980 self.lcms_obj.parameters.mass_spectrum[mass_spectrum_setting_key].molecular_search, 981 print_time=self.lcms_obj.parameters.mass_spectrum[mass_spectrum_setting_key].molecular_search.verbose_processing 982 ) 983 984 # split the database load to not blowout the memory 985 for classe_chunk in chunks( 986 classes, self.lcms_obj.parameters.mass_spectrum[mass_spectrum_setting_key].molecular_search.db_chunk_size 987 ): 988 classes_str_list = [class_tuple[0] for class_tuple in classe_chunk] 989 990 # load the molecular formula objs binned by ion type and heteroatoms classes, {ion type:{classe:[list_formula]}} 991 # for adduct ion type a third key is added {atoms:{ion type:{classe:[list_formula]}}} 992 dict_res = SearchMolecularFormulas.database_to_dict( 993 classes_str_list, 994 nominal_mzs, 995 self.lcms_obj.parameters.mass_spectrum[mass_spectrum_setting_key].molecular_search, 996 ion_charge, 997 ) 998 999 pbar = tqdm.tqdm(classe_chunk, disable = not verbose) 1000 for classe_tuple in pbar: 1001 # class string is a json serialized dict 1002 classe_str = classe_tuple[0] 1003 1004 # Perform search for (de)protonated ion type 1005 if self.lcms_obj.parameters.mass_spectrum[mass_spectrum_setting_key].molecular_search.isProtonated: 1006 ion_type = Labels.protonated_de_ion 1007 1008 pbar.set_description_str( 1009 desc="Started molecular formula search for class %s, (de)protonated " 1010 % classe_str, 1011 refresh=True, 1012 ) 1013 1014 candidate_formulas = dict_res.get(ion_type).get(classe_str) 1015 1016 if candidate_formulas: 1017 self.search_spectra_against_candidates( 1018 mass_spectrum_list=mass_spectrum_list, 1019 ms_peaks_list=ms_peaks_list, 1020 candidate_formulas=candidate_formulas, 1021 ion_type=ion_type, 1022 ion_charge=ion_charge 1023 ) 1024 1025 1026 # Perform search for radical ion type 1027 if self.lcms_obj.parameters.mass_spectrum[mass_spectrum_setting_key].molecular_search.isRadical: 1028 pbar.set_description_str( 1029 desc="Started molecular formula search for class %s, radical " 1030 % classe_str, 1031 refresh=True, 1032 ) 1033 1034 ion_type = Labels.radical_ion 1035 1036 candidate_formulas = dict_res.get(ion_type).get(classe_str) 1037 1038 if candidate_formulas: 1039 self.search_spectra_against_candidates( 1040 mass_spectrum_list=mass_spectrum_list, 1041 ms_peaks_list=ms_peaks_list, 1042 candidate_formulas=candidate_formulas, 1043 ion_type=ion_type, 1044 ion_charge=ion_charge 1045 ) 1046 1047 # Perform search for adduct ion type 1048 # looks for adduct, used_atom_valences should be 0 1049 # this code does not support H exchance by halogen atoms 1050 if self.lcms_obj.parameters.mass_spectrum[mass_spectrum_setting_key].molecular_search.isAdduct: 1051 pbar.set_description_str( 1052 desc="Started molecular formula search for class %s, adduct " 1053 % classe_str, 1054 refresh=True, 1055 ) 1056 1057 ion_type = Labels.adduct_ion 1058 dict_atoms_formulas = dict_res.get(ion_type) 1059 1060 for adduct_atom, dict_by_class in dict_atoms_formulas.items(): 1061 candidate_formulas = dict_by_class.get(classe_str) 1062 1063 if candidate_formulas: 1064 self.search_spectra_against_candidates( 1065 mass_spectrum_list=mass_spectrum_list, 1066 ms_peaks_list=ms_peaks_list, 1067 candidate_formulas=candidate_formulas, 1068 ion_type=ion_type, 1069 ion_charge=ion_charge 1070 ) 1071 self.sql_db.close() 1072 1073 def run_mass_feature_search(self): 1074 """Run the molecular formula search on the mass features. 1075 1076 Calls bulk_run_molecular_formula_search() with specified mass spectra and mass peaks. 1077 """ 1078 mass_features_df = self.lcms_obj.mass_features_to_df() 1079 1080 # Get the list of mass spectrum (and peaks to search with each mass spectrum) for all mass features 1081 scan_list = mass_features_df.apex_scan.unique() 1082 mass_spectrum_list = [self.lcms_obj._ms[x] for x in scan_list] 1083 ms_peaks = [] 1084 for scan in scan_list: 1085 mf_df_scan = mass_features_df[mass_features_df.apex_scan == scan] 1086 peaks_to_search = [ 1087 self.lcms_obj.mass_features[x].ms1_peak for x in mf_df_scan.index.tolist() 1088 ] 1089 ms_peaks.append(peaks_to_search) 1090 1091 # Run the molecular formula search 1092 self.bulk_run_molecular_formula_search(mass_spectrum_list, ms_peaks) 1093 1094 def run_untargeted_worker_ms1(self): 1095 """Run untargeted molecular formula search on the ms1 mass spectrum.""" 1096 raise NotImplementedError("run_untargeted_worker_ms1 search is not implemented in CoreMS 3.0 and greater") 1097 1098 def run_target_worker_ms1(self): 1099 """Run targeted molecular formula search on the ms1 mass spectrum.""" 1100 raise NotImplementedError("run_target_worker_ms1 formula search is not yet implemented in CoreMS 3.0 and greater")
27class SearchMolecularFormulas: 28 """Class for searching molecular formulas in a mass spectrum. 29 30 Parameters 31 ---------- 32 mass_spectrum_obj : MassSpectrum 33 The mass spectrum object. 34 sql_db : MolForm_SQL, optional 35 The SQL database object, by default None. 36 first_hit : bool, optional 37 Flag to indicate whether to skip peaks that already have a molecular formula assigned, by default False. 38 find_isotopologues : bool, optional 39 Flag to indicate whether to find isotopologues, by default True. 40 41 Attributes 42 ---------- 43 mass_spectrum_obj : MassSpectrum 44 The mass spectrum object. 45 sql_db : MolForm_SQL 46 The SQL database object. 47 first_hit : bool 48 Flag to indicate whether to skip peaks that already have a molecular formula assigned. 49 find_isotopologues : bool 50 Flag to indicate whether to find isotopologues. 51 52 53 Methods 54 ------- 55 * run_search(). 56 Run the molecular formula search. 57 * run_worker_mass_spectrum(). 58 Run the molecular formula search on the mass spectrum object. 59 * run_worker_ms_peaks(). 60 Run the molecular formula search on the given list of mass spectrum peaks. 61 * database_to_dict(). 62 Convert the database results to a dictionary. 63 * run_molecular_formula(). 64 Run the molecular formula search on the given list of mass spectrum peaks. 65 * search_mol_formulas(). 66 Search for molecular formulas in the mass spectrum. 67 68 """ 69 70 def __init__( 71 self, 72 mass_spectrum_obj, 73 sql_db=None, 74 first_hit: bool = False, 75 find_isotopologues: bool = True, 76 ): 77 self.first_hit = first_hit 78 79 self.find_isotopologues = find_isotopologues 80 81 self.mass_spectrum_obj = mass_spectrum_obj 82 83 if not sql_db: 84 self.sql_db = MolForm_SQL( 85 url=mass_spectrum_obj.molecular_search_settings.url_database 86 ) 87 88 else: 89 self.sql_db = sql_db 90 91 def __enter__(self): 92 """Open the SQL database connection.""" 93 return self 94 95 def __exit__(self, exc_type, exc_val, exc_tb): 96 """Close the SQL database connection.""" 97 self.sql_db.close() 98 99 return False 100 101 def run_search( 102 self, 103 mspeaks: list, 104 query: dict, 105 min_abundance: float, 106 ion_type: str, 107 ion_charge: int, 108 adduct_atom=None, 109 ): 110 """Run the molecular formula search. 111 112 Parameters 113 ---------- 114 mspeaks : list of MSPeak 115 The list of mass spectrum peaks. 116 query : dict 117 The query dictionary containing the possible molecular formulas. 118 min_abundance : float 119 The minimum abundance threshold. 120 ion_type : str 121 The ion type. 122 ion_charge : int 123 The ion charge. 124 adduct_atom : str, optional 125 The adduct atom, by default None. 126 """ 127 128 def get_formulas(nominal_overlay: float = 0.1): 129 """ 130 Get the list of formulas based on the nominal overlay. 131 132 Parameters 133 ---------- 134 nominal_overlay : float, optional 135 The nominal overlay, by default 0.1. 136 137 Returns 138 ------- 139 list 140 The list of formulas. 141 """ 142 nominal_mz = ms_peak.nominal_mz_exp 143 144 defect_mass = ms_peak.mz_exp - nominal_mz 145 nominal_masses = [nominal_mz] 146 147 if (defect_mass) >= 1 - nominal_overlay: 148 nominal_masses.append(nominal_mz + 1) 149 elif (defect_mass) <= nominal_overlay: 150 nominal_masses.append(nominal_mz - 1) 151 152 list_formulas_candidates = [] 153 154 for nominal_mass in nominal_masses: 155 if nominal_mass in query.keys(): 156 list_formulas_candidates.extend(query.get(nominal_mass)) 157 158 return list_formulas_candidates 159 160 all_assigned_indexes = list() 161 162 # molecular_search_settings = self.mass_spectrum_obj.molecular_search_settings 163 164 search_molfrom = SearchMolecularFormulaWorker( 165 find_isotopologues=self.find_isotopologues 166 ) 167 168 for ms_peak in mspeaks: 169 # already assigned a molecular formula 170 if self.first_hit: 171 if ms_peak.is_assigned: 172 continue 173 174 ms_peak_indexes = search_molfrom.find_formulas( 175 get_formulas(), 176 min_abundance, 177 self.mass_spectrum_obj, 178 ms_peak, 179 ion_type, 180 ion_charge, 181 adduct_atom, 182 ) 183 184 all_assigned_indexes.extend(ms_peak_indexes) 185 186 # all_assigned_indexes = MolecularFormulaSearchFilters().filter_isotopologue(all_assigned_indexes, self.mass_spectrum_obj) 187 188 # all_assigned_indexes = MolecularFormulaSearchFilters().filter_kendrick(all_assigned_indexes, self.mass_spectrum_obj) 189 190 # MolecularFormulaSearchFilters().check_min_peaks(all_assigned_indexes, self.mass_spectrum_obj) 191 # filter per min peaks per mono isotopic class 192 193 def run_worker_mass_spectrum(self): 194 """Run the molecular formula search on the mass spectrum object.""" 195 self.run_molecular_formula( 196 self.mass_spectrum_obj.sort_by_abundance(), 197 print_time=self.mass_spectrum_obj.molecular_search_settings.verbose_processing 198 ) 199 200 def run_worker_ms_peaks(self, ms_peaks): 201 """Run the molecular formula search on the given list of mass spectrum peaks. 202 203 Parameters 204 ---------- 205 ms_peaks : list of MSPeak 206 The list of mass spectrum peaks. 207 """ 208 self.run_molecular_formula( 209 ms_peaks, 210 print_time=self.mass_spectrum_obj.molecular_search_settings.verbose_processing 211 ) 212 213 @staticmethod 214 def database_to_dict(classe_str_list, nominal_mzs, mf_search_settings, ion_charge): 215 """Convert the database results to a dictionary. 216 217 Parameters 218 ---------- 219 classe_str_list : list 220 The list of class strings. 221 nominal_mzs : list 222 The list of nominal m/z values. 223 mf_search_settings : MolecularFormulaSearchSettings 224 The molecular formula search settings. 225 ion_charge : int 226 The ion charge. 227 228 Returns 229 ------- 230 dict 231 The dictionary containing the database results. 232 """ 233 sql_db = MolForm_SQL(url=mf_search_settings.url_database) 234 235 dict_res = {} 236 237 if mf_search_settings.isProtonated: 238 dict_res[Labels.protonated_de_ion] = sql_db.get_dict_by_classes( 239 classe_str_list, 240 Labels.protonated_de_ion, 241 nominal_mzs, 242 ion_charge, 243 mf_search_settings, 244 ) 245 246 if mf_search_settings.isRadical: 247 dict_res[Labels.radical_ion] = sql_db.get_dict_by_classes( 248 classe_str_list, 249 Labels.radical_ion, 250 nominal_mzs, 251 ion_charge, 252 mf_search_settings, 253 ) 254 255 if mf_search_settings.isAdduct: 256 adduct_list = ( 257 mf_search_settings.adduct_atoms_neg 258 if ion_charge < 0 259 else mf_search_settings.adduct_atoms_pos 260 ) 261 dict_res[Labels.adduct_ion] = sql_db.get_dict_by_classes( 262 classe_str_list, 263 Labels.adduct_ion, 264 nominal_mzs, 265 ion_charge, 266 mf_search_settings, 267 adducts=adduct_list, 268 ) 269 270 return dict_res 271 272 @timeit(print_time=True) 273 def run_molecular_formula(self, ms_peaks, **kwargs): 274 """Run the molecular formula search on the given list of mass spectrum peaks. 275 276 Parameters 277 ---------- 278 ms_peaks : list of MSPeak 279 The list of mass spectrum peaks. 280 **kwargs 281 Additional keyword arguments. 282 Most notably, print_time, which is a boolean flag to indicate whether to print the time 283 and passed to the timeit decorator. 284 """ 285 ion_charge = self.mass_spectrum_obj.polarity 286 min_abundance = self.mass_spectrum_obj.min_abundance 287 nominal_mzs = self.mass_spectrum_obj.nominal_mz 288 289 verbose = self.mass_spectrum_obj.molecular_search_settings.verbose_processing 290 # reset average error, only relevant is average mass error method is being used 291 SearchMolecularFormulaWorker( 292 find_isotopologues=self.find_isotopologues 293 ).reset_error(self.mass_spectrum_obj) 294 295 # check database for all possible molecular formula combinations based on the setting passed to self.mass_spectrum_obj.molecular_search_settings 296 classes = MolecularCombinations(self.sql_db).runworker( 297 self.mass_spectrum_obj.molecular_search_settings, 298 print_time=self.mass_spectrum_obj.molecular_search_settings.verbose_processing 299 ) 300 301 # split the database load to not blowout the memory 302 # TODO add to the settings 303 for classe_chunk in chunks( 304 classes, self.mass_spectrum_obj.molecular_search_settings.db_chunk_size 305 ): 306 classes_str_list = [class_tuple[0] for class_tuple in classe_chunk] 307 308 # load the molecular formula objs binned by ion type and heteroatoms classes, {ion type:{classe:[list_formula]}} 309 # for adduct ion type a third key is added {atoms:{ion type:{classe:[list_formula]}}} 310 dict_res = self.database_to_dict( 311 classes_str_list, 312 nominal_mzs, 313 self.mass_spectrum_obj.molecular_search_settings, 314 ion_charge, 315 ) 316 pbar = tqdm.tqdm(classe_chunk, disable = not verbose) 317 for classe_tuple in pbar: 318 # class string is a json serialized dict 319 classe_str = classe_tuple[0] 320 classe_dict = classe_tuple[1] 321 322 if self.mass_spectrum_obj.molecular_search_settings.isProtonated: 323 ion_type = Labels.protonated_de_ion 324 if verbose: 325 pbar.set_description_str( 326 desc="Started molecular formula search for class %s, (de)protonated " 327 % classe_str, 328 refresh=True, 329 ) 330 331 candidate_formulas = dict_res.get(ion_type).get(classe_str) 332 333 if candidate_formulas: 334 self.run_search( 335 ms_peaks, 336 candidate_formulas, 337 min_abundance, 338 ion_type, 339 ion_charge, 340 ) 341 342 if self.mass_spectrum_obj.molecular_search_settings.isRadical: 343 if verbose: 344 pbar.set_description_str( 345 desc="Started molecular formula search for class %s, radical " 346 % classe_str, 347 refresh=True, 348 ) 349 350 ion_type = Labels.radical_ion 351 352 candidate_formulas = dict_res.get(ion_type).get(classe_str) 353 354 if candidate_formulas: 355 self.run_search( 356 ms_peaks, 357 candidate_formulas, 358 min_abundance, 359 ion_type, 360 ion_charge, 361 ) 362 # looks for adduct, used_atom_valences should be 0 363 # this code does not support H exchance by halogen atoms 364 if self.mass_spectrum_obj.molecular_search_settings.isAdduct: 365 if verbose: 366 pbar.set_description_str( 367 desc="Started molecular formula search for class %s, adduct " 368 % classe_str, 369 refresh=True, 370 ) 371 372 ion_type = Labels.adduct_ion 373 dict_atoms_formulas = dict_res.get(ion_type) 374 375 for adduct_atom, dict_by_class in dict_atoms_formulas.items(): 376 candidate_formulas = dict_by_class.get(classe_str) 377 378 if candidate_formulas: 379 self.run_search( 380 ms_peaks, 381 candidate_formulas, 382 min_abundance, 383 ion_type, 384 ion_charge, 385 adduct_atom=adduct_atom, 386 ) 387 self.sql_db.close() 388 389 def search_mol_formulas( 390 self, 391 possible_formulas_list: List[MolecularFormula], 392 ion_type: str, 393 neutral_molform=True, 394 find_isotopologues=True, 395 adduct_atom=None, 396 ) -> List[_MSPeak]: 397 """Search for molecular formulas in the mass spectrum. 398 399 Parameters 400 ---------- 401 possible_formulas_list : list of MolecularFormula 402 The list of possible molecular formulas. 403 ion_type : str 404 The ion type. 405 neutral_molform : bool, optional 406 Flag to indicate whether the molecular formulas are neutral, by default True. 407 find_isotopologues : bool, optional 408 Flag to indicate whether to find isotopologues, by default True. 409 adduct_atom : str, optional 410 The adduct atom, by default None. 411 412 Returns 413 ------- 414 list of MSPeak 415 The list of mass spectrum peaks with assigned molecular formulas. 416 """ 417 # neutral_molform: some reference files already present the formula on ion mode, for instance, bruker reference files 418 # if that is the case than turn neutral_molform off 419 420 SearchMolecularFormulaWorker(find_isotopologues=find_isotopologues).reset_error( 421 self.mass_spectrum_obj 422 ) 423 424 initial_min_peak_bool = ( 425 self.mass_spectrum_obj.molecular_search_settings.use_min_peaks_filter 426 ) 427 initial_runtime_kendrick_filter = ( 428 self.mass_spectrum_obj.molecular_search_settings.use_runtime_kendrick_filter 429 ) 430 431 # Are the following 3 lines redundant? 432 self.mass_spectrum_obj.molecular_search_settings.use_min_peaks_filter = False 433 self.mass_spectrum_obj.molecular_search_settings.use_min_peaks_filter = ( 434 False # TODO check this line 435 ) 436 self.mass_spectrum_obj.molecular_search_settings.use_runtime_kendrick_filter = ( 437 False 438 ) 439 440 possible_formulas_dict_nm = {} 441 442 for mf in possible_formulas_list: 443 if neutral_molform: 444 nm = int(mf.protonated_mz) 445 else: 446 nm = int(mf.mz_nominal_calc) 447 448 if nm in possible_formulas_dict_nm.keys(): 449 possible_formulas_dict_nm[nm].append(mf) 450 451 else: 452 possible_formulas_dict_nm[nm] = [mf] 453 454 min_abundance = self.mass_spectrum_obj.min_abundance 455 456 ion_type = ion_type 457 458 self.run_search( 459 self.mass_spectrum_obj, 460 possible_formulas_dict_nm, 461 min_abundance, 462 ion_type, 463 self.mass_spectrum_obj.polarity, 464 adduct_atom=adduct_atom, 465 ) 466 467 self.mass_spectrum_obj.molecular_search_settings.use_min_peaks_filter = ( 468 initial_min_peak_bool 469 ) 470 self.mass_spectrum_obj.molecular_search_settings.use_runtime_kendrick_filter = ( 471 initial_runtime_kendrick_filter 472 ) 473 474 mspeaks = [mspeak for mspeak in self.mass_spectrum_obj if mspeak.is_assigned] 475 476 self.sql_db.close() 477 478 return mspeaks
Class for searching molecular formulas in a mass spectrum.
Parameters
- mass_spectrum_obj (MassSpectrum): The mass spectrum object.
- sql_db (MolForm_SQL, optional): The SQL database object, by default None.
- first_hit (bool, optional): Flag to indicate whether to skip peaks that already have a molecular formula assigned, by default False.
- find_isotopologues (bool, optional): Flag to indicate whether to find isotopologues, by default True.
Attributes
- mass_spectrum_obj (MassSpectrum): The mass spectrum object.
- sql_db (MolForm_SQL): The SQL database object.
- first_hit (bool): Flag to indicate whether to skip peaks that already have a molecular formula assigned.
- find_isotopologues (bool): Flag to indicate whether to find isotopologues.
Methods
- run_search(). Run the molecular formula search.
- run_worker_mass_spectrum(). Run the molecular formula search on the mass spectrum object.
- run_worker_ms_peaks(). Run the molecular formula search on the given list of mass spectrum peaks.
- database_to_dict(). Convert the database results to a dictionary.
- run_molecular_formula(). Run the molecular formula search on the given list of mass spectrum peaks.
- search_mol_formulas(). Search for molecular formulas in the mass spectrum.
70 def __init__( 71 self, 72 mass_spectrum_obj, 73 sql_db=None, 74 first_hit: bool = False, 75 find_isotopologues: bool = True, 76 ): 77 self.first_hit = first_hit 78 79 self.find_isotopologues = find_isotopologues 80 81 self.mass_spectrum_obj = mass_spectrum_obj 82 83 if not sql_db: 84 self.sql_db = MolForm_SQL( 85 url=mass_spectrum_obj.molecular_search_settings.url_database 86 ) 87 88 else: 89 self.sql_db = sql_db
101 def run_search( 102 self, 103 mspeaks: list, 104 query: dict, 105 min_abundance: float, 106 ion_type: str, 107 ion_charge: int, 108 adduct_atom=None, 109 ): 110 """Run the molecular formula search. 111 112 Parameters 113 ---------- 114 mspeaks : list of MSPeak 115 The list of mass spectrum peaks. 116 query : dict 117 The query dictionary containing the possible molecular formulas. 118 min_abundance : float 119 The minimum abundance threshold. 120 ion_type : str 121 The ion type. 122 ion_charge : int 123 The ion charge. 124 adduct_atom : str, optional 125 The adduct atom, by default None. 126 """ 127 128 def get_formulas(nominal_overlay: float = 0.1): 129 """ 130 Get the list of formulas based on the nominal overlay. 131 132 Parameters 133 ---------- 134 nominal_overlay : float, optional 135 The nominal overlay, by default 0.1. 136 137 Returns 138 ------- 139 list 140 The list of formulas. 141 """ 142 nominal_mz = ms_peak.nominal_mz_exp 143 144 defect_mass = ms_peak.mz_exp - nominal_mz 145 nominal_masses = [nominal_mz] 146 147 if (defect_mass) >= 1 - nominal_overlay: 148 nominal_masses.append(nominal_mz + 1) 149 elif (defect_mass) <= nominal_overlay: 150 nominal_masses.append(nominal_mz - 1) 151 152 list_formulas_candidates = [] 153 154 for nominal_mass in nominal_masses: 155 if nominal_mass in query.keys(): 156 list_formulas_candidates.extend(query.get(nominal_mass)) 157 158 return list_formulas_candidates 159 160 all_assigned_indexes = list() 161 162 # molecular_search_settings = self.mass_spectrum_obj.molecular_search_settings 163 164 search_molfrom = SearchMolecularFormulaWorker( 165 find_isotopologues=self.find_isotopologues 166 ) 167 168 for ms_peak in mspeaks: 169 # already assigned a molecular formula 170 if self.first_hit: 171 if ms_peak.is_assigned: 172 continue 173 174 ms_peak_indexes = search_molfrom.find_formulas( 175 get_formulas(), 176 min_abundance, 177 self.mass_spectrum_obj, 178 ms_peak, 179 ion_type, 180 ion_charge, 181 adduct_atom, 182 ) 183 184 all_assigned_indexes.extend(ms_peak_indexes) 185 186 # all_assigned_indexes = MolecularFormulaSearchFilters().filter_isotopologue(all_assigned_indexes, self.mass_spectrum_obj) 187 188 # all_assigned_indexes = MolecularFormulaSearchFilters().filter_kendrick(all_assigned_indexes, self.mass_spectrum_obj) 189 190 # MolecularFormulaSearchFilters().check_min_peaks(all_assigned_indexes, self.mass_spectrum_obj) 191 # filter per min peaks per mono isotopic class
Run the molecular formula search.
Parameters
- mspeaks (list of MSPeak): The list of mass spectrum peaks.
- query (dict): The query dictionary containing the possible molecular formulas.
- min_abundance (float): The minimum abundance threshold.
- ion_type (str): The ion type.
- ion_charge (int): The ion charge.
- adduct_atom (str, optional): The adduct atom, by default None.
193 def run_worker_mass_spectrum(self): 194 """Run the molecular formula search on the mass spectrum object.""" 195 self.run_molecular_formula( 196 self.mass_spectrum_obj.sort_by_abundance(), 197 print_time=self.mass_spectrum_obj.molecular_search_settings.verbose_processing 198 )
Run the molecular formula search on the mass spectrum object.
200 def run_worker_ms_peaks(self, ms_peaks): 201 """Run the molecular formula search on the given list of mass spectrum peaks. 202 203 Parameters 204 ---------- 205 ms_peaks : list of MSPeak 206 The list of mass spectrum peaks. 207 """ 208 self.run_molecular_formula( 209 ms_peaks, 210 print_time=self.mass_spectrum_obj.molecular_search_settings.verbose_processing 211 )
Run the molecular formula search on the given list of mass spectrum peaks.
Parameters
- ms_peaks (list of MSPeak): The list of mass spectrum peaks.
213 @staticmethod 214 def database_to_dict(classe_str_list, nominal_mzs, mf_search_settings, ion_charge): 215 """Convert the database results to a dictionary. 216 217 Parameters 218 ---------- 219 classe_str_list : list 220 The list of class strings. 221 nominal_mzs : list 222 The list of nominal m/z values. 223 mf_search_settings : MolecularFormulaSearchSettings 224 The molecular formula search settings. 225 ion_charge : int 226 The ion charge. 227 228 Returns 229 ------- 230 dict 231 The dictionary containing the database results. 232 """ 233 sql_db = MolForm_SQL(url=mf_search_settings.url_database) 234 235 dict_res = {} 236 237 if mf_search_settings.isProtonated: 238 dict_res[Labels.protonated_de_ion] = sql_db.get_dict_by_classes( 239 classe_str_list, 240 Labels.protonated_de_ion, 241 nominal_mzs, 242 ion_charge, 243 mf_search_settings, 244 ) 245 246 if mf_search_settings.isRadical: 247 dict_res[Labels.radical_ion] = sql_db.get_dict_by_classes( 248 classe_str_list, 249 Labels.radical_ion, 250 nominal_mzs, 251 ion_charge, 252 mf_search_settings, 253 ) 254 255 if mf_search_settings.isAdduct: 256 adduct_list = ( 257 mf_search_settings.adduct_atoms_neg 258 if ion_charge < 0 259 else mf_search_settings.adduct_atoms_pos 260 ) 261 dict_res[Labels.adduct_ion] = sql_db.get_dict_by_classes( 262 classe_str_list, 263 Labels.adduct_ion, 264 nominal_mzs, 265 ion_charge, 266 mf_search_settings, 267 adducts=adduct_list, 268 ) 269 270 return dict_res
Convert the database results to a dictionary.
Parameters
- classe_str_list (list): The list of class strings.
- nominal_mzs (list): The list of nominal m/z values.
- mf_search_settings (MolecularFormulaSearchSettings): The molecular formula search settings.
- ion_charge (int): The ion charge.
Returns
- dict: The dictionary containing the database results.
27 def timed(*args, **kw): 28 # Extract print_time from kwargs if provided 29 local_print_time = kw.pop('print_time', print_time) 30 ts = time.time() 31 result = method(*args, **kw) 32 te = time.time() 33 if "log_time" in kw: 34 name = kw.get("log_name", method.__name__.upper()) 35 kw["log_time"][name] = int((te - ts) * 1000) 36 elif local_print_time: 37 print("%r %2.2f ms" % (method.__name__, (te - ts) * 1000)) 38 return result
Run the molecular formula search on the given list of mass spectrum peaks.
Parameters
- ms_peaks (list of MSPeak): The list of mass spectrum peaks.
- **kwargs: Additional keyword arguments. Most notably, print_time, which is a boolean flag to indicate whether to print the time and passed to the timeit decorator.
389 def search_mol_formulas( 390 self, 391 possible_formulas_list: List[MolecularFormula], 392 ion_type: str, 393 neutral_molform=True, 394 find_isotopologues=True, 395 adduct_atom=None, 396 ) -> List[_MSPeak]: 397 """Search for molecular formulas in the mass spectrum. 398 399 Parameters 400 ---------- 401 possible_formulas_list : list of MolecularFormula 402 The list of possible molecular formulas. 403 ion_type : str 404 The ion type. 405 neutral_molform : bool, optional 406 Flag to indicate whether the molecular formulas are neutral, by default True. 407 find_isotopologues : bool, optional 408 Flag to indicate whether to find isotopologues, by default True. 409 adduct_atom : str, optional 410 The adduct atom, by default None. 411 412 Returns 413 ------- 414 list of MSPeak 415 The list of mass spectrum peaks with assigned molecular formulas. 416 """ 417 # neutral_molform: some reference files already present the formula on ion mode, for instance, bruker reference files 418 # if that is the case than turn neutral_molform off 419 420 SearchMolecularFormulaWorker(find_isotopologues=find_isotopologues).reset_error( 421 self.mass_spectrum_obj 422 ) 423 424 initial_min_peak_bool = ( 425 self.mass_spectrum_obj.molecular_search_settings.use_min_peaks_filter 426 ) 427 initial_runtime_kendrick_filter = ( 428 self.mass_spectrum_obj.molecular_search_settings.use_runtime_kendrick_filter 429 ) 430 431 # Are the following 3 lines redundant? 432 self.mass_spectrum_obj.molecular_search_settings.use_min_peaks_filter = False 433 self.mass_spectrum_obj.molecular_search_settings.use_min_peaks_filter = ( 434 False # TODO check this line 435 ) 436 self.mass_spectrum_obj.molecular_search_settings.use_runtime_kendrick_filter = ( 437 False 438 ) 439 440 possible_formulas_dict_nm = {} 441 442 for mf in possible_formulas_list: 443 if neutral_molform: 444 nm = int(mf.protonated_mz) 445 else: 446 nm = int(mf.mz_nominal_calc) 447 448 if nm in possible_formulas_dict_nm.keys(): 449 possible_formulas_dict_nm[nm].append(mf) 450 451 else: 452 possible_formulas_dict_nm[nm] = [mf] 453 454 min_abundance = self.mass_spectrum_obj.min_abundance 455 456 ion_type = ion_type 457 458 self.run_search( 459 self.mass_spectrum_obj, 460 possible_formulas_dict_nm, 461 min_abundance, 462 ion_type, 463 self.mass_spectrum_obj.polarity, 464 adduct_atom=adduct_atom, 465 ) 466 467 self.mass_spectrum_obj.molecular_search_settings.use_min_peaks_filter = ( 468 initial_min_peak_bool 469 ) 470 self.mass_spectrum_obj.molecular_search_settings.use_runtime_kendrick_filter = ( 471 initial_runtime_kendrick_filter 472 ) 473 474 mspeaks = [mspeak for mspeak in self.mass_spectrum_obj if mspeak.is_assigned] 475 476 self.sql_db.close() 477 478 return mspeaks
Search for molecular formulas in the mass spectrum.
Parameters
- possible_formulas_list (list of MolecularFormula): The list of possible molecular formulas.
- ion_type (str): The ion type.
- neutral_molform (bool, optional): Flag to indicate whether the molecular formulas are neutral, by default True.
- find_isotopologues (bool, optional): Flag to indicate whether to find isotopologues, by default True.
- adduct_atom (str, optional): The adduct atom, by default None.
Returns
- list of MSPeak: The list of mass spectrum peaks with assigned molecular formulas.
481class SearchMolecularFormulaWorker: 482 """Class for searching molecular formulas in a mass spectrum. 483 484 Parameters 485 ---------- 486 find_isotopologues : bool, optional 487 Flag to indicate whether to find isotopologues, by default True. 488 489 Attributes 490 ---------- 491 find_isotopologues : bool 492 Flag to indicate whether to find isotopologues. 493 494 Methods 495 ------- 496 * reset_error(). 497 Reset the error variables. 498 * set_last_error(). 499 Set the last error. 500 * find_formulas(). 501 Find the formulas. 502 * calc_error(). 503 Calculate the error. 504 """ 505 506 # TODO add reset error function 507 # needs this wraper to pass the class to multiprocessing 508 509 def __init__(self, find_isotopologues=True): 510 self.find_isotopologues = find_isotopologues 511 512 def __call__(self, args): 513 """Call the find formulas function. 514 515 Parameters 516 ---------- 517 args : tuple 518 The arguments. 519 520 Returns 521 ------- 522 list 523 The list of mass spectrum peaks with assigned molecular formulas. 524 """ 525 return self.find_formulas(*args) # ,args[1] 526 527 def reset_error(self, mass_spectrum_obj): 528 """Reset the error variables. 529 530 Parameters 531 ---------- 532 mass_spectrum_obj : MassSpectrum 533 The mass spectrum object. 534 535 Notes 536 ----- 537 This function resets the error variables for the given mass spectrum object. 538 """ 539 global last_error, last_dif, closest_error, error_average, nbValues 540 last_error, last_dif, closest_error, nbValues = 0.0, 0.0, 0.0, 0.0 541 542 def set_last_error(self, error, mass_spectrum_obj): 543 """Set the last error. 544 545 Parameters 546 ---------- 547 error : float 548 The error. 549 mass_spectrum_obj : MassSpectrum 550 The mass spectrum object. 551 """ 552 # set the changes to the global variables, not internal ones 553 global last_error, last_dif, closest_error, error_average, nbValues 554 555 if mass_spectrum_obj.molecular_search_settings.error_method == "distance": 556 dif = error - last_error 557 if dif < last_dif: 558 last_dif = dif 559 closest_error = error 560 mass_spectrum_obj.molecular_search_settings.min_ppm_error = ( 561 closest_error 562 - mass_spectrum_obj.molecular_search_settings.mz_error_range 563 ) 564 mass_spectrum_obj.molecular_search_settings.max_ppm_error = ( 565 closest_error 566 + mass_spectrum_obj.molecular_search_settings.mz_error_range 567 ) 568 569 elif mass_spectrum_obj.molecular_search_settings.error_method == "lowest": 570 if error < last_error: 571 mass_spectrum_obj.molecular_search_settings.min_ppm_error = ( 572 error - mass_spectrum_obj.molecular_search_settings.mz_error_range 573 ) 574 mass_spectrum_obj.molecular_search_settings.max_ppm_error = ( 575 error + mass_spectrum_obj.molecular_search_settings.mz_error_range 576 ) 577 last_error = error 578 579 elif mass_spectrum_obj.molecular_search_settings.error_method == "symmetrical": 580 mass_spectrum_obj.molecular_search_settings.min_ppm_error = ( 581 mass_spectrum_obj.molecular_search_settings.mz_error_average 582 - mass_spectrum_obj.molecular_search_settings.mz_error_range 583 ) 584 mass_spectrum_obj.molecular_search_settings.max_ppm_error = ( 585 mass_spectrum_obj.molecular_search_settings.mz_error_average 586 + mass_spectrum_obj.molecular_search_settings.mz_error_range 587 ) 588 589 elif mass_spectrum_obj.molecular_search_settings.error_method == "average": 590 nbValues += 1 591 error_average = error_average + ((error - error_average) / nbValues) 592 mass_spectrum_obj.molecular_search_settings.min_ppm_error = ( 593 error_average 594 - mass_spectrum_obj.molecular_search_settings.mz_error_range 595 ) 596 mass_spectrum_obj.molecular_search_settings.max_ppm_error = ( 597 error_average 598 + mass_spectrum_obj.molecular_search_settings.mz_error_range 599 ) 600 601 else: 602 # using set mass_spectrum_obj.molecular_search_settings.min_ppm_error and max_ppm_error range 603 pass 604 605 # returns the error based on the selected method at mass_spectrum_obj.molecular_search_settings.method 606 607 @staticmethod 608 def calc_error(mz_exp, mz_calc, method="ppm"): 609 """Calculate the error. 610 611 Parameters 612 ---------- 613 mz_exp : float 614 The experimental m/z value. 615 mz_calc : float 616 The calculated m/z value. 617 method : str, optional 618 The method, by default 'ppm'. 619 620 Raises 621 ------- 622 Exception 623 If the method is not ppm or ppb. 624 625 Returns 626 ------- 627 float 628 The error. 629 """ 630 631 if method == "ppm": 632 multi_factor = 1_000_000 633 634 elif method == "ppb": 635 multi_factor = 1_000_000_000 636 637 elif method == "perc": 638 multi_factor = 100 639 640 else: 641 raise Exception( 642 "method needs to be ppm or ppb, you have entered %s" % method 643 ) 644 645 if mz_exp: 646 return ((mz_exp - mz_calc) / mz_calc) * multi_factor 647 648 else: 649 raise Exception("Please set mz_calc first") 650 651 def find_formulas( 652 self, 653 formulas, 654 min_abundance, 655 mass_spectrum_obj, 656 ms_peak, 657 ion_type, 658 ion_charge, 659 adduct_atom=None, 660 ): 661 """Find the formulas. 662 663 Parameters 664 ---------- 665 formulas : list of MolecularFormula 666 The list of molecular formulas. 667 min_abundance : float 668 The minimum abundance threshold. 669 mass_spectrum_obj : MassSpectrum 670 The mass spectrum object. 671 ms_peak : MSPeak 672 The mass spectrum peak. 673 ion_type : str 674 The ion type. 675 ion_charge : int 676 The ion charge. 677 adduct_atom : str, optional 678 The adduct atom, by default None. 679 680 Returns 681 ------- 682 list of MSPeak 683 The list of mass spectrum peaks with assigned molecular formulas. 684 685 Notes 686 ----- 687 Uses the closest error the next search (this is not ideal, it needs to use confidence 688 metric to choose the right candidate then propagate the error using the error from the best candidate). 689 It needs to add s/n to the equation. 690 It need optimization to define the mz_error_range within a m/z unit since it is directly proportional 691 with the mass, and inversely proportional to the rp. It's not linear, i.e., sigma mass. 692 The idea it to correlate sigma to resolving power, signal to noise and sample complexity per mz unit. 693 Method='distance' 694 """ 695 mspeak_assigned_index = list() 696 697 min_ppm_error = mass_spectrum_obj.molecular_search_settings.min_ppm_error 698 max_ppm_error = mass_spectrum_obj.molecular_search_settings.max_ppm_error 699 700 min_abun_error = mass_spectrum_obj.molecular_search_settings.min_abun_error 701 max_abun_error = mass_spectrum_obj.molecular_search_settings.max_abun_error 702 703 # f = open("abundance_error.txt", "a+") 704 ms_peak_mz_exp, ms_peak_abundance = ms_peak.mz_exp, ms_peak.abundance 705 # min_error = min([pmf.mz_error for pmf in possible_formulas]) 706 707 def mass_by_ion_type(possible_formula_obj): 708 if ion_type == Labels.protonated_de_ion: 709 return possible_formula_obj._protonated_mz(ion_charge) 710 711 elif ion_type == Labels.radical_ion: 712 return possible_formula_obj._radical_mz(ion_charge) 713 714 elif ion_type == Labels.adduct_ion and adduct_atom: 715 return possible_formula._adduct_mz(ion_charge, adduct_atom) 716 717 else: 718 # will return externally calculated mz if is set, #use on Bruker Reference list import 719 # if the ion type is known the ion mass based on molecular formula ion type 720 # if ion type is unknow will return neutral mass 721 return possible_formula.mz_calc 722 723 if formulas: 724 if isinstance(formulas[0], LCMSLibRefMolecularFormula): 725 possible_mf_class = True 726 727 else: 728 possible_mf_class = False 729 730 for possible_formula in formulas: 731 if possible_formula: 732 error = self.calc_error( 733 ms_peak_mz_exp, mass_by_ion_type(possible_formula) 734 ) 735 736 # error = possible_formula.mz_error 737 738 if min_ppm_error <= error <= max_ppm_error: 739 # update the error 740 741 self.set_last_error(error, mass_spectrum_obj) 742 743 # add molecular formula match to ms_peak 744 745 # get molecular formula dict from sql obj 746 # formula_dict = pickle.loads(possible_formula.mol_formula) 747 # if possible_mf_class: 748 749 # molecular_formula = deepcopy(possible_formula) 750 751 # else: 752 753 formula_dict = possible_formula.to_dict() 754 # create the molecular formula obj to be stored 755 if possible_mf_class: 756 molecular_formula = LCMSLibRefMolecularFormula( 757 formula_dict, 758 ion_charge, 759 ion_type=ion_type, 760 adduct_atom=adduct_atom, 761 ) 762 763 molecular_formula.name = possible_formula.name 764 molecular_formula.kegg_id = possible_formula.kegg_id 765 molecular_formula.cas = possible_formula.cas 766 767 else: 768 molecular_formula = MolecularFormula( 769 formula_dict, 770 ion_charge, 771 ion_type=ion_type, 772 adduct_atom=adduct_atom, 773 ) 774 # add the molecular formula obj to the mspeak obj 775 # add the mspeak obj and it's index for tracking next assignment step 776 777 if self.find_isotopologues: 778 # calculates isotopologues 779 isotopologues = molecular_formula.isotopologues( 780 min_abundance, 781 ms_peak_abundance, 782 mass_spectrum_obj.dynamic_range, 783 ) 784 785 # search for isotopologues 786 for isotopologue_formula in isotopologues: 787 molecular_formula.expected_isotopologues.append( 788 isotopologue_formula 789 ) 790 # move this outside to improve preformace 791 # we need to increase the search space to -+1 m_z 792 first_index, last_index = ( 793 mass_spectrum_obj.get_nominal_mz_first_last_indexes( 794 isotopologue_formula.mz_nominal_calc 795 ) 796 ) 797 798 for ms_peak_iso in mass_spectrum_obj[ 799 first_index:last_index 800 ]: 801 error = self.calc_error( 802 ms_peak_iso.mz_exp, isotopologue_formula.mz_calc 803 ) 804 805 if min_ppm_error <= error <= max_ppm_error: 806 # need to define error distribution for abundance measurements 807 808 # if mass_spectrum_obj.is_centroid: 809 810 abundance_error = self.calc_error( 811 isotopologue_formula.abundance_calc, 812 ms_peak_iso.abundance, 813 method="perc", 814 ) 815 816 # area_error = self.calc_error(ms_peak.area, ms_peak_iso.area, method='perc') 817 818 # margin of error was set empirically/ needs statistical calculation 819 # of margin of error for the measurement of the abundances 820 if ( 821 min_abun_error 822 <= abundance_error 823 <= max_abun_error 824 ): 825 # update the error 826 827 self.set_last_error(error, mass_spectrum_obj) 828 829 # isotopologue_formula.mz_error = error 830 831 # isotopologue_formula.area_error = area_error 832 833 # isotopologue_formula.abundance_error = abundance_error 834 835 isotopologue_formula.mspeak_index_mono_isotopic = ms_peak.index 836 837 mono_isotopic_formula_index = len(ms_peak) 838 839 isotopologue_formula.mspeak_index_mono_isotopic = ms_peak.index 840 841 isotopologue_formula.mono_isotopic_formula_index = mono_isotopic_formula_index 842 843 # add mspeaks isotopologue index to the mono isotopic MolecularFormula obj and the respective formula position 844 845 # add molecular formula match to ms_peak 846 x = ms_peak_iso.add_molecular_formula( 847 isotopologue_formula 848 ) 849 850 molecular_formula.mspeak_mf_isotopologues_indexes.append( 851 (ms_peak_iso.index, x) 852 ) 853 # add mspeaks mono isotopic index to the isotopologue MolecularFormula obj 854 855 y = ms_peak.add_molecular_formula(molecular_formula) 856 857 mspeak_assigned_index.append((ms_peak.index, y)) 858 859 return mspeak_assigned_index
Class for searching molecular formulas in a mass spectrum.
Parameters
- find_isotopologues (bool, optional): Flag to indicate whether to find isotopologues, by default True.
Attributes
- find_isotopologues (bool): Flag to indicate whether to find isotopologues.
Methods
- reset_error(). Reset the error variables.
- set_last_error(). Set the last error.
- find_formulas(). Find the formulas.
- calc_error(). Calculate the error.
527 def reset_error(self, mass_spectrum_obj): 528 """Reset the error variables. 529 530 Parameters 531 ---------- 532 mass_spectrum_obj : MassSpectrum 533 The mass spectrum object. 534 535 Notes 536 ----- 537 This function resets the error variables for the given mass spectrum object. 538 """ 539 global last_error, last_dif, closest_error, error_average, nbValues 540 last_error, last_dif, closest_error, nbValues = 0.0, 0.0, 0.0, 0.0
Reset the error variables.
Parameters
- mass_spectrum_obj (MassSpectrum): The mass spectrum object.
Notes
This function resets the error variables for the given mass spectrum object.
542 def set_last_error(self, error, mass_spectrum_obj): 543 """Set the last error. 544 545 Parameters 546 ---------- 547 error : float 548 The error. 549 mass_spectrum_obj : MassSpectrum 550 The mass spectrum object. 551 """ 552 # set the changes to the global variables, not internal ones 553 global last_error, last_dif, closest_error, error_average, nbValues 554 555 if mass_spectrum_obj.molecular_search_settings.error_method == "distance": 556 dif = error - last_error 557 if dif < last_dif: 558 last_dif = dif 559 closest_error = error 560 mass_spectrum_obj.molecular_search_settings.min_ppm_error = ( 561 closest_error 562 - mass_spectrum_obj.molecular_search_settings.mz_error_range 563 ) 564 mass_spectrum_obj.molecular_search_settings.max_ppm_error = ( 565 closest_error 566 + mass_spectrum_obj.molecular_search_settings.mz_error_range 567 ) 568 569 elif mass_spectrum_obj.molecular_search_settings.error_method == "lowest": 570 if error < last_error: 571 mass_spectrum_obj.molecular_search_settings.min_ppm_error = ( 572 error - mass_spectrum_obj.molecular_search_settings.mz_error_range 573 ) 574 mass_spectrum_obj.molecular_search_settings.max_ppm_error = ( 575 error + mass_spectrum_obj.molecular_search_settings.mz_error_range 576 ) 577 last_error = error 578 579 elif mass_spectrum_obj.molecular_search_settings.error_method == "symmetrical": 580 mass_spectrum_obj.molecular_search_settings.min_ppm_error = ( 581 mass_spectrum_obj.molecular_search_settings.mz_error_average 582 - mass_spectrum_obj.molecular_search_settings.mz_error_range 583 ) 584 mass_spectrum_obj.molecular_search_settings.max_ppm_error = ( 585 mass_spectrum_obj.molecular_search_settings.mz_error_average 586 + mass_spectrum_obj.molecular_search_settings.mz_error_range 587 ) 588 589 elif mass_spectrum_obj.molecular_search_settings.error_method == "average": 590 nbValues += 1 591 error_average = error_average + ((error - error_average) / nbValues) 592 mass_spectrum_obj.molecular_search_settings.min_ppm_error = ( 593 error_average 594 - mass_spectrum_obj.molecular_search_settings.mz_error_range 595 ) 596 mass_spectrum_obj.molecular_search_settings.max_ppm_error = ( 597 error_average 598 + mass_spectrum_obj.molecular_search_settings.mz_error_range 599 ) 600 601 else: 602 # using set mass_spectrum_obj.molecular_search_settings.min_ppm_error and max_ppm_error range 603 pass 604 605 # returns the error based on the selected method at mass_spectrum_obj.molecular_search_settings.method
Set the last error.
Parameters
- error (float): The error.
- mass_spectrum_obj (MassSpectrum): The mass spectrum object.
607 @staticmethod 608 def calc_error(mz_exp, mz_calc, method="ppm"): 609 """Calculate the error. 610 611 Parameters 612 ---------- 613 mz_exp : float 614 The experimental m/z value. 615 mz_calc : float 616 The calculated m/z value. 617 method : str, optional 618 The method, by default 'ppm'. 619 620 Raises 621 ------- 622 Exception 623 If the method is not ppm or ppb. 624 625 Returns 626 ------- 627 float 628 The error. 629 """ 630 631 if method == "ppm": 632 multi_factor = 1_000_000 633 634 elif method == "ppb": 635 multi_factor = 1_000_000_000 636 637 elif method == "perc": 638 multi_factor = 100 639 640 else: 641 raise Exception( 642 "method needs to be ppm or ppb, you have entered %s" % method 643 ) 644 645 if mz_exp: 646 return ((mz_exp - mz_calc) / mz_calc) * multi_factor 647 648 else: 649 raise Exception("Please set mz_calc first")
Calculate the error.
Parameters
- mz_exp (float): The experimental m/z value.
- mz_calc (float): The calculated m/z value.
- method (str, optional): The method, by default 'ppm'.
Raises
- Exception: If the method is not ppm or ppb.
Returns
- float: The error.
651 def find_formulas( 652 self, 653 formulas, 654 min_abundance, 655 mass_spectrum_obj, 656 ms_peak, 657 ion_type, 658 ion_charge, 659 adduct_atom=None, 660 ): 661 """Find the formulas. 662 663 Parameters 664 ---------- 665 formulas : list of MolecularFormula 666 The list of molecular formulas. 667 min_abundance : float 668 The minimum abundance threshold. 669 mass_spectrum_obj : MassSpectrum 670 The mass spectrum object. 671 ms_peak : MSPeak 672 The mass spectrum peak. 673 ion_type : str 674 The ion type. 675 ion_charge : int 676 The ion charge. 677 adduct_atom : str, optional 678 The adduct atom, by default None. 679 680 Returns 681 ------- 682 list of MSPeak 683 The list of mass spectrum peaks with assigned molecular formulas. 684 685 Notes 686 ----- 687 Uses the closest error the next search (this is not ideal, it needs to use confidence 688 metric to choose the right candidate then propagate the error using the error from the best candidate). 689 It needs to add s/n to the equation. 690 It need optimization to define the mz_error_range within a m/z unit since it is directly proportional 691 with the mass, and inversely proportional to the rp. It's not linear, i.e., sigma mass. 692 The idea it to correlate sigma to resolving power, signal to noise and sample complexity per mz unit. 693 Method='distance' 694 """ 695 mspeak_assigned_index = list() 696 697 min_ppm_error = mass_spectrum_obj.molecular_search_settings.min_ppm_error 698 max_ppm_error = mass_spectrum_obj.molecular_search_settings.max_ppm_error 699 700 min_abun_error = mass_spectrum_obj.molecular_search_settings.min_abun_error 701 max_abun_error = mass_spectrum_obj.molecular_search_settings.max_abun_error 702 703 # f = open("abundance_error.txt", "a+") 704 ms_peak_mz_exp, ms_peak_abundance = ms_peak.mz_exp, ms_peak.abundance 705 # min_error = min([pmf.mz_error for pmf in possible_formulas]) 706 707 def mass_by_ion_type(possible_formula_obj): 708 if ion_type == Labels.protonated_de_ion: 709 return possible_formula_obj._protonated_mz(ion_charge) 710 711 elif ion_type == Labels.radical_ion: 712 return possible_formula_obj._radical_mz(ion_charge) 713 714 elif ion_type == Labels.adduct_ion and adduct_atom: 715 return possible_formula._adduct_mz(ion_charge, adduct_atom) 716 717 else: 718 # will return externally calculated mz if is set, #use on Bruker Reference list import 719 # if the ion type is known the ion mass based on molecular formula ion type 720 # if ion type is unknow will return neutral mass 721 return possible_formula.mz_calc 722 723 if formulas: 724 if isinstance(formulas[0], LCMSLibRefMolecularFormula): 725 possible_mf_class = True 726 727 else: 728 possible_mf_class = False 729 730 for possible_formula in formulas: 731 if possible_formula: 732 error = self.calc_error( 733 ms_peak_mz_exp, mass_by_ion_type(possible_formula) 734 ) 735 736 # error = possible_formula.mz_error 737 738 if min_ppm_error <= error <= max_ppm_error: 739 # update the error 740 741 self.set_last_error(error, mass_spectrum_obj) 742 743 # add molecular formula match to ms_peak 744 745 # get molecular formula dict from sql obj 746 # formula_dict = pickle.loads(possible_formula.mol_formula) 747 # if possible_mf_class: 748 749 # molecular_formula = deepcopy(possible_formula) 750 751 # else: 752 753 formula_dict = possible_formula.to_dict() 754 # create the molecular formula obj to be stored 755 if possible_mf_class: 756 molecular_formula = LCMSLibRefMolecularFormula( 757 formula_dict, 758 ion_charge, 759 ion_type=ion_type, 760 adduct_atom=adduct_atom, 761 ) 762 763 molecular_formula.name = possible_formula.name 764 molecular_formula.kegg_id = possible_formula.kegg_id 765 molecular_formula.cas = possible_formula.cas 766 767 else: 768 molecular_formula = MolecularFormula( 769 formula_dict, 770 ion_charge, 771 ion_type=ion_type, 772 adduct_atom=adduct_atom, 773 ) 774 # add the molecular formula obj to the mspeak obj 775 # add the mspeak obj and it's index for tracking next assignment step 776 777 if self.find_isotopologues: 778 # calculates isotopologues 779 isotopologues = molecular_formula.isotopologues( 780 min_abundance, 781 ms_peak_abundance, 782 mass_spectrum_obj.dynamic_range, 783 ) 784 785 # search for isotopologues 786 for isotopologue_formula in isotopologues: 787 molecular_formula.expected_isotopologues.append( 788 isotopologue_formula 789 ) 790 # move this outside to improve preformace 791 # we need to increase the search space to -+1 m_z 792 first_index, last_index = ( 793 mass_spectrum_obj.get_nominal_mz_first_last_indexes( 794 isotopologue_formula.mz_nominal_calc 795 ) 796 ) 797 798 for ms_peak_iso in mass_spectrum_obj[ 799 first_index:last_index 800 ]: 801 error = self.calc_error( 802 ms_peak_iso.mz_exp, isotopologue_formula.mz_calc 803 ) 804 805 if min_ppm_error <= error <= max_ppm_error: 806 # need to define error distribution for abundance measurements 807 808 # if mass_spectrum_obj.is_centroid: 809 810 abundance_error = self.calc_error( 811 isotopologue_formula.abundance_calc, 812 ms_peak_iso.abundance, 813 method="perc", 814 ) 815 816 # area_error = self.calc_error(ms_peak.area, ms_peak_iso.area, method='perc') 817 818 # margin of error was set empirically/ needs statistical calculation 819 # of margin of error for the measurement of the abundances 820 if ( 821 min_abun_error 822 <= abundance_error 823 <= max_abun_error 824 ): 825 # update the error 826 827 self.set_last_error(error, mass_spectrum_obj) 828 829 # isotopologue_formula.mz_error = error 830 831 # isotopologue_formula.area_error = area_error 832 833 # isotopologue_formula.abundance_error = abundance_error 834 835 isotopologue_formula.mspeak_index_mono_isotopic = ms_peak.index 836 837 mono_isotopic_formula_index = len(ms_peak) 838 839 isotopologue_formula.mspeak_index_mono_isotopic = ms_peak.index 840 841 isotopologue_formula.mono_isotopic_formula_index = mono_isotopic_formula_index 842 843 # add mspeaks isotopologue index to the mono isotopic MolecularFormula obj and the respective formula position 844 845 # add molecular formula match to ms_peak 846 x = ms_peak_iso.add_molecular_formula( 847 isotopologue_formula 848 ) 849 850 molecular_formula.mspeak_mf_isotopologues_indexes.append( 851 (ms_peak_iso.index, x) 852 ) 853 # add mspeaks mono isotopic index to the isotopologue MolecularFormula obj 854 855 y = ms_peak.add_molecular_formula(molecular_formula) 856 857 mspeak_assigned_index.append((ms_peak.index, y)) 858 859 return mspeak_assigned_index
Find the formulas.
Parameters
- formulas (list of MolecularFormula): The list of molecular formulas.
- min_abundance (float): The minimum abundance threshold.
- mass_spectrum_obj (MassSpectrum): The mass spectrum object.
- ms_peak (MSPeak): The mass spectrum peak.
- ion_type (str): The ion type.
- ion_charge (int): The ion charge.
- adduct_atom (str, optional): The adduct atom, by default None.
Returns
- list of MSPeak: The list of mass spectrum peaks with assigned molecular formulas.
Notes
Uses the closest error the next search (this is not ideal, it needs to use confidence metric to choose the right candidate then propagate the error using the error from the best candidate). It needs to add s/n to the equation. It need optimization to define the mz_error_range within a m/z unit since it is directly proportional with the mass, and inversely proportional to the rp. It's not linear, i.e., sigma mass. The idea it to correlate sigma to resolving power, signal to noise and sample complexity per mz unit. Method='distance'
862class SearchMolecularFormulasLC: 863 """Class for searching molecular formulas in a LC object. 864 865 Parameters 866 ---------- 867 lcms_obj : LCMSBase 868 The LCMSBase object. 869 sql_db : MolForm_SQL, optional 870 The SQL database object, by default None. 871 first_hit : bool, optional 872 Flag to indicate whether to skip peaks that already have a molecular formula assigned, by default False. 873 find_isotopologues : bool, optional 874 Flag to indicate whether to find isotopologues, by default True. 875 876 Methods 877 ------- 878 879 * search_spectra_against_candidates(). 880 Search a list of mass spectra against a list of candidate formulas with a given ion type and charge. 881 * bulk_run_molecular_formula_search(). 882 Run the molecular formula search on the given list of mass spectra. 883 Pulls the settings from the LCMSBase object to set ion type and charge to search for. 884 * run_mass_feature_search(). 885 Run the molecular formula search on mass features. 886 Calls bulk_run_molecular_formula_search() with specified mass spectra and mass peaks. 887 * run_untargeted_worker_ms1(). 888 Run untargeted molecular formula search on the ms1 mass spectrum. 889 DEPRECATED: use run_mass_feature_search() or bulk_run_molecular_formula_search() instead. 890 * run_target_worker_ms1(). 891 Run targeted molecular formula search on the ms1 mass spectrum. 892 DEPRECATED: use run_mass_feature_search() or bulk_run_molecular_formula_search() instead. 893 """ 894 895 def __init__(self, lcms_obj, sql_db=None, first_hit=False, find_isotopologues=True): 896 self.first_hit = first_hit 897 898 self.find_isotopologues = find_isotopologues 899 900 self.lcms_obj = lcms_obj 901 902 if not sql_db: 903 self.sql_db = MolForm_SQL( 904 url=self.lcms_obj.parameters.mass_spectrum['ms1'].molecular_search.url_database 905 ) 906 907 else: 908 self.sql_db = sql_db 909 910 def search_spectra_against_candidates(self, mass_spectrum_list, ms_peaks_list, candidate_formulas, ion_type, ion_charge): 911 """Search a list of mass spectra against a list of candidate formulas with a given ion type and charge. 912 913 Parameters 914 ---------- 915 mass_spectrum_list : list of MassSpectrum 916 The list of mass spectra to perform the search on. 917 ms_peaks_list : list of lists of MSPeak objects 918 The list of mass spectrum peaks to search within each mass spectrum. 919 candidate_formulas : dict 920 The candidate formulas. 921 ion_type : str 922 The ion type. 923 ion_charge : int 924 The ion charge, either 1 or -1. 925 926 Notes 927 ----- 928 This function is designed to be used with the bulk_run_molecular_formula_search function. 929 """ 930 for mass_spectrum, ms_peaks in zip(mass_spectrum_list, ms_peaks_list): 931 single_ms_search = SearchMolecularFormulas( 932 mass_spectrum, 933 sql_db=self.sql_db, 934 first_hit=self.first_hit, 935 find_isotopologues=self.find_isotopologues, 936 ) 937 single_ms_search.run_search( 938 ms_peaks, 939 candidate_formulas, 940 mass_spectrum.min_abundance, 941 ion_type, 942 ion_charge, 943 ) 944 945 def bulk_run_molecular_formula_search(self, mass_spectrum_list, ms_peaks_list, mass_spectrum_setting_key='ms1'): 946 """Run the molecular formula search on the given list of mass spectra 947 948 Parameters 949 ---------- 950 mass_spectrum_list : list of MassSpectrum 951 The list of mass spectra to search. 952 ms_peaks_list : list of lists of MSPeak objects 953 The mass peaks to perform molecular formula search within each mass spectrum 954 mass_spectrum_setting_key : str, optional 955 The mass spectrum setting key, by default 'ms1'. 956 This is used to get the appropriate molecular search settings from the LCMSBase object 957 """ 958 # Set min_abundance and nominal_mzs 959 if self.lcms_obj.polarity == "positive": 960 ion_charge = 1 961 elif self.lcms_obj.polarity == "negative": 962 ion_charge = -1 963 else: 964 raise ValueError("Polarity must be either 'positive' or 'negative'") 965 966 # Check that the length of the mass spectrum list and the ms_peaks list are the same 967 if len(mass_spectrum_list) != len(ms_peaks_list): 968 raise ValueError("The length of the mass spectrum list and the ms_peaks list must be the same") 969 970 nominal_mzs = [x.nominal_mz for x in mass_spectrum_list] 971 nominal_mzs = list(set([item for sublist in nominal_mzs for item in sublist])) 972 verbose = self.lcms_obj.parameters.mass_spectrum[mass_spectrum_setting_key].molecular_search.verbose_processing 973 974 # reset average error, only relevant if average mass error method is being used 975 SearchMolecularFormulaWorker( 976 find_isotopologues=self.find_isotopologues 977 ).reset_error(mass_spectrum_list[0]) 978 979 # check database for all possible molecular formula combinations based on the setting passed to self.mass_spectrum_obj.molecular_search_settings 980 classes = MolecularCombinations(self.sql_db).runworker( 981 self.lcms_obj.parameters.mass_spectrum[mass_spectrum_setting_key].molecular_search, 982 print_time=self.lcms_obj.parameters.mass_spectrum[mass_spectrum_setting_key].molecular_search.verbose_processing 983 ) 984 985 # split the database load to not blowout the memory 986 for classe_chunk in chunks( 987 classes, self.lcms_obj.parameters.mass_spectrum[mass_spectrum_setting_key].molecular_search.db_chunk_size 988 ): 989 classes_str_list = [class_tuple[0] for class_tuple in classe_chunk] 990 991 # load the molecular formula objs binned by ion type and heteroatoms classes, {ion type:{classe:[list_formula]}} 992 # for adduct ion type a third key is added {atoms:{ion type:{classe:[list_formula]}}} 993 dict_res = SearchMolecularFormulas.database_to_dict( 994 classes_str_list, 995 nominal_mzs, 996 self.lcms_obj.parameters.mass_spectrum[mass_spectrum_setting_key].molecular_search, 997 ion_charge, 998 ) 999 1000 pbar = tqdm.tqdm(classe_chunk, disable = not verbose) 1001 for classe_tuple in pbar: 1002 # class string is a json serialized dict 1003 classe_str = classe_tuple[0] 1004 1005 # Perform search for (de)protonated ion type 1006 if self.lcms_obj.parameters.mass_spectrum[mass_spectrum_setting_key].molecular_search.isProtonated: 1007 ion_type = Labels.protonated_de_ion 1008 1009 pbar.set_description_str( 1010 desc="Started molecular formula search for class %s, (de)protonated " 1011 % classe_str, 1012 refresh=True, 1013 ) 1014 1015 candidate_formulas = dict_res.get(ion_type).get(classe_str) 1016 1017 if candidate_formulas: 1018 self.search_spectra_against_candidates( 1019 mass_spectrum_list=mass_spectrum_list, 1020 ms_peaks_list=ms_peaks_list, 1021 candidate_formulas=candidate_formulas, 1022 ion_type=ion_type, 1023 ion_charge=ion_charge 1024 ) 1025 1026 1027 # Perform search for radical ion type 1028 if self.lcms_obj.parameters.mass_spectrum[mass_spectrum_setting_key].molecular_search.isRadical: 1029 pbar.set_description_str( 1030 desc="Started molecular formula search for class %s, radical " 1031 % classe_str, 1032 refresh=True, 1033 ) 1034 1035 ion_type = Labels.radical_ion 1036 1037 candidate_formulas = dict_res.get(ion_type).get(classe_str) 1038 1039 if candidate_formulas: 1040 self.search_spectra_against_candidates( 1041 mass_spectrum_list=mass_spectrum_list, 1042 ms_peaks_list=ms_peaks_list, 1043 candidate_formulas=candidate_formulas, 1044 ion_type=ion_type, 1045 ion_charge=ion_charge 1046 ) 1047 1048 # Perform search for adduct ion type 1049 # looks for adduct, used_atom_valences should be 0 1050 # this code does not support H exchance by halogen atoms 1051 if self.lcms_obj.parameters.mass_spectrum[mass_spectrum_setting_key].molecular_search.isAdduct: 1052 pbar.set_description_str( 1053 desc="Started molecular formula search for class %s, adduct " 1054 % classe_str, 1055 refresh=True, 1056 ) 1057 1058 ion_type = Labels.adduct_ion 1059 dict_atoms_formulas = dict_res.get(ion_type) 1060 1061 for adduct_atom, dict_by_class in dict_atoms_formulas.items(): 1062 candidate_formulas = dict_by_class.get(classe_str) 1063 1064 if candidate_formulas: 1065 self.search_spectra_against_candidates( 1066 mass_spectrum_list=mass_spectrum_list, 1067 ms_peaks_list=ms_peaks_list, 1068 candidate_formulas=candidate_formulas, 1069 ion_type=ion_type, 1070 ion_charge=ion_charge 1071 ) 1072 self.sql_db.close() 1073 1074 def run_mass_feature_search(self): 1075 """Run the molecular formula search on the mass features. 1076 1077 Calls bulk_run_molecular_formula_search() with specified mass spectra and mass peaks. 1078 """ 1079 mass_features_df = self.lcms_obj.mass_features_to_df() 1080 1081 # Get the list of mass spectrum (and peaks to search with each mass spectrum) for all mass features 1082 scan_list = mass_features_df.apex_scan.unique() 1083 mass_spectrum_list = [self.lcms_obj._ms[x] for x in scan_list] 1084 ms_peaks = [] 1085 for scan in scan_list: 1086 mf_df_scan = mass_features_df[mass_features_df.apex_scan == scan] 1087 peaks_to_search = [ 1088 self.lcms_obj.mass_features[x].ms1_peak for x in mf_df_scan.index.tolist() 1089 ] 1090 ms_peaks.append(peaks_to_search) 1091 1092 # Run the molecular formula search 1093 self.bulk_run_molecular_formula_search(mass_spectrum_list, ms_peaks) 1094 1095 def run_untargeted_worker_ms1(self): 1096 """Run untargeted molecular formula search on the ms1 mass spectrum.""" 1097 raise NotImplementedError("run_untargeted_worker_ms1 search is not implemented in CoreMS 3.0 and greater") 1098 1099 def run_target_worker_ms1(self): 1100 """Run targeted molecular formula search on the ms1 mass spectrum.""" 1101 raise NotImplementedError("run_target_worker_ms1 formula search is not yet implemented in CoreMS 3.0 and greater")
Class for searching molecular formulas in a LC object.
Parameters
- lcms_obj (LCMSBase): The LCMSBase object.
- sql_db (MolForm_SQL, optional): The SQL database object, by default None.
- first_hit (bool, optional): Flag to indicate whether to skip peaks that already have a molecular formula assigned, by default False.
- find_isotopologues (bool, optional): Flag to indicate whether to find isotopologues, by default True.
Methods
- search_spectra_against_candidates(). Search a list of mass spectra against a list of candidate formulas with a given ion type and charge.
- bulk_run_molecular_formula_search(). Run the molecular formula search on the given list of mass spectra. Pulls the settings from the LCMSBase object to set ion type and charge to search for.
- run_mass_feature_search(). Run the molecular formula search on mass features. Calls bulk_run_molecular_formula_search() with specified mass spectra and mass peaks.
- run_untargeted_worker_ms1(). Run untargeted molecular formula search on the ms1 mass spectrum. DEPRECATED: use run_mass_feature_search() or bulk_run_molecular_formula_search() instead.
- run_target_worker_ms1(). Run targeted molecular formula search on the ms1 mass spectrum. DEPRECATED: use run_mass_feature_search() or bulk_run_molecular_formula_search() instead.
895 def __init__(self, lcms_obj, sql_db=None, first_hit=False, find_isotopologues=True): 896 self.first_hit = first_hit 897 898 self.find_isotopologues = find_isotopologues 899 900 self.lcms_obj = lcms_obj 901 902 if not sql_db: 903 self.sql_db = MolForm_SQL( 904 url=self.lcms_obj.parameters.mass_spectrum['ms1'].molecular_search.url_database 905 ) 906 907 else: 908 self.sql_db = sql_db
910 def search_spectra_against_candidates(self, mass_spectrum_list, ms_peaks_list, candidate_formulas, ion_type, ion_charge): 911 """Search a list of mass spectra against a list of candidate formulas with a given ion type and charge. 912 913 Parameters 914 ---------- 915 mass_spectrum_list : list of MassSpectrum 916 The list of mass spectra to perform the search on. 917 ms_peaks_list : list of lists of MSPeak objects 918 The list of mass spectrum peaks to search within each mass spectrum. 919 candidate_formulas : dict 920 The candidate formulas. 921 ion_type : str 922 The ion type. 923 ion_charge : int 924 The ion charge, either 1 or -1. 925 926 Notes 927 ----- 928 This function is designed to be used with the bulk_run_molecular_formula_search function. 929 """ 930 for mass_spectrum, ms_peaks in zip(mass_spectrum_list, ms_peaks_list): 931 single_ms_search = SearchMolecularFormulas( 932 mass_spectrum, 933 sql_db=self.sql_db, 934 first_hit=self.first_hit, 935 find_isotopologues=self.find_isotopologues, 936 ) 937 single_ms_search.run_search( 938 ms_peaks, 939 candidate_formulas, 940 mass_spectrum.min_abundance, 941 ion_type, 942 ion_charge, 943 )
Search a list of mass spectra against a list of candidate formulas with a given ion type and charge.
Parameters
- mass_spectrum_list (list of MassSpectrum): The list of mass spectra to perform the search on.
- ms_peaks_list (list of lists of MSPeak objects): The list of mass spectrum peaks to search within each mass spectrum.
- candidate_formulas (dict): The candidate formulas.
- ion_type (str): The ion type.
- ion_charge (int): The ion charge, either 1 or -1.
Notes
This function is designed to be used with the bulk_run_molecular_formula_search function.
945 def bulk_run_molecular_formula_search(self, mass_spectrum_list, ms_peaks_list, mass_spectrum_setting_key='ms1'): 946 """Run the molecular formula search on the given list of mass spectra 947 948 Parameters 949 ---------- 950 mass_spectrum_list : list of MassSpectrum 951 The list of mass spectra to search. 952 ms_peaks_list : list of lists of MSPeak objects 953 The mass peaks to perform molecular formula search within each mass spectrum 954 mass_spectrum_setting_key : str, optional 955 The mass spectrum setting key, by default 'ms1'. 956 This is used to get the appropriate molecular search settings from the LCMSBase object 957 """ 958 # Set min_abundance and nominal_mzs 959 if self.lcms_obj.polarity == "positive": 960 ion_charge = 1 961 elif self.lcms_obj.polarity == "negative": 962 ion_charge = -1 963 else: 964 raise ValueError("Polarity must be either 'positive' or 'negative'") 965 966 # Check that the length of the mass spectrum list and the ms_peaks list are the same 967 if len(mass_spectrum_list) != len(ms_peaks_list): 968 raise ValueError("The length of the mass spectrum list and the ms_peaks list must be the same") 969 970 nominal_mzs = [x.nominal_mz for x in mass_spectrum_list] 971 nominal_mzs = list(set([item for sublist in nominal_mzs for item in sublist])) 972 verbose = self.lcms_obj.parameters.mass_spectrum[mass_spectrum_setting_key].molecular_search.verbose_processing 973 974 # reset average error, only relevant if average mass error method is being used 975 SearchMolecularFormulaWorker( 976 find_isotopologues=self.find_isotopologues 977 ).reset_error(mass_spectrum_list[0]) 978 979 # check database for all possible molecular formula combinations based on the setting passed to self.mass_spectrum_obj.molecular_search_settings 980 classes = MolecularCombinations(self.sql_db).runworker( 981 self.lcms_obj.parameters.mass_spectrum[mass_spectrum_setting_key].molecular_search, 982 print_time=self.lcms_obj.parameters.mass_spectrum[mass_spectrum_setting_key].molecular_search.verbose_processing 983 ) 984 985 # split the database load to not blowout the memory 986 for classe_chunk in chunks( 987 classes, self.lcms_obj.parameters.mass_spectrum[mass_spectrum_setting_key].molecular_search.db_chunk_size 988 ): 989 classes_str_list = [class_tuple[0] for class_tuple in classe_chunk] 990 991 # load the molecular formula objs binned by ion type and heteroatoms classes, {ion type:{classe:[list_formula]}} 992 # for adduct ion type a third key is added {atoms:{ion type:{classe:[list_formula]}}} 993 dict_res = SearchMolecularFormulas.database_to_dict( 994 classes_str_list, 995 nominal_mzs, 996 self.lcms_obj.parameters.mass_spectrum[mass_spectrum_setting_key].molecular_search, 997 ion_charge, 998 ) 999 1000 pbar = tqdm.tqdm(classe_chunk, disable = not verbose) 1001 for classe_tuple in pbar: 1002 # class string is a json serialized dict 1003 classe_str = classe_tuple[0] 1004 1005 # Perform search for (de)protonated ion type 1006 if self.lcms_obj.parameters.mass_spectrum[mass_spectrum_setting_key].molecular_search.isProtonated: 1007 ion_type = Labels.protonated_de_ion 1008 1009 pbar.set_description_str( 1010 desc="Started molecular formula search for class %s, (de)protonated " 1011 % classe_str, 1012 refresh=True, 1013 ) 1014 1015 candidate_formulas = dict_res.get(ion_type).get(classe_str) 1016 1017 if candidate_formulas: 1018 self.search_spectra_against_candidates( 1019 mass_spectrum_list=mass_spectrum_list, 1020 ms_peaks_list=ms_peaks_list, 1021 candidate_formulas=candidate_formulas, 1022 ion_type=ion_type, 1023 ion_charge=ion_charge 1024 ) 1025 1026 1027 # Perform search for radical ion type 1028 if self.lcms_obj.parameters.mass_spectrum[mass_spectrum_setting_key].molecular_search.isRadical: 1029 pbar.set_description_str( 1030 desc="Started molecular formula search for class %s, radical " 1031 % classe_str, 1032 refresh=True, 1033 ) 1034 1035 ion_type = Labels.radical_ion 1036 1037 candidate_formulas = dict_res.get(ion_type).get(classe_str) 1038 1039 if candidate_formulas: 1040 self.search_spectra_against_candidates( 1041 mass_spectrum_list=mass_spectrum_list, 1042 ms_peaks_list=ms_peaks_list, 1043 candidate_formulas=candidate_formulas, 1044 ion_type=ion_type, 1045 ion_charge=ion_charge 1046 ) 1047 1048 # Perform search for adduct ion type 1049 # looks for adduct, used_atom_valences should be 0 1050 # this code does not support H exchance by halogen atoms 1051 if self.lcms_obj.parameters.mass_spectrum[mass_spectrum_setting_key].molecular_search.isAdduct: 1052 pbar.set_description_str( 1053 desc="Started molecular formula search for class %s, adduct " 1054 % classe_str, 1055 refresh=True, 1056 ) 1057 1058 ion_type = Labels.adduct_ion 1059 dict_atoms_formulas = dict_res.get(ion_type) 1060 1061 for adduct_atom, dict_by_class in dict_atoms_formulas.items(): 1062 candidate_formulas = dict_by_class.get(classe_str) 1063 1064 if candidate_formulas: 1065 self.search_spectra_against_candidates( 1066 mass_spectrum_list=mass_spectrum_list, 1067 ms_peaks_list=ms_peaks_list, 1068 candidate_formulas=candidate_formulas, 1069 ion_type=ion_type, 1070 ion_charge=ion_charge 1071 ) 1072 self.sql_db.close()
Run the molecular formula search on the given list of mass spectra
Parameters
- mass_spectrum_list (list of MassSpectrum): The list of mass spectra to search.
- ms_peaks_list (list of lists of MSPeak objects): The mass peaks to perform molecular formula search within each mass spectrum
- mass_spectrum_setting_key (str, optional): The mass spectrum setting key, by default 'ms1'. This is used to get the appropriate molecular search settings from the LCMSBase object
1074 def run_mass_feature_search(self): 1075 """Run the molecular formula search on the mass features. 1076 1077 Calls bulk_run_molecular_formula_search() with specified mass spectra and mass peaks. 1078 """ 1079 mass_features_df = self.lcms_obj.mass_features_to_df() 1080 1081 # Get the list of mass spectrum (and peaks to search with each mass spectrum) for all mass features 1082 scan_list = mass_features_df.apex_scan.unique() 1083 mass_spectrum_list = [self.lcms_obj._ms[x] for x in scan_list] 1084 ms_peaks = [] 1085 for scan in scan_list: 1086 mf_df_scan = mass_features_df[mass_features_df.apex_scan == scan] 1087 peaks_to_search = [ 1088 self.lcms_obj.mass_features[x].ms1_peak for x in mf_df_scan.index.tolist() 1089 ] 1090 ms_peaks.append(peaks_to_search) 1091 1092 # Run the molecular formula search 1093 self.bulk_run_molecular_formula_search(mass_spectrum_list, ms_peaks)
Run the molecular formula search on the mass features.
Calls bulk_run_molecular_formula_search() with specified mass spectra and mass peaks.
1095 def run_untargeted_worker_ms1(self): 1096 """Run untargeted molecular formula search on the ms1 mass spectrum.""" 1097 raise NotImplementedError("run_untargeted_worker_ms1 search is not implemented in CoreMS 3.0 and greater")
Run untargeted molecular formula search on the ms1 mass spectrum.
1099 def run_target_worker_ms1(self): 1100 """Run targeted molecular formula search on the ms1 mass spectrum.""" 1101 raise NotImplementedError("run_target_worker_ms1 formula search is not yet implemented in CoreMS 3.0 and greater")
Run targeted molecular formula search on the ms1 mass spectrum.