corems.molecular_id.search.molecularFormulaSearch

   1__author__ = "Yuri E. Corilo"
   2__date__ = "Jul 29, 2019"
   3
   4
   5from typing import List
   6
   7import tqdm
   8
   9from corems import chunks, timeit
  10from corems.encapsulation.constant import Labels
  11from corems.molecular_formula.factory.MolecularFormulaFactory import (
  12    LCMSLibRefMolecularFormula,
  13    MolecularFormula,
  14)
  15from corems.molecular_id.factory.MolecularLookupTable import MolecularCombinations
  16from corems.molecular_id.factory.molecularSQL import MolForm_SQL
  17from corems.ms_peak.factory.MSPeakClasses import _MSPeak
  18
  19last_error = 0
  20last_dif = 0
  21closest_error = 0
  22error_average = 0
  23nbValues = 0
  24
  25
  26class SearchMolecularFormulas:
  27    """Class for searching molecular formulas in a mass spectrum.
  28
  29    Parameters
  30    ----------
  31    mass_spectrum_obj : MassSpectrum
  32        The mass spectrum object.
  33    sql_db : MolForm_SQL, optional
  34        The SQL database object, by default None.
  35    first_hit : bool, optional
  36        Flag to indicate whether to skip peaks that already have a molecular formula assigned, by default False.
  37    find_isotopologues : bool, optional
  38        Flag to indicate whether to find isotopologues, by default True.
  39
  40    Attributes
  41    ----------
  42    mass_spectrum_obj : MassSpectrum
  43        The mass spectrum object.
  44    sql_db : MolForm_SQL
  45        The SQL database object.
  46    first_hit : bool
  47        Flag to indicate whether to skip peaks that already have a molecular formula assigned.
  48    find_isotopologues : bool
  49        Flag to indicate whether to find isotopologues.
  50
  51
  52    Methods
  53    -------
  54    * run_search().
  55        Run the molecular formula search.
  56    * run_worker_mass_spectrum().
  57        Run the molecular formula search on the mass spectrum object.
  58    * run_worker_ms_peaks().
  59        Run the molecular formula search on the given list of mass spectrum peaks.
  60    * database_to_dict().
  61        Convert the database results to a dictionary.
  62    * run_molecular_formula().
  63        Run the molecular formula search on the given list of mass spectrum peaks.
  64    * search_mol_formulas().
  65        Search for molecular formulas in the mass spectrum.
  66
  67    """
  68
  69    def __init__(
  70        self,
  71        mass_spectrum_obj,
  72        sql_db=None,
  73        first_hit: bool = False,
  74        find_isotopologues: bool = True,
  75    ):
  76        self.first_hit = first_hit
  77
  78        self.find_isotopologues = find_isotopologues
  79
  80        self.mass_spectrum_obj = mass_spectrum_obj
  81
  82        if not sql_db:
  83            self.sql_db = MolForm_SQL(
  84                url=mass_spectrum_obj.molecular_search_settings.url_database
  85            )
  86
  87        else:
  88            self.sql_db = sql_db
  89
  90    def __enter__(self):
  91        """Open the SQL database connection."""
  92        return self
  93
  94    def __exit__(self, exc_type, exc_val, exc_tb):
  95        """Close the SQL database connection."""
  96        self.sql_db.close()
  97
  98        return False
  99
 100    def run_search(
 101        self,
 102        mspeaks: list,
 103        query: dict,
 104        min_abundance: float,
 105        ion_type: str,
 106        ion_charge: int,
 107        adduct_atom=None,
 108    ):
 109        """Run the molecular formula search.
 110
 111        Parameters
 112        ----------
 113        mspeaks : list of MSPeak
 114            The list of mass spectrum peaks.
 115        query : dict
 116            The query dictionary containing the possible molecular formulas.
 117        min_abundance : float
 118            The minimum abundance threshold.
 119        ion_type : str
 120            The ion type.
 121        ion_charge : int
 122            The ion charge.
 123        adduct_atom : str, optional
 124            The adduct atom, by default None.
 125        """
 126
 127        def get_formulas(nominal_overlay: float = 0.1):
 128            """
 129            Get the list of formulas based on the nominal overlay.
 130
 131            Parameters
 132            ----------
 133            nominal_overlay : float, optional
 134                The nominal overlay, by default 0.1.
 135
 136            Returns
 137            -------
 138            list
 139                The list of formulas.
 140            """
 141            nominal_mz = ms_peak.nominal_mz_exp
 142
 143            defect_mass = ms_peak.mz_exp - nominal_mz
 144            nominal_masses = [nominal_mz]
 145
 146            if (defect_mass) >= 1 - nominal_overlay:
 147                nominal_masses.append(nominal_mz + 1)
 148            elif (defect_mass) <= nominal_overlay:
 149                nominal_masses.append(nominal_mz - 1)
 150
 151            list_formulas_candidates = []
 152
 153            for nominal_mass in nominal_masses:
 154                if nominal_mass in query.keys():
 155                    list_formulas_candidates.extend(query.get(nominal_mass))
 156
 157            return list_formulas_candidates
 158
 159        all_assigned_indexes = list()
 160
 161        # molecular_search_settings = self.mass_spectrum_obj.molecular_search_settings
 162
 163        search_molfrom = SearchMolecularFormulaWorker(
 164            find_isotopologues=self.find_isotopologues
 165        )
 166
 167        for ms_peak in mspeaks:
 168            # already assigned a molecular formula
 169            if self.first_hit:
 170                if ms_peak.is_assigned:
 171                    continue
 172
 173            ms_peak_indexes = search_molfrom.find_formulas(
 174                get_formulas(),
 175                min_abundance,
 176                self.mass_spectrum_obj,
 177                ms_peak,
 178                ion_type,
 179                ion_charge,
 180                adduct_atom,
 181            )
 182
 183            all_assigned_indexes.extend(ms_peak_indexes)
 184
 185        # all_assigned_indexes = MolecularFormulaSearchFilters().filter_isotopologue(all_assigned_indexes, self.mass_spectrum_obj)
 186
 187        # all_assigned_indexes = MolecularFormulaSearchFilters().filter_kendrick(all_assigned_indexes, self.mass_spectrum_obj)
 188
 189        # MolecularFormulaSearchFilters().check_min_peaks(all_assigned_indexes, self.mass_spectrum_obj)
 190        # filter per min peaks per mono isotopic class
 191
 192    def run_worker_mass_spectrum(self):
 193        """Run the molecular formula search on the mass spectrum object."""
 194        self.run_molecular_formula(
 195            self.mass_spectrum_obj.sort_by_abundance(), 
 196            print_time=self.mass_spectrum_obj.molecular_search_settings.verbose_processing
 197            )
 198
 199    def run_worker_ms_peaks(self, ms_peaks):
 200        """Run the molecular formula search on the given list of mass spectrum peaks.
 201
 202        Parameters
 203        ----------
 204        ms_peaks : list of MSPeak
 205            The list of mass spectrum peaks.
 206        """
 207        self.run_molecular_formula(
 208            ms_peaks,
 209            print_time=self.mass_spectrum_obj.molecular_search_settings.verbose_processing
 210        )
 211
 212    @staticmethod
 213    def database_to_dict(classe_str_list, nominal_mzs, mf_search_settings, ion_charge):
 214        """Convert the database results to a dictionary.
 215
 216        Parameters
 217        ----------
 218        classe_str_list : list
 219            The list of class strings.
 220        nominal_mzs : list
 221            The list of nominal m/z values.
 222        mf_search_settings : MolecularFormulaSearchSettings
 223            The molecular formula search settings.
 224        ion_charge : int
 225            The ion charge.
 226
 227        Returns
 228        -------
 229        dict
 230            The dictionary containing the database results.
 231        """
 232        sql_db = MolForm_SQL(url=mf_search_settings.url_database)
 233
 234        dict_res = {}
 235
 236        if mf_search_settings.isProtonated:
 237            dict_res[Labels.protonated_de_ion] = sql_db.get_dict_by_classes(
 238                classe_str_list,
 239                Labels.protonated_de_ion,
 240                nominal_mzs,
 241                ion_charge,
 242                mf_search_settings,
 243            )
 244
 245        if mf_search_settings.isRadical:
 246            dict_res[Labels.radical_ion] = sql_db.get_dict_by_classes(
 247                classe_str_list,
 248                Labels.radical_ion,
 249                nominal_mzs,
 250                ion_charge,
 251                mf_search_settings,
 252            )
 253
 254        if mf_search_settings.isAdduct:
 255            adduct_list = (
 256                mf_search_settings.adduct_atoms_neg
 257                if ion_charge < 0
 258                else mf_search_settings.adduct_atoms_pos
 259            )
 260            dict_res[Labels.adduct_ion] = sql_db.get_dict_by_classes(
 261                classe_str_list,
 262                Labels.adduct_ion,
 263                nominal_mzs,
 264                ion_charge,
 265                mf_search_settings,
 266                adducts=adduct_list,
 267            )
 268
 269        return dict_res
 270
 271    @timeit(print_time=True)
 272    def run_molecular_formula(self, ms_peaks, **kwargs):
 273        """Run the molecular formula search on the given list of mass spectrum peaks.
 274
 275        Parameters
 276        ----------
 277        ms_peaks : list of MSPeak
 278            The list of mass spectrum peaks.
 279        **kwargs
 280            Additional keyword arguments. 
 281            Most notably, print_time, which is a boolean flag to indicate whether to print the time 
 282            and passed to the timeit decorator.
 283        """
 284        ion_charge = self.mass_spectrum_obj.polarity
 285        min_abundance = self.mass_spectrum_obj.min_abundance
 286        nominal_mzs = self.mass_spectrum_obj.nominal_mz
 287
 288        verbose = self.mass_spectrum_obj.molecular_search_settings.verbose_processing
 289        # reset average error, only relevant is average mass error method is being used
 290        SearchMolecularFormulaWorker(
 291            find_isotopologues=self.find_isotopologues
 292        ).reset_error(self.mass_spectrum_obj)
 293
 294        # check database for all possible molecular formula combinations based on the setting passed to self.mass_spectrum_obj.molecular_search_settings
 295        classes = MolecularCombinations(self.sql_db).runworker(
 296            self.mass_spectrum_obj.molecular_search_settings,
 297            print_time=self.mass_spectrum_obj.molecular_search_settings.verbose_processing
 298        )
 299
 300        # split the database load to not blowout the memory
 301        # TODO add to the settings
 302        for classe_chunk in chunks(
 303            classes, self.mass_spectrum_obj.molecular_search_settings.db_chunk_size
 304        ):
 305            classes_str_list = [class_tuple[0] for class_tuple in classe_chunk]
 306
 307            # load the molecular formula objs binned by ion type and heteroatoms classes, {ion type:{classe:[list_formula]}}
 308            # for adduct ion type a third key is added {atoms:{ion type:{classe:[list_formula]}}}
 309            dict_res = self.database_to_dict(
 310                classes_str_list,
 311                nominal_mzs,
 312                self.mass_spectrum_obj.molecular_search_settings,
 313                ion_charge,
 314            )
 315            pbar = tqdm.tqdm(classe_chunk, disable = not verbose)
 316            for classe_tuple in pbar:
 317                # class string is a json serialized dict
 318                classe_str = classe_tuple[0]
 319                classe_dict = classe_tuple[1]
 320
 321                if self.mass_spectrum_obj.molecular_search_settings.isProtonated:
 322                    ion_type = Labels.protonated_de_ion
 323                    if verbose:
 324                        pbar.set_description_str(
 325                            desc="Started molecular formula search for class %s, (de)protonated "
 326                            % classe_str,
 327                            refresh=True,
 328                        )
 329
 330                    candidate_formulas = dict_res.get(ion_type).get(classe_str)
 331
 332                    if candidate_formulas:
 333                        self.run_search(
 334                            ms_peaks,
 335                            candidate_formulas,
 336                            min_abundance,
 337                            ion_type,
 338                            ion_charge,
 339                        )
 340
 341                if self.mass_spectrum_obj.molecular_search_settings.isRadical:
 342                    if verbose:
 343                        pbar.set_description_str(
 344                            desc="Started molecular formula search for class %s, radical "
 345                            % classe_str,
 346                            refresh=True,
 347                        )
 348
 349                    ion_type = Labels.radical_ion
 350
 351                    candidate_formulas = dict_res.get(ion_type).get(classe_str)
 352
 353                    if candidate_formulas:
 354                        self.run_search(
 355                            ms_peaks,
 356                            candidate_formulas,
 357                            min_abundance,
 358                            ion_type,
 359                            ion_charge,
 360                        )
 361                # looks for adduct, used_atom_valences should be 0
 362                # this code does not support H exchance by halogen atoms
 363                if self.mass_spectrum_obj.molecular_search_settings.isAdduct:
 364                    if verbose:
 365                        pbar.set_description_str(
 366                            desc="Started molecular formula search for class %s, adduct "
 367                            % classe_str,
 368                            refresh=True,
 369                        )
 370
 371                    ion_type = Labels.adduct_ion
 372                    dict_atoms_formulas = dict_res.get(ion_type)
 373
 374                    for adduct_atom, dict_by_class in dict_atoms_formulas.items():
 375                        candidate_formulas = dict_by_class.get(classe_str)
 376
 377                        if candidate_formulas:
 378                            self.run_search(
 379                                ms_peaks,
 380                                candidate_formulas,
 381                                min_abundance,
 382                                ion_type,
 383                                ion_charge,
 384                                adduct_atom=adduct_atom,
 385                            )
 386        self.sql_db.close()
 387
 388    def search_mol_formulas(
 389        self,
 390        possible_formulas_list: List[MolecularFormula],
 391        ion_type: str,
 392        neutral_molform=True,
 393        find_isotopologues=True,
 394        adduct_atom=None,
 395    ) -> List[_MSPeak]:
 396        """Search for molecular formulas in the mass spectrum.
 397
 398        Parameters
 399        ----------
 400        possible_formulas_list : list of MolecularFormula
 401            The list of possible molecular formulas.
 402        ion_type : str
 403            The ion type.
 404        neutral_molform : bool, optional
 405            Flag to indicate whether the molecular formulas are neutral, by default True.
 406        find_isotopologues : bool, optional
 407            Flag to indicate whether to find isotopologues, by default True.
 408        adduct_atom : str, optional
 409            The adduct atom, by default None.
 410
 411        Returns
 412        -------
 413        list of MSPeak
 414            The list of mass spectrum peaks with assigned molecular formulas.
 415        """
 416        # neutral_molform: some reference files already present the formula on ion mode, for instance, bruker reference files
 417        #    if that is the case than turn neutral_molform off
 418
 419        SearchMolecularFormulaWorker(find_isotopologues=find_isotopologues).reset_error(
 420            self.mass_spectrum_obj
 421        )
 422
 423        initial_min_peak_bool = (
 424            self.mass_spectrum_obj.molecular_search_settings.use_min_peaks_filter
 425        )
 426        initial_runtime_kendrick_filter = (
 427            self.mass_spectrum_obj.molecular_search_settings.use_runtime_kendrick_filter
 428        )
 429
 430        # Are the following 3 lines redundant?
 431        self.mass_spectrum_obj.molecular_search_settings.use_min_peaks_filter = False
 432        self.mass_spectrum_obj.molecular_search_settings.use_min_peaks_filter = (
 433            False  # TODO check this line
 434        )
 435        self.mass_spectrum_obj.molecular_search_settings.use_runtime_kendrick_filter = (
 436            False
 437        )
 438
 439        possible_formulas_dict_nm = {}
 440
 441        for mf in possible_formulas_list:
 442            if neutral_molform:
 443                nm = int(mf.protonated_mz)
 444            else:
 445                nm = int(mf.mz_nominal_calc)
 446
 447            if nm in possible_formulas_dict_nm.keys():
 448                possible_formulas_dict_nm[nm].append(mf)
 449
 450            else:
 451                possible_formulas_dict_nm[nm] = [mf]
 452
 453        min_abundance = self.mass_spectrum_obj.min_abundance
 454
 455        ion_type = ion_type
 456
 457        self.run_search(
 458            self.mass_spectrum_obj,
 459            possible_formulas_dict_nm,
 460            min_abundance,
 461            ion_type,
 462            self.mass_spectrum_obj.polarity,
 463            adduct_atom=adduct_atom,
 464        )
 465
 466        self.mass_spectrum_obj.molecular_search_settings.use_min_peaks_filter = (
 467            initial_min_peak_bool
 468        )
 469        self.mass_spectrum_obj.molecular_search_settings.use_runtime_kendrick_filter = (
 470            initial_runtime_kendrick_filter
 471        )
 472
 473        mspeaks = [mspeak for mspeak in self.mass_spectrum_obj if mspeak.is_assigned]
 474
 475        self.sql_db.close()
 476
 477        return mspeaks
 478
 479
 480class SearchMolecularFormulaWorker:
 481    """Class for searching molecular formulas in a mass spectrum.
 482
 483    Parameters
 484    ----------
 485    find_isotopologues : bool, optional
 486        Flag to indicate whether to find isotopologues, by default True.
 487
 488    Attributes
 489    ----------
 490    find_isotopologues : bool
 491        Flag to indicate whether to find isotopologues.
 492
 493    Methods
 494    -------
 495    * reset_error().
 496        Reset the error variables.
 497    * set_last_error().
 498        Set the last error.
 499    * find_formulas().
 500        Find the formulas.
 501    * calc_error().
 502        Calculate the error.
 503    """
 504
 505    # TODO add reset error function
 506    # needs this wraper to pass the class to multiprocessing
 507
 508    def __init__(self, find_isotopologues=True):
 509        self.find_isotopologues = find_isotopologues
 510
 511    def __call__(self, args):
 512        """Call the find formulas function.
 513
 514        Parameters
 515        ----------
 516        args : tuple
 517            The arguments.
 518
 519        Returns
 520        -------
 521        list
 522            The list of mass spectrum peaks with assigned molecular formulas.
 523        """
 524        return self.find_formulas(*args)  # ,args[1]
 525
 526    def reset_error(self, mass_spectrum_obj):
 527        """Reset the error variables.
 528
 529        Parameters
 530        ----------
 531        mass_spectrum_obj : MassSpectrum
 532            The mass spectrum object.
 533
 534        Notes
 535        -----
 536        This function resets the error variables for the given mass spectrum object.
 537        """
 538        global last_error, last_dif, closest_error, error_average, nbValues
 539        last_error, last_dif, closest_error, nbValues = 0.0, 0.0, 0.0, 0.0
 540
 541    def set_last_error(self, error, mass_spectrum_obj):
 542        """Set the last error.
 543
 544        Parameters
 545        ----------
 546        error : float
 547            The error.
 548        mass_spectrum_obj : MassSpectrum
 549            The mass spectrum object.
 550        """
 551        # set the changes to the global variables, not internal ones
 552        global last_error, last_dif, closest_error, error_average, nbValues
 553
 554        if mass_spectrum_obj.molecular_search_settings.error_method == "distance":
 555            dif = error - last_error
 556            if dif < last_dif:
 557                last_dif = dif
 558                closest_error = error
 559                mass_spectrum_obj.molecular_search_settings.min_ppm_error = (
 560                    closest_error
 561                    - mass_spectrum_obj.molecular_search_settings.mz_error_range
 562                )
 563                mass_spectrum_obj.molecular_search_settings.max_ppm_error = (
 564                    closest_error
 565                    + mass_spectrum_obj.molecular_search_settings.mz_error_range
 566                )
 567
 568        elif mass_spectrum_obj.molecular_search_settings.error_method == "lowest":
 569            if error < last_error:
 570                mass_spectrum_obj.molecular_search_settings.min_ppm_error = (
 571                    error - mass_spectrum_obj.molecular_search_settings.mz_error_range
 572                )
 573                mass_spectrum_obj.molecular_search_settings.max_ppm_error = (
 574                    error + mass_spectrum_obj.molecular_search_settings.mz_error_range
 575                )
 576                last_error = error
 577
 578        elif mass_spectrum_obj.molecular_search_settings.error_method == "symmetrical":
 579            mass_spectrum_obj.molecular_search_settings.min_ppm_error = (
 580                mass_spectrum_obj.molecular_search_settings.mz_error_average
 581                - mass_spectrum_obj.molecular_search_settings.mz_error_range
 582            )
 583            mass_spectrum_obj.molecular_search_settings.max_ppm_error = (
 584                mass_spectrum_obj.molecular_search_settings.mz_error_average
 585                + mass_spectrum_obj.molecular_search_settings.mz_error_range
 586            )
 587
 588        elif mass_spectrum_obj.molecular_search_settings.error_method == "average":
 589            nbValues += 1
 590            error_average = error_average + ((error - error_average) / nbValues)
 591            mass_spectrum_obj.molecular_search_settings.min_ppm_error = (
 592                error_average
 593                - mass_spectrum_obj.molecular_search_settings.mz_error_range
 594            )
 595            mass_spectrum_obj.molecular_search_settings.max_ppm_error = (
 596                error_average
 597                + mass_spectrum_obj.molecular_search_settings.mz_error_range
 598            )
 599
 600        else:
 601            # using set mass_spectrum_obj.molecular_search_settings.min_ppm_error  and max_ppm_error range
 602            pass
 603
 604        # returns the error based on the selected method at mass_spectrum_obj.molecular_search_settings.method
 605
 606    @staticmethod
 607    def calc_error(mz_exp, mz_calc, method="ppm"):
 608        """Calculate the error.
 609
 610        Parameters
 611        ----------
 612        mz_exp : float
 613            The experimental m/z value.
 614        mz_calc : float
 615            The calculated m/z value.
 616        method : str, optional
 617            The method, by default 'ppm'.
 618
 619        Raises
 620        -------
 621        Exception
 622            If the method is not ppm or ppb.
 623
 624        Returns
 625        -------
 626        float
 627            The error.
 628        """
 629
 630        if method == "ppm":
 631            multi_factor = 1_000_000
 632
 633        elif method == "ppb":
 634            multi_factor = 1_000_000_000
 635
 636        elif method == "perc":
 637            multi_factor = 100
 638
 639        else:
 640            raise Exception(
 641                "method needs to be ppm or ppb, you have entered %s" % method
 642            )
 643
 644        if mz_exp:
 645            return ((mz_exp - mz_calc) / mz_calc) * multi_factor
 646
 647        else:
 648            raise Exception("Please set mz_calc first")
 649
 650    def find_formulas(
 651        self,
 652        formulas,
 653        min_abundance,
 654        mass_spectrum_obj,
 655        ms_peak,
 656        ion_type,
 657        ion_charge,
 658        adduct_atom=None,
 659    ):
 660        """Find the formulas.
 661
 662        Parameters
 663        ----------
 664        formulas : list of MolecularFormula
 665            The list of molecular formulas.
 666        min_abundance : float
 667            The minimum abundance threshold.
 668        mass_spectrum_obj : MassSpectrum
 669            The mass spectrum object.
 670        ms_peak : MSPeak
 671            The mass spectrum peak.
 672        ion_type : str
 673            The ion type.
 674        ion_charge : int
 675            The ion charge.
 676        adduct_atom : str, optional
 677            The adduct atom, by default None.
 678
 679        Returns
 680        -------
 681        list of MSPeak
 682            The list of mass spectrum peaks with assigned molecular formulas.
 683
 684        Notes
 685        -----
 686        Uses the closest error the next search (this is not ideal, it needs to use confidence
 687        metric to choose the right candidate then propagate the error using the error from the best candidate).
 688        It needs to add s/n to the equation.
 689        It need optimization to define the mz_error_range within a m/z unit since it is directly proportional
 690        with the mass, and inversely proportional to the rp. It's not linear, i.e., sigma mass.
 691        The idea it to correlate sigma to resolving power, signal to noise and sample complexity per mz unit.
 692        Method='distance'
 693        """
 694        mspeak_assigned_index = list()
 695
 696        min_ppm_error = mass_spectrum_obj.molecular_search_settings.min_ppm_error
 697        max_ppm_error = mass_spectrum_obj.molecular_search_settings.max_ppm_error
 698
 699        min_abun_error = mass_spectrum_obj.molecular_search_settings.min_abun_error
 700        max_abun_error = mass_spectrum_obj.molecular_search_settings.max_abun_error
 701
 702        # f = open("abundance_error.txt", "a+")
 703        ms_peak_mz_exp, ms_peak_abundance = ms_peak.mz_exp, ms_peak.abundance
 704        # min_error = min([pmf.mz_error for pmf in possible_formulas])
 705
 706        def mass_by_ion_type(possible_formula_obj):
 707            if ion_type == Labels.protonated_de_ion:
 708                return possible_formula_obj._protonated_mz(ion_charge)
 709
 710            elif ion_type == Labels.radical_ion:
 711                return possible_formula_obj._radical_mz(ion_charge)
 712
 713            elif ion_type == Labels.adduct_ion and adduct_atom:
 714                return possible_formula._adduct_mz(ion_charge, adduct_atom)
 715
 716            else:
 717                # will return externally calculated mz if is set, #use on Bruker Reference list import
 718                # if the ion type is known the ion mass based on molecular formula ion type
 719                # if ion type is unknow will return neutral mass
 720                return possible_formula.mz_calc
 721
 722        if formulas:
 723            if isinstance(formulas[0], LCMSLibRefMolecularFormula):
 724                possible_mf_class = True
 725
 726            else:
 727                possible_mf_class = False
 728
 729        for possible_formula in formulas:
 730            if possible_formula:
 731                error = self.calc_error(
 732                    ms_peak_mz_exp, mass_by_ion_type(possible_formula)
 733                )
 734
 735                # error = possible_formula.mz_error
 736
 737                if min_ppm_error <= error <= max_ppm_error:
 738                    # update the error
 739
 740                    self.set_last_error(error, mass_spectrum_obj)
 741
 742                    # add molecular formula match to ms_peak
 743
 744                    # get molecular formula dict from sql obj
 745                    # formula_dict = pickle.loads(possible_formula.mol_formula)
 746                    # if possible_mf_class:
 747
 748                    #    molecular_formula = deepcopy(possible_formula)
 749
 750                    # else:
 751
 752                    formula_dict = possible_formula.to_dict()
 753                    # create the molecular formula obj to be stored
 754                    if possible_mf_class:
 755                        molecular_formula = LCMSLibRefMolecularFormula(
 756                            formula_dict,
 757                            ion_charge,
 758                            ion_type=ion_type,
 759                            adduct_atom=adduct_atom,
 760                        )
 761
 762                        molecular_formula.name = possible_formula.name
 763                        molecular_formula.kegg_id = possible_formula.kegg_id
 764                        molecular_formula.cas = possible_formula.cas
 765
 766                    else:
 767                        molecular_formula = MolecularFormula(
 768                            formula_dict,
 769                            ion_charge,
 770                            ion_type=ion_type,
 771                            adduct_atom=adduct_atom,
 772                        )
 773                    # add the molecular formula obj to the mspeak obj
 774                    # add the mspeak obj and it's index for tracking next assignment step
 775
 776                    if self.find_isotopologues:
 777                        # calculates isotopologues
 778                        isotopologues = molecular_formula.isotopologues(
 779                            min_abundance,
 780                            ms_peak_abundance,
 781                            mass_spectrum_obj.dynamic_range,
 782                        )
 783
 784                        # search for isotopologues
 785                        for isotopologue_formula in isotopologues:
 786                            molecular_formula.expected_isotopologues.append(
 787                                isotopologue_formula
 788                            )
 789                            # move this outside to improve preformace
 790                            # we need to increase the search space to -+1 m_z
 791                            first_index, last_index = (
 792                                mass_spectrum_obj.get_nominal_mz_first_last_indexes(
 793                                    isotopologue_formula.mz_nominal_calc
 794                                )
 795                            )
 796
 797                            for ms_peak_iso in mass_spectrum_obj[
 798                                first_index:last_index
 799                            ]:
 800                                error = self.calc_error(
 801                                    ms_peak_iso.mz_exp, isotopologue_formula.mz_calc
 802                                )
 803
 804                                if min_ppm_error <= error <= max_ppm_error:
 805                                    # need to define error distribution for abundance measurements
 806
 807                                    # if mass_spectrum_obj.is_centroid:
 808
 809                                    abundance_error = self.calc_error(
 810                                        isotopologue_formula.abundance_calc,
 811                                        ms_peak_iso.abundance,
 812                                        method="perc",
 813                                    )
 814
 815                                    # area_error = self.calc_error(ms_peak.area, ms_peak_iso.area, method='perc')
 816
 817                                    # margin of error was set empirically/ needs statistical calculation
 818                                    #  of margin of error for the measurement of the abundances
 819                                    if (
 820                                        min_abun_error
 821                                        <= abundance_error
 822                                        <= max_abun_error
 823                                    ):
 824                                        # update the error
 825
 826                                        self.set_last_error(error, mass_spectrum_obj)
 827
 828                                        # isotopologue_formula.mz_error = error
 829
 830                                        # isotopologue_formula.area_error = area_error
 831
 832                                        # isotopologue_formula.abundance_error = abundance_error
 833
 834                                        isotopologue_formula.mspeak_index_mono_isotopic = ms_peak.index
 835
 836                                        mono_isotopic_formula_index = len(ms_peak)
 837
 838                                        isotopologue_formula.mspeak_index_mono_isotopic = ms_peak.index
 839
 840                                        isotopologue_formula.mono_isotopic_formula_index = mono_isotopic_formula_index
 841
 842                                        # add mspeaks isotopologue index to the mono isotopic MolecularFormula obj and the respective formula position
 843
 844                                        # add molecular formula match to ms_peak
 845                                        x = ms_peak_iso.add_molecular_formula(
 846                                            isotopologue_formula
 847                                        )
 848
 849                                        molecular_formula.mspeak_mf_isotopologues_indexes.append(
 850                                            (ms_peak_iso.index, x)
 851                                        )
 852                                        # add mspeaks mono isotopic index to the isotopologue MolecularFormula obj
 853
 854                    y = ms_peak.add_molecular_formula(molecular_formula)
 855
 856                    mspeak_assigned_index.append((ms_peak.index, y))
 857
 858        return mspeak_assigned_index
 859
 860
 861class SearchMolecularFormulasLC:
 862    """Class for searching molecular formulas in a LC object.
 863
 864    Parameters
 865    ----------
 866    lcms_obj : LCMSBase
 867        The LCMSBase object.
 868    sql_db : MolForm_SQL, optional
 869        The SQL database object, by default None.
 870    first_hit : bool, optional
 871        Flag to indicate whether to skip peaks that already have a molecular formula assigned, by default False.
 872    find_isotopologues : bool, optional
 873        Flag to indicate whether to find isotopologues, by default True.
 874
 875    Methods
 876    -------
 877
 878    * search_spectra_against_candidates().
 879        Search a list of mass spectra against a list of candidate formulas with a given ion type and charge.
 880    * bulk_run_molecular_formula_search().
 881        Run the molecular formula search on the given list of mass spectra.
 882        Pulls the settings from the LCMSBase object to set ion type and charge to search for. 
 883    * run_mass_feature_search().
 884        Run the molecular formula search on mass features.
 885        Calls bulk_run_molecular_formula_search() with specified mass spectra and mass peaks.
 886    * run_untargeted_worker_ms1().
 887        Run untargeted molecular formula search on the ms1 mass spectrum.
 888        DEPRECATED: use run_mass_feature_search() or bulk_run_molecular_formula_search() instead.
 889    * run_target_worker_ms1().
 890        Run targeted molecular formula search on the ms1 mass spectrum.
 891        DEPRECATED: use run_mass_feature_search() or bulk_run_molecular_formula_search() instead.
 892    """
 893
 894    def __init__(self, lcms_obj, sql_db=None, first_hit=False, find_isotopologues=True):
 895        self.first_hit = first_hit
 896
 897        self.find_isotopologues = find_isotopologues
 898
 899        self.lcms_obj = lcms_obj
 900
 901        if not sql_db:
 902            self.sql_db = MolForm_SQL(
 903                url=self.lcms_obj.parameters.mass_spectrum['ms1'].molecular_search.url_database
 904            )
 905
 906        else:
 907            self.sql_db = sql_db
 908
 909    def search_spectra_against_candidates(self, mass_spectrum_list, ms_peaks_list, candidate_formulas, ion_type, ion_charge):
 910        """Search a list of mass spectra against a list of candidate formulas with a given ion type and charge.
 911
 912        Parameters
 913        ----------
 914        mass_spectrum_list : list of MassSpectrum
 915            The list of mass spectra to perform the search on.
 916        ms_peaks_list : list of lists of MSPeak objects
 917            The list of mass spectrum peaks to search within each mass spectrum.
 918        candidate_formulas : dict
 919            The candidate formulas.
 920        ion_type : str
 921            The ion type.
 922        ion_charge : int
 923            The ion charge, either 1 or -1.
 924
 925        Notes
 926        -----
 927        This function is designed to be used with the bulk_run_molecular_formula_search function.
 928        """
 929        for mass_spectrum, ms_peaks in zip(mass_spectrum_list, ms_peaks_list):
 930            single_ms_search = SearchMolecularFormulas(
 931                mass_spectrum,
 932                sql_db=self.sql_db,
 933                first_hit=self.first_hit,
 934                find_isotopologues=self.find_isotopologues,
 935            )
 936            single_ms_search.run_search(
 937                ms_peaks,
 938                candidate_formulas,
 939                mass_spectrum.min_abundance,
 940                ion_type,
 941                ion_charge,
 942            )
 943
 944    def bulk_run_molecular_formula_search(self, mass_spectrum_list, ms_peaks_list, mass_spectrum_setting_key='ms1'):
 945        """Run the molecular formula search on the given list of mass spectra
 946
 947        Parameters
 948        ----------
 949        mass_spectrum_list : list of MassSpectrum
 950            The list of mass spectra to search.
 951        ms_peaks_list : list of lists of MSPeak objects 
 952            The mass peaks to perform molecular formula search within each mass spectrum
 953        mass_spectrum_setting_key : str, optional
 954            The mass spectrum setting key, by default 'ms1'.
 955            This is used to get the appropriate molecular search settings from the LCMSBase object
 956        """
 957        # Set min_abundance and nominal_mzs
 958        if self.lcms_obj.polarity == "positive":
 959            ion_charge = 1
 960        elif self.lcms_obj.polarity == "negative":
 961            ion_charge = -1
 962        else:
 963            raise ValueError("Polarity must be either 'positive' or 'negative'")
 964        
 965        # Check that the length of the mass spectrum list and the ms_peaks list are the same
 966        if len(mass_spectrum_list) != len(ms_peaks_list):
 967            raise ValueError("The length of the mass spectrum list and the ms_peaks list must be the same")
 968        
 969        nominal_mzs = [x.nominal_mz for x in mass_spectrum_list]
 970        nominal_mzs = list(set([item for sublist in nominal_mzs for item in sublist]))
 971        verbose = self.lcms_obj.parameters.mass_spectrum[mass_spectrum_setting_key].molecular_search.verbose_processing 
 972
 973        # reset average error, only relevant if average mass error method is being used
 974        SearchMolecularFormulaWorker(
 975            find_isotopologues=self.find_isotopologues
 976        ).reset_error(mass_spectrum_list[0])
 977
 978        # check database for all possible molecular formula combinations based on the setting passed to self.mass_spectrum_obj.molecular_search_settings
 979        classes = MolecularCombinations(self.sql_db).runworker(
 980            self.lcms_obj.parameters.mass_spectrum[mass_spectrum_setting_key].molecular_search,
 981            print_time=self.lcms_obj.parameters.mass_spectrum[mass_spectrum_setting_key].molecular_search.verbose_processing
 982        )
 983        
 984        # split the database load to not blowout the memory
 985        for classe_chunk in chunks(
 986            classes, self.lcms_obj.parameters.mass_spectrum[mass_spectrum_setting_key].molecular_search.db_chunk_size
 987        ):
 988            classes_str_list = [class_tuple[0] for class_tuple in classe_chunk]
 989
 990            # load the molecular formula objs binned by ion type and heteroatoms classes, {ion type:{classe:[list_formula]}}
 991            # for adduct ion type a third key is added {atoms:{ion type:{classe:[list_formula]}}}
 992            dict_res = SearchMolecularFormulas.database_to_dict(
 993                classes_str_list,
 994                nominal_mzs,
 995                self.lcms_obj.parameters.mass_spectrum[mass_spectrum_setting_key].molecular_search,
 996                ion_charge,
 997            )
 998
 999            pbar = tqdm.tqdm(classe_chunk, disable = not verbose)
1000            for classe_tuple in pbar:
1001                # class string is a json serialized dict
1002                classe_str = classe_tuple[0]
1003
1004                # Perform search for (de)protonated ion type
1005                if self.lcms_obj.parameters.mass_spectrum[mass_spectrum_setting_key].molecular_search.isProtonated:
1006                    ion_type = Labels.protonated_de_ion
1007
1008                    pbar.set_description_str(
1009                        desc="Started molecular formula search for class %s, (de)protonated "
1010                        % classe_str,
1011                        refresh=True,
1012                    )
1013
1014                    candidate_formulas = dict_res.get(ion_type).get(classe_str)
1015
1016                    if candidate_formulas:
1017                        self.search_spectra_against_candidates(
1018                            mass_spectrum_list=mass_spectrum_list,
1019                            ms_peaks_list=ms_peaks_list,
1020                            candidate_formulas=candidate_formulas,
1021                            ion_type=ion_type,
1022                            ion_charge=ion_charge
1023                        )
1024
1025
1026                # Perform search for radical ion type
1027                if self.lcms_obj.parameters.mass_spectrum[mass_spectrum_setting_key].molecular_search.isRadical:
1028                    pbar.set_description_str(
1029                        desc="Started molecular formula search for class %s, radical "
1030                        % classe_str,
1031                        refresh=True,
1032                    )
1033
1034                    ion_type = Labels.radical_ion
1035
1036                    candidate_formulas = dict_res.get(ion_type).get(classe_str)
1037
1038                    if candidate_formulas:
1039                        self.search_spectra_against_candidates(
1040                            mass_spectrum_list=mass_spectrum_list,
1041                            ms_peaks_list=ms_peaks_list,
1042                            candidate_formulas=candidate_formulas,
1043                            ion_type=ion_type,
1044                            ion_charge=ion_charge
1045                        )
1046
1047                # Perform search for adduct ion type
1048                # looks for adduct, used_atom_valences should be 0
1049                # this code does not support H exchance by halogen atoms
1050                if self.lcms_obj.parameters.mass_spectrum[mass_spectrum_setting_key].molecular_search.isAdduct:
1051                    pbar.set_description_str(
1052                        desc="Started molecular formula search for class %s, adduct "
1053                        % classe_str,
1054                        refresh=True,
1055                    )
1056
1057                    ion_type = Labels.adduct_ion
1058                    dict_atoms_formulas = dict_res.get(ion_type)
1059
1060                    for adduct_atom, dict_by_class in dict_atoms_formulas.items():
1061                        candidate_formulas = dict_by_class.get(classe_str)
1062
1063                        if candidate_formulas:
1064                            self.search_spectra_against_candidates(
1065                                mass_spectrum_list=mass_spectrum_list,
1066                                ms_peaks_list=ms_peaks_list,
1067                                candidate_formulas=candidate_formulas,
1068                                ion_type=ion_type,
1069                                ion_charge=ion_charge
1070                            )
1071        self.sql_db.close()
1072        
1073    def run_mass_feature_search(self):
1074        """Run the molecular formula search on the mass features.
1075        
1076        Calls bulk_run_molecular_formula_search() with specified mass spectra and mass peaks.
1077        """
1078        mass_features_df = self.lcms_obj.mass_features_to_df()
1079
1080        # Get the list of mass spectrum (and peaks to search with each mass spectrum) for all mass features
1081        scan_list = mass_features_df.apex_scan.unique()
1082        mass_spectrum_list = [self.lcms_obj._ms[x] for x in scan_list]
1083        ms_peaks = []
1084        for scan in scan_list:
1085            mf_df_scan = mass_features_df[mass_features_df.apex_scan == scan]
1086            peaks_to_search = [
1087                self.lcms_obj.mass_features[x].ms1_peak for x in mf_df_scan.index.tolist()
1088            ]
1089            ms_peaks.append(peaks_to_search)
1090        
1091        # Run the molecular formula search
1092        self.bulk_run_molecular_formula_search(mass_spectrum_list, ms_peaks)
1093    
1094    def run_untargeted_worker_ms1(self):
1095        """Run untargeted molecular formula search on the ms1 mass spectrum."""
1096        raise NotImplementedError("run_untargeted_worker_ms1 search is not implemented in CoreMS 3.0 and greater")
1097
1098    def run_target_worker_ms1(self):
1099        """Run targeted molecular formula search on the ms1 mass spectrum."""
1100        raise NotImplementedError("run_target_worker_ms1 formula search is not yet implemented in CoreMS 3.0 and greater")
last_error = 0
last_dif = 0
closest_error = 0
error_average = 0
nbValues = 0
class SearchMolecularFormulas:
 27class SearchMolecularFormulas:
 28    """Class for searching molecular formulas in a mass spectrum.
 29
 30    Parameters
 31    ----------
 32    mass_spectrum_obj : MassSpectrum
 33        The mass spectrum object.
 34    sql_db : MolForm_SQL, optional
 35        The SQL database object, by default None.
 36    first_hit : bool, optional
 37        Flag to indicate whether to skip peaks that already have a molecular formula assigned, by default False.
 38    find_isotopologues : bool, optional
 39        Flag to indicate whether to find isotopologues, by default True.
 40
 41    Attributes
 42    ----------
 43    mass_spectrum_obj : MassSpectrum
 44        The mass spectrum object.
 45    sql_db : MolForm_SQL
 46        The SQL database object.
 47    first_hit : bool
 48        Flag to indicate whether to skip peaks that already have a molecular formula assigned.
 49    find_isotopologues : bool
 50        Flag to indicate whether to find isotopologues.
 51
 52
 53    Methods
 54    -------
 55    * run_search().
 56        Run the molecular formula search.
 57    * run_worker_mass_spectrum().
 58        Run the molecular formula search on the mass spectrum object.
 59    * run_worker_ms_peaks().
 60        Run the molecular formula search on the given list of mass spectrum peaks.
 61    * database_to_dict().
 62        Convert the database results to a dictionary.
 63    * run_molecular_formula().
 64        Run the molecular formula search on the given list of mass spectrum peaks.
 65    * search_mol_formulas().
 66        Search for molecular formulas in the mass spectrum.
 67
 68    """
 69
 70    def __init__(
 71        self,
 72        mass_spectrum_obj,
 73        sql_db=None,
 74        first_hit: bool = False,
 75        find_isotopologues: bool = True,
 76    ):
 77        self.first_hit = first_hit
 78
 79        self.find_isotopologues = find_isotopologues
 80
 81        self.mass_spectrum_obj = mass_spectrum_obj
 82
 83        if not sql_db:
 84            self.sql_db = MolForm_SQL(
 85                url=mass_spectrum_obj.molecular_search_settings.url_database
 86            )
 87
 88        else:
 89            self.sql_db = sql_db
 90
 91    def __enter__(self):
 92        """Open the SQL database connection."""
 93        return self
 94
 95    def __exit__(self, exc_type, exc_val, exc_tb):
 96        """Close the SQL database connection."""
 97        self.sql_db.close()
 98
 99        return False
100
101    def run_search(
102        self,
103        mspeaks: list,
104        query: dict,
105        min_abundance: float,
106        ion_type: str,
107        ion_charge: int,
108        adduct_atom=None,
109    ):
110        """Run the molecular formula search.
111
112        Parameters
113        ----------
114        mspeaks : list of MSPeak
115            The list of mass spectrum peaks.
116        query : dict
117            The query dictionary containing the possible molecular formulas.
118        min_abundance : float
119            The minimum abundance threshold.
120        ion_type : str
121            The ion type.
122        ion_charge : int
123            The ion charge.
124        adduct_atom : str, optional
125            The adduct atom, by default None.
126        """
127
128        def get_formulas(nominal_overlay: float = 0.1):
129            """
130            Get the list of formulas based on the nominal overlay.
131
132            Parameters
133            ----------
134            nominal_overlay : float, optional
135                The nominal overlay, by default 0.1.
136
137            Returns
138            -------
139            list
140                The list of formulas.
141            """
142            nominal_mz = ms_peak.nominal_mz_exp
143
144            defect_mass = ms_peak.mz_exp - nominal_mz
145            nominal_masses = [nominal_mz]
146
147            if (defect_mass) >= 1 - nominal_overlay:
148                nominal_masses.append(nominal_mz + 1)
149            elif (defect_mass) <= nominal_overlay:
150                nominal_masses.append(nominal_mz - 1)
151
152            list_formulas_candidates = []
153
154            for nominal_mass in nominal_masses:
155                if nominal_mass in query.keys():
156                    list_formulas_candidates.extend(query.get(nominal_mass))
157
158            return list_formulas_candidates
159
160        all_assigned_indexes = list()
161
162        # molecular_search_settings = self.mass_spectrum_obj.molecular_search_settings
163
164        search_molfrom = SearchMolecularFormulaWorker(
165            find_isotopologues=self.find_isotopologues
166        )
167
168        for ms_peak in mspeaks:
169            # already assigned a molecular formula
170            if self.first_hit:
171                if ms_peak.is_assigned:
172                    continue
173
174            ms_peak_indexes = search_molfrom.find_formulas(
175                get_formulas(),
176                min_abundance,
177                self.mass_spectrum_obj,
178                ms_peak,
179                ion_type,
180                ion_charge,
181                adduct_atom,
182            )
183
184            all_assigned_indexes.extend(ms_peak_indexes)
185
186        # all_assigned_indexes = MolecularFormulaSearchFilters().filter_isotopologue(all_assigned_indexes, self.mass_spectrum_obj)
187
188        # all_assigned_indexes = MolecularFormulaSearchFilters().filter_kendrick(all_assigned_indexes, self.mass_spectrum_obj)
189
190        # MolecularFormulaSearchFilters().check_min_peaks(all_assigned_indexes, self.mass_spectrum_obj)
191        # filter per min peaks per mono isotopic class
192
193    def run_worker_mass_spectrum(self):
194        """Run the molecular formula search on the mass spectrum object."""
195        self.run_molecular_formula(
196            self.mass_spectrum_obj.sort_by_abundance(), 
197            print_time=self.mass_spectrum_obj.molecular_search_settings.verbose_processing
198            )
199
200    def run_worker_ms_peaks(self, ms_peaks):
201        """Run the molecular formula search on the given list of mass spectrum peaks.
202
203        Parameters
204        ----------
205        ms_peaks : list of MSPeak
206            The list of mass spectrum peaks.
207        """
208        self.run_molecular_formula(
209            ms_peaks,
210            print_time=self.mass_spectrum_obj.molecular_search_settings.verbose_processing
211        )
212
213    @staticmethod
214    def database_to_dict(classe_str_list, nominal_mzs, mf_search_settings, ion_charge):
215        """Convert the database results to a dictionary.
216
217        Parameters
218        ----------
219        classe_str_list : list
220            The list of class strings.
221        nominal_mzs : list
222            The list of nominal m/z values.
223        mf_search_settings : MolecularFormulaSearchSettings
224            The molecular formula search settings.
225        ion_charge : int
226            The ion charge.
227
228        Returns
229        -------
230        dict
231            The dictionary containing the database results.
232        """
233        sql_db = MolForm_SQL(url=mf_search_settings.url_database)
234
235        dict_res = {}
236
237        if mf_search_settings.isProtonated:
238            dict_res[Labels.protonated_de_ion] = sql_db.get_dict_by_classes(
239                classe_str_list,
240                Labels.protonated_de_ion,
241                nominal_mzs,
242                ion_charge,
243                mf_search_settings,
244            )
245
246        if mf_search_settings.isRadical:
247            dict_res[Labels.radical_ion] = sql_db.get_dict_by_classes(
248                classe_str_list,
249                Labels.radical_ion,
250                nominal_mzs,
251                ion_charge,
252                mf_search_settings,
253            )
254
255        if mf_search_settings.isAdduct:
256            adduct_list = (
257                mf_search_settings.adduct_atoms_neg
258                if ion_charge < 0
259                else mf_search_settings.adduct_atoms_pos
260            )
261            dict_res[Labels.adduct_ion] = sql_db.get_dict_by_classes(
262                classe_str_list,
263                Labels.adduct_ion,
264                nominal_mzs,
265                ion_charge,
266                mf_search_settings,
267                adducts=adduct_list,
268            )
269
270        return dict_res
271
272    @timeit(print_time=True)
273    def run_molecular_formula(self, ms_peaks, **kwargs):
274        """Run the molecular formula search on the given list of mass spectrum peaks.
275
276        Parameters
277        ----------
278        ms_peaks : list of MSPeak
279            The list of mass spectrum peaks.
280        **kwargs
281            Additional keyword arguments. 
282            Most notably, print_time, which is a boolean flag to indicate whether to print the time 
283            and passed to the timeit decorator.
284        """
285        ion_charge = self.mass_spectrum_obj.polarity
286        min_abundance = self.mass_spectrum_obj.min_abundance
287        nominal_mzs = self.mass_spectrum_obj.nominal_mz
288
289        verbose = self.mass_spectrum_obj.molecular_search_settings.verbose_processing
290        # reset average error, only relevant is average mass error method is being used
291        SearchMolecularFormulaWorker(
292            find_isotopologues=self.find_isotopologues
293        ).reset_error(self.mass_spectrum_obj)
294
295        # check database for all possible molecular formula combinations based on the setting passed to self.mass_spectrum_obj.molecular_search_settings
296        classes = MolecularCombinations(self.sql_db).runworker(
297            self.mass_spectrum_obj.molecular_search_settings,
298            print_time=self.mass_spectrum_obj.molecular_search_settings.verbose_processing
299        )
300
301        # split the database load to not blowout the memory
302        # TODO add to the settings
303        for classe_chunk in chunks(
304            classes, self.mass_spectrum_obj.molecular_search_settings.db_chunk_size
305        ):
306            classes_str_list = [class_tuple[0] for class_tuple in classe_chunk]
307
308            # load the molecular formula objs binned by ion type and heteroatoms classes, {ion type:{classe:[list_formula]}}
309            # for adduct ion type a third key is added {atoms:{ion type:{classe:[list_formula]}}}
310            dict_res = self.database_to_dict(
311                classes_str_list,
312                nominal_mzs,
313                self.mass_spectrum_obj.molecular_search_settings,
314                ion_charge,
315            )
316            pbar = tqdm.tqdm(classe_chunk, disable = not verbose)
317            for classe_tuple in pbar:
318                # class string is a json serialized dict
319                classe_str = classe_tuple[0]
320                classe_dict = classe_tuple[1]
321
322                if self.mass_spectrum_obj.molecular_search_settings.isProtonated:
323                    ion_type = Labels.protonated_de_ion
324                    if verbose:
325                        pbar.set_description_str(
326                            desc="Started molecular formula search for class %s, (de)protonated "
327                            % classe_str,
328                            refresh=True,
329                        )
330
331                    candidate_formulas = dict_res.get(ion_type).get(classe_str)
332
333                    if candidate_formulas:
334                        self.run_search(
335                            ms_peaks,
336                            candidate_formulas,
337                            min_abundance,
338                            ion_type,
339                            ion_charge,
340                        )
341
342                if self.mass_spectrum_obj.molecular_search_settings.isRadical:
343                    if verbose:
344                        pbar.set_description_str(
345                            desc="Started molecular formula search for class %s, radical "
346                            % classe_str,
347                            refresh=True,
348                        )
349
350                    ion_type = Labels.radical_ion
351
352                    candidate_formulas = dict_res.get(ion_type).get(classe_str)
353
354                    if candidate_formulas:
355                        self.run_search(
356                            ms_peaks,
357                            candidate_formulas,
358                            min_abundance,
359                            ion_type,
360                            ion_charge,
361                        )
362                # looks for adduct, used_atom_valences should be 0
363                # this code does not support H exchance by halogen atoms
364                if self.mass_spectrum_obj.molecular_search_settings.isAdduct:
365                    if verbose:
366                        pbar.set_description_str(
367                            desc="Started molecular formula search for class %s, adduct "
368                            % classe_str,
369                            refresh=True,
370                        )
371
372                    ion_type = Labels.adduct_ion
373                    dict_atoms_formulas = dict_res.get(ion_type)
374
375                    for adduct_atom, dict_by_class in dict_atoms_formulas.items():
376                        candidate_formulas = dict_by_class.get(classe_str)
377
378                        if candidate_formulas:
379                            self.run_search(
380                                ms_peaks,
381                                candidate_formulas,
382                                min_abundance,
383                                ion_type,
384                                ion_charge,
385                                adduct_atom=adduct_atom,
386                            )
387        self.sql_db.close()
388
389    def search_mol_formulas(
390        self,
391        possible_formulas_list: List[MolecularFormula],
392        ion_type: str,
393        neutral_molform=True,
394        find_isotopologues=True,
395        adduct_atom=None,
396    ) -> List[_MSPeak]:
397        """Search for molecular formulas in the mass spectrum.
398
399        Parameters
400        ----------
401        possible_formulas_list : list of MolecularFormula
402            The list of possible molecular formulas.
403        ion_type : str
404            The ion type.
405        neutral_molform : bool, optional
406            Flag to indicate whether the molecular formulas are neutral, by default True.
407        find_isotopologues : bool, optional
408            Flag to indicate whether to find isotopologues, by default True.
409        adduct_atom : str, optional
410            The adduct atom, by default None.
411
412        Returns
413        -------
414        list of MSPeak
415            The list of mass spectrum peaks with assigned molecular formulas.
416        """
417        # neutral_molform: some reference files already present the formula on ion mode, for instance, bruker reference files
418        #    if that is the case than turn neutral_molform off
419
420        SearchMolecularFormulaWorker(find_isotopologues=find_isotopologues).reset_error(
421            self.mass_spectrum_obj
422        )
423
424        initial_min_peak_bool = (
425            self.mass_spectrum_obj.molecular_search_settings.use_min_peaks_filter
426        )
427        initial_runtime_kendrick_filter = (
428            self.mass_spectrum_obj.molecular_search_settings.use_runtime_kendrick_filter
429        )
430
431        # Are the following 3 lines redundant?
432        self.mass_spectrum_obj.molecular_search_settings.use_min_peaks_filter = False
433        self.mass_spectrum_obj.molecular_search_settings.use_min_peaks_filter = (
434            False  # TODO check this line
435        )
436        self.mass_spectrum_obj.molecular_search_settings.use_runtime_kendrick_filter = (
437            False
438        )
439
440        possible_formulas_dict_nm = {}
441
442        for mf in possible_formulas_list:
443            if neutral_molform:
444                nm = int(mf.protonated_mz)
445            else:
446                nm = int(mf.mz_nominal_calc)
447
448            if nm in possible_formulas_dict_nm.keys():
449                possible_formulas_dict_nm[nm].append(mf)
450
451            else:
452                possible_formulas_dict_nm[nm] = [mf]
453
454        min_abundance = self.mass_spectrum_obj.min_abundance
455
456        ion_type = ion_type
457
458        self.run_search(
459            self.mass_spectrum_obj,
460            possible_formulas_dict_nm,
461            min_abundance,
462            ion_type,
463            self.mass_spectrum_obj.polarity,
464            adduct_atom=adduct_atom,
465        )
466
467        self.mass_spectrum_obj.molecular_search_settings.use_min_peaks_filter = (
468            initial_min_peak_bool
469        )
470        self.mass_spectrum_obj.molecular_search_settings.use_runtime_kendrick_filter = (
471            initial_runtime_kendrick_filter
472        )
473
474        mspeaks = [mspeak for mspeak in self.mass_spectrum_obj if mspeak.is_assigned]
475
476        self.sql_db.close()
477
478        return mspeaks

Class for searching molecular formulas in a mass spectrum.

Parameters
  • mass_spectrum_obj (MassSpectrum): The mass spectrum object.
  • sql_db (MolForm_SQL, optional): The SQL database object, by default None.
  • first_hit (bool, optional): Flag to indicate whether to skip peaks that already have a molecular formula assigned, by default False.
  • find_isotopologues (bool, optional): Flag to indicate whether to find isotopologues, by default True.
Attributes
  • mass_spectrum_obj (MassSpectrum): The mass spectrum object.
  • sql_db (MolForm_SQL): The SQL database object.
  • first_hit (bool): Flag to indicate whether to skip peaks that already have a molecular formula assigned.
  • find_isotopologues (bool): Flag to indicate whether to find isotopologues.
Methods
  • run_search(). Run the molecular formula search.
  • run_worker_mass_spectrum(). Run the molecular formula search on the mass spectrum object.
  • run_worker_ms_peaks(). Run the molecular formula search on the given list of mass spectrum peaks.
  • database_to_dict(). Convert the database results to a dictionary.
  • run_molecular_formula(). Run the molecular formula search on the given list of mass spectrum peaks.
  • search_mol_formulas(). Search for molecular formulas in the mass spectrum.
SearchMolecularFormulas( mass_spectrum_obj, sql_db=None, first_hit: bool = False, find_isotopologues: bool = True)
70    def __init__(
71        self,
72        mass_spectrum_obj,
73        sql_db=None,
74        first_hit: bool = False,
75        find_isotopologues: bool = True,
76    ):
77        self.first_hit = first_hit
78
79        self.find_isotopologues = find_isotopologues
80
81        self.mass_spectrum_obj = mass_spectrum_obj
82
83        if not sql_db:
84            self.sql_db = MolForm_SQL(
85                url=mass_spectrum_obj.molecular_search_settings.url_database
86            )
87
88        else:
89            self.sql_db = sql_db
first_hit
find_isotopologues
mass_spectrum_obj
def run_worker_mass_spectrum(self):
193    def run_worker_mass_spectrum(self):
194        """Run the molecular formula search on the mass spectrum object."""
195        self.run_molecular_formula(
196            self.mass_spectrum_obj.sort_by_abundance(), 
197            print_time=self.mass_spectrum_obj.molecular_search_settings.verbose_processing
198            )

Run the molecular formula search on the mass spectrum object.

def run_worker_ms_peaks(self, ms_peaks):
200    def run_worker_ms_peaks(self, ms_peaks):
201        """Run the molecular formula search on the given list of mass spectrum peaks.
202
203        Parameters
204        ----------
205        ms_peaks : list of MSPeak
206            The list of mass spectrum peaks.
207        """
208        self.run_molecular_formula(
209            ms_peaks,
210            print_time=self.mass_spectrum_obj.molecular_search_settings.verbose_processing
211        )

Run the molecular formula search on the given list of mass spectrum peaks.

Parameters
  • ms_peaks (list of MSPeak): The list of mass spectrum peaks.
@staticmethod
def database_to_dict(classe_str_list, nominal_mzs, mf_search_settings, ion_charge):
213    @staticmethod
214    def database_to_dict(classe_str_list, nominal_mzs, mf_search_settings, ion_charge):
215        """Convert the database results to a dictionary.
216
217        Parameters
218        ----------
219        classe_str_list : list
220            The list of class strings.
221        nominal_mzs : list
222            The list of nominal m/z values.
223        mf_search_settings : MolecularFormulaSearchSettings
224            The molecular formula search settings.
225        ion_charge : int
226            The ion charge.
227
228        Returns
229        -------
230        dict
231            The dictionary containing the database results.
232        """
233        sql_db = MolForm_SQL(url=mf_search_settings.url_database)
234
235        dict_res = {}
236
237        if mf_search_settings.isProtonated:
238            dict_res[Labels.protonated_de_ion] = sql_db.get_dict_by_classes(
239                classe_str_list,
240                Labels.protonated_de_ion,
241                nominal_mzs,
242                ion_charge,
243                mf_search_settings,
244            )
245
246        if mf_search_settings.isRadical:
247            dict_res[Labels.radical_ion] = sql_db.get_dict_by_classes(
248                classe_str_list,
249                Labels.radical_ion,
250                nominal_mzs,
251                ion_charge,
252                mf_search_settings,
253            )
254
255        if mf_search_settings.isAdduct:
256            adduct_list = (
257                mf_search_settings.adduct_atoms_neg
258                if ion_charge < 0
259                else mf_search_settings.adduct_atoms_pos
260            )
261            dict_res[Labels.adduct_ion] = sql_db.get_dict_by_classes(
262                classe_str_list,
263                Labels.adduct_ion,
264                nominal_mzs,
265                ion_charge,
266                mf_search_settings,
267                adducts=adduct_list,
268            )
269
270        return dict_res

Convert the database results to a dictionary.

Parameters
  • classe_str_list (list): The list of class strings.
  • nominal_mzs (list): The list of nominal m/z values.
  • mf_search_settings (MolecularFormulaSearchSettings): The molecular formula search settings.
  • ion_charge (int): The ion charge.
Returns
  • dict: The dictionary containing the database results.
def run_molecular_formula(*args, **kw):
27        def timed(*args, **kw):
28            # Extract print_time from kwargs if provided
29            local_print_time = kw.pop('print_time', print_time)
30            ts = time.time()
31            result = method(*args, **kw)
32            te = time.time()
33            if "log_time" in kw:
34                name = kw.get("log_name", method.__name__.upper())
35                kw["log_time"][name] = int((te - ts) * 1000)
36            elif local_print_time:
37                print("%r  %2.2f ms" % (method.__name__, (te - ts) * 1000))
38            return result

Run the molecular formula search on the given list of mass spectrum peaks.

Parameters
  • ms_peaks (list of MSPeak): The list of mass spectrum peaks.
  • **kwargs: Additional keyword arguments. Most notably, print_time, which is a boolean flag to indicate whether to print the time and passed to the timeit decorator.
def search_mol_formulas( self, possible_formulas_list: List[corems.molecular_formula.factory.MolecularFormulaFactory.MolecularFormula], ion_type: str, neutral_molform=True, find_isotopologues=True, adduct_atom=None) -> List[corems.ms_peak.factory.MSPeakClasses._MSPeak]:
389    def search_mol_formulas(
390        self,
391        possible_formulas_list: List[MolecularFormula],
392        ion_type: str,
393        neutral_molform=True,
394        find_isotopologues=True,
395        adduct_atom=None,
396    ) -> List[_MSPeak]:
397        """Search for molecular formulas in the mass spectrum.
398
399        Parameters
400        ----------
401        possible_formulas_list : list of MolecularFormula
402            The list of possible molecular formulas.
403        ion_type : str
404            The ion type.
405        neutral_molform : bool, optional
406            Flag to indicate whether the molecular formulas are neutral, by default True.
407        find_isotopologues : bool, optional
408            Flag to indicate whether to find isotopologues, by default True.
409        adduct_atom : str, optional
410            The adduct atom, by default None.
411
412        Returns
413        -------
414        list of MSPeak
415            The list of mass spectrum peaks with assigned molecular formulas.
416        """
417        # neutral_molform: some reference files already present the formula on ion mode, for instance, bruker reference files
418        #    if that is the case than turn neutral_molform off
419
420        SearchMolecularFormulaWorker(find_isotopologues=find_isotopologues).reset_error(
421            self.mass_spectrum_obj
422        )
423
424        initial_min_peak_bool = (
425            self.mass_spectrum_obj.molecular_search_settings.use_min_peaks_filter
426        )
427        initial_runtime_kendrick_filter = (
428            self.mass_spectrum_obj.molecular_search_settings.use_runtime_kendrick_filter
429        )
430
431        # Are the following 3 lines redundant?
432        self.mass_spectrum_obj.molecular_search_settings.use_min_peaks_filter = False
433        self.mass_spectrum_obj.molecular_search_settings.use_min_peaks_filter = (
434            False  # TODO check this line
435        )
436        self.mass_spectrum_obj.molecular_search_settings.use_runtime_kendrick_filter = (
437            False
438        )
439
440        possible_formulas_dict_nm = {}
441
442        for mf in possible_formulas_list:
443            if neutral_molform:
444                nm = int(mf.protonated_mz)
445            else:
446                nm = int(mf.mz_nominal_calc)
447
448            if nm in possible_formulas_dict_nm.keys():
449                possible_formulas_dict_nm[nm].append(mf)
450
451            else:
452                possible_formulas_dict_nm[nm] = [mf]
453
454        min_abundance = self.mass_spectrum_obj.min_abundance
455
456        ion_type = ion_type
457
458        self.run_search(
459            self.mass_spectrum_obj,
460            possible_formulas_dict_nm,
461            min_abundance,
462            ion_type,
463            self.mass_spectrum_obj.polarity,
464            adduct_atom=adduct_atom,
465        )
466
467        self.mass_spectrum_obj.molecular_search_settings.use_min_peaks_filter = (
468            initial_min_peak_bool
469        )
470        self.mass_spectrum_obj.molecular_search_settings.use_runtime_kendrick_filter = (
471            initial_runtime_kendrick_filter
472        )
473
474        mspeaks = [mspeak for mspeak in self.mass_spectrum_obj if mspeak.is_assigned]
475
476        self.sql_db.close()
477
478        return mspeaks

Search for molecular formulas in the mass spectrum.

Parameters
  • possible_formulas_list (list of MolecularFormula): The list of possible molecular formulas.
  • ion_type (str): The ion type.
  • neutral_molform (bool, optional): Flag to indicate whether the molecular formulas are neutral, by default True.
  • find_isotopologues (bool, optional): Flag to indicate whether to find isotopologues, by default True.
  • adduct_atom (str, optional): The adduct atom, by default None.
Returns
  • list of MSPeak: The list of mass spectrum peaks with assigned molecular formulas.
class SearchMolecularFormulaWorker:
481class SearchMolecularFormulaWorker:
482    """Class for searching molecular formulas in a mass spectrum.
483
484    Parameters
485    ----------
486    find_isotopologues : bool, optional
487        Flag to indicate whether to find isotopologues, by default True.
488
489    Attributes
490    ----------
491    find_isotopologues : bool
492        Flag to indicate whether to find isotopologues.
493
494    Methods
495    -------
496    * reset_error().
497        Reset the error variables.
498    * set_last_error().
499        Set the last error.
500    * find_formulas().
501        Find the formulas.
502    * calc_error().
503        Calculate the error.
504    """
505
506    # TODO add reset error function
507    # needs this wraper to pass the class to multiprocessing
508
509    def __init__(self, find_isotopologues=True):
510        self.find_isotopologues = find_isotopologues
511
512    def __call__(self, args):
513        """Call the find formulas function.
514
515        Parameters
516        ----------
517        args : tuple
518            The arguments.
519
520        Returns
521        -------
522        list
523            The list of mass spectrum peaks with assigned molecular formulas.
524        """
525        return self.find_formulas(*args)  # ,args[1]
526
527    def reset_error(self, mass_spectrum_obj):
528        """Reset the error variables.
529
530        Parameters
531        ----------
532        mass_spectrum_obj : MassSpectrum
533            The mass spectrum object.
534
535        Notes
536        -----
537        This function resets the error variables for the given mass spectrum object.
538        """
539        global last_error, last_dif, closest_error, error_average, nbValues
540        last_error, last_dif, closest_error, nbValues = 0.0, 0.0, 0.0, 0.0
541
542    def set_last_error(self, error, mass_spectrum_obj):
543        """Set the last error.
544
545        Parameters
546        ----------
547        error : float
548            The error.
549        mass_spectrum_obj : MassSpectrum
550            The mass spectrum object.
551        """
552        # set the changes to the global variables, not internal ones
553        global last_error, last_dif, closest_error, error_average, nbValues
554
555        if mass_spectrum_obj.molecular_search_settings.error_method == "distance":
556            dif = error - last_error
557            if dif < last_dif:
558                last_dif = dif
559                closest_error = error
560                mass_spectrum_obj.molecular_search_settings.min_ppm_error = (
561                    closest_error
562                    - mass_spectrum_obj.molecular_search_settings.mz_error_range
563                )
564                mass_spectrum_obj.molecular_search_settings.max_ppm_error = (
565                    closest_error
566                    + mass_spectrum_obj.molecular_search_settings.mz_error_range
567                )
568
569        elif mass_spectrum_obj.molecular_search_settings.error_method == "lowest":
570            if error < last_error:
571                mass_spectrum_obj.molecular_search_settings.min_ppm_error = (
572                    error - mass_spectrum_obj.molecular_search_settings.mz_error_range
573                )
574                mass_spectrum_obj.molecular_search_settings.max_ppm_error = (
575                    error + mass_spectrum_obj.molecular_search_settings.mz_error_range
576                )
577                last_error = error
578
579        elif mass_spectrum_obj.molecular_search_settings.error_method == "symmetrical":
580            mass_spectrum_obj.molecular_search_settings.min_ppm_error = (
581                mass_spectrum_obj.molecular_search_settings.mz_error_average
582                - mass_spectrum_obj.molecular_search_settings.mz_error_range
583            )
584            mass_spectrum_obj.molecular_search_settings.max_ppm_error = (
585                mass_spectrum_obj.molecular_search_settings.mz_error_average
586                + mass_spectrum_obj.molecular_search_settings.mz_error_range
587            )
588
589        elif mass_spectrum_obj.molecular_search_settings.error_method == "average":
590            nbValues += 1
591            error_average = error_average + ((error - error_average) / nbValues)
592            mass_spectrum_obj.molecular_search_settings.min_ppm_error = (
593                error_average
594                - mass_spectrum_obj.molecular_search_settings.mz_error_range
595            )
596            mass_spectrum_obj.molecular_search_settings.max_ppm_error = (
597                error_average
598                + mass_spectrum_obj.molecular_search_settings.mz_error_range
599            )
600
601        else:
602            # using set mass_spectrum_obj.molecular_search_settings.min_ppm_error  and max_ppm_error range
603            pass
604
605        # returns the error based on the selected method at mass_spectrum_obj.molecular_search_settings.method
606
607    @staticmethod
608    def calc_error(mz_exp, mz_calc, method="ppm"):
609        """Calculate the error.
610
611        Parameters
612        ----------
613        mz_exp : float
614            The experimental m/z value.
615        mz_calc : float
616            The calculated m/z value.
617        method : str, optional
618            The method, by default 'ppm'.
619
620        Raises
621        -------
622        Exception
623            If the method is not ppm or ppb.
624
625        Returns
626        -------
627        float
628            The error.
629        """
630
631        if method == "ppm":
632            multi_factor = 1_000_000
633
634        elif method == "ppb":
635            multi_factor = 1_000_000_000
636
637        elif method == "perc":
638            multi_factor = 100
639
640        else:
641            raise Exception(
642                "method needs to be ppm or ppb, you have entered %s" % method
643            )
644
645        if mz_exp:
646            return ((mz_exp - mz_calc) / mz_calc) * multi_factor
647
648        else:
649            raise Exception("Please set mz_calc first")
650
651    def find_formulas(
652        self,
653        formulas,
654        min_abundance,
655        mass_spectrum_obj,
656        ms_peak,
657        ion_type,
658        ion_charge,
659        adduct_atom=None,
660    ):
661        """Find the formulas.
662
663        Parameters
664        ----------
665        formulas : list of MolecularFormula
666            The list of molecular formulas.
667        min_abundance : float
668            The minimum abundance threshold.
669        mass_spectrum_obj : MassSpectrum
670            The mass spectrum object.
671        ms_peak : MSPeak
672            The mass spectrum peak.
673        ion_type : str
674            The ion type.
675        ion_charge : int
676            The ion charge.
677        adduct_atom : str, optional
678            The adduct atom, by default None.
679
680        Returns
681        -------
682        list of MSPeak
683            The list of mass spectrum peaks with assigned molecular formulas.
684
685        Notes
686        -----
687        Uses the closest error the next search (this is not ideal, it needs to use confidence
688        metric to choose the right candidate then propagate the error using the error from the best candidate).
689        It needs to add s/n to the equation.
690        It need optimization to define the mz_error_range within a m/z unit since it is directly proportional
691        with the mass, and inversely proportional to the rp. It's not linear, i.e., sigma mass.
692        The idea it to correlate sigma to resolving power, signal to noise and sample complexity per mz unit.
693        Method='distance'
694        """
695        mspeak_assigned_index = list()
696
697        min_ppm_error = mass_spectrum_obj.molecular_search_settings.min_ppm_error
698        max_ppm_error = mass_spectrum_obj.molecular_search_settings.max_ppm_error
699
700        min_abun_error = mass_spectrum_obj.molecular_search_settings.min_abun_error
701        max_abun_error = mass_spectrum_obj.molecular_search_settings.max_abun_error
702
703        # f = open("abundance_error.txt", "a+")
704        ms_peak_mz_exp, ms_peak_abundance = ms_peak.mz_exp, ms_peak.abundance
705        # min_error = min([pmf.mz_error for pmf in possible_formulas])
706
707        def mass_by_ion_type(possible_formula_obj):
708            if ion_type == Labels.protonated_de_ion:
709                return possible_formula_obj._protonated_mz(ion_charge)
710
711            elif ion_type == Labels.radical_ion:
712                return possible_formula_obj._radical_mz(ion_charge)
713
714            elif ion_type == Labels.adduct_ion and adduct_atom:
715                return possible_formula._adduct_mz(ion_charge, adduct_atom)
716
717            else:
718                # will return externally calculated mz if is set, #use on Bruker Reference list import
719                # if the ion type is known the ion mass based on molecular formula ion type
720                # if ion type is unknow will return neutral mass
721                return possible_formula.mz_calc
722
723        if formulas:
724            if isinstance(formulas[0], LCMSLibRefMolecularFormula):
725                possible_mf_class = True
726
727            else:
728                possible_mf_class = False
729
730        for possible_formula in formulas:
731            if possible_formula:
732                error = self.calc_error(
733                    ms_peak_mz_exp, mass_by_ion_type(possible_formula)
734                )
735
736                # error = possible_formula.mz_error
737
738                if min_ppm_error <= error <= max_ppm_error:
739                    # update the error
740
741                    self.set_last_error(error, mass_spectrum_obj)
742
743                    # add molecular formula match to ms_peak
744
745                    # get molecular formula dict from sql obj
746                    # formula_dict = pickle.loads(possible_formula.mol_formula)
747                    # if possible_mf_class:
748
749                    #    molecular_formula = deepcopy(possible_formula)
750
751                    # else:
752
753                    formula_dict = possible_formula.to_dict()
754                    # create the molecular formula obj to be stored
755                    if possible_mf_class:
756                        molecular_formula = LCMSLibRefMolecularFormula(
757                            formula_dict,
758                            ion_charge,
759                            ion_type=ion_type,
760                            adduct_atom=adduct_atom,
761                        )
762
763                        molecular_formula.name = possible_formula.name
764                        molecular_formula.kegg_id = possible_formula.kegg_id
765                        molecular_formula.cas = possible_formula.cas
766
767                    else:
768                        molecular_formula = MolecularFormula(
769                            formula_dict,
770                            ion_charge,
771                            ion_type=ion_type,
772                            adduct_atom=adduct_atom,
773                        )
774                    # add the molecular formula obj to the mspeak obj
775                    # add the mspeak obj and it's index for tracking next assignment step
776
777                    if self.find_isotopologues:
778                        # calculates isotopologues
779                        isotopologues = molecular_formula.isotopologues(
780                            min_abundance,
781                            ms_peak_abundance,
782                            mass_spectrum_obj.dynamic_range,
783                        )
784
785                        # search for isotopologues
786                        for isotopologue_formula in isotopologues:
787                            molecular_formula.expected_isotopologues.append(
788                                isotopologue_formula
789                            )
790                            # move this outside to improve preformace
791                            # we need to increase the search space to -+1 m_z
792                            first_index, last_index = (
793                                mass_spectrum_obj.get_nominal_mz_first_last_indexes(
794                                    isotopologue_formula.mz_nominal_calc
795                                )
796                            )
797
798                            for ms_peak_iso in mass_spectrum_obj[
799                                first_index:last_index
800                            ]:
801                                error = self.calc_error(
802                                    ms_peak_iso.mz_exp, isotopologue_formula.mz_calc
803                                )
804
805                                if min_ppm_error <= error <= max_ppm_error:
806                                    # need to define error distribution for abundance measurements
807
808                                    # if mass_spectrum_obj.is_centroid:
809
810                                    abundance_error = self.calc_error(
811                                        isotopologue_formula.abundance_calc,
812                                        ms_peak_iso.abundance,
813                                        method="perc",
814                                    )
815
816                                    # area_error = self.calc_error(ms_peak.area, ms_peak_iso.area, method='perc')
817
818                                    # margin of error was set empirically/ needs statistical calculation
819                                    #  of margin of error for the measurement of the abundances
820                                    if (
821                                        min_abun_error
822                                        <= abundance_error
823                                        <= max_abun_error
824                                    ):
825                                        # update the error
826
827                                        self.set_last_error(error, mass_spectrum_obj)
828
829                                        # isotopologue_formula.mz_error = error
830
831                                        # isotopologue_formula.area_error = area_error
832
833                                        # isotopologue_formula.abundance_error = abundance_error
834
835                                        isotopologue_formula.mspeak_index_mono_isotopic = ms_peak.index
836
837                                        mono_isotopic_formula_index = len(ms_peak)
838
839                                        isotopologue_formula.mspeak_index_mono_isotopic = ms_peak.index
840
841                                        isotopologue_formula.mono_isotopic_formula_index = mono_isotopic_formula_index
842
843                                        # add mspeaks isotopologue index to the mono isotopic MolecularFormula obj and the respective formula position
844
845                                        # add molecular formula match to ms_peak
846                                        x = ms_peak_iso.add_molecular_formula(
847                                            isotopologue_formula
848                                        )
849
850                                        molecular_formula.mspeak_mf_isotopologues_indexes.append(
851                                            (ms_peak_iso.index, x)
852                                        )
853                                        # add mspeaks mono isotopic index to the isotopologue MolecularFormula obj
854
855                    y = ms_peak.add_molecular_formula(molecular_formula)
856
857                    mspeak_assigned_index.append((ms_peak.index, y))
858
859        return mspeak_assigned_index

Class for searching molecular formulas in a mass spectrum.

Parameters
  • find_isotopologues (bool, optional): Flag to indicate whether to find isotopologues, by default True.
Attributes
  • find_isotopologues (bool): Flag to indicate whether to find isotopologues.
Methods
  • reset_error(). Reset the error variables.
  • set_last_error(). Set the last error.
  • find_formulas(). Find the formulas.
  • calc_error(). Calculate the error.
SearchMolecularFormulaWorker(find_isotopologues=True)
509    def __init__(self, find_isotopologues=True):
510        self.find_isotopologues = find_isotopologues
find_isotopologues
def reset_error(self, mass_spectrum_obj):
527    def reset_error(self, mass_spectrum_obj):
528        """Reset the error variables.
529
530        Parameters
531        ----------
532        mass_spectrum_obj : MassSpectrum
533            The mass spectrum object.
534
535        Notes
536        -----
537        This function resets the error variables for the given mass spectrum object.
538        """
539        global last_error, last_dif, closest_error, error_average, nbValues
540        last_error, last_dif, closest_error, nbValues = 0.0, 0.0, 0.0, 0.0

Reset the error variables.

Parameters
  • mass_spectrum_obj (MassSpectrum): The mass spectrum object.
Notes

This function resets the error variables for the given mass spectrum object.

def set_last_error(self, error, mass_spectrum_obj):
542    def set_last_error(self, error, mass_spectrum_obj):
543        """Set the last error.
544
545        Parameters
546        ----------
547        error : float
548            The error.
549        mass_spectrum_obj : MassSpectrum
550            The mass spectrum object.
551        """
552        # set the changes to the global variables, not internal ones
553        global last_error, last_dif, closest_error, error_average, nbValues
554
555        if mass_spectrum_obj.molecular_search_settings.error_method == "distance":
556            dif = error - last_error
557            if dif < last_dif:
558                last_dif = dif
559                closest_error = error
560                mass_spectrum_obj.molecular_search_settings.min_ppm_error = (
561                    closest_error
562                    - mass_spectrum_obj.molecular_search_settings.mz_error_range
563                )
564                mass_spectrum_obj.molecular_search_settings.max_ppm_error = (
565                    closest_error
566                    + mass_spectrum_obj.molecular_search_settings.mz_error_range
567                )
568
569        elif mass_spectrum_obj.molecular_search_settings.error_method == "lowest":
570            if error < last_error:
571                mass_spectrum_obj.molecular_search_settings.min_ppm_error = (
572                    error - mass_spectrum_obj.molecular_search_settings.mz_error_range
573                )
574                mass_spectrum_obj.molecular_search_settings.max_ppm_error = (
575                    error + mass_spectrum_obj.molecular_search_settings.mz_error_range
576                )
577                last_error = error
578
579        elif mass_spectrum_obj.molecular_search_settings.error_method == "symmetrical":
580            mass_spectrum_obj.molecular_search_settings.min_ppm_error = (
581                mass_spectrum_obj.molecular_search_settings.mz_error_average
582                - mass_spectrum_obj.molecular_search_settings.mz_error_range
583            )
584            mass_spectrum_obj.molecular_search_settings.max_ppm_error = (
585                mass_spectrum_obj.molecular_search_settings.mz_error_average
586                + mass_spectrum_obj.molecular_search_settings.mz_error_range
587            )
588
589        elif mass_spectrum_obj.molecular_search_settings.error_method == "average":
590            nbValues += 1
591            error_average = error_average + ((error - error_average) / nbValues)
592            mass_spectrum_obj.molecular_search_settings.min_ppm_error = (
593                error_average
594                - mass_spectrum_obj.molecular_search_settings.mz_error_range
595            )
596            mass_spectrum_obj.molecular_search_settings.max_ppm_error = (
597                error_average
598                + mass_spectrum_obj.molecular_search_settings.mz_error_range
599            )
600
601        else:
602            # using set mass_spectrum_obj.molecular_search_settings.min_ppm_error  and max_ppm_error range
603            pass
604
605        # returns the error based on the selected method at mass_spectrum_obj.molecular_search_settings.method

Set the last error.

Parameters
  • error (float): The error.
  • mass_spectrum_obj (MassSpectrum): The mass spectrum object.
@staticmethod
def calc_error(mz_exp, mz_calc, method='ppm'):
607    @staticmethod
608    def calc_error(mz_exp, mz_calc, method="ppm"):
609        """Calculate the error.
610
611        Parameters
612        ----------
613        mz_exp : float
614            The experimental m/z value.
615        mz_calc : float
616            The calculated m/z value.
617        method : str, optional
618            The method, by default 'ppm'.
619
620        Raises
621        -------
622        Exception
623            If the method is not ppm or ppb.
624
625        Returns
626        -------
627        float
628            The error.
629        """
630
631        if method == "ppm":
632            multi_factor = 1_000_000
633
634        elif method == "ppb":
635            multi_factor = 1_000_000_000
636
637        elif method == "perc":
638            multi_factor = 100
639
640        else:
641            raise Exception(
642                "method needs to be ppm or ppb, you have entered %s" % method
643            )
644
645        if mz_exp:
646            return ((mz_exp - mz_calc) / mz_calc) * multi_factor
647
648        else:
649            raise Exception("Please set mz_calc first")

Calculate the error.

Parameters
  • mz_exp (float): The experimental m/z value.
  • mz_calc (float): The calculated m/z value.
  • method (str, optional): The method, by default 'ppm'.
Raises
  • Exception: If the method is not ppm or ppb.
Returns
  • float: The error.
def find_formulas( self, formulas, min_abundance, mass_spectrum_obj, ms_peak, ion_type, ion_charge, adduct_atom=None):
651    def find_formulas(
652        self,
653        formulas,
654        min_abundance,
655        mass_spectrum_obj,
656        ms_peak,
657        ion_type,
658        ion_charge,
659        adduct_atom=None,
660    ):
661        """Find the formulas.
662
663        Parameters
664        ----------
665        formulas : list of MolecularFormula
666            The list of molecular formulas.
667        min_abundance : float
668            The minimum abundance threshold.
669        mass_spectrum_obj : MassSpectrum
670            The mass spectrum object.
671        ms_peak : MSPeak
672            The mass spectrum peak.
673        ion_type : str
674            The ion type.
675        ion_charge : int
676            The ion charge.
677        adduct_atom : str, optional
678            The adduct atom, by default None.
679
680        Returns
681        -------
682        list of MSPeak
683            The list of mass spectrum peaks with assigned molecular formulas.
684
685        Notes
686        -----
687        Uses the closest error the next search (this is not ideal, it needs to use confidence
688        metric to choose the right candidate then propagate the error using the error from the best candidate).
689        It needs to add s/n to the equation.
690        It need optimization to define the mz_error_range within a m/z unit since it is directly proportional
691        with the mass, and inversely proportional to the rp. It's not linear, i.e., sigma mass.
692        The idea it to correlate sigma to resolving power, signal to noise and sample complexity per mz unit.
693        Method='distance'
694        """
695        mspeak_assigned_index = list()
696
697        min_ppm_error = mass_spectrum_obj.molecular_search_settings.min_ppm_error
698        max_ppm_error = mass_spectrum_obj.molecular_search_settings.max_ppm_error
699
700        min_abun_error = mass_spectrum_obj.molecular_search_settings.min_abun_error
701        max_abun_error = mass_spectrum_obj.molecular_search_settings.max_abun_error
702
703        # f = open("abundance_error.txt", "a+")
704        ms_peak_mz_exp, ms_peak_abundance = ms_peak.mz_exp, ms_peak.abundance
705        # min_error = min([pmf.mz_error for pmf in possible_formulas])
706
707        def mass_by_ion_type(possible_formula_obj):
708            if ion_type == Labels.protonated_de_ion:
709                return possible_formula_obj._protonated_mz(ion_charge)
710
711            elif ion_type == Labels.radical_ion:
712                return possible_formula_obj._radical_mz(ion_charge)
713
714            elif ion_type == Labels.adduct_ion and adduct_atom:
715                return possible_formula._adduct_mz(ion_charge, adduct_atom)
716
717            else:
718                # will return externally calculated mz if is set, #use on Bruker Reference list import
719                # if the ion type is known the ion mass based on molecular formula ion type
720                # if ion type is unknow will return neutral mass
721                return possible_formula.mz_calc
722
723        if formulas:
724            if isinstance(formulas[0], LCMSLibRefMolecularFormula):
725                possible_mf_class = True
726
727            else:
728                possible_mf_class = False
729
730        for possible_formula in formulas:
731            if possible_formula:
732                error = self.calc_error(
733                    ms_peak_mz_exp, mass_by_ion_type(possible_formula)
734                )
735
736                # error = possible_formula.mz_error
737
738                if min_ppm_error <= error <= max_ppm_error:
739                    # update the error
740
741                    self.set_last_error(error, mass_spectrum_obj)
742
743                    # add molecular formula match to ms_peak
744
745                    # get molecular formula dict from sql obj
746                    # formula_dict = pickle.loads(possible_formula.mol_formula)
747                    # if possible_mf_class:
748
749                    #    molecular_formula = deepcopy(possible_formula)
750
751                    # else:
752
753                    formula_dict = possible_formula.to_dict()
754                    # create the molecular formula obj to be stored
755                    if possible_mf_class:
756                        molecular_formula = LCMSLibRefMolecularFormula(
757                            formula_dict,
758                            ion_charge,
759                            ion_type=ion_type,
760                            adduct_atom=adduct_atom,
761                        )
762
763                        molecular_formula.name = possible_formula.name
764                        molecular_formula.kegg_id = possible_formula.kegg_id
765                        molecular_formula.cas = possible_formula.cas
766
767                    else:
768                        molecular_formula = MolecularFormula(
769                            formula_dict,
770                            ion_charge,
771                            ion_type=ion_type,
772                            adduct_atom=adduct_atom,
773                        )
774                    # add the molecular formula obj to the mspeak obj
775                    # add the mspeak obj and it's index for tracking next assignment step
776
777                    if self.find_isotopologues:
778                        # calculates isotopologues
779                        isotopologues = molecular_formula.isotopologues(
780                            min_abundance,
781                            ms_peak_abundance,
782                            mass_spectrum_obj.dynamic_range,
783                        )
784
785                        # search for isotopologues
786                        for isotopologue_formula in isotopologues:
787                            molecular_formula.expected_isotopologues.append(
788                                isotopologue_formula
789                            )
790                            # move this outside to improve preformace
791                            # we need to increase the search space to -+1 m_z
792                            first_index, last_index = (
793                                mass_spectrum_obj.get_nominal_mz_first_last_indexes(
794                                    isotopologue_formula.mz_nominal_calc
795                                )
796                            )
797
798                            for ms_peak_iso in mass_spectrum_obj[
799                                first_index:last_index
800                            ]:
801                                error = self.calc_error(
802                                    ms_peak_iso.mz_exp, isotopologue_formula.mz_calc
803                                )
804
805                                if min_ppm_error <= error <= max_ppm_error:
806                                    # need to define error distribution for abundance measurements
807
808                                    # if mass_spectrum_obj.is_centroid:
809
810                                    abundance_error = self.calc_error(
811                                        isotopologue_formula.abundance_calc,
812                                        ms_peak_iso.abundance,
813                                        method="perc",
814                                    )
815
816                                    # area_error = self.calc_error(ms_peak.area, ms_peak_iso.area, method='perc')
817
818                                    # margin of error was set empirically/ needs statistical calculation
819                                    #  of margin of error for the measurement of the abundances
820                                    if (
821                                        min_abun_error
822                                        <= abundance_error
823                                        <= max_abun_error
824                                    ):
825                                        # update the error
826
827                                        self.set_last_error(error, mass_spectrum_obj)
828
829                                        # isotopologue_formula.mz_error = error
830
831                                        # isotopologue_formula.area_error = area_error
832
833                                        # isotopologue_formula.abundance_error = abundance_error
834
835                                        isotopologue_formula.mspeak_index_mono_isotopic = ms_peak.index
836
837                                        mono_isotopic_formula_index = len(ms_peak)
838
839                                        isotopologue_formula.mspeak_index_mono_isotopic = ms_peak.index
840
841                                        isotopologue_formula.mono_isotopic_formula_index = mono_isotopic_formula_index
842
843                                        # add mspeaks isotopologue index to the mono isotopic MolecularFormula obj and the respective formula position
844
845                                        # add molecular formula match to ms_peak
846                                        x = ms_peak_iso.add_molecular_formula(
847                                            isotopologue_formula
848                                        )
849
850                                        molecular_formula.mspeak_mf_isotopologues_indexes.append(
851                                            (ms_peak_iso.index, x)
852                                        )
853                                        # add mspeaks mono isotopic index to the isotopologue MolecularFormula obj
854
855                    y = ms_peak.add_molecular_formula(molecular_formula)
856
857                    mspeak_assigned_index.append((ms_peak.index, y))
858
859        return mspeak_assigned_index

Find the formulas.

Parameters
  • formulas (list of MolecularFormula): The list of molecular formulas.
  • min_abundance (float): The minimum abundance threshold.
  • mass_spectrum_obj (MassSpectrum): The mass spectrum object.
  • ms_peak (MSPeak): The mass spectrum peak.
  • ion_type (str): The ion type.
  • ion_charge (int): The ion charge.
  • adduct_atom (str, optional): The adduct atom, by default None.
Returns
  • list of MSPeak: The list of mass spectrum peaks with assigned molecular formulas.
Notes

Uses the closest error the next search (this is not ideal, it needs to use confidence metric to choose the right candidate then propagate the error using the error from the best candidate). It needs to add s/n to the equation. It need optimization to define the mz_error_range within a m/z unit since it is directly proportional with the mass, and inversely proportional to the rp. It's not linear, i.e., sigma mass. The idea it to correlate sigma to resolving power, signal to noise and sample complexity per mz unit. Method='distance'

class SearchMolecularFormulasLC:
 862class SearchMolecularFormulasLC:
 863    """Class for searching molecular formulas in a LC object.
 864
 865    Parameters
 866    ----------
 867    lcms_obj : LCMSBase
 868        The LCMSBase object.
 869    sql_db : MolForm_SQL, optional
 870        The SQL database object, by default None.
 871    first_hit : bool, optional
 872        Flag to indicate whether to skip peaks that already have a molecular formula assigned, by default False.
 873    find_isotopologues : bool, optional
 874        Flag to indicate whether to find isotopologues, by default True.
 875
 876    Methods
 877    -------
 878
 879    * search_spectra_against_candidates().
 880        Search a list of mass spectra against a list of candidate formulas with a given ion type and charge.
 881    * bulk_run_molecular_formula_search().
 882        Run the molecular formula search on the given list of mass spectra.
 883        Pulls the settings from the LCMSBase object to set ion type and charge to search for. 
 884    * run_mass_feature_search().
 885        Run the molecular formula search on mass features.
 886        Calls bulk_run_molecular_formula_search() with specified mass spectra and mass peaks.
 887    * run_untargeted_worker_ms1().
 888        Run untargeted molecular formula search on the ms1 mass spectrum.
 889        DEPRECATED: use run_mass_feature_search() or bulk_run_molecular_formula_search() instead.
 890    * run_target_worker_ms1().
 891        Run targeted molecular formula search on the ms1 mass spectrum.
 892        DEPRECATED: use run_mass_feature_search() or bulk_run_molecular_formula_search() instead.
 893    """
 894
 895    def __init__(self, lcms_obj, sql_db=None, first_hit=False, find_isotopologues=True):
 896        self.first_hit = first_hit
 897
 898        self.find_isotopologues = find_isotopologues
 899
 900        self.lcms_obj = lcms_obj
 901
 902        if not sql_db:
 903            self.sql_db = MolForm_SQL(
 904                url=self.lcms_obj.parameters.mass_spectrum['ms1'].molecular_search.url_database
 905            )
 906
 907        else:
 908            self.sql_db = sql_db
 909
 910    def search_spectra_against_candidates(self, mass_spectrum_list, ms_peaks_list, candidate_formulas, ion_type, ion_charge):
 911        """Search a list of mass spectra against a list of candidate formulas with a given ion type and charge.
 912
 913        Parameters
 914        ----------
 915        mass_spectrum_list : list of MassSpectrum
 916            The list of mass spectra to perform the search on.
 917        ms_peaks_list : list of lists of MSPeak objects
 918            The list of mass spectrum peaks to search within each mass spectrum.
 919        candidate_formulas : dict
 920            The candidate formulas.
 921        ion_type : str
 922            The ion type.
 923        ion_charge : int
 924            The ion charge, either 1 or -1.
 925
 926        Notes
 927        -----
 928        This function is designed to be used with the bulk_run_molecular_formula_search function.
 929        """
 930        for mass_spectrum, ms_peaks in zip(mass_spectrum_list, ms_peaks_list):
 931            single_ms_search = SearchMolecularFormulas(
 932                mass_spectrum,
 933                sql_db=self.sql_db,
 934                first_hit=self.first_hit,
 935                find_isotopologues=self.find_isotopologues,
 936            )
 937            single_ms_search.run_search(
 938                ms_peaks,
 939                candidate_formulas,
 940                mass_spectrum.min_abundance,
 941                ion_type,
 942                ion_charge,
 943            )
 944
 945    def bulk_run_molecular_formula_search(self, mass_spectrum_list, ms_peaks_list, mass_spectrum_setting_key='ms1'):
 946        """Run the molecular formula search on the given list of mass spectra
 947
 948        Parameters
 949        ----------
 950        mass_spectrum_list : list of MassSpectrum
 951            The list of mass spectra to search.
 952        ms_peaks_list : list of lists of MSPeak objects 
 953            The mass peaks to perform molecular formula search within each mass spectrum
 954        mass_spectrum_setting_key : str, optional
 955            The mass spectrum setting key, by default 'ms1'.
 956            This is used to get the appropriate molecular search settings from the LCMSBase object
 957        """
 958        # Set min_abundance and nominal_mzs
 959        if self.lcms_obj.polarity == "positive":
 960            ion_charge = 1
 961        elif self.lcms_obj.polarity == "negative":
 962            ion_charge = -1
 963        else:
 964            raise ValueError("Polarity must be either 'positive' or 'negative'")
 965        
 966        # Check that the length of the mass spectrum list and the ms_peaks list are the same
 967        if len(mass_spectrum_list) != len(ms_peaks_list):
 968            raise ValueError("The length of the mass spectrum list and the ms_peaks list must be the same")
 969        
 970        nominal_mzs = [x.nominal_mz for x in mass_spectrum_list]
 971        nominal_mzs = list(set([item for sublist in nominal_mzs for item in sublist]))
 972        verbose = self.lcms_obj.parameters.mass_spectrum[mass_spectrum_setting_key].molecular_search.verbose_processing 
 973
 974        # reset average error, only relevant if average mass error method is being used
 975        SearchMolecularFormulaWorker(
 976            find_isotopologues=self.find_isotopologues
 977        ).reset_error(mass_spectrum_list[0])
 978
 979        # check database for all possible molecular formula combinations based on the setting passed to self.mass_spectrum_obj.molecular_search_settings
 980        classes = MolecularCombinations(self.sql_db).runworker(
 981            self.lcms_obj.parameters.mass_spectrum[mass_spectrum_setting_key].molecular_search,
 982            print_time=self.lcms_obj.parameters.mass_spectrum[mass_spectrum_setting_key].molecular_search.verbose_processing
 983        )
 984        
 985        # split the database load to not blowout the memory
 986        for classe_chunk in chunks(
 987            classes, self.lcms_obj.parameters.mass_spectrum[mass_spectrum_setting_key].molecular_search.db_chunk_size
 988        ):
 989            classes_str_list = [class_tuple[0] for class_tuple in classe_chunk]
 990
 991            # load the molecular formula objs binned by ion type and heteroatoms classes, {ion type:{classe:[list_formula]}}
 992            # for adduct ion type a third key is added {atoms:{ion type:{classe:[list_formula]}}}
 993            dict_res = SearchMolecularFormulas.database_to_dict(
 994                classes_str_list,
 995                nominal_mzs,
 996                self.lcms_obj.parameters.mass_spectrum[mass_spectrum_setting_key].molecular_search,
 997                ion_charge,
 998            )
 999
1000            pbar = tqdm.tqdm(classe_chunk, disable = not verbose)
1001            for classe_tuple in pbar:
1002                # class string is a json serialized dict
1003                classe_str = classe_tuple[0]
1004
1005                # Perform search for (de)protonated ion type
1006                if self.lcms_obj.parameters.mass_spectrum[mass_spectrum_setting_key].molecular_search.isProtonated:
1007                    ion_type = Labels.protonated_de_ion
1008
1009                    pbar.set_description_str(
1010                        desc="Started molecular formula search for class %s, (de)protonated "
1011                        % classe_str,
1012                        refresh=True,
1013                    )
1014
1015                    candidate_formulas = dict_res.get(ion_type).get(classe_str)
1016
1017                    if candidate_formulas:
1018                        self.search_spectra_against_candidates(
1019                            mass_spectrum_list=mass_spectrum_list,
1020                            ms_peaks_list=ms_peaks_list,
1021                            candidate_formulas=candidate_formulas,
1022                            ion_type=ion_type,
1023                            ion_charge=ion_charge
1024                        )
1025
1026
1027                # Perform search for radical ion type
1028                if self.lcms_obj.parameters.mass_spectrum[mass_spectrum_setting_key].molecular_search.isRadical:
1029                    pbar.set_description_str(
1030                        desc="Started molecular formula search for class %s, radical "
1031                        % classe_str,
1032                        refresh=True,
1033                    )
1034
1035                    ion_type = Labels.radical_ion
1036
1037                    candidate_formulas = dict_res.get(ion_type).get(classe_str)
1038
1039                    if candidate_formulas:
1040                        self.search_spectra_against_candidates(
1041                            mass_spectrum_list=mass_spectrum_list,
1042                            ms_peaks_list=ms_peaks_list,
1043                            candidate_formulas=candidate_formulas,
1044                            ion_type=ion_type,
1045                            ion_charge=ion_charge
1046                        )
1047
1048                # Perform search for adduct ion type
1049                # looks for adduct, used_atom_valences should be 0
1050                # this code does not support H exchance by halogen atoms
1051                if self.lcms_obj.parameters.mass_spectrum[mass_spectrum_setting_key].molecular_search.isAdduct:
1052                    pbar.set_description_str(
1053                        desc="Started molecular formula search for class %s, adduct "
1054                        % classe_str,
1055                        refresh=True,
1056                    )
1057
1058                    ion_type = Labels.adduct_ion
1059                    dict_atoms_formulas = dict_res.get(ion_type)
1060
1061                    for adduct_atom, dict_by_class in dict_atoms_formulas.items():
1062                        candidate_formulas = dict_by_class.get(classe_str)
1063
1064                        if candidate_formulas:
1065                            self.search_spectra_against_candidates(
1066                                mass_spectrum_list=mass_spectrum_list,
1067                                ms_peaks_list=ms_peaks_list,
1068                                candidate_formulas=candidate_formulas,
1069                                ion_type=ion_type,
1070                                ion_charge=ion_charge
1071                            )
1072        self.sql_db.close()
1073        
1074    def run_mass_feature_search(self):
1075        """Run the molecular formula search on the mass features.
1076        
1077        Calls bulk_run_molecular_formula_search() with specified mass spectra and mass peaks.
1078        """
1079        mass_features_df = self.lcms_obj.mass_features_to_df()
1080
1081        # Get the list of mass spectrum (and peaks to search with each mass spectrum) for all mass features
1082        scan_list = mass_features_df.apex_scan.unique()
1083        mass_spectrum_list = [self.lcms_obj._ms[x] for x in scan_list]
1084        ms_peaks = []
1085        for scan in scan_list:
1086            mf_df_scan = mass_features_df[mass_features_df.apex_scan == scan]
1087            peaks_to_search = [
1088                self.lcms_obj.mass_features[x].ms1_peak for x in mf_df_scan.index.tolist()
1089            ]
1090            ms_peaks.append(peaks_to_search)
1091        
1092        # Run the molecular formula search
1093        self.bulk_run_molecular_formula_search(mass_spectrum_list, ms_peaks)
1094    
1095    def run_untargeted_worker_ms1(self):
1096        """Run untargeted molecular formula search on the ms1 mass spectrum."""
1097        raise NotImplementedError("run_untargeted_worker_ms1 search is not implemented in CoreMS 3.0 and greater")
1098
1099    def run_target_worker_ms1(self):
1100        """Run targeted molecular formula search on the ms1 mass spectrum."""
1101        raise NotImplementedError("run_target_worker_ms1 formula search is not yet implemented in CoreMS 3.0 and greater")

Class for searching molecular formulas in a LC object.

Parameters
  • lcms_obj (LCMSBase): The LCMSBase object.
  • sql_db (MolForm_SQL, optional): The SQL database object, by default None.
  • first_hit (bool, optional): Flag to indicate whether to skip peaks that already have a molecular formula assigned, by default False.
  • find_isotopologues (bool, optional): Flag to indicate whether to find isotopologues, by default True.
Methods
  • search_spectra_against_candidates(). Search a list of mass spectra against a list of candidate formulas with a given ion type and charge.
  • bulk_run_molecular_formula_search(). Run the molecular formula search on the given list of mass spectra. Pulls the settings from the LCMSBase object to set ion type and charge to search for.
  • run_mass_feature_search(). Run the molecular formula search on mass features. Calls bulk_run_molecular_formula_search() with specified mass spectra and mass peaks.
  • run_untargeted_worker_ms1(). Run untargeted molecular formula search on the ms1 mass spectrum. DEPRECATED: use run_mass_feature_search() or bulk_run_molecular_formula_search() instead.
  • run_target_worker_ms1(). Run targeted molecular formula search on the ms1 mass spectrum. DEPRECATED: use run_mass_feature_search() or bulk_run_molecular_formula_search() instead.
SearchMolecularFormulasLC(lcms_obj, sql_db=None, first_hit=False, find_isotopologues=True)
895    def __init__(self, lcms_obj, sql_db=None, first_hit=False, find_isotopologues=True):
896        self.first_hit = first_hit
897
898        self.find_isotopologues = find_isotopologues
899
900        self.lcms_obj = lcms_obj
901
902        if not sql_db:
903            self.sql_db = MolForm_SQL(
904                url=self.lcms_obj.parameters.mass_spectrum['ms1'].molecular_search.url_database
905            )
906
907        else:
908            self.sql_db = sql_db
first_hit
find_isotopologues
lcms_obj
def search_spectra_against_candidates( self, mass_spectrum_list, ms_peaks_list, candidate_formulas, ion_type, ion_charge):
910    def search_spectra_against_candidates(self, mass_spectrum_list, ms_peaks_list, candidate_formulas, ion_type, ion_charge):
911        """Search a list of mass spectra against a list of candidate formulas with a given ion type and charge.
912
913        Parameters
914        ----------
915        mass_spectrum_list : list of MassSpectrum
916            The list of mass spectra to perform the search on.
917        ms_peaks_list : list of lists of MSPeak objects
918            The list of mass spectrum peaks to search within each mass spectrum.
919        candidate_formulas : dict
920            The candidate formulas.
921        ion_type : str
922            The ion type.
923        ion_charge : int
924            The ion charge, either 1 or -1.
925
926        Notes
927        -----
928        This function is designed to be used with the bulk_run_molecular_formula_search function.
929        """
930        for mass_spectrum, ms_peaks in zip(mass_spectrum_list, ms_peaks_list):
931            single_ms_search = SearchMolecularFormulas(
932                mass_spectrum,
933                sql_db=self.sql_db,
934                first_hit=self.first_hit,
935                find_isotopologues=self.find_isotopologues,
936            )
937            single_ms_search.run_search(
938                ms_peaks,
939                candidate_formulas,
940                mass_spectrum.min_abundance,
941                ion_type,
942                ion_charge,
943            )

Search a list of mass spectra against a list of candidate formulas with a given ion type and charge.

Parameters
  • mass_spectrum_list (list of MassSpectrum): The list of mass spectra to perform the search on.
  • ms_peaks_list (list of lists of MSPeak objects): The list of mass spectrum peaks to search within each mass spectrum.
  • candidate_formulas (dict): The candidate formulas.
  • ion_type (str): The ion type.
  • ion_charge (int): The ion charge, either 1 or -1.
Notes

This function is designed to be used with the bulk_run_molecular_formula_search function.

def run_untargeted_worker_ms1(self):
1095    def run_untargeted_worker_ms1(self):
1096        """Run untargeted molecular formula search on the ms1 mass spectrum."""
1097        raise NotImplementedError("run_untargeted_worker_ms1 search is not implemented in CoreMS 3.0 and greater")

Run untargeted molecular formula search on the ms1 mass spectrum.

def run_target_worker_ms1(self):
1099    def run_target_worker_ms1(self):
1100        """Run targeted molecular formula search on the ms1 mass spectrum."""
1101        raise NotImplementedError("run_target_worker_ms1 formula search is not yet implemented in CoreMS 3.0 and greater")

Run targeted molecular formula search on the ms1 mass spectrum.