corems.molecular_id.search.lcms_spectral_search

View Source

  1import re
  2
  3import numpy as np
  4
  5from corems.molecular_id.factory.spectrum_search_results import SpectrumSearchResults
  6
  7
  8class LCMSSpectralSearch:
  9    """
 10    Methods for searching LCMS spectra.
 11
 12    This class is designed to be a mixin class for the :obj:`~corems.mass_spectra.factory.lc_class.LCMSBase` class.
 13
 14    """
 15
 16    @staticmethod
 17    def get_more_match_quals(
 18        query_mz_arr, lib_entry, mz_tol_da=0.1, include_fragment_types=False
 19    ):
 20        """
 21        Return additional match qualities between query and library entry.
 22
 23        Parameters
 24        ----------
 25        query_mz_arr : np.array
 26            Array of query spectrum. Shape (N, 2), with m/z in the first column
 27            and abundance in the second.
 28        lib_entry : dict
 29            Library spectrum entry, with 'mz' key containing the spectrum in
 30            the format (mz, abundance),(mz, abundance), i.e. from MetabRef.
 31        mz_tol_da : float, optional
 32            Tolerance in Da for matching peaks (in MS2). Default is 0.1.
 33        include_fragment_types : bool, optional
 34            If True, include fragment type comparisons in output.
 35            Defaults to False.
 36
 37        Returns
 38        -------
 39        tuple
 40            Tuple of (query_in_lib, query_in_lib_fract, lib_in_query, lib_in_query_fract, query_frags, lib_frags, lib_precursor_mz).
 41
 42        Notes
 43        -----
 44        query_in_lib : int
 45            Number of peaks in query that are present in the library entry (within mz_tol_da).
 46        query_in_lib_fract : float
 47            Fraction of peaks in query that are present in the library entry (within mz_tol_da).
 48        lib_in_query : int
 49            Number of peaks in the library entry that are present in the query (within mz_tol_da).
 50        lib_in_query_fract : float
 51            Fraction of peaks in the library entry that are present in the query (within mz_tol_da).
 52        query_frags : list
 53            List of unique fragment types present in the query, generally 'MLF' or 'LSF' or both.
 54        lib_frags : list
 55            List of unique fragment types present in the library entry, generally 'MLF' or 'LSF' or both.
 56
 57        Raises
 58        ------
 59        ValueError
 60            If library entry does not have 'fragment_types' key and include_fragment_types is True.
 61
 62        """
 63
 64        if "mz" in lib_entry.keys():
 65            # Get the original mz values from the library entry
 66            lib_mzs = np.array(
 67                re.findall(r"\(([^,]+),([^)]+)\)", lib_entry["mz"]), dtype=float
 68            ).reshape(-1, 2)[:, 0]
 69        elif "peaks" in lib_entry.keys() and lib_entry["peaks"] is not None:
 70            lib_mzs = lib_entry["peaks"][:, 0]
 71
 72        # Get count and fraction of peaks in query that are in lib entry
 73        query_in_lib = 0
 74        for peak in query_mz_arr:
 75            if np.any(np.isclose(lib_mzs, peak, atol=mz_tol_da)):
 76                query_in_lib += 1
 77        query_in_lib_fract = query_in_lib / len(query_mz_arr)
 78
 79        # Get count and fraction of peaks in lib that are in query
 80        lib_in_query = 0
 81        for peak in lib_mzs:
 82            if np.any(np.isclose(query_mz_arr, peak, atol=mz_tol_da)):
 83                lib_in_query += 1
 84        lib_in_query_fract = lib_in_query / len(lib_mzs)
 85
 86        if include_fragment_types:
 87            # Check that fragment types are present in the library entry
 88            if "fragment_types" not in lib_entry.keys():
 89                raise ValueError(
 90                    "Flash entropy library entry must have 'fragment_types' key to include fragment types in output."
 91                )
 92
 93            # Get types of fragments in the lib entry
 94            lib_frags = lib_entry["fragment_types"]
 95            # make list of the fragment types that are present in the query spectrum
 96            lib_in_query_ids = list(
 97                set(
 98                    [
 99                        ind
100                        for ind, x in enumerate(lib_mzs)
101                        if len(np.where(np.isclose(query_mz_arr, x, atol=mz_tol_da))[0])
102                        > 0
103                    ]
104                )
105            )
106            query_frags = list(set([lib_frags[x] for x in lib_in_query_ids]))
107            lib_frags = list(set(lib_frags))
108
109        else:
110            query_frags = None
111            lib_frags = None
112
113        return (
114            query_in_lib,
115            query_in_lib_fract,
116            lib_in_query,
117            lib_in_query_fract,
118            query_frags,
119            lib_frags,
120        )
121
122    def fe_search(
123        self,
124        scan_list,
125        fe_lib,
126        precursor_mz_list=[],
127        use_mass_features=True,
128        peak_sep_da=0.01,
129        get_additional_metrics=True,
130    ):
131        """
132        Search LCMS spectra using a FlashEntropy approach.
133
134        Parameters
135        ----------
136        scan_list : list
137            List of scan numbers to search.
138        fe_lib : :obj:`~ms_entropy.FlashEntropySearch`
139            FlashEntropy Search instance.
140        precursor_mz_list : list, optional
141            List of precursor m/z values to search, by default [], which implies
142            matched with mass features; to enable this use_mass_features must be True.
143        use_mass_features : bool, optional
144            If True, use mass features to get precursor m/z values, by default True.
145            If True, will add search results to mass features' ms2_similarity_results attribute.
146        peak_sep_da : float, optional
147            Minimum separation between m/z peaks spectra in Da. This needs match the
148            approximate resolution of the search spectra and the FlashEntropySearch
149            instance, by default 0.01.
150        get_additional_metrics : bool, optional
151            If True, get additional metrics from FlashEntropy search, by default True.
152
153        Returns
154        -------
155        None, but adds results to self.spectral_search_results and associates these
156        spectral_search_results with mass_features within the self.mass_features dictionary.
157
158        """
159        # Retrieve parameters from self
160        # include_fragment_types should used for lipids queries only, not general metabolomics
161        include_fragment_types = self.parameters.lc_ms.include_fragment_types
162        min_match_score = self.parameters.lc_ms.ms2_min_fe_score
163
164        # If precursor_mz_list is empty and use_mass_features is True, get precursor m/z values from mass features for each scan in scan_list
165        if use_mass_features and len(precursor_mz_list) == 0:
166            precursor_mz_list = []
167            for scan in scan_list:
168                mf_ids = [
169                    key
170                    for key, value in self.mass_features.items()
171                    if scan in value.ms2_mass_spectra
172                ]
173                precursor_mz = [
174                    value.mz
175                    for key, value in self.mass_features.items()
176                    if key in mf_ids
177                ]
178                precursor_mz_list.append(precursor_mz)
179
180        # Check that precursor_mz_list same length as scan_list, if not, raise error
181        if len(precursor_mz_list) != len(scan_list):
182            raise ValueError("Length of precursor_mz_list is not equal to scan_list.")
183
184        # Loop through each query spectrum / precursor match and save ids of db spectrum that are decent matches
185        overall_results_dict = {}
186        for i in np.arange(len(scan_list)):
187            scan_oi = scan_list[i]
188            if len(self._ms[scan_oi].mspeaks) > 0:
189                precursor_mzs = precursor_mz_list[i]
190                overall_results_dict[scan_oi] = {}
191                for precursor_mz in precursor_mzs:
192                    query_spectrum = fe_lib.clean_spectrum_for_search(
193                        precursor_mz=precursor_mz,
194                        peaks=np.vstack(
195                            (self._ms[scan_oi].mz_exp, self._ms[scan_oi].abundance)
196                        ).T,
197                        precursor_ions_removal_da=None,
198                        noise_threshold=self._ms[
199                            scan_oi
200                        ].parameters.mass_spectrum.noise_threshold_min_relative_abundance
201                        / 100,
202                        min_ms2_difference_in_da=peak_sep_da,
203                    )
204                    search_results = fe_lib.search(
205                        precursor_mz=precursor_mz,
206                        peaks=query_spectrum,
207                        ms1_tolerance_in_da=self.parameters.mass_spectrum[
208                            "ms1"
209                        ].molecular_search.max_ppm_error
210                        * 10**-6
211                        * precursor_mz,
212                        ms2_tolerance_in_da=peak_sep_da * 0.5,
213                        method={"identity"},
214                        precursor_ions_removal_da=None,
215                        noise_threshold=self._ms[
216                            scan_oi
217                        ].parameters.mass_spectrum.noise_threshold_min_relative_abundance
218                        / 100,
219                        target="cpu",
220                    )["identity_search"]
221                    match_inds = np.where(search_results > min_match_score)[0]
222
223                    # If any decent matches are found, add them to the results dictionary
224                    if len(match_inds) > 0:
225                        match_scores = search_results[match_inds]
226                        ref_ms_ids = [fe_lib[x]["id"] for x in match_inds]
227                        ref_mol_ids = [
228                            fe_lib[x]["molecular_data_id"] for x in match_inds
229                        ]
230                        ref_precursor_mzs = [
231                            fe_lib[x]["precursor_mz"] for x in match_inds
232                        ]
233                        ion_types = [fe_lib[x]["ion_type"] for x in match_inds]
234                        overall_results_dict[scan_oi][precursor_mz] = {
235                            "ref_mol_id": ref_mol_ids,
236                            "ref_ms_id": ref_ms_ids,
237                            "ref_precursor_mz": ref_precursor_mzs,
238                            "precursor_mz_error_ppm": [
239                                (precursor_mz - x) / precursor_mz * 10**6
240                                for x in ref_precursor_mzs
241                            ],
242                            "entropy_similarity": match_scores,
243                            "ref_ion_type": ion_types,
244                        }
245                        # Add database name, if present
246                        db_name = [
247                            fe_lib[x].get("database_name") for x in match_inds
248                        ]
249                        if db_name is not None:
250                            overall_results_dict[scan_oi][precursor_mz].update(
251                                {"database_name": db_name}
252                            )
253                        if get_additional_metrics:
254                            more_match_quals = [
255                                self.get_more_match_quals(
256                                    self._ms[scan_oi].mz_exp,
257                                    fe_lib[x],
258                                    mz_tol_da=peak_sep_da,
259                                    include_fragment_types=include_fragment_types,
260                                )
261                                for x in match_inds
262                            ]
263                            overall_results_dict[scan_oi][precursor_mz].update(
264                                {
265                                    "query_mz_in_ref_n": [
266                                        x[0] for x in more_match_quals
267                                    ],
268                                    "query_mz_in_ref_fract": [
269                                        x[1] for x in more_match_quals
270                                    ],
271                                    "ref_mz_in_query_n": [
272                                        x[2] for x in more_match_quals
273                                    ],
274                                    "ref_mz_in_query_fract": [
275                                        x[3] for x in more_match_quals
276                                    ],
277                                }
278                            )
279                            if include_fragment_types:
280                                overall_results_dict[scan_oi][precursor_mz].update(
281                                    {
282                                        "query_frag_types": [
283                                            x[4] for x in more_match_quals
284                                        ],
285                                        "ref_frag_types": [
286                                            x[5] for x in more_match_quals
287                                        ],
288                                    }
289                                )
290
291        # Drop scans with no results from dictionary
292        overall_results_dict = {k: v for k, v in overall_results_dict.items() if v}
293
294        # Cast each entry as a MS2SearchResults object
295        for scan_id in overall_results_dict.keys():
296            for precursor_mz in overall_results_dict[scan_id].keys():
297                ms2_spectrum = self._ms[scan_id]
298                ms2_search_results = overall_results_dict[scan_id][precursor_mz]
299                overall_results_dict[scan_id][precursor_mz] = SpectrumSearchResults(
300                    ms2_spectrum, precursor_mz, ms2_search_results
301                )
302
303        # Add MS2SearchResults to the existing spectral search results dictionary
304        self.spectral_search_results.update(overall_results_dict)
305
306        # If there are mass features, associate the results with each mass feature
307        if len(self.mass_features) > 0:
308            for mass_feature_id, mass_feature in self.mass_features.items():
309                scan_ids = mass_feature.ms2_scan_numbers
310                for ms2_scan_id in scan_ids:
311                    precursor_mz = mass_feature.mz
312                    try:
313                        self.spectral_search_results[ms2_scan_id][precursor_mz]
314                    except KeyError:
315                        pass
316                    else:
317                        self.mass_features[
318                            mass_feature_id
319                        ].ms2_similarity_results.append(
320                            self.spectral_search_results[ms2_scan_id][precursor_mz]
321                        )

class LCMSSpectralSearch: View Source

  9class LCMSSpectralSearch:
 10    """
 11    Methods for searching LCMS spectra.
 12
 13    This class is designed to be a mixin class for the :obj:`~corems.mass_spectra.factory.lc_class.LCMSBase` class.
 14
 15    """
 16
 17    @staticmethod
 18    def get_more_match_quals(
 19        query_mz_arr, lib_entry, mz_tol_da=0.1, include_fragment_types=False
 20    ):
 21        """
 22        Return additional match qualities between query and library entry.
 23
 24        Parameters
 25        ----------
 26        query_mz_arr : np.array
 27            Array of query spectrum. Shape (N, 2), with m/z in the first column
 28            and abundance in the second.
 29        lib_entry : dict
 30            Library spectrum entry, with 'mz' key containing the spectrum in
 31            the format (mz, abundance),(mz, abundance), i.e. from MetabRef.
 32        mz_tol_da : float, optional
 33            Tolerance in Da for matching peaks (in MS2). Default is 0.1.
 34        include_fragment_types : bool, optional
 35            If True, include fragment type comparisons in output.
 36            Defaults to False.
 37
 38        Returns
 39        -------
 40        tuple
 41            Tuple of (query_in_lib, query_in_lib_fract, lib_in_query, lib_in_query_fract, query_frags, lib_frags, lib_precursor_mz).
 42
 43        Notes
 44        -----
 45        query_in_lib : int
 46            Number of peaks in query that are present in the library entry (within mz_tol_da).
 47        query_in_lib_fract : float
 48            Fraction of peaks in query that are present in the library entry (within mz_tol_da).
 49        lib_in_query : int
 50            Number of peaks in the library entry that are present in the query (within mz_tol_da).
 51        lib_in_query_fract : float
 52            Fraction of peaks in the library entry that are present in the query (within mz_tol_da).
 53        query_frags : list
 54            List of unique fragment types present in the query, generally 'MLF' or 'LSF' or both.
 55        lib_frags : list
 56            List of unique fragment types present in the library entry, generally 'MLF' or 'LSF' or both.
 57
 58        Raises
 59        ------
 60        ValueError
 61            If library entry does not have 'fragment_types' key and include_fragment_types is True.
 62
 63        """
 64
 65        if "mz" in lib_entry.keys():
 66            # Get the original mz values from the library entry
 67            lib_mzs = np.array(
 68                re.findall(r"\(([^,]+),([^)]+)\)", lib_entry["mz"]), dtype=float
 69            ).reshape(-1, 2)[:, 0]
 70        elif "peaks" in lib_entry.keys() and lib_entry["peaks"] is not None:
 71            lib_mzs = lib_entry["peaks"][:, 0]
 72
 73        # Get count and fraction of peaks in query that are in lib entry
 74        query_in_lib = 0
 75        for peak in query_mz_arr:
 76            if np.any(np.isclose(lib_mzs, peak, atol=mz_tol_da)):
 77                query_in_lib += 1
 78        query_in_lib_fract = query_in_lib / len(query_mz_arr)
 79
 80        # Get count and fraction of peaks in lib that are in query
 81        lib_in_query = 0
 82        for peak in lib_mzs:
 83            if np.any(np.isclose(query_mz_arr, peak, atol=mz_tol_da)):
 84                lib_in_query += 1
 85        lib_in_query_fract = lib_in_query / len(lib_mzs)
 86
 87        if include_fragment_types:
 88            # Check that fragment types are present in the library entry
 89            if "fragment_types" not in lib_entry.keys():
 90                raise ValueError(
 91                    "Flash entropy library entry must have 'fragment_types' key to include fragment types in output."
 92                )
 93
 94            # Get types of fragments in the lib entry
 95            lib_frags = lib_entry["fragment_types"]
 96            # make list of the fragment types that are present in the query spectrum
 97            lib_in_query_ids = list(
 98                set(
 99                    [
100                        ind
101                        for ind, x in enumerate(lib_mzs)
102                        if len(np.where(np.isclose(query_mz_arr, x, atol=mz_tol_da))[0])
103                        > 0
104                    ]
105                )
106            )
107            query_frags = list(set([lib_frags[x] for x in lib_in_query_ids]))
108            lib_frags = list(set(lib_frags))
109
110        else:
111            query_frags = None
112            lib_frags = None
113
114        return (
115            query_in_lib,
116            query_in_lib_fract,
117            lib_in_query,
118            lib_in_query_fract,
119            query_frags,
120            lib_frags,
121        )
122
123    def fe_search(
124        self,
125        scan_list,
126        fe_lib,
127        precursor_mz_list=[],
128        use_mass_features=True,
129        peak_sep_da=0.01,
130        get_additional_metrics=True,
131    ):
132        """
133        Search LCMS spectra using a FlashEntropy approach.
134
135        Parameters
136        ----------
137        scan_list : list
138            List of scan numbers to search.
139        fe_lib : :obj:`~ms_entropy.FlashEntropySearch`
140            FlashEntropy Search instance.
141        precursor_mz_list : list, optional
142            List of precursor m/z values to search, by default [], which implies
143            matched with mass features; to enable this use_mass_features must be True.
144        use_mass_features : bool, optional
145            If True, use mass features to get precursor m/z values, by default True.
146            If True, will add search results to mass features' ms2_similarity_results attribute.
147        peak_sep_da : float, optional
148            Minimum separation between m/z peaks spectra in Da. This needs match the
149            approximate resolution of the search spectra and the FlashEntropySearch
150            instance, by default 0.01.
151        get_additional_metrics : bool, optional
152            If True, get additional metrics from FlashEntropy search, by default True.
153
154        Returns
155        -------
156        None, but adds results to self.spectral_search_results and associates these
157        spectral_search_results with mass_features within the self.mass_features dictionary.
158
159        """
160        # Retrieve parameters from self
161        # include_fragment_types should used for lipids queries only, not general metabolomics
162        include_fragment_types = self.parameters.lc_ms.include_fragment_types
163        min_match_score = self.parameters.lc_ms.ms2_min_fe_score
164
165        # If precursor_mz_list is empty and use_mass_features is True, get precursor m/z values from mass features for each scan in scan_list
166        if use_mass_features and len(precursor_mz_list) == 0:
167            precursor_mz_list = []
168            for scan in scan_list:
169                mf_ids = [
170                    key
171                    for key, value in self.mass_features.items()
172                    if scan in value.ms2_mass_spectra
173                ]
174                precursor_mz = [
175                    value.mz
176                    for key, value in self.mass_features.items()
177                    if key in mf_ids
178                ]
179                precursor_mz_list.append(precursor_mz)
180
181        # Check that precursor_mz_list same length as scan_list, if not, raise error
182        if len(precursor_mz_list) != len(scan_list):
183            raise ValueError("Length of precursor_mz_list is not equal to scan_list.")
184
185        # Loop through each query spectrum / precursor match and save ids of db spectrum that are decent matches
186        overall_results_dict = {}
187        for i in np.arange(len(scan_list)):
188            scan_oi = scan_list[i]
189            if len(self._ms[scan_oi].mspeaks) > 0:
190                precursor_mzs = precursor_mz_list[i]
191                overall_results_dict[scan_oi] = {}
192                for precursor_mz in precursor_mzs:
193                    query_spectrum = fe_lib.clean_spectrum_for_search(
194                        precursor_mz=precursor_mz,
195                        peaks=np.vstack(
196                            (self._ms[scan_oi].mz_exp, self._ms[scan_oi].abundance)
197                        ).T,
198                        precursor_ions_removal_da=None,
199                        noise_threshold=self._ms[
200                            scan_oi
201                        ].parameters.mass_spectrum.noise_threshold_min_relative_abundance
202                        / 100,
203                        min_ms2_difference_in_da=peak_sep_da,
204                    )
205                    search_results = fe_lib.search(
206                        precursor_mz=precursor_mz,
207                        peaks=query_spectrum,
208                        ms1_tolerance_in_da=self.parameters.mass_spectrum[
209                            "ms1"
210                        ].molecular_search.max_ppm_error
211                        * 10**-6
212                        * precursor_mz,
213                        ms2_tolerance_in_da=peak_sep_da * 0.5,
214                        method={"identity"},
215                        precursor_ions_removal_da=None,
216                        noise_threshold=self._ms[
217                            scan_oi
218                        ].parameters.mass_spectrum.noise_threshold_min_relative_abundance
219                        / 100,
220                        target="cpu",
221                    )["identity_search"]
222                    match_inds = np.where(search_results > min_match_score)[0]
223
224                    # If any decent matches are found, add them to the results dictionary
225                    if len(match_inds) > 0:
226                        match_scores = search_results[match_inds]
227                        ref_ms_ids = [fe_lib[x]["id"] for x in match_inds]
228                        ref_mol_ids = [
229                            fe_lib[x]["molecular_data_id"] for x in match_inds
230                        ]
231                        ref_precursor_mzs = [
232                            fe_lib[x]["precursor_mz"] for x in match_inds
233                        ]
234                        ion_types = [fe_lib[x]["ion_type"] for x in match_inds]
235                        overall_results_dict[scan_oi][precursor_mz] = {
236                            "ref_mol_id": ref_mol_ids,
237                            "ref_ms_id": ref_ms_ids,
238                            "ref_precursor_mz": ref_precursor_mzs,
239                            "precursor_mz_error_ppm": [
240                                (precursor_mz - x) / precursor_mz * 10**6
241                                for x in ref_precursor_mzs
242                            ],
243                            "entropy_similarity": match_scores,
244                            "ref_ion_type": ion_types,
245                        }
246                        # Add database name, if present
247                        db_name = [
248                            fe_lib[x].get("database_name") for x in match_inds
249                        ]
250                        if db_name is not None:
251                            overall_results_dict[scan_oi][precursor_mz].update(
252                                {"database_name": db_name}
253                            )
254                        if get_additional_metrics:
255                            more_match_quals = [
256                                self.get_more_match_quals(
257                                    self._ms[scan_oi].mz_exp,
258                                    fe_lib[x],
259                                    mz_tol_da=peak_sep_da,
260                                    include_fragment_types=include_fragment_types,
261                                )
262                                for x in match_inds
263                            ]
264                            overall_results_dict[scan_oi][precursor_mz].update(
265                                {
266                                    "query_mz_in_ref_n": [
267                                        x[0] for x in more_match_quals
268                                    ],
269                                    "query_mz_in_ref_fract": [
270                                        x[1] for x in more_match_quals
271                                    ],
272                                    "ref_mz_in_query_n": [
273                                        x[2] for x in more_match_quals
274                                    ],
275                                    "ref_mz_in_query_fract": [
276                                        x[3] for x in more_match_quals
277                                    ],
278                                }
279                            )
280                            if include_fragment_types:
281                                overall_results_dict[scan_oi][precursor_mz].update(
282                                    {
283                                        "query_frag_types": [
284                                            x[4] for x in more_match_quals
285                                        ],
286                                        "ref_frag_types": [
287                                            x[5] for x in more_match_quals
288                                        ],
289                                    }
290                                )
291
292        # Drop scans with no results from dictionary
293        overall_results_dict = {k: v for k, v in overall_results_dict.items() if v}
294
295        # Cast each entry as a MS2SearchResults object
296        for scan_id in overall_results_dict.keys():
297            for precursor_mz in overall_results_dict[scan_id].keys():
298                ms2_spectrum = self._ms[scan_id]
299                ms2_search_results = overall_results_dict[scan_id][precursor_mz]
300                overall_results_dict[scan_id][precursor_mz] = SpectrumSearchResults(
301                    ms2_spectrum, precursor_mz, ms2_search_results
302                )
303
304        # Add MS2SearchResults to the existing spectral search results dictionary
305        self.spectral_search_results.update(overall_results_dict)
306
307        # If there are mass features, associate the results with each mass feature
308        if len(self.mass_features) > 0:
309            for mass_feature_id, mass_feature in self.mass_features.items():
310                scan_ids = mass_feature.ms2_scan_numbers
311                for ms2_scan_id in scan_ids:
312                    precursor_mz = mass_feature.mz
313                    try:
314                        self.spectral_search_results[ms2_scan_id][precursor_mz]
315                    except KeyError:
316                        pass
317                    else:
318                        self.mass_features[
319                            mass_feature_id
320                        ].ms2_similarity_results.append(
321                            self.spectral_search_results[ms2_scan_id][precursor_mz]
322                        )

Methods for searching LCMS spectra.

This class is designed to be a mixin class for the corems.mass_spectra.factory.lc_class.LCMSBase class.

@staticmethod

def get_more_match_quals(query_mz_arr, lib_entry, mz_tol_da=0.1, include_fragment_types=False): View Source

 17    @staticmethod
 18    def get_more_match_quals(
 19        query_mz_arr, lib_entry, mz_tol_da=0.1, include_fragment_types=False
 20    ):
 21        """
 22        Return additional match qualities between query and library entry.
 23
 24        Parameters
 25        ----------
 26        query_mz_arr : np.array
 27            Array of query spectrum. Shape (N, 2), with m/z in the first column
 28            and abundance in the second.
 29        lib_entry : dict
 30            Library spectrum entry, with 'mz' key containing the spectrum in
 31            the format (mz, abundance),(mz, abundance), i.e. from MetabRef.
 32        mz_tol_da : float, optional
 33            Tolerance in Da for matching peaks (in MS2). Default is 0.1.
 34        include_fragment_types : bool, optional
 35            If True, include fragment type comparisons in output.
 36            Defaults to False.
 37
 38        Returns
 39        -------
 40        tuple
 41            Tuple of (query_in_lib, query_in_lib_fract, lib_in_query, lib_in_query_fract, query_frags, lib_frags, lib_precursor_mz).
 42
 43        Notes
 44        -----
 45        query_in_lib : int
 46            Number of peaks in query that are present in the library entry (within mz_tol_da).
 47        query_in_lib_fract : float
 48            Fraction of peaks in query that are present in the library entry (within mz_tol_da).
 49        lib_in_query : int
 50            Number of peaks in the library entry that are present in the query (within mz_tol_da).
 51        lib_in_query_fract : float
 52            Fraction of peaks in the library entry that are present in the query (within mz_tol_da).
 53        query_frags : list
 54            List of unique fragment types present in the query, generally 'MLF' or 'LSF' or both.
 55        lib_frags : list
 56            List of unique fragment types present in the library entry, generally 'MLF' or 'LSF' or both.
 57
 58        Raises
 59        ------
 60        ValueError
 61            If library entry does not have 'fragment_types' key and include_fragment_types is True.
 62
 63        """
 64
 65        if "mz" in lib_entry.keys():
 66            # Get the original mz values from the library entry
 67            lib_mzs = np.array(
 68                re.findall(r"\(([^,]+),([^)]+)\)", lib_entry["mz"]), dtype=float
 69            ).reshape(-1, 2)[:, 0]
 70        elif "peaks" in lib_entry.keys() and lib_entry["peaks"] is not None:
 71            lib_mzs = lib_entry["peaks"][:, 0]
 72
 73        # Get count and fraction of peaks in query that are in lib entry
 74        query_in_lib = 0
 75        for peak in query_mz_arr:
 76            if np.any(np.isclose(lib_mzs, peak, atol=mz_tol_da)):
 77                query_in_lib += 1
 78        query_in_lib_fract = query_in_lib / len(query_mz_arr)
 79
 80        # Get count and fraction of peaks in lib that are in query
 81        lib_in_query = 0
 82        for peak in lib_mzs:
 83            if np.any(np.isclose(query_mz_arr, peak, atol=mz_tol_da)):
 84                lib_in_query += 1
 85        lib_in_query_fract = lib_in_query / len(lib_mzs)
 86
 87        if include_fragment_types:
 88            # Check that fragment types are present in the library entry
 89            if "fragment_types" not in lib_entry.keys():
 90                raise ValueError(
 91                    "Flash entropy library entry must have 'fragment_types' key to include fragment types in output."
 92                )
 93
 94            # Get types of fragments in the lib entry
 95            lib_frags = lib_entry["fragment_types"]
 96            # make list of the fragment types that are present in the query spectrum
 97            lib_in_query_ids = list(
 98                set(
 99                    [
100                        ind
101                        for ind, x in enumerate(lib_mzs)
102                        if len(np.where(np.isclose(query_mz_arr, x, atol=mz_tol_da))[0])
103                        > 0
104                    ]
105                )
106            )
107            query_frags = list(set([lib_frags[x] for x in lib_in_query_ids]))
108            lib_frags = list(set(lib_frags))
109
110        else:
111            query_frags = None
112            lib_frags = None
113
114        return (
115            query_in_lib,
116            query_in_lib_fract,
117            lib_in_query,
118            lib_in_query_fract,
119            query_frags,
120            lib_frags,
121        )

Return additional match qualities between query and library entry.

Parameters

query_mz_arr (np.array): m/z values of the query spectrum, given as a 1-D array (fe_search passes the spectrum's mz_exp). Abundances are not used in the comparison.
lib_entry (dict): Library spectrum entry, with 'mz' key containing the spectrum in the format (mz, abundance),(mz, abundance), i.e. from MetabRef.
mz_tol_da (float, optional): Tolerance in Da for matching peaks (in MS2). Default is 0.1.
include_fragment_types (bool, optional): If True, include fragment type comparisons in output. Defaults to False.

Returns

tuple: Tuple of (query_in_lib, query_in_lib_fract, lib_in_query, lib_in_query_fract, query_frags, lib_frags).

Notes

query_in_lib (int): Number of peaks in query that are present in the library entry (within mz_tol_da).
query_in_lib_fract (float): Fraction of peaks in query that are present in the library entry (within mz_tol_da).
lib_in_query (int): Number of peaks in the library entry that are present in the query (within mz_tol_da).
lib_in_query_fract (float): Fraction of peaks in the library entry that are present in the query (within mz_tol_da).
query_frags (list): List of unique fragment types present in the query, generally 'MLF' or 'LSF' or both.
lib_frags (list): List of unique fragment types present in the library entry, generally 'MLF' or 'LSF' or both.

Raises

ValueError: If library entry does not have 'fragment_types' key and include_fragment_types is True.

def fe_search( self, scan_list, fe_lib, precursor_mz_list=[], use_mass_features=True, peak_sep_da=0.01, get_additional_metrics=True): View Source

123    def fe_search(
124        self,
125        scan_list,
126        fe_lib,
127        precursor_mz_list=[],
128        use_mass_features=True,
129        peak_sep_da=0.01,
130        get_additional_metrics=True,
131    ):
132        """
133        Search LCMS spectra using a FlashEntropy approach.
134
135        Parameters
136        ----------
137        scan_list : list
138            List of scan numbers to search.
139        fe_lib : :obj:`~ms_entropy.FlashEntropySearch`
140            FlashEntropy Search instance.
141        precursor_mz_list : list, optional
142            List of precursor m/z values to search, by default [], which implies
143            matched with mass features; to enable this use_mass_features must be True.
144        use_mass_features : bool, optional
145            If True, use mass features to get precursor m/z values, by default True.
146            If True, will add search results to mass features' ms2_similarity_results attribute.
147        peak_sep_da : float, optional
148            Minimum separation between m/z peaks spectra in Da. This needs match the
149            approximate resolution of the search spectra and the FlashEntropySearch
150            instance, by default 0.01.
151        get_additional_metrics : bool, optional
152            If True, get additional metrics from FlashEntropy search, by default True.
153
154        Returns
155        -------
156        None, but adds results to self.spectral_search_results and associates these
157        spectral_search_results with mass_features within the self.mass_features dictionary.
158
159        """
160        # Retrieve parameters from self
161        # include_fragment_types should used for lipids queries only, not general metabolomics
162        include_fragment_types = self.parameters.lc_ms.include_fragment_types
163        min_match_score = self.parameters.lc_ms.ms2_min_fe_score
164
165        # If precursor_mz_list is empty and use_mass_features is True, get precursor m/z values from mass features for each scan in scan_list
166        if use_mass_features and len(precursor_mz_list) == 0:
167            precursor_mz_list = []
168            for scan in scan_list:
169                mf_ids = [
170                    key
171                    for key, value in self.mass_features.items()
172                    if scan in value.ms2_mass_spectra
173                ]
174                precursor_mz = [
175                    value.mz
176                    for key, value in self.mass_features.items()
177                    if key in mf_ids
178                ]
179                precursor_mz_list.append(precursor_mz)
180
181        # Check that precursor_mz_list same length as scan_list, if not, raise error
182        if len(precursor_mz_list) != len(scan_list):
183            raise ValueError("Length of precursor_mz_list is not equal to scan_list.")
184
185        # Loop through each query spectrum / precursor match and save ids of db spectrum that are decent matches
186        overall_results_dict = {}
187        for i in np.arange(len(scan_list)):
188            scan_oi = scan_list[i]
189            if len(self._ms[scan_oi].mspeaks) > 0:
190                precursor_mzs = precursor_mz_list[i]
191                overall_results_dict[scan_oi] = {}
192                for precursor_mz in precursor_mzs:
193                    query_spectrum = fe_lib.clean_spectrum_for_search(
194                        precursor_mz=precursor_mz,
195                        peaks=np.vstack(
196                            (self._ms[scan_oi].mz_exp, self._ms[scan_oi].abundance)
197                        ).T,
198                        precursor_ions_removal_da=None,
199                        noise_threshold=self._ms[
200                            scan_oi
201                        ].parameters.mass_spectrum.noise_threshold_min_relative_abundance
202                        / 100,
203                        min_ms2_difference_in_da=peak_sep_da,
204                    )
205                    search_results = fe_lib.search(
206                        precursor_mz=precursor_mz,
207                        peaks=query_spectrum,
208                        ms1_tolerance_in_da=self.parameters.mass_spectrum[
209                            "ms1"
210                        ].molecular_search.max_ppm_error
211                        * 10**-6
212                        * precursor_mz,
213                        ms2_tolerance_in_da=peak_sep_da * 0.5,
214                        method={"identity"},
215                        precursor_ions_removal_da=None,
216                        noise_threshold=self._ms[
217                            scan_oi
218                        ].parameters.mass_spectrum.noise_threshold_min_relative_abundance
219                        / 100,
220                        target="cpu",
221                    )["identity_search"]
222                    match_inds = np.where(search_results > min_match_score)[0]
223
224                    # If any decent matches are found, add them to the results dictionary
225                    if len(match_inds) > 0:
226                        match_scores = search_results[match_inds]
227                        ref_ms_ids = [fe_lib[x]["id"] for x in match_inds]
228                        ref_mol_ids = [
229                            fe_lib[x]["molecular_data_id"] for x in match_inds
230                        ]
231                        ref_precursor_mzs = [
232                            fe_lib[x]["precursor_mz"] for x in match_inds
233                        ]
234                        ion_types = [fe_lib[x]["ion_type"] for x in match_inds]
235                        overall_results_dict[scan_oi][precursor_mz] = {
236                            "ref_mol_id": ref_mol_ids,
237                            "ref_ms_id": ref_ms_ids,
238                            "ref_precursor_mz": ref_precursor_mzs,
239                            "precursor_mz_error_ppm": [
240                                (precursor_mz - x) / precursor_mz * 10**6
241                                for x in ref_precursor_mzs
242                            ],
243                            "entropy_similarity": match_scores,
244                            "ref_ion_type": ion_types,
245                        }
246                        # Add database name, if present
247                        db_name = [
248                            fe_lib[x].get("database_name") for x in match_inds
249                        ]
250                        if db_name is not None:
251                            overall_results_dict[scan_oi][precursor_mz].update(
252                                {"database_name": db_name}
253                            )
254                        if get_additional_metrics:
255                            more_match_quals = [
256                                self.get_more_match_quals(
257                                    self._ms[scan_oi].mz_exp,
258                                    fe_lib[x],
259                                    mz_tol_da=peak_sep_da,
260                                    include_fragment_types=include_fragment_types,
261                                )
262                                for x in match_inds
263                            ]
264                            overall_results_dict[scan_oi][precursor_mz].update(
265                                {
266                                    "query_mz_in_ref_n": [
267                                        x[0] for x in more_match_quals
268                                    ],
269                                    "query_mz_in_ref_fract": [
270                                        x[1] for x in more_match_quals
271                                    ],
272                                    "ref_mz_in_query_n": [
273                                        x[2] for x in more_match_quals
274                                    ],
275                                    "ref_mz_in_query_fract": [
276                                        x[3] for x in more_match_quals
277                                    ],
278                                }
279                            )
280                            if include_fragment_types:
281                                overall_results_dict[scan_oi][precursor_mz].update(
282                                    {
283                                        "query_frag_types": [
284                                            x[4] for x in more_match_quals
285                                        ],
286                                        "ref_frag_types": [
287                                            x[5] for x in more_match_quals
288                                        ],
289                                    }
290                                )
291
292        # Drop scans with no results from dictionary
293        overall_results_dict = {k: v for k, v in overall_results_dict.items() if v}
294
295        # Cast each entry as a MS2SearchResults object
296        for scan_id in overall_results_dict.keys():
297            for precursor_mz in overall_results_dict[scan_id].keys():
298                ms2_spectrum = self._ms[scan_id]
299                ms2_search_results = overall_results_dict[scan_id][precursor_mz]
300                overall_results_dict[scan_id][precursor_mz] = SpectrumSearchResults(
301                    ms2_spectrum, precursor_mz, ms2_search_results
302                )
303
304        # Add MS2SearchResults to the existing spectral search results dictionary
305        self.spectral_search_results.update(overall_results_dict)
306
307        # If there are mass features, associate the results with each mass feature
308        if len(self.mass_features) > 0:
309            for mass_feature_id, mass_feature in self.mass_features.items():
310                scan_ids = mass_feature.ms2_scan_numbers
311                for ms2_scan_id in scan_ids:
312                    precursor_mz = mass_feature.mz
313                    try:
314                        self.spectral_search_results[ms2_scan_id][precursor_mz]
315                    except KeyError:
316                        pass
317                    else:
318                        self.mass_features[
319                            mass_feature_id
320                        ].ms2_similarity_results.append(
321                            self.spectral_search_results[ms2_scan_id][precursor_mz]
322                        )

Search LCMS spectra using a FlashEntropy approach.

Parameters

scan_list (list): List of scan numbers to search.
fe_lib (~ms_entropy.FlashEntropySearch): FlashEntropy Search instance.
precursor_mz_list (list, optional): List of precursor m/z values to search, by default [], which implies matched with mass features; to enable this use_mass_features must be True.
use_mass_features (bool, optional): If True, use mass features to get precursor m/z values, by default True. If True, will add search results to mass features' ms2_similarity_results attribute.
peak_sep_da (float, optional): Minimum separation between m/z peaks of a spectrum in Da. This needs to match the approximate resolution of the search spectra and the FlashEntropySearch instance, by default 0.01.
get_additional_metrics (bool, optional): If True, get additional metrics from FlashEntropy search, by default True.

Returns

None, but adds results to self.spectral_search_results and associates these
spectral_search_results with mass_features within the self.mass_features dictionary.