corems.molecular_id.search.lcms_spectral_search
import re

import numpy as np

from corems.molecular_id.factory.spectrum_search_results import SpectrumSearchResults


class LCMSSpectralSearch:
    """
    Methods for searching LCMS spectra.

    This class is designed to be a mixin class for the
    :obj:`~corems.mass_spectra.factory.lc_class.LCMSBase` class.

    """

    @staticmethod
    def get_more_match_quals(
        query_mz_arr, lib_entry, mz_tol_da=0.1, include_fragment_types=False
    ):
        """
        Return additional match qualities between query and library entry.

        Parameters
        ----------
        query_mz_arr : np.array
            m/z values of the query spectrum. Either a 1-D array of m/z values
            or an array of shape (N, 2) with m/z in the first column and
            abundance in the second (only the m/z column is used).
        lib_entry : dict
            Library spectrum entry. The spectrum is read from the 'mz' key
            (string in the format (mz, abundance),(mz, abundance), i.e. from
            MetabRef) or, if that key is absent, from the 'peaks' key (array
            with m/z in the first column).
        mz_tol_da : float, optional
            Absolute tolerance in Da for matching peaks (in MS2). Default is 0.1.
        include_fragment_types : bool, optional
            If True, include fragment type comparisons in output.
            Defaults to False.

        Returns
        -------
        tuple
            Tuple of (query_in_lib, query_in_lib_fract, lib_in_query,
            lib_in_query_fract, query_frags, lib_frags).

        Notes
        -----
        query_in_lib : int
            Number of peaks in query that are present in the library entry (within mz_tol_da).
        query_in_lib_fract : float
            Fraction of peaks in query that are present in the library entry (within mz_tol_da).
            0.0 if the query has no peaks.
        lib_in_query : int
            Number of peaks in the library entry that are present in the query (within mz_tol_da).
        lib_in_query_fract : float
            Fraction of peaks in the library entry that are present in the query (within mz_tol_da).
            0.0 if the library entry has no peaks.
        query_frags : list or None
            Unique fragment types of the library peaks that are present in the query,
            generally 'MLF' or 'LSF' or both. None if include_fragment_types is False.
        lib_frags : list or None
            Unique fragment types present in the library entry, generally 'MLF' or 'LSF'
            or both. None if include_fragment_types is False.

        Raises
        ------
        ValueError
            If the library entry has neither an 'mz' nor a non-None 'peaks' spectrum, or
            if it does not have a 'fragment_types' key and include_fragment_types is True.

        """
        if "mz" in lib_entry:
            # Get the original mz values from the library entry
            lib_mzs = np.array(
                re.findall(r"\(([^,]+),([^)]+)\)", lib_entry["mz"]), dtype=float
            ).reshape(-1, 2)[:, 0]
        elif lib_entry.get("peaks") is not None:
            lib_mzs = np.asarray(lib_entry["peaks"], dtype=float)[:, 0]
        else:
            raise ValueError(
                "Library entry must have an 'mz' or 'peaks' spectrum to compare peaks."
            )

        query_mzs = np.asarray(query_mz_arr, dtype=float)
        if query_mzs.ndim == 2:
            # (N, 2) spectrum: keep only the m/z column
            query_mzs = query_mzs[:, 0]

        # Pairwise comparison, rows = library peaks, columns = query peaks.
        # A plain absolute difference is used on purpose: np.isclose would add
        # its default relative tolerance (1e-5 * m/z) on top of mz_tol_da.
        within_tol = np.abs(lib_mzs[:, None] - query_mzs[None, :]) <= mz_tol_da
        lib_matched = within_tol.any(axis=1)

        # Get count and fraction of peaks in query that are in lib entry
        query_in_lib = int(within_tol.any(axis=0).sum())
        query_in_lib_fract = query_in_lib / query_mzs.size if query_mzs.size else 0.0

        # Get count and fraction of peaks in lib that are in query
        lib_in_query = int(lib_matched.sum())
        lib_in_query_fract = lib_in_query / lib_mzs.size if lib_mzs.size else 0.0

        if include_fragment_types:
            # Check that fragment types are present in the library entry
            if "fragment_types" not in lib_entry:
                raise ValueError(
                    "Flash entropy library entry must have 'fragment_types' key to include fragment types in output."
                )

            lib_frag_types = lib_entry["fragment_types"]
            # dict.fromkeys de-duplicates while keeping first-seen order, so the
            # output does not depend on hash ordering
            query_frags = list(
                dict.fromkeys(
                    lib_frag_types[ind]
                    for ind, matched in enumerate(lib_matched)
                    if matched
                )
            )
            lib_frags = list(dict.fromkeys(lib_frag_types))
        else:
            query_frags = None
            lib_frags = None

        return (
            query_in_lib,
            query_in_lib_fract,
            lib_in_query,
            lib_in_query_fract,
            query_frags,
            lib_frags,
        )

    def fe_search(
        self,
        scan_list,
        fe_lib,
        precursor_mz_list=None,
        use_mass_features=True,
        peak_sep_da=0.01,
        get_additional_metrics=True,
    ):
        """
        Search LCMS spectra using a FlashEntropy approach.

        Parameters
        ----------
        scan_list : list
            List of scan numbers to search.
        fe_lib : :obj:`~ms_entropy.FlashEntropySearch`
            FlashEntropy Search instance.
        precursor_mz_list : list, optional
            List (one item per scan) of lists of precursor m/z values to search.
            By default None, which is treated as an empty list and implies the
            precursors are taken from the mass features; to enable this
            use_mass_features must be True.
        use_mass_features : bool, optional
            If True and precursor_mz_list is empty, use mass features to get
            precursor m/z values, by default True.
        peak_sep_da : float, optional
            Minimum separation between m/z peaks in spectra in Da. This needs to match the
            approximate resolution of the search spectra and the FlashEntropySearch
            instance, by default 0.01.
        get_additional_metrics : bool, optional
            If True, get additional metrics from FlashEntropy search, by default True.

        Returns
        -------
        None, but adds results to self.spectral_search_results and appends the results
        of this search to the ms2_similarity_results of every mass feature in
        self.mass_features whose MS2 scan number and m/z match a result.

        Raises
        ------
        ValueError
            If precursor_mz_list (after the optional mass feature lookup) does not
            have the same length as scan_list.

        """
        # Retrieve parameters from self
        # include_fragment_types should be used for lipid queries only, not general metabolomics
        include_fragment_types = self.parameters.lc_ms.include_fragment_types
        min_match_score = self.parameters.lc_ms.ms2_min_fe_score

        if precursor_mz_list is None:
            precursor_mz_list = []

        # If no precursors were given, take them from the mass features that
        # hold an MS2 spectrum for each scan
        if use_mass_features and len(precursor_mz_list) == 0:
            precursor_mz_list = [
                [
                    mass_feature.mz
                    for mass_feature in self.mass_features.values()
                    if scan in mass_feature.ms2_mass_spectra
                ]
                for scan in scan_list
            ]

        if len(precursor_mz_list) != len(scan_list):
            raise ValueError("Length of precursor_mz_list is not equal to scan_list.")

        # Loop through each query spectrum / precursor match and save ids of db spectrum that are decent matches
        overall_results_dict = {}
        for scan_oi, precursor_mzs in zip(scan_list, precursor_mz_list):
            ms2_spectrum = self._ms[scan_oi]
            if len(ms2_spectrum.mspeaks) == 0:
                continue
            # The threshold is divided by 100 before being handed to FlashEntropy
            noise_threshold = (
                ms2_spectrum.parameters.mass_spectrum.noise_threshold_min_relative_abundance
                / 100
            )
            query_mzs = ms2_spectrum.mz_exp
            # setdefault keeps earlier results if a scan is listed more than once
            scan_results = overall_results_dict.setdefault(scan_oi, {})
            for precursor_mz in precursor_mzs:
                query_spectrum = fe_lib.clean_spectrum_for_search(
                    precursor_mz=precursor_mz,
                    peaks=np.vstack((query_mzs, ms2_spectrum.abundance)).T,
                    precursor_ions_removal_da=None,
                    noise_threshold=noise_threshold,
                    min_ms2_difference_in_da=peak_sep_da,
                )
                search_results = fe_lib.search(
                    precursor_mz=precursor_mz,
                    peaks=query_spectrum,
                    ms1_tolerance_in_da=self.parameters.mass_spectrum[
                        "ms1"
                    ].molecular_search.max_ppm_error
                    * 10**-6
                    * precursor_mz,
                    ms2_tolerance_in_da=peak_sep_da * 0.5,
                    method={"identity"},
                    precursor_ions_removal_da=None,
                    noise_threshold=noise_threshold,
                    target="cpu",
                )["identity_search"]
                match_inds = np.where(search_results > min_match_score)[0]
                if len(match_inds) == 0:
                    continue

                # Decent matches were found, add them to the results dictionary
                matched_refs = [fe_lib[x] for x in match_inds]
                ref_precursor_mzs = [ref["precursor_mz"] for ref in matched_refs]
                results = {
                    "ref_mol_id": [ref["molecular_data_id"] for ref in matched_refs],
                    "ref_ms_id": [ref["id"] for ref in matched_refs],
                    "ref_precursor_mz": ref_precursor_mzs,
                    "precursor_mz_error_ppm": [
                        (precursor_mz - x) / precursor_mz * 10**6
                        for x in ref_precursor_mzs
                    ],
                    "entropy_similarity": search_results[match_inds],
                    "ref_ion_type": [ref["ion_type"] for ref in matched_refs],
                    # Always present; entries are None when the library has no name
                    "database_name": [
                        ref.get("database_name") for ref in matched_refs
                    ],
                }
                if get_additional_metrics:
                    more_match_quals = [
                        self.get_more_match_quals(
                            query_mzs,
                            ref,
                            mz_tol_da=peak_sep_da,
                            include_fragment_types=include_fragment_types,
                        )
                        for ref in matched_refs
                    ]
                    results.update(
                        {
                            "query_mz_in_ref_n": [x[0] for x in more_match_quals],
                            "query_mz_in_ref_fract": [x[1] for x in more_match_quals],
                            "ref_mz_in_query_n": [x[2] for x in more_match_quals],
                            "ref_mz_in_query_fract": [x[3] for x in more_match_quals],
                        }
                    )
                    if include_fragment_types:
                        results.update(
                            {
                                "query_frag_types": [x[4] for x in more_match_quals],
                                "ref_frag_types": [x[5] for x in more_match_quals],
                            }
                        )
                scan_results[precursor_mz] = results

        # Drop scans with no results and cast each entry as a SpectrumSearchResults object
        overall_results_dict = {
            scan_id: {
                precursor_mz: SpectrumSearchResults(
                    self._ms[scan_id], precursor_mz, ms2_search_results
                )
                for precursor_mz, ms2_search_results in scan_results.items()
            }
            for scan_id, scan_results in overall_results_dict.items()
            if scan_results
        }

        # Add the new results to the existing spectral search results dictionary
        self.spectral_search_results.update(overall_results_dict)

        # Associate only the results of this search with the mass features, so
        # results of earlier searches are not appended a second time
        if overall_results_dict:
            for mass_feature in self.mass_features.values():
                for ms2_scan_id in mass_feature.ms2_scan_numbers:
                    result = overall_results_dict.get(ms2_scan_id, {}).get(
                        mass_feature.mz
                    )
                    if result is not None:
                        mass_feature.ms2_similarity_results.append(result)
class LCMSSpectralSearch:
    """
    Methods for searching LCMS spectra.

    This class is designed to be a mixin class for the
    :obj:`~corems.mass_spectra.factory.lc_class.LCMSBase` class.

    """

    @staticmethod
    def get_more_match_quals(
        query_mz_arr, lib_entry, mz_tol_da=0.1, include_fragment_types=False
    ):
        """
        Return additional match qualities between query and library entry.

        Parameters
        ----------
        query_mz_arr : np.array
            m/z values of the query spectrum. Either a 1-D array of m/z values
            or an array of shape (N, 2) with m/z in the first column and
            abundance in the second (only the m/z column is used).
        lib_entry : dict
            Library spectrum entry. The spectrum is read from the 'mz' key
            (string in the format (mz, abundance),(mz, abundance), i.e. from
            MetabRef) or, if that key is absent, from the 'peaks' key (array
            with m/z in the first column).
        mz_tol_da : float, optional
            Absolute tolerance in Da for matching peaks (in MS2). Default is 0.1.
        include_fragment_types : bool, optional
            If True, include fragment type comparisons in output.
            Defaults to False.

        Returns
        -------
        tuple
            Tuple of (query_in_lib, query_in_lib_fract, lib_in_query,
            lib_in_query_fract, query_frags, lib_frags).

        Notes
        -----
        query_in_lib : int
            Number of peaks in query that are present in the library entry (within mz_tol_da).
        query_in_lib_fract : float
            Fraction of peaks in query that are present in the library entry (within mz_tol_da).
            0.0 if the query has no peaks.
        lib_in_query : int
            Number of peaks in the library entry that are present in the query (within mz_tol_da).
        lib_in_query_fract : float
            Fraction of peaks in the library entry that are present in the query (within mz_tol_da).
            0.0 if the library entry has no peaks.
        query_frags : list or None
            Unique fragment types of the library peaks that are present in the query,
            generally 'MLF' or 'LSF' or both. None if include_fragment_types is False.
        lib_frags : list or None
            Unique fragment types present in the library entry, generally 'MLF' or 'LSF'
            or both. None if include_fragment_types is False.

        Raises
        ------
        ValueError
            If the library entry has neither an 'mz' nor a non-None 'peaks' spectrum, or
            if it does not have a 'fragment_types' key and include_fragment_types is True.

        """
        if "mz" in lib_entry:
            # Get the original mz values from the library entry
            lib_mzs = np.array(
                re.findall(r"\(([^,]+),([^)]+)\)", lib_entry["mz"]), dtype=float
            ).reshape(-1, 2)[:, 0]
        elif lib_entry.get("peaks") is not None:
            lib_mzs = np.asarray(lib_entry["peaks"], dtype=float)[:, 0]
        else:
            raise ValueError(
                "Library entry must have an 'mz' or 'peaks' spectrum to compare peaks."
            )

        query_mzs = np.asarray(query_mz_arr, dtype=float)
        if query_mzs.ndim == 2:
            # (N, 2) spectrum: keep only the m/z column
            query_mzs = query_mzs[:, 0]

        # Pairwise comparison, rows = library peaks, columns = query peaks.
        # A plain absolute difference is used on purpose: np.isclose would add
        # its default relative tolerance (1e-5 * m/z) on top of mz_tol_da.
        within_tol = np.abs(lib_mzs[:, None] - query_mzs[None, :]) <= mz_tol_da
        lib_matched = within_tol.any(axis=1)

        # Get count and fraction of peaks in query that are in lib entry
        query_in_lib = int(within_tol.any(axis=0).sum())
        query_in_lib_fract = query_in_lib / query_mzs.size if query_mzs.size else 0.0

        # Get count and fraction of peaks in lib that are in query
        lib_in_query = int(lib_matched.sum())
        lib_in_query_fract = lib_in_query / lib_mzs.size if lib_mzs.size else 0.0

        if include_fragment_types:
            # Check that fragment types are present in the library entry
            if "fragment_types" not in lib_entry:
                raise ValueError(
                    "Flash entropy library entry must have 'fragment_types' key to include fragment types in output."
                )

            lib_frag_types = lib_entry["fragment_types"]
            # dict.fromkeys de-duplicates while keeping first-seen order, so the
            # output does not depend on hash ordering
            query_frags = list(
                dict.fromkeys(
                    lib_frag_types[ind]
                    for ind, matched in enumerate(lib_matched)
                    if matched
                )
            )
            lib_frags = list(dict.fromkeys(lib_frag_types))
        else:
            query_frags = None
            lib_frags = None

        return (
            query_in_lib,
            query_in_lib_fract,
            lib_in_query,
            lib_in_query_fract,
            query_frags,
            lib_frags,
        )

    def fe_search(
        self,
        scan_list,
        fe_lib,
        precursor_mz_list=None,
        use_mass_features=True,
        peak_sep_da=0.01,
        get_additional_metrics=True,
    ):
        """
        Search LCMS spectra using a FlashEntropy approach.

        Parameters
        ----------
        scan_list : list
            List of scan numbers to search.
        fe_lib : :obj:`~ms_entropy.FlashEntropySearch`
            FlashEntropy Search instance.
        precursor_mz_list : list, optional
            List (one item per scan) of lists of precursor m/z values to search.
            By default None, which is treated as an empty list and implies the
            precursors are taken from the mass features; to enable this
            use_mass_features must be True.
        use_mass_features : bool, optional
            If True and precursor_mz_list is empty, use mass features to get
            precursor m/z values, by default True.
        peak_sep_da : float, optional
            Minimum separation between m/z peaks in spectra in Da. This needs to match the
            approximate resolution of the search spectra and the FlashEntropySearch
            instance, by default 0.01.
        get_additional_metrics : bool, optional
            If True, get additional metrics from FlashEntropy search, by default True.

        Returns
        -------
        None, but adds results to self.spectral_search_results and appends the results
        of this search to the ms2_similarity_results of every mass feature in
        self.mass_features whose MS2 scan number and m/z match a result.

        Raises
        ------
        ValueError
            If precursor_mz_list (after the optional mass feature lookup) does not
            have the same length as scan_list.

        """
        # Retrieve parameters from self
        # include_fragment_types should be used for lipid queries only, not general metabolomics
        include_fragment_types = self.parameters.lc_ms.include_fragment_types
        min_match_score = self.parameters.lc_ms.ms2_min_fe_score

        if precursor_mz_list is None:
            precursor_mz_list = []

        # If no precursors were given, take them from the mass features that
        # hold an MS2 spectrum for each scan
        if use_mass_features and len(precursor_mz_list) == 0:
            precursor_mz_list = [
                [
                    mass_feature.mz
                    for mass_feature in self.mass_features.values()
                    if scan in mass_feature.ms2_mass_spectra
                ]
                for scan in scan_list
            ]

        if len(precursor_mz_list) != len(scan_list):
            raise ValueError("Length of precursor_mz_list is not equal to scan_list.")

        # Loop through each query spectrum / precursor match and save ids of db spectrum that are decent matches
        overall_results_dict = {}
        for scan_oi, precursor_mzs in zip(scan_list, precursor_mz_list):
            ms2_spectrum = self._ms[scan_oi]
            if len(ms2_spectrum.mspeaks) == 0:
                continue
            # The threshold is divided by 100 before being handed to FlashEntropy
            noise_threshold = (
                ms2_spectrum.parameters.mass_spectrum.noise_threshold_min_relative_abundance
                / 100
            )
            query_mzs = ms2_spectrum.mz_exp
            # setdefault keeps earlier results if a scan is listed more than once
            scan_results = overall_results_dict.setdefault(scan_oi, {})
            for precursor_mz in precursor_mzs:
                query_spectrum = fe_lib.clean_spectrum_for_search(
                    precursor_mz=precursor_mz,
                    peaks=np.vstack((query_mzs, ms2_spectrum.abundance)).T,
                    precursor_ions_removal_da=None,
                    noise_threshold=noise_threshold,
                    min_ms2_difference_in_da=peak_sep_da,
                )
                search_results = fe_lib.search(
                    precursor_mz=precursor_mz,
                    peaks=query_spectrum,
                    ms1_tolerance_in_da=self.parameters.mass_spectrum[
                        "ms1"
                    ].molecular_search.max_ppm_error
                    * 10**-6
                    * precursor_mz,
                    ms2_tolerance_in_da=peak_sep_da * 0.5,
                    method={"identity"},
                    precursor_ions_removal_da=None,
                    noise_threshold=noise_threshold,
                    target="cpu",
                )["identity_search"]
                match_inds = np.where(search_results > min_match_score)[0]
                if len(match_inds) == 0:
                    continue

                # Decent matches were found, add them to the results dictionary
                matched_refs = [fe_lib[x] for x in match_inds]
                ref_precursor_mzs = [ref["precursor_mz"] for ref in matched_refs]
                results = {
                    "ref_mol_id": [ref["molecular_data_id"] for ref in matched_refs],
                    "ref_ms_id": [ref["id"] for ref in matched_refs],
                    "ref_precursor_mz": ref_precursor_mzs,
                    "precursor_mz_error_ppm": [
                        (precursor_mz - x) / precursor_mz * 10**6
                        for x in ref_precursor_mzs
                    ],
                    "entropy_similarity": search_results[match_inds],
                    "ref_ion_type": [ref["ion_type"] for ref in matched_refs],
                    # Always present; entries are None when the library has no name
                    "database_name": [
                        ref.get("database_name") for ref in matched_refs
                    ],
                }
                if get_additional_metrics:
                    more_match_quals = [
                        self.get_more_match_quals(
                            query_mzs,
                            ref,
                            mz_tol_da=peak_sep_da,
                            include_fragment_types=include_fragment_types,
                        )
                        for ref in matched_refs
                    ]
                    results.update(
                        {
                            "query_mz_in_ref_n": [x[0] for x in more_match_quals],
                            "query_mz_in_ref_fract": [x[1] for x in more_match_quals],
                            "ref_mz_in_query_n": [x[2] for x in more_match_quals],
                            "ref_mz_in_query_fract": [x[3] for x in more_match_quals],
                        }
                    )
                    if include_fragment_types:
                        results.update(
                            {
                                "query_frag_types": [x[4] for x in more_match_quals],
                                "ref_frag_types": [x[5] for x in more_match_quals],
                            }
                        )
                scan_results[precursor_mz] = results

        # Drop scans with no results and cast each entry as a SpectrumSearchResults object
        overall_results_dict = {
            scan_id: {
                precursor_mz: SpectrumSearchResults(
                    self._ms[scan_id], precursor_mz, ms2_search_results
                )
                for precursor_mz, ms2_search_results in scan_results.items()
            }
            for scan_id, scan_results in overall_results_dict.items()
            if scan_results
        }

        # Add the new results to the existing spectral search results dictionary
        self.spectral_search_results.update(overall_results_dict)

        # Associate only the results of this search with the mass features, so
        # results of earlier searches are not appended a second time
        if overall_results_dict:
            for mass_feature in self.mass_features.values():
                for ms2_scan_id in mass_feature.ms2_scan_numbers:
                    result = overall_results_dict.get(ms2_scan_id, {}).get(
                        mass_feature.mz
                    )
                    if result is not None:
                        mass_feature.ms2_similarity_results.append(result)
Methods for searching LCMS spectra.
This class is designed to be a mixin class for the corems.mass_spectra.factory.lc_class.LCMSBase class.
@staticmethod
def get_more_match_quals(
    query_mz_arr, lib_entry, mz_tol_da=0.1, include_fragment_types=False
):
    """
    Return additional match qualities between query and library entry.

    Parameters
    ----------
    query_mz_arr : np.array
        m/z values of the query spectrum. Either a 1-D array of m/z values
        or an array of shape (N, 2) with m/z in the first column and
        abundance in the second (only the m/z column is used).
    lib_entry : dict
        Library spectrum entry. The spectrum is read from the 'mz' key
        (string in the format (mz, abundance),(mz, abundance), i.e. from
        MetabRef) or, if that key is absent, from the 'peaks' key (array
        with m/z in the first column).
    mz_tol_da : float, optional
        Absolute tolerance in Da for matching peaks (in MS2). Default is 0.1.
    include_fragment_types : bool, optional
        If True, include fragment type comparisons in output.
        Defaults to False.

    Returns
    -------
    tuple
        Tuple of (query_in_lib, query_in_lib_fract, lib_in_query,
        lib_in_query_fract, query_frags, lib_frags).

    Notes
    -----
    query_in_lib : int
        Number of peaks in query that are present in the library entry (within mz_tol_da).
    query_in_lib_fract : float
        Fraction of peaks in query that are present in the library entry (within mz_tol_da).
        0.0 if the query has no peaks.
    lib_in_query : int
        Number of peaks in the library entry that are present in the query (within mz_tol_da).
    lib_in_query_fract : float
        Fraction of peaks in the library entry that are present in the query (within mz_tol_da).
        0.0 if the library entry has no peaks.
    query_frags : list or None
        Unique fragment types of the library peaks that are present in the query,
        generally 'MLF' or 'LSF' or both. None if include_fragment_types is False.
    lib_frags : list or None
        Unique fragment types present in the library entry, generally 'MLF' or 'LSF'
        or both. None if include_fragment_types is False.

    Raises
    ------
    ValueError
        If the library entry has neither an 'mz' nor a non-None 'peaks' spectrum, or
        if it does not have a 'fragment_types' key and include_fragment_types is True.

    """
    if "mz" in lib_entry:
        # Get the original mz values from the library entry
        lib_mzs = np.array(
            re.findall(r"\(([^,]+),([^)]+)\)", lib_entry["mz"]), dtype=float
        ).reshape(-1, 2)[:, 0]
    elif lib_entry.get("peaks") is not None:
        lib_mzs = np.asarray(lib_entry["peaks"], dtype=float)[:, 0]
    else:
        raise ValueError(
            "Library entry must have an 'mz' or 'peaks' spectrum to compare peaks."
        )

    query_mzs = np.asarray(query_mz_arr, dtype=float)
    if query_mzs.ndim == 2:
        # (N, 2) spectrum: keep only the m/z column
        query_mzs = query_mzs[:, 0]

    # Pairwise comparison, rows = library peaks, columns = query peaks.
    # A plain absolute difference is used on purpose: np.isclose would add
    # its default relative tolerance (1e-5 * m/z) on top of mz_tol_da.
    within_tol = np.abs(lib_mzs[:, None] - query_mzs[None, :]) <= mz_tol_da
    lib_matched = within_tol.any(axis=1)

    # Get count and fraction of peaks in query that are in lib entry
    query_in_lib = int(within_tol.any(axis=0).sum())
    query_in_lib_fract = query_in_lib / query_mzs.size if query_mzs.size else 0.0

    # Get count and fraction of peaks in lib that are in query
    lib_in_query = int(lib_matched.sum())
    lib_in_query_fract = lib_in_query / lib_mzs.size if lib_mzs.size else 0.0

    if include_fragment_types:
        # Check that fragment types are present in the library entry
        if "fragment_types" not in lib_entry:
            raise ValueError(
                "Flash entropy library entry must have 'fragment_types' key to include fragment types in output."
            )

        lib_frag_types = lib_entry["fragment_types"]
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # output does not depend on hash ordering
        query_frags = list(
            dict.fromkeys(
                lib_frag_types[ind]
                for ind, matched in enumerate(lib_matched)
                if matched
            )
        )
        lib_frags = list(dict.fromkeys(lib_frag_types))
    else:
        query_frags = None
        lib_frags = None

    return (
        query_in_lib,
        query_in_lib_fract,
        lib_in_query,
        lib_in_query_fract,
        query_frags,
        lib_frags,
    )
Return additional match qualities between query and library entry.
Parameters
- query_mz_arr (np.array): Array of m/z values of the query spectrum (fe_search passes the 1-D mz_exp array of the scan).
- lib_entry (dict): Library spectrum entry, with 'mz' key containing the spectrum in the format (mz, abundance),(mz, abundance), i.e. from MetabRef, or a 'peaks' key containing an array with m/z in the first column.
- mz_tol_da (float, optional): Tolerance in Da for matching peaks (in MS2). Default is 0.1.
- include_fragment_types (bool, optional): If True, include fragment type comparisons in output. Defaults to False.
Returns
- tuple: Tuple of (query_in_lib, query_in_lib_fract, lib_in_query, lib_in_query_fract, query_frags, lib_frags).
Notes
- query_in_lib (int): Number of peaks in query that are present in the library entry (within mz_tol_da).
- query_in_lib_fract (float): Fraction of peaks in query that are present in the library entry (within mz_tol_da).
- lib_in_query (int): Number of peaks in the library entry that are present in the query (within mz_tol_da).
- lib_in_query_fract (float): Fraction of peaks in the library entry that are present in the query (within mz_tol_da).
- query_frags (list): List of unique fragment types present in the query, generally 'MLF' or 'LSF' or both.
- lib_frags (list): List of unique fragment types present in the library entry, generally 'MLF' or 'LSF' or both.
Raises
- ValueError: If library entry does not have 'fragment_types' key and include_fragment_types is True.
def fe_search(
    self,
    scan_list,
    fe_lib,
    precursor_mz_list=None,
    use_mass_features=True,
    peak_sep_da=0.01,
    get_additional_metrics=True,
):
    """
    Search LCMS spectra using a FlashEntropy approach.

    Parameters
    ----------
    scan_list : list
        List of scan numbers to search.
    fe_lib : :obj:`~ms_entropy.FlashEntropySearch`
        FlashEntropy Search instance.
    precursor_mz_list : list, optional
        List (one item per scan) of lists of precursor m/z values to search.
        By default None, which is treated as an empty list and implies the
        precursors are taken from the mass features; to enable this
        use_mass_features must be True.
    use_mass_features : bool, optional
        If True and precursor_mz_list is empty, use mass features to get
        precursor m/z values, by default True.
    peak_sep_da : float, optional
        Minimum separation between m/z peaks in spectra in Da. This needs to match the
        approximate resolution of the search spectra and the FlashEntropySearch
        instance, by default 0.01.
    get_additional_metrics : bool, optional
        If True, get additional metrics from FlashEntropy search, by default True.

    Returns
    -------
    None, but adds results to self.spectral_search_results and appends the results
    of this search to the ms2_similarity_results of every mass feature in
    self.mass_features whose MS2 scan number and m/z match a result.

    Raises
    ------
    ValueError
        If precursor_mz_list (after the optional mass feature lookup) does not
        have the same length as scan_list.

    """
    # Retrieve parameters from self
    # include_fragment_types should be used for lipid queries only, not general metabolomics
    include_fragment_types = self.parameters.lc_ms.include_fragment_types
    min_match_score = self.parameters.lc_ms.ms2_min_fe_score

    if precursor_mz_list is None:
        precursor_mz_list = []

    # If no precursors were given, take them from the mass features that
    # hold an MS2 spectrum for each scan
    if use_mass_features and len(precursor_mz_list) == 0:
        precursor_mz_list = [
            [
                mass_feature.mz
                for mass_feature in self.mass_features.values()
                if scan in mass_feature.ms2_mass_spectra
            ]
            for scan in scan_list
        ]

    if len(precursor_mz_list) != len(scan_list):
        raise ValueError("Length of precursor_mz_list is not equal to scan_list.")

    # Loop through each query spectrum / precursor match and save ids of db spectrum that are decent matches
    overall_results_dict = {}
    for scan_oi, precursor_mzs in zip(scan_list, precursor_mz_list):
        ms2_spectrum = self._ms[scan_oi]
        if len(ms2_spectrum.mspeaks) == 0:
            continue
        # The threshold is divided by 100 before being handed to FlashEntropy
        noise_threshold = (
            ms2_spectrum.parameters.mass_spectrum.noise_threshold_min_relative_abundance
            / 100
        )
        query_mzs = ms2_spectrum.mz_exp
        # setdefault keeps earlier results if a scan is listed more than once
        scan_results = overall_results_dict.setdefault(scan_oi, {})
        for precursor_mz in precursor_mzs:
            query_spectrum = fe_lib.clean_spectrum_for_search(
                precursor_mz=precursor_mz,
                peaks=np.vstack((query_mzs, ms2_spectrum.abundance)).T,
                precursor_ions_removal_da=None,
                noise_threshold=noise_threshold,
                min_ms2_difference_in_da=peak_sep_da,
            )
            search_results = fe_lib.search(
                precursor_mz=precursor_mz,
                peaks=query_spectrum,
                ms1_tolerance_in_da=self.parameters.mass_spectrum[
                    "ms1"
                ].molecular_search.max_ppm_error
                * 10**-6
                * precursor_mz,
                ms2_tolerance_in_da=peak_sep_da * 0.5,
                method={"identity"},
                precursor_ions_removal_da=None,
                noise_threshold=noise_threshold,
                target="cpu",
            )["identity_search"]
            match_inds = np.where(search_results > min_match_score)[0]
            if len(match_inds) == 0:
                continue

            # Decent matches were found, add them to the results dictionary
            matched_refs = [fe_lib[x] for x in match_inds]
            ref_precursor_mzs = [ref["precursor_mz"] for ref in matched_refs]
            results = {
                "ref_mol_id": [ref["molecular_data_id"] for ref in matched_refs],
                "ref_ms_id": [ref["id"] for ref in matched_refs],
                "ref_precursor_mz": ref_precursor_mzs,
                "precursor_mz_error_ppm": [
                    (precursor_mz - x) / precursor_mz * 10**6
                    for x in ref_precursor_mzs
                ],
                "entropy_similarity": search_results[match_inds],
                "ref_ion_type": [ref["ion_type"] for ref in matched_refs],
                # Always present; entries are None when the library has no name
                "database_name": [
                    ref.get("database_name") for ref in matched_refs
                ],
            }
            if get_additional_metrics:
                more_match_quals = [
                    self.get_more_match_quals(
                        query_mzs,
                        ref,
                        mz_tol_da=peak_sep_da,
                        include_fragment_types=include_fragment_types,
                    )
                    for ref in matched_refs
                ]
                results.update(
                    {
                        "query_mz_in_ref_n": [x[0] for x in more_match_quals],
                        "query_mz_in_ref_fract": [x[1] for x in more_match_quals],
                        "ref_mz_in_query_n": [x[2] for x in more_match_quals],
                        "ref_mz_in_query_fract": [x[3] for x in more_match_quals],
                    }
                )
                if include_fragment_types:
                    results.update(
                        {
                            "query_frag_types": [x[4] for x in more_match_quals],
                            "ref_frag_types": [x[5] for x in more_match_quals],
                        }
                    )
            scan_results[precursor_mz] = results

    # Drop scans with no results and cast each entry as a SpectrumSearchResults object
    overall_results_dict = {
        scan_id: {
            precursor_mz: SpectrumSearchResults(
                self._ms[scan_id], precursor_mz, ms2_search_results
            )
            for precursor_mz, ms2_search_results in scan_results.items()
        }
        for scan_id, scan_results in overall_results_dict.items()
        if scan_results
    }

    # Add the new results to the existing spectral search results dictionary
    self.spectral_search_results.update(overall_results_dict)

    # Associate only the results of this search with the mass features, so
    # results of earlier searches are not appended a second time
    if overall_results_dict:
        for mass_feature in self.mass_features.values():
            for ms2_scan_id in mass_feature.ms2_scan_numbers:
                result = overall_results_dict.get(ms2_scan_id, {}).get(
                    mass_feature.mz
                )
                if result is not None:
                    mass_feature.ms2_similarity_results.append(result)
Search LCMS spectra using a FlashEntropy approach.
Parameters
- scan_list (list): List of scan numbers to search.
- fe_lib (ms_entropy.FlashEntropySearch): FlashEntropy Search instance.
- precursor_mz_list (list, optional): List of precursor m/z values to search, by default [], which implies matched with mass features; to enable this use_mass_features must be True.
- use_mass_features (bool, optional): If True, use mass features to get precursor m/z values, by default True. If True, will add search results to mass features' ms2_similarity_results attribute.
- peak_sep_da (float, optional): Minimum separation between m/z peaks in spectra in Da. This needs to match the approximate resolution of the search spectra and the FlashEntropySearch instance, by default 0.01.
- get_additional_metrics (bool, optional): If True, get additional metrics from FlashEntropy search, by default True.
Returns
- None, but adds results to self.spectral_search_results and associates these spectral_search_results with mass_features within the self.mass_features dictionary.