corems.mass_spectra.calc.lc_calc
1import numpy as np 2import pandas as pd 3from ripser import ripser 4from scipy import sparse 5from scipy.spatial import KDTree 6 7from corems.chroma_peak.factory.chroma_peak_classes import LCMSMassFeature 8from corems.mass_spectra.calc import SignalProcessing as sp 9from corems.mass_spectra.factory.chromat_data import EIC_Data 10from corems.mass_spectrum.input.numpyArray import ms_from_array_profile 11 12 13def find_closest(A, target): 14 """Find the index of closest value in A to each value in target. 15 16 Parameters 17 ---------- 18 A : :obj:`~numpy.array` 19 The array to search (blueprint). A must be sorted. 20 target : :obj:`~numpy.array` 21 The array of values to search for. target must be sorted. 22 23 Returns 24 ------- 25 :obj:`~numpy.array` 26 The indices of the closest values in A to each value in target. 27 """ 28 idx = A.searchsorted(target) 29 idx = np.clip(idx, 1, len(A) - 1) 30 left = A[idx - 1] 31 right = A[idx] 32 idx -= target - left < right - target 33 return idx 34 35 36class LCCalculations: 37 """Methods for performing LC calculations on mass spectra data. 38 39 Notes 40 ----- 41 This class is intended to be used as a mixin for the LCMSBase class. 42 43 Methods 44 ------- 45 * get_max_eic(eic_data). 46 Returns the maximum EIC value from the given EIC data. A static method. 47 * smooth_tic(tic). 48 Smooths the TIC data using the specified smoothing method and settings. 49 * eic_centroid_detector(rt, eic, max_eic). 50 Performs EIC centroid detection on the given EIC data. 51 * find_nearest_scan(rt). 52 Finds the nearest scan to the given retention time. 53 * get_average_mass_spectrum(scan_list, apex_scan, spectrum_mode="profile", ms_level=1, auto_process=True, use_parser=False, perform_checks=True, polarity=None). 54 Returns an averaged mass spectrum object. 55 * find_mass_features(ms_level=1). 56 Find regions of interest for a given MS level (default is MS1). 57 * integrate_mass_features(drop_if_fail=False, ms_level=1). 58 Integrate mass features of interest and extracts EICs. 59 * find_c13_mass_features(). 60 Evaluate mass features and mark likely C13 isotopes. 61 * deconvolute_ms1_mass_features(). 62 Deconvolute mass features' ms1 mass spectra. 63 """ 64 65 @staticmethod 66 def get_max_eic(eic_data: dict): 67 """Returns the maximum EIC value from the given EIC data. 68 69 Notes 70 ----- 71 This is a static method. 72 73 Parameters 74 ---------- 75 eic_data : dict 76 A dictionary containing EIC data. 77 78 Returns 79 ------- 80 float 81 The maximum EIC value. 82 """ 83 max_eic = 0 84 for eic_data in eic_data.values(): 85 ind_max_eic = max(eic_data.get("EIC")) 86 max_eic = ind_max_eic if ind_max_eic > max_eic else max_eic 87 88 return max_eic 89 90 def smooth_tic(self, tic): 91 """Smooths the TIC or EIC data using the specified smoothing method and settings. 92 93 Parameters 94 ---------- 95 tic : numpy.ndarray 96 The TIC (or EIC) data to be smoothed. 97 98 Returns 99 ------- 100 numpy.ndarray 101 The smoothed TIC data. 102 """ 103 implemented_smooth_method = self.parameters.lc_ms.implemented_smooth_method 104 105 pol_order = self.parameters.lc_ms.savgol_pol_order 106 107 window_len = self.parameters.lc_ms.smooth_window 108 109 window = self.parameters.lc_ms.smooth_method 110 111 return sp.smooth_signal( 112 tic, window_len, window, pol_order, implemented_smooth_method 113 ) 114 115 def eic_centroid_detector(self, rt, eic, max_eic, apex_indexes=[]): 116 """Performs EIC centroid detection on the given EIC data. 
117 118 Parameters 119 ---------- 120 rt : numpy.ndarray 121 The retention time data. 122 eic : numpy.ndarray 123 The EIC data. 124 max_eic : float 125 The maximum EIC value. 126 apex_indexes : list, optional 127 The apexes of the EIC peaks. Defaults to [], which means that the apexes will be calculated by the function. 128 129 Returns 130 ------- 131 numpy.ndarray 132 The indexes of left, apex, and right limits as a generator. 133 """ 134 135 max_prominence = self.parameters.lc_ms.peak_max_prominence_percent 136 137 max_height = self.parameters.lc_ms.peak_height_max_percent 138 139 signal_threshold = self.parameters.lc_ms.eic_signal_threshold 140 141 min_peak_datapoints = self.parameters.lc_ms.min_peak_datapoints 142 143 peak_derivative_threshold = self.parameters.lc_ms.peak_derivative_threshold 144 145 include_indexes = sp.peak_picking_first_derivative( 146 domain=rt, 147 signal=eic, 148 max_height=max_height, 149 max_prominence=max_prominence, 150 max_signal=max_eic, 151 min_peak_datapoints=min_peak_datapoints, 152 peak_derivative_threshold=peak_derivative_threshold, 153 signal_threshold=signal_threshold, 154 correct_baseline=False, 155 plot_res=False, 156 apex_indexes=apex_indexes, 157 ) 158 159 return include_indexes 160 161 def find_nearest_scan(self, rt): 162 """Finds the nearest scan to the given retention time. 163 164 Parameters 165 ---------- 166 rt : float 167 The retention time (in minutes) to find the nearest scan for. 168 169 Returns 170 ------- 171 int 172 The scan number of the nearest scan. 173 """ 174 array_rt = np.array(self.retention_time) 175 176 scan_index = (np.abs(array_rt - rt)).argmin() 177 178 real_scan = self.scans_number[scan_index] 179 180 return real_scan 181 182 def add_peak_metrics(self): 183 """Add peak metrics to the mass features. 184 185 This function calculates the peak metrics for each mass feature and adds them to the mass feature objects. 186 """ 187 # Check that at least some mass features have eic data 188 if not any([mf._eic_data is not None for mf in self.mass_features.values()]): 189 raise ValueError( 190 "No mass features have EIC data. Run integrate_mass_features first." 191 ) 192 193 for mass_feature in self.mass_features.values(): 194 # Check if the mass feature has been integrated 195 if mass_feature._eic_data is not None: 196 # Calculate peak metrics 197 mass_feature.calc_half_height_width() 198 mass_feature.calc_tailing_factor() 199 mass_feature.calc_dispersity_index() 200 201 def get_average_mass_spectrum( 202 self, 203 scan_list, 204 apex_scan, 205 spectrum_mode="profile", 206 ms_level=1, 207 auto_process=True, 208 use_parser=False, 209 perform_checks=True, 210 polarity=None, 211 ms_params=None, 212 ): 213 """Returns an averaged mass spectrum object 214 215 Parameters 216 ---------- 217 scan_list : list 218 List of scan numbers to average. 219 apex_scan : int 220 Number of the apex scan 221 spectrum_mode : str, optional 222 The spectrum mode to use. Defaults to "profile". Not that only "profile" mode is supported for averaging. 223 ms_level : int, optional 224 The MS level to use. Defaults to 1. 225 auto_process : bool, optional 226 If True, the averaged mass spectrum will be auto-processed. Defaults to True. 227 use_parser : bool, optional 228 If True, the mass spectra will be obtained from the parser. Defaults to False. 229 perform_checks : bool, optional 230 If True, the function will check if the data are within the ms_unprocessed dictionary and are the correct mode. Defaults to True. 
Only set to False if you are sure the data are profile, and (if not using the parser) are in the ms_unprocessed dictionary! ms_unprocessed dictionary also must be indexed on scan 231 polarity : int, optional 232 The polarity of the mass spectra (1 or -1). If not set, the polarity will be determined from the dataset. Defaults to None. (fastest if set to -1 or 1) 233 ms_params : MSParameters, optional 234 The mass spectrum parameters to use. If not set (None), the globally set parameters will be used. Defaults to None. 235 236 Returns 237 ------- 238 MassSpectrumProfile 239 The averaged mass spectrum object. 240 241 Raises 242 ------ 243 ValueError 244 If the spectrum mode is not "profile". 245 If the MS level is not found in the unprocessed mass spectra dictionary. 246 If not all scan numbers are found in the unprocessed mass spectra dictionary. 247 """ 248 if perform_checks: 249 if spectrum_mode != "profile": 250 raise ValueError("Averaging only supported for profile mode") 251 252 if polarity is None: 253 # set polarity to -1 if negative mode, 1 if positive mode (for mass spectrum creation) 254 if self.polarity == "negative": 255 polarity = -1 256 elif self.polarity == "positive": 257 polarity = 1 258 else: 259 raise ValueError( 260 "Polarity not set for dataset, must be a set containing either 'positive' or 'negative'" 261 ) 262 263 # if not using_parser, check that scan numbers are in _ms_unprocessed 264 if not use_parser: 265 if perform_checks: 266 # Set index to scan for faster lookup 267 ms_df = ( 268 self._ms_unprocessed[ms_level] 269 .copy() 270 .set_index("scan", drop=False) 271 .sort_index() 272 ) 273 my_ms_df = ms_df.loc[scan_list] 274 # Check that all scan numbers are in the ms_df 275 if not all(np.isin(scan_list, ms_df.index)): 276 raise ValueError( 277 "Not all scan numbers found in the unprocessed mass spectra dictionary" 278 ) 279 else: 280 my_ms_df = ( 281 pd.DataFrame({"scan": scan_list}) 282 .set_index("scan") 283 .join(self._ms_unprocessed[ms_level], how="left") 284 ) 285 286 if use_parser: 287 ms_list = [ 288 self.spectra_parser.get_mass_spectrum_from_scan( 289 x, spectrum_mode=spectrum_mode, auto_process=False 290 ) 291 for x in scan_list 292 ] 293 ms_mz = [x._mz_exp for x in ms_list] 294 ms_int = [x._abundance for x in ms_list] 295 my_ms_df = [] 296 for i in np.arange(len(ms_mz)): 297 my_ms_df.append( 298 pd.DataFrame( 299 {"mz": ms_mz[i], "intensity": ms_int[i], "scan": scan_list[i]} 300 ) 301 ) 302 my_ms_df = pd.concat(my_ms_df) 303 304 if not self.check_if_grid(my_ms_df): 305 my_ms_df = self.grid_data(my_ms_df) 306 307 my_ms_ave = my_ms_df.groupby("mz")["intensity"].sum().reset_index() 308 309 ms = ms_from_array_profile( 310 my_ms_ave.mz, 311 my_ms_ave.intensity, 312 self.file_location, 313 polarity=polarity, 314 auto_process=False, 315 ) 316 317 # Set the mass spectrum parameters, auto-process if auto_process is True, and add to the dataset 318 if ms is not None: 319 if ms_params is not None: 320 ms.parameters = ms_params 321 ms.scan_number = apex_scan 322 if auto_process: 323 ms.process_mass_spec() 324 return ms 325 326 def find_mass_features(self, ms_level=1, grid=True): 327 """Find mass features within an LCMSBase object 328 329 Note that this is a wrapper function that calls the find_mass_features_ph function, but can be extended to support other peak picking methods in the future. 330 331 Parameters 332 ---------- 333 ms_level : int, optional 334 The MS level to use for peak picking Default is 1. 
335 grid : bool, optional 336 If True, will regrid the data before running the persistent homology calculations (after checking if the data is gridded, used for persistent homology peak picking. Default is True. 337 338 Raises 339 ------ 340 ValueError 341 If no MS level data is found on the object. 342 If persistent homology peak picking is attempted on non-profile mode data. 343 If data is not gridded and grid is False. 344 If peak picking method is not implemented. 345 346 Returns 347 ------- 348 None, but assigns the mass_features and eics attributes to the object. 349 350 """ 351 pp_method = self.parameters.lc_ms.peak_picking_method 352 353 if pp_method == "persistent homology": 354 msx_scan_df = self.scan_df[self.scan_df["ms_level"] == ms_level] 355 if all(msx_scan_df["ms_format"] == "profile"): 356 self.find_mass_features_ph(ms_level=ms_level, grid=grid) 357 self.cluster_mass_features(drop_children=True, sort_by="persistence") 358 else: 359 raise ValueError( 360 "MS{} scans are not profile mode, which is required for persistent homology peak picking.".format( 361 ms_level 362 ) 363 ) 364 else: 365 raise ValueError("Peak picking method not implemented") 366 367 def integrate_mass_features( 368 self, drop_if_fail=True, drop_duplicates=True, ms_level=1 369 ): 370 """Integrate mass features and extract EICs. 371 372 Populates the _eics attribute on the LCMSBase object for each unique mz in the mass_features dataframe and adds data (start_scan, final_scan, area) to the mass_features attribute. 373 374 Parameters 375 ---------- 376 drop_if_fail : bool, optional 377 Whether to drop mass features if the EIC limit calculations fail. 378 Default is True. 379 drop_duplicates : bool, optional 380 Whether to mass features that appear to be duplicates 381 (i.e., mz is similar to another mass feature and limits of the EIC are similar or encapsulating). 382 Default is True. 383 ms_level : int, optional 384 The MS level to use. Default is 1. 385 386 Raises 387 ------ 388 ValueError 389 If no mass features are found. 390 If no MS level data is found for the given MS level (either in data or in the scan data) 391 392 Returns 393 ------- 394 None, but populates the eics attribute on the LCMSBase object and adds data (start_scan, final_scan, area) to the mass_features attribute. 395 396 Notes 397 ----- 398 drop_if_fail is useful for discarding mass features that do not have good shapes, usually due to a detection on a shoulder of a peak or a noisy region (especially if minimal smoothing is used during mass feature detection). 399 """ 400 # Check if there is data 401 if ms_level in self._ms_unprocessed.keys(): 402 raw_data = self._ms_unprocessed[ms_level].copy() 403 else: 404 raise ValueError("No MS level " + str(ms_level) + " data found") 405 if self.mass_features is not None: 406 mf_df = self.mass_features_to_df().copy() 407 else: 408 raise ValueError( 409 "No mass features found, did you run find_mass_features() first?" 410 ) 411 # Check if mass_spectrum exists on each mass feature 412 if not all( 413 [mf.mass_spectrum is not None for mf in self.mass_features.values()] 414 ): 415 raise ValueError( 416 "Mass spectrum must be associated with each mass feature, did you run add_associated_ms1() first?" 
417 ) 418 419 # Subset scan data to only include correct ms_level 420 scan_df_sub = self.scan_df[ 421 self.scan_df["ms_level"] == int(ms_level) 422 ].reset_index(drop=True) 423 if scan_df_sub.empty: 424 raise ValueError("No MS level " + ms_level + " data found in scan data") 425 scan_df_sub = scan_df_sub[["scan", "scan_time"]].copy() 426 427 mzs_to_extract = np.unique(mf_df["mz"].values) 428 mzs_to_extract.sort() 429 430 # Get EICs for each unique mz in mass features list 431 for mz in mzs_to_extract: 432 mz_max = mz + self.parameters.lc_ms.eic_tolerance_ppm * mz / 1e6 433 mz_min = mz - self.parameters.lc_ms.eic_tolerance_ppm * mz / 1e6 434 raw_data_sub = raw_data[ 435 (raw_data["mz"] >= mz_min) & (raw_data["mz"] <= mz_max) 436 ].reset_index(drop=True) 437 raw_data_sub = ( 438 raw_data_sub.groupby(["scan"])["intensity"].sum().reset_index() 439 ) 440 raw_data_sub = scan_df_sub.merge(raw_data_sub, on="scan", how="left") 441 raw_data_sub["intensity"] = raw_data_sub["intensity"].fillna(0) 442 myEIC = EIC_Data( 443 scans=raw_data_sub["scan"].values, 444 time=raw_data_sub["scan_time"].values, 445 eic=raw_data_sub["intensity"].values, 446 ) 447 # Smooth EIC 448 smoothed_eic = self.smooth_tic(myEIC.eic) 449 smoothed_eic[smoothed_eic < 0] = 0 450 myEIC.eic_smoothed = smoothed_eic 451 self.eics[mz] = myEIC 452 453 # Get limits of mass features using EIC centroid detector and integrate 454 mf_df["area"] = np.nan 455 for idx, mass_feature in mf_df.iterrows(): 456 mz = mass_feature.mz 457 apex_scan = mass_feature.apex_scan 458 459 # Pull EIC data and find apex scan index 460 myEIC = self.eics[mz] 461 self.mass_features[idx]._eic_data = myEIC 462 apex_index = np.where(myEIC.scans == apex_scan)[0][0] 463 464 # Find left and right limits of peak using EIC centroid detector, add to EICData 465 centroid_eics = self.eic_centroid_detector( 466 myEIC.time, 467 myEIC.eic_smoothed, 468 mass_feature.intensity * 1.1, 469 apex_indexes=[int(apex_index)], 470 ) 471 l_a_r_scan_idx = [i for i in centroid_eics] 472 if len(l_a_r_scan_idx) > 0: 473 # Add start and final scan to mass_features and EICData 474 left_scan, right_scan = ( 475 myEIC.scans[l_a_r_scan_idx[0][0]], 476 myEIC.scans[l_a_r_scan_idx[0][2]], 477 ) 478 mf_scan_apex = [(left_scan, int(apex_scan), right_scan)] 479 myEIC.apexes = myEIC.apexes + mf_scan_apex 480 self.mass_features[idx].start_scan = left_scan 481 self.mass_features[idx].final_scan = right_scan 482 483 # Find area under peak using limits from EIC centroid detector, add to mass_features and EICData 484 area = np.trapz( 485 myEIC.eic_smoothed[l_a_r_scan_idx[0][0] : l_a_r_scan_idx[0][2] + 1], 486 myEIC.time[l_a_r_scan_idx[0][0] : l_a_r_scan_idx[0][2] + 1], 487 ) 488 mf_df.at[idx, "area"] = area 489 myEIC.areas = myEIC.areas + [area] 490 self.eics[mz] = myEIC 491 self.mass_features[idx]._area = area 492 else: 493 if drop_if_fail is True: 494 self.mass_features.pop(idx) 495 496 if drop_duplicates: 497 # Prepare mass feature dataframe 498 mf_df = self.mass_features_to_df().copy() 499 500 # For each mass feature, find all mass features within the clustering tolerance ppm and drop if their start and end times are within another mass feature 501 # Kepp the first mass fea 502 for idx, mass_feature in mf_df.iterrows(): 503 mz = mass_feature.mz 504 apex_scan = mass_feature.apex_scan 505 506 mf_df["mz_diff_ppm"] = np.abs(mf_df["mz"] - mz) / mz * 10**6 507 mf_df_sub = mf_df[ 508 mf_df["mz_diff_ppm"] 509 < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel 510 * 10**6 511 ].copy() 512 513 # For all 
mass features within the clustering tolerance, check if the start and end times are within the start and end times of the mass feature 514 for idx2, mass_feature2 in mf_df_sub.iterrows(): 515 if idx2 != idx: 516 if ( 517 mass_feature2.start_scan >= mass_feature.start_scan 518 and mass_feature2.final_scan <= mass_feature.final_scan 519 ): 520 if idx2 in self.mass_features.keys(): 521 self.mass_features.pop(idx2) 522 523 def find_c13_mass_features(self): 524 """Mark likely C13 isotopes and connect to monoisoitopic mass features. 525 526 Returns 527 ------- 528 None, but populates the monoisotopic_mf_id and isotopologue_type attributes to the indivual LCMSMassFeatures within the mass_features attribute of the LCMSBase object. 529 530 Raises 531 ------ 532 ValueError 533 If no mass features are found. 534 """ 535 verbose = self.parameters.lc_ms.verbose_processing 536 if verbose: 537 print("evaluating mass features for C13 isotopes") 538 if self.mass_features is None: 539 raise ValueError("No mass features found, run find_mass_features() first") 540 541 # Data prep fo sparse distance matrix 542 dims = ["mz", "scan_time"] 543 mf_df = self.mass_features_to_df().copy() 544 # Drop mass features that have no area (these are likely to be noise) 545 mf_df = mf_df[mf_df["area"].notnull()] 546 mf_df["mf_id"] = mf_df.index.values 547 dims = ["mz", "scan_time"] 548 549 # Sort my ascending mz so we always get the monoisotopic mass first, regardless of the order/intensity of the mass features 550 mf_df = mf_df.sort_values(by=["mz"]).reset_index(drop=True).copy() 551 552 mz_diff = 1.003355 # C13-C12 mass difference 553 tol = [ 554 mf_df["mz"].median() 555 * self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel, 556 self.parameters.lc_ms.mass_feature_cluster_rt_tolerance * 0.5, 557 ] # mz, in relative; scan_time in minutes 558 559 # Compute inter-feature distances 560 distances = None 561 for i in range(len(dims)): 562 # Construct k-d tree 563 values = mf_df[dims[i]].values 564 tree = KDTree(values.reshape(-1, 1)) 565 566 max_tol = tol[i] 567 if dims[i] == "mz": 568 # Maximum absolute tolerance 569 max_tol = mz_diff + tol[i] 570 571 # Compute sparse distance matrix 572 # the larger the max_tol, the slower this operation is 573 sdm = tree.sparse_distance_matrix(tree, max_tol, output_type="coo_matrix") 574 575 # Only consider forward case, exclude diagonal 576 sdm = sparse.triu(sdm, k=1) 577 578 if dims[i] == "mz": 579 min_tol = mz_diff - tol[i] 580 # Get only the ones that are above the min tol 581 idx = sdm.data > min_tol 582 583 # Reconstruct sparse distance matrix 584 sdm = sparse.coo_matrix( 585 (sdm.data[idx], (sdm.row[idx], sdm.col[idx])), 586 shape=(len(values), len(values)), 587 ) 588 589 # Cast as binary matrix 590 sdm.data = np.ones_like(sdm.data) 591 592 # Stack distances 593 if distances is None: 594 distances = sdm 595 else: 596 distances = distances.multiply(sdm) 597 598 # Extract indices of within-tolerance points 599 distances = distances.tocoo() 600 pairs = np.stack((distances.row, distances.col), axis=1) # C12 to C13 pairs 601 602 # Turn pairs (which are index of mf_df) into mf_id and then into two dataframes to join to mf_df 603 pairs_mf = pairs.copy() 604 pairs_mf[:, 0] = mf_df.iloc[pairs[:, 0]].mf_id.values 605 pairs_mf[:, 1] = mf_df.iloc[pairs[:, 1]].mf_id.values 606 607 # Connect monoisotopic masses with isotopologes within mass_features 608 monos = np.setdiff1d(np.unique(pairs_mf[:, 0]), np.unique(pairs_mf[:, 1])) 609 for mono in monos: 610 
self.mass_features[mono].monoisotopic_mf_id = mono 611 pairs_iso_df = pd.DataFrame(pairs_mf, columns=["parent", "child"]) 612 while not pairs_iso_df.empty: 613 pairs_iso_df = pairs_iso_df.set_index("parent", drop=False) 614 m1_isos = pairs_iso_df.loc[monos, "child"].unique() 615 for iso in m1_isos: 616 # Set monoisotopic_mf_id and isotopologue_type for isotopologues 617 parent = pairs_mf[pairs_mf[:, 1] == iso, 0] 618 if len(parent) > 1: 619 # Choose the parent that is closest in time to the isotopologue 620 parent_time = [self.mass_features[p].retention_time for p in parent] 621 time_diff = [ 622 np.abs(self.mass_features[iso].retention_time - x) 623 for x in parent_time 624 ] 625 parent = parent[np.argmin(time_diff)] 626 else: 627 parent = parent[0] 628 self.mass_features[iso].monoisotopic_mf_id = self.mass_features[ 629 parent 630 ].monoisotopic_mf_id 631 if self.mass_features[iso].monoisotopic_mf_id is not None: 632 mass_diff = ( 633 self.mass_features[iso].mz 634 - self.mass_features[ 635 self.mass_features[iso].monoisotopic_mf_id 636 ].mz 637 ) 638 self.mass_features[iso].isotopologue_type = "13C" + str( 639 int(round(mass_diff, 0)) 640 ) 641 642 # Drop the mono and iso from the pairs_iso_df 643 pairs_iso_df = pairs_iso_df.drop( 644 index=monos, errors="ignore" 645 ) # Drop pairs where the parent is a child that is a child of a root 646 pairs_iso_df = pairs_iso_df.set_index("child", drop=False) 647 pairs_iso_df = pairs_iso_df.drop(index=m1_isos, errors="ignore") 648 649 if not pairs_iso_df.empty: 650 # Get new monos, recognizing that these are just 13C isotopologues that are connected to other 13C isotopologues to repeat the process 651 monos = np.setdiff1d( 652 np.unique(pairs_iso_df.parent), np.unique(pairs_iso_df.child) 653 ) 654 if verbose: 655 # Report fraction of compounds annotated with isotopes 656 mf_df["c13_flag"] = np.where( 657 np.logical_or( 658 np.isin(mf_df["mf_id"], pairs_mf[:, 0]), 659 np.isin(mf_df["mf_id"], pairs_mf[:, 1]), 660 ), 661 1, 662 0, 663 ) 664 print( 665 str(round(len(mf_df[mf_df["c13_flag"] == 1]) / len(mf_df), ndigits=3)) 666 + " of mass features have or are C13 isotopes" 667 ) 668 669 def deconvolute_ms1_mass_features(self): 670 """Deconvolute MS1 mass features 671 672 Deconvolute mass features ms1 spectrum based on the correlation of all masses within a spectrum over the EIC of the mass features 673 674 Parameters 675 ---------- 676 None 677 678 Returns 679 ------- 680 None, but assigns the _ms_deconvoluted_idx, mass_spectrum_deconvoluted_parent, 681 and associated_mass_features_deconvoluted attributes to the mass features in the 682 mass_features attribute of the LCMSBase object. 683 684 Raises 685 ------ 686 ValueError 687 If no mass features are found, must run find_mass_features() first. 688 If no EICs are found, did you run integrate_mass_features() first? 689 690 """ 691 # Checks for set mass_features and eics 692 if self.mass_features is None: 693 raise ValueError( 694 "No mass features found, did you run find_mass_features() first?" 695 ) 696 697 if self.eics == {}: 698 raise ValueError( 699 "No EICs found, did you run integrate_mass_features() first?" 
700 ) 701 702 if 1 not in self._ms_unprocessed.keys(): 703 raise ValueError("No unprocessed MS1 spectra found.") 704 705 # Prep ms1 data 706 ms1_data = self._ms_unprocessed[1].copy() 707 ms1_data = ms1_data.set_index("scan") 708 709 # Prep mass feature summary 710 mass_feature_df = self.mass_features_to_df() 711 712 # Loop through each mass feature 713 for mf_id, mass_feature in self.mass_features.items(): 714 # Check that the mass_feature.mz attribute == the mz of the mass feature in the mass_feature_df 715 if mass_feature.mz != mass_feature.ms1_peak.mz_exp: 716 continue 717 718 # Get the left and right limits of the EIC of the mass feature 719 l_scan, _, r_scan = mass_feature._eic_data.apexes[0] 720 721 # Pull from the _ms1_unprocessed data the scan range of interest and sort by mz 722 ms1_data_sub = ms1_data.loc[l_scan:r_scan].copy() 723 ms1_data_sub = ms1_data_sub.sort_values(by=["mz"]).reset_index(drop=False) 724 725 # Get the centroided masses of the mass feature 726 mf_mspeak_mzs = mass_feature.mass_spectrum.mz_exp 727 728 # Find the closest mz in the ms1 data to the centroided masses of the mass feature 729 ms1_data_sub["mass_feature_mz"] = mf_mspeak_mzs[ 730 find_closest(mf_mspeak_mzs, ms1_data_sub.mz.values) 731 ] 732 733 # Drop rows with mz_diff > 0.01 between the mass feature mz and the ms1 data mz 734 ms1_data_sub["mz_diff_rel"] = ( 735 np.abs(ms1_data_sub["mass_feature_mz"] - ms1_data_sub["mz"]) 736 / ms1_data_sub["mz"] 737 ) 738 ms1_data_sub = ms1_data_sub[ 739 ms1_data_sub["mz_diff_rel"] 740 < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel 741 ].reset_index(drop=True) 742 743 # Group by mass_feature_mz and scan and sum intensity 744 ms1_data_sub_group = ( 745 ms1_data_sub.groupby(["mass_feature_mz", "scan"])["intensity"] 746 .sum() 747 .reset_index() 748 ) 749 750 # Calculate the correlation of the intensities of the mass feature and the ms1 data (set to 0 if no intensity) 751 corr = ( 752 ms1_data_sub_group.pivot( 753 index="scan", columns="mass_feature_mz", values="intensity" 754 ) 755 .fillna(0) 756 .corr() 757 ) 758 759 # Subset the correlation matrix to only include the masses of the mass feature and those with a correlation > 0.8 760 decon_corr_min = self.parameters.lc_ms.ms1_deconvolution_corr_min 761 corr_subset = corr.loc[mass_feature.mz,] 762 corr_subset = corr_subset[corr_subset > decon_corr_min] 763 764 # Get the masses from the mass spectrum that are the result of the deconvolution 765 mzs_decon = corr_subset.index.values 766 767 # Get the indices of the mzs_decon in mass_feature.mass_spectrum.mz_exp and assign to the mass feature 768 mzs_decon_idx = [ 769 id 770 for id, mz in enumerate(mass_feature.mass_spectrum.mz_exp) 771 if mz in mzs_decon 772 ] 773 mass_feature._ms_deconvoluted_idx = mzs_decon_idx 774 775 # Check if the mass feature's ms1 peak is the largest in the deconvoluted mass spectrum 776 if ( 777 mass_feature.ms1_peak.abundance 778 == mass_feature.mass_spectrum.abundance[mzs_decon_idx].max() 779 ): 780 mass_feature.mass_spectrum_deconvoluted_parent = True 781 else: 782 mass_feature.mass_spectrum_deconvoluted_parent = False 783 784 # Check for other mass features that are in the deconvoluted mass spectrum and add the deconvoluted mass spectrum to the mass feature 785 # Subset mass_feature_df to only include mass features that are within the clustering tolerance 786 mass_feature_df_sub = mass_feature_df[ 787 abs(mass_feature.retention_time - mass_feature_df["scan_time"]) 788 < self.parameters.lc_ms.mass_feature_cluster_rt_tolerance 789 
].copy() 790 # Calculate the mz difference in ppm between the mass feature and the peaks in the deconvoluted mass spectrum 791 mass_feature_df_sub["mz_diff_ppm"] = [ 792 np.abs(mzs_decon - mz).min() / mz * 10**6 793 for mz in mass_feature_df_sub["mz"] 794 ] 795 # Subset mass_feature_df to only include mass features that are within 1 ppm of the deconvoluted masses 796 mfs_associated_decon = mass_feature_df_sub[ 797 mass_feature_df_sub["mz_diff_ppm"] 798 < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel * 10**6 799 ].index.values 800 801 mass_feature.associated_mass_features_deconvoluted = mfs_associated_decon 802 803 804class PHCalculations: 805 """Methods for performing calculations related to 2D peak picking via persistent homology on LCMS data. 806 807 Notes 808 ----- 809 This class is intended to be used as a mixin for the LCMSBase class. 810 811 Methods 812 ------- 813 * sparse_mean_filter(idx, V, radius=[0, 1, 1]). 814 Sparse implementation of a mean filter. 815 * embed_unique_indices(a). 816 Creates an array of indices, sorted by unique element. 817 * sparse_upper_star(idx, V). 818 Sparse implementation of an upper star filtration. 819 * check_if_grid(data). 820 Check if the data is gridded in mz space. 821 * grid_data(data). 822 Grid the data in the mz dimension. 823 * find_mass_features_ph(ms_level=1, grid=True). 824 Find mass features within an LCMSBase object using persistent homology. 825 * cluster_mass_features(drop_children=True). 826 Cluster regions of interest. 827 """ 828 829 @staticmethod 830 def sparse_mean_filter(idx, V, radius=[0, 1, 1]): 831 """Sparse implementation of a mean filter. 832 833 Parameters 834 ---------- 835 idx : :obj:`~numpy.array` 836 Edge indices for each dimension (MxN). 837 V : :obj:`~numpy.array` 838 Array of intensity data (Mx1). 839 radius : float or list 840 Radius of the sparse filter in each dimension. Values less than 841 zero indicate no connectivity in that dimension. 842 843 Returns 844 ------- 845 :obj:`~numpy.array` 846 Filtered intensities (Mx1). 847 848 Notes 849 ----- 850 This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos. 851 This is a static method. 852 """ 853 854 # Copy indices 855 idx = idx.copy().astype(V.dtype) 856 857 # Scale 858 for i, r in enumerate(radius): 859 # Increase inter-index distance 860 if r < 1: 861 idx[:, i] *= 2 862 863 # Do nothing 864 elif r == 1: 865 pass 866 867 # Decrease inter-index distance 868 else: 869 idx[:, i] /= r 870 871 # Connectivity matrix 872 cmat = KDTree(idx) 873 cmat = cmat.sparse_distance_matrix(cmat, 1, p=np.inf, output_type="coo_matrix") 874 cmat.setdiag(1) 875 876 # Pair indices 877 I, J = cmat.nonzero() 878 879 # Delete cmat 880 cmat_shape = cmat.shape 881 del cmat 882 883 # Sum over columns 884 V_sum = sparse.bsr_matrix( 885 (V[J], (I, I)), shape=cmat_shape, dtype=V.dtype 886 ).diagonal(0) 887 888 # Count over columns 889 V_count = sparse.bsr_matrix( 890 (np.ones_like(J), (I, I)), shape=cmat_shape, dtype=V.dtype 891 ).diagonal(0) 892 893 return V_sum / V_count 894 895 @staticmethod 896 def embed_unique_indices(a): 897 """Creates an array of indices, sorted by unique element. 898 899 Parameters 900 ---------- 901 a : :obj:`~numpy.array` 902 Array of unique elements (Mx1). 903 904 Returns 905 ------- 906 :obj:`~numpy.array` 907 Array of indices (Mx1). 
908 909 Notes 910 ----- 911 This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos 912 This is a static method. 913 """ 914 915 def count_tens(n): 916 # Count tens 917 ntens = (n - 1) // 10 918 919 while True: 920 ntens_test = (ntens + n - 1) // 10 921 922 if ntens_test == ntens: 923 return ntens 924 else: 925 ntens = ntens_test 926 927 def arange_exclude_10s(n): 928 # How many 10s will there be? 929 ntens = count_tens(n) 930 931 # Base array 932 arr = np.arange(0, n + ntens) 933 934 # Exclude 10s 935 arr = arr[(arr == 0) | (arr % 10 != 0)][:n] 936 937 return arr 938 939 # Creates an array of indices, sorted by unique element 940 idx_sort = np.argsort(a) 941 idx_unsort = np.argsort(idx_sort) 942 943 # Sorts records array so all unique elements are together 944 sorted_a = a[idx_sort] 945 946 # Returns the unique values, the index of the first occurrence, 947 # and the count for each element 948 vals, idx_start, count = np.unique( 949 sorted_a, return_index=True, return_counts=True 950 ) 951 952 # Splits the indices into separate arrays 953 splits = np.split(idx_sort, idx_start[1:]) 954 955 # Creates unique indices for each split 956 idx_unq = np.concatenate([arange_exclude_10s(len(x)) for x in splits]) 957 958 # Reorders according to input array 959 idx_unq = idx_unq[idx_unsort] 960 961 # Magnitude of each index 962 exp = np.log10( 963 idx_unq, where=idx_unq > 0, out=np.zeros_like(idx_unq, dtype=np.float64) 964 ) 965 idx_unq_mag = np.power(10, np.floor(exp) + 1) 966 967 # Result 968 return a + idx_unq / idx_unq_mag 969 970 def sparse_upper_star(self, idx, V): 971 """Sparse implementation of an upper star filtration. 972 973 Parameters 974 ---------- 975 idx : :obj:`~numpy.array` 976 Edge indices for each dimension (MxN). 977 V : :obj:`~numpy.array` 978 Array of intensity data (Mx1). 979 Returns 980 ------- 981 idx : :obj:`~numpy.array` 982 Index of filtered points (Mx1). 983 persistence : :obj:`~numpy.array` 984 Persistence of each filtered point (Mx1). 985 986 Notes 987 ----- 988 This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos 989 """ 990 991 # Invert 992 V = -1 * V.copy().astype(int) 993 994 # Embed indices 995 V = self.embed_unique_indices(V) 996 997 # Connectivity matrix 998 cmat = KDTree(idx) 999 cmat = cmat.sparse_distance_matrix(cmat, 1, p=np.inf, output_type="coo_matrix") 1000 cmat.setdiag(1) 1001 cmat = sparse.triu(cmat) 1002 1003 # Pairwise minimums 1004 I, J = cmat.nonzero() 1005 d = np.maximum(V[I], V[J]) 1006 1007 # Delete connectiity matrix 1008 cmat_shape = cmat.shape 1009 del cmat 1010 1011 # Sparse distance matrix 1012 sdm = sparse.coo_matrix((d, (I, J)), shape=cmat_shape) 1013 1014 # Delete pairwise mins 1015 del d, I, J 1016 1017 # Persistence homology 1018 ph = ripser(sdm, distance_matrix=True, maxdim=0)["dgms"][0] 1019 1020 # Bound death values 1021 ph[ph[:, 1] == np.inf, 1] = np.max(V) 1022 1023 # Construct tree to query against 1024 tree = KDTree(V.reshape((-1, 1))) 1025 1026 # Get the indexes of the first nearest neighbor by birth 1027 _, nn = tree.query(ph[:, 0].reshape((-1, 1)), k=1, workers=-1) 1028 1029 return nn, -(ph[:, 0] // 1 - ph[:, 1] // 1) 1030 1031 def check_if_grid(self, data): 1032 """Check if the data are gridded in mz space. 1033 1034 Parameters 1035 ---------- 1036 data : DataFrame 1037 DataFrame containing the mass spectrometry data. Needs to have mz and scan columns. 
1038 1039 Returns 1040 ------- 1041 bool 1042 True if the data is gridded in the mz direction, False otherwise. 1043 1044 Notes 1045 ----- 1046 This function is used within the grid_data function and the find_mass_features function and is not intended to be called directly. 1047 """ 1048 # Calculate the difference between consecutive mz values in a single scan 1049 dat_check = data.copy().reset_index(drop=True) 1050 dat_check["mz_diff"] = np.abs(dat_check["mz"].diff()) 1051 mz_diff_min = ( 1052 dat_check.groupby("scan")["mz_diff"].min().min() 1053 ) # within each scan, what is the smallest mz difference between consecutive mz values 1054 1055 # Find the mininum mz difference between mz values in the data; regardless of scan 1056 dat_check_mz = dat_check[["mz"]].drop_duplicates().copy() 1057 dat_check_mz = dat_check_mz.sort_values(by=["mz"]).reset_index(drop=True) 1058 dat_check_mz["mz_diff"] = np.abs(dat_check_mz["mz"].diff()) 1059 1060 # Get minimum mz_diff between mz values in the data 1061 mz_diff_min_raw = dat_check_mz["mz_diff"].min() 1062 1063 # If the minimum mz difference between mz values in the data is less than the minimum mz difference between mz values within a single scan, then the data is not gridded 1064 if mz_diff_min_raw < mz_diff_min: 1065 return False 1066 else: 1067 return True 1068 1069 def grid_data(self, data): 1070 """Grid the data in the mz dimension. 1071 1072 Data must be gridded prior to persistent homology calculations. 1073 1074 Parameters 1075 ---------- 1076 data : DataFrame 1077 The input data containing mz, scan, scan_time, and intensity columns. 1078 1079 Returns 1080 ------- 1081 DataFrame 1082 The gridded data with mz, scan, scan_time, and intensity columns. 1083 1084 Raises 1085 ------ 1086 ValueError 1087 If gridding fails. 
1088 """ 1089 1090 # Calculate the difference between consecutive mz values in a single scan for grid spacing 1091 data_w = data.copy().reset_index(drop=True) 1092 data_w["mz_diff"] = np.abs(data_w["mz"].diff()) 1093 mz_diff_min = data_w.groupby("scan")["mz_diff"].min().min() * 0.99999 1094 1095 # Need high intensity mz values first so they are parents in the output pairs stack 1096 dat_mz = data_w[["mz", "intensity"]].sort_values( 1097 by=["intensity"], ascending=False 1098 ) 1099 dat_mz = dat_mz[["mz"]].drop_duplicates().reset_index(drop=True).copy() 1100 1101 # Construct KD tree 1102 tree = KDTree(dat_mz.mz.values.reshape(-1, 1)) 1103 sdm = tree.sparse_distance_matrix(tree, mz_diff_min, output_type="coo_matrix") 1104 sdm = sparse.triu(sdm, k=1) 1105 sdm.data = np.ones_like(sdm.data) 1106 distances = sdm.tocoo() 1107 pairs = np.stack((distances.row, distances.col), axis=1) 1108 1109 # Cull pairs to just get root 1110 to_drop = [] 1111 while len(pairs) > 0: 1112 root_parents = np.setdiff1d(np.unique(pairs[:, 0]), np.unique(pairs[:, 1])) 1113 id_root_parents = np.isin(pairs[:, 0], root_parents) 1114 children_of_roots = np.unique(pairs[id_root_parents, 1]) 1115 to_drop = np.append(to_drop, children_of_roots) 1116 1117 # Set up pairs array for next iteration by removing pairs with children or parents already dropped 1118 pairs = pairs[~np.isin(pairs[:, 1], to_drop), :] 1119 pairs = pairs[~np.isin(pairs[:, 0], to_drop), :] 1120 dat_mz = dat_mz.reset_index(drop=True).drop(index=np.array(to_drop)) 1121 mz_dat_np = ( 1122 dat_mz[["mz"]] 1123 .sort_values(by=["mz"]) 1124 .reset_index(drop=True) 1125 .values.flatten() 1126 ) 1127 1128 # Sort data by mz and recast mz to nearest value in mz_dat_np 1129 data_w = data_w.sort_values(by=["mz"]).reset_index(drop=True).copy() 1130 data_w["mz_new"] = mz_dat_np[find_closest(mz_dat_np, data_w["mz"].values)] 1131 data_w["mz_diff"] = np.abs(data_w["mz"] - data_w["mz_new"]) 1132 1133 # Rename mz_new as mz; drop mz_diff; groupby scan and mz and sum intensity 1134 new_data_w = data_w.rename(columns={"mz": "mz_orig", "mz_new": "mz"}).copy() 1135 new_data_w = ( 1136 new_data_w.drop(columns=["mz_diff", "mz_orig"]) 1137 .groupby(["scan", "mz"])["intensity"] 1138 .sum() 1139 .reset_index() 1140 ) 1141 new_data_w = ( 1142 new_data_w.sort_values(by=["scan", "mz"], ascending=[True, True]) 1143 .reset_index(drop=True) 1144 .copy() 1145 ) 1146 1147 # Check if grid worked and return 1148 if self.check_if_grid(new_data_w): 1149 return new_data_w 1150 else: 1151 raise ValueError("Gridding failed") 1152 1153 def find_mass_features_ph(self, ms_level=1, grid=True): 1154 """Find mass features within an LCMSBase object using persistent homology. 1155 1156 Assigns the mass_features attribute to the object (a dictionary of LCMSMassFeature objects, keyed by mass feature id) 1157 1158 Parameters 1159 ---------- 1160 ms_level : int, optional 1161 The MS level to use. Default is 1. 1162 grid : bool, optional 1163 If True, will regrid the data before running the persistent homology calculations (after checking if the data is gridded). Default is True. 1164 1165 Raises 1166 ------ 1167 ValueError 1168 If no MS level data is found on the object. 1169 If data is not gridded and grid is False. 1170 1171 Returns 1172 ------- 1173 None, but assigns the mass_features attribute to the object. 
1174 1175 Notes 1176 ----- 1177 This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos 1178 """ 1179 # Check that ms_level is a key in self._ms_uprocessed 1180 if ms_level not in self._ms_unprocessed.keys(): 1181 raise ValueError( 1182 "No MS level " 1183 + str(ms_level) 1184 + " data found, did you instantiate with parser specific to MS level?" 1185 ) 1186 1187 # Get ms data 1188 data = self._ms_unprocessed[ms_level].copy() 1189 1190 # Drop rows with missing intensity values and reset index 1191 data = data.dropna(subset=["intensity"]).reset_index(drop=True) 1192 1193 # Threshold data 1194 dims = ["mz", "scan_time"] 1195 threshold = self.parameters.lc_ms.ph_inten_min_rel * data.intensity.max() 1196 data_thres = data[data["intensity"] > threshold].reset_index(drop=True).copy() 1197 1198 # Check if gridded, if not, grid 1199 gridded_mz = self.check_if_grid(data_thres) 1200 if gridded_mz is False: 1201 if grid is False: 1202 raise ValueError( 1203 "Data are not gridded in mz dimension, try reprocessing with a different params or grid data before running this function" 1204 ) 1205 else: 1206 data_thres = self.grid_data(data_thres) 1207 1208 # Add build factors and add scan_time 1209 data_thres = data_thres.merge(self.scan_df[["scan", "scan_time"]], on="scan") 1210 factors = { 1211 dim: pd.factorize(data_thres[dim], sort=True)[1].astype(np.float32) 1212 for dim in dims 1213 } # this is return a float64 index 1214 1215 # Build indexes 1216 index = { 1217 dim: np.searchsorted(factors[dim], data_thres[dim]).astype(np.float32) 1218 for dim in factors 1219 } 1220 1221 # Smooth data 1222 iterations = self.parameters.lc_ms.ph_smooth_it 1223 smooth_radius = [ 1224 self.parameters.lc_ms.ph_smooth_radius_mz, 1225 self.parameters.lc_ms.ph_smooth_radius_scan, 1226 ] # mz, scan_time smoothing radius (in steps) 1227 1228 index = np.vstack([index[dim] for dim in dims]).T 1229 V = data_thres["intensity"].values 1230 resid = np.inf 1231 for i in range(iterations): 1232 # Previous iteration 1233 V_prev = V.copy() 1234 resid_prev = resid 1235 V = self.sparse_mean_filter(index, V, radius=smooth_radius) 1236 1237 # Calculate residual with previous iteration 1238 resid = np.sqrt(np.mean(np.square(V - V_prev))) 1239 1240 # Evaluate convergence 1241 if i > 0: 1242 # Percent change in residual 1243 test = np.abs(resid - resid_prev) / resid_prev 1244 1245 # Exit criteria 1246 if test <= 0: 1247 break 1248 1249 # Overwrite values 1250 data_thres["intensity"] = V 1251 1252 # Use persistent homology to find regions of interest 1253 pidx, pers = self.sparse_upper_star(index, V) 1254 pidx = pidx[pers > 1] 1255 pers = pers[pers > 1] 1256 1257 # Get peaks 1258 peaks = data_thres.iloc[pidx, :].reset_index(drop=True) 1259 1260 # Add persistence column 1261 peaks["persistence"] = pers 1262 mass_features = peaks.sort_values( 1263 by="persistence", ascending=False 1264 ).reset_index(drop=True) 1265 1266 # Filter by persistence threshold 1267 persistence_threshold = ( 1268 self.parameters.lc_ms.ph_persis_min_rel * data.intensity.max() 1269 ) 1270 mass_features = mass_features.loc[ 1271 mass_features["persistence"] > persistence_threshold, : 1272 ].reset_index(drop=True) 1273 1274 # Rename scan column to apex_scan 1275 mass_features = mass_features.rename( 1276 columns={"scan": "apex_scan", "scan_time": "retention_time"} 1277 ) 1278 1279 # Populate mass_features attribute 1280 self.mass_features = {} 1281 for row in mass_features.itertuples(): 1282 row_dict = 
mass_features.iloc[row.Index].to_dict() 1283 lcms_feature = LCMSMassFeature(self, **row_dict) 1284 self.mass_features[lcms_feature.id] = lcms_feature 1285 1286 if self.parameters.lc_ms.verbose_processing: 1287 print("Found " + str(len(mass_features)) + " initial mass features") 1288 1289 def cluster_mass_features(self, drop_children=True, sort_by="persistence"): 1290 """Cluster mass features 1291 1292 Based on their proximity in the mz and scan_time dimensions, priorizies the mass features with the highest persistence. 1293 1294 Parameters 1295 ---------- 1296 drop_children : bool, optional 1297 Whether to drop the mass features that are not cluster parents. Default is True. 1298 sort_by : str, optional 1299 The column to sort the mass features by, this will determine which mass features get rolled up into a parent mass feature. Default is "persistence". 1300 1301 Raises 1302 ------ 1303 ValueError 1304 If no mass features are found. 1305 If too many mass features are found. 1306 1307 Returns 1308 ------- 1309 None if drop_children is True, otherwise returns a list of mass feature ids that are not cluster parents. 1310 """ 1311 verbose = self.parameters.lc_ms.verbose_processing 1312 1313 if self.mass_features is None: 1314 raise ValueError("No mass features found, run find_mass_features() first") 1315 if len(self.mass_features) > 400000: 1316 raise ValueError( 1317 "Too many mass featuers of interest found, run find_mass_features() with a higher intensity threshold" 1318 ) 1319 dims = ["mz", "scan_time"] 1320 mf_df_og = self.mass_features_to_df() 1321 mf_df = mf_df_og.copy() 1322 1323 # Sort mass features by sort_by column, make mf_id its own column for easier bookkeeping 1324 mf_df = mf_df.sort_values(by=sort_by, ascending=False).reset_index(drop=False) 1325 1326 tol = [ 1327 self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel, 1328 self.parameters.lc_ms.mass_feature_cluster_rt_tolerance, 1329 ] # mz, in relative; scan_time in minutes 1330 relative = [True, False] 1331 1332 # Compute inter-feature distances 1333 distances = None 1334 for i in range(len(dims)): 1335 # Construct k-d tree 1336 values = mf_df[dims[i]].values 1337 tree = KDTree(values.reshape(-1, 1)) 1338 1339 max_tol = tol[i] 1340 if relative[i] is True: 1341 # Maximum absolute tolerance 1342 max_tol = tol[i] * values.max() 1343 1344 # Compute sparse distance matrix 1345 # the larger the max_tol, the slower this operation is 1346 sdm = tree.sparse_distance_matrix(tree, max_tol, output_type="coo_matrix") 1347 1348 # Only consider forward case, exclude diagonal 1349 sdm = sparse.triu(sdm, k=1) 1350 1351 # Filter relative distances 1352 if relative[i] is True: 1353 # Compute relative distances 1354 rel_dists = sdm.data / values[sdm.row] # or col? 
1355 1356 # Indices of relative distances less than tolerance 1357 idx = rel_dists <= tol[i] 1358 1359 # Reconstruct sparse distance matrix 1360 sdm = sparse.coo_matrix( 1361 (rel_dists[idx], (sdm.row[idx], sdm.col[idx])), 1362 shape=(len(values), len(values)), 1363 ) 1364 1365 # Cast as binary matrix 1366 sdm.data = np.ones_like(sdm.data) 1367 1368 # Stack distances 1369 if distances is None: 1370 distances = sdm 1371 else: 1372 distances = distances.multiply(sdm) 1373 1374 # Extract indices of within-tolerance points 1375 distances = distances.tocoo() 1376 pairs = np.stack((distances.row, distances.col), axis=1) 1377 pairs_df = pd.DataFrame(pairs, columns=["parent", "child"]) 1378 pairs_df = pairs_df.set_index("parent") 1379 1380 to_drop = [] 1381 while not pairs_df.empty: 1382 # Find root_parents and their children 1383 root_parents = np.setdiff1d( 1384 np.unique(pairs_df.index.values), np.unique(pairs_df.child.values) 1385 ) 1386 children_of_roots = pairs_df.loc[root_parents, "child"].unique() 1387 to_drop = np.append(to_drop, children_of_roots) 1388 1389 # Remove root_children as possible parents from pairs_df for next iteration 1390 pairs_df = pairs_df.drop(index=children_of_roots, errors="ignore") 1391 pairs_df = pairs_df.reset_index().set_index("child") 1392 # Remove root_children as possible children from pairs_df for next iteration 1393 pairs_df = pairs_df.drop(index=children_of_roots) 1394 1395 # Prepare for next iteration 1396 pairs_df = pairs_df.reset_index().set_index("parent") 1397 1398 # Drop mass features that are not cluster parents 1399 mf_df = mf_df.drop(index=np.array(to_drop)) 1400 1401 # Set index back to mf_id 1402 mf_df = mf_df.set_index("mf_id") 1403 if verbose: 1404 print(str(len(mf_df)) + " mass features remaining") 1405 1406 mf_df_new = mf_df_og.copy() 1407 mf_df_new["cluster_parent"] = np.where( 1408 np.isin(mf_df_new.index, mf_df.index), True, False 1409 ) 1410 1411 # get mass feature ids of features that are not cluster parents 1412 cluster_daughters = mf_df_new[mf_df_new["cluster_parent"] == False].index.values 1413 if drop_children is True: 1414 # Drop mass features that are not cluster parents from self 1415 self.mass_features = { 1416 k: v 1417 for k, v in self.mass_features.items() 1418 if k not in cluster_daughters 1419 } 1420 else: 1421 return cluster_daughters
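A minimal, self-contained check of the find_closest helper, using made-up values: for each element of the sorted target array it returns the index of the nearest entry in the sorted array A.

import numpy as np

A = np.array([100.0, 100.5, 101.0, 102.0])   # sorted reference grid
target = np.array([100.1, 100.9, 101.8])     # sorted query values
idx = find_closest(A, target)
# idx -> array([0, 2, 3]); A[idx] gives the closest grid value for each query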
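The methods above are written as mixins for an LCMSBase object. The sketch below shows one plausible processing order, inferred from the checks each method performs; loading the raw data and the add_associated_ms1 step are represented by placeholders because they are defined outside this module.

# Hypothetical setup: lcms stands in for an LCMSBase object that already has
# profile-mode MS1 data loaded (_ms_unprocessed, scan_df, and parameters populated).
lcms = load_lcms_object()  # placeholder, not a CoreMS call

lcms.parameters.lc_ms.peak_picking_method = "persistent homology"

lcms.find_mass_features(ms_level=1)         # persistent homology peak picking + clustering
# ... associate an MS1 spectrum with each feature (e.g. add_associated_ms1), then:
lcms.integrate_mass_features(drop_if_fail=True, ms_level=1)  # EIC extraction and peak areas
lcms.add_peak_metrics()                     # half-height width, tailing factor, dispersity index
lcms.find_c13_mass_features()               # flag 13C isotopologues, link to monoisotopic features
lcms.deconvolute_ms1_mass_features()        # correlate co-eluting MS1 masses over each feature's EIC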
mass features within the clustering tolerance, check if the start and end times are within the start and end times of the mass feature 515 for idx2, mass_feature2 in mf_df_sub.iterrows(): 516 if idx2 != idx: 517 if ( 518 mass_feature2.start_scan >= mass_feature.start_scan 519 and mass_feature2.final_scan <= mass_feature.final_scan 520 ): 521 if idx2 in self.mass_features.keys(): 522 self.mass_features.pop(idx2) 523 524 def find_c13_mass_features(self): 525 """Mark likely C13 isotopes and connect to monoisoitopic mass features. 526 527 Returns 528 ------- 529 None, but populates the monoisotopic_mf_id and isotopologue_type attributes to the indivual LCMSMassFeatures within the mass_features attribute of the LCMSBase object. 530 531 Raises 532 ------ 533 ValueError 534 If no mass features are found. 535 """ 536 verbose = self.parameters.lc_ms.verbose_processing 537 if verbose: 538 print("evaluating mass features for C13 isotopes") 539 if self.mass_features is None: 540 raise ValueError("No mass features found, run find_mass_features() first") 541 542 # Data prep fo sparse distance matrix 543 dims = ["mz", "scan_time"] 544 mf_df = self.mass_features_to_df().copy() 545 # Drop mass features that have no area (these are likely to be noise) 546 mf_df = mf_df[mf_df["area"].notnull()] 547 mf_df["mf_id"] = mf_df.index.values 548 dims = ["mz", "scan_time"] 549 550 # Sort my ascending mz so we always get the monoisotopic mass first, regardless of the order/intensity of the mass features 551 mf_df = mf_df.sort_values(by=["mz"]).reset_index(drop=True).copy() 552 553 mz_diff = 1.003355 # C13-C12 mass difference 554 tol = [ 555 mf_df["mz"].median() 556 * self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel, 557 self.parameters.lc_ms.mass_feature_cluster_rt_tolerance * 0.5, 558 ] # mz, in relative; scan_time in minutes 559 560 # Compute inter-feature distances 561 distances = None 562 for i in range(len(dims)): 563 # Construct k-d tree 564 values = mf_df[dims[i]].values 565 tree = KDTree(values.reshape(-1, 1)) 566 567 max_tol = tol[i] 568 if dims[i] == "mz": 569 # Maximum absolute tolerance 570 max_tol = mz_diff + tol[i] 571 572 # Compute sparse distance matrix 573 # the larger the max_tol, the slower this operation is 574 sdm = tree.sparse_distance_matrix(tree, max_tol, output_type="coo_matrix") 575 576 # Only consider forward case, exclude diagonal 577 sdm = sparse.triu(sdm, k=1) 578 579 if dims[i] == "mz": 580 min_tol = mz_diff - tol[i] 581 # Get only the ones that are above the min tol 582 idx = sdm.data > min_tol 583 584 # Reconstruct sparse distance matrix 585 sdm = sparse.coo_matrix( 586 (sdm.data[idx], (sdm.row[idx], sdm.col[idx])), 587 shape=(len(values), len(values)), 588 ) 589 590 # Cast as binary matrix 591 sdm.data = np.ones_like(sdm.data) 592 593 # Stack distances 594 if distances is None: 595 distances = sdm 596 else: 597 distances = distances.multiply(sdm) 598 599 # Extract indices of within-tolerance points 600 distances = distances.tocoo() 601 pairs = np.stack((distances.row, distances.col), axis=1) # C12 to C13 pairs 602 603 # Turn pairs (which are index of mf_df) into mf_id and then into two dataframes to join to mf_df 604 pairs_mf = pairs.copy() 605 pairs_mf[:, 0] = mf_df.iloc[pairs[:, 0]].mf_id.values 606 pairs_mf[:, 1] = mf_df.iloc[pairs[:, 1]].mf_id.values 607 608 # Connect monoisotopic masses with isotopologes within mass_features 609 monos = np.setdiff1d(np.unique(pairs_mf[:, 0]), np.unique(pairs_mf[:, 1])) 610 for mono in monos: 611 
self.mass_features[mono].monoisotopic_mf_id = mono 612 pairs_iso_df = pd.DataFrame(pairs_mf, columns=["parent", "child"]) 613 while not pairs_iso_df.empty: 614 pairs_iso_df = pairs_iso_df.set_index("parent", drop=False) 615 m1_isos = pairs_iso_df.loc[monos, "child"].unique() 616 for iso in m1_isos: 617 # Set monoisotopic_mf_id and isotopologue_type for isotopologues 618 parent = pairs_mf[pairs_mf[:, 1] == iso, 0] 619 if len(parent) > 1: 620 # Choose the parent that is closest in time to the isotopologue 621 parent_time = [self.mass_features[p].retention_time for p in parent] 622 time_diff = [ 623 np.abs(self.mass_features[iso].retention_time - x) 624 for x in parent_time 625 ] 626 parent = parent[np.argmin(time_diff)] 627 else: 628 parent = parent[0] 629 self.mass_features[iso].monoisotopic_mf_id = self.mass_features[ 630 parent 631 ].monoisotopic_mf_id 632 if self.mass_features[iso].monoisotopic_mf_id is not None: 633 mass_diff = ( 634 self.mass_features[iso].mz 635 - self.mass_features[ 636 self.mass_features[iso].monoisotopic_mf_id 637 ].mz 638 ) 639 self.mass_features[iso].isotopologue_type = "13C" + str( 640 int(round(mass_diff, 0)) 641 ) 642 643 # Drop the mono and iso from the pairs_iso_df 644 pairs_iso_df = pairs_iso_df.drop( 645 index=monos, errors="ignore" 646 ) # Drop pairs where the parent is a child that is a child of a root 647 pairs_iso_df = pairs_iso_df.set_index("child", drop=False) 648 pairs_iso_df = pairs_iso_df.drop(index=m1_isos, errors="ignore") 649 650 if not pairs_iso_df.empty: 651 # Get new monos, recognizing that these are just 13C isotopologues that are connected to other 13C isotopologues to repeat the process 652 monos = np.setdiff1d( 653 np.unique(pairs_iso_df.parent), np.unique(pairs_iso_df.child) 654 ) 655 if verbose: 656 # Report fraction of compounds annotated with isotopes 657 mf_df["c13_flag"] = np.where( 658 np.logical_or( 659 np.isin(mf_df["mf_id"], pairs_mf[:, 0]), 660 np.isin(mf_df["mf_id"], pairs_mf[:, 1]), 661 ), 662 1, 663 0, 664 ) 665 print( 666 str(round(len(mf_df[mf_df["c13_flag"] == 1]) / len(mf_df), ndigits=3)) 667 + " of mass features have or are C13 isotopes" 668 ) 669 670 def deconvolute_ms1_mass_features(self): 671 """Deconvolute MS1 mass features 672 673 Deconvolute mass features ms1 spectrum based on the correlation of all masses within a spectrum over the EIC of the mass features 674 675 Parameters 676 ---------- 677 None 678 679 Returns 680 ------- 681 None, but assigns the _ms_deconvoluted_idx, mass_spectrum_deconvoluted_parent, 682 and associated_mass_features_deconvoluted attributes to the mass features in the 683 mass_features attribute of the LCMSBase object. 684 685 Raises 686 ------ 687 ValueError 688 If no mass features are found, must run find_mass_features() first. 689 If no EICs are found, did you run integrate_mass_features() first? 690 691 """ 692 # Checks for set mass_features and eics 693 if self.mass_features is None: 694 raise ValueError( 695 "No mass features found, did you run find_mass_features() first?" 696 ) 697 698 if self.eics == {}: 699 raise ValueError( 700 "No EICs found, did you run integrate_mass_features() first?" 
701 ) 702 703 if 1 not in self._ms_unprocessed.keys(): 704 raise ValueError("No unprocessed MS1 spectra found.") 705 706 # Prep ms1 data 707 ms1_data = self._ms_unprocessed[1].copy() 708 ms1_data = ms1_data.set_index("scan") 709 710 # Prep mass feature summary 711 mass_feature_df = self.mass_features_to_df() 712 713 # Loop through each mass feature 714 for mf_id, mass_feature in self.mass_features.items(): 715 # Check that the mass_feature.mz attribute == the mz of the mass feature in the mass_feature_df 716 if mass_feature.mz != mass_feature.ms1_peak.mz_exp: 717 continue 718 719 # Get the left and right limits of the EIC of the mass feature 720 l_scan, _, r_scan = mass_feature._eic_data.apexes[0] 721 722 # Pull from the _ms1_unprocessed data the scan range of interest and sort by mz 723 ms1_data_sub = ms1_data.loc[l_scan:r_scan].copy() 724 ms1_data_sub = ms1_data_sub.sort_values(by=["mz"]).reset_index(drop=False) 725 726 # Get the centroided masses of the mass feature 727 mf_mspeak_mzs = mass_feature.mass_spectrum.mz_exp 728 729 # Find the closest mz in the ms1 data to the centroided masses of the mass feature 730 ms1_data_sub["mass_feature_mz"] = mf_mspeak_mzs[ 731 find_closest(mf_mspeak_mzs, ms1_data_sub.mz.values) 732 ] 733 734 # Drop rows with mz_diff > 0.01 between the mass feature mz and the ms1 data mz 735 ms1_data_sub["mz_diff_rel"] = ( 736 np.abs(ms1_data_sub["mass_feature_mz"] - ms1_data_sub["mz"]) 737 / ms1_data_sub["mz"] 738 ) 739 ms1_data_sub = ms1_data_sub[ 740 ms1_data_sub["mz_diff_rel"] 741 < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel 742 ].reset_index(drop=True) 743 744 # Group by mass_feature_mz and scan and sum intensity 745 ms1_data_sub_group = ( 746 ms1_data_sub.groupby(["mass_feature_mz", "scan"])["intensity"] 747 .sum() 748 .reset_index() 749 ) 750 751 # Calculate the correlation of the intensities of the mass feature and the ms1 data (set to 0 if no intensity) 752 corr = ( 753 ms1_data_sub_group.pivot( 754 index="scan", columns="mass_feature_mz", values="intensity" 755 ) 756 .fillna(0) 757 .corr() 758 ) 759 760 # Subset the correlation matrix to only include the masses of the mass feature and those with a correlation > 0.8 761 decon_corr_min = self.parameters.lc_ms.ms1_deconvolution_corr_min 762 corr_subset = corr.loc[mass_feature.mz,] 763 corr_subset = corr_subset[corr_subset > decon_corr_min] 764 765 # Get the masses from the mass spectrum that are the result of the deconvolution 766 mzs_decon = corr_subset.index.values 767 768 # Get the indices of the mzs_decon in mass_feature.mass_spectrum.mz_exp and assign to the mass feature 769 mzs_decon_idx = [ 770 id 771 for id, mz in enumerate(mass_feature.mass_spectrum.mz_exp) 772 if mz in mzs_decon 773 ] 774 mass_feature._ms_deconvoluted_idx = mzs_decon_idx 775 776 # Check if the mass feature's ms1 peak is the largest in the deconvoluted mass spectrum 777 if ( 778 mass_feature.ms1_peak.abundance 779 == mass_feature.mass_spectrum.abundance[mzs_decon_idx].max() 780 ): 781 mass_feature.mass_spectrum_deconvoluted_parent = True 782 else: 783 mass_feature.mass_spectrum_deconvoluted_parent = False 784 785 # Check for other mass features that are in the deconvoluted mass spectrum and add the deconvoluted mass spectrum to the mass feature 786 # Subset mass_feature_df to only include mass features that are within the clustering tolerance 787 mass_feature_df_sub = mass_feature_df[ 788 abs(mass_feature.retention_time - mass_feature_df["scan_time"]) 789 < self.parameters.lc_ms.mass_feature_cluster_rt_tolerance 790 
].copy() 791 # Calculate the mz difference in ppm between the mass feature and the peaks in the deconvoluted mass spectrum 792 mass_feature_df_sub["mz_diff_ppm"] = [ 793 np.abs(mzs_decon - mz).min() / mz * 10**6 794 for mz in mass_feature_df_sub["mz"] 795 ] 796 # Subset mass_feature_df to only include mass features that are within 1 ppm of the deconvoluted masses 797 mfs_associated_decon = mass_feature_df_sub[ 798 mass_feature_df_sub["mz_diff_ppm"] 799 < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel * 10**6 800 ].index.values 801 802 mass_feature.associated_mass_features_deconvoluted = mfs_associated_decon
Methods for performing LC calculations on mass spectra data.
Notes
This class is intended to be used as a mixin for the LCMSBase class.
Methods
- get_max_eic(eic_data). Returns the maximum EIC value from the given EIC data. A static method.
- smooth_tic(tic). Smooths the TIC data using the specified smoothing method and settings.
- eic_centroid_detector(rt, eic, max_eic). Performs EIC centroid detection on the given EIC data.
- find_nearest_scan(rt). Finds the nearest scan to the given retention time.
- get_average_mass_spectrum(scan_list, apex_scan, spectrum_mode="profile", ms_level=1, auto_process=True, use_parser=False, perform_checks=True, polarity=None). Returns an averaged mass spectrum object.
- find_mass_features(ms_level=1). Find regions of interest for a given MS level (default is MS1).
- integrate_mass_features(drop_if_fail=True, drop_duplicates=True, ms_level=1). Integrate mass features of interest and extract EICs.
- find_c13_mass_features(). Evaluate mass features and mark likely C13 isotopes.
- deconvolute_ms1_mass_features(). Deconvolute mass features' ms1 mass spectra.
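A minimal end-to-end sketch of how these methods are typically chained is shown below. It assumes that lcms is an already-instantiated LCMSBase-derived object (which mixes in this class) with raw scan data loaded; how lcms is constructed depends on the file parser used and is not shown here.

# Hedged usage sketch; `lcms` is assumed to be a populated LCMSBase-derived object.
lcms.find_mass_features(ms_level=1)       # persistent homology peak picking
# integrate_mass_features expects each mass feature to carry an associated MS1
# spectrum (its error message points to an add_associated_ms1() step that is
# not documented in this class), added before this call.
lcms.integrate_mass_features(drop_if_fail=True, drop_duplicates=True, ms_level=1)
lcms.add_peak_metrics()                   # half-height width, tailing, dispersity
lcms.find_c13_mass_features()             # flag 13C isotopologues
lcms.deconvolute_ms1_mass_features()      # correlation-based MS1 deconvolution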
get_max_eic(eic_data: dict)
Returns the maximum EIC value from the given EIC data.
Notes
This is a static method.
Parameters
- eic_data (dict): A dictionary containing EIC data.
Returns
- float: The maximum EIC value.
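For illustration, a minimal sketch of the expected input shape (a dictionary whose values each carry an "EIC" intensity sequence) and the resulting maximum; the m/z keys below are made up:

from corems.mass_spectra.calc.lc_calc import LCCalculations

eic_data = {
    524.37: {"EIC": [0.0, 150.0, 980.0, 410.0]},
    318.22: {"EIC": [0.0, 75.0, 120.0, 60.0]},
}
LCCalculations.get_max_eic(eic_data)  # -> 980.0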
smooth_tic(tic)
Smooths the TIC or EIC data using the specified smoothing method and settings.
Parameters
- tic (numpy.ndarray): The TIC (or EIC) data to be smoothed.
Returns
- numpy.ndarray: The smoothed TIC data.
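The smoothing itself is delegated to SignalProcessing.smooth_signal, with the window type, window length, and polynomial order taken from parameters.lc_ms. The sketch below reproduces the common Savitzky-Golay case directly with SciPy on a synthetic trace; it is illustrative only and bypasses the CoreMS parameter handling.

import numpy as np
from scipy.signal import savgol_filter

rng = np.random.default_rng(0)
rt = np.linspace(0, 10, 300)                      # retention time, minutes
tic = 1e6 * np.exp(-((rt - 5.0) ** 2) / 0.1)      # one chromatographic peak
tic += rng.normal(0, 2e4, rt.size)                # detector noise

# Roughly what smooth_tic does when the Savitzky-Golay method is selected
tic_smoothed = savgol_filter(tic, window_length=11, polyorder=2)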
eic_centroid_detector(rt, eic, max_eic, apex_indexes=[])
Performs EIC centroid detection on the given EIC data.
Parameters
- rt (numpy.ndarray): The retention time data.
- eic (numpy.ndarray): The EIC data.
- max_eic (float): The maximum EIC value.
- apex_indexes (list, optional): The apexes of the EIC peaks. Defaults to [], which means that the apexes will be calculated by the function.
Returns
- generator: The (left, apex, right) index limits for each detected peak.
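The detection is delegated to SignalProcessing.peak_picking_first_derivative with thresholds taken from parameters.lc_ms. As a rough standalone analogue (note: this uses scipy.signal.find_peaks and peak_widths, not the CoreMS first-derivative picker), apex and left/right limits can be estimated like this:

import numpy as np
from scipy.signal import find_peaks, peak_widths

rt = np.linspace(0, 10, 500)
eic = 1e6 * np.exp(-((rt - 4.2) ** 2) / 0.02) + 2e5 * np.exp(-((rt - 6.8) ** 2) / 0.05)

apexes, _ = find_peaks(eic, prominence=0.05 * eic.max())
_, _, left_ips, right_ips = peak_widths(eic, apexes, rel_height=0.95)

# (left, apex, right) index triples, comparable in spirit to the values
# yielded by eic_centroid_detector
limits = list(zip(left_ips.astype(int), apexes, right_ips.astype(int)))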
find_nearest_scan(rt)
Finds the nearest scan to the given retention time.
Parameters
- rt (float): The retention time (in minutes) to find the nearest scan for.
Returns
- int: The scan number of the nearest scan.
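The lookup is a plain nearest-neighbour search over the retention-time axis; a self-contained equivalent of what the method does with its retention_time and scans_number attributes, on made-up values:

import numpy as np

retention_time = np.array([0.10, 0.25, 0.40, 0.55, 0.70])  # minutes
scans_number = np.array([1, 3, 5, 7, 9])

rt_query = 0.47
nearest_scan = scans_number[np.abs(retention_time - rt_query).argmin()]  # 5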
add_peak_metrics()
Add peak metrics to the mass features.
This function calculates the peak metrics for each mass feature and adds them to the mass feature objects.
get_average_mass_spectrum(scan_list, apex_scan, spectrum_mode="profile", ms_level=1, auto_process=True, use_parser=False, perform_checks=True, polarity=None, ms_params=None)
Returns an averaged mass spectrum object
Parameters
- scan_list (list): List of scan numbers to average.
- apex_scan (int): Number of the apex scan.
- spectrum_mode (str, optional): The spectrum mode to use. Defaults to "profile". Note that only "profile" mode is supported for averaging.
- ms_level (int, optional): The MS level to use. Defaults to 1.
- auto_process (bool, optional): If True, the averaged mass spectrum will be auto-processed. Defaults to True.
- use_parser (bool, optional): If True, the mass spectra will be obtained from the parser. Defaults to False.
- perform_checks (bool, optional): If True, the function will check that the data are within the ms_unprocessed dictionary and are the correct mode. Defaults to True. Only set to False if you are sure the data are profile mode and (if not using the parser) are present in the ms_unprocessed dictionary, which must also be indexed on scan.
- polarity (int, optional): The polarity of the mass spectra (1 or -1). If not set, the polarity will be determined from the dataset; setting it explicitly to -1 or 1 is fastest. Defaults to None.
- ms_params (MSParameters, optional): The mass spectrum parameters to use. If not set (None), the globally set parameters will be used. Defaults to None.
Returns
- MassSpectrumProfile: The averaged mass spectrum object.
Raises
- ValueError: If the spectrum mode is not "profile". If the MS level is not found in the unprocessed mass spectra dictionary. If not all scan numbers are found in the unprocessed mass spectra dictionary.
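Once the profile scans share a common m/z grid, the averaging reduces to summing intensities per m/z across the selected scans (the subsequent construction of the spectrum object via ms_from_array_profile is omitted here). A minimal pandas sketch of that summing step, on made-up gridded data:

import pandas as pd

scans = pd.DataFrame({
    "scan":      [10, 10, 10, 11, 11, 11, 12, 12, 12],
    "mz":        [200.0, 200.1, 200.2] * 3,
    "intensity": [5.0, 80.0, 6.0, 4.0, 95.0, 7.0, 6.0, 70.0, 5.0],
})

averaged = scans.groupby("mz")["intensity"].sum().reset_index()
# mz 200.0 -> 15.0, 200.1 -> 245.0, 200.2 -> 18.0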
find_mass_features(ms_level=1, grid=True)
Find mass features within an LCMSBase object
Note that this is a wrapper function that calls the find_mass_features_ph function, but can be extended to support other peak picking methods in the future.
Parameters
- ms_level (int, optional): The MS level to use for peak picking. Default is 1.
- grid (bool, optional): If True, the data will be regridded before running the persistent homology calculations (after checking whether it is already gridded); used for persistent homology peak picking. Default is True.
Raises
- ValueError: If no MS level data is found on the object. If persistent homology peak picking is attempted on non-profile mode data. If data is not gridded and grid is False. If peak picking method is not implemented.
Returns
- None, but assigns the mass_features and eics attributes to the object.
integrate_mass_features(drop_if_fail=True, drop_duplicates=True, ms_level=1)
Integrate mass features and extract EICs.
Populates the _eics attribute on the LCMSBase object for each unique mz in the mass_features dataframe and adds data (start_scan, final_scan, area) to the mass_features attribute.
Parameters
- drop_if_fail (bool, optional): Whether to drop mass features if the EIC limit calculations fail. Default is True.
- drop_duplicates (bool, optional): Whether to drop mass features that appear to be duplicates (i.e., the mz is similar to another mass feature and the limits of the EIC are similar or encapsulating). Default is True.
- ms_level (int, optional): The MS level to use. Default is 1.
Raises
- ValueError: If no mass features are found. If no MS level data is found for the given MS level (either in the data or in the scan data).
Returns
- None, but populates the eics attribute on the LCMSBase object and adds data (start_scan, final_scan, area) to the mass_features attribute.
Notes
drop_if_fail is useful for discarding mass features that do not have good shapes, usually due to a detection on a shoulder of a peak or a noisy region (especially if minimal smoothing is used during mass feature detection).
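A condensed, standalone sketch of the two core steps (pulling an EIC within a ppm window around a target m/z, then integrating between left/right limits with the trapezoid rule). The CoreMS implementation additionally smooths the EIC and derives the limits with eic_centroid_detector; the data and limits below are made up.

import numpy as np
import pandas as pd

raw = pd.DataFrame({
    "scan":      [1, 2, 3, 4],
    "scan_time": [0.10, 0.20, 0.30, 0.40],
    "mz":        [400.2001, 400.2002, 400.1999, 400.2003],
    "intensity": [1.0e4, 8.0e4, 7.0e4, 9.0e3],
})

target_mz, tol_ppm = 400.2, 5
lo = target_mz - tol_ppm * target_mz / 1e6
hi = target_mz + tol_ppm * target_mz / 1e6

eic = raw[(raw["mz"] >= lo) & (raw["mz"] <= hi)]

# Integrate between hypothetical left/right scan indices 0 and 3
area = np.trapz(eic["intensity"].iloc[0:4], eic["scan_time"].iloc[0:4])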
find_c13_mass_features()
Mark likely C13 isotopes and connect them to their monoisotopic mass features.
Returns
- None, but populates the monoisotopic_mf_id and isotopologue_type attributes of the individual LCMSMassFeatures within the mass_features attribute of the LCMSBase object.
Raises
- ValueError: If no mass features are found.
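The isotope pairing is built on a sparse distance matrix in the m/z dimension, keeping only forward pairs whose m/z difference falls inside a narrow window around the 13C-12C spacing (1.003355 Da); the scan_time dimension is filtered the same way and the two masks are multiplied. A reduced, one-dimensional sketch of the m/z part on made-up values:

import numpy as np
from scipy import sparse
from scipy.spatial import KDTree

mz = np.array([400.2000, 401.2034, 402.2067, 500.3000])  # sorted ascending
mz_diff, tol = 1.003355, 0.002

tree = KDTree(mz.reshape(-1, 1))
sdm = tree.sparse_distance_matrix(tree, mz_diff + tol, output_type="coo_matrix")
sdm = sparse.triu(sdm, k=1)          # forward pairs only, exclude the diagonal

keep = sdm.data > (mz_diff - tol)    # lower bound of the isotope window
pairs = np.stack((sdm.row[keep], sdm.col[keep]), axis=1)
# pairs contains [0, 1] and [1, 2]: a 12C feature, its 13C1, and its 13C2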
deconvolute_ms1_mass_features()
Deconvolute MS1 mass features
Deconvolutes each mass feature's MS1 spectrum based on the correlation of all masses within the spectrum against the EIC of the mass feature.
Parameters
- None
Returns
- None, but assigns the _ms_deconvoluted_idx, mass_spectrum_deconvoluted_parent, and associated_mass_features_deconvoluted attributes to the mass features in the mass_features attribute of the LCMSBase object.
Raises
- ValueError: If no mass features are found (run find_mass_features() first). If no EICs are found (run integrate_mass_features() first).
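The deconvolution hinges on one pandas idiom: pivot the per-scan intensities into a scans-by-m/z table, correlate the columns, and keep the m/z values whose elution profiles track the mass feature's own m/z above the configured minimum (parameters.lc_ms.ms1_deconvolution_corr_min). A self-contained sketch of that idiom on made-up data:

import pandas as pd

long = pd.DataFrame({
    "scan":            [1, 1, 1, 2, 2, 2, 3, 3, 3],
    "mass_feature_mz": [400.2, 418.2, 550.9] * 3,
    "intensity":       [10.0, 5.0, 40.0, 80.0, 41.0, 39.0, 30.0, 14.0, 42.0],
})

wide = long.pivot(index="scan", columns="mass_feature_mz", values="intensity").fillna(0)
corr = wide.corr()

feature_mz, corr_min = 400.2, 0.8
coeluting = corr.loc[feature_mz]
coeluting = coeluting[coeluting > corr_min].index.values
# 418.2 rises and falls with 400.2 (r close to 1) and is kept; 550.9 is not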
805class PHCalculations: 806 """Methods for performing calculations related to 2D peak picking via persistent homology on LCMS data. 807 808 Notes 809 ----- 810 This class is intended to be used as a mixin for the LCMSBase class. 811 812 Methods 813 ------- 814 * sparse_mean_filter(idx, V, radius=[0, 1, 1]). 815 Sparse implementation of a mean filter. 816 * embed_unique_indices(a). 817 Creates an array of indices, sorted by unique element. 818 * sparse_upper_star(idx, V). 819 Sparse implementation of an upper star filtration. 820 * check_if_grid(data). 821 Check if the data is gridded in mz space. 822 * grid_data(data). 823 Grid the data in the mz dimension. 824 * find_mass_features_ph(ms_level=1, grid=True). 825 Find mass features within an LCMSBase object using persistent homology. 826 * cluster_mass_features(drop_children=True). 827 Cluster regions of interest. 828 """ 829 830 @staticmethod 831 def sparse_mean_filter(idx, V, radius=[0, 1, 1]): 832 """Sparse implementation of a mean filter. 833 834 Parameters 835 ---------- 836 idx : :obj:`~numpy.array` 837 Edge indices for each dimension (MxN). 838 V : :obj:`~numpy.array` 839 Array of intensity data (Mx1). 840 radius : float or list 841 Radius of the sparse filter in each dimension. Values less than 842 zero indicate no connectivity in that dimension. 843 844 Returns 845 ------- 846 :obj:`~numpy.array` 847 Filtered intensities (Mx1). 848 849 Notes 850 ----- 851 This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos. 852 This is a static method. 853 """ 854 855 # Copy indices 856 idx = idx.copy().astype(V.dtype) 857 858 # Scale 859 for i, r in enumerate(radius): 860 # Increase inter-index distance 861 if r < 1: 862 idx[:, i] *= 2 863 864 # Do nothing 865 elif r == 1: 866 pass 867 868 # Decrease inter-index distance 869 else: 870 idx[:, i] /= r 871 872 # Connectivity matrix 873 cmat = KDTree(idx) 874 cmat = cmat.sparse_distance_matrix(cmat, 1, p=np.inf, output_type="coo_matrix") 875 cmat.setdiag(1) 876 877 # Pair indices 878 I, J = cmat.nonzero() 879 880 # Delete cmat 881 cmat_shape = cmat.shape 882 del cmat 883 884 # Sum over columns 885 V_sum = sparse.bsr_matrix( 886 (V[J], (I, I)), shape=cmat_shape, dtype=V.dtype 887 ).diagonal(0) 888 889 # Count over columns 890 V_count = sparse.bsr_matrix( 891 (np.ones_like(J), (I, I)), shape=cmat_shape, dtype=V.dtype 892 ).diagonal(0) 893 894 return V_sum / V_count 895 896 @staticmethod 897 def embed_unique_indices(a): 898 """Creates an array of indices, sorted by unique element. 899 900 Parameters 901 ---------- 902 a : :obj:`~numpy.array` 903 Array of unique elements (Mx1). 904 905 Returns 906 ------- 907 :obj:`~numpy.array` 908 Array of indices (Mx1). 909 910 Notes 911 ----- 912 This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos 913 This is a static method. 914 """ 915 916 def count_tens(n): 917 # Count tens 918 ntens = (n - 1) // 10 919 920 while True: 921 ntens_test = (ntens + n - 1) // 10 922 923 if ntens_test == ntens: 924 return ntens 925 else: 926 ntens = ntens_test 927 928 def arange_exclude_10s(n): 929 # How many 10s will there be? 
            ntens = count_tens(n)

            # Base array
            arr = np.arange(0, n + ntens)

            # Exclude 10s
            arr = arr[(arr == 0) | (arr % 10 != 0)][:n]

            return arr

        # Creates an array of indices, sorted by unique element
        idx_sort = np.argsort(a)
        idx_unsort = np.argsort(idx_sort)

        # Sorts records array so all unique elements are together
        sorted_a = a[idx_sort]

        # Returns the unique values, the index of the first occurrence,
        # and the count for each element
        vals, idx_start, count = np.unique(
            sorted_a, return_index=True, return_counts=True
        )

        # Splits the indices into separate arrays
        splits = np.split(idx_sort, idx_start[1:])

        # Creates unique indices for each split
        idx_unq = np.concatenate([arange_exclude_10s(len(x)) for x in splits])

        # Reorders according to input array
        idx_unq = idx_unq[idx_unsort]

        # Magnitude of each index
        exp = np.log10(
            idx_unq, where=idx_unq > 0, out=np.zeros_like(idx_unq, dtype=np.float64)
        )
        idx_unq_mag = np.power(10, np.floor(exp) + 1)

        # Result
        return a + idx_unq / idx_unq_mag

    def sparse_upper_star(self, idx, V):
        """Sparse implementation of an upper star filtration.

        Parameters
        ----------
        idx : :obj:`~numpy.array`
            Edge indices for each dimension (MxN).
        V : :obj:`~numpy.array`
            Array of intensity data (Mx1).

        Returns
        -------
        idx : :obj:`~numpy.array`
            Index of filtered points (Mx1).
        persistence : :obj:`~numpy.array`
            Persistence of each filtered point (Mx1).

        Notes
        -----
        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos
        """

        # Invert
        V = -1 * V.copy().astype(int)

        # Embed indices
        V = self.embed_unique_indices(V)

        # Connectivity matrix
        cmat = KDTree(idx)
        cmat = cmat.sparse_distance_matrix(cmat, 1, p=np.inf, output_type="coo_matrix")
        cmat.setdiag(1)
        cmat = sparse.triu(cmat)

        # Pairwise minimums
        I, J = cmat.nonzero()
        d = np.maximum(V[I], V[J])

        # Delete connectivity matrix
        cmat_shape = cmat.shape
        del cmat

        # Sparse distance matrix
        sdm = sparse.coo_matrix((d, (I, J)), shape=cmat_shape)

        # Delete pairwise mins
        del d, I, J

        # Persistent homology
        ph = ripser(sdm, distance_matrix=True, maxdim=0)["dgms"][0]

        # Bound death values
        ph[ph[:, 1] == np.inf, 1] = np.max(V)

        # Construct tree to query against
        tree = KDTree(V.reshape((-1, 1)))

        # Get the indexes of the first nearest neighbor by birth
        _, nn = tree.query(ph[:, 0].reshape((-1, 1)), k=1, workers=-1)

        return nn, -(ph[:, 0] // 1 - ph[:, 1] // 1)

    def check_if_grid(self, data):
        """Check if the data are gridded in mz space.

        Parameters
        ----------
        data : DataFrame
            DataFrame containing the mass spectrometry data. Needs to have mz and scan columns.

        Returns
        -------
        bool
            True if the data are gridded in the mz direction, False otherwise.

        Notes
        -----
        This function is used within the grid_data function and the find_mass_features function and is not intended to be called directly.
        """
        # Calculate the difference between consecutive mz values in a single scan
        dat_check = data.copy().reset_index(drop=True)
        dat_check["mz_diff"] = np.abs(dat_check["mz"].diff())
        mz_diff_min = (
            dat_check.groupby("scan")["mz_diff"].min().min()
        )  # within each scan, the smallest mz difference between consecutive mz values

        # Find the minimum mz difference between mz values in the data, regardless of scan
        dat_check_mz = dat_check[["mz"]].drop_duplicates().copy()
        dat_check_mz = dat_check_mz.sort_values(by=["mz"]).reset_index(drop=True)
        dat_check_mz["mz_diff"] = np.abs(dat_check_mz["mz"].diff())

        # Get minimum mz_diff between mz values in the data
        mz_diff_min_raw = dat_check_mz["mz_diff"].min()

        # If the minimum mz difference across the data is less than the minimum mz difference within a single scan, the data are not gridded
        if mz_diff_min_raw < mz_diff_min:
            return False
        else:
            return True

    def grid_data(self, data):
        """Grid the data in the mz dimension.

        Data must be gridded prior to persistent homology calculations.

        Parameters
        ----------
        data : DataFrame
            The input data containing mz, scan, scan_time, and intensity columns.

        Returns
        -------
        DataFrame
            The gridded data with mz, scan, scan_time, and intensity columns.

        Raises
        ------
        ValueError
            If gridding fails.
        """

        # Calculate the difference between consecutive mz values in a single scan for grid spacing
        data_w = data.copy().reset_index(drop=True)
        data_w["mz_diff"] = np.abs(data_w["mz"].diff())
        mz_diff_min = data_w.groupby("scan")["mz_diff"].min().min() * 0.99999

        # Need high intensity mz values first so they are parents in the output pairs stack
        dat_mz = data_w[["mz", "intensity"]].sort_values(
            by=["intensity"], ascending=False
        )
        dat_mz = dat_mz[["mz"]].drop_duplicates().reset_index(drop=True).copy()

        # Construct KD tree
        tree = KDTree(dat_mz.mz.values.reshape(-1, 1))
        sdm = tree.sparse_distance_matrix(tree, mz_diff_min, output_type="coo_matrix")
        sdm = sparse.triu(sdm, k=1)
        sdm.data = np.ones_like(sdm.data)
        distances = sdm.tocoo()
        pairs = np.stack((distances.row, distances.col), axis=1)

        # Cull pairs to just get root
        to_drop = []
        while len(pairs) > 0:
            root_parents = np.setdiff1d(np.unique(pairs[:, 0]), np.unique(pairs[:, 1]))
            id_root_parents = np.isin(pairs[:, 0], root_parents)
            children_of_roots = np.unique(pairs[id_root_parents, 1])
            to_drop = np.append(to_drop, children_of_roots)

            # Set up pairs array for next iteration by removing pairs with children or parents already dropped
            pairs = pairs[~np.isin(pairs[:, 1], to_drop), :]
            pairs = pairs[~np.isin(pairs[:, 0], to_drop), :]
        dat_mz = dat_mz.reset_index(drop=True).drop(index=np.array(to_drop))
        mz_dat_np = (
            dat_mz[["mz"]]
            .sort_values(by=["mz"])
            .reset_index(drop=True)
            .values.flatten()
        )

        # Sort data by mz and recast mz to nearest value in mz_dat_np
        data_w = data_w.sort_values(by=["mz"]).reset_index(drop=True).copy()
        data_w["mz_new"] = mz_dat_np[find_closest(mz_dat_np, data_w["mz"].values)]
        data_w["mz_diff"] = np.abs(data_w["mz"] - data_w["mz_new"])

        # Rename mz_new as mz; drop mz_diff; groupby scan and mz and sum intensity
        new_data_w = data_w.rename(columns={"mz": "mz_orig", "mz_new": "mz"}).copy()
        new_data_w = (
            new_data_w.drop(columns=["mz_diff", "mz_orig"])
            .groupby(["scan", "mz"])["intensity"]
            .sum()
            .reset_index()
        )
        new_data_w = (
            new_data_w.sort_values(by=["scan", "mz"], ascending=[True, True])
            .reset_index(drop=True)
            .copy()
        )

        # Check if grid worked and return
        if self.check_if_grid(new_data_w):
            return new_data_w
        else:
            raise ValueError("Gridding failed")

    def find_mass_features_ph(self, ms_level=1, grid=True):
        """Find mass features within an LCMSBase object using persistent homology.

        Assigns the mass_features attribute to the object (a dictionary of LCMSMassFeature objects, keyed by mass feature id).

        Parameters
        ----------
        ms_level : int, optional
            The MS level to use. Default is 1.
        grid : bool, optional
            If True, will regrid the data before running the persistent homology calculations (after checking if the data is gridded). Default is True.

        Raises
        ------
        ValueError
            If no MS level data is found on the object.
            If data is not gridded and grid is False.

        Returns
        -------
        None, but assigns the mass_features attribute to the object.

        Notes
        -----
        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos
        """
        # Check that ms_level is a key in self._ms_unprocessed
        if ms_level not in self._ms_unprocessed.keys():
            raise ValueError(
                "No MS level "
                + str(ms_level)
                + " data found, did you instantiate with a parser specific to this MS level?"
            )

        # Get ms data
        data = self._ms_unprocessed[ms_level].copy()

        # Drop rows with missing intensity values and reset index
        data = data.dropna(subset=["intensity"]).reset_index(drop=True)

        # Threshold data
        dims = ["mz", "scan_time"]
        threshold = self.parameters.lc_ms.ph_inten_min_rel * data.intensity.max()
        data_thres = data[data["intensity"] > threshold].reset_index(drop=True).copy()

        # Check if gridded, if not, grid
        gridded_mz = self.check_if_grid(data_thres)
        if gridded_mz is False:
            if grid is False:
                raise ValueError(
                    "Data are not gridded in the mz dimension; try reprocessing with different parameters or grid the data before running this function"
                )
            else:
                data_thres = self.grid_data(data_thres)

        # Add build factors and add scan_time
        data_thres = data_thres.merge(self.scan_df[["scan", "scan_time"]], on="scan")
        factors = {
            dim: pd.factorize(data_thres[dim], sort=True)[1].astype(np.float32)
            for dim in dims
        }  # pd.factorize returns a float64 index, so cast to float32

        # Build indexes
        index = {
            dim: np.searchsorted(factors[dim], data_thres[dim]).astype(np.float32)
            for dim in factors
        }

        # Smooth data
        iterations = self.parameters.lc_ms.ph_smooth_it
        smooth_radius = [
            self.parameters.lc_ms.ph_smooth_radius_mz,
            self.parameters.lc_ms.ph_smooth_radius_scan,
        ]  # mz, scan_time smoothing radius (in steps)

        index = np.vstack([index[dim] for dim in dims]).T
        V = data_thres["intensity"].values
        resid = np.inf
        for i in range(iterations):
            # Previous iteration
            V_prev = V.copy()
            resid_prev = resid
            V = self.sparse_mean_filter(index, V, radius=smooth_radius)

            # Calculate residual with previous iteration
            resid = np.sqrt(np.mean(np.square(V - V_prev)))

            # Evaluate convergence
            if i > 0:
                # Percent change in residual
                test = np.abs(resid - resid_prev) / resid_prev

                # Exit criteria
                if test <= 0:
                    break

        # Overwrite values
        data_thres["intensity"] = V

        # Use persistent homology to find regions of interest
        pidx, pers = self.sparse_upper_star(index, V)
        pidx = pidx[pers > 1]
        pers = pers[pers > 1]

        # Get peaks
        peaks = data_thres.iloc[pidx, :].reset_index(drop=True)

        # Add persistence column
        peaks["persistence"] = pers
        mass_features = peaks.sort_values(
            by="persistence", ascending=False
        ).reset_index(drop=True)

        # Filter by persistence threshold
        persistence_threshold = (
            self.parameters.lc_ms.ph_persis_min_rel * data.intensity.max()
        )
        mass_features = mass_features.loc[
            mass_features["persistence"] > persistence_threshold, :
        ].reset_index(drop=True)

        # Rename scan column to apex_scan
        mass_features = mass_features.rename(
            columns={"scan": "apex_scan", "scan_time": "retention_time"}
        )

        # Populate mass_features attribute
        self.mass_features = {}
        for row in mass_features.itertuples():
            row_dict = mass_features.iloc[row.Index].to_dict()
            lcms_feature = LCMSMassFeature(self, **row_dict)
            self.mass_features[lcms_feature.id] = lcms_feature

        if self.parameters.lc_ms.verbose_processing:
            print("Found " + str(len(mass_features)) + " initial mass features")

    def cluster_mass_features(self, drop_children=True, sort_by="persistence"):
        """Cluster mass features.

        Clusters mass features based on their proximity in the mz and scan_time dimensions, prioritizing the mass features with the highest persistence.

        Parameters
        ----------
        drop_children : bool, optional
            Whether to drop the mass features that are not cluster parents. Default is True.
        sort_by : str, optional
            The column to sort the mass features by; this determines which mass features get rolled up into a parent mass feature. Default is "persistence".

        Raises
        ------
        ValueError
            If no mass features are found.
            If too many mass features are found.

        Returns
        -------
        None if drop_children is True, otherwise returns a list of mass feature ids that are not cluster parents.
        """
        verbose = self.parameters.lc_ms.verbose_processing

        if self.mass_features is None:
            raise ValueError("No mass features found, run find_mass_features() first")
        if len(self.mass_features) > 400000:
            raise ValueError(
                "Too many mass features of interest found, run find_mass_features() with a higher intensity threshold"
            )
        dims = ["mz", "scan_time"]
        mf_df_og = self.mass_features_to_df()
        mf_df = mf_df_og.copy()

        # Sort mass features by sort_by column, make mf_id its own column for easier bookkeeping
        mf_df = mf_df.sort_values(by=sort_by, ascending=False).reset_index(drop=False)

        tol = [
            self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
            self.parameters.lc_ms.mass_feature_cluster_rt_tolerance,
        ]  # mz tolerance is relative; scan_time tolerance is in minutes
        relative = [True, False]

        # Compute inter-feature distances
        distances = None
        for i in range(len(dims)):
            # Construct k-d tree
            values = mf_df[dims[i]].values
            tree = KDTree(values.reshape(-1, 1))

            max_tol = tol[i]
            if relative[i] is True:
                # Maximum absolute tolerance
                max_tol = tol[i] * values.max()

            # Compute sparse distance matrix
            # the larger the max_tol, the slower this operation is
            sdm = tree.sparse_distance_matrix(tree, max_tol, output_type="coo_matrix")

            # Only consider forward case, exclude diagonal
            sdm = sparse.triu(sdm, k=1)

            # Filter relative distances
            if relative[i] is True:
                # Compute relative distances
                rel_dists = sdm.data / values[sdm.row]  # or col?

                # Indices of relative distances less than tolerance
                idx = rel_dists <= tol[i]

                # Reconstruct sparse distance matrix
                sdm = sparse.coo_matrix(
                    (rel_dists[idx], (sdm.row[idx], sdm.col[idx])),
                    shape=(len(values), len(values)),
                )

            # Cast as binary matrix
            sdm.data = np.ones_like(sdm.data)

            # Stack distances
            if distances is None:
                distances = sdm
            else:
                distances = distances.multiply(sdm)

        # Extract indices of within-tolerance points
        distances = distances.tocoo()
        pairs = np.stack((distances.row, distances.col), axis=1)
        pairs_df = pd.DataFrame(pairs, columns=["parent", "child"])
        pairs_df = pairs_df.set_index("parent")

        to_drop = []
        while not pairs_df.empty:
            # Find root_parents and their children
            root_parents = np.setdiff1d(
                np.unique(pairs_df.index.values), np.unique(pairs_df.child.values)
            )
            children_of_roots = pairs_df.loc[root_parents, "child"].unique()
            to_drop = np.append(to_drop, children_of_roots)

            # Remove root_children as possible parents from pairs_df for next iteration
            pairs_df = pairs_df.drop(index=children_of_roots, errors="ignore")
            pairs_df = pairs_df.reset_index().set_index("child")
            # Remove root_children as possible children from pairs_df for next iteration
            pairs_df = pairs_df.drop(index=children_of_roots)

            # Prepare for next iteration
            pairs_df = pairs_df.reset_index().set_index("parent")

        # Drop mass features that are not cluster parents
        mf_df = mf_df.drop(index=np.array(to_drop))

        # Set index back to mf_id
        mf_df = mf_df.set_index("mf_id")
        if verbose:
            print(str(len(mf_df)) + " mass features remaining")

        mf_df_new = mf_df_og.copy()
        mf_df_new["cluster_parent"] = np.where(
            np.isin(mf_df_new.index, mf_df.index), True, False
        )

        # Get mass feature ids of features that are not cluster parents
        cluster_daughters = mf_df_new[mf_df_new["cluster_parent"] == False].index.values
        if drop_children is True:
            # Drop mass features that are not cluster parents from self
            self.mass_features = {
                k: v
                for k, v in self.mass_features.items()
                if k not in cluster_daughters
            }
        else:
            return cluster_daughters
Methods for performing calculations related to 2D peak picking via persistent homology on LCMS data.

Notes
-----
This class is intended to be used as a mixin for the LCMSBase class.

Methods
-------
- sparse_mean_filter(idx, V, radius=[0, 1, 1]). Sparse implementation of a mean filter. A static method.
- embed_unique_indices(a). Creates an array of indices, sorted by unique element. A static method.
- sparse_upper_star(idx, V). Sparse implementation of an upper star filtration.
- check_if_grid(data). Check if the data are gridded in mz space.
- grid_data(data). Grid the data in the mz dimension.
- find_mass_features_ph(ms_level=1, grid=True). Find mass features within an LCMSBase object using persistent homology.
- cluster_mass_features(drop_children=True, sort_by="persistence"). Cluster mass features (regions of interest) by proximity in the mz and scan_time dimensions.
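The sketch below shows the intended call sequence on an LCMSBase-derived object that mixes in these methods. The `lcms` variable is a hypothetical, already-instantiated object with unprocessed MS1 data attached; only the method names and the mass_features attribute come from the code above.

# Hypothetical LCMSBase-derived object with unprocessed MS1 data loaded.
lcms.find_mass_features_ph(ms_level=1, grid=True)  # populates lcms.mass_features
lcms.cluster_mass_features(drop_children=True)     # collapses near-duplicate features

# mass_features is a dict of LCMSMassFeature objects keyed by mass feature id.
print(len(lcms.mass_features), "mass features after clustering")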
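A small usage sketch for sparse_mean_filter. The index grid, intensities, and radius values are made up, and `lcms` again stands in for any object that mixes in this class (the method is static, so it can also be called on the class itself).

import numpy as np

# Five points on a 2-D (mz step, scan step) index grid with one intensity spike.
idx = np.array([[0, 0], [0, 1], [0, 2], [1, 1], [2, 1]], dtype=float)
V = np.array([10.0, 12.0, 11.0, 100.0, 9.0])

# Each point is averaged with its neighbors within one index step per
# dimension (a p=inf ball), so the spike at (1, 1) is pulled toward the
# intensities of its neighbors.
V_smooth = lcms.sparse_mean_filter(idx, V, radius=[1, 1])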
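A worked example of the index-embedding trick, again through a hypothetical `lcms` instance. Duplicate values receive increasing decimal suffixes, and indices that are multiples of ten are skipped so that, for example, a suffix of 10/100 = 0.10 can never collide with 1/10 = 0.1.

import numpy as np

a = np.array([5, 5, 5, 7])

# The three 5s become 5.0, 5.1, 5.2; the lone 7 stays 7.0. This makes tied
# (integer) values unique while preserving their relative order, which is
# how sparse_upper_star de-duplicates intensities before building its filtration.
embedded = lcms.embed_unique_indices(a)
# -> array([5. , 5.1, 5.2, 7. ])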
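A minimal sketch of calling sparse_upper_star directly, mirroring how find_mass_features_ph uses it. The integer index positions and intensities are fabricated, and `lcms` is a hypothetical mixin instance.

import numpy as np

# 2-D integer index positions (mz step, scan step) and raw intensities.
index = np.array([[0, 0], [0, 1], [0, 2], [1, 0], [1, 1], [1, 2]])
V = np.array([5, 40, 6, 4, 80, 7])

# pidx indexes the surviving local maxima; pers is their persistence,
# roughly how far each apex stands above the level at which it would
# merge with a taller neighbor.
pidx, pers = lcms.sparse_upper_star(index, V)

# find_mass_features_ph keeps only features with persistence > 1.
pidx = pidx[pers > 1]
pers = pers[pers > 1]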
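The gridding check compares the smallest mz spacing seen within any single scan to the smallest spacing across all unique mz values in the data; if an mz value from one scan falls between the grid points of another, the global spacing shrinks below the within-scan spacing and the check fails. A tiny illustration with made-up numbers and a hypothetical `lcms` instance:

import pandas as pd

# Two scans sharing the same mz grid: gridded.
on_grid = pd.DataFrame({
    "scan": [1, 1, 1, 2, 2, 2],
    "mz": [100.00, 100.01, 100.02, 100.00, 100.01, 100.02],
})
lcms.check_if_grid(on_grid)   # True

# Shift the second scan by 3 mDa: the global mz spacing (~0.003) is now
# smaller than the within-scan spacing (~0.01), so the check fails.
off_grid = on_grid.copy()
off_grid.loc[off_grid["scan"] == 2, "mz"] += 0.003
lcms.check_if_grid(off_grid)  # False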
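grid_data collapses near-coincident mz readings onto a shared set of grid values (the most intense reading in each neighborhood becomes the root that its neighbors are snapped to) and then sums intensities per (scan, mz). A hedged usage sketch; `raw_df` and `lcms` are placeholders, and the column names are the ones the docstring requires.

# raw_df must contain mz, scan, scan_time, and intensity columns.
if not lcms.check_if_grid(raw_df):
    gridded_df = lcms.grid_data(raw_df)  # raises ValueError if gridding fails
else:
    gridded_df = raw_df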
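The two relative thresholds used by find_mass_features_ph are scaled by the maximum raw intensity in the data, so the same settings adapt across files. The parameter values below are purely illustrative; only the parameter names come from the code above.

# If the most intense raw MS1 point is 2.0e8 counts, then with
# ph_inten_min_rel = 0.001 the intensity floor is 2.0e5, and with
# ph_persis_min_rel = 0.001 the persistence floor is also 2.0e5.
lcms.parameters.lc_ms.ph_inten_min_rel = 0.001
lcms.parameters.lc_ms.ph_persis_min_rel = 0.001
lcms.parameters.lc_ms.ph_smooth_it = 7           # maximum smoothing iterations
lcms.parameters.lc_ms.ph_smooth_radius_mz = 0    # smoothing radius in mz steps
lcms.parameters.lc_ms.ph_smooth_radius_scan = 1  # smoothing radius in scan steps
lcms.find_mass_features_ph(ms_level=1, grid=True)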
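Clustering uses a relative tolerance in mz and an absolute tolerance (in minutes) in scan_time, so it is worth checking what the relative value means at your mass range. A short sketch with hypothetical settings; only the parameter and method names come from the code above.

# Hypothetical tolerances: 5e-6 (5 ppm) in mz, 0.3 min in scan_time.
# At m/z 400 the relative tolerance corresponds to 400 * 5e-6 = 0.002 mz.
lcms.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel = 5e-6
lcms.parameters.lc_ms.mass_feature_cluster_rt_tolerance = 0.3

# Features within both tolerances are grouped; only the feature ranked
# highest by sort_by (persistence by default) survives as the cluster parent.
lcms.cluster_mass_features(drop_children=True, sort_by="persistence")

# With drop_children=False the non-parent feature ids are returned instead
# of being removed from lcms.mass_features:
# child_ids = lcms.cluster_mass_features(drop_children=False)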