corems.mass_spectra.calc.lc_calc
import numpy as np
import pandas as pd
from ripser import ripser
from scipy import sparse
from scipy.spatial import KDTree

from corems.chroma_peak.factory.chroma_peak_classes import LCMSMassFeature
from corems.mass_spectra.calc import SignalProcessing as sp
from corems.mass_spectra.factory.chromat_data import EIC_Data
from corems.mass_spectrum.input.numpyArray import ms_from_array_profile


def find_closest(A, target):
    """Find the index of closest value in A to each value in target.

    Parameters
    ----------
    A : :obj:`~numpy.array`
        The array to search (blueprint). A must be sorted.
    target : :obj:`~numpy.array`
        The array of values to search for. target must be sorted.

    Returns
    -------
    :obj:`~numpy.array`
        The indices of the closest values in A to each value in target.
    """
    idx = A.searchsorted(target)
    idx = np.clip(idx, 1, len(A) - 1)
    left = A[idx - 1]
    right = A[idx]
    idx -= target - left < right - target
    return idx

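# Illustrative sketch (not part of the module API): find_closest maps each
# sorted target value onto the index of its nearest neighbour in a sorted
# reference array, e.g. when recasting m/z values onto a grid:
#
#     >>> A = np.array([100.0, 200.0, 300.0])
#     >>> target = np.array([104.0, 290.0])
#     >>> find_closest(A, target)
#     array([0, 2])
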
class LCCalculations:
    """Methods for performing LC calculations on mass spectra data.

    Notes
    -----
    This class is intended to be used as a mixin for the LCMSBase class.

    Methods
    -------
    * get_max_eic(eic_data).
        Returns the maximum EIC value from the given EIC data. A static method.
    * smooth_tic(tic).
        Smooths the TIC data using the specified smoothing method and settings.
    * eic_centroid_detector(rt, eic, max_eic).
        Performs EIC centroid detection on the given EIC data.
    * find_nearest_scan(rt).
        Finds the nearest scan to the given retention time.
    * get_average_mass_spectrum(scan_list, apex_scan, spectrum_mode="profile", ms_level=1, auto_process=True, use_parser=False, perform_checks=True, polarity=None).
        Returns an averaged mass spectrum object.
    * find_mass_features(ms_level=1).
        Find regions of interest for a given MS level (default is MS1).
    * integrate_mass_features(drop_if_fail=False, ms_level=1).
        Integrates mass features of interest and extracts EICs.
    * find_c13_mass_features().
        Evaluate mass features and mark likely C13 isotopes.
    * deconvolute_ms1_mass_features().
        Deconvolute mass features' MS1 mass spectra.
    """

    @staticmethod
    def get_max_eic(eic_data: dict):
        """Returns the maximum EIC value from the given EIC data.

        Notes
        -----
        This is a static method.

        Parameters
        ----------
        eic_data : dict
            A dictionary containing EIC data.

        Returns
        -------
        float
            The maximum EIC value.
        """
        max_eic = 0
        for eic in eic_data.values():
            ind_max_eic = max(eic.get("EIC"))
            max_eic = ind_max_eic if ind_max_eic > max_eic else max_eic

        return max_eic

    def smooth_tic(self, tic):
        """Smooths the TIC or EIC data using the specified smoothing method and settings.

        Parameters
        ----------
        tic : numpy.ndarray
            The TIC (or EIC) data to be smoothed.

        Returns
        -------
        numpy.ndarray
            The smoothed TIC data.
        """
        implemented_smooth_method = self.parameters.lc_ms.implemented_smooth_method

        pol_order = self.parameters.lc_ms.savgol_pol_order

        window_len = self.parameters.lc_ms.smooth_window

        window = self.parameters.lc_ms.smooth_method

        return sp.smooth_signal(
            tic, window_len, window, pol_order, implemented_smooth_method
        )

    def eic_centroid_detector(self, rt, eic, max_eic, apex_indexes=[]):
        """Performs EIC centroid detection on the given EIC data.

        Parameters
        ----------
        rt : numpy.ndarray
            The retention time data.
        eic : numpy.ndarray
            The EIC data.
        max_eic : float
            The maximum EIC value.
        apex_indexes : list, optional
            The apexes of the EIC peaks. Defaults to [], which means that the apexes will be calculated by the function.

        Returns
        -------
        numpy.ndarray
            The indexes of left, apex, and right limits as a generator.
        """

        max_prominence = self.parameters.lc_ms.peak_max_prominence_percent

        max_height = self.parameters.lc_ms.peak_height_max_percent

        signal_threshold = self.parameters.lc_ms.eic_signal_threshold

        min_peak_datapoints = self.parameters.lc_ms.min_peak_datapoints

        peak_derivative_threshold = self.parameters.lc_ms.peak_derivative_threshold

        include_indexes = sp.peak_picking_first_derivative(
            domain=rt,
            signal=eic,
            max_height=max_height,
            max_prominence=max_prominence,
            max_signal=max_eic,
            min_peak_datapoints=min_peak_datapoints,
            peak_derivative_threshold=peak_derivative_threshold,
            signal_threshold=signal_threshold,
            correct_baseline=False,
            plot_res=False,
            apex_indexes=apex_indexes,
        )

        return include_indexes

    def find_nearest_scan(self, rt):
        """Finds the nearest scan to the given retention time.

        Parameters
        ----------
        rt : float
            The retention time (in minutes) to find the nearest scan for.

        Returns
        -------
        int
            The scan number of the nearest scan.
        """
        array_rt = np.array(self.retention_time)

        scan_index = (np.abs(array_rt - rt)).argmin()

        real_scan = self.scans_number[scan_index]

        return real_scan

    def add_peak_metrics(self):
        """Add peak metrics to the mass features.

        This function calculates the peak metrics for each mass feature and adds them to the mass feature objects.
        """
        # Check that at least some mass features have eic data
        if not any([mf._eic_data is not None for mf in self.mass_features.values()]):
            raise ValueError(
                "No mass features have EIC data. Run integrate_mass_features first."
            )

        for mass_feature in self.mass_features.values():
            # Check if the mass feature has been integrated
            if mass_feature._eic_data is not None:
                # Calculate peak metrics
                mass_feature.calc_half_height_width()
                mass_feature.calc_tailing_factor()
                mass_feature.calc_dispersity_index()

    def get_average_mass_spectrum(
        self,
        scan_list,
        apex_scan,
        spectrum_mode="profile",
        ms_level=1,
        auto_process=True,
        use_parser=False,
        perform_checks=True,
        polarity=None,
        ms_params=None,
    ):
        """Returns an averaged mass spectrum object

        Parameters
        ----------
        scan_list : list
            List of scan numbers to average.
        apex_scan : int
            Number of the apex scan
        spectrum_mode : str, optional
            The spectrum mode to use. Defaults to "profile". Note that only "profile" mode is supported for averaging.
        ms_level : int, optional
            The MS level to use. Defaults to 1.
        auto_process : bool, optional
            If True, the averaged mass spectrum will be auto-processed. Defaults to True.
        use_parser : bool, optional
            If True, the mass spectra will be obtained from the parser. Defaults to False.
        perform_checks : bool, optional
            If True, the function will check if the data are within the ms_unprocessed dictionary and are the correct mode. Defaults to True.
            Only set to False if you are sure the data are profile mode and (if not using the parser) are in the ms_unprocessed dictionary;
            the ms_unprocessed dictionary must also be indexed on scan.
        polarity : int, optional
            The polarity of the mass spectra (1 or -1). If not set, the polarity will be determined from the dataset. Defaults to None. (fastest if set to -1 or 1)
        ms_params : MSParameters, optional
            The mass spectrum parameters to use. If not set (None), the globally set parameters will be used. Defaults to None.

        Returns
        -------
        MassSpectrumProfile
            The averaged mass spectrum object.

        Raises
        ------
        ValueError
            If the spectrum mode is not "profile".
            If the MS level is not found in the unprocessed mass spectra dictionary.
            If not all scan numbers are found in the unprocessed mass spectra dictionary.
        """
        if perform_checks:
            if spectrum_mode != "profile":
                raise ValueError("Averaging only supported for profile mode")

        if polarity is None:
            # set polarity to -1 if negative mode, 1 if positive mode (for mass spectrum creation)
            if self.polarity == "negative":
                polarity = -1
            elif self.polarity == "positive":
                polarity = 1
            else:
                raise ValueError(
                    "Polarity not set for dataset, must be set to either 'positive' or 'negative'"
                )

        # if not using the parser, check that scan numbers are in _ms_unprocessed
        if not use_parser:
            if perform_checks:
                # Set index to scan for faster lookup
                ms_df = (
                    self._ms_unprocessed[ms_level]
                    .copy()
                    .set_index("scan", drop=False)
                    .sort_index()
                )
                my_ms_df = ms_df.loc[scan_list]
                # Check that all scan numbers are in the ms_df
                if not all(np.isin(scan_list, ms_df.index)):
                    raise ValueError(
                        "Not all scan numbers found in the unprocessed mass spectra dictionary"
                    )
            else:
                my_ms_df = (
                    pd.DataFrame({"scan": scan_list})
                    .set_index("scan")
                    .join(self._ms_unprocessed[ms_level], how="left")
                )

        if use_parser:
            ms_list = [
                self.spectra_parser.get_mass_spectrum_from_scan(
                    x, spectrum_mode=spectrum_mode, auto_process=False
                )
                for x in scan_list
            ]
            ms_mz = [x._mz_exp for x in ms_list]
            ms_int = [x._abundance for x in ms_list]
            my_ms_df = []
            for i in np.arange(len(ms_mz)):
                my_ms_df.append(
                    pd.DataFrame(
                        {"mz": ms_mz[i], "intensity": ms_int[i], "scan": scan_list[i]}
                    )
                )
            my_ms_df = pd.concat(my_ms_df)

        if not self.check_if_grid(my_ms_df):
            my_ms_df = self.grid_data(my_ms_df)

        my_ms_ave = my_ms_df.groupby("mz")["intensity"].sum().reset_index()

        ms = ms_from_array_profile(
            my_ms_ave.mz,
            my_ms_ave.intensity,
            self.file_location,
            polarity=polarity,
            auto_process=False,
        )

        # Set the mass spectrum parameters, auto-process if auto_process is True, and add to the dataset
        if ms is not None:
            if ms_params is not None:
                ms.parameters = ms_params
            ms.scan_number = apex_scan
            if auto_process:
                ms.process_mass_spec()
        return ms

    def find_mass_features(self, ms_level=1, grid=True):
        """Find mass features within an LCMSBase object

        Note that this is a wrapper function that calls the find_mass_features_ph function, but can be extended to support other peak picking methods in the future.

        Parameters
        ----------
        ms_level : int, optional
            The MS level to use for peak picking. Default is 1.
        grid : bool, optional
            If True, will regrid the data before running the persistent homology calculations (after checking if the data is gridded),
            used for persistent homology peak picking for profile data only. Default is True.

        Raises
        ------
        ValueError
            If no MS level data is found on the object.
            If persistent homology peak picking is attempted on non-profile mode data.
            If data is not gridded and grid is False.
            If peak picking method is not implemented.

        Returns
        -------
        None, but assigns the mass_features and eics attributes to the object.

        """
        pp_method = self.parameters.lc_ms.peak_picking_method

        if pp_method == "persistent homology":
            msx_scan_df = self.scan_df[self.scan_df["ms_level"] == ms_level]
            if all(msx_scan_df["ms_format"] == "profile"):
                self.find_mass_features_ph(ms_level=ms_level, grid=grid)
            else:
                raise ValueError(
                    "MS{} scans are not profile mode, which is required for persistent homology peak picking.".format(
                        ms_level
                    )
                )
        elif pp_method == "centroided_persistent_homology":
            msx_scan_df = self.scan_df[self.scan_df["ms_level"] == ms_level]
            if all(msx_scan_df["ms_format"] == "centroid"):
                self.find_mass_features_ph_centroid(ms_level=ms_level)
            else:
                raise ValueError(
                    "MS{} scans are not centroid mode, which is required for persistent homology centroided peak picking.".format(
                        ms_level
                    )
                )
        else:
            raise ValueError("Peak picking method not implemented")

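    # Hedged usage sketch (illustrative only; "lcms_obj" is an assumed LCMSBase
    # instance built elsewhere, not defined in this module):
    #
    #     lcms_obj.parameters.lc_ms.peak_picking_method = "persistent homology"
    #     lcms_obj.find_mass_features(ms_level=1)       # populates mass_features
    #     lcms_obj.add_associated_ms1(...)              # associate MS1 spectra (assumed helper)
    #     lcms_obj.integrate_mass_features(ms_level=1)  # populates eics and peak areas
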
    def integrate_mass_features(
        self, drop_if_fail=True, drop_duplicates=True, ms_level=1
    ):
        """Integrate mass features and extract EICs.

        Populates the _eics attribute on the LCMSBase object for each unique mz in the mass_features dataframe and adds data (start_scan, final_scan, area) to the mass_features attribute.

        Parameters
        ----------
        drop_if_fail : bool, optional
            Whether to drop mass features if the EIC limit calculations fail.
            Default is True.
        drop_duplicates : bool, optional
            Whether to drop mass features that appear to be duplicates
            (i.e., mz is similar to another mass feature and limits of the EIC are similar or encapsulating).
            Default is True.
        ms_level : int, optional
            The MS level to use. Default is 1.

        Raises
        ------
        ValueError
            If no mass features are found.
            If no MS level data is found for the given MS level (either in data or in the scan data).

        Returns
        -------
        None, but populates the eics attribute on the LCMSBase object and adds data (start_scan, final_scan, area) to the mass_features attribute.

        Notes
        -----
        drop_if_fail is useful for discarding mass features that do not have good shapes, usually due to a detection on a shoulder of a peak or a noisy region (especially if minimal smoothing is used during mass feature detection).
        """
        # Check if there is data
        if ms_level in self._ms_unprocessed.keys():
            raw_data = self._ms_unprocessed[ms_level].copy()
        else:
            raise ValueError("No MS level " + str(ms_level) + " data found")
        if self.mass_features is not None:
            mf_df = self.mass_features_to_df().copy()
        else:
            raise ValueError(
                "No mass features found, did you run find_mass_features() first?"
            )
        # Check if mass_spectrum exists on each mass feature
        if not all(
            [mf.mass_spectrum is not None for mf in self.mass_features.values()]
        ):
            raise ValueError(
                "Mass spectrum must be associated with each mass feature, did you run add_associated_ms1() first?"
            )

        # Subset scan data to only include correct ms_level
        scan_df_sub = self.scan_df[
            self.scan_df["ms_level"] == int(ms_level)
        ].reset_index(drop=True)
        if scan_df_sub.empty:
            raise ValueError(
                "No MS level " + str(ms_level) + " data found in scan data"
            )
        scan_df_sub = scan_df_sub[["scan", "scan_time"]].copy()

        mzs_to_extract = np.unique(mf_df["mz"].values)
        mzs_to_extract.sort()

        # Pre-sort raw_data by mz for faster filtering
        raw_data_sorted = raw_data.sort_values(["mz", "scan"]).reset_index(drop=True)
        raw_data_mz = raw_data_sorted["mz"].values

        # Get EICs for each unique mz in mass features list
        for mz in mzs_to_extract:
            mz_max = mz + self.parameters.lc_ms.eic_tolerance_ppm * mz / 1e6
            mz_min = mz - self.parameters.lc_ms.eic_tolerance_ppm * mz / 1e6

            # Use binary search for faster mz range filtering
            left_idx = np.searchsorted(raw_data_mz, mz_min, side="left")
            right_idx = np.searchsorted(raw_data_mz, mz_max, side="right")
            raw_data_sub = raw_data_sorted.iloc[left_idx:right_idx].copy()

            raw_data_sub = (
                raw_data_sub.groupby(["scan"])["intensity"].sum().reset_index()
            )
            raw_data_sub = scan_df_sub.merge(raw_data_sub, on="scan", how="left")
            raw_data_sub["intensity"] = raw_data_sub["intensity"].fillna(0)
            myEIC = EIC_Data(
                scans=raw_data_sub["scan"].values,
                time=raw_data_sub["scan_time"].values,
                eic=raw_data_sub["intensity"].values,
            )
            # Smooth EIC
            smoothed_eic = self.smooth_tic(myEIC.eic)
            smoothed_eic[smoothed_eic < 0] = 0
            myEIC.eic_smoothed = smoothed_eic
            self.eics[mz] = myEIC

        # Get limits of mass features using EIC centroid detector and integrate
        mf_df["area"] = np.nan
        for idx, mass_feature in mf_df.iterrows():
            mz = mass_feature.mz
            apex_scan = mass_feature.apex_scan

            # Pull EIC data and find apex scan index
            myEIC = self.eics[mz]
            self.mass_features[idx]._eic_data = myEIC
            apex_index = np.where(myEIC.scans == apex_scan)[0][0]

            # Find left and right limits of peak using EIC centroid detector, add to EICData
            centroid_eics = self.eic_centroid_detector(
                myEIC.time,
                myEIC.eic_smoothed,
                mass_feature.intensity * 1.1,
                apex_indexes=[int(apex_index)],
            )
            l_a_r_scan_idx = [i for i in centroid_eics]
            if len(l_a_r_scan_idx) > 0:
                # Calculate number of consecutive scans with intensity > 0 and check if it is above the minimum consecutive scans
                # Find the number of consecutive non-zero values in the EIC segment
                mask = myEIC.eic[l_a_r_scan_idx[0][0] : l_a_r_scan_idx[0][2] + 1] > 0
                # Find the longest run of consecutive True values
                if np.any(mask):
                    # Find indices where mask changes value
                    diff = np.diff(np.concatenate(([0], mask.astype(int), [0])))
                    starts = np.where(diff == 1)[0]
                    ends = np.where(diff == -1)[0]
                    consecutive_scans = (ends - starts).max()
                else:
                    consecutive_scans = 0
                if consecutive_scans < self.parameters.lc_ms.consecutive_scan_min:
                    self.mass_features.pop(idx)
                    continue
                # Add start and final scan to mass_features and EICData
                left_scan, right_scan = (
                    myEIC.scans[l_a_r_scan_idx[0][0]],
                    myEIC.scans[l_a_r_scan_idx[0][2]],
                )
                mf_scan_apex = [(left_scan, int(apex_scan), right_scan)]
                myEIC.apexes = myEIC.apexes + mf_scan_apex
                self.mass_features[idx].start_scan = left_scan
                self.mass_features[idx].final_scan = right_scan

                # Find area under peak using limits from EIC centroid detector, add to mass_features and EICData
                area = np.trapz(
                    myEIC.eic_smoothed[l_a_r_scan_idx[0][0] : l_a_r_scan_idx[0][2] + 1],
                    myEIC.time[l_a_r_scan_idx[0][0] : l_a_r_scan_idx[0][2] + 1],
                )
                mf_df.at[idx, "area"] = area
                myEIC.areas = myEIC.areas + [area]
                self.eics[mz] = myEIC
                self.mass_features[idx]._area = area
            else:
                if drop_if_fail is True:
                    self.mass_features.pop(idx)

        if drop_duplicates:
            # Prepare mass feature dataframe
            mf_df = self.mass_features_to_df()

            # For each mass feature, find all mass features within the clustering tolerance ppm and drop if their start and end times are within another mass feature
            # Keep the first mass feature (highest persistence)
            for idx, mass_feature in mf_df.iterrows():
                mz = mass_feature.mz
                apex_scan = mass_feature.apex_scan

                mf_df["mz_diff_ppm"] = np.abs(mf_df["mz"] - mz) / mz * 10**6
                mf_df_sub = mf_df[
                    mf_df["mz_diff_ppm"]
                    < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel
                    * 10**6
                ].copy()

                # For all mass features within the clustering tolerance, check if the start and end times are within the start and end times of the mass feature
                for idx2, mass_feature2 in mf_df_sub.iterrows():
                    if idx2 != idx:
                        if (
                            mass_feature2.start_scan >= mass_feature.start_scan
                            and mass_feature2.final_scan <= mass_feature.final_scan
                        ):
                            if idx2 in self.mass_features.keys():
                                self.mass_features.pop(idx2)

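    # Worked example of the EIC extraction window above (illustrative numbers
    # only): with eic_tolerance_ppm = 5 and a feature at m/z 500.0, the
    # half-window is 5 * 500.0 / 1e6 = 0.0025, so the EIC sums intensities
    # between m/z 499.9975 and 500.0025 in every MS1 scan.
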
    def find_c13_mass_features(self):
        """Mark likely C13 isotopes and connect to monoisotopic mass features.

        Returns
        -------
        None, but populates the monoisotopic_mf_id and isotopologue_type attributes of the individual LCMSMassFeatures within the mass_features attribute of the LCMSBase object.

        Raises
        ------
        ValueError
            If no mass features are found.
        """
        verbose = self.parameters.lc_ms.verbose_processing
        if verbose:
            print("evaluating mass features for C13 isotopes")
        if self.mass_features is None:
            raise ValueError("No mass features found, run find_mass_features() first")

        # Data prep for sparse distance matrix
        dims = ["mz", "scan_time"]
        mf_df = self.mass_features_to_df().copy()
        # Drop mass features that have no area (these are likely to be noise)
        mf_df = mf_df[mf_df["area"].notnull()]
        mf_df["mf_id"] = mf_df.index.values

        # Sort by ascending mz so we always get the monoisotopic mass first, regardless of the order/intensity of the mass features
        mf_df = mf_df.sort_values(by=["mz"]).reset_index(drop=True).copy()

        mz_diff = 1.003355  # C13-C12 mass difference
        tol = [
            mf_df["mz"].median()
            * self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
            self.parameters.lc_ms.mass_feature_cluster_rt_tolerance * 0.5,
        ]  # mz, in relative; scan_time in minutes

        # Compute inter-feature distances
        distances = None
        for i in range(len(dims)):
            # Construct k-d tree
            values = mf_df[dims[i]].values
            tree = KDTree(values.reshape(-1, 1))

            max_tol = tol[i]
            if dims[i] == "mz":
                # Maximum absolute tolerance
                max_tol = mz_diff + tol[i]

            # Compute sparse distance matrix
            # the larger the max_tol, the slower this operation is
            sdm = tree.sparse_distance_matrix(tree, max_tol, output_type="coo_matrix")

            # Only consider forward case, exclude diagonal
            sdm = sparse.triu(sdm, k=1)

            if dims[i] == "mz":
                min_tol = mz_diff - tol[i]
                # Get only the ones that are above the min tol
                idx = sdm.data > min_tol

                # Reconstruct sparse distance matrix
                sdm = sparse.coo_matrix(
                    (sdm.data[idx], (sdm.row[idx], sdm.col[idx])),
                    shape=(len(values), len(values)),
                )

            # Cast as binary matrix
            sdm.data = np.ones_like(sdm.data)

            # Stack distances
            if distances is None:
                distances = sdm
            else:
                distances = distances.multiply(sdm)

        # Extract indices of within-tolerance points
        distances = distances.tocoo()
        pairs = np.stack((distances.row, distances.col), axis=1)  # C12 to C13 pairs

        # Turn pairs (which are index of mf_df) into mf_id and then into two dataframes to join to mf_df
        pairs_mf = pairs.copy()
        pairs_mf[:, 0] = mf_df.iloc[pairs[:, 0]].mf_id.values
        pairs_mf[:, 1] = mf_df.iloc[pairs[:, 1]].mf_id.values

        # Connect monoisotopic masses with isotopologues within mass_features
        monos = np.setdiff1d(np.unique(pairs_mf[:, 0]), np.unique(pairs_mf[:, 1]))
        for mono in monos:
            self.mass_features[mono].monoisotopic_mf_id = mono
        pairs_iso_df = pd.DataFrame(pairs_mf, columns=["parent", "child"])
        while not pairs_iso_df.empty:
            pairs_iso_df = pairs_iso_df.set_index("parent", drop=False)
            m1_isos = pairs_iso_df.loc[monos, "child"].unique()
            for iso in m1_isos:
                # Set monoisotopic_mf_id and isotopologue_type for isotopologues
                parent = pairs_mf[pairs_mf[:, 1] == iso, 0]
                if len(parent) > 1:
                    # Choose the parent that is closest in time to the isotopologue
                    parent_time = [self.mass_features[p].retention_time for p in parent]
                    time_diff = [
                        np.abs(self.mass_features[iso].retention_time - x)
                        for x in parent_time
                    ]
                    parent = parent[np.argmin(time_diff)]
                else:
                    parent = parent[0]
                self.mass_features[iso].monoisotopic_mf_id = self.mass_features[
                    parent
                ].monoisotopic_mf_id
                if self.mass_features[iso].monoisotopic_mf_id is not None:
                    mass_diff = (
                        self.mass_features[iso].mz
                        - self.mass_features[
                            self.mass_features[iso].monoisotopic_mf_id
                        ].mz
                    )
                    self.mass_features[iso].isotopologue_type = "13C" + str(
                        int(round(mass_diff, 0))
                    )

            # Drop the mono and iso from the pairs_iso_df
            pairs_iso_df = pairs_iso_df.drop(
                index=monos, errors="ignore"
            )  # Drop pairs where the parent is a child that is a child of a root
            pairs_iso_df = pairs_iso_df.set_index("child", drop=False)
            pairs_iso_df = pairs_iso_df.drop(index=m1_isos, errors="ignore")

            if not pairs_iso_df.empty:
                # Get new monos, recognizing that these are just 13C isotopologues that are connected to other 13C isotopologues, to repeat the process
                monos = np.setdiff1d(
                    np.unique(pairs_iso_df.parent), np.unique(pairs_iso_df.child)
                )
        if verbose:
            # Report fraction of compounds annotated with isotopes
            mf_df["c13_flag"] = np.where(
                np.logical_or(
                    np.isin(mf_df["mf_id"], pairs_mf[:, 0]),
                    np.isin(mf_df["mf_id"], pairs_mf[:, 1]),
                ),
                1,
                0,
            )
            print(
                str(round(len(mf_df[mf_df["c13_flag"] == 1]) / len(mf_df), ndigits=3))
                + " of mass features have or are C13 isotopes"
            )

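    # Worked example of the C13 pairing criterion above (illustrative numbers
    # only): two features qualify as a C12/C13 pair when their m/z difference
    # falls within 1.003355 +/- the relative m/z tolerance and their scan times
    # agree within half the RT cluster tolerance, e.g. m/z 500.0000 and
    # 501.0034 (difference 1.0034) with a tolerance of roughly 0.0025 Da.
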
    def deconvolute_ms1_mass_features(self):
        """Deconvolute MS1 mass features.

        Deconvolute mass features' MS1 spectra based on the correlation of all masses within a spectrum over the EIC of the mass features.

        Parameters
        ----------
        None

        Returns
        -------
        None, but assigns the _ms_deconvoluted_idx, mass_spectrum_deconvoluted_parent,
        and associated_mass_features_deconvoluted attributes to the mass features in the
        mass_features attribute of the LCMSBase object.

        Raises
        ------
        ValueError
            If no mass features are found, must run find_mass_features() first.
            If no EICs are found, did you run integrate_mass_features() first?

        """
        # Checks for set mass_features and eics
        if self.mass_features is None:
            raise ValueError(
                "No mass features found, did you run find_mass_features() first?"
            )

        if self.eics == {}:
            raise ValueError(
                "No EICs found, did you run integrate_mass_features() first?"
            )

        if 1 not in self._ms_unprocessed.keys():
            raise ValueError("No unprocessed MS1 spectra found.")

        # Prep ms1 data
        ms1_data = self._ms_unprocessed[1].copy()
        ms1_data = ms1_data.set_index("scan")

        # Prep mass feature summary
        mass_feature_df = self.mass_features_to_df()

        # Loop through each mass feature
        for mf_id, mass_feature in self.mass_features.items():
            # Check that the mass_feature.mz attribute == the mz of the mass feature in the mass_feature_df
            if mass_feature.mz != mass_feature.ms1_peak.mz_exp:
                continue

            # Get the left and right limits of the EIC of the mass feature
            l_scan, _, r_scan = mass_feature._eic_data.apexes[0]

            # Pull from the _ms1_unprocessed data the scan range of interest and sort by mz
            ms1_data_sub = ms1_data.loc[l_scan:r_scan].copy()
            ms1_data_sub = ms1_data_sub.sort_values(by=["mz"]).reset_index(drop=False)

            # Get the centroided masses of the mass feature
            mf_mspeak_mzs = mass_feature.mass_spectrum.mz_exp

            # Find the closest mz in the ms1 data to the centroided masses of the mass feature
            ms1_data_sub["mass_feature_mz"] = mf_mspeak_mzs[
                find_closest(mf_mspeak_mzs, ms1_data_sub.mz.values)
            ]

            # Drop rows where the relative mz difference between the mass feature mz and the ms1 data mz exceeds the cluster tolerance
            ms1_data_sub["mz_diff_rel"] = (
                np.abs(ms1_data_sub["mass_feature_mz"] - ms1_data_sub["mz"])
                / ms1_data_sub["mz"]
            )
            ms1_data_sub = ms1_data_sub[
                ms1_data_sub["mz_diff_rel"]
                < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel
            ].reset_index(drop=True)

            # Group by mass_feature_mz and scan and sum intensity
            ms1_data_sub_group = (
                ms1_data_sub.groupby(["mass_feature_mz", "scan"])["intensity"]
                .sum()
                .reset_index()
            )

            # Calculate the correlation of the intensities of the mass feature and the ms1 data (set to 0 if no intensity)
            corr = (
                ms1_data_sub_group.pivot(
                    index="scan", columns="mass_feature_mz", values="intensity"
                )
                .fillna(0)
                .corr()
            )

            # Subset the correlation matrix to only include the masses of the mass feature and those above the minimum deconvolution correlation
            decon_corr_min = self.parameters.lc_ms.ms1_deconvolution_corr_min

            # Try catch for KeyError in case the mass feature mz is not in the correlation matrix
            try:
                corr_subset = corr.loc[mass_feature.mz,]
            except KeyError:
                # If the mass feature mz is not in the correlation matrix, skip to the next mass feature
                continue

            corr_subset = corr_subset[corr_subset > decon_corr_min]

            # Get the masses from the mass spectrum that are the result of the deconvolution
            mzs_decon = corr_subset.index.values

            # Get the indices of the mzs_decon in mass_feature.mass_spectrum.mz_exp and assign to the mass feature
            mzs_decon_idx = [
                id
                for id, mz in enumerate(mass_feature.mass_spectrum.mz_exp)
                if mz in mzs_decon
            ]
            mass_feature._ms_deconvoluted_idx = mzs_decon_idx

            # Check if the mass feature's ms1 peak is the largest in the deconvoluted mass spectrum
            if (
                mass_feature.ms1_peak.abundance
                == mass_feature.mass_spectrum.abundance[mzs_decon_idx].max()
            ):
                mass_feature.mass_spectrum_deconvoluted_parent = True
            else:
                mass_feature.mass_spectrum_deconvoluted_parent = False

            # Check for other mass features that are in the deconvoluted mass spectrum and add the deconvoluted mass spectrum to the mass feature
            # Subset mass_feature_df to only include mass features that are within the clustering tolerance
            mass_feature_df_sub = mass_feature_df[
                abs(mass_feature.retention_time - mass_feature_df["scan_time"])
                < self.parameters.lc_ms.mass_feature_cluster_rt_tolerance
            ].copy()
            # Calculate the mz difference in ppm between the mass feature and the peaks in the deconvoluted mass spectrum
            mass_feature_df_sub["mz_diff_ppm"] = [
                np.abs(mzs_decon - mz).min() / mz * 10**6
                for mz in mass_feature_df_sub["mz"]
            ]
            # Subset mass_feature_df to only include mass features that are within the ppm tolerance of the deconvoluted masses
            mfs_associated_decon = mass_feature_df_sub[
                mass_feature_df_sub["mz_diff_ppm"]
                < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel * 10**6
            ].index.values

            mass_feature.associated_mass_features_deconvoluted = mfs_associated_decon

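# Hedged sketch of the deconvolution correlation step above (illustrative only,
# with made-up numbers): intensities are pivoted to a scan x m/z table and the
# Pearson correlation of each m/z trace against the feature's own m/z decides
# which masses co-elute with it.
def _example_deconvolution_correlation():  # pragma: no cover - illustrative only
    toy = pd.DataFrame(
        {
            "scan": [1, 1, 2, 2, 3, 3],
            "mass_feature_mz": [500.0, 250.5, 500.0, 250.5, 500.0, 250.5],
            "intensity": [10.0, 5.0, 100.0, 50.0, 20.0, 10.0],
        }
    )
    corr = (
        toy.pivot(index="scan", columns="mass_feature_mz", values="intensity")
        .fillna(0)
        .corr()
    )
    # Perfectly co-varying traces correlate at 1.0 and would be grouped together.
    return corr.loc[500.0, 250.5]
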
class PHCalculations:
    """Methods for performing calculations related to 2D peak picking via persistent homology on LCMS data.

    Notes
    -----
    This class is intended to be used as a mixin for the LCMSBase class.

    Methods
    -------
    * sparse_mean_filter(idx, V, radius=[0, 1, 1]).
        Sparse implementation of a mean filter.
    * embed_unique_indices(a).
        Creates an array of indices, sorted by unique element.
    * sparse_upper_star(idx, V).
        Sparse implementation of an upper star filtration.
    * check_if_grid(data).
        Check if the data is gridded in mz space.
    * grid_data(data).
        Grid the data in the mz dimension.
    * find_mass_features_ph(ms_level=1, grid=True).
        Find mass features within an LCMSBase object using persistent homology.
    * cluster_mass_features(drop_children=True).
        Cluster regions of interest.
    """

    @staticmethod
    def sparse_mean_filter(idx, V, radius=[0, 1, 1]):
        """Sparse implementation of a mean filter.

        Parameters
        ----------
        idx : :obj:`~numpy.array`
            Edge indices for each dimension (MxN).
        V : :obj:`~numpy.array`
            Array of intensity data (Mx1).
        radius : float or list
            Radius of the sparse filter in each dimension. Values less than
            zero indicate no connectivity in that dimension.

        Returns
        -------
        :obj:`~numpy.array`
            Filtered intensities (Mx1).

        Notes
        -----
        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos.
        This is a static method.
        """

        # Copy indices
        idx = idx.copy().astype(V.dtype)

        # Scale
        for i, r in enumerate(radius):
            # Increase inter-index distance
            if r < 1:
                idx[:, i] *= 2

            # Do nothing
            elif r == 1:
                pass

            # Decrease inter-index distance
            else:
                idx[:, i] /= r

        # Connectivity matrix
        cmat = KDTree(idx)
        cmat = cmat.sparse_distance_matrix(cmat, 1, p=np.inf, output_type="coo_matrix")
        cmat.setdiag(1)

        # Pair indices
        I, J = cmat.nonzero()

        # Delete cmat
        cmat_shape = cmat.shape
        del cmat

        # Sum over columns
        V_sum = sparse.bsr_matrix(
            (V[J], (I, I)), shape=cmat_shape, dtype=V.dtype
        ).diagonal(0)

        # Count over columns
        V_count = sparse.bsr_matrix(
            (np.ones_like(J), (I, I)), shape=cmat_shape, dtype=V.dtype
        ).diagonal(0)

        return V_sum / V_count

    @staticmethod
    def embed_unique_indices(a):
        """Creates an array of indices, sorted by unique element.

        Parameters
        ----------
        a : :obj:`~numpy.array`
            Array of unique elements (Mx1).

        Returns
        -------
        :obj:`~numpy.array`
            Array of indices (Mx1).

        Notes
        -----
        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos
        This is a static method.
        """

        def count_tens(n):
            # Count tens
            ntens = (n - 1) // 10

            while True:
                ntens_test = (ntens + n - 1) // 10

                if ntens_test == ntens:
                    return ntens
                else:
                    ntens = ntens_test

        def arange_exclude_10s(n):
            # How many 10s will there be?
            ntens = count_tens(n)

            # Base array
            arr = np.arange(0, n + ntens)

            # Exclude 10s
            arr = arr[(arr == 0) | (arr % 10 != 0)][:n]

            return arr

        # Creates an array of indices, sorted by unique element
        idx_sort = np.argsort(a)
        idx_unsort = np.argsort(idx_sort)

        # Sorts records array so all unique elements are together
        sorted_a = a[idx_sort]

        # Returns the unique values, the index of the first occurrence,
        # and the count for each element
        vals, idx_start, count = np.unique(
            sorted_a, return_index=True, return_counts=True
        )

        # Splits the indices into separate arrays
        splits = np.split(idx_sort, idx_start[1:])

        # Creates unique indices for each split
        idx_unq = np.concatenate([arange_exclude_10s(len(x)) for x in splits])

        # Reorders according to input array
        idx_unq = idx_unq[idx_unsort]

        # Magnitude of each index
        exp = np.log10(
            idx_unq, where=idx_unq > 0, out=np.zeros_like(idx_unq, dtype=np.float64)
        )
        idx_unq_mag = np.power(10, np.floor(exp) + 1)

        # Result
        return a + idx_unq / idx_unq_mag

    @staticmethod
    def roll_up_dataframe(
        df: pd.DataFrame,
        sort_by: str,
        tol: list,
        relative: list,
        dims: list,
        memory_opt_threshold: int = 10000,
    ):
        """Subset data by rolling up into apex in appropriate dimensions.

        Parameters
        ----------
        df : pd.DataFrame
            The input data containing "dims" columns and the "sort_by" column.
        sort_by : str
            The column to sort the data by, this will determine which mass features get rolled up into a parent mass feature
            (i.e., the mass feature with the highest value in the sort_by column).
        dims : list
            A list of dimension names (column names in the data DataFrame) to roll up the mass features by.
        tol : list
            A list of tolerances for each dimension. The length of the list must match the number of dimensions.
            The tolerances can be relative (as a fraction of the maximum value in that dimension) or absolute (in the units of that dimension).
            If relative is True, the tolerance will be multiplied by the maximum value in that dimension.
            If relative is False, the tolerance will be used as is.
        relative : list
            A list of booleans indicating whether the tolerance for each dimension is relative (True) or absolute (False).
        memory_opt_threshold : int, optional
            Minimum number of rows to trigger memory-optimized processing. Default is 10000.

        Returns
        -------
        pd.DataFrame
            A DataFrame with only the rolled up mass features, with the original index and columns.

        Raises
        ------
        ValueError
            If the input data is not a pandas DataFrame.
            If the input data does not have columns for each of the dimensions in "dims".
            If the length of "dims", "tol", and "relative" do not match.
        """
        og_columns = df.columns.copy()

        # Unindex the data, but keep the original index
        if df.index.name is not None:
            og_index = df.index.name
        else:
            og_index = "index"
        df = df.reset_index(drop=False)

        # Sort data by sort_by column, and reindex
        df = df.sort_values(by=sort_by, ascending=False).reset_index(drop=True)

        # Check that data is a DataFrame and has columns for each of the dims
        if not isinstance(df, pd.DataFrame):
            raise ValueError("Data must be a pandas DataFrame")
        for dim in dims:
            if dim not in df.columns:
                raise ValueError(f"Data must have a column for {dim}")
        if len(dims) != len(tol) or len(dims) != len(relative):
            raise ValueError(
                "Dimensions, tolerances, and relative flags must be the same length"
            )

        # Pre-compute all values arrays
        all_values = [df[dim].values for dim in dims]

        # Choose processing method based on dataframe size
        if len(df) >= memory_opt_threshold:
            # Memory-optimized approach for large dataframes
            distances = PHCalculations._compute_distances_memory_optimized(
                all_values, tol, relative
            )
        else:
            # Faster approach for smaller dataframes
            distances = PHCalculations._compute_distances_original(
                all_values, tol, relative
            )

        # Process pairs with original logic but memory optimizations
        distances = distances.tocoo()
        pairs = np.stack((distances.row, distances.col), axis=1)
        pairs_df = pd.DataFrame(pairs, columns=["parent", "child"]).set_index("parent")
        del distances, pairs  # Free memory immediately

        to_drop = []
        while not pairs_df.empty:
            # Find root_parents and their children (original logic preserved)
            root_parents = np.setdiff1d(
                np.unique(pairs_df.index.values), np.unique(pairs_df.child.values)
            )
            children_of_roots = pairs_df.loc[root_parents, "child"].unique()
            to_drop.extend(children_of_roots)  # Use extend instead of append

            # Remove root_children as possible parents from pairs_df for next iteration
            pairs_df = pairs_df.drop(index=children_of_roots, errors="ignore")
            pairs_df = pairs_df.reset_index().set_index("child")
            # Remove root_children as possible children from pairs_df for next iteration
            pairs_df = pairs_df.drop(index=children_of_roots)

            # Prepare for next iteration
            pairs_df = pairs_df.reset_index().set_index("parent")

        # Convert to numpy array for efficient dropping
        to_drop = np.array(to_drop)

        # Drop mass features that are not cluster parents
        df_sub = df.drop(index=to_drop)

        # Set index back to og_index and only keep original columns
        df_sub = df_sub.set_index(og_index).sort_index()[og_columns]

        return df_sub

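    # Hedged usage sketch for roll_up_dataframe (illustrative only, toy values):
    #
    #     features = pd.DataFrame(
    #         {"mz": [500.000, 500.001, 600.000],
    #          "scan_time": [10.0, 10.05, 20.0],
    #          "persistence": [1000.0, 10.0, 500.0]}
    #     )
    #     PHCalculations.roll_up_dataframe(
    #         df=features, sort_by="persistence",
    #         dims=["mz", "scan_time"], tol=[5e-6, 0.2], relative=[True, False],
    #     )
    #     # -> keeps rows 0 and 2; row 1 rolls up into the more persistent row 0.
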
    @staticmethod
    def _compute_distances_original(all_values, tol, relative):
        """Original distance computation method for smaller datasets.

        This method computes the pairwise distances between features in the dataset
        using a straightforward approach. It is suitable for smaller datasets where
        memory usage is not a primary concern.

        Parameters
        ----------
        all_values : list of :obj:`~numpy.array`
            List of arrays containing the values for each dimension.
        tol : list of float
            List of tolerances for each dimension.
        relative : list of bool
            List of booleans indicating whether the tolerance for each dimension is relative (True) or absolute (False).

        Returns
        -------
        :obj:`~scipy.sparse.coo_matrix`
            Sparse matrix indicating pairwise distances within tolerances.
        """
        # Compute inter-feature distances with memory optimization
        distances = None
        for i in range(len(all_values)):
            values = all_values[i]
            # Use single precision if possible to reduce memory
            tree = KDTree(values.reshape(-1, 1).astype(np.float32))

            max_tol = tol[i]
            if relative[i] is True:
                max_tol = tol[i] * values.max()

            # Compute sparse distance matrix with smaller chunks if memory is an issue
            sdm = tree.sparse_distance_matrix(tree, max_tol, output_type="coo_matrix")

            # Only consider forward case, exclude diagonal
            sdm = sparse.triu(sdm, k=1)

            # Process relative distances more efficiently
            if relative[i] is True:
                # Vectorized computation without creating intermediate arrays
                row_values = values[sdm.row]
                valid_idx = sdm.data <= tol[i] * row_values

                # Reconstruct sparse matrix more efficiently
                sdm = sparse.coo_matrix(
                    (
                        np.ones(valid_idx.sum(), dtype=np.uint8),
                        (sdm.row[valid_idx], sdm.col[valid_idx]),
                    ),
                    shape=(len(values), len(values)),
                )
            else:
                # Cast as binary matrix with smaller data type
                sdm.data = np.ones(len(sdm.data), dtype=np.uint8)

            # Stack distances with memory-efficient multiplication
            if distances is None:
                distances = sdm
            else:
                # Use in-place operations where possible
                distances = distances.multiply(sdm)
                del sdm  # Free memory immediately

        return distances

    @staticmethod
    def _compute_distances_memory_optimized(all_values, tol, relative):
        """Memory-optimized distance computation for large datasets.

        This method computes the pairwise distances between features in the dataset
        using a more memory-efficient approach. It is suitable for larger datasets
        where memory usage is a primary concern.

        Parameters
        ----------
        all_values : list of :obj:`~numpy.array`
            List of arrays containing the values for each dimension.
        tol : list of float
            List of tolerances for each dimension.
        relative : list of bool
            List of booleans indicating whether the tolerance for each dimension is relative (True) or absolute (False).

        Returns
        -------
        :obj:`~scipy.sparse.coo_matrix`
            Sparse matrix indicating pairwise distances within tolerances.
        """
        # Compute distance matrix for first dimension (full matrix as before)
        values_0 = all_values[0].astype(np.float32)
        tree_0 = KDTree(values_0.reshape(-1, 1))

        max_tol_0 = tol[0]
        if relative[0] is True:
            max_tol_0 = tol[0] * values_0.max()

        # Compute sparse distance matrix for first dimension
        distances = tree_0.sparse_distance_matrix(
            tree_0, max_tol_0, output_type="coo_matrix"
        )
        distances = sparse.triu(distances, k=1)

        # Process relative distances for first dimension
        if relative[0] is True:
            row_values = values_0[distances.row]
            valid_idx = distances.data <= tol[0] * row_values
            distances = sparse.coo_matrix(
                (
                    np.ones(valid_idx.sum(), dtype=np.uint8),
                    (distances.row[valid_idx], distances.col[valid_idx]),
                ),
                shape=(len(values_0), len(values_0)),
            )
        else:
            distances.data = np.ones(len(distances.data), dtype=np.uint8)

        # For remaining dimensions, work only on chunks defined by first dimension pairs
        if len(all_values) > 1:
            distances_coo = distances.tocoo()
            valid_pairs = []

            # Process each pair from first dimension
            for idx in range(len(distances_coo.data)):
                i, j = distances_coo.row[idx], distances_coo.col[idx]
                is_valid_pair = True

                # Check remaining dimensions for this specific pair
                for dim_idx in range(1, len(all_values)):
                    values = all_values[dim_idx]
                    val_i, val_j = values[i], values[j]

                    max_tol = tol[dim_idx]
                    if relative[dim_idx] is True:
                        max_tol = tol[dim_idx] * values.max()

                    distance_ij = abs(val_i - val_j)

                    # Check if this pair satisfies the tolerance for this dimension
                    if relative[dim_idx] is True:
                        if distance_ij > tol[dim_idx] * val_i:
                            is_valid_pair = False
                            break
                    else:
                        if distance_ij > max_tol:
                            is_valid_pair = False
                            break

                if is_valid_pair:
                    valid_pairs.append((i, j))

            # Rebuild distances matrix with only valid pairs
            if valid_pairs:
                valid_pairs = np.array(valid_pairs)
                distances = sparse.coo_matrix(
                    (
                        np.ones(len(valid_pairs), dtype=np.uint8),
                        (valid_pairs[:, 0], valid_pairs[:, 1]),
                    ),
                    shape=(len(values_0), len(values_0)),
                )
            else:
                # No valid pairs found
                distances = sparse.coo_matrix(
                    (len(values_0), len(values_0)), dtype=np.uint8
                )

        return distances

    def sparse_upper_star(self, idx, V):
        """Sparse implementation of an upper star filtration.

        Parameters
        ----------
        idx : :obj:`~numpy.array`
            Edge indices for each dimension (MxN).
        V : :obj:`~numpy.array`
            Array of intensity data (Mx1).

        Returns
        -------
        idx : :obj:`~numpy.array`
            Index of filtered points (Mx1).
        persistence : :obj:`~numpy.array`
            Persistence of each filtered point (Mx1).

        Notes
        -----
        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos
        """

        # Invert
        V = -1 * V.copy().astype(int)

        # Embed indices
        V = self.embed_unique_indices(V)

        # Connectivity matrix
        cmat = KDTree(idx)
        cmat = cmat.sparse_distance_matrix(cmat, 1, p=np.inf, output_type="coo_matrix")
        cmat.setdiag(1)
        cmat = sparse.triu(cmat)

        # Pairwise minimums
        I, J = cmat.nonzero()
        d = np.maximum(V[I], V[J])

        # Delete connectivity matrix
        cmat_shape = cmat.shape
        del cmat

        # Sparse distance matrix
        sdm = sparse.coo_matrix((d, (I, J)), shape=cmat_shape)

        # Delete pairwise mins
        del d, I, J

        # Persistent homology
        ph = ripser(sdm, distance_matrix=True, maxdim=0)["dgms"][0]

        # Bound death values
        ph[ph[:, 1] == np.inf, 1] = np.max(V)

        # Construct tree to query against
        tree = KDTree(V.reshape((-1, 1)))

        # Get the indexes of the first nearest neighbor by birth
        _, nn = tree.query(ph[:, 0].reshape((-1, 1)), k=1, workers=-1)

        return nn, -(ph[:, 0] // 1 - ph[:, 1] // 1)

    def check_if_grid(self, data):
        """Check if the data are gridded in mz space.

        Parameters
        ----------
        data : DataFrame
            DataFrame containing the mass spectrometry data. Needs to have mz and scan columns.

        Returns
        -------
        bool
            True if the data is gridded in the mz direction, False otherwise.

        Notes
        -----
        This function is used within the grid_data function and the find_mass_features function and is not intended to be called directly.
        """
        # Calculate the difference between consecutive mz values in a single scan
        dat_check = data.copy().reset_index(drop=True)
        dat_check["mz_diff"] = np.abs(dat_check["mz"].diff())
        mz_diff_min = (
            dat_check.groupby("scan")["mz_diff"].min().min()
        )  # within each scan, what is the smallest mz difference between consecutive mz values

        # Find the minimum mz difference between mz values in the data, regardless of scan
        dat_check_mz = dat_check[["mz"]].drop_duplicates().copy()
        dat_check_mz = dat_check_mz.sort_values(by=["mz"]).reset_index(drop=True)
        dat_check_mz["mz_diff"] = np.abs(dat_check_mz["mz"].diff())

        # Get minimum mz_diff between mz values in the data
        mz_diff_min_raw = dat_check_mz["mz_diff"].min()

        # If the minimum mz difference between mz values in the data is less than the minimum mz difference between mz values within a single scan, then the data is not gridded
        if mz_diff_min_raw < mz_diff_min:
            return False
        else:
            return True

    def grid_data(self, data, attempts=5):
        """Grid the data in the mz dimension.

        Data must be gridded prior to persistent homology calculations and computing the average mass spectrum.

        Parameters
        ----------
        data : DataFrame
            The input data containing mz, scan, scan_time, and intensity columns.
        attempts : int, optional
            The number of attempts to grid the data. Default is 5.

        Returns
        -------
        DataFrame
            The gridded data with mz, scan, scan_time, and intensity columns.

        Raises
        ------
        ValueError
            If gridding fails after the specified number of attempts.
        """
        attempt_i = 0
        while attempt_i < attempts:
            attempt_i += 1
            data = self._grid_data(data)

            if self.check_if_grid(data):
                return data

        if not self.check_if_grid(data):
            raise ValueError(
                "Gridding failed after "
                + str(attempt_i)
                + " attempts. Please check the data."
            )
        else:
            return data

    def _grid_data(self, data):
        """Internal method to grid the data in the mz dimension.

        Notes
        -----
        This method is called by the grid_data method and should not be called directly.
        It will attempt to grid the data in the mz dimension by creating a grid of mz values based on the minimum mz difference within each scan,
        but it does not check if the data is already gridded or if the gridding is successful.

        Parameters
        ----------
        data : pd.DataFrame or pl.DataFrame
            The input data to grid.

        Returns
        -------
        pd.DataFrame or pl.DataFrame
            The data after attempting to grid it in the mz dimension.
        """
        # Calculate the difference between consecutive mz values in a single scan for grid spacing
        data_w = data.copy().reset_index(drop=True)
        data_w["mz_diff"] = np.abs(data_w["mz"].diff())
        mz_diff_min = data_w.groupby("scan")["mz_diff"].min().min() * 0.99999

        # Need high intensity mz values first so they are parents in the output pairs stack
        dat_mz = data_w[["mz", "intensity"]].sort_values(
            by=["intensity"], ascending=False
        )
        dat_mz = dat_mz[["mz"]].drop_duplicates().reset_index(drop=True).copy()

        # Construct KD tree
        tree = KDTree(dat_mz.mz.values.reshape(-1, 1))
        sdm = tree.sparse_distance_matrix(tree, mz_diff_min, output_type="coo_matrix")
        sdm = sparse.triu(sdm, k=1)
        sdm.data = np.ones_like(sdm.data)
        distances = sdm.tocoo()
        pairs = np.stack((distances.row, distances.col), axis=1)

        # Cull pairs to just get root
        to_drop = []
        while len(pairs) > 0:
            root_parents = np.setdiff1d(np.unique(pairs[:, 0]), np.unique(pairs[:, 1]))
            id_root_parents = np.isin(pairs[:, 0], root_parents)
            children_of_roots = np.unique(pairs[id_root_parents, 1])
            to_drop = np.append(to_drop, children_of_roots)

            # Set up pairs array for next iteration by removing pairs with children or parents already dropped
            pairs = pairs[~np.isin(pairs[:, 1], to_drop), :]
            pairs = pairs[~np.isin(pairs[:, 0], to_drop), :]
        dat_mz = dat_mz.reset_index(drop=True).drop(index=np.array(to_drop))
        mz_dat_np = (
            dat_mz[["mz"]]
            .sort_values(by=["mz"])
            .reset_index(drop=True)
            .values.flatten()
        )

        # Sort data by mz and recast mz to nearest value in mz_dat_np
        data_w = data_w.sort_values(by=["mz"]).reset_index(drop=True).copy()
        data_w["mz_new"] = mz_dat_np[find_closest(mz_dat_np, data_w["mz"].values)]
        data_w["mz_diff"] = np.abs(data_w["mz"] - data_w["mz_new"])

        # Rename mz_new as mz; drop mz_diff; groupby scan and mz and sum intensity
        new_data_w = data_w.rename(columns={"mz": "mz_orig", "mz_new": "mz"}).copy()
        new_data_w = (
            new_data_w.drop(columns=["mz_diff", "mz_orig"])
            .groupby(["scan", "mz"])["intensity"]
            .sum()
            .reset_index()
        )
        new_data_w = (
            new_data_w.sort_values(by=["scan", "mz"], ascending=[True, True])
            .reset_index(drop=True)
            .copy()
        )

        return new_data_w

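    # Hedged gridding sketch (illustrative only; "lcms_obj" and "ms1_df" are
    # assumed names, not defined in this module): profile data with slightly
    # drifting m/z axes is snapped onto a common grid before persistent
    # homology, e.g.
    #
    #     if not lcms_obj.check_if_grid(ms1_df):
    #         ms1_df = lcms_obj.grid_data(ms1_df)
    #
    # which recasts each m/z onto the nearest retained grid value and sums
    # intensities per (scan, mz).
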
    def find_mass_features_ph(self, ms_level=1, grid=True):
        """Find mass features within an LCMSBase object using persistent homology.

        Assigns the mass_features attribute to the object (a dictionary of LCMSMassFeature objects, keyed by mass feature id)

        Parameters
        ----------
        ms_level : int, optional
            The MS level to use. Default is 1.
        grid : bool, optional
            If True, will regrid the data before running the persistent homology calculations (after checking if the data is gridded). Default is True.

        Raises
        ------
        ValueError
            If no MS level data is found on the object.
            If data is not gridded and grid is False.

        Returns
        -------
        None, but assigns the mass_features attribute to the object.

        Notes
        -----
        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos
        """
        # Check that ms_level is a key in self._ms_unprocessed
        if ms_level not in self._ms_unprocessed.keys():
            raise ValueError(
                "No MS level "
                + str(ms_level)
                + " data found, did you instantiate with parser specific to MS level?"
            )

        # Get ms data
        data = self._ms_unprocessed[ms_level].copy()

        # Drop rows with missing intensity values and reset index
        data = data.dropna(subset=["intensity"]).reset_index(drop=True)

        # Threshold data
        dims = ["mz", "scan_time"]
        threshold = self.parameters.lc_ms.ph_inten_min_rel * data.intensity.max()
        persistence_threshold = (
            self.parameters.lc_ms.ph_persis_min_rel * data.intensity.max()
        )
        data_thres = data[data["intensity"] > threshold].reset_index(drop=True).copy()

        # Check if gridded, if not, grid
        gridded_mz = self.check_if_grid(data_thres)
        if gridded_mz is False:
            if grid is False:
                raise ValueError(
                    "Data are not gridded in mz dimension, try reprocessing with different parameters or grid the data before running this function"
                )
            else:
                data_thres = self.grid_data(data_thres)

        # Add scan_time
        data_thres = data_thres.merge(self.scan_df[["scan", "scan_time"]], on="scan")
        # Process in chunks if required
        if len(data_thres) > 10000:
            return self._find_mass_features_ph_partition(
                data_thres, dims, persistence_threshold
            )
        else:
            # Process all at once
            return self._find_mass_features_ph_single(
                data_thres, dims, persistence_threshold
            )

    def _find_mass_features_ph_single(self, data_thres, dims, persistence_threshold):
        """Process all data at once (original logic)."""
        # Build factors
        factors = {
            dim: pd.factorize(data_thres[dim], sort=True)[1].astype(np.float32)
            for dim in dims
        }

        # Build indexes
        index = {
            dim: np.searchsorted(factors[dim], data_thres[dim]).astype(np.float32)
            for dim in factors
        }

        # Smooth and process
        mass_features_df = self._process_partition_ph(
            data_thres, index, dims, persistence_threshold
        )

        # Roll up within chunk to remove duplicates
        mass_features_df = self.roll_up_dataframe(
            df=mass_features_df,
            sort_by="persistence",
            dims=["mz", "scan_time"],
            tol=[
                self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
                self.parameters.lc_ms.mass_feature_cluster_rt_tolerance,
            ],
            relative=[True, False],
        )
        mass_features_df = mass_features_df.reset_index(drop=True)

        # Populate mass_features attribute
        self._populate_mass_features(mass_features_df)

    def _find_mass_features_ph_partition(self, data_thres, dims, persistence_threshold):
        """Partition the persistent homology mass feature detection for large datasets.

        This method splits the input data into overlapping scan partitions, processes each partition to detect mass features
        using persistent homology, rolls up duplicates within and across partitions, and populates the mass_features attribute.

        Parameters
        ----------
        data_thres : pd.DataFrame
            The thresholded input data containing mass spectrometry information.
        dims : list
            List of dimension names (e.g., ["mz", "scan_time"]) used for feature detection.
        persistence_threshold : float
            Minimum persistence value required for a detected mass feature to be retained.

        Returns
        -------
        None
            Populates the mass_features attribute of the object with detected mass features.
        """
        all_mass_features = []

        # Split scans into partitions
        unique_scans = sorted(data_thres["scan"].unique())
        unique_scans_n = len(unique_scans)

        # Calculate partition size in scans based on goal
        partition_size_goal = 5000
        scans_per_partition = max(
            1, partition_size_goal // (len(data_thres) // unique_scans_n)
        )
        if scans_per_partition == 0:
            scans_per_partition = 1

        # Make partitions based on scans, with overlapping in partitioned scans
        scan_overlap = 4
        partition_scans = []
        for i in range(0, unique_scans_n, scans_per_partition):
            start_idx = max(0, i - scan_overlap)
            end_idx = min(
                unique_scans_n - 1, i + scans_per_partition - 1 + scan_overlap
            )
            scans_group = [int(s) for s in unique_scans[start_idx : end_idx + 1]]
            partition_scans.append(scans_group)

        # Set index to scan for faster filtering
        data_thres = data_thres.set_index("scan")
        for scans in partition_scans:
            # Pull the data for this partition, including the scan overlap
            partition_data = data_thres.loc[scans].reset_index(drop=False).copy()

            if len(partition_data) == 0:
                continue

            # Build factors for this partition
            factors = {
                dim: pd.factorize(partition_data[dim], sort=True)[1].astype(np.float32)
                for dim in dims
            }

            # Build indexes
            index = {
                dim: np.searchsorted(factors[dim], partition_data[dim]).astype(
                    np.float32
                )
                for dim in factors
            }

            # Process partition
            partition_features = self._process_partition_ph(
                partition_data, index, dims, persistence_threshold
            )

            if len(partition_features) == 0:
                continue

            # Roll up within partition
            partition_features = self.roll_up_dataframe(
                df=partition_features,
                sort_by="persistence",
                dims=["mz", "scan_time"],
                tol=[
                    self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
                    self.parameters.lc_ms.mass_feature_cluster_rt_tolerance,
                ],
                relative=[True, False],
            )
            partition_features = partition_features.reset_index(drop=True)

            if len(partition_features) > 0:
                all_mass_features.append(partition_features)

        # Combine results from all partitions
        if all_mass_features:
            combined_features = pd.concat(all_mass_features, ignore_index=True)

            # Sort by persistence
            combined_features = combined_features.sort_values(
                by="persistence", ascending=False
            ).reset_index(drop=True)

            # Remove duplicates from overlapping regions
            combined_features = self.roll_up_dataframe(
                df=combined_features,
                sort_by="persistence",
                dims=["mz", "scan_time"],
                tol=[
                    self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
                    self.parameters.lc_ms.mass_feature_cluster_rt_tolerance,
                ],
                relative=[True, False],
            )

            # Reset index after the final roll-up
            combined_features = combined_features.reset_index(drop=True)

            # Populate mass_features attribute
            self._populate_mass_features(combined_features)
        else:
            self.mass_features = {}

    def _process_partition_ph(self, partition_data, index, dims, persistence_threshold):
        """Process a single partition with persistent homology."""
        # Smooth data
        iterations = self.parameters.lc_ms.ph_smooth_it
        smooth_radius = [
            self.parameters.lc_ms.ph_smooth_radius_mz,
            self.parameters.lc_ms.ph_smooth_radius_scan,
        ]

        index_array = np.vstack([index[dim] for dim in dims]).T
        V = partition_data["intensity"].values
        resid = np.inf

        for i in range(iterations):
            # Previous iteration
            V_prev = V.copy()
            resid_prev = resid
            V = self.sparse_mean_filter(index_array, V, radius=smooth_radius)

            # Calculate residual with previous iteration
            resid = np.sqrt(np.mean(np.square(V - V_prev)))

            # Evaluate convergence
            if i > 0:
                # Percent change in residual
                test = np.abs(resid - resid_prev) / resid_prev

                # Exit criteria
                if test <= 0:
                    break

        # Overwrite values
        partition_data = partition_data.copy()
        partition_data["intensity"] = V

        # Use persistent homology to find regions of interest
        pidx, pers = self.sparse_upper_star(index_array, V)
        pidx = pidx[pers > 1]
        pers = pers[pers > 1]

        if len(pidx) == 0:
            return pd.DataFrame()

        # Get peaks
        peaks = partition_data.iloc[pidx, :].reset_index(drop=True)

        # Add persistence column
        peaks["persistence"] = pers
        mass_features = peaks.sort_values(
            by="persistence", ascending=False
        ).reset_index(drop=True)

        # Filter by persistence threshold
        mass_features = mass_features.loc[
            mass_features["persistence"] > persistence_threshold, :
        ].reset_index(drop=True)

        return mass_features

    def _populate_mass_features(self, mass_features_df):
        """Populate the mass_features attribute from a DataFrame.

        Parameters
        ----------
        mass_features_df : pd.DataFrame
            DataFrame containing mass feature information.
            Note that the order of this DataFrame will determine the order of mass features in the mass_features attribute.

        Returns
        -------
        None, but assigns the mass_features attribute to the object.
        """
        # Rename scan column to apex_scan
        mass_features_df = mass_features_df.rename(
            columns={"scan": "apex_scan", "scan_time": "retention_time"}
        )

        # Populate mass_features attribute
        self.mass_features = {}
        for row in mass_features_df.itertuples():
            row_dict = mass_features_df.iloc[row.Index].to_dict()
            lcms_feature = LCMSMassFeature(self, **row_dict)
            self.mass_features[lcms_feature.id] = lcms_feature

        if self.parameters.lc_ms.verbose_processing:
            print("Found " + str(len(mass_features_df)) + " initial mass features")

    def find_mass_features_ph_centroid(self, ms_level=1):
        """Find mass features within an LCMSBase object using a persistent homology-type approach but with centroided data.

        Parameters
        ----------
        ms_level : int, optional
            The MS level to use. Default is 1.

        Raises
        ------
        ValueError
            If no MS level data is found on the object.

        Returns
        -------
        None, but assigns the mass_features attribute to the object.
        """
        # Check that ms_level is a key in self._ms_unprocessed
        if ms_level not in self._ms_unprocessed.keys():
            raise ValueError(
                "No MS level "
                + str(ms_level)
                + " data found, did you instantiate with parser specific to MS level?"
            )

        # Work with reference instead of copy
        data = self._ms_unprocessed[ms_level]

        # Calculate threshold first to avoid multiple operations
        max_intensity = data["intensity"].max()
        threshold = self.parameters.lc_ms.ph_inten_min_rel * max_intensity

        # Create single filter condition and apply to required columns only
        valid_mask = data["intensity"].notna() & (data["intensity"] > threshold)
        required_cols = ["mz", "intensity", "scan"]
        data_thres = data.loc[valid_mask, required_cols].copy()
        data_thres["persistence"] = data_thres["intensity"]

        # Merge with required scan data
        scan_subset = self.scan_df[["scan", "scan_time"]]
        mf_df = data_thres.merge(scan_subset, on="scan", how="inner")
        del data_thres, scan_subset

        # Order by scan_time and then mz to ensure features near in rt are processed together
        # It's ok that different scans are in different partitions; we will roll up later
        mf_df = mf_df.sort_values(
            by=["scan_time", "mz"], ascending=[True, True]
        ).reset_index(drop=True)
        partition_size = 10000
        partitions = [
            mf_df.iloc[i : i + partition_size].reset_index(drop=True)
            for i in range(0, len(mf_df), partition_size)
        ]
        del mf_df

        # Run roll_up_dataframe on each partition
        rolled_partitions = []
        for part in partitions:
            rolled = self.roll_up_dataframe(
                df=part,
                sort_by="persistence",
                dims=["mz", "scan_time"],
                tol=[
                    self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
                    self.parameters.lc_ms.mass_feature_cluster_rt_tolerance,
                ],
                relative=[True, False],
            )
            rolled_partitions.append(rolled)
        del partitions

        # Combine results and run a final roll-up to merge features near partition boundaries
        mf_df_final = pd.concat(rolled_partitions, ignore_index=True)
        del rolled_partitions

        # Reorder by persistence before final roll-up
        mf_df_final = mf_df_final.sort_values(
            by="persistence", ascending=False
        ).reset_index(drop=True)

mf_df_final = self.roll_up_dataframe( 1907 df=mf_df_final, 1908 sort_by="persistence", 1909 dims=["mz", "scan_time"], 1910 tol=[ 1911 self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel, 1912 self.parameters.lc_ms.mass_feature_cluster_rt_tolerance, 1913 ], 1914 relative=[True, False], 1915 ) 1916 # reset index 1917 mf_df_final = mf_df_final.reset_index(drop=True) 1918 1919 # Combine rename and sort operations 1920 mass_features = ( 1921 mf_df_final.rename( 1922 columns={"scan": "apex_scan", "scan_time": "retention_time"} 1923 ) 1924 .sort_values(by="persistence", ascending=False) 1925 .reset_index(drop=True) 1926 ) 1927 del mf_df_final # Free memory 1928 1929 # Order by persistence and reset index 1930 mass_features = mass_features.sort_values( 1931 by="persistence", ascending=False 1932 ).reset_index(drop=True) 1933 1934 self.mass_features = {} 1935 for idx, row in mass_features.iterrows(): 1936 row_dict = row.to_dict() 1937 lcms_feature = LCMSMassFeature(self, **row_dict) 1938 self.mass_features[lcms_feature.id] = lcms_feature 1939 1940 if self.parameters.lc_ms.verbose_processing: 1941 print("Found " + str(len(mass_features)) + " initial mass features") 1942 1943 def cluster_mass_features(self, drop_children=True, sort_by="persistence"): 1944 """Cluster mass features 1945 1946 Based on their proximity in the mz and scan_time dimensions, priorizies the mass features with the highest persistence. 1947 1948 Parameters 1949 ---------- 1950 drop_children : bool, optional 1951 Whether to drop the mass features that are not cluster parents. Default is True. 1952 sort_by : str, optional 1953 The column to sort the mass features by, this will determine which mass features get rolled up into a parent mass feature. Default is "persistence". 1954 1955 Raises 1956 ------ 1957 ValueError 1958 If no mass features are found. 1959 If too many mass features are found. 1960 1961 Returns 1962 ------- 1963 None if drop_children is True, otherwise returns a list of mass feature ids that are not cluster parents. 1964 """ 1965 if self.mass_features is None: 1966 raise ValueError("No mass features found, run find_mass_features() first") 1967 if len(self.mass_features) > 400000: 1968 raise ValueError( 1969 "Too many mass features of interest found, run find_mass_features() with a higher intensity threshold" 1970 ) 1971 dims = ["mz", "scan_time"] 1972 mf_df_og = self.mass_features_to_df() 1973 mf_df = mf_df_og.copy() 1974 1975 tol = [ 1976 self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel, 1977 self.parameters.lc_ms.mass_feature_cluster_rt_tolerance, 1978 ] # mz, in relative; scan_time in minutes 1979 relative = [True, False] 1980 1981 # Roll up mass features based on their proximity in the declared dimensions 1982 mf_df_new = self.roll_up_dataframe( 1983 df=mf_df, sort_by=sort_by, dims=dims, tol=tol, relative=relative 1984 ) 1985 1986 mf_df["cluster_parent"] = np.where( 1987 np.isin(mf_df.index, mf_df_new.index), True, False 1988 ) 1989 1990 # get mass feature ids of features that are not cluster parents 1991 cluster_daughters = mf_df[~mf_df["cluster_parent"]].index.values 1992 if drop_children is True: 1993 # Drop mass features that are not cluster parents from self 1994 self.mass_features = { 1995 k: v 1996 for k, v in self.mass_features.items() 1997 if k not in cluster_daughters 1998 } 1999 else: 2000 return cluster_daughters
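Both peak-picking paths and cluster_mass_features ultimately rely on roll_up_dataframe (defined elsewhere in this module) to keep only the highest-persistence feature within each m/z and scan-time neighborhood. The sketch below is not that implementation, just a self-contained illustration of the same greedy idea on synthetic features; the 5 ppm and 0.1 minute tolerances are made-up values standing in for parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel and parameters.lc_ms.mass_feature_cluster_rt_tolerance:

    import pandas as pd

    # Synthetic feature table: rows 0 and 1 are near-duplicates of the same feature
    features = pd.DataFrame(
        {
            "mz": [200.0000, 200.0004, 350.1000],
            "scan_time": [5.00, 5.02, 5.00],
            "persistence": [1000.0, 400.0, 800.0],
        }
    )

    mz_tol_rel = 5e-6  # assumed relative m/z tolerance (5 ppm), illustration only
    rt_tol = 0.1       # assumed scan-time tolerance in minutes, illustration only

    # Greedy roll-up: visit features from highest to lowest persistence and drop
    # any feature that falls within tolerance of one that has already been kept
    kept = []
    for idx, row in features.sort_values("persistence", ascending=False).iterrows():
        is_duplicate = any(
            abs(row["mz"] - features.loc[k, "mz"]) / features.loc[k, "mz"] <= mz_tol_rel
            and abs(row["scan_time"] - features.loc[k, "scan_time"]) <= rt_tol
            for k in kept
        )
        if not is_duplicate:
            kept.append(idx)

    print(features.loc[kept])  # rows 0 and 2 survive; row 1 rolls up into row 0

In the module itself this roll-up is applied twice when the data are partitioned (once within each partition and once across partition boundaries), and cluster_mass_features uses the same tolerances to decide which mass features are cluster parents and which children to drop.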
378 def integrate_mass_features( 379 self, drop_if_fail=True, drop_duplicates=True, ms_level=1 380 ): 381 """Integrate mass features and extract EICs. 382 383 Populates the _eics attribute on the LCMSBase object for each unique mz in the mass_features dataframe and adds data (start_scan, final_scan, area) to the mass_features attribute. 384 385 Parameters 386 ---------- 387 drop_if_fail : bool, optional 388 Whether to drop mass features if the EIC limit calculations fail. 389 Default is True. 390 drop_duplicates : bool, optional 391 Whether to mass features that appear to be duplicates 392 (i.e., mz is similar to another mass feature and limits of the EIC are similar or encapsulating). 393 Default is True. 394 ms_level : int, optional 395 The MS level to use. Default is 1. 396 397 Raises 398 ------ 399 ValueError 400 If no mass features are found. 401 If no MS level data is found for the given MS level (either in data or in the scan data) 402 403 Returns 404 ------- 405 None, but populates the eics attribute on the LCMSBase object and adds data (start_scan, final_scan, area) to the mass_features attribute. 406 407 Notes 408 ----- 409 drop_if_fail is useful for discarding mass features that do not have good shapes, usually due to a detection on a shoulder of a peak or a noisy region (especially if minimal smoothing is used during mass feature detection). 410 """ 411 # Check if there is data 412 if ms_level in self._ms_unprocessed.keys(): 413 raw_data = self._ms_unprocessed[ms_level].copy() 414 else: 415 raise ValueError("No MS level " + str(ms_level) + " data found") 416 if self.mass_features is not None: 417 mf_df = self.mass_features_to_df().copy() 418 else: 419 raise ValueError( 420 "No mass features found, did you run find_mass_features() first?" 421 ) 422 # Check if mass_spectrum exists on each mass feature 423 if not all( 424 [mf.mass_spectrum is not None for mf in self.mass_features.values()] 425 ): 426 raise ValueError( 427 "Mass spectrum must be associated with each mass feature, did you run add_associated_ms1() first?" 
428 ) 429 430 # Subset scan data to only include correct ms_level 431 scan_df_sub = self.scan_df[ 432 self.scan_df["ms_level"] == int(ms_level) 433 ].reset_index(drop=True) 434 if scan_df_sub.empty: 435 raise ValueError("No MS level " + ms_level + " data found in scan data") 436 scan_df_sub = scan_df_sub[["scan", "scan_time"]].copy() 437 438 mzs_to_extract = np.unique(mf_df["mz"].values) 439 mzs_to_extract.sort() 440 441 # Pre-sort raw_data by mz for faster filtering 442 raw_data_sorted = raw_data.sort_values(["mz", "scan"]).reset_index(drop=True) 443 raw_data_mz = raw_data_sorted["mz"].values 444 445 # Get EICs for each unique mz in mass features list 446 for mz in mzs_to_extract: 447 mz_max = mz + self.parameters.lc_ms.eic_tolerance_ppm * mz / 1e6 448 mz_min = mz - self.parameters.lc_ms.eic_tolerance_ppm * mz / 1e6 449 450 # Use binary search for faster mz range filtering 451 left_idx = np.searchsorted(raw_data_mz, mz_min, side="left") 452 right_idx = np.searchsorted(raw_data_mz, mz_max, side="right") 453 raw_data_sub = raw_data_sorted.iloc[left_idx:right_idx].copy() 454 455 raw_data_sub = ( 456 raw_data_sub.groupby(["scan"])["intensity"].sum().reset_index() 457 ) 458 raw_data_sub = scan_df_sub.merge(raw_data_sub, on="scan", how="left") 459 raw_data_sub["intensity"] = raw_data_sub["intensity"].fillna(0) 460 myEIC = EIC_Data( 461 scans=raw_data_sub["scan"].values, 462 time=raw_data_sub["scan_time"].values, 463 eic=raw_data_sub["intensity"].values, 464 ) 465 # Smooth EIC 466 smoothed_eic = self.smooth_tic(myEIC.eic) 467 smoothed_eic[smoothed_eic < 0] = 0 468 myEIC.eic_smoothed = smoothed_eic 469 self.eics[mz] = myEIC 470 471 # Get limits of mass features using EIC centroid detector and integrate 472 mf_df["area"] = np.nan 473 for idx, mass_feature in mf_df.iterrows(): 474 mz = mass_feature.mz 475 apex_scan = mass_feature.apex_scan 476 477 # Pull EIC data and find apex scan index 478 myEIC = self.eics[mz] 479 self.mass_features[idx]._eic_data = myEIC 480 apex_index = np.where(myEIC.scans == apex_scan)[0][0] 481 482 # Find left and right limits of peak using EIC centroid detector, add to EICData 483 centroid_eics = self.eic_centroid_detector( 484 myEIC.time, 485 myEIC.eic_smoothed, 486 mass_feature.intensity * 1.1, 487 apex_indexes=[int(apex_index)], 488 ) 489 l_a_r_scan_idx = [i for i in centroid_eics] 490 if len(l_a_r_scan_idx) > 0: 491 # Calculate number of consecutive scans with intensity > 0 and check if it is above the minimum consecutive scans 492 # Find the number of consecutive non-zero values in the EIC segment 493 mask = myEIC.eic[l_a_r_scan_idx[0][0] : l_a_r_scan_idx[0][2] + 1] > 0 494 # Find the longest run of consecutive True values 495 if np.any(mask): 496 # Find indices where mask changes value 497 diff = np.diff(np.concatenate(([0], mask.astype(int), [0]))) 498 starts = np.where(diff == 1)[0] 499 ends = np.where(diff == -1)[0] 500 consecutive_scans = (ends - starts).max() 501 else: 502 consecutive_scans = 0 503 if consecutive_scans < self.parameters.lc_ms.consecutive_scan_min: 504 self.mass_features.pop(idx) 505 continue 506 # Add start and final scan to mass_features and EICData 507 left_scan, right_scan = ( 508 myEIC.scans[l_a_r_scan_idx[0][0]], 509 myEIC.scans[l_a_r_scan_idx[0][2]], 510 ) 511 mf_scan_apex = [(left_scan, int(apex_scan), right_scan)] 512 myEIC.apexes = myEIC.apexes + mf_scan_apex 513 self.mass_features[idx].start_scan = left_scan 514 self.mass_features[idx].final_scan = right_scan 515 516 # Find area under peak using limits from EIC centroid detector, add 
to mass_features and EICData 517 area = np.trapz( 518 myEIC.eic_smoothed[l_a_r_scan_idx[0][0] : l_a_r_scan_idx[0][2] + 1], 519 myEIC.time[l_a_r_scan_idx[0][0] : l_a_r_scan_idx[0][2] + 1], 520 ) 521 mf_df.at[idx, "area"] = area 522 myEIC.areas = myEIC.areas + [area] 523 self.eics[mz] = myEIC 524 self.mass_features[idx]._area = area 525 else: 526 if drop_if_fail is True: 527 self.mass_features.pop(idx) 528 529 if drop_duplicates: 530 # Prepare mass feature dataframe 531 mf_df = self.mass_features_to_df() 532 533 # For each mass feature, find all mass features within the clustering tolerance ppm and drop if their start and end times are within another mass feature 534 # Keep the first mass feature (highest persistence) 535 for idx, mass_feature in mf_df.iterrows(): 536 mz = mass_feature.mz 537 apex_scan = mass_feature.apex_scan 538 539 mf_df["mz_diff_ppm"] = np.abs(mf_df["mz"] - mz) / mz * 10**6 540 mf_df_sub = mf_df[ 541 mf_df["mz_diff_ppm"] 542 < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel 543 * 10**6 544 ].copy() 545 546 # For all mass features within the clustering tolerance, check if the start and end times are within the start and end times of the mass feature 547 for idx2, mass_feature2 in mf_df_sub.iterrows(): 548 if idx2 != idx: 549 if ( 550 mass_feature2.start_scan >= mass_feature.start_scan 551 and mass_feature2.final_scan <= mass_feature.final_scan 552 ): 553 if idx2 in self.mass_features.keys(): 554 self.mass_features.pop(idx2)
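The EIC extraction above avoids scanning the full raw table for every mass feature: because the raw data are pre-sorted by mz, each ppm window can be located with two binary searches and the matching rows summed per scan. Below is a minimal, hypothetical sketch of that pattern; extract_eic_ppm is not part of this module, it assumes a DataFrame with mz, scan, and intensity columns already sorted by mz plus a scan table with scan and scan_time columns, and it relies on the module-level numpy import.

def extract_eic_ppm(raw_data_sorted, scan_df_sub, mz, tolerance_ppm=5.0):
    # Convert the ppm tolerance into an absolute m/z half-window
    half_window = tolerance_ppm * mz / 1e6
    mz_values = raw_data_sorted["mz"].values

    # Two binary searches bound the m/z window in the mz-sorted data
    left = np.searchsorted(mz_values, mz - half_window, side="left")
    right = np.searchsorted(mz_values, mz + half_window, side="right")
    window = raw_data_sorted.iloc[left:right]

    # Sum intensity per scan, then align to the full scan axis so missing scans become zeros
    per_scan = window.groupby("scan")["intensity"].sum().reset_index()
    eic = scan_df_sub.merge(per_scan, on="scan", how="left")
    eic["intensity"] = eic["intensity"].fillna(0)
    return eic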
556 def find_c13_mass_features(self): 557 """Mark likely C13 isotopes and connect to monoisotopic mass features. 558 559 Returns 560 ------- 561 None, but populates the monoisotopic_mf_id and isotopologue_type attributes of the individual LCMSMassFeatures within the mass_features attribute of the LCMSBase object. 562 563 Raises 564 ------ 565 ValueError 566 If no mass features are found. 567 """ 568 verbose = self.parameters.lc_ms.verbose_processing 569 if verbose: 570 print("evaluating mass features for C13 isotopes") 571 if self.mass_features is None: 572 raise ValueError("No mass features found, run find_mass_features() first") 573 574 # Data prep for sparse distance matrix 575 dims = ["mz", "scan_time"] 576 mf_df = self.mass_features_to_df().copy() 577 # Drop mass features that have no area (these are likely to be noise) 578 mf_df = mf_df[mf_df["area"].notnull()] 579 mf_df["mf_id"] = mf_df.index.values 580 dims = ["mz", "scan_time"] 581 582 # Sort by ascending mz so we always get the monoisotopic mass first, regardless of the order/intensity of the mass features 583 mf_df = mf_df.sort_values(by=["mz"]).reset_index(drop=True).copy() 584 585 mz_diff = 1.003355 # C13-C12 mass difference 586 tol = [ 587 mf_df["mz"].median() 588 * self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel, 589 self.parameters.lc_ms.mass_feature_cluster_rt_tolerance * 0.5, 590 ] # mz, in relative; scan_time in minutes 591 592 # Compute inter-feature distances 593 distances = None 594 for i in range(len(dims)): 595 # Construct k-d tree 596 values = mf_df[dims[i]].values 597 tree = KDTree(values.reshape(-1, 1)) 598 599 max_tol = tol[i] 600 if dims[i] == "mz": 601 # Maximum absolute tolerance 602 max_tol = mz_diff + tol[i] 603 604 # Compute sparse distance matrix 605 # the larger the max_tol, the slower this operation is 606 sdm = tree.sparse_distance_matrix(tree, max_tol, output_type="coo_matrix") 607 608 # Only consider forward case, exclude diagonal 609 sdm = sparse.triu(sdm, k=1) 610 611 if dims[i] == "mz": 612 min_tol = mz_diff - tol[i] 613 # Get only the ones that are above the min tol 614 idx = sdm.data > min_tol 615 616 # Reconstruct sparse distance matrix 617 sdm = sparse.coo_matrix( 618 (sdm.data[idx], (sdm.row[idx], sdm.col[idx])), 619 shape=(len(values), len(values)), 620 ) 621 622 # Cast as binary matrix 623 sdm.data = np.ones_like(sdm.data) 624 625 # Stack distances 626 if distances is None: 627 distances = sdm 628 else: 629 distances = distances.multiply(sdm) 630 631 # Extract indices of within-tolerance points 632 distances = distances.tocoo() 633 pairs = np.stack((distances.row, distances.col), axis=1) # C12 to C13 pairs 634 635 # Turn pairs (which are index of mf_df) into mf_id and then into two dataframes to join to mf_df 636 pairs_mf = pairs.copy() 637 pairs_mf[:, 0] = mf_df.iloc[pairs[:, 0]].mf_id.values 638 pairs_mf[:, 1] = mf_df.iloc[pairs[:, 1]].mf_id.values 639 640 # Connect monoisotopic masses with isotopologues within mass_features 641 monos = np.setdiff1d(np.unique(pairs_mf[:, 0]), np.unique(pairs_mf[:, 1])) 642 for mono in monos: 643 self.mass_features[mono].monoisotopic_mf_id = mono 644 pairs_iso_df = pd.DataFrame(pairs_mf, columns=["parent", "child"]) 645 while not pairs_iso_df.empty: 646 pairs_iso_df = pairs_iso_df.set_index("parent", drop=False) 647 m1_isos = pairs_iso_df.loc[monos, "child"].unique() 648 for iso in m1_isos: 649 # Set monoisotopic_mf_id and isotopologue_type for isotopologues 650 parent = pairs_mf[pairs_mf[:, 1] == iso, 0] 651 if len(parent) > 1: 652 # Choose the
parent that is closest in time to the isotopologue 653 parent_time = [self.mass_features[p].retention_time for p in parent] 654 time_diff = [ 655 np.abs(self.mass_features[iso].retention_time - x) 656 for x in parent_time 657 ] 658 parent = parent[np.argmin(time_diff)] 659 else: 660 parent = parent[0] 661 self.mass_features[iso].monoisotopic_mf_id = self.mass_features[ 662 parent 663 ].monoisotopic_mf_id 664 if self.mass_features[iso].monoisotopic_mf_id is not None: 665 mass_diff = ( 666 self.mass_features[iso].mz 667 - self.mass_features[ 668 self.mass_features[iso].monoisotopic_mf_id 669 ].mz 670 ) 671 self.mass_features[iso].isotopologue_type = "13C" + str( 672 int(round(mass_diff, 0)) 673 ) 674 675 # Drop the mono and iso from the pairs_iso_df 676 pairs_iso_df = pairs_iso_df.drop( 677 index=monos, errors="ignore" 678 ) # Drop pairs where the parent is a child that is a child of a root 679 pairs_iso_df = pairs_iso_df.set_index("child", drop=False) 680 pairs_iso_df = pairs_iso_df.drop(index=m1_isos, errors="ignore") 681 682 if not pairs_iso_df.empty: 683 # Get new monos, recognizing that these are just 13C isotopologues that are connected to other 13C isotopologues to repeat the process 684 monos = np.setdiff1d( 685 np.unique(pairs_iso_df.parent), np.unique(pairs_iso_df.child) 686 ) 687 if verbose: 688 # Report fraction of compounds annotated with isotopes 689 mf_df["c13_flag"] = np.where( 690 np.logical_or( 691 np.isin(mf_df["mf_id"], pairs_mf[:, 0]), 692 np.isin(mf_df["mf_id"], pairs_mf[:, 1]), 693 ), 694 1, 695 0, 696 ) 697 print( 698 str(round(len(mf_df[mf_df["c13_flag"] == 1]) / len(mf_df), ndigits=3)) 699 + " of mass features have or are C13 isotopes" 700 )
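The pairing logic above reduces to two KDTree sparse distance matrices, one per dimension, whose binary forms are multiplied so only pairs within both tolerances survive. A simplified, hypothetical sketch of that idea follows; candidate_c13_pairs is illustrative only, it assumes mzs is sorted ascending so the lower-mass member of each pair is the candidate monoisotopic feature, and it uses the module-level numpy, scipy.sparse, and KDTree imports.

def candidate_c13_pairs(mzs, scan_times, mz_tol=0.002, rt_tol=0.1, mz_diff=1.003355):
    # m/z dimension: keep pairs separated by roughly one C13-C12 mass difference
    mz_tree = KDTree(np.asarray(mzs).reshape(-1, 1))
    sdm = mz_tree.sparse_distance_matrix(mz_tree, mz_diff + mz_tol, output_type="coo_matrix")
    sdm = sparse.triu(sdm, k=1)  # forward pairs only, diagonal excluded
    keep = sdm.data > (mz_diff - mz_tol)
    mz_pairs = sparse.coo_matrix(
        (np.ones(keep.sum()), (sdm.row[keep], sdm.col[keep])), shape=sdm.shape
    )

    # Retention time dimension: the pair must also (roughly) co-elute
    rt_tree = KDTree(np.asarray(scan_times).reshape(-1, 1))
    rt_pairs = sparse.triu(
        rt_tree.sparse_distance_matrix(rt_tree, rt_tol, output_type="coo_matrix"), k=1
    )
    rt_pairs.data = np.ones_like(rt_pairs.data)

    # Element-wise product keeps only pairs that satisfy both tolerances
    both = mz_pairs.multiply(rt_pairs).tocoo()
    return np.stack((both.row, both.col), axis=1)  # (monoisotopic index, isotopologue index)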
702 def deconvolute_ms1_mass_features(self): 703 """Deconvolute MS1 mass features 704 705 Deconvolute mass features ms1 spectrum based on the correlation of all masses within a spectrum over the EIC of the mass features 706 707 Parameters 708 ---------- 709 None 710 711 Returns 712 ------- 713 None, but assigns the _ms_deconvoluted_idx, mass_spectrum_deconvoluted_parent, 714 and associated_mass_features_deconvoluted attributes to the mass features in the 715 mass_features attribute of the LCMSBase object. 716 717 Raises 718 ------ 719 ValueError 720 If no mass features are found, must run find_mass_features() first. 721 If no EICs are found, did you run integrate_mass_features() first? 722 723 """ 724 # Checks for set mass_features and eics 725 if self.mass_features is None: 726 raise ValueError( 727 "No mass features found, did you run find_mass_features() first?" 728 ) 729 730 if self.eics == {}: 731 raise ValueError( 732 "No EICs found, did you run integrate_mass_features() first?" 733 ) 734 735 if 1 not in self._ms_unprocessed.keys(): 736 raise ValueError("No unprocessed MS1 spectra found.") 737 738 # Prep ms1 data 739 ms1_data = self._ms_unprocessed[1].copy() 740 ms1_data = ms1_data.set_index("scan") 741 742 # Prep mass feature summary 743 mass_feature_df = self.mass_features_to_df() 744 745 # Loop through each mass feature 746 for mf_id, mass_feature in self.mass_features.items(): 747 # Check that the mass_feature.mz attribute == the mz of the mass feature in the mass_feature_df 748 if mass_feature.mz != mass_feature.ms1_peak.mz_exp: 749 continue 750 751 # Get the left and right limits of the EIC of the mass feature 752 l_scan, _, r_scan = mass_feature._eic_data.apexes[0] 753 754 # Pull from the _ms1_unprocessed data the scan range of interest and sort by mz 755 ms1_data_sub = ms1_data.loc[l_scan:r_scan].copy() 756 ms1_data_sub = ms1_data_sub.sort_values(by=["mz"]).reset_index(drop=False) 757 758 # Get the centroided masses of the mass feature 759 mf_mspeak_mzs = mass_feature.mass_spectrum.mz_exp 760 761 # Find the closest mz in the ms1 data to the centroided masses of the mass feature 762 ms1_data_sub["mass_feature_mz"] = mf_mspeak_mzs[ 763 find_closest(mf_mspeak_mzs, ms1_data_sub.mz.values) 764 ] 765 766 # Drop rows with mz_diff > 0.01 between the mass feature mz and the ms1 data mz 767 ms1_data_sub["mz_diff_rel"] = ( 768 np.abs(ms1_data_sub["mass_feature_mz"] - ms1_data_sub["mz"]) 769 / ms1_data_sub["mz"] 770 ) 771 ms1_data_sub = ms1_data_sub[ 772 ms1_data_sub["mz_diff_rel"] 773 < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel 774 ].reset_index(drop=True) 775 776 # Group by mass_feature_mz and scan and sum intensity 777 ms1_data_sub_group = ( 778 ms1_data_sub.groupby(["mass_feature_mz", "scan"])["intensity"] 779 .sum() 780 .reset_index() 781 ) 782 783 # Calculate the correlation of the intensities of the mass feature and the ms1 data (set to 0 if no intensity) 784 corr = ( 785 ms1_data_sub_group.pivot( 786 index="scan", columns="mass_feature_mz", values="intensity" 787 ) 788 .fillna(0) 789 .corr() 790 ) 791 792 # Subset the correlation matrix to only include the masses of the mass feature and those with a correlation > 0.8 793 decon_corr_min = self.parameters.lc_ms.ms1_deconvolution_corr_min 794 795 # Try catch for KeyError in case the mass feature mz is not in the correlation matrix 796 try: 797 corr_subset = corr.loc[mass_feature.mz,] 798 except KeyError: 799 # If the mass feature mz is not in the correlation matrix, skip to the next mass feature 800 continue 801 
802 corr_subset = corr_subset[corr_subset > decon_corr_min] 803 804 # Get the masses from the mass spectrum that are the result of the deconvolution 805 mzs_decon = corr_subset.index.values 806 807 # Get the indices of the mzs_decon in mass_feature.mass_spectrum.mz_exp and assign to the mass feature 808 mzs_decon_idx = [ 809 id 810 for id, mz in enumerate(mass_feature.mass_spectrum.mz_exp) 811 if mz in mzs_decon 812 ] 813 mass_feature._ms_deconvoluted_idx = mzs_decon_idx 814 815 # Check if the mass feature's ms1 peak is the largest in the deconvoluted mass spectrum 816 if ( 817 mass_feature.ms1_peak.abundance 818 == mass_feature.mass_spectrum.abundance[mzs_decon_idx].max() 819 ): 820 mass_feature.mass_spectrum_deconvoluted_parent = True 821 else: 822 mass_feature.mass_spectrum_deconvoluted_parent = False 823 824 # Check for other mass features that are in the deconvoluted mass spectrum and add the deconvoluted mass spectrum to the mass feature 825 # Subset mass_feature_df to only include mass features that are within the clustering tolerance 826 mass_feature_df_sub = mass_feature_df[ 827 abs(mass_feature.retention_time - mass_feature_df["scan_time"]) 828 < self.parameters.lc_ms.mass_feature_cluster_rt_tolerance 829 ].copy() 830 # Calculate the mz difference in ppm between the mass feature and the peaks in the deconvoluted mass spectrum 831 mass_feature_df_sub["mz_diff_ppm"] = [ 832 np.abs(mzs_decon - mz).min() / mz * 10**6 833 for mz in mass_feature_df_sub["mz"] 834 ] 835 # Subset mass_feature_df to only include mass features that are within 1 ppm of the deconvoluted masses 836 mfs_associated_decon = mass_feature_df_sub[ 837 mass_feature_df_sub["mz_diff_ppm"] 838 < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel * 10**6 839 ].index.values 840 841 mass_feature.associated_mass_features_deconvoluted = mfs_associated_decon
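The core of the deconvolution above is a scan-by-m/z pivot of the MS1 data followed by a Pearson correlation of every m/z trace against the mass feature's own trace. A hedged sketch of just that step; correlated_mzs is illustrative only, and it assumes a long-format DataFrame with scan, mass_feature_mz, and intensity columns restricted to the feature's EIC scan range, with target_mz present among the pivoted columns.

def correlated_mzs(ms1_long, target_mz, corr_min=0.8):
    # Wide matrix: one row per scan, one column per candidate m/z, missing values as zeros
    wide = (
        ms1_long.pivot(index="scan", columns="mass_feature_mz", values="intensity")
        .fillna(0)
    )

    # Pearson correlation of every m/z trace against the target trace
    corr_to_target = wide.corr()[target_mz]

    # m/z values whose elution profile tracks the target above the cutoff
    return corr_to_target[corr_to_target > corr_min].index.values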
844class PHCalculations: 845 """Methods for performing calculations related to 2D peak picking via persistent homology on LCMS data. 846 847 Notes 848 ----- 849 This class is intended to be used as a mixin for the LCMSBase class. 850 851 Methods 852 ------- 853 * sparse_mean_filter(idx, V, radius=[0, 1, 1]). 854 Sparse implementation of a mean filter. 855 * embed_unique_indices(a). 856 Creates an array of indices, sorted by unique element. 857 * sparse_upper_star(idx, V). 858 Sparse implementation of an upper star filtration. 859 * check_if_grid(data). 860 Check if the data is gridded in mz space. 861 * grid_data(data). 862 Grid the data in the mz dimension. 863 * find_mass_features_ph(ms_level=1, grid=True). 864 Find mass features within an LCMSBase object using persistent homology. 865 * cluster_mass_features(drop_children=True). 866 Cluster regions of interest. 867 """ 868 869 @staticmethod 870 def sparse_mean_filter(idx, V, radius=[0, 1, 1]): 871 """Sparse implementation of a mean filter. 872 873 Parameters 874 ---------- 875 idx : :obj:`~numpy.array` 876 Edge indices for each dimension (MxN). 877 V : :obj:`~numpy.array` 878 Array of intensity data (Mx1). 879 radius : float or list 880 Radius of the sparse filter in each dimension. Values less than 881 zero indicate no connectivity in that dimension. 882 883 Returns 884 ------- 885 :obj:`~numpy.array` 886 Filtered intensities (Mx1). 887 888 Notes 889 ----- 890 This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos. 891 This is a static method. 892 """ 893 894 # Copy indices 895 idx = idx.copy().astype(V.dtype) 896 897 # Scale 898 for i, r in enumerate(radius): 899 # Increase inter-index distance 900 if r < 1: 901 idx[:, i] *= 2 902 903 # Do nothing 904 elif r == 1: 905 pass 906 907 # Decrease inter-index distance 908 else: 909 idx[:, i] /= r 910 911 # Connectivity matrix 912 cmat = KDTree(idx) 913 cmat = cmat.sparse_distance_matrix(cmat, 1, p=np.inf, output_type="coo_matrix") 914 cmat.setdiag(1) 915 916 # Pair indices 917 I, J = cmat.nonzero() 918 919 # Delete cmat 920 cmat_shape = cmat.shape 921 del cmat 922 923 # Sum over columns 924 V_sum = sparse.bsr_matrix( 925 (V[J], (I, I)), shape=cmat_shape, dtype=V.dtype 926 ).diagonal(0) 927 928 # Count over columns 929 V_count = sparse.bsr_matrix( 930 (np.ones_like(J), (I, I)), shape=cmat_shape, dtype=V.dtype 931 ).diagonal(0) 932 933 return V_sum / V_count 934 935 @staticmethod 936 def embed_unique_indices(a): 937 """Creates an array of indices, sorted by unique element. 938 939 Parameters 940 ---------- 941 a : :obj:`~numpy.array` 942 Array of unique elements (Mx1). 943 944 Returns 945 ------- 946 :obj:`~numpy.array` 947 Array of indices (Mx1). 948 949 Notes 950 ----- 951 This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos 952 This is a static method. 953 """ 954 955 def count_tens(n): 956 # Count tens 957 ntens = (n - 1) // 10 958 959 while True: 960 ntens_test = (ntens + n - 1) // 10 961 962 if ntens_test == ntens: 963 return ntens 964 else: 965 ntens = ntens_test 966 967 def arange_exclude_10s(n): 968 # How many 10s will there be? 
969 ntens = count_tens(n) 970 971 # Base array 972 arr = np.arange(0, n + ntens) 973 974 # Exclude 10s 975 arr = arr[(arr == 0) | (arr % 10 != 0)][:n] 976 977 return arr 978 979 # Creates an array of indices, sorted by unique element 980 idx_sort = np.argsort(a) 981 idx_unsort = np.argsort(idx_sort) 982 983 # Sorts records array so all unique elements are together 984 sorted_a = a[idx_sort] 985 986 # Returns the unique values, the index of the first occurrence, 987 # and the count for each element 988 vals, idx_start, count = np.unique( 989 sorted_a, return_index=True, return_counts=True 990 ) 991 992 # Splits the indices into separate arrays 993 splits = np.split(idx_sort, idx_start[1:]) 994 995 # Creates unique indices for each split 996 idx_unq = np.concatenate([arange_exclude_10s(len(x)) for x in splits]) 997 998 # Reorders according to input array 999 idx_unq = idx_unq[idx_unsort] 1000 1001 # Magnitude of each index 1002 exp = np.log10( 1003 idx_unq, where=idx_unq > 0, out=np.zeros_like(idx_unq, dtype=np.float64) 1004 ) 1005 idx_unq_mag = np.power(10, np.floor(exp) + 1) 1006 1007 # Result 1008 return a + idx_unq / idx_unq_mag 1009 1010 @staticmethod 1011 def roll_up_dataframe( 1012 df: pd.DataFrame, 1013 sort_by: str, 1014 tol: list, 1015 relative: list, 1016 dims: list, 1017 memory_opt_threshold: int = 10000, 1018 ): 1019 """Subset data by rolling up into apex in appropriate dimensions. 1020 1021 Parameters 1022 ---------- 1023 data : pd.DataFrame 1024 The input data containing "dims" columns and the "sort_by" column. 1025 sort_by : str 1026 The column to sort the data by, this will determine which mass features get rolled up into a parent mass feature 1027 (i.e., the mass feature with the highest value in the sort_by column). 1028 dims : list 1029 A list of dimension names (column names in the data DataFrame) to roll up the mass features by. 1030 tol : list 1031 A list of tolerances for each dimension. The length of the list must match the number of dimensions. 1032 The tolerances can be relative (as a fraction of the maximum value in that dimension) or absolute (in the units of that dimension). 1033 If relative is True, the tolerance will be multiplied by the maximum value in that dimension. 1034 If relative is False, the tolerance will be used as is. 1035 relative : list 1036 A list of booleans indicating whether the tolerance for each dimension is relative (True) or absolute (False). 1037 memory_opt_threshold : int, optional 1038 Minimum number of rows to trigger memory-optimized processing. Default is 10000. 1039 1040 Returns 1041 ------- 1042 pd.DataFrame 1043 A DataFrame with only the rolled up mass features, with the original index and columns. 1044 1045 1046 Raises 1047 ------ 1048 ValueError 1049 If the input data is not a pandas DataFrame. 1050 If the input data does not have columns for each of the dimensions in "dims". 1051 If the length of "dims", "tol", and "relative" do not match. 
1052 """ 1053 og_columns = df.columns.copy() 1054 1055 # Unindex the data, but keep the original index 1056 if df.index.name is not None: 1057 og_index = df.index.name 1058 else: 1059 og_index = "index" 1060 df = df.reset_index(drop=False) 1061 1062 # Sort data by sort_by column, and reindex 1063 df = df.sort_values(by=sort_by, ascending=False).reset_index(drop=True) 1064 1065 # Check that data is a DataFrame and has columns for each of the dims 1066 if not isinstance(df, pd.DataFrame): 1067 raise ValueError("Data must be a pandas DataFrame") 1068 for dim in dims: 1069 if dim not in df.columns: 1070 raise ValueError(f"Data must have a column for {dim}") 1071 if len(dims) != len(tol) or len(dims) != len(relative): 1072 raise ValueError( 1073 "Dimensions, tolerances, and relative flags must be the same length" 1074 ) 1075 1076 # Pre-compute all values arrays 1077 all_values = [df[dim].values for dim in dims] 1078 1079 # Choose processing method based on dataframe size 1080 if len(df) >= memory_opt_threshold: 1081 # Memory-optimized approach for large dataframes 1082 distances = PHCalculations._compute_distances_memory_optimized( 1083 all_values, tol, relative 1084 ) 1085 else: 1086 # Faster approach for smaller dataframes 1087 distances = PHCalculations._compute_distances_original( 1088 all_values, tol, relative 1089 ) 1090 1091 # Process pairs with original logic but memory optimizations 1092 distances = distances.tocoo() 1093 pairs = np.stack((distances.row, distances.col), axis=1) 1094 pairs_df = pd.DataFrame(pairs, columns=["parent", "child"]).set_index("parent") 1095 del distances, pairs # Free memory immediately 1096 1097 to_drop = [] 1098 while not pairs_df.empty: 1099 # Find root_parents and their children (original logic preserved) 1100 root_parents = np.setdiff1d( 1101 np.unique(pairs_df.index.values), np.unique(pairs_df.child.values) 1102 ) 1103 children_of_roots = pairs_df.loc[root_parents, "child"].unique() 1104 to_drop.extend(children_of_roots) # Use extend instead of append 1105 1106 # Remove root_children as possible parents from pairs_df for next iteration 1107 pairs_df = pairs_df.drop(index=children_of_roots, errors="ignore") 1108 pairs_df = pairs_df.reset_index().set_index("child") 1109 # Remove root_children as possible children from pairs_df for next iteration 1110 pairs_df = pairs_df.drop(index=children_of_roots) 1111 1112 # Prepare for next iteration 1113 pairs_df = pairs_df.reset_index().set_index("parent") 1114 1115 # Convert to numpy array for efficient dropping 1116 to_drop = np.array(to_drop) 1117 1118 # Drop mass features that are not cluster parents 1119 df_sub = df.drop(index=to_drop) 1120 1121 # Set index back to og_index and only keep original columns 1122 df_sub = df_sub.set_index(og_index).sort_index()[og_columns] 1123 1124 return df_sub 1125 1126 @staticmethod 1127 def _compute_distances_original(all_values, tol, relative): 1128 """Original distance computation method for smaller datasets. 1129 1130 This method computes the pairwise distances between features in the dataset 1131 using a straightforward approach. It is suitable for smaller datasets where 1132 memory usage is not a primary concern. 1133 1134 Parameters 1135 ---------- 1136 all_values : list of :obj:`~numpy.array` 1137 List of arrays containing the values for each dimension. 1138 tol : list of float 1139 List of tolerances for each dimension. 1140 relative : list of bool 1141 List of booleans indicating whether the tolerance for each dimension is relative (True) or absolute (False). 
1142 1143 Returns 1144 ------- 1145 :obj:`~scipy.sparse.coo_matrix` 1146 Sparse matrix indicating pairwise distances within tolerances. 1147 """ 1148 # Compute inter-feature distances with memory optimization 1149 distances = None 1150 for i in range(len(all_values)): 1151 values = all_values[i] 1152 # Use single precision if possible to reduce memory 1153 tree = KDTree(values.reshape(-1, 1).astype(np.float32)) 1154 1155 max_tol = tol[i] 1156 if relative[i] is True: 1157 max_tol = tol[i] * values.max() 1158 1159 # Compute sparse distance matrix with smaller chunks if memory is an issue 1160 sdm = tree.sparse_distance_matrix(tree, max_tol, output_type="coo_matrix") 1161 1162 # Only consider forward case, exclude diagonal 1163 sdm = sparse.triu(sdm, k=1) 1164 1165 # Process relative distances more efficiently 1166 if relative[i] is True: 1167 # Vectorized computation without creating intermediate arrays 1168 row_values = values[sdm.row] 1169 valid_idx = sdm.data <= tol[i] * row_values 1170 1171 # Reconstruct sparse matrix more efficiently 1172 sdm = sparse.coo_matrix( 1173 ( 1174 np.ones(valid_idx.sum(), dtype=np.uint8), 1175 (sdm.row[valid_idx], sdm.col[valid_idx]), 1176 ), 1177 shape=(len(values), len(values)), 1178 ) 1179 else: 1180 # Cast as binary matrix with smaller data type 1181 sdm.data = np.ones(len(sdm.data), dtype=np.uint8) 1182 1183 # Stack distances with memory-efficient multiplication 1184 if distances is None: 1185 distances = sdm 1186 else: 1187 # Use in-place operations where possible 1188 distances = distances.multiply(sdm) 1189 del sdm # Free memory immediately 1190 1191 return distances 1192 1193 @staticmethod 1194 def _compute_distances_memory_optimized(all_values, tol, relative): 1195 """Memory-optimized distance computation for large datasets. 1196 1197 This method computes the pairwise distances between features in the dataset 1198 using a more memory-efficient approach. It is suitable for larger datasets 1199 where memory usage is a primary concern. 1200 1201 Parameters 1202 ---------- 1203 all_values : list of :obj:`~numpy.array` 1204 List of arrays containing the values for each dimension. 1205 tol : list of float 1206 List of tolerances for each dimension. 1207 relative : list of bool 1208 List of booleans indicating whether the tolerance for each dimension is relative (True) or absolute (False). 1209 1210 Returns 1211 ------- 1212 :obj:`~scipy.sparse.coo_matrix` 1213 Sparse matrix indicating pairwise distances within tolerances. 
1214 """ 1215 # Compute distance matrix for first dimension (full matrix as before) 1216 values_0 = all_values[0].astype(np.float32) 1217 tree_0 = KDTree(values_0.reshape(-1, 1)) 1218 1219 max_tol_0 = tol[0] 1220 if relative[0] is True: 1221 max_tol_0 = tol[0] * values_0.max() 1222 1223 # Compute sparse distance matrix for first dimension 1224 distances = tree_0.sparse_distance_matrix( 1225 tree_0, max_tol_0, output_type="coo_matrix" 1226 ) 1227 distances = sparse.triu(distances, k=1) 1228 1229 # Process relative distances for first dimension 1230 if relative[0] is True: 1231 row_values = values_0[distances.row] 1232 valid_idx = distances.data <= tol[0] * row_values 1233 distances = sparse.coo_matrix( 1234 ( 1235 np.ones(valid_idx.sum(), dtype=np.uint8), 1236 (distances.row[valid_idx], distances.col[valid_idx]), 1237 ), 1238 shape=(len(values_0), len(values_0)), 1239 ) 1240 else: 1241 distances.data = np.ones(len(distances.data), dtype=np.uint8) 1242 1243 # For remaining dimensions, work only on chunks defined by first dimension pairs 1244 if len(all_values) > 1: 1245 distances_coo = distances.tocoo() 1246 valid_pairs = [] 1247 1248 # Process each pair from first dimension 1249 for idx in range(len(distances_coo.data)): 1250 i, j = distances_coo.row[idx], distances_coo.col[idx] 1251 is_valid_pair = True 1252 1253 # Check remaining dimensions for this specific pair 1254 for dim_idx in range(1, len(all_values)): 1255 values = all_values[dim_idx] 1256 val_i, val_j = values[i], values[j] 1257 1258 max_tol = tol[dim_idx] 1259 if relative[dim_idx] is True: 1260 max_tol = tol[dim_idx] * values.max() 1261 1262 distance_ij = abs(val_i - val_j) 1263 1264 # Check if this pair satisfies the tolerance for this dimension 1265 if relative[dim_idx] is True: 1266 if distance_ij > tol[dim_idx] * val_i: 1267 is_valid_pair = False 1268 break 1269 else: 1270 if distance_ij > max_tol: 1271 is_valid_pair = False 1272 break 1273 1274 if is_valid_pair: 1275 valid_pairs.append((i, j)) 1276 1277 # Rebuild distances matrix with only valid pairs 1278 if valid_pairs: 1279 valid_pairs = np.array(valid_pairs) 1280 distances = sparse.coo_matrix( 1281 ( 1282 np.ones(len(valid_pairs), dtype=np.uint8), 1283 (valid_pairs[:, 0], valid_pairs[:, 1]), 1284 ), 1285 shape=(len(values_0), len(values_0)), 1286 ) 1287 else: 1288 # No valid pairs found 1289 distances = sparse.coo_matrix( 1290 (len(values_0), len(values_0)), dtype=np.uint8 1291 ) 1292 1293 return distances 1294 1295 def sparse_upper_star(self, idx, V): 1296 """Sparse implementation of an upper star filtration. 1297 1298 Parameters 1299 ---------- 1300 idx : :obj:`~numpy.array` 1301 Edge indices for each dimension (MxN). 1302 V : :obj:`~numpy.array` 1303 Array of intensity data (Mx1). 1304 Returns 1305 ------- 1306 idx : :obj:`~numpy.array` 1307 Index of filtered points (Mx1). 1308 persistence : :obj:`~numpy.array` 1309 Persistence of each filtered point (Mx1). 
1310 1311 Notes 1312 ----- 1313 This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos 1314 """ 1315 1316 # Invert 1317 V = -1 * V.copy().astype(int) 1318 1319 # Embed indices 1320 V = self.embed_unique_indices(V) 1321 1322 # Connectivity matrix 1323 cmat = KDTree(idx) 1324 cmat = cmat.sparse_distance_matrix(cmat, 1, p=np.inf, output_type="coo_matrix") 1325 cmat.setdiag(1) 1326 cmat = sparse.triu(cmat) 1327 1328 # Pairwise minimums 1329 I, J = cmat.nonzero() 1330 d = np.maximum(V[I], V[J]) 1331 1332 # Delete connectivity matrix 1333 cmat_shape = cmat.shape 1334 del cmat 1335 1336 # Sparse distance matrix 1337 sdm = sparse.coo_matrix((d, (I, J)), shape=cmat_shape) 1338 1339 # Delete pairwise mins 1340 del d, I, J 1341 1342 # Persistence homology 1343 ph = ripser(sdm, distance_matrix=True, maxdim=0)["dgms"][0] 1344 1345 # Bound death values 1346 ph[ph[:, 1] == np.inf, 1] = np.max(V) 1347 1348 # Construct tree to query against 1349 tree = KDTree(V.reshape((-1, 1))) 1350 1351 # Get the indexes of the first nearest neighbor by birth 1352 _, nn = tree.query(ph[:, 0].reshape((-1, 1)), k=1, workers=-1) 1353 1354 return nn, -(ph[:, 0] // 1 - ph[:, 1] // 1) 1355 1356 def check_if_grid(self, data): 1357 """Check if the data are gridded in mz space. 1358 1359 Parameters 1360 ---------- 1361 data : DataFrame 1362 DataFrame containing the mass spectrometry data. Needs to have mz and scan columns. 1363 1364 Returns 1365 ------- 1366 bool 1367 True if the data is gridded in the mz direction, False otherwise. 1368 1369 Notes 1370 ----- 1371 This function is used within the grid_data function and the find_mass_features function and is not intended to be called directly. 1372 """ 1373 # Calculate the difference between consecutive mz values in a single scan 1374 dat_check = data.copy().reset_index(drop=True) 1375 dat_check["mz_diff"] = np.abs(dat_check["mz"].diff()) 1376 mz_diff_min = ( 1377 dat_check.groupby("scan")["mz_diff"].min().min() 1378 ) # within each scan, what is the smallest mz difference between consecutive mz values 1379 1380 # Find the minimum mz difference between mz values in the data; regardless of scan 1381 dat_check_mz = dat_check[["mz"]].drop_duplicates().copy() 1382 dat_check_mz = dat_check_mz.sort_values(by=["mz"]).reset_index(drop=True) 1383 dat_check_mz["mz_diff"] = np.abs(dat_check_mz["mz"].diff()) 1384 1385 # Get minimum mz_diff between mz values in the data 1386 mz_diff_min_raw = dat_check_mz["mz_diff"].min() 1387 1388 # If the minimum mz difference between mz values in the data is less than the minimum mz difference between mz values within a single scan, then the data is not gridded 1389 if mz_diff_min_raw < mz_diff_min: 1390 return False 1391 else: 1392 return True 1393 1394 def grid_data(self, data, attempts=5): 1395 """Grid the data in the mz dimension. 1396 1397 Data must be gridded prior to persistent homology calculations and computing the average mass spectrum. 1398 1399 Parameters 1400 ---------- 1401 data : DataFrame 1402 The input data containing mz, scan, scan_time, and intensity columns. 1403 attempts : int, optional 1404 The number of attempts to grid the data. Default is 5. 1405 1406 Returns 1407 ------- 1408 DataFrame 1409 The gridded data with mz, scan, scan_time, and intensity columns. 1410 1411 Raises 1412 ------ 1413 ValueError 1414 If gridding fails after the specified number of attempts.
1415 """ 1416 attempt_i = 0 1417 while attempt_i < attempts: 1418 attempt_i += 1 1419 data = self._grid_data(data) 1420 1421 if self.check_if_grid(data): 1422 return data 1423 1424 if not self.check_if_grid(data): 1425 raise ValueError( 1426 "Gridding failed after " 1427 + str(attempt_i) 1428 + " attempts. Please check the data." 1429 ) 1430 else: 1431 return data 1432 1433 def _grid_data(self, data): 1434 """Internal method to grid the data in the mz dimension. 1435 1436 Notes 1437 ----- 1438 This method is called by the grid_data method and should not be called directly. 1439 It will attempt to grid the data in the mz dimension by creating a grid of mz values based on the minimum mz difference within each scan, 1440 but it does not check if the data is already gridded or if the gridding is successful. 1441 1442 Parameters 1443 ---------- 1444 data : pd.DataFrame or pl.DataFrame 1445 The input data to grid. 1446 1447 Returns 1448 ------- 1449 pd.DataFrame or pl.DataFrame 1450 The data after attempting to grid it in the mz dimension. 1451 """ 1452 # Calculate the difference between consecutive mz values in a single scan for grid spacing 1453 data_w = data.copy().reset_index(drop=True) 1454 data_w["mz_diff"] = np.abs(data_w["mz"].diff()) 1455 mz_diff_min = data_w.groupby("scan")["mz_diff"].min().min() * 0.99999 1456 1457 # Need high intensity mz values first so they are parents in the output pairs stack 1458 dat_mz = data_w[["mz", "intensity"]].sort_values( 1459 by=["intensity"], ascending=False 1460 ) 1461 dat_mz = dat_mz[["mz"]].drop_duplicates().reset_index(drop=True).copy() 1462 1463 # Construct KD tree 1464 tree = KDTree(dat_mz.mz.values.reshape(-1, 1)) 1465 sdm = tree.sparse_distance_matrix(tree, mz_diff_min, output_type="coo_matrix") 1466 sdm = sparse.triu(sdm, k=1) 1467 sdm.data = np.ones_like(sdm.data) 1468 distances = sdm.tocoo() 1469 pairs = np.stack((distances.row, distances.col), axis=1) 1470 1471 # Cull pairs to just get root 1472 to_drop = [] 1473 while len(pairs) > 0: 1474 root_parents = np.setdiff1d(np.unique(pairs[:, 0]), np.unique(pairs[:, 1])) 1475 id_root_parents = np.isin(pairs[:, 0], root_parents) 1476 children_of_roots = np.unique(pairs[id_root_parents, 1]) 1477 to_drop = np.append(to_drop, children_of_roots) 1478 1479 # Set up pairs array for next iteration by removing pairs with children or parents already dropped 1480 pairs = pairs[~np.isin(pairs[:, 1], to_drop), :] 1481 pairs = pairs[~np.isin(pairs[:, 0], to_drop), :] 1482 dat_mz = dat_mz.reset_index(drop=True).drop(index=np.array(to_drop)) 1483 mz_dat_np = ( 1484 dat_mz[["mz"]] 1485 .sort_values(by=["mz"]) 1486 .reset_index(drop=True) 1487 .values.flatten() 1488 ) 1489 1490 # Sort data by mz and recast mz to nearest value in mz_dat_np 1491 data_w = data_w.sort_values(by=["mz"]).reset_index(drop=True).copy() 1492 data_w["mz_new"] = mz_dat_np[find_closest(mz_dat_np, data_w["mz"].values)] 1493 data_w["mz_diff"] = np.abs(data_w["mz"] - data_w["mz_new"]) 1494 1495 # Rename mz_new as mz; drop mz_diff; groupby scan and mz and sum intensity 1496 new_data_w = data_w.rename(columns={"mz": "mz_orig", "mz_new": "mz"}).copy() 1497 new_data_w = ( 1498 new_data_w.drop(columns=["mz_diff", "mz_orig"]) 1499 .groupby(["scan", "mz"])["intensity"] 1500 .sum() 1501 .reset_index() 1502 ) 1503 new_data_w = ( 1504 new_data_w.sort_values(by=["scan", "mz"], ascending=[True, True]) 1505 .reset_index(drop=True) 1506 .copy() 1507 ) 1508 1509 return new_data_w 1510 1511 def find_mass_features_ph(self, ms_level=1, grid=True): 1512 """Find mass 
features within an LCMSBase object using persistent homology. 1513 1514 Assigns the mass_features attribute to the object (a dictionary of LCMSMassFeature objects, keyed by mass feature id) 1515 1516 Parameters 1517 ---------- 1518 ms_level : int, optional 1519 The MS level to use. Default is 1. 1520 grid : bool, optional 1521 If True, will regrid the data before running the persistent homology calculations (after checking if the data is gridded). Default is True. 1522 1523 Raises 1524 ------ 1525 ValueError 1526 If no MS level data is found on the object. 1527 If data is not gridded and grid is False. 1528 1529 Returns 1530 ------- 1531 None, but assigns the mass_features attribute to the object. 1532 1533 Notes 1534 ----- 1535 This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos 1536 """ 1537 # Check that ms_level is a key in self._ms_uprocessed 1538 if ms_level not in self._ms_unprocessed.keys(): 1539 raise ValueError( 1540 "No MS level " 1541 + str(ms_level) 1542 + " data found, did you instantiate with parser specific to MS level?" 1543 ) 1544 1545 # Get ms data 1546 data = self._ms_unprocessed[ms_level].copy() 1547 1548 # Drop rows with missing intensity values and reset index 1549 data = data.dropna(subset=["intensity"]).reset_index(drop=True) 1550 1551 # Threshold data 1552 dims = ["mz", "scan_time"] 1553 threshold = self.parameters.lc_ms.ph_inten_min_rel * data.intensity.max() 1554 persistence_threshold = ( 1555 self.parameters.lc_ms.ph_persis_min_rel * data.intensity.max() 1556 ) 1557 data_thres = data[data["intensity"] > threshold].reset_index(drop=True).copy() 1558 1559 # Check if gridded, if not, grid 1560 gridded_mz = self.check_if_grid(data_thres) 1561 if gridded_mz is False: 1562 if grid is False: 1563 raise ValueError( 1564 "Data are not gridded in mz dimension, try reprocessing with a different params or grid data before running this function" 1565 ) 1566 else: 1567 data_thres = self.grid_data(data_thres) 1568 1569 # Add scan_time 1570 data_thres = data_thres.merge(self.scan_df[["scan", "scan_time"]], on="scan") 1571 # Process in chunks if required 1572 if len(data_thres) > 10000: 1573 return self._find_mass_features_ph_partition( 1574 data_thres, dims, persistence_threshold 1575 ) 1576 else: 1577 # Process all at once 1578 return self._find_mass_features_ph_single( 1579 data_thres, dims, persistence_threshold 1580 ) 1581 1582 def _find_mass_features_ph_single(self, data_thres, dims, persistence_threshold): 1583 """Process all data at once (original logic).""" 1584 # Build factors 1585 factors = { 1586 dim: pd.factorize(data_thres[dim], sort=True)[1].astype(np.float32) 1587 for dim in dims 1588 } 1589 1590 # Build indexes 1591 index = { 1592 dim: np.searchsorted(factors[dim], data_thres[dim]).astype(np.float32) 1593 for dim in factors 1594 } 1595 1596 # Smooth and process 1597 mass_features_df = self._process_partition_ph( 1598 data_thres, index, dims, persistence_threshold 1599 ) 1600 1601 # Roll up within chunk to remove duplicates 1602 mass_features_df = self.roll_up_dataframe( 1603 df=mass_features_df, 1604 sort_by="persistence", 1605 dims=["mz", "scan_time"], 1606 tol=[ 1607 self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel, 1608 self.parameters.lc_ms.mass_feature_cluster_rt_tolerance, 1609 ], 1610 relative=[True, False], 1611 ) 1612 mass_features_df = mass_features_df.reset_index(drop=True) 1613 1614 # Populate mass_features attribute 1615 self._populate_mass_features(mass_features_df) 1616 
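Both the single-pass and partitioned paths share the same index construction: pd.factorize(..., sort=True) yields the sorted unique values of each dimension, and np.searchsorted maps every observation onto its position in that grid, producing the integer-like coordinates that sparse_mean_filter and sparse_upper_star expect. A small illustrative example (values are made up; relies on the module-level numpy and pandas imports):

toy = pd.DataFrame(
    {"mz": [100.0, 100.0, 100.5, 101.0], "scan_time": [1.0, 1.1, 1.0, 1.1]}
)
dims = ["mz", "scan_time"]

# Sorted unique values per dimension define the grid
factors = {dim: pd.factorize(toy[dim], sort=True)[1].astype(np.float32) for dim in dims}

# Each observation is replaced by its position on that grid
index = {dim: np.searchsorted(factors[dim], toy[dim]).astype(np.float32) for dim in dims}
index_array = np.vstack([index[dim] for dim in dims]).T
# index_array -> [[0., 0.], [0., 1.], [1., 0.], [2., 1.]]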
1617 def _find_mass_features_ph_partition(self, data_thres, dims, persistence_threshold): 1618 """Partition the persistent homology mass feature detection for large datasets. 1619 1620 This method splits the input data into overlapping scan partitions, processes each partition to detect mass features 1621 using persistent homology, rolls up duplicates within and across partitions, and populates the mass_features attribute. 1622 1623 Parameters 1624 ---------- 1625 data_thres : pd.DataFrame 1626 The thresholded input data containing mass spectrometry information. 1627 dims : list 1628 List of dimension names (e.g., ["mz", "scan_time"]) used for feature detection. 1629 persistence_threshold : float 1630 Minimum persistence value required for a detected mass feature to be retained. 1631 1632 Returns 1633 ------- 1634 None 1635 Populates the mass_features attribute of the object with detected mass features. 1636 """ 1637 all_mass_features = [] 1638 1639 # Split scans into partitions 1640 unique_scans = sorted(data_thres["scan"].unique()) 1641 unique_scans_n = len(unique_scans) 1642 1643 # Calculate partition size in scans based on goal 1644 partition_size_goal = 5000 1645 scans_per_partition = max( 1646 1, partition_size_goal // (len(data_thres) // unique_scans_n) 1647 ) 1648 if scans_per_partition == 0: 1649 scans_per_partition = 1 1650 1651 # Make partitions based on scans, with overlapping in partitioned scans 1652 scan_overlap = 4 1653 partition_scans = [] 1654 for i in range(0, unique_scans_n, scans_per_partition): 1655 start_idx = max(0, i - scan_overlap) 1656 end_idx = min( 1657 unique_scans_n - 1, i + scans_per_partition - 1 + scan_overlap 1658 ) 1659 scans_group = [int(s) for s in unique_scans[start_idx : end_idx + 1]] 1660 partition_scans.append(scans_group) 1661 1662 # Set index to scan for faster filtering 1663 data_thres = data_thres.set_index("scan") 1664 for scans in partition_scans: 1665 # Determine start and end scan for partition, with 5 scans overlap 1666 partition_data = data_thres.loc[scans].reset_index(drop=False).copy() 1667 1668 if len(partition_data) == 0: 1669 continue 1670 1671 # Build factors for this partition 1672 factors = { 1673 dim: pd.factorize(partition_data[dim], sort=True)[1].astype(np.float32) 1674 for dim in dims 1675 } 1676 1677 # Build indexes 1678 index = { 1679 dim: np.searchsorted(factors[dim], partition_data[dim]).astype( 1680 np.float32 1681 ) 1682 for dim in factors 1683 } 1684 1685 # Process partition 1686 partition_features = self._process_partition_ph( 1687 partition_data, index, dims, persistence_threshold 1688 ) 1689 1690 if len(partition_features) == 0: 1691 continue 1692 1693 # Roll up within partition 1694 partition_features = self.roll_up_dataframe( 1695 df=partition_features, 1696 sort_by="persistence", 1697 dims=["mz", "scan_time"], 1698 tol=[ 1699 self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel, 1700 self.parameters.lc_ms.mass_feature_cluster_rt_tolerance, 1701 ], 1702 relative=[True, False], 1703 ) 1704 partition_features = partition_features.reset_index(drop=True) 1705 1706 if len(partition_features) > 0: 1707 all_mass_features.append(partition_features) 1708 1709 # Combine results from all partitions 1710 if all_mass_features: 1711 combined_features = pd.concat(all_mass_features, ignore_index=True) 1712 1713 # Sort by persistence 1714 combined_features = combined_features.sort_values( 1715 by="persistence", ascending=False 1716 ).reset_index(drop=True) 1717 1718 # Remove duplicates from overlapping regions 1719 
combined_features = self.roll_up_dataframe( 1720 df=combined_features, 1721 sort_by="persistence", 1722 dims=["mz", "scan_time"], 1723 tol=[ 1724 self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel, 1725 self.parameters.lc_ms.mass_feature_cluster_rt_tolerance, 1726 ], 1727 relative=[True, False], 1728 ) 1729 1730 # resort by persistence and reset index 1731 combined_features = combined_features.reset_index(drop=True) 1732 1733 # Populate mass_features attribute 1734 self._populate_mass_features(combined_features) 1735 else: 1736 self.mass_features = {} 1737 1738 def _process_partition_ph(self, partition_data, index, dims, persistence_threshold): 1739 """Process a single partition with persistent homology.""" 1740 # Smooth data 1741 iterations = self.parameters.lc_ms.ph_smooth_it 1742 smooth_radius = [ 1743 self.parameters.lc_ms.ph_smooth_radius_mz, 1744 self.parameters.lc_ms.ph_smooth_radius_scan, 1745 ] 1746 1747 index_array = np.vstack([index[dim] for dim in dims]).T 1748 V = partition_data["intensity"].values 1749 resid = np.inf 1750 1751 for i in range(iterations): 1752 # Previous iteration 1753 V_prev = V.copy() 1754 resid_prev = resid 1755 V = self.sparse_mean_filter(index_array, V, radius=smooth_radius) 1756 1757 # Calculate residual with previous iteration 1758 resid = np.sqrt(np.mean(np.square(V - V_prev))) 1759 1760 # Evaluate convergence 1761 if i > 0: 1762 # Percent change in residual 1763 test = np.abs(resid - resid_prev) / resid_prev 1764 1765 # Exit criteria 1766 if test <= 0: 1767 break 1768 1769 # Overwrite values 1770 partition_data = partition_data.copy() 1771 partition_data["intensity"] = V 1772 1773 # Use persistent homology to find regions of interest 1774 pidx, pers = self.sparse_upper_star(index_array, V) 1775 pidx = pidx[pers > 1] 1776 pers = pers[pers > 1] 1777 1778 if len(pidx) == 0: 1779 return pd.DataFrame() 1780 1781 # Get peaks 1782 peaks = partition_data.iloc[pidx, :].reset_index(drop=True) 1783 1784 # Add persistence column 1785 peaks["persistence"] = pers 1786 mass_features = peaks.sort_values( 1787 by="persistence", ascending=False 1788 ).reset_index(drop=True) 1789 1790 # Filter by persistence threshold 1791 mass_features = mass_features.loc[ 1792 mass_features["persistence"] > persistence_threshold, : 1793 ].reset_index(drop=True) 1794 1795 return mass_features 1796 1797 def _populate_mass_features(self, mass_features_df): 1798 """Populate the mass_features attribute from a DataFrame. 1799 1800 Parameters 1801 ---------- 1802 mass_features_df : pd.DataFrame 1803 DataFrame containing mass feature information. 1804 Note that the order of this DataFrame will determine the order of mass features in the mass_features attribute. 1805 1806 Returns 1807 ------- 1808 None, but assigns the mass_features attribute to the object. 
1809 """ 1810 # Rename scan column to apex_scan 1811 mass_features_df = mass_features_df.rename( 1812 columns={"scan": "apex_scan", "scan_time": "retention_time"} 1813 ) 1814 1815 # Populate mass_features attribute 1816 self.mass_features = {} 1817 for row in mass_features_df.itertuples(): 1818 row_dict = mass_features_df.iloc[row.Index].to_dict() 1819 lcms_feature = LCMSMassFeature(self, **row_dict) 1820 self.mass_features[lcms_feature.id] = lcms_feature 1821 1822 if self.parameters.lc_ms.verbose_processing: 1823 print("Found " + str(len(mass_features_df)) + " initial mass features") 1824 1825 def find_mass_features_ph_centroid(self, ms_level=1): 1826 """Find mass features within an LCMSBase object using persistent homology-type approach but with centroided data. 1827 1828 Parameters 1829 ---------- 1830 ms_level : int, optional 1831 The MS level to use. Default is 1. 1832 1833 Raises 1834 ------ 1835 ValueError 1836 If no MS level data is found on the object. 1837 1838 Returns 1839 ------- 1840 None, but assigns the mass_features attribute to the object. 1841 """ 1842 # Check that ms_level is a key in self._ms_uprocessed 1843 if ms_level not in self._ms_unprocessed.keys(): 1844 raise ValueError( 1845 "No MS level " 1846 + str(ms_level) 1847 + " data found, did you instantiate with parser specific to MS level?" 1848 ) 1849 1850 # Work with reference instead of copy 1851 data = self._ms_unprocessed[ms_level] 1852 1853 # Calculate threshold first to avoid multiple operations 1854 max_intensity = data["intensity"].max() 1855 threshold = self.parameters.lc_ms.ph_inten_min_rel * max_intensity 1856 1857 # Create single filter condition and apply to required columns only 1858 valid_mask = data["intensity"].notna() & (data["intensity"] > threshold) 1859 required_cols = ["mz", "intensity", "scan"] 1860 data_thres = data.loc[valid_mask, required_cols].copy() 1861 data_thres["persistence"] = data_thres["intensity"] 1862 1863 # Merge with required scan data 1864 scan_subset = self.scan_df[["scan", "scan_time"]] 1865 mf_df = data_thres.merge(scan_subset, on="scan", how="inner") 1866 del data_thres, scan_subset 1867 1868 # Order by scan_time and then mz to ensure features near in rt are processed together 1869 # It's ok that different scans are in different partitions; we will roll up later 1870 mf_df = mf_df.sort_values( 1871 by=["scan_time", "mz"], ascending=[True, True] 1872 ).reset_index(drop=True) 1873 partition_size = 10000 1874 partitions = [ 1875 mf_df.iloc[i : i + partition_size].reset_index(drop=True) 1876 for i in range(0, len(mf_df), partition_size) 1877 ] 1878 del mf_df 1879 1880 # Run roll_up_dataframe on each partition 1881 rolled_partitions = [] 1882 for part in partitions: 1883 rolled = self.roll_up_dataframe( 1884 df=part, 1885 sort_by="persistence", 1886 dims=["mz", "scan_time"], 1887 tol=[ 1888 self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel, 1889 self.parameters.lc_ms.mass_feature_cluster_rt_tolerance, 1890 ], 1891 relative=[True, False], 1892 ) 1893 rolled_partitions.append(rolled) 1894 del partitions 1895 1896 # Run roll_up_dataframe on the rolled_up partitions to merge features near partition boundaries 1897 1898 # Combine results and run a final roll-up to merge features near partition boundaries 1899 mf_df_final = pd.concat(rolled_partitions, ignore_index=True) 1900 del rolled_partitions 1901 1902 # Reorder by persistence before final roll-up 1903 mf_df_final = mf_df_final.sort_values( 1904 by="persistence", ascending=False 1905 ).reset_index(drop=True) 1906 1907 
mf_df_final = self.roll_up_dataframe( 1908 df=mf_df_final, 1909 sort_by="persistence", 1910 dims=["mz", "scan_time"], 1911 tol=[ 1912 self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel, 1913 self.parameters.lc_ms.mass_feature_cluster_rt_tolerance, 1914 ], 1915 relative=[True, False], 1916 ) 1917 # reset index 1918 mf_df_final = mf_df_final.reset_index(drop=True) 1919 1920 # Combine rename and sort operations 1921 mass_features = ( 1922 mf_df_final.rename( 1923 columns={"scan": "apex_scan", "scan_time": "retention_time"} 1924 ) 1925 .sort_values(by="persistence", ascending=False) 1926 .reset_index(drop=True) 1927 ) 1928 del mf_df_final # Free memory 1929 1930 # Order by persistence and reset index 1931 mass_features = mass_features.sort_values( 1932 by="persistence", ascending=False 1933 ).reset_index(drop=True) 1934 1935 self.mass_features = {} 1936 for idx, row in mass_features.iterrows(): 1937 row_dict = row.to_dict() 1938 lcms_feature = LCMSMassFeature(self, **row_dict) 1939 self.mass_features[lcms_feature.id] = lcms_feature 1940 1941 if self.parameters.lc_ms.verbose_processing: 1942 print("Found " + str(len(mass_features)) + " initial mass features") 1943 1944 def cluster_mass_features(self, drop_children=True, sort_by="persistence"): 1945 """Cluster mass features 1946 1947 Clusters mass features based on their proximity in the mz and scan_time dimensions, prioritizing the mass features with the highest persistence. 1948 1949 Parameters 1950 ---------- 1951 drop_children : bool, optional 1952 Whether to drop the mass features that are not cluster parents. Default is True. 1953 sort_by : str, optional 1954 The column to sort the mass features by; this will determine which mass features get rolled up into a parent mass feature. Default is "persistence". 1955 1956 Raises 1957 ------ 1958 ValueError 1959 If no mass features are found. 1960 If too many mass features are found. 1961 1962 Returns 1963 ------- 1964 None if drop_children is True, otherwise returns a list of mass feature ids that are not cluster parents. 1965 """ 1966 if self.mass_features is None: 1967 raise ValueError("No mass features found, run find_mass_features() first") 1968 if len(self.mass_features) > 400000: 1969 raise ValueError( 1970 "Too many mass features of interest found, run find_mass_features() with a higher intensity threshold" 1971 ) 1972 dims = ["mz", "scan_time"] 1973 mf_df_og = self.mass_features_to_df() 1974 mf_df = mf_df_og.copy() 1975 1976 tol = [ 1977 self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel, 1978 self.parameters.lc_ms.mass_feature_cluster_rt_tolerance, 1979 ] # mz, in relative; scan_time in minutes 1980 relative = [True, False] 1981 1982 # Roll up mass features based on their proximity in the declared dimensions 1983 mf_df_new = self.roll_up_dataframe( 1984 df=mf_df, sort_by=sort_by, dims=dims, tol=tol, relative=relative 1985 ) 1986 1987 mf_df["cluster_parent"] = np.where( 1988 np.isin(mf_df.index, mf_df_new.index), True, False 1989 ) 1990 1991 # get mass feature ids of features that are not cluster parents 1992 cluster_daughters = mf_df[~mf_df["cluster_parent"]].index.values 1993 if drop_children is True: 1994 # Drop mass features that are not cluster parents from self 1995 self.mass_features = { 1996 k: v 1997 for k, v in self.mass_features.items() 1998 if k not in cluster_daughters 1999 } 2000 else: 2001 return cluster_daughters
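As a usage sketch, roll_up_dataframe can also be called directly (it is a static method) to collapse near-duplicate features, keeping the highest-persistence row of each cluster. The values below are invented and the tolerances (5 ppm relative in mz, 0.3 min absolute in scan_time) are only for illustration; the snippet relies on the module-level pandas import.

features = pd.DataFrame(
    {
        "mz": [250.1000, 250.1001, 350.2000],
        "scan_time": [5.01, 5.02, 9.50],
        "persistence": [1.0e6, 2.0e5, 8.0e5],
    }
)

parents = PHCalculations.roll_up_dataframe(
    df=features,
    sort_by="persistence",
    dims=["mz", "scan_time"],
    tol=[5e-6, 0.3],  # mz tolerance is relative; scan_time tolerance is absolute, in minutes
    relative=[True, False],
)
# The 250.1001 / 5.02 row falls within both tolerances of the more persistent
# 250.1000 / 5.01 feature, so only the 250.1000 and 350.2000 rows are returned.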
Methods for performing calculations related to 2D peak picking via persistent homology on LCMS data.
Notes
This class is intended to be used as a mixin for the LCMSBase class.
Methods
- sparse_mean_filter(idx, V, radius=[0, 1, 1]). Sparse implementation of a mean filter.
- embed_unique_indices(a). Creates an array of indices, sorted by unique element.
- sparse_upper_star(idx, V). Sparse implementation of an upper star filtration.
- check_if_grid(data). Check if the data is gridded in mz space.
- grid_data(data). Grid the data in the mz dimension.
- find_mass_features_ph(ms_level=1, grid=True). Find mass features within an LCMSBase object using persistent homology.
- cluster_mass_features(drop_children=True). Cluster regions of interest.
869 @staticmethod 870 def sparse_mean_filter(idx, V, radius=[0, 1, 1]): 871 """Sparse implementation of a mean filter. 872 873 Parameters 874 ---------- 875 idx : :obj:`~numpy.array` 876 Edge indices for each dimension (MxN). 877 V : :obj:`~numpy.array` 878 Array of intensity data (Mx1). 879 radius : float or list 880 Radius of the sparse filter in each dimension. Values less than 881 zero indicate no connectivity in that dimension. 882 883 Returns 884 ------- 885 :obj:`~numpy.array` 886 Filtered intensities (Mx1). 887 888 Notes 889 ----- 890 This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos. 891 This is a static method. 892 """ 893 894 # Copy indices 895 idx = idx.copy().astype(V.dtype) 896 897 # Scale 898 for i, r in enumerate(radius): 899 # Increase inter-index distance 900 if r < 1: 901 idx[:, i] *= 2 902 903 # Do nothing 904 elif r == 1: 905 pass 906 907 # Decrease inter-index distance 908 else: 909 idx[:, i] /= r 910 911 # Connectivity matrix 912 cmat = KDTree(idx) 913 cmat = cmat.sparse_distance_matrix(cmat, 1, p=np.inf, output_type="coo_matrix") 914 cmat.setdiag(1) 915 916 # Pair indices 917 I, J = cmat.nonzero() 918 919 # Delete cmat 920 cmat_shape = cmat.shape 921 del cmat 922 923 # Sum over columns 924 V_sum = sparse.bsr_matrix( 925 (V[J], (I, I)), shape=cmat_shape, dtype=V.dtype 926 ).diagonal(0) 927 928 # Count over columns 929 V_count = sparse.bsr_matrix( 930 (np.ones_like(J), (I, I)), shape=cmat_shape, dtype=V.dtype 931 ).diagonal(0) 932 933 return V_sum / V_count
Sparse implementation of a mean filter.
Parameters
- idx (
~numpy.array
): Edge indices for each dimension (MxN). - V (
~numpy.array
): Array of intensity data (Mx1). - radius (float or list): Radius of the sparse filter in each dimension. Values less than zero indicate no connectivity in that dimension.
Returns
~numpy.array
: Filtered intensities (Mx1).
Notes
This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos. This is a static method.
935 @staticmethod 936 def embed_unique_indices(a): 937 """Creates an array of indices, sorted by unique element. 938 939 Parameters 940 ---------- 941 a : :obj:`~numpy.array` 942 Array of unique elements (Mx1). 943 944 Returns 945 ------- 946 :obj:`~numpy.array` 947 Array of indices (Mx1). 948 949 Notes 950 ----- 951 This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos 952 This is a static method. 953 """ 954 955 def count_tens(n): 956 # Count tens 957 ntens = (n - 1) // 10 958 959 while True: 960 ntens_test = (ntens + n - 1) // 10 961 962 if ntens_test == ntens: 963 return ntens 964 else: 965 ntens = ntens_test 966 967 def arange_exclude_10s(n): 968 # How many 10s will there be? 969 ntens = count_tens(n) 970 971 # Base array 972 arr = np.arange(0, n + ntens) 973 974 # Exclude 10s 975 arr = arr[(arr == 0) | (arr % 10 != 0)][:n] 976 977 return arr 978 979 # Creates an array of indices, sorted by unique element 980 idx_sort = np.argsort(a) 981 idx_unsort = np.argsort(idx_sort) 982 983 # Sorts records array so all unique elements are together 984 sorted_a = a[idx_sort] 985 986 # Returns the unique values, the index of the first occurrence, 987 # and the count for each element 988 vals, idx_start, count = np.unique( 989 sorted_a, return_index=True, return_counts=True 990 ) 991 992 # Splits the indices into separate arrays 993 splits = np.split(idx_sort, idx_start[1:]) 994 995 # Creates unique indices for each split 996 idx_unq = np.concatenate([arange_exclude_10s(len(x)) for x in splits]) 997 998 # Reorders according to input array 999 idx_unq = idx_unq[idx_unsort] 1000 1001 # Magnitude of each index 1002 exp = np.log10( 1003 idx_unq, where=idx_unq > 0, out=np.zeros_like(idx_unq, dtype=np.float64) 1004 ) 1005 idx_unq_mag = np.power(10, np.floor(exp) + 1) 1006 1007 # Result 1008 return a + idx_unq / idx_unq_mag
    @staticmethod
    def roll_up_dataframe(
        df: pd.DataFrame,
        sort_by: str,
        tol: list,
        relative: list,
        dims: list,
        memory_opt_threshold: int = 10000,
    ):
        """Subset data by rolling up into apex in appropriate dimensions.

        Parameters
        ----------
        df : pd.DataFrame
            The input data containing "dims" columns and the "sort_by" column.
        sort_by : str
            The column to sort the data by, this will determine which mass features get rolled up into a parent mass feature
            (i.e., the mass feature with the highest value in the sort_by column).
        tol : list
            A list of tolerances for each dimension. The length of the list must match the number of dimensions.
            The tolerances can be relative (as a fraction of the maximum value in that dimension) or absolute (in the units of that dimension).
            If relative is True, the tolerance will be multiplied by the maximum value in that dimension.
            If relative is False, the tolerance will be used as is.
        relative : list
            A list of booleans indicating whether the tolerance for each dimension is relative (True) or absolute (False).
        dims : list
            A list of dimension names (column names in the data DataFrame) to roll up the mass features by.
        memory_opt_threshold : int, optional
            Minimum number of rows to trigger memory-optimized processing. Default is 10000.

        Returns
        -------
        pd.DataFrame
            A DataFrame with only the rolled up mass features, with the original index and columns.

        Raises
        ------
        ValueError
            If the input data is not a pandas DataFrame.
            If the input data does not have columns for each of the dimensions in "dims".
            If the length of "dims", "tol", and "relative" do not match.
        """
        # Check that data is a DataFrame and has columns for each of the dims,
        # and that dims, tol, and relative are the same length
        if not isinstance(df, pd.DataFrame):
            raise ValueError("Data must be a pandas DataFrame")
        for dim in dims:
            if dim not in df.columns:
                raise ValueError(f"Data must have a column for {dim}")
        if len(dims) != len(tol) or len(dims) != len(relative):
            raise ValueError(
                "Dimensions, tolerances, and relative flags must be the same length"
            )

        og_columns = df.columns.copy()

        # Unindex the data, but keep the original index
        if df.index.name is not None:
            og_index = df.index.name
        else:
            og_index = "index"
        df = df.reset_index(drop=False)

        # Sort data by sort_by column, and reindex
        df = df.sort_values(by=sort_by, ascending=False).reset_index(drop=True)

        # Pre-compute all values arrays
        all_values = [df[dim].values for dim in dims]

        # Choose processing method based on dataframe size
        if len(df) >= memory_opt_threshold:
            # Memory-optimized approach for large dataframes
            distances = PHCalculations._compute_distances_memory_optimized(
                all_values, tol, relative
            )
        else:
            # Faster approach for smaller dataframes
            distances = PHCalculations._compute_distances_original(
                all_values, tol, relative
            )

        # Process pairs with original logic but memory optimizations
        distances = distances.tocoo()
        pairs = np.stack((distances.row, distances.col), axis=1)
        pairs_df = pd.DataFrame(pairs, columns=["parent", "child"]).set_index("parent")
        del distances, pairs  # Free memory immediately

        to_drop = []
        while not pairs_df.empty:
            # Find root_parents and their children (original logic preserved)
            root_parents = np.setdiff1d(
                np.unique(pairs_df.index.values), np.unique(pairs_df.child.values)
            )
            children_of_roots = pairs_df.loc[root_parents, "child"].unique()
            to_drop.extend(children_of_roots)  # Use extend instead of append

            # Remove root_children as possible parents from pairs_df for next iteration
            pairs_df = pairs_df.drop(index=children_of_roots, errors="ignore")
            pairs_df = pairs_df.reset_index().set_index("child")
            # Remove root_children as possible children from pairs_df for next iteration
            pairs_df = pairs_df.drop(index=children_of_roots)

            # Prepare for next iteration
            pairs_df = pairs_df.reset_index().set_index("parent")

        # Convert to numpy array for efficient dropping
        to_drop = np.array(to_drop)

        # Drop mass features that are not cluster parents
        df_sub = df.drop(index=to_drop)

        # Set index back to og_index and only keep original columns
        df_sub = df_sub.set_index(og_index).sort_index()[og_columns]

        return df_sub
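A hypothetical sketch of how the roll-up behaves on a toy table (it assumes the full PHCalculations class, including its private distance helpers, is importable); rows that agree within tolerance in every dimension should collapse onto the row with the larger value in the sort_by column.

import pandas as pd

from corems.mass_spectra.calc.lc_calc import PHCalculations  # assumed class name/location

# Toy input: the first two rows are close in both mz and scan_time, the third is not.
toy = pd.DataFrame(
    {
        "mz": [100.0000, 100.0001, 200.0000],
        "scan_time": [5.00, 5.01, 9.00],
        "persistence": [10.0, 2.0, 7.0],
    }
)

rolled = PHCalculations.roll_up_dataframe(
    df=toy,
    sort_by="persistence",
    dims=["mz", "scan_time"],
    tol=[5e-6, 0.1],        # mz tolerance relative (fraction of max mz); scan_time tolerance absolute (minutes)
    relative=[True, False],
)
# Expected: the 100.0001 row is dropped as a child of the 100.0000 row.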
    def sparse_upper_star(self, idx, V):
        """Sparse implementation of an upper star filtration.

        Parameters
        ----------
        idx : :obj:`~numpy.array`
            Edge indices for each dimension (MxN).
        V : :obj:`~numpy.array`
            Array of intensity data (Mx1).

        Returns
        -------
        idx : :obj:`~numpy.array`
            Index of filtered points (Mx1).
        persistence : :obj:`~numpy.array`
            Persistence of each filtered point (Mx1).

        Notes
        -----
        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos
        """

        # Invert
        V = -1 * V.copy().astype(int)

        # Embed indices
        V = self.embed_unique_indices(V)

        # Connectivity matrix
        cmat = KDTree(idx)
        cmat = cmat.sparse_distance_matrix(cmat, 1, p=np.inf, output_type="coo_matrix")
        cmat.setdiag(1)
        cmat = sparse.triu(cmat)

        # Pairwise edge weights: maxima of the inverted values
        # (i.e., minima of the original intensities)
        I, J = cmat.nonzero()
        d = np.maximum(V[I], V[J])

        # Delete connectivity matrix
        cmat_shape = cmat.shape
        del cmat

        # Sparse distance matrix
        sdm = sparse.coo_matrix((d, (I, J)), shape=cmat_shape)

        # Delete pairwise mins
        del d, I, J

        # Persistent homology
        ph = ripser(sdm, distance_matrix=True, maxdim=0)["dgms"][0]

        # Bound death values
        ph[ph[:, 1] == np.inf, 1] = np.max(V)

        # Construct tree to query against
        tree = KDTree(V.reshape((-1, 1)))

        # Get the indexes of the first nearest neighbor by birth
        _, nn = tree.query(ph[:, 0].reshape((-1, 1)), k=1, workers=-1)

        return nn, -(ph[:, 0] // 1 - ph[:, 1] // 1)
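A minimal, hypothetical sketch of calling the filtration on a one-dimensional signal; `obj` stands in for any instance carrying this mixin (for example an LCMSBase object), and the interpretation of the outputs is hedged rather than guaranteed.

import numpy as np

# Toy intensities on a 1-D grid; `obj` is an assumed instance exposing sparse_upper_star.
signal = np.array([1, 3, 2, 8, 4, 1], dtype=float)
idx = np.arange(len(signal)).reshape(-1, 1)

peaks, persistence = obj.sparse_upper_star(idx, signal)
# `peaks` holds indices associated with local maxima of `signal`;
# `persistence` grows with how far each maximum rises above its surroundings.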
    def check_if_grid(self, data):
        """Check if the data are gridded in mz space.

        Parameters
        ----------
        data : DataFrame
            DataFrame containing the mass spectrometry data. Needs to have mz and scan columns.

        Returns
        -------
        bool
            True if the data is gridded in the mz direction, False otherwise.

        Notes
        -----
        This function is used within the grid_data function and the find_mass_features function and is not intended to be called directly.
        """
        # Calculate the difference between consecutive mz values in a single scan
        dat_check = data.copy().reset_index(drop=True)
        dat_check["mz_diff"] = np.abs(dat_check["mz"].diff())
        mz_diff_min = (
            dat_check.groupby("scan")["mz_diff"].min().min()
        )  # within each scan, what is the smallest mz difference between consecutive mz values

        # Find the minimum mz difference between mz values in the data, regardless of scan
        dat_check_mz = dat_check[["mz"]].drop_duplicates().copy()
        dat_check_mz = dat_check_mz.sort_values(by=["mz"]).reset_index(drop=True)
        dat_check_mz["mz_diff"] = np.abs(dat_check_mz["mz"].diff())

        # Get minimum mz_diff between mz values in the data
        mz_diff_min_raw = dat_check_mz["mz_diff"].min()

        # If the minimum mz difference between mz values in the data is less than the minimum mz difference between mz values within a single scan, then the data is not gridded
        if mz_diff_min_raw < mz_diff_min:
            return False
        else:
            return True
    def grid_data(self, data, attempts=5):
        """Grid the data in the mz dimension.

        Data must be gridded prior to persistent homology calculations and computing the average mass spectrum.

        Parameters
        ----------
        data : DataFrame
            The input data containing mz, scan, scan_time, and intensity columns.
        attempts : int, optional
            The number of attempts to grid the data. Default is 5.

        Returns
        -------
        DataFrame
            The gridded data with mz, scan, scan_time, and intensity columns.

        Raises
        ------
        ValueError
            If gridding fails after the specified number of attempts.
        """
        attempt_i = 0
        while attempt_i < attempts:
            attempt_i += 1
            data = self._grid_data(data)

            if self.check_if_grid(data):
                return data

        if not self.check_if_grid(data):
            raise ValueError(
                "Gridding failed after "
                + str(attempt_i)
                + " attempts. Please check the data."
            )
        else:
            return data
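A hedged sketch of the intended call pattern: check the grid first and regrid only if needed. Here `lcms` is assumed to be an LCMSBase-derived object carrying this mixin, and `ms1_df` a hypothetical DataFrame of raw MS1 data with mz, scan, scan_time, and intensity columns.

# Regrid only when the mz values are not already on a common grid.
if not lcms.check_if_grid(ms1_df):
    ms1_df = lcms.grid_data(ms1_df)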
    def find_mass_features_ph(self, ms_level=1, grid=True):
        """Find mass features within an LCMSBase object using persistent homology.

        Assigns the mass_features attribute to the object (a dictionary of LCMSMassFeature objects, keyed by mass feature id).

        Parameters
        ----------
        ms_level : int, optional
            The MS level to use. Default is 1.
        grid : bool, optional
            If True, will regrid the data before running the persistent homology calculations (after checking if the data is gridded). Default is True.

        Raises
        ------
        ValueError
            If no MS level data is found on the object.
            If data is not gridded and grid is False.

        Returns
        -------
        None, but assigns the mass_features attribute to the object.

        Notes
        -----
        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos
        """
        # Check that ms_level is a key in self._ms_unprocessed
        if ms_level not in self._ms_unprocessed.keys():
            raise ValueError(
                "No MS level "
                + str(ms_level)
                + " data found, did you instantiate with a parser specific to this MS level?"
            )

        # Get ms data
        data = self._ms_unprocessed[ms_level].copy()

        # Drop rows with missing intensity values and reset index
        data = data.dropna(subset=["intensity"]).reset_index(drop=True)

        # Threshold data
        dims = ["mz", "scan_time"]
        threshold = self.parameters.lc_ms.ph_inten_min_rel * data.intensity.max()
        persistence_threshold = (
            self.parameters.lc_ms.ph_persis_min_rel * data.intensity.max()
        )
        data_thres = data[data["intensity"] > threshold].reset_index(drop=True).copy()

        # Check if gridded, if not, grid
        gridded_mz = self.check_if_grid(data_thres)
        if gridded_mz is False:
            if grid is False:
                raise ValueError(
                    "Data are not gridded in mz dimension, try reprocessing with different parameters or grid the data before running this function"
                )
            else:
                data_thres = self.grid_data(data_thres)

        # Add scan_time
        data_thres = data_thres.merge(self.scan_df[["scan", "scan_time"]], on="scan")

        # Process in chunks if required
        if len(data_thres) > 10000:
            return self._find_mass_features_ph_partition(
                data_thres, dims, persistence_threshold
            )
        else:
            # Process all at once
            return self._find_mass_features_ph_single(
                data_thres, dims, persistence_threshold
            )
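A usage sketch with a hypothetical object name: run persistent-homology feature finding on the MS1 data of an LCMSBase-derived object `lcms` that was instantiated with an MS1 parser.

# After this call, mass features are available on lcms.mass_features,
# keyed by mass feature id.
lcms.find_mass_features_ph(ms_level=1, grid=True)
print(len(lcms.mass_features), "mass features found")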
    def find_mass_features_ph_centroid(self, ms_level=1):
        """Find mass features within an LCMSBase object using a persistent homology-type approach but with centroided data.

        Parameters
        ----------
        ms_level : int, optional
            The MS level to use. Default is 1.

        Raises
        ------
        ValueError
            If no MS level data is found on the object.

        Returns
        -------
        None, but assigns the mass_features attribute to the object.
        """
        # Check that ms_level is a key in self._ms_unprocessed
        if ms_level not in self._ms_unprocessed.keys():
            raise ValueError(
                "No MS level "
                + str(ms_level)
                + " data found, did you instantiate with a parser specific to this MS level?"
            )

        # Work with reference instead of copy
        data = self._ms_unprocessed[ms_level]

        # Calculate threshold first to avoid multiple operations
        max_intensity = data["intensity"].max()
        threshold = self.parameters.lc_ms.ph_inten_min_rel * max_intensity

        # Create single filter condition and apply to required columns only
        valid_mask = data["intensity"].notna() & (data["intensity"] > threshold)
        required_cols = ["mz", "intensity", "scan"]
        data_thres = data.loc[valid_mask, required_cols].copy()
        data_thres["persistence"] = data_thres["intensity"]

        # Merge with required scan data
        scan_subset = self.scan_df[["scan", "scan_time"]]
        mf_df = data_thres.merge(scan_subset, on="scan", how="inner")
        del data_thres, scan_subset

        # Order by scan_time and then mz to ensure features near in rt are processed together
        # It's ok that different scans are in different partitions; we will roll up later
        mf_df = mf_df.sort_values(
            by=["scan_time", "mz"], ascending=[True, True]
        ).reset_index(drop=True)
        partition_size = 10000
        partitions = [
            mf_df.iloc[i : i + partition_size].reset_index(drop=True)
            for i in range(0, len(mf_df), partition_size)
        ]
        del mf_df

        # Run roll_up_dataframe on each partition
        rolled_partitions = []
        for part in partitions:
            rolled = self.roll_up_dataframe(
                df=part,
                sort_by="persistence",
                dims=["mz", "scan_time"],
                tol=[
                    self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
                    self.parameters.lc_ms.mass_feature_cluster_rt_tolerance,
                ],
                relative=[True, False],
            )
            rolled_partitions.append(rolled)
        del partitions

        # Combine results and run a final roll-up to merge features near partition boundaries
        mf_df_final = pd.concat(rolled_partitions, ignore_index=True)
        del rolled_partitions

        # Reorder by persistence before final roll-up
        mf_df_final = mf_df_final.sort_values(
            by="persistence", ascending=False
        ).reset_index(drop=True)

        mf_df_final = self.roll_up_dataframe(
            df=mf_df_final,
            sort_by="persistence",
            dims=["mz", "scan_time"],
            tol=[
                self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
                self.parameters.lc_ms.mass_feature_cluster_rt_tolerance,
            ],
            relative=[True, False],
        )
        # Reset index
        mf_df_final = mf_df_final.reset_index(drop=True)

        # Combine rename and sort operations
        mass_features = (
            mf_df_final.rename(
                columns={"scan": "apex_scan", "scan_time": "retention_time"}
            )
            .sort_values(by="persistence", ascending=False)
            .reset_index(drop=True)
        )
        del mf_df_final  # Free memory

        # Order by persistence and reset index
        mass_features = mass_features.sort_values(
            by="persistence", ascending=False
        ).reset_index(drop=True)

        self.mass_features = {}
        for idx, row in mass_features.iterrows():
            row_dict = row.to_dict()
            lcms_feature = LCMSMassFeature(self, **row_dict)
            self.mass_features[lcms_feature.id] = lcms_feature

        if self.parameters.lc_ms.verbose_processing:
            print("Found " + str(len(mass_features)) + " initial mass features")
    def cluster_mass_features(self, drop_children=True, sort_by="persistence"):
        """Cluster mass features.

        Clusters mass features based on their proximity in the mz and scan_time dimensions, prioritizing the mass features with the highest persistence.

        Parameters
        ----------
        drop_children : bool, optional
            Whether to drop the mass features that are not cluster parents. Default is True.
        sort_by : str, optional
            The column to sort the mass features by, this will determine which mass features get rolled up into a parent mass feature. Default is "persistence".

        Raises
        ------
        ValueError
            If no mass features are found.
            If too many mass features are found.

        Returns
        -------
        None if drop_children is True, otherwise returns a list of mass feature ids that are not cluster parents.
        """
        if self.mass_features is None:
            raise ValueError("No mass features found, run find_mass_features() first")
        if len(self.mass_features) > 400000:
            raise ValueError(
                "Too many mass features of interest found, run find_mass_features() with a higher intensity threshold"
            )
        dims = ["mz", "scan_time"]
        mf_df_og = self.mass_features_to_df()
        mf_df = mf_df_og.copy()

        tol = [
            self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
            self.parameters.lc_ms.mass_feature_cluster_rt_tolerance,
        ]  # mz is relative; scan_time is in minutes
        relative = [True, False]

        # Roll up mass features based on their proximity in the declared dimensions
        mf_df_new = self.roll_up_dataframe(
            df=mf_df, sort_by=sort_by, dims=dims, tol=tol, relative=relative
        )

        mf_df["cluster_parent"] = np.where(
            np.isin(mf_df.index, mf_df_new.index), True, False
        )

        # Get mass feature ids of features that are not cluster parents
        cluster_daughters = mf_df[~mf_df["cluster_parent"]].index.values
        if drop_children is True:
            # Drop mass features that are not cluster parents from self
            self.mass_features = {
                k: v
                for k, v in self.mass_features.items()
                if k not in cluster_daughters
            }
        else:
            return cluster_daughters
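A short sketch of the two clustering modes, again with a hypothetical `lcms` object that already has mass features assigned:

# Default: keep only cluster parents on the object.
lcms.cluster_mass_features(drop_children=True, sort_by="persistence")

# Alternative: inspect which features would be dropped, without modifying the object.
child_ids = lcms.cluster_mass_features(drop_children=False)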