corems.mass_spectra.calc.lc_calc
import numpy as np
import pandas as pd
from ripser import ripser
from scipy import sparse
from scipy.spatial import KDTree

from corems.chroma_peak.factory.chroma_peak_classes import LCMSMassFeature
from corems.mass_spectra.calc import SignalProcessing as sp
from corems.mass_spectra.factory.chromat_data import EIC_Data
from corems.mass_spectrum.input.numpyArray import ms_from_array_profile


def find_closest(A, target):
    """Find the index of closest value in A to each value in target.

    Parameters
    ----------
    A : :obj:`~numpy.array`
        The array to search (blueprint). A must be sorted.
    target : :obj:`~numpy.array`
        The array of values to search for. target must be sorted.

    Returns
    -------
    :obj:`~numpy.array`
        The indices of the closest values in A to each value in target.
    """
    idx = A.searchsorted(target)
    idx = np.clip(idx, 1, len(A) - 1)
    left = A[idx - 1]
    right = A[idx]
    idx -= target - left < right - target
    return idx
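
# Illustrative usage sketch for find_closest (not part of the original module).
# Given two small sorted arrays, the helper returns, for each target value,
# the index of the nearest value in the blueprint array:
#
#     blueprint = np.array([100.0, 100.5, 101.0])
#     targets = np.array([100.1, 100.9])
#     find_closest(blueprint, targets)  # -> array([0, 2]), i.e. 100.0 and 101.0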


class LCCalculations:
    """Methods for performing LC calculations on mass spectra data.

    Notes
    -----
    This class is intended to be used as a mixin for the LCMSBase class.

    Methods
    -------
    * get_max_eic(eic_data).
        Returns the maximum EIC value from the given EIC data. A static method.
    * smooth_tic(tic).
        Smooths the TIC data using the specified smoothing method and settings.
    * eic_centroid_detector(rt, eic, max_eic).
        Performs EIC centroid detection on the given EIC data.
    * find_nearest_scan(rt).
        Finds the nearest scan to the given retention time.
    * get_average_mass_spectrum(scan_list, apex_scan, spectrum_mode="profile", ms_level=1, auto_process=True, use_parser=False, perform_checks=True, polarity=None).
        Returns an averaged mass spectrum object.
    * find_mass_features(ms_level=1).
        Find regions of interest for a given MS level (default is MS1).
    * integrate_mass_features(drop_if_fail=False, ms_level=1).
        Integrate mass features of interest and extract EICs.
    * find_c13_mass_features().
        Evaluate mass features and mark likely C13 isotopes.
    * deconvolute_ms1_mass_features().
        Deconvolute mass features' MS1 mass spectra.
    """

    @staticmethod
    def get_max_eic(eic_data: dict):
        """Returns the maximum EIC value from the given EIC data.

        Notes
        -----
        This is a static method.

        Parameters
        ----------
        eic_data : dict
            A dictionary containing EIC data.

        Returns
        -------
        float
            The maximum EIC value.
        """
        max_eic = 0
        for eic_data in eic_data.values():
            ind_max_eic = max(eic_data.get("EIC"))
            max_eic = ind_max_eic if ind_max_eic > max_eic else max_eic

        return max_eic

    def smooth_tic(self, tic):
        """Smooths the TIC or EIC data using the specified smoothing method and settings.

        Parameters
        ----------
        tic : numpy.ndarray
            The TIC (or EIC) data to be smoothed.

        Returns
        -------
        numpy.ndarray
            The smoothed TIC data.
        """
        implemented_smooth_method = self.parameters.lc_ms.implemented_smooth_method

        pol_order = self.parameters.lc_ms.savgol_pol_order

        window_len = self.parameters.lc_ms.smooth_window

        window = self.parameters.lc_ms.smooth_method

        return sp.smooth_signal(
            tic, window_len, window, pol_order, implemented_smooth_method
        )

    def eic_centroid_detector(self, rt, eic, max_eic, apex_indexes=[]):
        """Performs EIC centroid detection on the given EIC data.

        Parameters
        ----------
        rt : numpy.ndarray
            The retention time data.
        eic : numpy.ndarray
            The EIC data.
        max_eic : float
            The maximum EIC value.
        apex_indexes : list, optional
            The apexes of the EIC peaks. Defaults to [], which means that the apexes will be calculated by the function.

        Returns
        -------
        list
            The (left, apex, right) index limits of each detected peak.
        """

        max_prominence = self.parameters.lc_ms.peak_max_prominence_percent

        max_height = self.parameters.lc_ms.peak_height_max_percent

        signal_threshold = self.parameters.lc_ms.eic_signal_threshold

        min_peak_datapoints = self.parameters.lc_ms.min_peak_datapoints

        peak_derivative_threshold = self.parameters.lc_ms.peak_derivative_threshold

        include_indexes = sp.peak_picking_first_derivative(
            domain=rt,
            signal=eic,
            max_height=max_height,
            max_prominence=max_prominence,
            max_signal=max_eic,
            min_peak_datapoints=min_peak_datapoints,
            peak_derivative_threshold=peak_derivative_threshold,
            signal_threshold=signal_threshold,
            correct_baseline=False,
            plot_res=False,
            apex_indexes=apex_indexes,
        )
        # include_indexes is a generator of tuples (left_index, apex_index, right_index)
        include_indexes = list(include_indexes)
        # Check that there are at least 1/2 of min_peak_datapoints on either side of the apex
        indices = [x for x in include_indexes]
        for idx in indices:
            if (idx[1] - idx[0] < min_peak_datapoints / 2) or (
                idx[2] - idx[1] < min_peak_datapoints / 2
            ):
                include_indexes.remove(idx)
        return include_indexes
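
    # Illustrative sketch (not part of the original module): eic_centroid_detector
    # returns (left, apex, right) index tuples into the EIC arrays, e.g.
    #
    #     peaks = lcms.eic_centroid_detector(eic.time, eic.eic_smoothed, eic.eic_smoothed.max())
    #     for left, apex, right in peaks:
    #         print(eic.time[left], eic.time[apex], eic.time[right])
    #
    # where `lcms` is an LCMSBase-derived object and `eic` an EIC_Data instance
    # (names assumed for illustration).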

    def find_nearest_scan(self, rt):
        """Finds the nearest scan to the given retention time.

        Parameters
        ----------
        rt : float
            The retention time (in minutes) to find the nearest scan for.

        Returns
        -------
        int
            The scan number of the nearest scan.
        """
        array_rt = np.array(self.retention_time)

        scan_index = (np.abs(array_rt - rt)).argmin()

        real_scan = self.scans_number[scan_index]

        return real_scan

    def add_peak_metrics(self, remove_by_metrics=True):
        """Add peak metrics to the mass features.

        This function calculates the peak metrics for each mass feature and adds them to the mass feature objects.

        Parameters
        ----------
        remove_by_metrics : bool, optional
            If True, remove mass features based on their peak metrics such as S/N, Gaussian similarity,
            dispersity index, and noise score. Default is True; removal also requires the corresponding
            setting in the processing parameters to be enabled.
            If False, peak metrics are calculated but no mass features are removed, regardless of the setting in the processing parameters.
        """
        # Check that at least some mass features have EIC data
        if not any([mf._eic_data is not None for mf in self.mass_features.values()]):
            raise ValueError(
                "No mass features have EIC data. Run integrate_mass_features first."
            )

        for mass_feature in self.mass_features.values():
            # Check if the mass feature has been integrated
            if mass_feature._eic_data is not None:
                # Calculate peak metrics
                mass_feature.calc_half_height_width()
                mass_feature.calc_tailing_factor()
                mass_feature.calc_dispersity_index()
                mass_feature.calc_gaussian_similarity()
                mass_feature.calc_noise_score()

        # Remove mass features by peak metrics if designated in parameters
        if self.parameters.lc_ms.remove_mass_features_by_peak_metrics and remove_by_metrics:
            self._remove_mass_features_by_peak_metrics()
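
    # Typical call order (illustrative, not part of the original module):
    #
    #     lcms.find_mass_features()
    #     lcms.integrate_mass_features()   # populates the EIC data needed for peak metrics
    #     lcms.add_peak_metrics()          # half-height width, tailing, dispersity, noise score
    #
    # where `lcms` is an object of the LCMSBase class that mixes in LCCalculations.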

    def get_average_mass_spectrum(
        self,
        scan_list,
        apex_scan,
        spectrum_mode="profile",
        ms_level=1,
        auto_process=True,
        use_parser=False,
        perform_checks=True,
        polarity=None,
        ms_params=None,
    ):
        """Returns an averaged mass spectrum object

        Parameters
        ----------
        scan_list : list
            List of scan numbers to average.
        apex_scan : int
            Number of the apex scan.
        spectrum_mode : str, optional
            The spectrum mode to use. Defaults to "profile". Note that only "profile" mode is supported for averaging.
        ms_level : int, optional
            The MS level to use. Defaults to 1.
        auto_process : bool, optional
            If True, the averaged mass spectrum will be auto-processed. Defaults to True.
        use_parser : bool, optional
            If True, the mass spectra will be obtained from the parser. Defaults to False.
        perform_checks : bool, optional
            If True, the function will check if the data are within the ms_unprocessed dictionary and are the correct mode. Defaults to True.
            Only set to False if you are sure the data are profile mode and (if not using the parser) are in the ms_unprocessed dictionary,
            which must also be indexed on scan.
        polarity : int, optional
            The polarity of the mass spectra (1 or -1). If not set, the polarity will be determined from the dataset. Defaults to None. (fastest if set to -1 or 1)
        ms_params : MSParameters, optional
            The mass spectrum parameters to use. If not set (None), the globally set parameters will be used. Defaults to None.

        Returns
        -------
        MassSpectrumProfile
            The averaged mass spectrum object.

        Raises
        ------
        ValueError
            If the spectrum mode is not "profile".
            If the MS level is not found in the unprocessed mass spectra dictionary.
            If not all scan numbers are found in the unprocessed mass spectra dictionary.
        """
        if perform_checks:
            if spectrum_mode != "profile":
                raise ValueError("Averaging only supported for profile mode")

        if polarity is None:
            # set polarity to -1 if negative mode, 1 if positive mode (for mass spectrum creation)
            if self.polarity == "negative":
                polarity = -1
            elif self.polarity == "positive":
                polarity = 1
            else:
                raise ValueError(
                    "Polarity not set for dataset, must be either 'positive' or 'negative'"
                )

        # if not using the parser, check that scan numbers are in _ms_unprocessed
        if not use_parser:
            if perform_checks:
                # Set index to scan for faster lookup
                ms_df = (
                    self._ms_unprocessed[ms_level]
                    .copy()
                    .set_index("scan", drop=False)
                    .sort_index()
                )
                my_ms_df = ms_df.loc[scan_list]
                # Check that all scan numbers are in the ms_df
                if not all(np.isin(scan_list, ms_df.index)):
                    raise ValueError(
                        "Not all scan numbers found in the unprocessed mass spectra dictionary"
                    )
            else:
                my_ms_df = (
                    pd.DataFrame({"scan": scan_list})
                    .set_index("scan")
                    .join(self._ms_unprocessed[ms_level], how="left")
                )

        if use_parser:
            ms_list = [
                self.spectra_parser.get_mass_spectrum_from_scan(
                    x, spectrum_mode=spectrum_mode, auto_process=False
                )
                for x in scan_list
            ]
            ms_mz = [x._mz_exp for x in ms_list]
            ms_int = [x._abundance for x in ms_list]
            my_ms_df = []
            for i in np.arange(len(ms_mz)):
                my_ms_df.append(
                    pd.DataFrame(
                        {"mz": ms_mz[i], "intensity": ms_int[i], "scan": scan_list[i]}
                    )
                )
            my_ms_df = pd.concat(my_ms_df)

        if not self.check_if_grid(my_ms_df):
            my_ms_df = self.grid_data(my_ms_df)

        my_ms_ave = my_ms_df.groupby("mz")["intensity"].sum().reset_index()

        ms = ms_from_array_profile(
            my_ms_ave.mz,
            my_ms_ave.intensity,
            self.file_location,
            polarity=polarity,
            auto_process=False,
        )

        # Set the mass spectrum parameters, auto-process if auto_process is True, and add to the dataset
        if ms is not None:
            if ms_params is not None:
                ms.parameters = ms_params
            ms.scan_number = apex_scan
            if auto_process:
                ms.process_mass_spec()
        return ms
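
    # Illustrative sketch (not part of the original module): average the scans spanning
    # a mass feature's EIC into a single profile spectrum, e.g.
    #
    #     scans = list(range(mf.start_scan, mf.final_scan + 1))
    #     ms_avg = lcms.get_average_mass_spectrum(scans, apex_scan=mf.apex_scan)
    #
    # where `lcms` is an LCMSBase-derived object and `mf` a mass feature with
    # start_scan/final_scan/apex_scan populated (names assumed for illustration).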

    def find_mass_features(self, ms_level=1, grid=True):
        """Find mass features within an LCMSBase object

        Note that this is a wrapper function that calls the find_mass_features_ph function, but can be extended to support other peak picking methods in the future.

        Parameters
        ----------
        ms_level : int, optional
            The MS level to use for peak picking. Default is 1.
        grid : bool, optional
            If True, will regrid the data before running the persistent homology calculations (after checking if the data is gridded),
            used for persistent homology peak picking for profile data only. Default is True.

        Raises
        ------
        ValueError
            If no MS level data is found on the object.
            If persistent homology peak picking is attempted on non-profile mode data.
            If data is not gridded and grid is False.
            If peak picking method is not implemented.

        Returns
        -------
        None, but assigns the mass_features and eics attributes to the object.

        """
        pp_method = self.parameters.lc_ms.peak_picking_method

        if pp_method == "persistent homology":
            msx_scan_df = self.scan_df[self.scan_df["ms_level"] == ms_level]
            if all(msx_scan_df["ms_format"] == "profile"):
                self.find_mass_features_ph(ms_level=ms_level, grid=grid)
            else:
                raise ValueError(
                    "MS{} scans are not profile mode, which is required for persistent homology peak picking.".format(
                        ms_level
                    )
                )
        elif pp_method == "centroided_persistent_homology":
            msx_scan_df = self.scan_df[self.scan_df["ms_level"] == ms_level]
            if all(msx_scan_df["ms_format"] == "centroid"):
                self.find_mass_features_ph_centroid(ms_level=ms_level)
            else:
                raise ValueError(
                    "MS{} scans are not centroid mode, which is required for persistent homology centroided peak picking.".format(
                        ms_level
                    )
                )
        else:
            raise ValueError("Peak picking method not implemented")

        # Remove redundant (noisy) mass features if designated in parameters
        if self.parameters.lc_ms.remove_redundant_mass_features:
            self._remove_redundant_mass_features()

    def integrate_mass_features(
        self, drop_if_fail=True, drop_duplicates=True, ms_level=1
    ):
        """Integrate mass features and extract EICs.

        Populates the _eics attribute on the LCMSBase object for each unique mz in the mass_features dataframe and adds data (start_scan, final_scan, area) to the mass_features attribute.

        Parameters
        ----------
        drop_if_fail : bool, optional
            Whether to drop mass features if the EIC limit calculations fail.
            Default is True.
        drop_duplicates : bool, optional
            Whether to drop mass features that appear to be duplicates
            (i.e., mz is similar to another mass feature and limits of the EIC are similar or encapsulating).
            Default is True.
        ms_level : int, optional
            The MS level to use. Default is 1.

        Raises
        ------
        ValueError
            If no mass features are found.
            If no MS level data is found for the given MS level (either in data or in the scan data)

        Returns
        -------
        None, but populates the eics attribute on the LCMSBase object and adds data (start_scan, final_scan, area) to the mass_features attribute.

        Notes
        -----
        drop_if_fail is useful for discarding mass features that do not have good shapes, usually due to a detection on a shoulder of a peak or a noisy region (especially if minimal smoothing is used during mass feature detection).
        """
        # Check if there is data
        if ms_level in self._ms_unprocessed.keys():
            raw_data = self._ms_unprocessed[ms_level].copy()
        else:
            raise ValueError("No MS level " + str(ms_level) + " data found")
        if self.mass_features is not None:
            mf_df = self.mass_features_to_df().copy()
        else:
            raise ValueError(
                "No mass features found, did you run find_mass_features() first?"
            )

        # Subset scan data to only include correct ms_level
        scan_df_sub = self.scan_df[
            self.scan_df["ms_level"] == int(ms_level)
        ].reset_index(drop=True)
        if scan_df_sub.empty:
            raise ValueError("No MS level " + str(ms_level) + " data found in scan data")
        scan_df_sub = scan_df_sub[["scan", "scan_time"]].copy()

        mzs_to_extract = np.unique(mf_df["mz"].values)
        mzs_to_extract.sort()

        # Pre-sort raw_data by mz for faster filtering
        raw_data_sorted = raw_data.sort_values(["mz", "scan"]).reset_index(drop=True)
        raw_data_mz = raw_data_sorted["mz"].values

        # Get EICs for each unique mz in mass features list
        for mz in mzs_to_extract:
            mz_max = mz + self.parameters.lc_ms.eic_tolerance_ppm * mz / 1e6
            mz_min = mz - self.parameters.lc_ms.eic_tolerance_ppm * mz / 1e6

            # Use binary search for faster mz range filtering
            left_idx = np.searchsorted(raw_data_mz, mz_min, side="left")
            right_idx = np.searchsorted(raw_data_mz, mz_max, side="right")
            raw_data_sub = raw_data_sorted.iloc[left_idx:right_idx].copy()

            raw_data_sub = (
                raw_data_sub.groupby(["scan"])["intensity"].sum().reset_index()
            )
            raw_data_sub = scan_df_sub.merge(raw_data_sub, on="scan", how="left")
            raw_data_sub["intensity"] = raw_data_sub["intensity"].fillna(0)
            myEIC = EIC_Data(
                scans=raw_data_sub["scan"].values,
                time=raw_data_sub["scan_time"].values,
                eic=raw_data_sub["intensity"].values,
            )
            # Smooth EIC
            smoothed_eic = self.smooth_tic(myEIC.eic)
            smoothed_eic[smoothed_eic < 0] = 0
            myEIC.eic_smoothed = smoothed_eic
            self.eics[mz] = myEIC

        # Get limits of mass features using EIC centroid detector and integrate
        mf_df["area"] = np.nan
        for idx, mass_feature in mf_df.iterrows():
            mz = mass_feature.mz
            apex_scan = mass_feature.apex_scan

            # Pull EIC data and find apex scan index
            myEIC = self.eics[mz]
            self.mass_features[idx]._eic_data = myEIC
            apex_index = np.where(myEIC.scans == apex_scan)[0][0]

            # Find left and right limits of peak using EIC centroid detector, add to EICData
            centroid_eics = self.eic_centroid_detector(
                myEIC.time,
                myEIC.eic_smoothed,
                mass_feature.intensity * 1.1,
                apex_indexes=[int(apex_index)],
            )
            l_a_r_scan_idx = [i for i in centroid_eics]
            if len(l_a_r_scan_idx) > 0:
                # Calculate the number of consecutive scans with intensity > 0 and check that it is above the minimum
                # Find the number of consecutive non-zero values in the EIC segment
                mask = myEIC.eic[l_a_r_scan_idx[0][0] : l_a_r_scan_idx[0][2] + 1] > 0
                # Find the longest run of consecutive True values
                if np.any(mask):
                    # Find indices where mask changes value
                    diff = np.diff(np.concatenate(([0], mask.astype(int), [0])))
                    starts = np.where(diff == 1)[0]
                    ends = np.where(diff == -1)[0]
                    consecutive_scans = (ends - starts).max()
                else:
                    consecutive_scans = 0
                if consecutive_scans < self.parameters.lc_ms.consecutive_scan_min:
                    self.mass_features.pop(idx)
                    continue
                # Add start and final scan to mass_features and EICData
                left_scan, right_scan = (
                    myEIC.scans[l_a_r_scan_idx[0][0]],
                    myEIC.scans[l_a_r_scan_idx[0][2]],
                )
                mf_scan_apex = [(left_scan, int(apex_scan), right_scan)]
                myEIC.apexes = myEIC.apexes + mf_scan_apex
                self.mass_features[idx].start_scan = left_scan
                self.mass_features[idx].final_scan = right_scan

                # Find area under peak using limits from EIC centroid detector, add to mass_features and EICData
                area = np.trapz(
                    myEIC.eic_smoothed[l_a_r_scan_idx[0][0] : l_a_r_scan_idx[0][2] + 1],
                    myEIC.time[l_a_r_scan_idx[0][0] : l_a_r_scan_idx[0][2] + 1],
                )
                mf_df.at[idx, "area"] = area
                myEIC.areas = myEIC.areas + [area]
                self.eics[mz] = myEIC
                self.mass_features[idx]._area = area
            else:
                if drop_if_fail is True:
                    self.mass_features.pop(idx)

        if drop_duplicates:
            # Prepare mass feature dataframe
            mf_df = self.mass_features_to_df()

            # For each mass feature, find all mass features within the clustering tolerance ppm and drop if their start and end times are within another mass feature
            # Keep the first mass feature (highest persistence)
            for idx, mass_feature in mf_df.iterrows():
                mz = mass_feature.mz
                apex_scan = mass_feature.apex_scan

                mf_df["mz_diff_ppm"] = np.abs(mf_df["mz"] - mz) / mz * 10**6
                mf_df_sub = mf_df[
                    mf_df["mz_diff_ppm"]
                    < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel
                    * 10**6
                ].copy()

                # For all mass features within the clustering tolerance, check if the start and end times are within the start and end times of the mass feature
                for idx2, mass_feature2 in mf_df_sub.iterrows():
                    if idx2 != idx:
                        if (
                            mass_feature2.start_scan >= mass_feature.start_scan
                            and mass_feature2.final_scan <= mass_feature.final_scan
                        ):
                            if idx2 in self.mass_features.keys():
                                self.mass_features.pop(idx2)
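
    # Illustrative sketch (not part of the original module) of the run-length trick used
    # above to count consecutive non-zero EIC points between the peak limits:
    #
    #     mask = np.array([0, 3, 5, 0, 2, 2, 2]) > 0
    #     diff = np.diff(np.concatenate(([0], mask.astype(int), [0])))
    #     runs = np.where(diff == -1)[0] - np.where(diff == 1)[0]  # -> array([2, 3])
    #     runs.max()  # -> 3, the longest stretch of consecutive non-zero scans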

    def find_c13_mass_features(self):
        """Mark likely C13 isotopes and connect them to their monoisotopic mass features.

        Returns
        -------
        None, but populates the monoisotopic_mf_id and isotopologue_type attributes of the individual LCMSMassFeatures within the mass_features attribute of the LCMSBase object.

        Raises
        ------
        ValueError
            If no mass features are found.
        """
        verbose = self.parameters.lc_ms.verbose_processing
        if verbose:
            print("evaluating mass features for C13 isotopes")
        if self.mass_features is None:
            raise ValueError("No mass features found, run find_mass_features() first")

        # Data prep for sparse distance matrix
        dims = ["mz", "scan_time"]
        mf_df = self.mass_features_to_df().copy()
        # Drop mass features that have no area (these are likely to be noise)
        mf_df = mf_df[mf_df["area"].notnull()]
        mf_df["mf_id"] = mf_df.index.values
        dims = ["mz", "scan_time"]

        # Sort by ascending mz so we always get the monoisotopic mass first, regardless of the order/intensity of the mass features
        mf_df = mf_df.sort_values(by=["mz"]).reset_index(drop=True).copy()

        mz_diff = 1.003355  # C13-C12 mass difference
        tol = [
            mf_df["mz"].median()
            * self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
            self.parameters.lc_ms.mass_feature_cluster_rt_tolerance * 0.5,
        ]  # mz, in relative; scan_time in minutes

        # Compute inter-feature distances
        distances = None
        for i in range(len(dims)):
            # Construct k-d tree
            values = mf_df[dims[i]].values
            tree = KDTree(values.reshape(-1, 1))

            max_tol = tol[i]
            if dims[i] == "mz":
                # Maximum absolute tolerance
                max_tol = mz_diff + tol[i]

            # Compute sparse distance matrix
            # the larger the max_tol, the slower this operation is
            sdm = tree.sparse_distance_matrix(tree, max_tol, output_type="coo_matrix")

            # Only consider forward case, exclude diagonal
            sdm = sparse.triu(sdm, k=1)

            if dims[i] == "mz":
                min_tol = mz_diff - tol[i]
                # Get only the ones that are above the min tol
                idx = sdm.data > min_tol

                # Reconstruct sparse distance matrix
                sdm = sparse.coo_matrix(
                    (sdm.data[idx], (sdm.row[idx], sdm.col[idx])),
                    shape=(len(values), len(values)),
                )

            # Cast as binary matrix
            sdm.data = np.ones_like(sdm.data)

            # Stack distances
            if distances is None:
                distances = sdm
            else:
                distances = distances.multiply(sdm)

        # Extract indices of within-tolerance points
        distances = distances.tocoo()
        pairs = np.stack((distances.row, distances.col), axis=1)  # C12 to C13 pairs

        # Turn pairs (which are indexes of mf_df) into mf_id and then into two dataframes to join to mf_df
        pairs_mf = pairs.copy()
        pairs_mf[:, 0] = mf_df.iloc[pairs[:, 0]].mf_id.values
        pairs_mf[:, 1] = mf_df.iloc[pairs[:, 1]].mf_id.values

        # Connect monoisotopic masses with isotopologues within mass_features
        monos = np.setdiff1d(np.unique(pairs_mf[:, 0]), np.unique(pairs_mf[:, 1]))
        for mono in monos:
            self.mass_features[mono].monoisotopic_mf_id = mono
        pairs_iso_df = pd.DataFrame(pairs_mf, columns=["parent", "child"])
        while not pairs_iso_df.empty:
            pairs_iso_df = pairs_iso_df.set_index("parent", drop=False)
            m1_isos = pairs_iso_df.loc[monos, "child"].unique()
            for iso in m1_isos:
                # Set monoisotopic_mf_id and isotopologue_type for isotopologues
                parent = pairs_mf[pairs_mf[:, 1] == iso, 0]
                if len(parent) > 1:
                    # Choose the parent that is closest in time to the isotopologue
                    parent_time = [self.mass_features[p].retention_time for p in parent]
                    time_diff = [
                        np.abs(self.mass_features[iso].retention_time - x)
                        for x in parent_time
                    ]
                    parent = parent[np.argmin(time_diff)]
                else:
                    parent = parent[0]
                self.mass_features[iso].monoisotopic_mf_id = self.mass_features[
                    parent
                ].monoisotopic_mf_id
                if self.mass_features[iso].monoisotopic_mf_id is not None:
                    mass_diff = (
                        self.mass_features[iso].mz
                        - self.mass_features[
                            self.mass_features[iso].monoisotopic_mf_id
                        ].mz
                    )
                    self.mass_features[iso].isotopologue_type = "13C" + str(
                        int(round(mass_diff, 0))
                    )

            # Drop the mono and iso from the pairs_iso_df
            pairs_iso_df = pairs_iso_df.drop(
                index=monos, errors="ignore"
            )  # Drop pairs where the parent is a child that is a child of a root
            pairs_iso_df = pairs_iso_df.set_index("child", drop=False)
            pairs_iso_df = pairs_iso_df.drop(index=m1_isos, errors="ignore")

            if not pairs_iso_df.empty:
                # Get new monos, recognizing that these are just 13C isotopologues that are connected to other 13C isotopologues to repeat the process
                monos = np.setdiff1d(
                    np.unique(pairs_iso_df.parent), np.unique(pairs_iso_df.child)
                )
        if verbose:
            # Report fraction of compounds annotated with isotopes
            mf_df["c13_flag"] = np.where(
                np.logical_or(
                    np.isin(mf_df["mf_id"], pairs_mf[:, 0]),
                    np.isin(mf_df["mf_id"], pairs_mf[:, 1]),
                ),
                1,
                0,
            )
            print(
                str(round(len(mf_df[mf_df["c13_flag"] == 1]) / len(mf_df), ndigits=3))
                + " of mass features have or are C13 isotopes"
            )
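
    # Illustrative sketch (not part of the original module): the pairing above looks for
    # features separated by roughly one C13-C12 mass difference within a narrow window, e.g.
    #
    #     mz_mono = 300.2000
    #     mz_13c_expected = mz_mono + 1.003355   # ~301.2034 for a singly 13C-substituted ion
    #     mz_window = 5e-6 * 300.0               # example absolute window; the code derives it from
    #                                            # the median feature m/z and the relative cluster tolerance
    #
    # Candidates must also fall within half the retention-time cluster tolerance of the parent.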

    def deconvolute_ms1_mass_features(self):
        """Deconvolute MS1 mass features.

        Deconvolute mass features' MS1 spectra based on the correlation of all masses within a spectrum over the EIC of the mass features.

        Parameters
        ----------
        None

        Returns
        -------
        None, but assigns the _ms_deconvoluted_idx, mass_spectrum_deconvoluted_parent,
        and associated_mass_features_deconvoluted attributes to the mass features in the
        mass_features attribute of the LCMSBase object.

        Raises
        ------
        ValueError
            If no mass features are found, must run find_mass_features() first.
            If no EICs are found, must run integrate_mass_features() first.

        """
        # Checks for set mass_features and eics
        if self.mass_features is None:
            raise ValueError(
                "No mass features found, did you run find_mass_features() first?"
            )

        if self.eics == {}:
            raise ValueError(
                "No EICs found, did you run integrate_mass_features() first?"
            )

        if 1 not in self._ms_unprocessed.keys():
            raise ValueError("No unprocessed MS1 spectra found.")

        # Prep ms1 data
        ms1_data = self._ms_unprocessed[1].copy()
        ms1_data = ms1_data.set_index("scan")

        # Prep mass feature summary
        mass_feature_df = self.mass_features_to_df()

        # Loop through each mass feature
        for mf_id, mass_feature in self.mass_features.items():
            # Check that the mass_feature.mz attribute == the mz of the mass feature in the mass_feature_df
            if mass_feature.mz != mass_feature.ms1_peak.mz_exp:
                continue

            # Get the left and right limits of the EIC of the mass feature
            l_scan, _, r_scan = mass_feature._eic_data.apexes[0]

            # Pull from the _ms1_unprocessed data the scan range of interest and sort by mz
            ms1_data_sub = ms1_data.loc[l_scan:r_scan].copy()
            ms1_data_sub = ms1_data_sub.sort_values(by=["mz"]).reset_index(drop=False)

            # Get the centroided masses of the mass feature
            mf_mspeak_mzs = mass_feature.mass_spectrum.mz_exp

            # Find the closest mz in the ms1 data to the centroided masses of the mass feature
            ms1_data_sub["mass_feature_mz"] = mf_mspeak_mzs[
                find_closest(mf_mspeak_mzs, ms1_data_sub.mz.values)
            ]

            # Drop rows where the relative mz difference between the mass feature mz and the ms1 data mz exceeds the cluster tolerance
            ms1_data_sub["mz_diff_rel"] = (
                np.abs(ms1_data_sub["mass_feature_mz"] - ms1_data_sub["mz"])
                / ms1_data_sub["mz"]
            )
            ms1_data_sub = ms1_data_sub[
                ms1_data_sub["mz_diff_rel"]
                < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel
            ].reset_index(drop=True)

            # Group by mass_feature_mz and scan and sum intensity
            ms1_data_sub_group = (
                ms1_data_sub.groupby(["mass_feature_mz", "scan"])["intensity"]
                .sum()
                .reset_index()
            )

            # Calculate the correlation of the intensities of the mass feature and the ms1 data (set to 0 if no intensity)
            corr = (
                ms1_data_sub_group.pivot(
                    index="scan", columns="mass_feature_mz", values="intensity"
                )
                .fillna(0)
                .corr()
            )

            # Subset the correlation matrix to only include the masses of the mass feature and those above the minimum correlation
            decon_corr_min = self.parameters.lc_ms.ms1_deconvolution_corr_min

            # Try/except for KeyError in case the mass feature mz is not in the correlation matrix
            try:
                corr_subset = corr.loc[mass_feature.mz,]
            except KeyError:
                # If the mass feature mz is not in the correlation matrix, skip to the next mass feature
                continue

            corr_subset = corr_subset[corr_subset > decon_corr_min]

            # Get the masses from the mass spectrum that are the result of the deconvolution
            mzs_decon = corr_subset.index.values

            # Get the indices of the mzs_decon in mass_feature.mass_spectrum.mz_exp and assign to the mass feature
            mzs_decon_idx = [
                id
                for id, mz in enumerate(mass_feature.mass_spectrum.mz_exp)
                if mz in mzs_decon
            ]
            mass_feature._ms_deconvoluted_idx = mzs_decon_idx

            # Check if the mass feature's ms1 peak is the largest in the deconvoluted mass spectrum
            if (
                mass_feature.ms1_peak.abundance
                == mass_feature.mass_spectrum.abundance[mzs_decon_idx].max()
            ):
                mass_feature.mass_spectrum_deconvoluted_parent = True
            else:
                mass_feature.mass_spectrum_deconvoluted_parent = False

            # Check for other mass features that are in the deconvoluted mass spectrum and add the deconvoluted mass spectrum to the mass feature
            # Subset mass_feature_df to only include mass features that are within the clustering tolerance
            mass_feature_df_sub = mass_feature_df[
                abs(mass_feature.retention_time - mass_feature_df["scan_time"])
                < self.parameters.lc_ms.mass_feature_cluster_rt_tolerance
            ].copy()
            # Calculate the mz difference in ppm between the mass feature and the peaks in the deconvoluted mass spectrum
            mass_feature_df_sub["mz_diff_ppm"] = [
                np.abs(mzs_decon - mz).min() / mz * 10**6
                for mz in mass_feature_df_sub["mz"]
            ]
            # Subset mass_feature_df to only include mass features that are within the clustering tolerance of the deconvoluted masses
            mfs_associated_decon = mass_feature_df_sub[
                mass_feature_df_sub["mz_diff_ppm"]
                < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel * 10**6
            ].index.values

            mass_feature.associated_mass_features_deconvoluted = mfs_associated_decon
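
    # Illustrative sketch (not part of the original module) of the correlation step used
    # above: intensities are pivoted to a scan x m/z table and correlated column-wise, so
    # masses whose EIC profiles co-vary with the mass feature's own m/z are retained, e.g.
    #
    #     wide = ms1_data_sub_group.pivot(index="scan", columns="mass_feature_mz", values="intensity").fillna(0)
    #     corr_to_feature = wide.corr()[mass_feature.mz]
    #     co_eluting = corr_to_feature[corr_to_feature > decon_corr_min].index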

    def _remove_redundant_mass_features(
        self,
    ) -> None:
        """
        Identify and remove redundant mass features that are likely contaminants based on their m/z values and scan frequency.
        Especially useful for HILIC data where signals do not return to baseline between peaks or for data with significant background noise.

        Contaminants are characterized by:
        1. Similar m/z values (within ppm_tolerance)
        2. High frequency across scan numbers (ubiquitous presence)

        Notes
        -----
        Depends on self.mass_features being populated; uses the parameters in self.parameters.lc_ms for tolerances (mass_feature_cluster_mz_tolerance_rel).
        """
        ppm_tolerance = self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel * 1e6
        min_scan_frequency = self.parameters.lc_ms.redundant_scan_frequency_min
        n_retain = self.parameters.lc_ms.redundant_feature_retain_n

        df = self.mass_features_to_df()

        if df.empty:
            return pd.DataFrame()
        # df index should be mf_id
        if 'mf_id' not in df.columns:
            if 'mf_id' in df.index.names:
                df = df.reset_index()
            else:
                raise ValueError("DataFrame must contain 'mf_id' column or index.")

        # Sort by m/z for efficient grouping
        df_sorted = df.sort_values('mz').reset_index(drop=True)

        # Calculate total number of unique scans for frequency calculation
        # Calculate total possible scans (check the cluster rt tolerance and the min rt and max rt of the data)
        total_time = self.scan_df['scan_time'].max() - self.scan_df['scan_time'].min()
        cluster_rt_tolerance = self.parameters.lc_ms.mass_feature_cluster_rt_tolerance
        # If the feature was detected in every possible scan (and then rolled up), it would be in this many scans
        total_scans = int(total_time / cluster_rt_tolerance) + 1

        # Group similar m/z values using ppm tolerance
        mz_groups = []
        current_group = []

        for i, row in df_sorted.iterrows():
            current_mz = row['mz']

            if not current_group:
                # Start first group
                current_group = [i]
            else:
                # Check if current m/z is within tolerance of group representative
                group_representative_mz = df_sorted.iloc[current_group[0]]['mz']
                ppm_diff = abs(current_mz - group_representative_mz) / group_representative_mz * 1e6

                if ppm_diff <= ppm_tolerance:
                    # Add to current group
                    current_group.append(i)
                else:
                    # Start new group, but first process current group
                    if len(current_group) > 0:
                        mz_groups.append(current_group)
                    current_group = [i]

        # Don't forget the last group
        if current_group:
            mz_groups.append(current_group)

        # Analyze each m/z group for contaminant characteristics
        for group_indices in mz_groups:
            group_data = df_sorted.iloc[group_indices]

            # Calculate group statistics
            unique_scans = group_data['apex_scan'].nunique()
            scan_frequency = unique_scans / total_scans

            # Check if this group meets contaminant criteria
            if scan_frequency >= min_scan_frequency:
                group_data = group_data.sort_values('intensity', ascending=False)
                non_representative_mf_id = group_data.iloc[n_retain:]['mf_id'].tolist()  # These will be removed

                self.mass_features = {
                    k: v for k, v in self.mass_features.items() if k not in non_representative_mf_id
                }

    def _remove_mass_features_by_peak_metrics(self) -> None:
        """Remove mass features based on peak metrics defined in mass_feature_attribute_filter_dict.

        This method filters mass features based on various peak shape metrics and quality indicators
        such as noise scores, Gaussian similarity, tailing factors, dispersity index, etc.

        The filtering criteria are defined in the mass_feature_attribute_filter_dict parameter,
        which should contain attribute names as keys and filter specifications as values.

        Filter specification format:
            {attribute_name: {'value': threshold, 'operator': comparison}}

        Available operators:
            - '>' or 'greater': Keep features where attribute > threshold
            - '<' or 'less': Keep features where attribute < threshold
            - '>=' or 'greater_equal': Keep features where attribute >= threshold
            - '<=' or 'less_equal': Keep features where attribute <= threshold

        Examples:
            - {'noise_score_max': {'value': 0.5, 'operator': '>='}} - Keep features with noise_score_max >= 0.5
            - {'dispersity_index': {'value': 0.1, 'operator': '<'}} - Keep features with dispersity_index < 0.1
            - {'gaussian_similarity': {'value': 0.7, 'operator': '>='}} - Keep features with gaussian_similarity >= 0.7

        Returns
        -------
        None
            Modifies self.mass_features in place by removing filtered features.

        Raises
        ------
        ValueError
            If no mass features are found, if an invalid attribute is specified, or if filter specification is malformed.
        """
        if self.mass_features is None or len(self.mass_features) == 0:
            raise ValueError("No mass features found, run find_mass_features() first")

        filter_dict = self.parameters.lc_ms.mass_feature_attribute_filter_dict

        if not filter_dict:
            # No filtering criteria specified, return early
            return

        verbose = self.parameters.lc_ms.verbose_processing
        initial_count = len(self.mass_features)

        if verbose:
            print(f"Filtering mass features using peak metrics. Initial count: {initial_count}")

        # List to collect IDs of mass features to remove
        features_to_remove = []

        for mf_id, mass_feature in self.mass_features.items():
            should_remove = False

            for attribute_name, filter_spec in filter_dict.items():
                # Validate filter specification structure
                if not isinstance(filter_spec, dict):
                    raise ValueError(f"Filter specification for '{attribute_name}' must be a dictionary with 'value' and 'operator' keys")

                if 'value' not in filter_spec or 'operator' not in filter_spec:
                    raise ValueError(f"Filter specification for '{attribute_name}' must contain both 'value' and 'operator' keys")

                threshold_value = filter_spec['value']
                operator = filter_spec['operator'].lower().strip()

                # Validate operator
                valid_operators = {'>', '<', '>=', '<=', 'greater', 'less', 'greater_equal', 'less_equal'}
                if operator not in valid_operators:
                    raise ValueError(f"Invalid operator '{operator}' for attribute '{attribute_name}'. Valid operators: {valid_operators}")

                # Normalize operator names
                operator_map = {
                    'greater': '>',
                    'less': '<',
                    'greater_equal': '>=',
                    'less_equal': '<='
                }
                operator = operator_map.get(operator, operator)

                # Get the attribute value from the mass feature
                try:
                    if hasattr(mass_feature, attribute_name):
                        attribute_value = getattr(mass_feature, attribute_name)
                    else:
                        raise ValueError(f"Mass feature does not have attribute '{attribute_name}'")

                    # Handle None values or attributes that haven't been calculated
                    if attribute_value is None:
                        if verbose:
                            print(f"Warning: Mass feature {mf_id} has None value for '{attribute_name}'. Removing feature.")
                        should_remove = True
                        break

                    # Handle numpy arrays (like half_height_width which returns mean)
                    if hasattr(attribute_value, '__len__') and not isinstance(attribute_value, str):
                        # For arrays, we use the mean or appropriate summary statistic
                        if attribute_name == 'half_height_width':
                            # half_height_width property already returns the mean
                            pass
                        else:
                            attribute_value = float(np.mean(attribute_value))

                    # Handle NaN values
                    if np.isnan(float(attribute_value)):
                        if verbose:
                            print(f"Warning: Mass feature {mf_id} has NaN value for '{attribute_name}'. Removing feature.")
                        should_remove = True
                        break

                    # Apply the threshold comparison based on operator
                    attribute_value = float(attribute_value)
                    threshold_value = float(threshold_value)

                    if operator == '>' and not (attribute_value > threshold_value):
                        should_remove = True
                        break
                    elif operator == '<' and not (attribute_value < threshold_value):
                        should_remove = True
                        break
                    elif operator == '>=' and not (attribute_value >= threshold_value):
                        should_remove = True
                        break
                    elif operator == '<=' and not (attribute_value <= threshold_value):
                        should_remove = True
                        break

                except (AttributeError, ValueError, TypeError) as e:
                    if verbose:
                        print(f"Error evaluating filter '{attribute_name}' for mass feature {mf_id}: {e}")
                    should_remove = True
                    break

            if should_remove:
                features_to_remove.append(mf_id)

        # Remove filtered mass features
        for mf_id in features_to_remove:
            del self.mass_features[mf_id]

        # Clean up unassociated EICs and ms1 data
        self._remove_unassociated_eics()
        self._remove_unassociated_ms1_spectra()

    def _remove_unassociated_eics(self) -> None:
        """Remove EICs that are not associated with any mass features.

        This method cleans up the eics attribute by removing any EICs that do not correspond to
        any mass features currently stored in the mass_features attribute. This is useful for
        freeing up memory and ensuring that only relevant EICs are retained.

        Returns
        -------
        None
            Modifies self.eics in place by removing unassociated EICs.
        """
        if self.mass_features is None or len(self.mass_features) == 0:
            self.eics = {}
            return

        # Get the set of m/z values associated with current mass features
        associated_mzs = {mf.mz for mf in self.mass_features.values()}

        # Remove EICs that are not associated with any mass features
        self.eics = {mz: eic for mz, eic in self.eics.items() if mz in associated_mzs}

    def _remove_unassociated_ms1_spectra(self) -> None:
        """Remove MS1 spectra that are not associated with any mass features.

        This method cleans up the stored MS1 spectra by removing any that do not correspond to
        any mass features currently stored in the mass_features attribute. This is useful for
        freeing up memory and ensuring that only relevant MS1 spectra are retained.

        Returns
        -------
        None
        """
        if self.mass_features is None or len(self.mass_features) == 0:
            self._ms_unprocessed = {}
            return

        # Get the set of apex scans associated with current mass features
        associated_ms1_scans = {mf.apex_scan for mf in self.mass_features.values()}
        associated_ms1_scans = [int(scan) for scan in associated_ms1_scans]

        # Get keys within the _ms attribute (these are individual MassSpectrum objects)
        current_stored_spectra = list(set(self._ms.keys()))
        if len(current_stored_spectra) == 0:
            return
        current_stored_spectra = [int(scan) for scan in current_stored_spectra]

        # Filter the current_stored_spectra to only ms1 scans
        current_stored_spectra_ms1 = [scan for scan in current_stored_spectra if scan in self.ms1_scans]

        # Remove MS1 spectra that are not associated with any mass features
        scans_to_drop = [scan for scan in current_stored_spectra_ms1 if scan not in associated_ms1_scans]
        for scan in scans_to_drop:
            if scan in self._ms:
                del self._ms[scan]


class PHCalculations:
    """Methods for performing calculations related to 2D peak picking via persistent homology on LCMS data.

    Notes
    -----
    This class is intended to be used as a mixin for the LCMSBase class.

    Methods
    -------
    * sparse_mean_filter(idx, V, radius=[0, 1, 1]).
        Sparse implementation of a mean filter.
    * embed_unique_indices(a).
        Creates an array of indices, sorted by unique element.
    * sparse_upper_star(idx, V).
        Sparse implementation of an upper star filtration.
    * check_if_grid(data).
        Check if the data is gridded in mz space.
    * grid_data(data).
        Grid the data in the mz dimension.
    * find_mass_features_ph(ms_level=1, grid=True).
        Find mass features within an LCMSBase object using persistent homology.
    * cluster_mass_features(drop_children=True).
        Cluster regions of interest.
    """

    @staticmethod
    def sparse_mean_filter(idx, V, radius=[0, 1, 1]):
        """Sparse implementation of a mean filter.

        Parameters
        ----------
        idx : :obj:`~numpy.array`
            Edge indices for each dimension (MxN).
        V : :obj:`~numpy.array`
            Array of intensity data (Mx1).
        radius : float or list
            Radius of the sparse filter in each dimension. Values less than
            zero indicate no connectivity in that dimension.

        Returns
        -------
        :obj:`~numpy.array`
            Filtered intensities (Mx1).

        Notes
        -----
        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos.
        This is a static method.
        """

        # Copy indices
        idx = idx.copy().astype(V.dtype)

        # Scale
        for i, r in enumerate(radius):
            # Increase inter-index distance
            if r < 1:
                idx[:, i] *= 2

            # Do nothing
            elif r == 1:
                pass

            # Decrease inter-index distance
            else:
                idx[:, i] /= r

        # Connectivity matrix
        cmat = KDTree(idx)
        cmat = cmat.sparse_distance_matrix(cmat, 1, p=np.inf, output_type="coo_matrix")
        cmat.setdiag(1)

        # Pair indices
        I, J = cmat.nonzero()

        # Delete cmat
        cmat_shape = cmat.shape
        del cmat

        # Sum over columns
        V_sum = sparse.bsr_matrix(
            (V[J], (I, I)), shape=cmat_shape, dtype=V.dtype
        ).diagonal(0)

        # Count over columns
        V_count = sparse.bsr_matrix(
            (np.ones_like(J), (I, I)), shape=cmat_shape, dtype=V.dtype
        ).diagonal(0)

        return V_sum / V_count

    @staticmethod
    def embed_unique_indices(a):
        """Creates an array of indices, sorted by unique element.

        Parameters
        ----------
        a : :obj:`~numpy.array`
            Array of unique elements (Mx1).

        Returns
        -------
        :obj:`~numpy.array`
            Array of indices (Mx1).

        Notes
        -----
        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos
        This is a static method.
        """

        def count_tens(n):
            # Count tens
            ntens = (n - 1) // 10

            while True:
                ntens_test = (ntens + n - 1) // 10

                if ntens_test == ntens:
                    return ntens
                else:
                    ntens = ntens_test

        def arange_exclude_10s(n):
            # How many 10s will there be?
            ntens = count_tens(n)

            # Base array
            arr = np.arange(0, n + ntens)

            # Exclude 10s
            arr = arr[(arr == 0) | (arr % 10 != 0)][:n]

            return arr

        # Creates an array of indices, sorted by unique element
        idx_sort = np.argsort(a)
        idx_unsort = np.argsort(idx_sort)

        # Sorts records array so all unique elements are together
        sorted_a = a[idx_sort]

        # Returns the unique values, the index of the first occurrence,
        # and the count for each element
        vals, idx_start, count = np.unique(
            sorted_a, return_index=True, return_counts=True
        )

        # Splits the indices into separate arrays
        splits = np.split(idx_sort, idx_start[1:])

        # Creates unique indices for each split
        idx_unq = np.concatenate([arange_exclude_10s(len(x)) for x in splits])

        # Reorders according to input array
        idx_unq = idx_unq[idx_unsort]

        # Magnitude of each index
        exp = np.log10(
            idx_unq, where=idx_unq > 0, out=np.zeros_like(idx_unq, dtype=np.float64)
        )
        idx_unq_mag = np.power(10, np.floor(exp) + 1)

        # Result
        return a + idx_unq / idx_unq_mag
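
    # Illustrative sketch (not part of the original module): embed_unique_indices breaks
    # ties between repeated values by appending a small decimal index, e.g.
    #
    #     PHCalculations.embed_unique_indices(np.array([5, 5, 3]))
    #     # -> approximately array([5. , 5.1, 3. ]); repeated 5s become distinguishable
    #
    # which is what sparse_upper_star relies on to make the inverted intensities unique.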

    @staticmethod
    def roll_up_dataframe(
        df: pd.DataFrame,
        sort_by: str,
        tol: list,
        relative: list,
        dims: list,
        memory_opt_threshold: int = 10000,
    ):
        """Subset data by rolling up into apex in appropriate dimensions.

        Parameters
        ----------
        df : pd.DataFrame
            The input data containing "dims" columns and the "sort_by" column.
        sort_by : str
            The column to sort the data by; this determines which mass features get rolled up into a parent mass feature
            (i.e., the mass feature with the highest value in the sort_by column).
        tol : list
            A list of tolerances for each dimension. The length of the list must match the number of dimensions.
            The tolerances can be relative (as a fraction of the maximum value in that dimension) or absolute (in the units of that dimension).
            If relative is True, the tolerance will be multiplied by the maximum value in that dimension.
            If relative is False, the tolerance will be used as is.
        relative : list
            A list of booleans indicating whether the tolerance for each dimension is relative (True) or absolute (False).
        dims : list
            A list of dimension names (column names in the data DataFrame) to roll up the mass features by.
        memory_opt_threshold : int, optional
            Minimum number of rows to trigger memory-optimized processing. Default is 10000.

        Returns
        -------
        pd.DataFrame
            A DataFrame with only the rolled up mass features, with the original index and columns.

        Raises
        ------
        ValueError
            If the input data is not a pandas DataFrame.
            If the input data does not have columns for each of the dimensions in "dims".
            If the length of "dims", "tol", and "relative" do not match.
        """
        og_columns = df.columns.copy()

        # Unindex the data, but keep the original index
        if df.index.name is not None:
            og_index = df.index.name
        else:
            og_index = "index"
        df = df.reset_index(drop=False)

        # Sort data by sort_by column, and reindex
        df = df.sort_values(by=sort_by, ascending=False).reset_index(drop=True)

        # Check that data is a DataFrame and has columns for each of the dims
        if not isinstance(df, pd.DataFrame):
            raise ValueError("Data must be a pandas DataFrame")
        for dim in dims:
            if dim not in df.columns:
                raise ValueError(f"Data must have a column for {dim}")
        if len(dims) != len(tol) or len(dims) != len(relative):
            raise ValueError(
                "Dimensions, tolerances, and relative flags must be the same length"
            )

        # Pre-compute all values arrays
        all_values = [df[dim].values for dim in dims]

        # Choose processing method based on dataframe size
        if len(df) >= memory_opt_threshold:
            # Memory-optimized approach for large dataframes
            distances = PHCalculations._compute_distances_memory_optimized(
                all_values, tol, relative
            )
        else:
            # Faster approach for smaller dataframes
            distances = PHCalculations._compute_distances_original(
                all_values, tol, relative
            )

        # Process pairs with original logic but memory optimizations
        distances = distances.tocoo()
        pairs = np.stack((distances.row, distances.col), axis=1)
        pairs_df = pd.DataFrame(pairs, columns=["parent", "child"]).set_index("parent")
        del distances, pairs  # Free memory immediately

        to_drop = []
        while not pairs_df.empty:
            # Find root_parents and their children (original logic preserved)
            root_parents = np.setdiff1d(
                np.unique(pairs_df.index.values), np.unique(pairs_df.child.values)
            )
            children_of_roots = pairs_df.loc[root_parents, "child"].unique()
            to_drop.extend(children_of_roots)  # Use extend instead of append

            # Remove root_children as possible parents from pairs_df for next iteration
            pairs_df = pairs_df.drop(index=children_of_roots, errors="ignore")
            pairs_df = pairs_df.reset_index().set_index("child")
            # Remove root_children as possible children from pairs_df for next iteration
            pairs_df = pairs_df.drop(index=children_of_roots)

            # Prepare for next iteration
            pairs_df = pairs_df.reset_index().set_index("parent")

        # Convert to numpy array for efficient dropping
        to_drop = np.array(to_drop)

        # Drop mass features that are not cluster parents
        df_sub = df.drop(index=to_drop)

        # Set index back to og_index and only keep original columns
        df_sub = df_sub.set_index(og_index).sort_index()[og_columns]

        return df_sub
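
    # Illustrative sketch (not part of the original module): roll up nearby rows so only
    # the highest-persistence row in each (mz, scan_time) neighborhood is kept, e.g.
    #
    #     rolled = PHCalculations.roll_up_dataframe(
    #         df=features_df,            # hypothetical DataFrame with mz, scan_time, persistence columns
    #         sort_by="persistence",
    #         dims=["mz", "scan_time"],
    #         tol=[5e-6, 0.3],           # example tolerances: relative for mz, minutes for scan_time
    #         relative=[True, False],
    #     )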

    @staticmethod
    def _compute_distances_original(all_values, tol, relative):
        """Original distance computation method for smaller datasets.

        This method computes the pairwise distances between features in the dataset
        using a straightforward approach. It is suitable for smaller datasets where
        memory usage is not a primary concern.

        Parameters
        ----------
        all_values : list of :obj:`~numpy.array`
            List of arrays containing the values for each dimension.
        tol : list of float
            List of tolerances for each dimension.
        relative : list of bool
            List of booleans indicating whether the tolerance for each dimension is relative (True) or absolute (False).

        Returns
        -------
        :obj:`~scipy.sparse.coo_matrix`
            Sparse matrix indicating pairwise distances within tolerances.
        """
        # Compute inter-feature distances with memory optimization
        distances = None
        for i in range(len(all_values)):
            values = all_values[i]
            # Use single precision if possible to reduce memory
            tree = KDTree(values.reshape(-1, 1).astype(np.float32))

            max_tol = tol[i]
            if relative[i] is True:
                max_tol = tol[i] * values.max()

            # Compute sparse distance matrix with smaller chunks if memory is an issue
            sdm = tree.sparse_distance_matrix(tree, max_tol, output_type="coo_matrix")

            # Only consider forward case, exclude diagonal
            sdm = sparse.triu(sdm, k=1)

            # Process relative distances more efficiently
            if relative[i] is True:
                # Vectorized computation without creating intermediate arrays
                row_values = values[sdm.row]
                valid_idx = sdm.data <= tol[i] * row_values

                # Reconstruct sparse matrix more efficiently
                sdm = sparse.coo_matrix(
                    (
                        np.ones(valid_idx.sum(), dtype=np.uint8),
                        (sdm.row[valid_idx], sdm.col[valid_idx]),
                    ),
                    shape=(len(values), len(values)),
                )
            else:
                # Cast as binary matrix with smaller data type
                sdm.data = np.ones(len(sdm.data), dtype=np.uint8)

            # Stack distances with memory-efficient multiplication
            if distances is None:
                distances = sdm
            else:
                # Use in-place operations where possible
                distances = distances.multiply(sdm)
                del sdm  # Free memory immediately

        return distances

    @staticmethod
    def _compute_distances_memory_optimized(all_values, tol, relative):
        """Memory-optimized distance computation for large datasets.

        This method computes the pairwise distances between features in the dataset
        using a more memory-efficient approach. It is suitable for larger datasets
        where memory usage is a primary concern.

        Parameters
        ----------
        all_values : list of :obj:`~numpy.array`
            List of arrays containing the values for each dimension.
        tol : list of float
            List of tolerances for each dimension.
        relative : list of bool
            List of booleans indicating whether the tolerance for each dimension is relative (True) or absolute (False).

        Returns
        -------
        :obj:`~scipy.sparse.coo_matrix`
            Sparse matrix indicating pairwise distances within tolerances.
        """
        # Compute distance matrix for first dimension (full matrix as before)
        values_0 = all_values[0].astype(np.float32)
        tree_0 = KDTree(values_0.reshape(-1, 1))

        max_tol_0 = tol[0]
        if relative[0] is True:
            max_tol_0 = tol[0] * values_0.max()

        # Compute sparse distance matrix for first dimension
        distances = tree_0.sparse_distance_matrix(
            tree_0, max_tol_0, output_type="coo_matrix"
        )
        distances = sparse.triu(distances, k=1)

        # Process relative distances for first dimension
        if relative[0] is True:
            row_values = values_0[distances.row]
            valid_idx = distances.data <= tol[0] * row_values
            distances = sparse.coo_matrix(
                (
                    np.ones(valid_idx.sum(), dtype=np.uint8),
                    (distances.row[valid_idx], distances.col[valid_idx]),
                ),
                shape=(len(values_0), len(values_0)),
            )
        else:
            distances.data = np.ones(len(distances.data), dtype=np.uint8)

        # For remaining dimensions, work only on chunks defined by first dimension pairs
        if len(all_values) > 1:
            distances_coo = distances.tocoo()
            valid_pairs = []

            # Process each pair from first dimension
            for idx in range(len(distances_coo.data)):
                i, j = distances_coo.row[idx], distances_coo.col[idx]
                is_valid_pair = True

                # Check remaining dimensions for this specific pair
                for dim_idx in range(1, len(all_values)):
                    values = all_values[dim_idx]
                    val_i, val_j = values[i], values[j]

                    max_tol = tol[dim_idx]
                    if relative[dim_idx] is True:
                        max_tol = tol[dim_idx] * values.max()

                    distance_ij = abs(val_i - val_j)

                    # Check if this pair satisfies the tolerance for this dimension
                    if relative[dim_idx] is True:
                        if distance_ij > tol[dim_idx] * val_i:
                            is_valid_pair = False
                            break
                    else:
                        if distance_ij > max_tol:
                            is_valid_pair = False
                            break

                if is_valid_pair:
                    valid_pairs.append((i, j))

            # Rebuild distances matrix with only valid pairs
            if valid_pairs:
                valid_pairs = np.array(valid_pairs)
                distances = sparse.coo_matrix(
                    (
                        np.ones(len(valid_pairs), dtype=np.uint8),
                        (valid_pairs[:, 0], valid_pairs[:, 1]),
                    ),
                    shape=(len(values_0), len(values_0)),
                )
            else:
                # No valid pairs found
                distances = sparse.coo_matrix(
                    (len(values_0), len(values_0)), dtype=np.uint8
                )

        return distances

    def sparse_upper_star(self, idx, V):
        """Sparse implementation of an upper star filtration.

        Parameters
        ----------
        idx : :obj:`~numpy.array`
            Edge indices for each dimension (MxN).
        V : :obj:`~numpy.array`
            Array of intensity data (Mx1).

        Returns
        -------
        idx : :obj:`~numpy.array`
            Index of filtered points (Mx1).
        persistence : :obj:`~numpy.array`
            Persistence of each filtered point (Mx1).
1610 1611 Notes 1612 ----- 1613 This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos 1614 """ 1615 1616 # Invert 1617 V = -1 * V.copy().astype(int) 1618 1619 # Embed indices 1620 V = self.embed_unique_indices(V) 1621 1622 # Connectivity matrix 1623 cmat = KDTree(idx) 1624 cmat = cmat.sparse_distance_matrix(cmat, 1, p=np.inf, output_type="coo_matrix") 1625 cmat.setdiag(1) 1626 cmat = sparse.triu(cmat) 1627 1628 # Pairwise minimums 1629 I, J = cmat.nonzero() 1630 d = np.maximum(V[I], V[J]) 1631 1632 # Delete connectivity matrix 1633 cmat_shape = cmat.shape 1634 del cmat 1635 1636 # Sparse distance matrix 1637 sdm = sparse.coo_matrix((d, (I, J)), shape=cmat_shape) 1638 1639 # Delete pairwise mins 1640 del d, I, J 1641 1642 # Persistence homology 1643 ph = ripser(sdm, distance_matrix=True, maxdim=0)["dgms"][0] 1644 1645 # Bound death values 1646 ph[ph[:, 1] == np.inf, 1] = np.max(V) 1647 1648 # Construct tree to query against 1649 tree = KDTree(V.reshape((-1, 1))) 1650 1651 # Get the indexes of the first nearest neighbor by birth 1652 _, nn = tree.query(ph[:, 0].reshape((-1, 1)), k=1, workers=-1) 1653 1654 return nn, -(ph[:, 0] // 1 - ph[:, 1] // 1) 1655 1656 def check_if_grid(self, data): 1657 """Check if the data are gridded in mz space. 1658 1659 Parameters 1660 ---------- 1661 data : DataFrame 1662 DataFrame containing the mass spectrometry data. Needs to have mz and scan columns. 1663 1664 Returns 1665 ------- 1666 bool 1667 True if the data is gridded in the mz direction, False otherwise. 1668 1669 Notes 1670 ----- 1671 This function is used within the grid_data function and the find_mass_features function and is not intended to be called directly. 1672 """ 1673 # Calculate the difference between consecutive mz values in a single scan 1674 dat_check = data.copy().reset_index(drop=True) 1675 dat_check["mz_diff"] = np.abs(dat_check["mz"].diff()) 1676 mz_diff_min = ( 1677 dat_check.groupby("scan")["mz_diff"].min().min() 1678 ) # within each scan, what is the smallest mz difference between consecutive mz values 1679 1680 # Find the minimum mz difference between mz values in the data, regardless of scan 1681 dat_check_mz = dat_check[["mz"]].drop_duplicates().copy() 1682 dat_check_mz = dat_check_mz.sort_values(by=["mz"]).reset_index(drop=True) 1683 dat_check_mz["mz_diff"] = np.abs(dat_check_mz["mz"].diff()) 1684 1685 # Get minimum mz_diff between mz values in the data 1686 mz_diff_min_raw = dat_check_mz["mz_diff"].min() 1687 1688 # If the minimum mz difference between mz values in the data is less than the minimum mz difference between mz values within a single scan, then the data is not gridded 1689 if mz_diff_min_raw < mz_diff_min: 1690 return False 1691 else: 1692 return True 1693 1694 def grid_data(self, data, attempts=5): 1695 """Grid the data in the mz dimension. 1696 1697 Data must be gridded prior to persistent homology calculations and computing the average mass spectrum. 1698 1699 Parameters 1700 ---------- 1701 data : DataFrame 1702 The input data containing mz, scan, scan_time, and intensity columns. 1703 attempts : int, optional 1704 The number of attempts to grid the data. Default is 5. 1705 1706 Returns 1707 ------- 1708 DataFrame 1709 The gridded data with mz, scan, scan_time, and intensity columns. 1710 1711 Raises 1712 ------ 1713 ValueError 1714 If gridding fails after the specified number of attempts.
1715 """ 1716 attempt_i = 0 1717 while attempt_i < attempts: 1718 attempt_i += 1 1719 data = self._grid_data(data) 1720 1721 if self.check_if_grid(data): 1722 return data 1723 1724 if not self.check_if_grid(data): 1725 raise ValueError( 1726 "Gridding failed after " 1727 + str(attempt_i) 1728 + " attempts. Please check the data." 1729 ) 1730 else: 1731 return data 1732 1733 def _grid_data(self, data): 1734 """Internal method to grid the data in the mz dimension. 1735 1736 Notes 1737 ----- 1738 This method is called by the grid_data method and should not be called directly. 1739 It will attempt to grid the data in the mz dimension by creating a grid of mz values based on the minimum mz difference within each scan, 1740 but it does not check if the data is already gridded or if the gridding is successful. 1741 1742 Parameters 1743 ---------- 1744 data : pd.DataFrame or pl.DataFrame 1745 The input data to grid. 1746 1747 Returns 1748 ------- 1749 pd.DataFrame or pl.DataFrame 1750 The data after attempting to grid it in the mz dimension. 1751 """ 1752 # Calculate the difference between consecutive mz values in a single scan for grid spacing 1753 data_w = data.copy().reset_index(drop=True) 1754 data_w["mz_diff"] = np.abs(data_w["mz"].diff()) 1755 mz_diff_min = data_w.groupby("scan")["mz_diff"].min().min() * 0.99999 1756 1757 # Need high intensity mz values first so they are parents in the output pairs stack 1758 dat_mz = data_w[["mz", "intensity"]].sort_values( 1759 by=["intensity"], ascending=False 1760 ) 1761 dat_mz = dat_mz[["mz"]].drop_duplicates().reset_index(drop=True).copy() 1762 1763 # Construct KD tree 1764 tree = KDTree(dat_mz.mz.values.reshape(-1, 1)) 1765 sdm = tree.sparse_distance_matrix(tree, mz_diff_min, output_type="coo_matrix") 1766 sdm = sparse.triu(sdm, k=1) 1767 sdm.data = np.ones_like(sdm.data) 1768 distances = sdm.tocoo() 1769 pairs = np.stack((distances.row, distances.col), axis=1) 1770 1771 # Cull pairs to just get root 1772 to_drop = [] 1773 while len(pairs) > 0: 1774 root_parents = np.setdiff1d(np.unique(pairs[:, 0]), np.unique(pairs[:, 1])) 1775 id_root_parents = np.isin(pairs[:, 0], root_parents) 1776 children_of_roots = np.unique(pairs[id_root_parents, 1]) 1777 to_drop = np.append(to_drop, children_of_roots) 1778 1779 # Set up pairs array for next iteration by removing pairs with children or parents already dropped 1780 pairs = pairs[~np.isin(pairs[:, 1], to_drop), :] 1781 pairs = pairs[~np.isin(pairs[:, 0], to_drop), :] 1782 dat_mz = dat_mz.reset_index(drop=True).drop(index=np.array(to_drop)) 1783 mz_dat_np = ( 1784 dat_mz[["mz"]] 1785 .sort_values(by=["mz"]) 1786 .reset_index(drop=True) 1787 .values.flatten() 1788 ) 1789 1790 # Sort data by mz and recast mz to nearest value in mz_dat_np 1791 data_w = data_w.sort_values(by=["mz"]).reset_index(drop=True).copy() 1792 data_w["mz_new"] = mz_dat_np[find_closest(mz_dat_np, data_w["mz"].values)] 1793 data_w["mz_diff"] = np.abs(data_w["mz"] - data_w["mz_new"]) 1794 1795 # Rename mz_new as mz; drop mz_diff; groupby scan and mz and sum intensity 1796 new_data_w = data_w.rename(columns={"mz": "mz_orig", "mz_new": "mz"}).copy() 1797 new_data_w = ( 1798 new_data_w.drop(columns=["mz_diff", "mz_orig"]) 1799 .groupby(["scan", "mz"])["intensity"] 1800 .sum() 1801 .reset_index() 1802 ) 1803 new_data_w = ( 1804 new_data_w.sort_values(by=["scan", "mz"], ascending=[True, True]) 1805 .reset_index(drop=True) 1806 .copy() 1807 ) 1808 1809 return new_data_w 1810 1811 def find_mass_features_ph(self, ms_level=1, grid=True): 1812 """Find mass 
features within an LCMSBase object using persistent homology. 1813 1814 Assigns the mass_features attribute to the object (a dictionary of LCMSMassFeature objects, keyed by mass feature id). 1815 1816 Parameters 1817 ---------- 1818 ms_level : int, optional 1819 The MS level to use. Default is 1. 1820 grid : bool, optional 1821 If True, will regrid the data before running the persistent homology calculations (after checking if the data is gridded). Default is True. 1822 1823 Raises 1824 ------ 1825 ValueError 1826 If no MS level data is found on the object. 1827 If data is not gridded and grid is False. 1828 1829 Returns 1830 ------- 1831 None, but assigns the mass_features attribute to the object. 1832 1833 Notes 1834 ----- 1835 This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos 1836 """ 1837 # Check that ms_level is a key in self._ms_unprocessed 1838 if ms_level not in self._ms_unprocessed.keys(): 1839 raise ValueError( 1840 "No MS level " 1841 + str(ms_level) 1842 + " data found, did you instantiate with a parser specific to this MS level?" 1843 ) 1844 1845 # Get ms data 1846 data = self._ms_unprocessed[ms_level].copy() 1847 1848 # Drop rows with missing intensity values and reset index 1849 data = data.dropna(subset=["intensity"]).reset_index(drop=True) 1850 1851 # Threshold data 1852 dims = ["mz", "scan_time"] 1853 threshold = self.parameters.lc_ms.ph_inten_min_rel * data.intensity.max() 1854 persistence_threshold = ( 1855 self.parameters.lc_ms.ph_persis_min_rel * data.intensity.max() 1856 ) 1857 data_thres = data[data["intensity"] > threshold].reset_index(drop=True).copy() 1858 1859 # Check if gridded, if not, grid 1860 gridded_mz = self.check_if_grid(data_thres) 1861 if gridded_mz is False: 1862 if grid is False: 1863 raise ValueError( 1864 "Data are not gridded in the mz dimension; try reprocessing with different parameters or grid the data before running this function" 1865 ) 1866 else: 1867 data_thres = self.grid_data(data_thres) 1868 1869 # Add scan_time 1870 data_thres = data_thres.merge(self.scan_df[["scan", "scan_time"]], on="scan") 1871 # Process in chunks if required 1872 if len(data_thres) > 10000: 1873 return self._find_mass_features_ph_partition( 1874 data_thres, dims, persistence_threshold 1875 ) 1876 else: 1877 # Process all at once 1878 return self._find_mass_features_ph_single( 1879 data_thres, dims, persistence_threshold 1880 ) 1881 1882 def _find_mass_features_ph_single(self, data_thres, dims, persistence_threshold): 1883 """Process all data at once (original logic).""" 1884 # Build factors 1885 factors = { 1886 dim: pd.factorize(data_thres[dim], sort=True)[1].astype(np.float32) 1887 for dim in dims 1888 } 1889 1890 # Build indexes 1891 index = { 1892 dim: np.searchsorted(factors[dim], data_thres[dim]).astype(np.float32) 1893 for dim in factors 1894 } 1895 1896 # Smooth and process 1897 mass_features_df = self._process_partition_ph( 1898 data_thres, index, dims, persistence_threshold 1899 ) 1900 1901 # Roll up within chunk to remove duplicates 1902 mass_features_df = self.roll_up_dataframe( 1903 df=mass_features_df, 1904 sort_by="persistence", 1905 dims=["mz", "scan_time"], 1906 tol=[ 1907 self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel, 1908 self.parameters.lc_ms.mass_feature_cluster_rt_tolerance, 1909 ], 1910 relative=[True, False], 1911 ) 1912 mass_features_df = mass_features_df.reset_index(drop=True) 1913 1914 # Populate mass_features attribute 1915 self._populate_mass_features(mass_features_df) 1916
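# Illustrative sketch (not part of the corems source): the factor/index construction in
# _find_mass_features_ph_single maps each raw (mz, scan_time) value onto integer grid
# coordinates before smoothing and the upper-star filtration. A minimal, hedged example of
# that mapping on toy data, using only pandas.factorize and numpy.searchsorted as above:
#
#     import numpy as np
#     import pandas as pd
#
#     toy = pd.DataFrame(
#         {
#             "mz": [100.0, 100.0, 200.0, 200.0],
#             "scan_time": [1.0, 2.0, 1.0, 2.0],
#             "intensity": [10.0, 50.0, 20.0, 5.0],
#         }
#     )
#     dims = ["mz", "scan_time"]
#     factors = {d: pd.factorize(toy[d], sort=True)[1].astype(np.float32) for d in dims}
#     index = {d: np.searchsorted(factors[d], toy[d]).astype(np.float32) for d in dims}
#     # index["mz"] -> [0., 0., 1., 1.] and index["scan_time"] -> [0., 1., 0., 1.],
#     # i.e. each point's position on the regular grid expected by sparse_upper_star.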
1917 def _find_mass_features_ph_partition(self, data_thres, dims, persistence_threshold): 1918 """Partition the persistent homology mass feature detection for large datasets. 1919 1920 This method splits the input data into overlapping scan partitions, processes each partition to detect mass features 1921 using persistent homology, rolls up duplicates within and across partitions, and populates the mass_features attribute. 1922 1923 Parameters 1924 ---------- 1925 data_thres : pd.DataFrame 1926 The thresholded input data containing mass spectrometry information. 1927 dims : list 1928 List of dimension names (e.g., ["mz", "scan_time"]) used for feature detection. 1929 persistence_threshold : float 1930 Minimum persistence value required for a detected mass feature to be retained. 1931 1932 Returns 1933 ------- 1934 None 1935 Populates the mass_features attribute of the object with detected mass features. 1936 """ 1937 all_mass_features = [] 1938 1939 # Split scans into partitions 1940 unique_scans = sorted(data_thres["scan"].unique()) 1941 unique_scans_n = len(unique_scans) 1942 1943 # Calculate partition size in scans based on goal 1944 partition_size_goal = 5000 1945 scans_per_partition = max( 1946 1, partition_size_goal // (len(data_thres) // unique_scans_n) 1947 ) 1948 if scans_per_partition == 0: 1949 scans_per_partition = 1 1950 1951 # Make partitions based on scans, with overlapping in partitioned scans 1952 scan_overlap = 4 1953 partition_scans = [] 1954 for i in range(0, unique_scans_n, scans_per_partition): 1955 start_idx = max(0, i - scan_overlap) 1956 end_idx = min( 1957 unique_scans_n - 1, i + scans_per_partition - 1 + scan_overlap 1958 ) 1959 scans_group = [int(s) for s in unique_scans[start_idx : end_idx + 1]] 1960 partition_scans.append(scans_group) 1961 1962 # Set index to scan for faster filtering 1963 data_thres = data_thres.set_index("scan") 1964 for scans in partition_scans: 1965 # Determine start and end scan for partition, with 5 scans overlap 1966 partition_data = data_thres.loc[scans].reset_index(drop=False).copy() 1967 1968 if len(partition_data) == 0: 1969 continue 1970 1971 # Build factors for this partition 1972 factors = { 1973 dim: pd.factorize(partition_data[dim], sort=True)[1].astype(np.float32) 1974 for dim in dims 1975 } 1976 1977 # Build indexes 1978 index = { 1979 dim: np.searchsorted(factors[dim], partition_data[dim]).astype( 1980 np.float32 1981 ) 1982 for dim in factors 1983 } 1984 1985 # Process partition 1986 partition_features = self._process_partition_ph( 1987 partition_data, index, dims, persistence_threshold 1988 ) 1989 1990 if len(partition_features) == 0: 1991 continue 1992 1993 # Roll up within partition 1994 partition_features = self.roll_up_dataframe( 1995 df=partition_features, 1996 sort_by="persistence", 1997 dims=["mz", "scan_time"], 1998 tol=[ 1999 self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel, 2000 self.parameters.lc_ms.mass_feature_cluster_rt_tolerance, 2001 ], 2002 relative=[True, False], 2003 ) 2004 partition_features = partition_features.reset_index(drop=True) 2005 2006 if len(partition_features) > 0: 2007 all_mass_features.append(partition_features) 2008 2009 # Combine results from all partitions 2010 if all_mass_features: 2011 combined_features = pd.concat(all_mass_features, ignore_index=True) 2012 2013 # Sort by persistence 2014 combined_features = combined_features.sort_values( 2015 by="persistence", ascending=False 2016 ).reset_index(drop=True) 2017 2018 # Remove duplicates from overlapping regions 2019 
combined_features = self.roll_up_dataframe( 2020 df=combined_features, 2021 sort_by="persistence", 2022 dims=["mz", "scan_time"], 2023 tol=[ 2024 self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel, 2025 self.parameters.lc_ms.mass_feature_cluster_rt_tolerance, 2026 ], 2027 relative=[True, False], 2028 ) 2029 2030 # resort by persistence and reset index 2031 combined_features = combined_features.reset_index(drop=True) 2032 2033 # Populate mass_features attribute 2034 self._populate_mass_features(combined_features) 2035 else: 2036 self.mass_features = {} 2037 2038 def _process_partition_ph(self, partition_data, index, dims, persistence_threshold): 2039 """Process a single partition with persistent homology.""" 2040 # Smooth data 2041 iterations = self.parameters.lc_ms.ph_smooth_it 2042 smooth_radius = [ 2043 self.parameters.lc_ms.ph_smooth_radius_mz, 2044 self.parameters.lc_ms.ph_smooth_radius_scan, 2045 ] 2046 2047 index_array = np.vstack([index[dim] for dim in dims]).T 2048 V = partition_data["intensity"].values 2049 resid = np.inf 2050 2051 for i in range(iterations): 2052 # Previous iteration 2053 V_prev = V.copy() 2054 resid_prev = resid 2055 V = self.sparse_mean_filter(index_array, V, radius=smooth_radius) 2056 2057 # Calculate residual with previous iteration 2058 resid = np.sqrt(np.mean(np.square(V - V_prev))) 2059 2060 # Evaluate convergence 2061 if i > 0: 2062 # Percent change in residual 2063 test = np.abs(resid - resid_prev) / resid_prev 2064 2065 # Exit criteria 2066 if test <= 0: 2067 break 2068 2069 # Overwrite values 2070 partition_data = partition_data.copy() 2071 partition_data["intensity"] = V 2072 2073 # Use persistent homology to find regions of interest 2074 pidx, pers = self.sparse_upper_star(index_array, V) 2075 pidx = pidx[pers > 1] 2076 pers = pers[pers > 1] 2077 2078 if len(pidx) == 0: 2079 return pd.DataFrame() 2080 2081 # Get peaks 2082 peaks = partition_data.iloc[pidx, :].reset_index(drop=True) 2083 2084 # Add persistence column 2085 peaks["persistence"] = pers 2086 mass_features = peaks.sort_values( 2087 by="persistence", ascending=False 2088 ).reset_index(drop=True) 2089 2090 # Filter by persistence threshold 2091 mass_features = mass_features.loc[ 2092 mass_features["persistence"] > persistence_threshold, : 2093 ].reset_index(drop=True) 2094 2095 return mass_features 2096 2097 def _populate_mass_features(self, mass_features_df): 2098 """Populate the mass_features attribute from a DataFrame. 2099 2100 Parameters 2101 ---------- 2102 mass_features_df : pd.DataFrame 2103 DataFrame containing mass feature information. 2104 Note that the order of this DataFrame will determine the order of mass features in the mass_features attribute. 2105 2106 Returns 2107 ------- 2108 None, but assigns the mass_features attribute to the object. 
2109 """ 2110 # Rename scan column to apex_scan 2111 mass_features_df = mass_features_df.rename( 2112 columns={"scan": "apex_scan", "scan_time": "retention_time"} 2113 ) 2114 2115 # Populate mass_features attribute 2116 self.mass_features = {} 2117 for row in mass_features_df.itertuples(): 2118 row_dict = mass_features_df.iloc[row.Index].to_dict() 2119 lcms_feature = LCMSMassFeature(self, **row_dict) 2120 self.mass_features[lcms_feature.id] = lcms_feature 2121 2122 if self.parameters.lc_ms.verbose_processing: 2123 print("Found " + str(len(mass_features_df)) + " initial mass features") 2124 2125 def find_mass_features_ph_centroid(self, ms_level=1): 2126 """Find mass features within an LCMSBase object using persistent homology-type approach but with centroided data. 2127 2128 Parameters 2129 ---------- 2130 ms_level : int, optional 2131 The MS level to use. Default is 1. 2132 2133 Raises 2134 ------ 2135 ValueError 2136 If no MS level data is found on the object. 2137 2138 Returns 2139 ------- 2140 None, but assigns the mass_features attribute to the object. 2141 """ 2142 # Check that ms_level is a key in self._ms_uprocessed 2143 if ms_level not in self._ms_unprocessed.keys(): 2144 raise ValueError( 2145 "No MS level " 2146 + str(ms_level) 2147 + " data found, did you instantiate with parser specific to MS level?" 2148 ) 2149 2150 # Work with reference instead of copy 2151 data = self._ms_unprocessed[ms_level] 2152 2153 # Calculate threshold first to avoid multiple operations 2154 max_intensity = data["intensity"].max() 2155 threshold = self.parameters.lc_ms.ph_inten_min_rel * max_intensity 2156 2157 # Create single filter condition and apply to required columns only 2158 valid_mask = data["intensity"].notna() & (data["intensity"] > threshold) 2159 required_cols = ["mz", "intensity", "scan"] 2160 data_thres = data.loc[valid_mask, required_cols].copy() 2161 data_thres["persistence"] = data_thres["intensity"] 2162 2163 # Merge with required scan data 2164 scan_subset = self.scan_df[["scan", "scan_time"]] 2165 mf_df = data_thres.merge(scan_subset, on="scan", how="inner") 2166 del data_thres, scan_subset 2167 2168 # Order by scan_time and then mz to ensure features near in rt are processed together 2169 # It's ok that different scans are in different partitions; we will roll up later 2170 mf_df = mf_df.sort_values( 2171 by=["scan_time", "mz"], ascending=[True, True] 2172 ).reset_index(drop=True) 2173 partition_size = 10000 2174 partitions = [ 2175 mf_df.iloc[i : i + partition_size].reset_index(drop=True) 2176 for i in range(0, len(mf_df), partition_size) 2177 ] 2178 del mf_df 2179 2180 # Run roll_up_dataframe on each partition 2181 rolled_partitions = [] 2182 for part in partitions: 2183 rolled = self.roll_up_dataframe( 2184 df=part, 2185 sort_by="persistence", 2186 dims=["mz", "scan_time"], 2187 tol=[ 2188 self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel, 2189 self.parameters.lc_ms.mass_feature_cluster_rt_tolerance, 2190 ], 2191 relative=[True, False], 2192 ) 2193 rolled_partitions.append(rolled) 2194 del partitions 2195 2196 # Run roll_up_dataframe on the rolled_up partitions to merge features near partition boundaries 2197 2198 # Combine results and run a final roll-up to merge features near partition boundaries 2199 mf_df_final = pd.concat(rolled_partitions, ignore_index=True) 2200 del rolled_partitions 2201 2202 # Reorder by persistence before final roll-up 2203 mf_df_final = mf_df_final.sort_values( 2204 by="persistence", ascending=False 2205 ).reset_index(drop=True) 2206 2207 
mf_df_final = self.roll_up_dataframe( 2208 df=mf_df_final, 2209 sort_by="persistence", 2210 dims=["mz", "scan_time"], 2211 tol=[ 2212 self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel, 2213 self.parameters.lc_ms.mass_feature_cluster_rt_tolerance, 2214 ], 2215 relative=[True, False], 2216 ) 2217 # reset index 2218 mf_df_final = mf_df_final.reset_index(drop=True) 2219 2220 # Combine rename and sort operations 2221 mass_features = ( 2222 mf_df_final.rename( 2223 columns={"scan": "apex_scan", "scan_time": "retention_time"} 2224 ) 2225 .sort_values(by="persistence", ascending=False) 2226 .reset_index(drop=True) 2227 ) 2228 del mf_df_final # Free memory 2229 2230 # Order by persistence and reset index 2231 mass_features = mass_features.sort_values( 2232 by="persistence", ascending=False 2233 ).reset_index(drop=True) 2234 2235 self.mass_features = {} 2236 for idx, row in mass_features.iterrows(): 2237 row_dict = row.to_dict() 2238 lcms_feature = LCMSMassFeature(self, **row_dict) 2239 self.mass_features[lcms_feature.id] = lcms_feature 2240 2241 if self.parameters.lc_ms.verbose_processing: 2242 print("Found " + str(len(mass_features)) + " initial mass features") 2243 2244 def cluster_mass_features(self, drop_children=True, sort_by="persistence"): 2245 """Cluster mass features. 2246 2247 Clusters mass features based on their proximity in the mz and scan_time dimensions, prioritizing the mass features with the highest persistence. 2248 2249 Parameters 2250 ---------- 2251 drop_children : bool, optional 2252 Whether to drop the mass features that are not cluster parents. Default is True. 2253 sort_by : str, optional 2254 The column to sort the mass features by; this will determine which mass features get rolled up into a parent mass feature. Default is "persistence". 2255 2256 Raises 2257 ------ 2258 ValueError 2259 If no mass features are found. 2260 If too many mass features are found. 2261 2262 Returns 2263 ------- 2264 None if drop_children is True, otherwise returns a list of mass feature ids that are not cluster parents. 2265 """ 2266 if self.mass_features is None: 2267 raise ValueError("No mass features found, run find_mass_features() first") 2268 if len(self.mass_features) > 400000: 2269 raise ValueError( 2270 "Too many mass features of interest found, run find_mass_features() with a higher intensity threshold" 2271 ) 2272 dims = ["mz", "scan_time"] 2273 mf_df_og = self.mass_features_to_df() 2274 mf_df = mf_df_og.copy() 2275 2276 tol = [ 2277 self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel, 2278 self.parameters.lc_ms.mass_feature_cluster_rt_tolerance, 2279 ] # mz, in relative; scan_time in minutes 2280 relative = [True, False] 2281 2282 # Roll up mass features based on their proximity in the declared dimensions 2283 mf_df_new = self.roll_up_dataframe( 2284 df=mf_df, sort_by=sort_by, dims=dims, tol=tol, relative=relative 2285 ) 2286 2287 mf_df["cluster_parent"] = np.where( 2288 np.isin(mf_df.index, mf_df_new.index), True, False 2289 ) 2290 2291 # Get mass feature ids of features that are not cluster parents 2292 cluster_daughters = mf_df[~mf_df["cluster_parent"]].index.values 2293 if drop_children is True: 2294 # Drop mass features that are not cluster parents from self 2295 self.mass_features = { 2296 k: v 2297 for k, v in self.mass_features.items() 2298 if k not in cluster_daughters 2299 } 2300 else: 2301 return cluster_daughters
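# Illustrative sketch (not part of the corems source): the roll-up and distance helpers
# above share one pattern -- build a KDTree over one dimension, query it against itself
# within a maximum absolute radius, keep only the upper triangle, and then enforce a
# relative (ppm-style) tolerance per pair. A minimal standalone example of that pattern,
# assuming a single m/z dimension and an example 5 ppm relative tolerance (not a corems
# default):

import numpy as np
from scipy import sparse
from scipy.spatial import KDTree

values = np.array([100.0000, 100.0003, 100.2000, 200.0000, 200.0008])  # toy m/z values
rel_tol = 5e-6  # example relative tolerance (5 ppm)

tree = KDTree(values.reshape(-1, 1))
# Search radius uses the largest value so no within-tolerance pair can be missed
sdm = tree.sparse_distance_matrix(tree, rel_tol * values.max(), output_type="coo_matrix")
sdm = sparse.triu(sdm, k=1)  # forward pairs only, diagonal excluded
keep = sdm.data <= rel_tol * values[sdm.row]  # enforce the relative tolerance per pair
pairs = np.stack((sdm.row[keep], sdm.col[keep]), axis=1)
# pairs -> [[0, 1], [3, 4]]: the ~3 ppm and ~4 ppm neighbors are linked,
# while 100.2 has no partner within tolerance.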
Removing feature.") 1053 should_remove = True 1054 break 1055 1056 # Apply the threshold comparison based on operator 1057 attribute_value = float(attribute_value) 1058 threshold_value = float(threshold_value) 1059 1060 if operator == '>' and not (attribute_value > threshold_value): 1061 should_remove = True 1062 break 1063 elif operator == '<' and not (attribute_value < threshold_value): 1064 should_remove = True 1065 break 1066 elif operator == '>=' and not (attribute_value >= threshold_value): 1067 should_remove = True 1068 break 1069 elif operator == '<=' and not (attribute_value <= threshold_value): 1070 should_remove = True 1071 break 1072 1073 except (AttributeError, ValueError, TypeError) as e: 1074 if verbose: 1075 print(f"Error evaluating filter '{attribute_name}' for mass feature {mf_id}: {e}") 1076 should_remove = True 1077 break 1078 1079 if should_remove: 1080 features_to_remove.append(mf_id) 1081 1082 # Remove filtered mass features 1083 for mf_id in features_to_remove: 1084 del self.mass_features[mf_id] 1085 1086 # Clean up unassociated EICs and ms1 data 1087 self._remove_unassociated_eics() 1088 self._remove_unassociated_ms1_spectra() 1089 1090 def _remove_unassociated_eics(self) -> None: 1091 """Remove EICs that are not associated with any mass features. 1092 1093 This method cleans up the eics attribute by removing any EICs that do not correspond to 1094 any mass features currently stored in the mass_features attribute. This is useful for 1095 freeing up memory and ensuring that only relevant EICs are retained. 1096 1097 Returns 1098 ------- 1099 None 1100 Modifies self.eics in place by removing unassociated EICs. 1101 """ 1102 if self.mass_features is None or len(self.mass_features) == 0: 1103 self.eics = {} 1104 return 1105 1106 # Get the set of m/z values associated with current mass features 1107 associated_mzs = {mf.mz for mf in self.mass_features.values()} 1108 1109 # Remove EICs that are not associated with any mass features 1110 self.eics = {mz: eic for mz, eic in self.eics.items() if mz in associated_mzs} 1111 1112 def _remove_unassociated_ms1_spectra(self) -> None: 1113 """Remove MS1 spectra that are not associated with any mass features. 1114 This method cleans up the _ms_unprocessed attribute by removing any MS1 spectra that do not correspond to 1115 any mass features currently stored in the mass_features attribute. This is useful for freeing up memory 1116 and ensuring that only relevant MS1 spectra are retained. 
1117 1118 Returns 1119 ------- 1120 None 1121 """ 1122 if self.mass_features is None or len(self.mass_features) == 0: 1123 self._ms_unprocessed = {} 1124 return 1125 1126 # Get the set of m/z values associated with current mass features 1127 associated_ms1_scans = {mf.apex_scan for mf in self.mass_features.values()} 1128 associated_ms1_scans = [int(scan) for scan in associated_ms1_scans] 1129 1130 # Get keys within the _ms attribute (these are individual MassSpectrum objects) 1131 current_stored_spectra = list(set(self._ms.keys())) 1132 if len(current_stored_spectra) == 0: 1133 return 1134 current_stored_spectra = [int(scan) for scan in current_stored_spectra] 1135 1136 # Filter the current_stored_spectra to only ms1 scans 1137 current_stored_spectra_ms1 = [ scan for scan in current_stored_spectra if scan in self.ms1_scans ] 1138 1139 # Remove MS1 spectra that are not associated with any mass features 1140 scans_to_drop = [scan for scan in current_stored_spectra_ms1 if scan not in associated_ms1_scans] 1141 for scan in scans_to_drop: 1142 if scan in self._ms: 1143 del self._ms[scan]
Methods for performing LC calculations on mass spectra data.
Notes
This class is intended to be used as a mixin for the LCMSBase class.
Methods
- get_max_eic(eic_data). Returns the maximum EIC value from the given EIC data. A static method.
- smooth_tic(tic). Smooths the TIC data using the specified smoothing method and settings.
- eic_centroid_detector(rt, eic, max_eic). Performs EIC centroid detection on the given EIC data.
- find_nearest_scan(rt). Finds the nearest scan to the given retention time.
- get_average_mass_spectrum(scan_list, apex_scan, spectrum_mode="profile", ms_level=1, auto_process=True, use_parser=False, perform_checks=True, polarity=None). Returns an averaged mass spectrum object.
- find_mass_features(ms_level=1, grid=True). Finds regions of interest for a given MS level (default is MS1).
- integrate_mass_features(drop_if_fail=True, drop_duplicates=True, ms_level=1). Integrates mass features of interest and extracts EICs.
- find_c13_mass_features(). Evaluates mass features and marks likely C13 isotopes.
- deconvolute_ms1_mass_features(). Deconvolutes mass features' MS1 mass spectra.
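These methods are typically chained on an LCMSBase-derived object. The sketch below is illustrative only: `lcms` is assumed to be an LCMSBase object whose scan data have already been loaded (how it is constructed is outside this class), and the call order follows the dependencies noted in the individual method docstrings.

    # Illustrative workflow; `lcms` is an already-loaded LCMSBase object (assumption).
    lcms.find_mass_features(ms_level=1)       # detect mass features (method set by parameters.lc_ms.peak_picking_method)
    lcms.integrate_mass_features(ms_level=1)  # extract EICs, peak limits, and areas
    lcms.add_peak_metrics()                   # half-height width, tailing, dispersity, Gaussian similarity, noise score
    lcms.find_c13_mass_features()             # flag 13C isotopologues and link them to monoisotopic features
    lcms.deconvolute_ms1_mass_features()      # correlate co-eluting MS1 masses with each feature's EIC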
66 @staticmethod 67 def get_max_eic(eic_data: dict): 68 """Returns the maximum EIC value from the given EIC data. 69 70 Notes 71 ----- 72 This is a static method. 73 74 Parameters 75 ---------- 76 eic_data : dict 77 A dictionary containing EIC data. 78 79 Returns 80 ------- 81 float 82 The maximum EIC value. 83 """ 84 max_eic = 0 85 for eic_data in eic_data.values(): 86 ind_max_eic = max(eic_data.get("EIC")) 87 max_eic = ind_max_eic if ind_max_eic > max_eic else max_eic 88 89 return max_eic
Returns the maximum EIC value from the given EIC data.
Notes
This is a static method.
Parameters
- eic_data (dict): A dictionary containing EIC data.
Returns
- float: The maximum EIC value.
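A minimal, self-contained sketch (the m/z keys and intensity values below are made up; the only requirement visible in the implementation is that each value exposes an "EIC" entry):

    from corems.mass_spectra.calc.lc_calc import LCCalculations

    eic_data = {
        212.1543: {"EIC": [0.0, 1.2e5, 3.4e6, 2.1e5]},  # hypothetical m/z -> EIC trace
        430.9121: {"EIC": [0.0, 5.0e4, 8.8e5, 1.0e4]},
    }
    LCCalculations.get_max_eic(eic_data)  # returns 3400000.0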
91 def smooth_tic(self, tic): 92 """Smooths the TIC or EIC data using the specified smoothing method and settings. 93 94 Parameters 95 ---------- 96 tic : numpy.ndarray 97 The TIC (or EIC) data to be smoothed. 98 99 Returns 100 ------- 101 numpy.ndarray 102 The smoothed TIC data. 103 """ 104 implemented_smooth_method = self.parameters.lc_ms.implemented_smooth_method 105 106 pol_order = self.parameters.lc_ms.savgol_pol_order 107 108 window_len = self.parameters.lc_ms.smooth_window 109 110 window = self.parameters.lc_ms.smooth_method 111 112 return sp.smooth_signal( 113 tic, window_len, window, pol_order, implemented_smooth_method 114 )
Smooths the TIC or EIC data using the specified smoothing method and settings.
Parameters
- tic (numpy.ndarray): The TIC (or EIC) data to be smoothed.
Returns
- numpy.ndarray: The smoothed TIC data.
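A hedged usage sketch; the smoothing behavior is controlled by parameters.lc_ms (smooth_method, smooth_window, savgol_pol_order), and the values shown are illustrative, not defaults:

    import numpy as np

    lcms.parameters.lc_ms.smooth_window = 5     # illustrative window length
    lcms.parameters.lc_ms.savgol_pol_order = 2  # polynomial order for Savitzky-Golay smoothing
    tic = np.array([0.0, 1.0e5, 4.0e5, 9.0e5, 4.5e5, 1.2e5, 0.0])  # toy TIC trace
    tic_smoothed = lcms.smooth_tic(tic)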
116 def eic_centroid_detector(self, rt, eic, max_eic, apex_indexes=[]): 117 """Performs EIC centroid detection on the given EIC data. 118 119 Parameters 120 ---------- 121 rt : numpy.ndarray 122 The retention time data. 123 eic : numpy.ndarray 124 The EIC data. 125 max_eic : float 126 The maximum EIC value. 127 apex_indexes : list, optional 128 The apexes of the EIC peaks. Defaults to [], which means that the apexes will be calculated by the function. 129 130 Returns 131 ------- 132 numpy.ndarray 133 The indexes of left, apex, and right limits as a generator. 134 """ 135 136 max_prominence = self.parameters.lc_ms.peak_max_prominence_percent 137 138 max_height = self.parameters.lc_ms.peak_height_max_percent 139 140 signal_threshold = self.parameters.lc_ms.eic_signal_threshold 141 142 min_peak_datapoints = self.parameters.lc_ms.min_peak_datapoints 143 144 peak_derivative_threshold = self.parameters.lc_ms.peak_derivative_threshold 145 146 include_indexes = sp.peak_picking_first_derivative( 147 domain=rt, 148 signal=eic, 149 max_height=max_height, 150 max_prominence=max_prominence, 151 max_signal=max_eic, 152 min_peak_datapoints=min_peak_datapoints, 153 peak_derivative_threshold=peak_derivative_threshold, 154 signal_threshold=signal_threshold, 155 correct_baseline=False, 156 plot_res=False, 157 apex_indexes=apex_indexes, 158 ) 159 #include_indexes is a generator of tuples (left_index, apex_index, right_index) 160 include_indexes = list(include_indexes) 161 # Add check to make sure that there are at least 1/2 of min_peak_datapoints on either side of the apex 162 indicies = [x for x in include_indexes] 163 for idx in indicies: 164 if (idx[1] - idx[0] < min_peak_datapoints / 2) or ( 165 idx[2] - idx[1] < min_peak_datapoints / 2 166 ): 167 include_indexes.remove(idx) 168 return include_indexes
Performs EIC centroid detection on the given EIC data.
Parameters
- rt (numpy.ndarray): The retention time data.
- eic (numpy.ndarray): The EIC data.
- max_eic (float): The maximum EIC value.
- apex_indexes (list, optional): The apexes of the EIC peaks. Defaults to [], which means that the apexes will be calculated by the function.
Returns
- list: A list of (left index, apex index, right index) tuples for the detected EIC peaks.
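For illustration, the detector can be run on an EIC already stored on the object (the m/z key below is hypothetical; lcms.eics is populated by integrate_mass_features):

    my_eic = lcms.eics[212.1543]  # EIC_Data object for one m/z (illustrative key)
    peaks = lcms.eic_centroid_detector(
        my_eic.time, my_eic.eic_smoothed, max(my_eic.eic_smoothed)
    )
    for left, apex, right in peaks:  # each entry is a (left, apex, right) index tuple
        print(my_eic.time[left], my_eic.time[apex], my_eic.time[right])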
170 def find_nearest_scan(self, rt): 171 """Finds the nearest scan to the given retention time. 172 173 Parameters 174 ---------- 175 rt : float 176 The retention time (in minutes) to find the nearest scan for. 177 178 Returns 179 ------- 180 int 181 The scan number of the nearest scan. 182 """ 183 array_rt = np.array(self.retention_time) 184 185 scan_index = (np.abs(array_rt - rt)).argmin() 186 187 real_scan = self.scans_number[scan_index] 188 189 return real_scan
Finds the nearest scan to the given retention time.
Parameters
- rt (float): The retention time (in minutes) to find the nearest scan for.
Returns
- int: The scan number of the nearest scan.
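For example (the retention time is arbitrary):

    scan = lcms.find_nearest_scan(12.3)  # scan number closest to 12.3 minutes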
191 def add_peak_metrics(self, remove_by_metrics=True): 192 """Add peak metrics to the mass features. 193 194 This function calculates the peak metrics for each mass feature and adds them to the mass feature objects. 195 196 Parameters 197 ---------- 198 remove_by_metrics : bool, optional 199 If True, remove mass features based on their peak metrics such as S/N, Gaussian similarity, 200 dispersity index, and noise score. Default is True, which checks the setting in the processing parameters. 201 If False, peak metrics are calculated but no mass features are removed, regardless of the setting in the processing parameters. 202 """ 203 # Check that at least some mass features have eic data 204 if not any([mf._eic_data is not None for mf in self.mass_features.values()]): 205 raise ValueError( 206 "No mass features have EIC data. Run integrate_mass_features first." 207 ) 208 209 for mass_feature in self.mass_features.values(): 210 # Check if the mass feature has been integrated 211 if mass_feature._eic_data is not None: 212 # Calculate peak metrics 213 mass_feature.calc_half_height_width() 214 mass_feature.calc_tailing_factor() 215 mass_feature.calc_dispersity_index() 216 mass_feature.calc_gaussian_similarity() 217 mass_feature.calc_noise_score() 218 219 # Remove mass features by peak metrics if designated in parameters 220 if self.parameters.lc_ms.remove_mass_features_by_peak_metrics and remove_by_metrics: 221 self._remove_mass_features_by_peak_metrics()
Add peak metrics to the mass features.
This function calculates the peak metrics for each mass feature and adds them to the mass feature objects.
Parameters
- remove_by_metrics (bool, optional): If True, remove mass features based on their peak metrics such as S/N, Gaussian similarity, dispersity index, and noise score, provided the remove_mass_features_by_peak_metrics setting in the processing parameters is also enabled. Default is True. If False, peak metrics are calculated but no mass features are removed, regardless of the processing-parameter setting.
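A hedged sketch of the two modes; the filter thresholds are illustrative, and the attribute names follow the filter format documented for _remove_mass_features_by_peak_metrics in the module source above:

    # Compute metrics only, without removing anything.
    lcms.add_peak_metrics(remove_by_metrics=False)

    # Compute metrics and remove poor features (thresholds illustrative).
    lcms.parameters.lc_ms.remove_mass_features_by_peak_metrics = True
    lcms.parameters.lc_ms.mass_feature_attribute_filter_dict = {
        "dispersity_index": {"value": 0.1, "operator": "<"},
        "gaussian_similarity": {"value": 0.7, "operator": ">="},
    }
    lcms.add_peak_metrics(remove_by_metrics=True)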
223 def get_average_mass_spectrum( 224 self, 225 scan_list, 226 apex_scan, 227 spectrum_mode="profile", 228 ms_level=1, 229 auto_process=True, 230 use_parser=False, 231 perform_checks=True, 232 polarity=None, 233 ms_params=None, 234 ): 235 """Returns an averaged mass spectrum object 236 237 Parameters 238 ---------- 239 scan_list : list 240 List of scan numbers to average. 241 apex_scan : int 242 Number of the apex scan 243 spectrum_mode : str, optional 244 The spectrum mode to use. Defaults to "profile". Not that only "profile" mode is supported for averaging. 245 ms_level : int, optional 246 The MS level to use. Defaults to 1. 247 auto_process : bool, optional 248 If True, the averaged mass spectrum will be auto-processed. Defaults to True. 249 use_parser : bool, optional 250 If True, the mass spectra will be obtained from the parser. Defaults to False. 251 perform_checks : bool, optional 252 If True, the function will check if the data are within the ms_unprocessed dictionary and are the correct mode. Defaults to True. Only set to False if you are sure the data are profile, and (if not using the parser) are in the ms_unprocessed dictionary! ms_unprocessed dictionary also must be indexed on scan 253 polarity : int, optional 254 The polarity of the mass spectra (1 or -1). If not set, the polarity will be determined from the dataset. Defaults to None. (fastest if set to -1 or 1) 255 ms_params : MSParameters, optional 256 The mass spectrum parameters to use. If not set (None), the globally set parameters will be used. Defaults to None. 257 258 Returns 259 ------- 260 MassSpectrumProfile 261 The averaged mass spectrum object. 262 263 Raises 264 ------ 265 ValueError 266 If the spectrum mode is not "profile". 267 If the MS level is not found in the unprocessed mass spectra dictionary. 268 If not all scan numbers are found in the unprocessed mass spectra dictionary. 
269 """ 270 if perform_checks: 271 if spectrum_mode != "profile": 272 raise ValueError("Averaging only supported for profile mode") 273 274 if polarity is None: 275 # set polarity to -1 if negative mode, 1 if positive mode (for mass spectrum creation) 276 if self.polarity == "negative": 277 polarity = -1 278 elif self.polarity == "positive": 279 polarity = 1 280 else: 281 raise ValueError( 282 "Polarity not set for dataset, must be a set containing either 'positive' or 'negative'" 283 ) 284 285 # if not using_parser, check that scan numbers are in _ms_unprocessed 286 if not use_parser: 287 if perform_checks: 288 # Set index to scan for faster lookup 289 ms_df = ( 290 self._ms_unprocessed[ms_level] 291 .copy() 292 .set_index("scan", drop=False) 293 .sort_index() 294 ) 295 my_ms_df = ms_df.loc[scan_list] 296 # Check that all scan numbers are in the ms_df 297 if not all(np.isin(scan_list, ms_df.index)): 298 raise ValueError( 299 "Not all scan numbers found in the unprocessed mass spectra dictionary" 300 ) 301 else: 302 my_ms_df = ( 303 pd.DataFrame({"scan": scan_list}) 304 .set_index("scan") 305 .join(self._ms_unprocessed[ms_level], how="left") 306 ) 307 308 if use_parser: 309 ms_list = [ 310 self.spectra_parser.get_mass_spectrum_from_scan( 311 x, spectrum_mode=spectrum_mode, auto_process=False 312 ) 313 for x in scan_list 314 ] 315 ms_mz = [x._mz_exp for x in ms_list] 316 ms_int = [x._abundance for x in ms_list] 317 my_ms_df = [] 318 for i in np.arange(len(ms_mz)): 319 my_ms_df.append( 320 pd.DataFrame( 321 {"mz": ms_mz[i], "intensity": ms_int[i], "scan": scan_list[i]} 322 ) 323 ) 324 my_ms_df = pd.concat(my_ms_df) 325 326 if not self.check_if_grid(my_ms_df): 327 my_ms_df = self.grid_data(my_ms_df) 328 329 my_ms_ave = my_ms_df.groupby("mz")["intensity"].sum().reset_index() 330 331 ms = ms_from_array_profile( 332 my_ms_ave.mz, 333 my_ms_ave.intensity, 334 self.file_location, 335 polarity=polarity, 336 auto_process=False, 337 ) 338 339 # Set the mass spectrum parameters, auto-process if auto_process is True, and add to the dataset 340 if ms is not None: 341 if ms_params is not None: 342 ms.parameters = ms_params 343 ms.scan_number = apex_scan 344 if auto_process: 345 ms.process_mass_spec() 346 return ms
Returns an averaged mass spectrum object
Parameters
- scan_list (list): List of scan numbers to average.
- apex_scan (int): Number of the apex scan.
- spectrum_mode (str, optional): The spectrum mode to use. Defaults to "profile". Note that only "profile" mode is supported for averaging.
- ms_level (int, optional): The MS level to use. Defaults to 1.
- auto_process (bool, optional): If True, the averaged mass spectrum will be auto-processed. Defaults to True.
- use_parser (bool, optional): If True, the mass spectra will be obtained from the parser. Defaults to False.
- perform_checks (bool, optional): If True, the function will check that the data are within the ms_unprocessed dictionary and are the correct mode. Defaults to True. Only set to False if you are sure the data are profile mode and (if not using the parser) are in the ms_unprocessed dictionary; the ms_unprocessed dictionary must also be indexed on scan.
- polarity (int, optional): The polarity of the mass spectra (1 or -1). If not set, the polarity will be determined from the dataset. Defaults to None. (fastest if set to -1 or 1)
- ms_params (MSParameters, optional): The mass spectrum parameters to use. If not set (None), the globally set parameters will be used. Defaults to None.
Returns
- MassSpectrumProfile: The averaged mass spectrum object.
Raises
- ValueError: If the spectrum mode is not "profile". If the MS level is not found in the unprocessed mass spectra dictionary. If not all scan numbers are found in the unprocessed mass spectra dictionary.
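A usage sketch with made-up scan numbers; the scans are assumed to be profile-mode MS1 scans already present in the unprocessed spectra dictionary:

    scan_list = [100, 101, 102, 103, 104]  # illustrative MS1 scan numbers
    ms_avg = lcms.get_average_mass_spectrum(
        scan_list,
        apex_scan=102,
        spectrum_mode="profile",
        ms_level=1,
        auto_process=True,
        polarity=-1,  # skip the polarity lookup for negative-mode data
    )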
348 def find_mass_features(self, ms_level=1, grid=True): 349 """Find mass features within an LCMSBase object 350 351 Note that this is a wrapper function that calls the find_mass_features_ph function, but can be extended to support other peak picking methods in the future. 352 353 Parameters 354 ---------- 355 ms_level : int, optional 356 The MS level to use for peak picking Default is 1. 357 grid : bool, optional 358 If True, will regrid the data before running the persistent homology calculations (after checking if the data is gridded), 359 used for persistent homology peak picking for profile data only. Default is True. 360 361 Raises 362 ------ 363 ValueError 364 If no MS level data is found on the object. 365 If persistent homology peak picking is attempted on non-profile mode data. 366 If data is not gridded and grid is False. 367 If peak picking method is not implemented. 368 369 Returns 370 ------- 371 None, but assigns the mass_features and eics attributes to the object. 372 373 """ 374 pp_method = self.parameters.lc_ms.peak_picking_method 375 376 if pp_method == "persistent homology": 377 msx_scan_df = self.scan_df[self.scan_df["ms_level"] == ms_level] 378 if all(msx_scan_df["ms_format"] == "profile"): 379 self.find_mass_features_ph(ms_level=ms_level, grid=grid) 380 else: 381 raise ValueError( 382 "MS{} scans are not profile mode, which is required for persistent homology peak picking.".format( 383 ms_level 384 ) 385 ) 386 elif pp_method == "centroided_persistent_homology": 387 msx_scan_df = self.scan_df[self.scan_df["ms_level"] == ms_level] 388 if all(msx_scan_df["ms_format"] == "centroid"): 389 self.find_mass_features_ph_centroid(ms_level=ms_level) 390 else: 391 raise ValueError( 392 "MS{} scans are not centroid mode, which is required for persistent homology centroided peak picking.".format( 393 ms_level 394 ) 395 ) 396 else: 397 raise ValueError("Peak picking method not implemented") 398 399 # Remove noisey mass features if designated in parameters 400 if self.parameters.lc_ms.remove_redundant_mass_features: 401 self._remove_redundant_mass_features()
Find mass features within an LCMSBase object
Note that this is a wrapper function that calls the find_mass_features_ph function, but can be extended to support other peak picking methods in the future.
Parameters
- ms_level (int, optional): The MS level to use for peak picking. Default is 1.
- grid (bool, optional): If True, will regrid the data before running the persistent homology calculations (after checking if the data is gridded), used for persistent homology peak picking for profile data only. Default is True.
Raises
- ValueError: If no MS level data is found on the object. If persistent homology peak picking is attempted on non-profile mode data. If data is not gridded and grid is False. If peak picking method is not implemented.
Returns
- None, but assigns the mass_features and eics attributes to the object.
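A hedged example for profile-mode MS1 data; "persistent homology" is one of the method strings accepted by the wrapper:

    lcms.parameters.lc_ms.peak_picking_method = "persistent homology"
    lcms.find_mass_features(ms_level=1, grid=True)
    print(len(lcms.mass_features), "mass features detected")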
403 def integrate_mass_features( 404 self, drop_if_fail=True, drop_duplicates=True, ms_level=1 405 ): 406 """Integrate mass features and extract EICs. 407 408 Populates the _eics attribute on the LCMSBase object for each unique mz in the mass_features dataframe and adds data (start_scan, final_scan, area) to the mass_features attribute. 409 410 Parameters 411 ---------- 412 drop_if_fail : bool, optional 413 Whether to drop mass features if the EIC limit calculations fail. 414 Default is True. 415 drop_duplicates : bool, optional 416 Whether to mass features that appear to be duplicates 417 (i.e., mz is similar to another mass feature and limits of the EIC are similar or encapsulating). 418 Default is True. 419 ms_level : int, optional 420 The MS level to use. Default is 1. 421 422 Raises 423 ------ 424 ValueError 425 If no mass features are found. 426 If no MS level data is found for the given MS level (either in data or in the scan data) 427 428 Returns 429 ------- 430 None, but populates the eics attribute on the LCMSBase object and adds data (start_scan, final_scan, area) to the mass_features attribute. 431 432 Notes 433 ----- 434 drop_if_fail is useful for discarding mass features that do not have good shapes, usually due to a detection on a shoulder of a peak or a noisy region (especially if minimal smoothing is used during mass feature detection). 435 """ 436 # Check if there is data 437 if ms_level in self._ms_unprocessed.keys(): 438 raw_data = self._ms_unprocessed[ms_level].copy() 439 else: 440 raise ValueError("No MS level " + str(ms_level) + " data found") 441 if self.mass_features is not None: 442 mf_df = self.mass_features_to_df().copy() 443 else: 444 raise ValueError( 445 "No mass features found, did you run find_mass_features() first?" 
446 ) 447 448 # Subset scan data to only include correct ms_level 449 scan_df_sub = self.scan_df[ 450 self.scan_df["ms_level"] == int(ms_level) 451 ].reset_index(drop=True) 452 if scan_df_sub.empty: 453 raise ValueError("No MS level " + ms_level + " data found in scan data") 454 scan_df_sub = scan_df_sub[["scan", "scan_time"]].copy() 455 456 mzs_to_extract = np.unique(mf_df["mz"].values) 457 mzs_to_extract.sort() 458 459 # Pre-sort raw_data by mz for faster filtering 460 raw_data_sorted = raw_data.sort_values(["mz", "scan"]).reset_index(drop=True) 461 raw_data_mz = raw_data_sorted["mz"].values 462 463 # Get EICs for each unique mz in mass features list 464 for mz in mzs_to_extract: 465 mz_max = mz + self.parameters.lc_ms.eic_tolerance_ppm * mz / 1e6 466 mz_min = mz - self.parameters.lc_ms.eic_tolerance_ppm * mz / 1e6 467 468 # Use binary search for faster mz range filtering 469 left_idx = np.searchsorted(raw_data_mz, mz_min, side="left") 470 right_idx = np.searchsorted(raw_data_mz, mz_max, side="right") 471 raw_data_sub = raw_data_sorted.iloc[left_idx:right_idx].copy() 472 473 raw_data_sub = ( 474 raw_data_sub.groupby(["scan"])["intensity"].sum().reset_index() 475 ) 476 raw_data_sub = scan_df_sub.merge(raw_data_sub, on="scan", how="left") 477 raw_data_sub["intensity"] = raw_data_sub["intensity"].fillna(0) 478 myEIC = EIC_Data( 479 scans=raw_data_sub["scan"].values, 480 time=raw_data_sub["scan_time"].values, 481 eic=raw_data_sub["intensity"].values, 482 ) 483 # Smooth EIC 484 smoothed_eic = self.smooth_tic(myEIC.eic) 485 smoothed_eic[smoothed_eic < 0] = 0 486 myEIC.eic_smoothed = smoothed_eic 487 self.eics[mz] = myEIC 488 489 # Get limits of mass features using EIC centroid detector and integrate 490 mf_df["area"] = np.nan 491 for idx, mass_feature in mf_df.iterrows(): 492 mz = mass_feature.mz 493 apex_scan = mass_feature.apex_scan 494 495 # Pull EIC data and find apex scan index 496 myEIC = self.eics[mz] 497 self.mass_features[idx]._eic_data = myEIC 498 apex_index = np.where(myEIC.scans == apex_scan)[0][0] 499 500 # Find left and right limits of peak using EIC centroid detector, add to EICData 501 centroid_eics = self.eic_centroid_detector( 502 myEIC.time, 503 myEIC.eic_smoothed, 504 mass_feature.intensity * 1.1, 505 apex_indexes=[int(apex_index)], 506 ) 507 l_a_r_scan_idx = [i for i in centroid_eics] 508 if len(l_a_r_scan_idx) > 0: 509 # Calculate number of consecutive scans with intensity > 0 and check if it is above the minimum consecutive scans 510 # Find the number of consecutive non-zero values in the EIC segment 511 mask = myEIC.eic[l_a_r_scan_idx[0][0] : l_a_r_scan_idx[0][2] + 1] > 0 512 # Find the longest run of consecutive True values 513 if np.any(mask): 514 # Find indices where mask changes value 515 diff = np.diff(np.concatenate(([0], mask.astype(int), [0]))) 516 starts = np.where(diff == 1)[0] 517 ends = np.where(diff == -1)[0] 518 consecutive_scans = (ends - starts).max() 519 else: 520 consecutive_scans = 0 521 if consecutive_scans < self.parameters.lc_ms.consecutive_scan_min: 522 self.mass_features.pop(idx) 523 continue 524 # Add start and final scan to mass_features and EICData 525 left_scan, right_scan = ( 526 myEIC.scans[l_a_r_scan_idx[0][0]], 527 myEIC.scans[l_a_r_scan_idx[0][2]], 528 ) 529 mf_scan_apex = [(left_scan, int(apex_scan), right_scan)] 530 myEIC.apexes = myEIC.apexes + mf_scan_apex 531 self.mass_features[idx].start_scan = left_scan 532 self.mass_features[idx].final_scan = right_scan 533 534 # Find area under peak using limits from EIC centroid detector, add 
to mass_features and EICData 535 area = np.trapz( 536 myEIC.eic_smoothed[l_a_r_scan_idx[0][0] : l_a_r_scan_idx[0][2] + 1], 537 myEIC.time[l_a_r_scan_idx[0][0] : l_a_r_scan_idx[0][2] + 1], 538 ) 539 mf_df.at[idx, "area"] = area 540 myEIC.areas = myEIC.areas + [area] 541 self.eics[mz] = myEIC 542 self.mass_features[idx]._area = area 543 else: 544 if drop_if_fail is True: 545 self.mass_features.pop(idx) 546 547 if drop_duplicates: 548 # Prepare mass feature dataframe 549 mf_df = self.mass_features_to_df() 550 551 # For each mass feature, find all mass features within the clustering tolerance ppm and drop if their start and end times are within another mass feature 552 # Keep the first mass feature (highest persistence) 553 for idx, mass_feature in mf_df.iterrows(): 554 mz = mass_feature.mz 555 apex_scan = mass_feature.apex_scan 556 557 mf_df["mz_diff_ppm"] = np.abs(mf_df["mz"] - mz) / mz * 10**6 558 mf_df_sub = mf_df[ 559 mf_df["mz_diff_ppm"] 560 < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel 561 * 10**6 562 ].copy() 563 564 # For all mass features within the clustering tolerance, check if the start and end times are within the start and end times of the mass feature 565 for idx2, mass_feature2 in mf_df_sub.iterrows(): 566 if idx2 != idx: 567 if ( 568 mass_feature2.start_scan >= mass_feature.start_scan 569 and mass_feature2.final_scan <= mass_feature.final_scan 570 ): 571 if idx2 in self.mass_features.keys(): 572 self.mass_features.pop(idx2)
Integrate mass features and extract EICs.
Populates the eics attribute on the LCMSBase object for each unique mz in the mass_features dataframe and adds data (start_scan, final_scan, area) to the mass_features attribute.
Parameters
- drop_if_fail (bool, optional): Whether to drop mass features if the EIC limit calculations fail. Default is True.
- drop_duplicates (bool, optional): Whether to drop mass features that appear to be duplicates (i.e., the m/z is similar to another mass feature and the EIC limits are similar or encapsulating). Default is True.
- ms_level (int, optional): The MS level to use. Default is 1.
Raises
- ValueError: If no mass features are found, or if no data is found for the given MS level (either in the raw data or in the scan data).
Returns
- None, but populates the eics attribute on the LCMSBase object and adds data (start_scan, final_scan, area) to the mass_features attribute.
Notes
drop_if_fail is useful for discarding mass features that do not have good shapes, usually due to a detection on a shoulder of a peak or a noisy region (especially if minimal smoothing is used during mass feature detection).
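An illustrative call; the EIC extraction tolerance (eic_tolerance_ppm) is set explicitly here only to make the dependency visible, and the value is not a recommendation:

    lcms.parameters.lc_ms.eic_tolerance_ppm = 5  # illustrative tolerance
    lcms.integrate_mass_features(drop_if_fail=True, drop_duplicates=True, ms_level=1)
    for mf in lcms.mass_features.values():
        print(mf.mz, mf.start_scan, mf.final_scan, mf._area)
        break  # inspect just the first feature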
574 def find_c13_mass_features(self): 575 """Mark likely C13 isotopes and connect to monoisoitopic mass features. 576 577 Returns 578 ------- 579 None, but populates the monoisotopic_mf_id and isotopologue_type attributes to the indivual LCMSMassFeatures within the mass_features attribute of the LCMSBase object. 580 581 Raises 582 ------ 583 ValueError 584 If no mass features are found. 585 """ 586 verbose = self.parameters.lc_ms.verbose_processing 587 if verbose: 588 print("evaluating mass features for C13 isotopes") 589 if self.mass_features is None: 590 raise ValueError("No mass features found, run find_mass_features() first") 591 592 # Data prep fo sparse distance matrix 593 dims = ["mz", "scan_time"] 594 mf_df = self.mass_features_to_df().copy() 595 # Drop mass features that have no area (these are likely to be noise) 596 mf_df = mf_df[mf_df["area"].notnull()] 597 mf_df["mf_id"] = mf_df.index.values 598 dims = ["mz", "scan_time"] 599 600 # Sort my ascending mz so we always get the monoisotopic mass first, regardless of the order/intensity of the mass features 601 mf_df = mf_df.sort_values(by=["mz"]).reset_index(drop=True).copy() 602 603 mz_diff = 1.003355 # C13-C12 mass difference 604 tol = [ 605 mf_df["mz"].median() 606 * self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel, 607 self.parameters.lc_ms.mass_feature_cluster_rt_tolerance * 0.5, 608 ] # mz, in relative; scan_time in minutes 609 610 # Compute inter-feature distances 611 distances = None 612 for i in range(len(dims)): 613 # Construct k-d tree 614 values = mf_df[dims[i]].values 615 tree = KDTree(values.reshape(-1, 1)) 616 617 max_tol = tol[i] 618 if dims[i] == "mz": 619 # Maximum absolute tolerance 620 max_tol = mz_diff + tol[i] 621 622 # Compute sparse distance matrix 623 # the larger the max_tol, the slower this operation is 624 sdm = tree.sparse_distance_matrix(tree, max_tol, output_type="coo_matrix") 625 626 # Only consider forward case, exclude diagonal 627 sdm = sparse.triu(sdm, k=1) 628 629 if dims[i] == "mz": 630 min_tol = mz_diff - tol[i] 631 # Get only the ones that are above the min tol 632 idx = sdm.data > min_tol 633 634 # Reconstruct sparse distance matrix 635 sdm = sparse.coo_matrix( 636 (sdm.data[idx], (sdm.row[idx], sdm.col[idx])), 637 shape=(len(values), len(values)), 638 ) 639 640 # Cast as binary matrix 641 sdm.data = np.ones_like(sdm.data) 642 643 # Stack distances 644 if distances is None: 645 distances = sdm 646 else: 647 distances = distances.multiply(sdm) 648 649 # Extract indices of within-tolerance points 650 distances = distances.tocoo() 651 pairs = np.stack((distances.row, distances.col), axis=1) # C12 to C13 pairs 652 653 # Turn pairs (which are index of mf_df) into mf_id and then into two dataframes to join to mf_df 654 pairs_mf = pairs.copy() 655 pairs_mf[:, 0] = mf_df.iloc[pairs[:, 0]].mf_id.values 656 pairs_mf[:, 1] = mf_df.iloc[pairs[:, 1]].mf_id.values 657 658 # Connect monoisotopic masses with isotopologes within mass_features 659 monos = np.setdiff1d(np.unique(pairs_mf[:, 0]), np.unique(pairs_mf[:, 1])) 660 for mono in monos: 661 self.mass_features[mono].monoisotopic_mf_id = mono 662 pairs_iso_df = pd.DataFrame(pairs_mf, columns=["parent", "child"]) 663 while not pairs_iso_df.empty: 664 pairs_iso_df = pairs_iso_df.set_index("parent", drop=False) 665 m1_isos = pairs_iso_df.loc[monos, "child"].unique() 666 for iso in m1_isos: 667 # Set monoisotopic_mf_id and isotopologue_type for isotopologues 668 parent = pairs_mf[pairs_mf[:, 1] == iso, 0] 669 if len(parent) > 1: 670 # Choose the 
parent that is closest in time to the isotopologue 671 parent_time = [self.mass_features[p].retention_time for p in parent] 672 time_diff = [ 673 np.abs(self.mass_features[iso].retention_time - x) 674 for x in parent_time 675 ] 676 parent = parent[np.argmin(time_diff)] 677 else: 678 parent = parent[0] 679 self.mass_features[iso].monoisotopic_mf_id = self.mass_features[ 680 parent 681 ].monoisotopic_mf_id 682 if self.mass_features[iso].monoisotopic_mf_id is not None: 683 mass_diff = ( 684 self.mass_features[iso].mz 685 - self.mass_features[ 686 self.mass_features[iso].monoisotopic_mf_id 687 ].mz 688 ) 689 self.mass_features[iso].isotopologue_type = "13C" + str( 690 int(round(mass_diff, 0)) 691 ) 692 693 # Drop the mono and iso from the pairs_iso_df 694 pairs_iso_df = pairs_iso_df.drop( 695 index=monos, errors="ignore" 696 ) # Drop pairs where the parent is a child that is a child of a root 697 pairs_iso_df = pairs_iso_df.set_index("child", drop=False) 698 pairs_iso_df = pairs_iso_df.drop(index=m1_isos, errors="ignore") 699 700 if not pairs_iso_df.empty: 701 # Get new monos, recognizing that these are just 13C isotopologues that are connected to other 13C isotopologues to repeat the process 702 monos = np.setdiff1d( 703 np.unique(pairs_iso_df.parent), np.unique(pairs_iso_df.child) 704 ) 705 if verbose: 706 # Report fraction of compounds annotated with isotopes 707 mf_df["c13_flag"] = np.where( 708 np.logical_or( 709 np.isin(mf_df["mf_id"], pairs_mf[:, 0]), 710 np.isin(mf_df["mf_id"], pairs_mf[:, 1]), 711 ), 712 1, 713 0, 714 ) 715 print( 716 str(round(len(mf_df[mf_df["c13_flag"] == 1]) / len(mf_df), ndigits=3)) 717 + " of mass features have or are C13 isotopes" 718 )
Mark likely C13 isotopes and connect them to monoisotopic mass features.
Returns
- None, but populates the monoisotopic_mf_id and isotopologue_type attributes of the individual LCMSMassFeatures within the mass_features attribute of the LCMSBase object.
Raises
- ValueError: If no mass features are found.
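After the call, isotopologue annotations can be inspected on the mass features; getattr is used defensively here because unannotated features may not carry the attribute (an assumption, not confirmed by the source):

    lcms.find_c13_mass_features()
    for mf_id, mf in lcms.mass_features.items():
        iso = getattr(mf, "isotopologue_type", None)  # e.g. "13C1", "13C2"
        if iso:
            print(mf_id, mf.mz, "->", mf.monoisotopic_mf_id, iso)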
720 def deconvolute_ms1_mass_features(self): 721 """Deconvolute MS1 mass features 722 723 Deconvolute mass features ms1 spectrum based on the correlation of all masses within a spectrum over the EIC of the mass features 724 725 Parameters 726 ---------- 727 None 728 729 Returns 730 ------- 731 None, but assigns the _ms_deconvoluted_idx, mass_spectrum_deconvoluted_parent, 732 and associated_mass_features_deconvoluted attributes to the mass features in the 733 mass_features attribute of the LCMSBase object. 734 735 Raises 736 ------ 737 ValueError 738 If no mass features are found, must run find_mass_features() first. 739 If no EICs are found, did you run integrate_mass_features() first? 740 741 """ 742 # Checks for set mass_features and eics 743 if self.mass_features is None: 744 raise ValueError( 745 "No mass features found, did you run find_mass_features() first?" 746 ) 747 748 if self.eics == {}: 749 raise ValueError( 750 "No EICs found, did you run integrate_mass_features() first?" 751 ) 752 753 if 1 not in self._ms_unprocessed.keys(): 754 raise ValueError("No unprocessed MS1 spectra found.") 755 756 # Prep ms1 data 757 ms1_data = self._ms_unprocessed[1].copy() 758 ms1_data = ms1_data.set_index("scan") 759 760 # Prep mass feature summary 761 mass_feature_df = self.mass_features_to_df() 762 763 # Loop through each mass feature 764 for mf_id, mass_feature in self.mass_features.items(): 765 # Check that the mass_feature.mz attribute == the mz of the mass feature in the mass_feature_df 766 if mass_feature.mz != mass_feature.ms1_peak.mz_exp: 767 continue 768 769 # Get the left and right limits of the EIC of the mass feature 770 l_scan, _, r_scan = mass_feature._eic_data.apexes[0] 771 772 # Pull from the _ms1_unprocessed data the scan range of interest and sort by mz 773 ms1_data_sub = ms1_data.loc[l_scan:r_scan].copy() 774 ms1_data_sub = ms1_data_sub.sort_values(by=["mz"]).reset_index(drop=False) 775 776 # Get the centroided masses of the mass feature 777 mf_mspeak_mzs = mass_feature.mass_spectrum.mz_exp 778 779 # Find the closest mz in the ms1 data to the centroided masses of the mass feature 780 ms1_data_sub["mass_feature_mz"] = mf_mspeak_mzs[ 781 find_closest(mf_mspeak_mzs, ms1_data_sub.mz.values) 782 ] 783 784 # Drop rows with mz_diff > 0.01 between the mass feature mz and the ms1 data mz 785 ms1_data_sub["mz_diff_rel"] = ( 786 np.abs(ms1_data_sub["mass_feature_mz"] - ms1_data_sub["mz"]) 787 / ms1_data_sub["mz"] 788 ) 789 ms1_data_sub = ms1_data_sub[ 790 ms1_data_sub["mz_diff_rel"] 791 < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel 792 ].reset_index(drop=True) 793 794 # Group by mass_feature_mz and scan and sum intensity 795 ms1_data_sub_group = ( 796 ms1_data_sub.groupby(["mass_feature_mz", "scan"])["intensity"] 797 .sum() 798 .reset_index() 799 ) 800 801 # Calculate the correlation of the intensities of the mass feature and the ms1 data (set to 0 if no intensity) 802 corr = ( 803 ms1_data_sub_group.pivot( 804 index="scan", columns="mass_feature_mz", values="intensity" 805 ) 806 .fillna(0) 807 .corr() 808 ) 809 810 # Subset the correlation matrix to only include the masses of the mass feature and those with a correlation > 0.8 811 decon_corr_min = self.parameters.lc_ms.ms1_deconvolution_corr_min 812 813 # Try catch for KeyError in case the mass feature mz is not in the correlation matrix 814 try: 815 corr_subset = corr.loc[mass_feature.mz,] 816 except KeyError: 817 # If the mass feature mz is not in the correlation matrix, skip to the next mass feature 818 continue 819 
820 corr_subset = corr_subset[corr_subset > decon_corr_min] 821 822 # Get the masses from the mass spectrum that are the result of the deconvolution 823 mzs_decon = corr_subset.index.values 824 825 # Get the indices of the mzs_decon in mass_feature.mass_spectrum.mz_exp and assign to the mass feature 826 mzs_decon_idx = [ 827 id 828 for id, mz in enumerate(mass_feature.mass_spectrum.mz_exp) 829 if mz in mzs_decon 830 ] 831 mass_feature._ms_deconvoluted_idx = mzs_decon_idx 832 833 # Check if the mass feature's ms1 peak is the largest in the deconvoluted mass spectrum 834 if ( 835 mass_feature.ms1_peak.abundance 836 == mass_feature.mass_spectrum.abundance[mzs_decon_idx].max() 837 ): 838 mass_feature.mass_spectrum_deconvoluted_parent = True 839 else: 840 mass_feature.mass_spectrum_deconvoluted_parent = False 841 842 # Check for other mass features that are in the deconvoluted mass spectrum and add the deconvoluted mass spectrum to the mass feature 843 # Subset mass_feature_df to only include mass features that are within the clustering tolerance 844 mass_feature_df_sub = mass_feature_df[ 845 abs(mass_feature.retention_time - mass_feature_df["scan_time"]) 846 < self.parameters.lc_ms.mass_feature_cluster_rt_tolerance 847 ].copy() 848 # Calculate the mz difference in ppm between the mass feature and the peaks in the deconvoluted mass spectrum 849 mass_feature_df_sub["mz_diff_ppm"] = [ 850 np.abs(mzs_decon - mz).min() / mz * 10**6 851 for mz in mass_feature_df_sub["mz"] 852 ] 853 # Subset mass_feature_df to only include mass features that are within 1 ppm of the deconvoluted masses 854 mfs_associated_decon = mass_feature_df_sub[ 855 mass_feature_df_sub["mz_diff_ppm"] 856 < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel * 10**6 857 ].index.values 858 859 mass_feature.associated_mass_features_deconvoluted = mfs_associated_decon
Deconvolute MS1 mass features
Deconvolutes each mass feature's MS1 spectrum based on the correlation of all masses within the spectrum with the EIC of the mass feature.
Parameters
- None
Returns
- None, but assigns the _ms_deconvoluted_idx, mass_spectrum_deconvoluted_parent, and associated_mass_features_deconvoluted attributes to the mass features in the mass_features attribute of the LCMSBase object.
Raises
- ValueError: If no mass features are found (run find_mass_features() first) or no EICs are found (run integrate_mass_features() first).
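A short inspection sketch after deconvolution; mass_spectrum_deconvoluted_parent may be absent on features that were skipped, so getattr is used defensively (an assumption):

    lcms.deconvolute_ms1_mass_features()
    for mf in lcms.mass_features.values():
        if getattr(mf, "mass_spectrum_deconvoluted_parent", False):
            # indices into mf.mass_spectrum.mz_exp retained by the deconvolution
            print(mf.mz, mf._ms_deconvoluted_idx, mf.associated_mass_features_deconvoluted)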
1145class PHCalculations: 1146 """Methods for performing calculations related to 2D peak picking via persistent homology on LCMS data. 1147 1148 Notes 1149 ----- 1150 This class is intended to be used as a mixin for the LCMSBase class. 1151 1152 Methods 1153 ------- 1154 * sparse_mean_filter(idx, V, radius=[0, 1, 1]). 1155 Sparse implementation of a mean filter. 1156 * embed_unique_indices(a). 1157 Creates an array of indices, sorted by unique element. 1158 * sparse_upper_star(idx, V). 1159 Sparse implementation of an upper star filtration. 1160 * check_if_grid(data). 1161 Check if the data is gridded in mz space. 1162 * grid_data(data). 1163 Grid the data in the mz dimension. 1164 * find_mass_features_ph(ms_level=1, grid=True). 1165 Find mass features within an LCMSBase object using persistent homology. 1166 * cluster_mass_features(drop_children=True). 1167 Cluster regions of interest. 1168 """ 1169 1170 @staticmethod 1171 def sparse_mean_filter(idx, V, radius=[0, 1, 1]): 1172 """Sparse implementation of a mean filter. 1173 1174 Parameters 1175 ---------- 1176 idx : :obj:`~numpy.array` 1177 Edge indices for each dimension (MxN). 1178 V : :obj:`~numpy.array` 1179 Array of intensity data (Mx1). 1180 radius : float or list 1181 Radius of the sparse filter in each dimension. Values less than 1182 zero indicate no connectivity in that dimension. 1183 1184 Returns 1185 ------- 1186 :obj:`~numpy.array` 1187 Filtered intensities (Mx1). 1188 1189 Notes 1190 ----- 1191 This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos. 1192 This is a static method. 1193 """ 1194 1195 # Copy indices 1196 idx = idx.copy().astype(V.dtype) 1197 1198 # Scale 1199 for i, r in enumerate(radius): 1200 # Increase inter-index distance 1201 if r < 1: 1202 idx[:, i] *= 2 1203 1204 # Do nothing 1205 elif r == 1: 1206 pass 1207 1208 # Decrease inter-index distance 1209 else: 1210 idx[:, i] /= r 1211 1212 # Connectivity matrix 1213 cmat = KDTree(idx) 1214 cmat = cmat.sparse_distance_matrix(cmat, 1, p=np.inf, output_type="coo_matrix") 1215 cmat.setdiag(1) 1216 1217 # Pair indices 1218 I, J = cmat.nonzero() 1219 1220 # Delete cmat 1221 cmat_shape = cmat.shape 1222 del cmat 1223 1224 # Sum over columns 1225 V_sum = sparse.bsr_matrix( 1226 (V[J], (I, I)), shape=cmat_shape, dtype=V.dtype 1227 ).diagonal(0) 1228 1229 # Count over columns 1230 V_count = sparse.bsr_matrix( 1231 (np.ones_like(J), (I, I)), shape=cmat_shape, dtype=V.dtype 1232 ).diagonal(0) 1233 1234 return V_sum / V_count 1235 1236 @staticmethod 1237 def embed_unique_indices(a): 1238 """Creates an array of indices, sorted by unique element. 1239 1240 Parameters 1241 ---------- 1242 a : :obj:`~numpy.array` 1243 Array of unique elements (Mx1). 1244 1245 Returns 1246 ------- 1247 :obj:`~numpy.array` 1248 Array of indices (Mx1). 1249 1250 Notes 1251 ----- 1252 This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos 1253 This is a static method. 1254 """ 1255 1256 def count_tens(n): 1257 # Count tens 1258 ntens = (n - 1) // 10 1259 1260 while True: 1261 ntens_test = (ntens + n - 1) // 10 1262 1263 if ntens_test == ntens: 1264 return ntens 1265 else: 1266 ntens = ntens_test 1267 1268 def arange_exclude_10s(n): 1269 # How many 10s will there be? 
1270 ntens = count_tens(n) 1271 1272 # Base array 1273 arr = np.arange(0, n + ntens) 1274 1275 # Exclude 10s 1276 arr = arr[(arr == 0) | (arr % 10 != 0)][:n] 1277 1278 return arr 1279 1280 # Creates an array of indices, sorted by unique element 1281 idx_sort = np.argsort(a) 1282 idx_unsort = np.argsort(idx_sort) 1283 1284 # Sorts records array so all unique elements are together 1285 sorted_a = a[idx_sort] 1286 1287 # Returns the unique values, the index of the first occurrence, 1288 # and the count for each element 1289 vals, idx_start, count = np.unique( 1290 sorted_a, return_index=True, return_counts=True 1291 ) 1292 1293 # Splits the indices into separate arrays 1294 splits = np.split(idx_sort, idx_start[1:]) 1295 1296 # Creates unique indices for each split 1297 idx_unq = np.concatenate([arange_exclude_10s(len(x)) for x in splits]) 1298 1299 # Reorders according to input array 1300 idx_unq = idx_unq[idx_unsort] 1301 1302 # Magnitude of each index 1303 exp = np.log10( 1304 idx_unq, where=idx_unq > 0, out=np.zeros_like(idx_unq, dtype=np.float64) 1305 ) 1306 idx_unq_mag = np.power(10, np.floor(exp) + 1) 1307 1308 # Result 1309 return a + idx_unq / idx_unq_mag 1310 1311 @staticmethod 1312 def roll_up_dataframe( 1313 df: pd.DataFrame, 1314 sort_by: str, 1315 tol: list, 1316 relative: list, 1317 dims: list, 1318 memory_opt_threshold: int = 10000, 1319 ): 1320 """Subset data by rolling up into apex in appropriate dimensions. 1321 1322 Parameters 1323 ---------- 1324 data : pd.DataFrame 1325 The input data containing "dims" columns and the "sort_by" column. 1326 sort_by : str 1327 The column to sort the data by, this will determine which mass features get rolled up into a parent mass feature 1328 (i.e., the mass feature with the highest value in the sort_by column). 1329 dims : list 1330 A list of dimension names (column names in the data DataFrame) to roll up the mass features by. 1331 tol : list 1332 A list of tolerances for each dimension. The length of the list must match the number of dimensions. 1333 The tolerances can be relative (as a fraction of the maximum value in that dimension) or absolute (in the units of that dimension). 1334 If relative is True, the tolerance will be multiplied by the maximum value in that dimension. 1335 If relative is False, the tolerance will be used as is. 1336 relative : list 1337 A list of booleans indicating whether the tolerance for each dimension is relative (True) or absolute (False). 1338 memory_opt_threshold : int, optional 1339 Minimum number of rows to trigger memory-optimized processing. Default is 10000. 1340 1341 Returns 1342 ------- 1343 pd.DataFrame 1344 A DataFrame with only the rolled up mass features, with the original index and columns. 1345 1346 1347 Raises 1348 ------ 1349 ValueError 1350 If the input data is not a pandas DataFrame. 1351 If the input data does not have columns for each of the dimensions in "dims". 1352 If the length of "dims", "tol", and "relative" do not match. 
1353 """ 1354 og_columns = df.columns.copy() 1355 1356 # Unindex the data, but keep the original index 1357 if df.index.name is not None: 1358 og_index = df.index.name 1359 else: 1360 og_index = "index" 1361 df = df.reset_index(drop=False) 1362 1363 # Sort data by sort_by column, and reindex 1364 df = df.sort_values(by=sort_by, ascending=False).reset_index(drop=True) 1365 1366 # Check that data is a DataFrame and has columns for each of the dims 1367 if not isinstance(df, pd.DataFrame): 1368 raise ValueError("Data must be a pandas DataFrame") 1369 for dim in dims: 1370 if dim not in df.columns: 1371 raise ValueError(f"Data must have a column for {dim}") 1372 if len(dims) != len(tol) or len(dims) != len(relative): 1373 raise ValueError( 1374 "Dimensions, tolerances, and relative flags must be the same length" 1375 ) 1376 1377 # Pre-compute all values arrays 1378 all_values = [df[dim].values for dim in dims] 1379 1380 # Choose processing method based on dataframe size 1381 if len(df) >= memory_opt_threshold: 1382 # Memory-optimized approach for large dataframes 1383 distances = PHCalculations._compute_distances_memory_optimized( 1384 all_values, tol, relative 1385 ) 1386 else: 1387 # Faster approach for smaller dataframes 1388 distances = PHCalculations._compute_distances_original( 1389 all_values, tol, relative 1390 ) 1391 1392 # Process pairs with original logic but memory optimizations 1393 distances = distances.tocoo() 1394 pairs = np.stack((distances.row, distances.col), axis=1) 1395 pairs_df = pd.DataFrame(pairs, columns=["parent", "child"]).set_index("parent") 1396 del distances, pairs # Free memory immediately 1397 1398 to_drop = [] 1399 while not pairs_df.empty: 1400 # Find root_parents and their children (original logic preserved) 1401 root_parents = np.setdiff1d( 1402 np.unique(pairs_df.index.values), np.unique(pairs_df.child.values) 1403 ) 1404 children_of_roots = pairs_df.loc[root_parents, "child"].unique() 1405 to_drop.extend(children_of_roots) # Use extend instead of append 1406 1407 # Remove root_children as possible parents from pairs_df for next iteration 1408 pairs_df = pairs_df.drop(index=children_of_roots, errors="ignore") 1409 pairs_df = pairs_df.reset_index().set_index("child") 1410 # Remove root_children as possible children from pairs_df for next iteration 1411 pairs_df = pairs_df.drop(index=children_of_roots) 1412 1413 # Prepare for next iteration 1414 pairs_df = pairs_df.reset_index().set_index("parent") 1415 1416 # Convert to numpy array for efficient dropping 1417 to_drop = np.array(to_drop) 1418 1419 # Drop mass features that are not cluster parents 1420 df_sub = df.drop(index=to_drop) 1421 1422 # Set index back to og_index and only keep original columns 1423 df_sub = df_sub.set_index(og_index).sort_index()[og_columns] 1424 1425 return df_sub 1426 1427 @staticmethod 1428 def _compute_distances_original(all_values, tol, relative): 1429 """Original distance computation method for smaller datasets. 1430 1431 This method computes the pairwise distances between features in the dataset 1432 using a straightforward approach. It is suitable for smaller datasets where 1433 memory usage is not a primary concern. 1434 1435 Parameters 1436 ---------- 1437 all_values : list of :obj:`~numpy.array` 1438 List of arrays containing the values for each dimension. 1439 tol : list of float 1440 List of tolerances for each dimension. 1441 relative : list of bool 1442 List of booleans indicating whether the tolerance for each dimension is relative (True) or absolute (False). 
1443 1444 Returns 1445 ------- 1446 :obj:`~scipy.sparse.coo_matrix` 1447 Sparse matrix indicating pairwise distances within tolerances. 1448 """ 1449 # Compute inter-feature distances with memory optimization 1450 distances = None 1451 for i in range(len(all_values)): 1452 values = all_values[i] 1453 # Use single precision if possible to reduce memory 1454 tree = KDTree(values.reshape(-1, 1).astype(np.float32)) 1455 1456 max_tol = tol[i] 1457 if relative[i] is True: 1458 max_tol = tol[i] * values.max() 1459 1460 # Compute sparse distance matrix with smaller chunks if memory is an issue 1461 sdm = tree.sparse_distance_matrix(tree, max_tol, output_type="coo_matrix") 1462 1463 # Only consider forward case, exclude diagonal 1464 sdm = sparse.triu(sdm, k=1) 1465 1466 # Process relative distances more efficiently 1467 if relative[i] is True: 1468 # Vectorized computation without creating intermediate arrays 1469 row_values = values[sdm.row] 1470 valid_idx = sdm.data <= tol[i] * row_values 1471 1472 # Reconstruct sparse matrix more efficiently 1473 sdm = sparse.coo_matrix( 1474 ( 1475 np.ones(valid_idx.sum(), dtype=np.uint8), 1476 (sdm.row[valid_idx], sdm.col[valid_idx]), 1477 ), 1478 shape=(len(values), len(values)), 1479 ) 1480 else: 1481 # Cast as binary matrix with smaller data type 1482 sdm.data = np.ones(len(sdm.data), dtype=np.uint8) 1483 1484 # Stack distances with memory-efficient multiplication 1485 if distances is None: 1486 distances = sdm 1487 else: 1488 # Use in-place operations where possible 1489 distances = distances.multiply(sdm) 1490 del sdm # Free memory immediately 1491 1492 return distances 1493 1494 @staticmethod 1495 def _compute_distances_memory_optimized(all_values, tol, relative): 1496 """Memory-optimized distance computation for large datasets. 1497 1498 This method computes the pairwise distances between features in the dataset 1499 using a more memory-efficient approach. It is suitable for larger datasets 1500 where memory usage is a primary concern. 1501 1502 Parameters 1503 ---------- 1504 all_values : list of :obj:`~numpy.array` 1505 List of arrays containing the values for each dimension. 1506 tol : list of float 1507 List of tolerances for each dimension. 1508 relative : list of bool 1509 List of booleans indicating whether the tolerance for each dimension is relative (True) or absolute (False). 1510 1511 Returns 1512 ------- 1513 :obj:`~scipy.sparse.coo_matrix` 1514 Sparse matrix indicating pairwise distances within tolerances. 
1515 """ 1516 # Compute distance matrix for first dimension (full matrix as before) 1517 values_0 = all_values[0].astype(np.float32) 1518 tree_0 = KDTree(values_0.reshape(-1, 1)) 1519 1520 max_tol_0 = tol[0] 1521 if relative[0] is True: 1522 max_tol_0 = tol[0] * values_0.max() 1523 1524 # Compute sparse distance matrix for first dimension 1525 distances = tree_0.sparse_distance_matrix( 1526 tree_0, max_tol_0, output_type="coo_matrix" 1527 ) 1528 distances = sparse.triu(distances, k=1) 1529 1530 # Process relative distances for first dimension 1531 if relative[0] is True: 1532 row_values = values_0[distances.row] 1533 valid_idx = distances.data <= tol[0] * row_values 1534 distances = sparse.coo_matrix( 1535 ( 1536 np.ones(valid_idx.sum(), dtype=np.uint8), 1537 (distances.row[valid_idx], distances.col[valid_idx]), 1538 ), 1539 shape=(len(values_0), len(values_0)), 1540 ) 1541 else: 1542 distances.data = np.ones(len(distances.data), dtype=np.uint8) 1543 1544 # For remaining dimensions, work only on chunks defined by first dimension pairs 1545 if len(all_values) > 1: 1546 distances_coo = distances.tocoo() 1547 valid_pairs = [] 1548 1549 # Process each pair from first dimension 1550 for idx in range(len(distances_coo.data)): 1551 i, j = distances_coo.row[idx], distances_coo.col[idx] 1552 is_valid_pair = True 1553 1554 # Check remaining dimensions for this specific pair 1555 for dim_idx in range(1, len(all_values)): 1556 values = all_values[dim_idx] 1557 val_i, val_j = values[i], values[j] 1558 1559 max_tol = tol[dim_idx] 1560 if relative[dim_idx] is True: 1561 max_tol = tol[dim_idx] * values.max() 1562 1563 distance_ij = abs(val_i - val_j) 1564 1565 # Check if this pair satisfies the tolerance for this dimension 1566 if relative[dim_idx] is True: 1567 if distance_ij > tol[dim_idx] * val_i: 1568 is_valid_pair = False 1569 break 1570 else: 1571 if distance_ij > max_tol: 1572 is_valid_pair = False 1573 break 1574 1575 if is_valid_pair: 1576 valid_pairs.append((i, j)) 1577 1578 # Rebuild distances matrix with only valid pairs 1579 if valid_pairs: 1580 valid_pairs = np.array(valid_pairs) 1581 distances = sparse.coo_matrix( 1582 ( 1583 np.ones(len(valid_pairs), dtype=np.uint8), 1584 (valid_pairs[:, 0], valid_pairs[:, 1]), 1585 ), 1586 shape=(len(values_0), len(values_0)), 1587 ) 1588 else: 1589 # No valid pairs found 1590 distances = sparse.coo_matrix( 1591 (len(values_0), len(values_0)), dtype=np.uint8 1592 ) 1593 1594 return distances 1595 1596 def sparse_upper_star(self, idx, V): 1597 """Sparse implementation of an upper star filtration. 1598 1599 Parameters 1600 ---------- 1601 idx : :obj:`~numpy.array` 1602 Edge indices for each dimension (MxN). 1603 V : :obj:`~numpy.array` 1604 Array of intensity data (Mx1). 1605 Returns 1606 ------- 1607 idx : :obj:`~numpy.array` 1608 Index of filtered points (Mx1). 1609 persistence : :obj:`~numpy.array` 1610 Persistence of each filtered point (Mx1). 
1611 1612 Notes 1613 ----- 1614 This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos 1615 """ 1616 1617 # Invert 1618 V = -1 * V.copy().astype(int) 1619 1620 # Embed indices 1621 V = self.embed_unique_indices(V) 1622 1623 # Connectivity matrix 1624 cmat = KDTree(idx) 1625 cmat = cmat.sparse_distance_matrix(cmat, 1, p=np.inf, output_type="coo_matrix") 1626 cmat.setdiag(1) 1627 cmat = sparse.triu(cmat) 1628 1629 # Pairwise minimums 1630 I, J = cmat.nonzero() 1631 d = np.maximum(V[I], V[J]) 1632 1633 # Delete connectiity matrix 1634 cmat_shape = cmat.shape 1635 del cmat 1636 1637 # Sparse distance matrix 1638 sdm = sparse.coo_matrix((d, (I, J)), shape=cmat_shape) 1639 1640 # Delete pairwise mins 1641 del d, I, J 1642 1643 # Persistence homology 1644 ph = ripser(sdm, distance_matrix=True, maxdim=0)["dgms"][0] 1645 1646 # Bound death values 1647 ph[ph[:, 1] == np.inf, 1] = np.max(V) 1648 1649 # Construct tree to query against 1650 tree = KDTree(V.reshape((-1, 1))) 1651 1652 # Get the indexes of the first nearest neighbor by birth 1653 _, nn = tree.query(ph[:, 0].reshape((-1, 1)), k=1, workers=-1) 1654 1655 return nn, -(ph[:, 0] // 1 - ph[:, 1] // 1) 1656 1657 def check_if_grid(self, data): 1658 """Check if the data are gridded in mz space. 1659 1660 Parameters 1661 ---------- 1662 data : DataFrame 1663 DataFrame containing the mass spectrometry data. Needs to have mz and scan columns. 1664 1665 Returns 1666 ------- 1667 bool 1668 True if the data is gridded in the mz direction, False otherwise. 1669 1670 Notes 1671 ----- 1672 This function is used within the grid_data function and the find_mass_features function and is not intended to be called directly. 1673 """ 1674 # Calculate the difference between consecutive mz values in a single scan 1675 dat_check = data.copy().reset_index(drop=True) 1676 dat_check["mz_diff"] = np.abs(dat_check["mz"].diff()) 1677 mz_diff_min = ( 1678 dat_check.groupby("scan")["mz_diff"].min().min() 1679 ) # within each scan, what is the smallest mz difference between consecutive mz values 1680 1681 # Find the mininum mz difference between mz values in the data; regardless of scan 1682 dat_check_mz = dat_check[["mz"]].drop_duplicates().copy() 1683 dat_check_mz = dat_check_mz.sort_values(by=["mz"]).reset_index(drop=True) 1684 dat_check_mz["mz_diff"] = np.abs(dat_check_mz["mz"].diff()) 1685 1686 # Get minimum mz_diff between mz values in the data 1687 mz_diff_min_raw = dat_check_mz["mz_diff"].min() 1688 1689 # If the minimum mz difference between mz values in the data is less than the minimum mz difference between mz values within a single scan, then the data is not gridded 1690 if mz_diff_min_raw < mz_diff_min: 1691 return False 1692 else: 1693 return True 1694 1695 def grid_data(self, data, attempts=5): 1696 """Grid the data in the mz dimension. 1697 1698 Data must be gridded prior to persistent homology calculations and computing average mass spectrum 1699 1700 Parameters 1701 ---------- 1702 data : DataFrame 1703 The input data containing mz, scan, scan_time, and intensity columns. 1704 attempts : int, optional 1705 The number of attempts to grid the data. Default is 5. 1706 1707 Returns 1708 ------- 1709 DataFrame 1710 The gridded data with mz, scan, scan_time, and intensity columns. 1711 1712 Raises 1713 ------ 1714 ValueError 1715 If gridding fails after the specified number of attempts. 
1716 """ 1717 attempt_i = 0 1718 while attempt_i < attempts: 1719 attempt_i += 1 1720 data = self._grid_data(data) 1721 1722 if self.check_if_grid(data): 1723 return data 1724 1725 if not self.check_if_grid(data): 1726 raise ValueError( 1727 "Gridding failed after " 1728 + str(attempt_i) 1729 + " attempts. Please check the data." 1730 ) 1731 else: 1732 return data 1733 1734 def _grid_data(self, data): 1735 """Internal method to grid the data in the mz dimension. 1736 1737 Notes 1738 ----- 1739 This method is called by the grid_data method and should not be called directly. 1740 It will attempt to grid the data in the mz dimension by creating a grid of mz values based on the minimum mz difference within each scan, 1741 but it does not check if the data is already gridded or if the gridding is successful. 1742 1743 Parameters 1744 ---------- 1745 data : pd.DataFrame or pl.DataFrame 1746 The input data to grid. 1747 1748 Returns 1749 ------- 1750 pd.DataFrame or pl.DataFrame 1751 The data after attempting to grid it in the mz dimension. 1752 """ 1753 # Calculate the difference between consecutive mz values in a single scan for grid spacing 1754 data_w = data.copy().reset_index(drop=True) 1755 data_w["mz_diff"] = np.abs(data_w["mz"].diff()) 1756 mz_diff_min = data_w.groupby("scan")["mz_diff"].min().min() * 0.99999 1757 1758 # Need high intensity mz values first so they are parents in the output pairs stack 1759 dat_mz = data_w[["mz", "intensity"]].sort_values( 1760 by=["intensity"], ascending=False 1761 ) 1762 dat_mz = dat_mz[["mz"]].drop_duplicates().reset_index(drop=True).copy() 1763 1764 # Construct KD tree 1765 tree = KDTree(dat_mz.mz.values.reshape(-1, 1)) 1766 sdm = tree.sparse_distance_matrix(tree, mz_diff_min, output_type="coo_matrix") 1767 sdm = sparse.triu(sdm, k=1) 1768 sdm.data = np.ones_like(sdm.data) 1769 distances = sdm.tocoo() 1770 pairs = np.stack((distances.row, distances.col), axis=1) 1771 1772 # Cull pairs to just get root 1773 to_drop = [] 1774 while len(pairs) > 0: 1775 root_parents = np.setdiff1d(np.unique(pairs[:, 0]), np.unique(pairs[:, 1])) 1776 id_root_parents = np.isin(pairs[:, 0], root_parents) 1777 children_of_roots = np.unique(pairs[id_root_parents, 1]) 1778 to_drop = np.append(to_drop, children_of_roots) 1779 1780 # Set up pairs array for next iteration by removing pairs with children or parents already dropped 1781 pairs = pairs[~np.isin(pairs[:, 1], to_drop), :] 1782 pairs = pairs[~np.isin(pairs[:, 0], to_drop), :] 1783 dat_mz = dat_mz.reset_index(drop=True).drop(index=np.array(to_drop)) 1784 mz_dat_np = ( 1785 dat_mz[["mz"]] 1786 .sort_values(by=["mz"]) 1787 .reset_index(drop=True) 1788 .values.flatten() 1789 ) 1790 1791 # Sort data by mz and recast mz to nearest value in mz_dat_np 1792 data_w = data_w.sort_values(by=["mz"]).reset_index(drop=True).copy() 1793 data_w["mz_new"] = mz_dat_np[find_closest(mz_dat_np, data_w["mz"].values)] 1794 data_w["mz_diff"] = np.abs(data_w["mz"] - data_w["mz_new"]) 1795 1796 # Rename mz_new as mz; drop mz_diff; groupby scan and mz and sum intensity 1797 new_data_w = data_w.rename(columns={"mz": "mz_orig", "mz_new": "mz"}).copy() 1798 new_data_w = ( 1799 new_data_w.drop(columns=["mz_diff", "mz_orig"]) 1800 .groupby(["scan", "mz"])["intensity"] 1801 .sum() 1802 .reset_index() 1803 ) 1804 new_data_w = ( 1805 new_data_w.sort_values(by=["scan", "mz"], ascending=[True, True]) 1806 .reset_index(drop=True) 1807 .copy() 1808 ) 1809 1810 return new_data_w 1811 1812 def find_mass_features_ph(self, ms_level=1, grid=True): 1813 """Find mass 
features within an LCMSBase object using persistent homology. 1814 1815 Assigns the mass_features attribute to the object (a dictionary of LCMSMassFeature objects, keyed by mass feature id) 1816 1817 Parameters 1818 ---------- 1819 ms_level : int, optional 1820 The MS level to use. Default is 1. 1821 grid : bool, optional 1822 If True, will regrid the data before running the persistent homology calculations (after checking if the data is gridded). Default is True. 1823 1824 Raises 1825 ------ 1826 ValueError 1827 If no MS level data is found on the object. 1828 If data is not gridded and grid is False. 1829 1830 Returns 1831 ------- 1832 None, but assigns the mass_features attribute to the object. 1833 1834 Notes 1835 ----- 1836 This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos 1837 """ 1838 # Check that ms_level is a key in self._ms_uprocessed 1839 if ms_level not in self._ms_unprocessed.keys(): 1840 raise ValueError( 1841 "No MS level " 1842 + str(ms_level) 1843 + " data found, did you instantiate with parser specific to MS level?" 1844 ) 1845 1846 # Get ms data 1847 data = self._ms_unprocessed[ms_level].copy() 1848 1849 # Drop rows with missing intensity values and reset index 1850 data = data.dropna(subset=["intensity"]).reset_index(drop=True) 1851 1852 # Threshold data 1853 dims = ["mz", "scan_time"] 1854 threshold = self.parameters.lc_ms.ph_inten_min_rel * data.intensity.max() 1855 persistence_threshold = ( 1856 self.parameters.lc_ms.ph_persis_min_rel * data.intensity.max() 1857 ) 1858 data_thres = data[data["intensity"] > threshold].reset_index(drop=True).copy() 1859 1860 # Check if gridded, if not, grid 1861 gridded_mz = self.check_if_grid(data_thres) 1862 if gridded_mz is False: 1863 if grid is False: 1864 raise ValueError( 1865 "Data are not gridded in mz dimension, try reprocessing with a different params or grid data before running this function" 1866 ) 1867 else: 1868 data_thres = self.grid_data(data_thres) 1869 1870 # Add scan_time 1871 data_thres = data_thres.merge(self.scan_df[["scan", "scan_time"]], on="scan") 1872 # Process in chunks if required 1873 if len(data_thres) > 10000: 1874 return self._find_mass_features_ph_partition( 1875 data_thres, dims, persistence_threshold 1876 ) 1877 else: 1878 # Process all at once 1879 return self._find_mass_features_ph_single( 1880 data_thres, dims, persistence_threshold 1881 ) 1882 1883 def _find_mass_features_ph_single(self, data_thres, dims, persistence_threshold): 1884 """Process all data at once (original logic).""" 1885 # Build factors 1886 factors = { 1887 dim: pd.factorize(data_thres[dim], sort=True)[1].astype(np.float32) 1888 for dim in dims 1889 } 1890 1891 # Build indexes 1892 index = { 1893 dim: np.searchsorted(factors[dim], data_thres[dim]).astype(np.float32) 1894 for dim in factors 1895 } 1896 1897 # Smooth and process 1898 mass_features_df = self._process_partition_ph( 1899 data_thres, index, dims, persistence_threshold 1900 ) 1901 1902 # Roll up within chunk to remove duplicates 1903 mass_features_df = self.roll_up_dataframe( 1904 df=mass_features_df, 1905 sort_by="persistence", 1906 dims=["mz", "scan_time"], 1907 tol=[ 1908 self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel, 1909 self.parameters.lc_ms.mass_feature_cluster_rt_tolerance, 1910 ], 1911 relative=[True, False], 1912 ) 1913 mass_features_df = mass_features_df.reset_index(drop=True) 1914 1915 # Populate mass_features attribute 1916 self._populate_mass_features(mass_features_df) 1917 
1918 def _find_mass_features_ph_partition(self, data_thres, dims, persistence_threshold): 1919 """Partition the persistent homology mass feature detection for large datasets. 1920 1921 This method splits the input data into overlapping scan partitions, processes each partition to detect mass features 1922 using persistent homology, rolls up duplicates within and across partitions, and populates the mass_features attribute. 1923 1924 Parameters 1925 ---------- 1926 data_thres : pd.DataFrame 1927 The thresholded input data containing mass spectrometry information. 1928 dims : list 1929 List of dimension names (e.g., ["mz", "scan_time"]) used for feature detection. 1930 persistence_threshold : float 1931 Minimum persistence value required for a detected mass feature to be retained. 1932 1933 Returns 1934 ------- 1935 None 1936 Populates the mass_features attribute of the object with detected mass features. 1937 """ 1938 all_mass_features = [] 1939 1940 # Split scans into partitions 1941 unique_scans = sorted(data_thres["scan"].unique()) 1942 unique_scans_n = len(unique_scans) 1943 1944 # Calculate partition size in scans based on goal 1945 partition_size_goal = 5000 1946 scans_per_partition = max( 1947 1, partition_size_goal // (len(data_thres) // unique_scans_n) 1948 ) 1949 if scans_per_partition == 0: 1950 scans_per_partition = 1 1951 1952 # Make partitions based on scans, with overlapping in partitioned scans 1953 scan_overlap = 4 1954 partition_scans = [] 1955 for i in range(0, unique_scans_n, scans_per_partition): 1956 start_idx = max(0, i - scan_overlap) 1957 end_idx = min( 1958 unique_scans_n - 1, i + scans_per_partition - 1 + scan_overlap 1959 ) 1960 scans_group = [int(s) for s in unique_scans[start_idx : end_idx + 1]] 1961 partition_scans.append(scans_group) 1962 1963 # Set index to scan for faster filtering 1964 data_thres = data_thres.set_index("scan") 1965 for scans in partition_scans: 1966 # Determine start and end scan for partition, with 5 scans overlap 1967 partition_data = data_thres.loc[scans].reset_index(drop=False).copy() 1968 1969 if len(partition_data) == 0: 1970 continue 1971 1972 # Build factors for this partition 1973 factors = { 1974 dim: pd.factorize(partition_data[dim], sort=True)[1].astype(np.float32) 1975 for dim in dims 1976 } 1977 1978 # Build indexes 1979 index = { 1980 dim: np.searchsorted(factors[dim], partition_data[dim]).astype( 1981 np.float32 1982 ) 1983 for dim in factors 1984 } 1985 1986 # Process partition 1987 partition_features = self._process_partition_ph( 1988 partition_data, index, dims, persistence_threshold 1989 ) 1990 1991 if len(partition_features) == 0: 1992 continue 1993 1994 # Roll up within partition 1995 partition_features = self.roll_up_dataframe( 1996 df=partition_features, 1997 sort_by="persistence", 1998 dims=["mz", "scan_time"], 1999 tol=[ 2000 self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel, 2001 self.parameters.lc_ms.mass_feature_cluster_rt_tolerance, 2002 ], 2003 relative=[True, False], 2004 ) 2005 partition_features = partition_features.reset_index(drop=True) 2006 2007 if len(partition_features) > 0: 2008 all_mass_features.append(partition_features) 2009 2010 # Combine results from all partitions 2011 if all_mass_features: 2012 combined_features = pd.concat(all_mass_features, ignore_index=True) 2013 2014 # Sort by persistence 2015 combined_features = combined_features.sort_values( 2016 by="persistence", ascending=False 2017 ).reset_index(drop=True) 2018 2019 # Remove duplicates from overlapping regions 2020 
combined_features = self.roll_up_dataframe( 2021 df=combined_features, 2022 sort_by="persistence", 2023 dims=["mz", "scan_time"], 2024 tol=[ 2025 self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel, 2026 self.parameters.lc_ms.mass_feature_cluster_rt_tolerance, 2027 ], 2028 relative=[True, False], 2029 ) 2030 2031 # resort by persistence and reset index 2032 combined_features = combined_features.reset_index(drop=True) 2033 2034 # Populate mass_features attribute 2035 self._populate_mass_features(combined_features) 2036 else: 2037 self.mass_features = {} 2038 2039 def _process_partition_ph(self, partition_data, index, dims, persistence_threshold): 2040 """Process a single partition with persistent homology.""" 2041 # Smooth data 2042 iterations = self.parameters.lc_ms.ph_smooth_it 2043 smooth_radius = [ 2044 self.parameters.lc_ms.ph_smooth_radius_mz, 2045 self.parameters.lc_ms.ph_smooth_radius_scan, 2046 ] 2047 2048 index_array = np.vstack([index[dim] for dim in dims]).T 2049 V = partition_data["intensity"].values 2050 resid = np.inf 2051 2052 for i in range(iterations): 2053 # Previous iteration 2054 V_prev = V.copy() 2055 resid_prev = resid 2056 V = self.sparse_mean_filter(index_array, V, radius=smooth_radius) 2057 2058 # Calculate residual with previous iteration 2059 resid = np.sqrt(np.mean(np.square(V - V_prev))) 2060 2061 # Evaluate convergence 2062 if i > 0: 2063 # Percent change in residual 2064 test = np.abs(resid - resid_prev) / resid_prev 2065 2066 # Exit criteria 2067 if test <= 0: 2068 break 2069 2070 # Overwrite values 2071 partition_data = partition_data.copy() 2072 partition_data["intensity"] = V 2073 2074 # Use persistent homology to find regions of interest 2075 pidx, pers = self.sparse_upper_star(index_array, V) 2076 pidx = pidx[pers > 1] 2077 pers = pers[pers > 1] 2078 2079 if len(pidx) == 0: 2080 return pd.DataFrame() 2081 2082 # Get peaks 2083 peaks = partition_data.iloc[pidx, :].reset_index(drop=True) 2084 2085 # Add persistence column 2086 peaks["persistence"] = pers 2087 mass_features = peaks.sort_values( 2088 by="persistence", ascending=False 2089 ).reset_index(drop=True) 2090 2091 # Filter by persistence threshold 2092 mass_features = mass_features.loc[ 2093 mass_features["persistence"] > persistence_threshold, : 2094 ].reset_index(drop=True) 2095 2096 return mass_features 2097 2098 def _populate_mass_features(self, mass_features_df): 2099 """Populate the mass_features attribute from a DataFrame. 2100 2101 Parameters 2102 ---------- 2103 mass_features_df : pd.DataFrame 2104 DataFrame containing mass feature information. 2105 Note that the order of this DataFrame will determine the order of mass features in the mass_features attribute. 2106 2107 Returns 2108 ------- 2109 None, but assigns the mass_features attribute to the object. 
2110 """ 2111 # Rename scan column to apex_scan 2112 mass_features_df = mass_features_df.rename( 2113 columns={"scan": "apex_scan", "scan_time": "retention_time"} 2114 ) 2115 2116 # Populate mass_features attribute 2117 self.mass_features = {} 2118 for row in mass_features_df.itertuples(): 2119 row_dict = mass_features_df.iloc[row.Index].to_dict() 2120 lcms_feature = LCMSMassFeature(self, **row_dict) 2121 self.mass_features[lcms_feature.id] = lcms_feature 2122 2123 if self.parameters.lc_ms.verbose_processing: 2124 print("Found " + str(len(mass_features_df)) + " initial mass features") 2125 2126 def find_mass_features_ph_centroid(self, ms_level=1): 2127 """Find mass features within an LCMSBase object using persistent homology-type approach but with centroided data. 2128 2129 Parameters 2130 ---------- 2131 ms_level : int, optional 2132 The MS level to use. Default is 1. 2133 2134 Raises 2135 ------ 2136 ValueError 2137 If no MS level data is found on the object. 2138 2139 Returns 2140 ------- 2141 None, but assigns the mass_features attribute to the object. 2142 """ 2143 # Check that ms_level is a key in self._ms_uprocessed 2144 if ms_level not in self._ms_unprocessed.keys(): 2145 raise ValueError( 2146 "No MS level " 2147 + str(ms_level) 2148 + " data found, did you instantiate with parser specific to MS level?" 2149 ) 2150 2151 # Work with reference instead of copy 2152 data = self._ms_unprocessed[ms_level] 2153 2154 # Calculate threshold first to avoid multiple operations 2155 max_intensity = data["intensity"].max() 2156 threshold = self.parameters.lc_ms.ph_inten_min_rel * max_intensity 2157 2158 # Create single filter condition and apply to required columns only 2159 valid_mask = data["intensity"].notna() & (data["intensity"] > threshold) 2160 required_cols = ["mz", "intensity", "scan"] 2161 data_thres = data.loc[valid_mask, required_cols].copy() 2162 data_thres["persistence"] = data_thres["intensity"] 2163 2164 # Merge with required scan data 2165 scan_subset = self.scan_df[["scan", "scan_time"]] 2166 mf_df = data_thres.merge(scan_subset, on="scan", how="inner") 2167 del data_thres, scan_subset 2168 2169 # Order by scan_time and then mz to ensure features near in rt are processed together 2170 # It's ok that different scans are in different partitions; we will roll up later 2171 mf_df = mf_df.sort_values( 2172 by=["scan_time", "mz"], ascending=[True, True] 2173 ).reset_index(drop=True) 2174 partition_size = 10000 2175 partitions = [ 2176 mf_df.iloc[i : i + partition_size].reset_index(drop=True) 2177 for i in range(0, len(mf_df), partition_size) 2178 ] 2179 del mf_df 2180 2181 # Run roll_up_dataframe on each partition 2182 rolled_partitions = [] 2183 for part in partitions: 2184 rolled = self.roll_up_dataframe( 2185 df=part, 2186 sort_by="persistence", 2187 dims=["mz", "scan_time"], 2188 tol=[ 2189 self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel, 2190 self.parameters.lc_ms.mass_feature_cluster_rt_tolerance, 2191 ], 2192 relative=[True, False], 2193 ) 2194 rolled_partitions.append(rolled) 2195 del partitions 2196 2197 # Run roll_up_dataframe on the rolled_up partitions to merge features near partition boundaries 2198 2199 # Combine results and run a final roll-up to merge features near partition boundaries 2200 mf_df_final = pd.concat(rolled_partitions, ignore_index=True) 2201 del rolled_partitions 2202 2203 # Reorder by persistence before final roll-up 2204 mf_df_final = mf_df_final.sort_values( 2205 by="persistence", ascending=False 2206 ).reset_index(drop=True) 2207 2208 
mf_df_final = self.roll_up_dataframe( 2209 df=mf_df_final, 2210 sort_by="persistence", 2211 dims=["mz", "scan_time"], 2212 tol=[ 2213 self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel, 2214 self.parameters.lc_ms.mass_feature_cluster_rt_tolerance, 2215 ], 2216 relative=[True, False], 2217 ) 2218 # reset index 2219 mf_df_final = mf_df_final.reset_index(drop=True) 2220 2221 # Combine rename and sort operations 2222 mass_features = ( 2223 mf_df_final.rename( 2224 columns={"scan": "apex_scan", "scan_time": "retention_time"} 2225 ) 2226 .sort_values(by="persistence", ascending=False) 2227 .reset_index(drop=True) 2228 ) 2229 del mf_df_final # Free memory 2230 2231 # Order by persistence and reset index 2232 mass_features = mass_features.sort_values( 2233 by="persistence", ascending=False 2234 ).reset_index(drop=True) 2235 2236 self.mass_features = {} 2237 for idx, row in mass_features.iterrows(): 2238 row_dict = row.to_dict() 2239 lcms_feature = LCMSMassFeature(self, **row_dict) 2240 self.mass_features[lcms_feature.id] = lcms_feature 2241 2242 if self.parameters.lc_ms.verbose_processing: 2243 print("Found " + str(len(mass_features)) + " initial mass features") 2244 2245 def cluster_mass_features(self, drop_children=True, sort_by="persistence"): 2246 """Cluster mass features 2247 2248 Based on their proximity in the mz and scan_time dimensions, priorizies the mass features with the highest persistence. 2249 2250 Parameters 2251 ---------- 2252 drop_children : bool, optional 2253 Whether to drop the mass features that are not cluster parents. Default is True. 2254 sort_by : str, optional 2255 The column to sort the mass features by, this will determine which mass features get rolled up into a parent mass feature. Default is "persistence". 2256 2257 Raises 2258 ------ 2259 ValueError 2260 If no mass features are found. 2261 If too many mass features are found. 2262 2263 Returns 2264 ------- 2265 None if drop_children is True, otherwise returns a list of mass feature ids that are not cluster parents. 2266 """ 2267 if self.mass_features is None: 2268 raise ValueError("No mass features found, run find_mass_features() first") 2269 if len(self.mass_features) > 400000: 2270 raise ValueError( 2271 "Too many mass features of interest found, run find_mass_features() with a higher intensity threshold" 2272 ) 2273 dims = ["mz", "scan_time"] 2274 mf_df_og = self.mass_features_to_df() 2275 mf_df = mf_df_og.copy() 2276 2277 tol = [ 2278 self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel, 2279 self.parameters.lc_ms.mass_feature_cluster_rt_tolerance, 2280 ] # mz, in relative; scan_time in minutes 2281 relative = [True, False] 2282 2283 # Roll up mass features based on their proximity in the declared dimensions 2284 mf_df_new = self.roll_up_dataframe( 2285 df=mf_df, sort_by=sort_by, dims=dims, tol=tol, relative=relative 2286 ) 2287 2288 mf_df["cluster_parent"] = np.where( 2289 np.isin(mf_df.index, mf_df_new.index), True, False 2290 ) 2291 2292 # get mass feature ids of features that are not cluster parents 2293 cluster_daughters = mf_df[~mf_df["cluster_parent"]].index.values 2294 if drop_children is True: 2295 # Drop mass features that are not cluster parents from self 2296 self.mass_features = { 2297 k: v 2298 for k, v in self.mass_features.items() 2299 if k not in cluster_daughters 2300 } 2301 else: 2302 return cluster_daughters
class PHCalculations
Methods for performing calculations related to 2D peak picking via persistent homology on LCMS data.
Notes
This class is intended to be used as a mixin for the LCMSBase class.
Methods
- sparse_mean_filter(idx, V, radius=[0, 1, 1]). Sparse implementation of a mean filter.
- embed_unique_indices(a). Creates an array of indices, sorted by unique element.
- sparse_upper_star(idx, V). Sparse implementation of an upper star filtration.
- check_if_grid(data). Check if the data is gridded in mz space.
- grid_data(data). Grid the data in the mz dimension.
- find_mass_features_ph(ms_level=1, grid=True). Find mass features within an LCMSBase object using persistent homology.
- cluster_mass_features(drop_children=True). Cluster regions of interest.
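For orientation, a minimal end-to-end sketch of the public workflow is given below. It assumes lcms_obj is an LCMSBase-derived object whose _ms_unprocessed data and scan_df have already been populated by a CoreMS parser; the parameter values are illustrative only.

# Relative intensity and persistence thresholds (illustrative values)
lcms_obj.parameters.lc_ms.ph_inten_min_rel = 0.001
lcms_obj.parameters.lc_ms.ph_persis_min_rel = 0.001

# Detect mass features with persistent homology, then collapse near-duplicates
lcms_obj.find_mass_features_ph(ms_level=1, grid=True)
lcms_obj.cluster_mass_features(drop_children=True, sort_by="persistence")

# Dictionary of LCMSMassFeature objects, keyed by mass feature id
print(len(lcms_obj.mass_features))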
1170 @staticmethod 1171 def sparse_mean_filter(idx, V, radius=[0, 1, 1]): 1172 """Sparse implementation of a mean filter. 1173 1174 Parameters 1175 ---------- 1176 idx : :obj:`~numpy.array` 1177 Edge indices for each dimension (MxN). 1178 V : :obj:`~numpy.array` 1179 Array of intensity data (Mx1). 1180 radius : float or list 1181 Radius of the sparse filter in each dimension. Values less than 1182 zero indicate no connectivity in that dimension. 1183 1184 Returns 1185 ------- 1186 :obj:`~numpy.array` 1187 Filtered intensities (Mx1). 1188 1189 Notes 1190 ----- 1191 This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos. 1192 This is a static method. 1193 """ 1194 1195 # Copy indices 1196 idx = idx.copy().astype(V.dtype) 1197 1198 # Scale 1199 for i, r in enumerate(radius): 1200 # Increase inter-index distance 1201 if r < 1: 1202 idx[:, i] *= 2 1203 1204 # Do nothing 1205 elif r == 1: 1206 pass 1207 1208 # Decrease inter-index distance 1209 else: 1210 idx[:, i] /= r 1211 1212 # Connectivity matrix 1213 cmat = KDTree(idx) 1214 cmat = cmat.sparse_distance_matrix(cmat, 1, p=np.inf, output_type="coo_matrix") 1215 cmat.setdiag(1) 1216 1217 # Pair indices 1218 I, J = cmat.nonzero() 1219 1220 # Delete cmat 1221 cmat_shape = cmat.shape 1222 del cmat 1223 1224 # Sum over columns 1225 V_sum = sparse.bsr_matrix( 1226 (V[J], (I, I)), shape=cmat_shape, dtype=V.dtype 1227 ).diagonal(0) 1228 1229 # Count over columns 1230 V_count = sparse.bsr_matrix( 1231 (np.ones_like(J), (I, I)), shape=cmat_shape, dtype=V.dtype 1232 ).diagonal(0) 1233 1234 return V_sum / V_count
Sparse implementation of a mean filter.
Parameters
- idx (numpy.array): Edge indices for each dimension (MxN).
- V (numpy.array): Array of intensity data (Mx1).
- radius (float or list): Radius of the sparse filter in each dimension. Values less than zero indicate no connectivity in that dimension.
Returns
- numpy.array: Filtered intensities (Mx1).
Notes
This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos. This is a static method.
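To make the neighborhood averaging concrete, here is a small, self-contained sketch of the same idea on synthetic data; it is an independent illustration (a dok matrix is used for the connectivity step purely for simplicity), not a call into the package, and the values are made up.

import numpy as np
from scipy.spatial import KDTree

# Five points on a 1D integer grid, with one intensity spike in the middle
idx = np.arange(5, dtype=float).reshape(-1, 1)
V = np.array([1.0, 1.0, 10.0, 1.0, 1.0])

# Connect each point to neighbors within a Chebyshev distance of 1 (including itself)
tree = KDTree(idx)
cmat = tree.sparse_distance_matrix(tree, 1, p=np.inf, output_type="dok_matrix")
cmat.setdiag(1)
I, J = cmat.nonzero()

# Mean of each point's neighborhood
V_sum = np.zeros_like(V)
V_count = np.zeros_like(V)
np.add.at(V_sum, I, V[J])
np.add.at(V_count, I, 1)
V_smooth = V_sum / V_count  # the center point becomes (1 + 10 + 1) / 3 = 4.0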
1236 @staticmethod 1237 def embed_unique_indices(a): 1238 """Creates an array of indices, sorted by unique element. 1239 1240 Parameters 1241 ---------- 1242 a : :obj:`~numpy.array` 1243 Array of unique elements (Mx1). 1244 1245 Returns 1246 ------- 1247 :obj:`~numpy.array` 1248 Array of indices (Mx1). 1249 1250 Notes 1251 ----- 1252 This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos 1253 This is a static method. 1254 """ 1255 1256 def count_tens(n): 1257 # Count tens 1258 ntens = (n - 1) // 10 1259 1260 while True: 1261 ntens_test = (ntens + n - 1) // 10 1262 1263 if ntens_test == ntens: 1264 return ntens 1265 else: 1266 ntens = ntens_test 1267 1268 def arange_exclude_10s(n): 1269 # How many 10s will there be? 1270 ntens = count_tens(n) 1271 1272 # Base array 1273 arr = np.arange(0, n + ntens) 1274 1275 # Exclude 10s 1276 arr = arr[(arr == 0) | (arr % 10 != 0)][:n] 1277 1278 return arr 1279 1280 # Creates an array of indices, sorted by unique element 1281 idx_sort = np.argsort(a) 1282 idx_unsort = np.argsort(idx_sort) 1283 1284 # Sorts records array so all unique elements are together 1285 sorted_a = a[idx_sort] 1286 1287 # Returns the unique values, the index of the first occurrence, 1288 # and the count for each element 1289 vals, idx_start, count = np.unique( 1290 sorted_a, return_index=True, return_counts=True 1291 ) 1292 1293 # Splits the indices into separate arrays 1294 splits = np.split(idx_sort, idx_start[1:]) 1295 1296 # Creates unique indices for each split 1297 idx_unq = np.concatenate([arange_exclude_10s(len(x)) for x in splits]) 1298 1299 # Reorders according to input array 1300 idx_unq = idx_unq[idx_unsort] 1301 1302 # Magnitude of each index 1303 exp = np.log10( 1304 idx_unq, where=idx_unq > 0, out=np.zeros_like(idx_unq, dtype=np.float64) 1305 ) 1306 idx_unq_mag = np.power(10, np.floor(exp) + 1) 1307 1308 # Result 1309 return a + idx_unq / idx_unq_mag
Creates an array of indices, sorted by unique element.
Parameters
- a (numpy.array): Array of unique elements (Mx1).
Returns
- numpy.array: Array of indices (Mx1).
Notes
This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos. This is a static method.
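A tiny sketch of the tie-breaking behavior (the input values are illustrative):

import numpy as np
from corems.mass_spectra.calc.lc_calc import PHCalculations

# Duplicate values receive small, unique decimal offsets so that ties can be
# broken later; offsets that are multiples of 10 are skipped so the decimal
# part never rolls over into the next magnitude.
a = np.array([5, 5, 3])
PHCalculations.embed_unique_indices(a)  # -> array([5. , 5.1, 3. ])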
1311 @staticmethod 1312 def roll_up_dataframe( 1313 df: pd.DataFrame, 1314 sort_by: str, 1315 tol: list, 1316 relative: list, 1317 dims: list, 1318 memory_opt_threshold: int = 10000, 1319 ): 1320 """Subset data by rolling up into apex in appropriate dimensions. 1321 1322 Parameters 1323 ---------- 1324 data : pd.DataFrame 1325 The input data containing "dims" columns and the "sort_by" column. 1326 sort_by : str 1327 The column to sort the data by, this will determine which mass features get rolled up into a parent mass feature 1328 (i.e., the mass feature with the highest value in the sort_by column). 1329 dims : list 1330 A list of dimension names (column names in the data DataFrame) to roll up the mass features by. 1331 tol : list 1332 A list of tolerances for each dimension. The length of the list must match the number of dimensions. 1333 The tolerances can be relative (as a fraction of the maximum value in that dimension) or absolute (in the units of that dimension). 1334 If relative is True, the tolerance will be multiplied by the maximum value in that dimension. 1335 If relative is False, the tolerance will be used as is. 1336 relative : list 1337 A list of booleans indicating whether the tolerance for each dimension is relative (True) or absolute (False). 1338 memory_opt_threshold : int, optional 1339 Minimum number of rows to trigger memory-optimized processing. Default is 10000. 1340 1341 Returns 1342 ------- 1343 pd.DataFrame 1344 A DataFrame with only the rolled up mass features, with the original index and columns. 1345 1346 1347 Raises 1348 ------ 1349 ValueError 1350 If the input data is not a pandas DataFrame. 1351 If the input data does not have columns for each of the dimensions in "dims". 1352 If the length of "dims", "tol", and "relative" do not match. 
1353 """ 1354 og_columns = df.columns.copy() 1355 1356 # Unindex the data, but keep the original index 1357 if df.index.name is not None: 1358 og_index = df.index.name 1359 else: 1360 og_index = "index" 1361 df = df.reset_index(drop=False) 1362 1363 # Sort data by sort_by column, and reindex 1364 df = df.sort_values(by=sort_by, ascending=False).reset_index(drop=True) 1365 1366 # Check that data is a DataFrame and has columns for each of the dims 1367 if not isinstance(df, pd.DataFrame): 1368 raise ValueError("Data must be a pandas DataFrame") 1369 for dim in dims: 1370 if dim not in df.columns: 1371 raise ValueError(f"Data must have a column for {dim}") 1372 if len(dims) != len(tol) or len(dims) != len(relative): 1373 raise ValueError( 1374 "Dimensions, tolerances, and relative flags must be the same length" 1375 ) 1376 1377 # Pre-compute all values arrays 1378 all_values = [df[dim].values for dim in dims] 1379 1380 # Choose processing method based on dataframe size 1381 if len(df) >= memory_opt_threshold: 1382 # Memory-optimized approach for large dataframes 1383 distances = PHCalculations._compute_distances_memory_optimized( 1384 all_values, tol, relative 1385 ) 1386 else: 1387 # Faster approach for smaller dataframes 1388 distances = PHCalculations._compute_distances_original( 1389 all_values, tol, relative 1390 ) 1391 1392 # Process pairs with original logic but memory optimizations 1393 distances = distances.tocoo() 1394 pairs = np.stack((distances.row, distances.col), axis=1) 1395 pairs_df = pd.DataFrame(pairs, columns=["parent", "child"]).set_index("parent") 1396 del distances, pairs # Free memory immediately 1397 1398 to_drop = [] 1399 while not pairs_df.empty: 1400 # Find root_parents and their children (original logic preserved) 1401 root_parents = np.setdiff1d( 1402 np.unique(pairs_df.index.values), np.unique(pairs_df.child.values) 1403 ) 1404 children_of_roots = pairs_df.loc[root_parents, "child"].unique() 1405 to_drop.extend(children_of_roots) # Use extend instead of append 1406 1407 # Remove root_children as possible parents from pairs_df for next iteration 1408 pairs_df = pairs_df.drop(index=children_of_roots, errors="ignore") 1409 pairs_df = pairs_df.reset_index().set_index("child") 1410 # Remove root_children as possible children from pairs_df for next iteration 1411 pairs_df = pairs_df.drop(index=children_of_roots) 1412 1413 # Prepare for next iteration 1414 pairs_df = pairs_df.reset_index().set_index("parent") 1415 1416 # Convert to numpy array for efficient dropping 1417 to_drop = np.array(to_drop) 1418 1419 # Drop mass features that are not cluster parents 1420 df_sub = df.drop(index=to_drop) 1421 1422 # Set index back to og_index and only keep original columns 1423 df_sub = df_sub.set_index(og_index).sort_index()[og_columns] 1424 1425 return df_sub
Subset data by rolling up into apex in appropriate dimensions.
Parameters
- df (pd.DataFrame): The input data containing "dims" columns and the "sort_by" column.
- sort_by (str): The column to sort the data by, this will determine which mass features get rolled up into a parent mass feature (i.e., the mass feature with the highest value in the sort_by column).
- dims (list): A list of dimension names (column names in the data DataFrame) to roll up the mass features by.
- tol (list): A list of tolerances for each dimension. The length of the list must match the number of dimensions. The tolerances can be relative (as a fraction of the maximum value in that dimension) or absolute (in the units of that dimension). If relative is True, the tolerance will be multiplied by the maximum value in that dimension. If relative is False, the tolerance will be used as is.
- relative (list): A list of booleans indicating whether the tolerance for each dimension is relative (True) or absolute (False).
- memory_opt_threshold (int, optional): Minimum number of rows to trigger memory-optimized processing. Default is 10000.
Returns
- pd.DataFrame: A DataFrame with only the rolled up mass features, with the original index and columns.
Raises
- ValueError: If the input data is not a pandas DataFrame. If the input data does not have columns for each of the dimensions in "dims". If the length of "dims", "tol", and "relative" do not match.
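As a usage sketch with made-up values (the tolerances below are illustrative, not the package defaults):

import pandas as pd
from corems.mass_spectra.calc.lc_calc import PHCalculations

# Three candidate features; the first two are within 5 ppm and 0.1 min of each other
df = pd.DataFrame(
    {
        "mz": [200.0000, 200.0005, 350.0000],
        "scan_time": [5.00, 5.05, 12.00],
        "persistence": [1.0e6, 2.0e5, 8.0e5],
    }
)

rolled = PHCalculations.roll_up_dataframe(
    df=df,
    sort_by="persistence",
    dims=["mz", "scan_time"],
    tol=[5e-6, 0.1],  # relative (5 ppm) in mz, absolute (0.1 min) in scan_time
    relative=[True, False],
)
# rolled keeps rows 0 and 2; row 1 is absorbed by its nearby, more persistent neighbor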
sparse_upper_star(idx, V)
Sparse implementation of an upper star filtration.
Parameters
- idx (numpy.array): Edge indices for each dimension (MxN).
- V (numpy.array): Array of intensity data (Mx1).
Returns
- idx (numpy.array): Index of filtered points (Mx1).
- persistence (numpy.array): Persistence of each filtered point (Mx1).
Notes
This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos
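The core idea, independent of this class, is that local maxima of the inverted intensity signal become connected components whose persistence equals their prominence. Below is a self-contained sketch on a tiny 1D signal, assuming only that ripser is installed; the values are illustrative.

import numpy as np
from ripser import ripser
from scipy import sparse

V = np.array([1, 5, 2, 8, 1])   # two local maxima, 5 and 8
W = -V.astype(float)            # invert so maxima become early births

# Vertex birth times on the diagonal, edge weights = pairwise max of endpoints
rows = np.concatenate([np.arange(5), np.arange(4)])
cols = np.concatenate([np.arange(5), np.arange(1, 5)])
d = np.maximum(W[rows], W[cols])
sdm = sparse.coo_matrix((d, (rows, cols)), shape=(5, 5))

dgm = ripser(sdm, distance_matrix=True, maxdim=0)["dgms"][0]
# One class is born at -8 and never dies (the global maximum); the class born
# at -5 dies at -2, i.e. the smaller peak has a persistence of 3.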
check_if_grid(data)
Check if the data are gridded in mz space.
Parameters
- data (DataFrame): DataFrame containing the mass spectrometry data. Needs to have mz and scan columns.
Returns
- bool: True if the data is gridded in the mz direction, False otherwise.
Notes
This function is used within the grid_data function and the find_mass_features function and is not intended to be called directly.
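A small sketch of the check on synthetic data; here lcms_obj stands in for an LCMSBase-derived object that uses this mixin, and the values are made up.

import pandas as pd

# Two scans sharing the same mz axis -> gridded
data = pd.DataFrame(
    {"scan": [1, 1, 2, 2], "mz": [100.00, 100.01, 100.00, 100.01]}
)
lcms_obj.check_if_grid(data)  # True

# Nudge one mz value off the shared axis -> no longer gridded
data.loc[3, "mz"] = 100.012
lcms_obj.check_if_grid(data)  # False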
grid_data(data, attempts=5)
Grid the data in the mz dimension.
Data must be gridded prior to persistent homology calculations and computing the average mass spectrum.
Parameters
- data (DataFrame): The input data containing mz, scan, scan_time, and intensity columns.
- attempts (int, optional): The number of attempts to grid the data. Default is 5.
Returns
- DataFrame: The gridded data with mz, scan, scan_time, and intensity columns.
Raises
- ValueError: If gridding fails after the specified number of attempts.
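The typical check-then-grid pattern looks like the sketch below, where lcms_obj is a hypothetical LCMSBase-derived object and data is a DataFrame with scan, mz, scan_time, and intensity columns:

if not lcms_obj.check_if_grid(data):
    data = lcms_obj.grid_data(data, attempts=5)

# grid_data re-checks internally and raises ValueError if gridding keeps failing,
# so at this point the data can be assumed gridded in the mz dimension.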
find_mass_features_ph(ms_level=1, grid=True)
Find mass features within an LCMSBase object using persistent homology.
Assigns the mass_features attribute to the object (a dictionary of LCMSMassFeature objects, keyed by mass feature id).
Parameters
- ms_level (int, optional): The MS level to use. Default is 1.
- grid (bool, optional): If True, will regrid the data before running the persistent homology calculations (after checking if the data is gridded). Default is True.
Raises
- ValueError: If no MS level data is found on the object. If data is not gridded and grid is False.
Returns
- None, but assigns the mass_features attribute to the object.
Notes
This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos
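A usage sketch for profile-mode data, assuming lcms_obj is an LCMSBase-derived object with MS1 data in _ms_unprocessed and a populated scan_df; the threshold values are illustrative.

lcms_obj.parameters.lc_ms.ph_inten_min_rel = 0.001   # relative intensity threshold
lcms_obj.parameters.lc_ms.ph_persis_min_rel = 0.001  # relative persistence threshold

try:
    # Raises ValueError if the profile data are not gridded in mz
    lcms_obj.find_mass_features_ph(ms_level=1, grid=False)
except ValueError:
    # Let the method regrid the data before the persistent homology step
    lcms_obj.find_mass_features_ph(ms_level=1, grid=True)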
find_mass_features_ph_centroid(ms_level=1)
Find mass features within an LCMSBase object using a persistent homology-type approach, but with centroided data.
Parameters
- ms_level (int, optional): The MS level to use. Default is 1.
Raises
- ValueError: If no MS level data is found on the object.
Returns
- None, but assigns the mass_features attribute to the object.
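For centroided data the call is simpler, since intensity is used directly as persistence and no gridding or smoothing is performed; lcms_obj and the threshold below are illustrative.

lcms_obj.parameters.lc_ms.ph_inten_min_rel = 0.001  # relative intensity cutoff
lcms_obj.find_mass_features_ph_centroid(ms_level=1)
print(len(lcms_obj.mass_features))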
cluster_mass_features(drop_children=True, sort_by="persistence")
Cluster mass features based on their proximity in the mz and scan_time dimensions, prioritizing the mass features with the highest persistence.
Parameters
- drop_children (bool, optional): Whether to drop the mass features that are not cluster parents. Default is True.
- sort_by (str, optional): The column to sort the mass features by, this will determine which mass features get rolled up into a parent mass feature. Default is "persistence".
Raises
- ValueError: If no mass features are found. If too many mass features are found.
Returns
- None if drop_children is True, otherwise returns a list of mass feature ids that are not cluster parents.
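A short sketch of both modes, where lcms_obj is a hypothetical LCMSBase-derived object with mass_features already assigned:

# Inspect which features would be rolled up, without modifying the object
daughter_ids = lcms_obj.cluster_mass_features(drop_children=False, sort_by="persistence")

# Or drop the non-parent features in place (the default behavior)
lcms_obj.cluster_mass_features(drop_children=True, sort_by="persistence")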