corems.mass_spectra.calc.lc_calc
1import numpy as np 2import pandas as pd 3from ripser import ripser 4from scipy import sparse 5from scipy.spatial import KDTree 6 7from corems.chroma_peak.factory.chroma_peak_classes import LCMSMassFeature 8from corems.mass_spectra.calc import SignalProcessing as sp 9from corems.mass_spectra.factory.chromat_data import EIC_Data 10from corems.mass_spectrum.input.numpyArray import ms_from_array_profile 11 12 13def find_closest(A, target): 14 """Find the index of closest value in A to each value in target. 15 16 Parameters 17 ---------- 18 A : :obj:`~numpy.array` 19 The array to search (blueprint). A must be sorted. 20 target : :obj:`~numpy.array` 21 The array of values to search for. target must be sorted. 22 23 Returns 24 ------- 25 :obj:`~numpy.array` 26 The indices of the closest values in A to each value in target. 27 """ 28 idx = A.searchsorted(target) 29 idx = np.clip(idx, 1, len(A) - 1) 30 left = A[idx - 1] 31 right = A[idx] 32 idx -= target - left < right - target 33 return idx 34 35 36class LCCalculations: 37 """Methods for performing LC calculations on mass spectra data. 38 39 Notes 40 ----- 41 This class is intended to be used as a mixin for the LCMSBase class. 42 43 Methods 44 ------- 45 * get_max_eic(eic_data). 46 Returns the maximum EIC value from the given EIC data. A static method. 47 * smooth_tic(tic). 48 Smooths the TIC data using the specified smoothing method and settings. 49 * eic_centroid_detector(rt, eic, max_eic). 50 Performs EIC centroid detection on the given EIC data. 51 * find_nearest_scan(rt). 52 Finds the nearest scan to the given retention time. 53 * get_average_mass_spectrum(scan_list, apex_scan, spectrum_mode="profile", ms_level=1, auto_process=True, use_parser=False, perform_checks=True, polarity=None). 54 Returns an averaged mass spectrum object. 55 * find_mass_features(ms_level=1). 56 Find regions of interest for a given MS level (default is MS1). 57 * integrate_mass_features(drop_if_fail=False, ms_level=1). 58 Integrate mass features of interest and extracts EICs. 59 * find_c13_mass_features(). 60 Evaluate mass features and mark likely C13 isotopes. 61 * deconvolute_ms1_mass_features(). 62 Deconvolute mass features' ms1 mass spectra. 63 """ 64 65 @staticmethod 66 def get_max_eic(eic_data: dict): 67 """Returns the maximum EIC value from the given EIC data. 68 69 Notes 70 ----- 71 This is a static method. 72 73 Parameters 74 ---------- 75 eic_data : dict 76 A dictionary containing EIC data. 77 78 Returns 79 ------- 80 float 81 The maximum EIC value. 82 """ 83 max_eic = 0 84 for eic_data in eic_data.values(): 85 ind_max_eic = max(eic_data.get("EIC")) 86 max_eic = ind_max_eic if ind_max_eic > max_eic else max_eic 87 88 return max_eic 89 90 def smooth_tic(self, tic): 91 """Smooths the TIC or EIC data using the specified smoothing method and settings. 92 93 Parameters 94 ---------- 95 tic : numpy.ndarray 96 The TIC (or EIC) data to be smoothed. 97 98 Returns 99 ------- 100 numpy.ndarray 101 The smoothed TIC data. 102 """ 103 implemented_smooth_method = self.parameters.lc_ms.implemented_smooth_method 104 105 pol_order = self.parameters.lc_ms.savgol_pol_order 106 107 window_len = self.parameters.lc_ms.smooth_window 108 109 window = self.parameters.lc_ms.smooth_method 110 111 return sp.smooth_signal( 112 tic, window_len, window, pol_order, implemented_smooth_method 113 ) 114 115 def eic_centroid_detector(self, rt, eic, max_eic, apex_indexes=[]): 116 """Performs EIC centroid detection on the given EIC data. 
117 118 Parameters 119 ---------- 120 rt : numpy.ndarray 121 The retention time data. 122 eic : numpy.ndarray 123 The EIC data. 124 max_eic : float 125 The maximum EIC value. 126 apex_indexes : list, optional 127 The apexes of the EIC peaks. Defaults to [], which means that the apexes will be calculated by the function. 128 129 Returns 130 ------- 131 numpy.ndarray 132 The indexes of left, apex, and right limits as a generator. 133 """ 134 135 max_prominence = self.parameters.lc_ms.peak_max_prominence_percent 136 137 max_height = self.parameters.lc_ms.peak_height_max_percent 138 139 signal_threshold = self.parameters.lc_ms.eic_signal_threshold 140 141 min_peak_datapoints = self.parameters.lc_ms.min_peak_datapoints 142 143 peak_derivative_threshold = self.parameters.lc_ms.peak_derivative_threshold 144 145 include_indexes = sp.peak_picking_first_derivative( 146 domain=rt, 147 signal=eic, 148 max_height=max_height, 149 max_prominence=max_prominence, 150 max_signal=max_eic, 151 min_peak_datapoints=min_peak_datapoints, 152 peak_derivative_threshold=peak_derivative_threshold, 153 signal_threshold=signal_threshold, 154 correct_baseline=False, 155 plot_res=False, 156 apex_indexes=apex_indexes, 157 ) 158 159 return include_indexes 160 161 def find_nearest_scan(self, rt): 162 """Finds the nearest scan to the given retention time. 163 164 Parameters 165 ---------- 166 rt : float 167 The retention time (in minutes) to find the nearest scan for. 168 169 Returns 170 ------- 171 int 172 The scan number of the nearest scan. 173 """ 174 array_rt = np.array(self.retention_time) 175 176 scan_index = (np.abs(array_rt - rt)).argmin() 177 178 real_scan = self.scans_number[scan_index] 179 180 return real_scan 181 182 def add_peak_metrics(self): 183 """Add peak metrics to the mass features. 184 185 This function calculates the peak metrics for each mass feature and adds them to the mass feature objects. 186 """ 187 # Check that at least some mass features have eic data 188 if not any([mf._eic_data is not None for mf in self.mass_features.values()]): 189 raise ValueError( 190 "No mass features have EIC data. Run integrate_mass_features first." 191 ) 192 193 for mass_feature in self.mass_features.values(): 194 # Check if the mass feature has been integrated 195 if mass_feature._eic_data is not None: 196 # Calculate peak metrics 197 mass_feature.calc_half_height_width() 198 mass_feature.calc_tailing_factor() 199 mass_feature.calc_dispersity_index() 200 201 def get_average_mass_spectrum( 202 self, 203 scan_list, 204 apex_scan, 205 spectrum_mode="profile", 206 ms_level=1, 207 auto_process=True, 208 use_parser=False, 209 perform_checks=True, 210 polarity=None, 211 ms_params=None, 212 ): 213 """Returns an averaged mass spectrum object 214 215 Parameters 216 ---------- 217 scan_list : list 218 List of scan numbers to average. 219 apex_scan : int 220 Number of the apex scan 221 spectrum_mode : str, optional 222 The spectrum mode to use. Defaults to "profile". Not that only "profile" mode is supported for averaging. 223 ms_level : int, optional 224 The MS level to use. Defaults to 1. 225 auto_process : bool, optional 226 If True, the averaged mass spectrum will be auto-processed. Defaults to True. 227 use_parser : bool, optional 228 If True, the mass spectra will be obtained from the parser. Defaults to False. 229 perform_checks : bool, optional 230 If True, the function will check if the data are within the ms_unprocessed dictionary and are the correct mode. Defaults to True. 
Only set to False if you are sure the data are profile, and (if not using the parser) are in the ms_unprocessed dictionary! ms_unprocessed dictionary also must be indexed on scan 231 polarity : int, optional 232 The polarity of the mass spectra (1 or -1). If not set, the polarity will be determined from the dataset. Defaults to None. (fastest if set to -1 or 1) 233 ms_params : MSParameters, optional 234 The mass spectrum parameters to use. If not set (None), the globally set parameters will be used. Defaults to None. 235 236 Returns 237 ------- 238 MassSpectrumProfile 239 The averaged mass spectrum object. 240 241 Raises 242 ------ 243 ValueError 244 If the spectrum mode is not "profile". 245 If the MS level is not found in the unprocessed mass spectra dictionary. 246 If not all scan numbers are found in the unprocessed mass spectra dictionary. 247 """ 248 if perform_checks: 249 if spectrum_mode != "profile": 250 raise ValueError("Averaging only supported for profile mode") 251 252 if polarity is None: 253 # set polarity to -1 if negative mode, 1 if positive mode (for mass spectrum creation) 254 if self.polarity == "negative": 255 polarity = -1 256 elif self.polarity == "positive": 257 polarity = 1 258 else: 259 raise ValueError( 260 "Polarity not set for dataset, must be a set containing either 'positive' or 'negative'" 261 ) 262 263 # if not using_parser, check that scan numbers are in _ms_unprocessed 264 if not use_parser: 265 if perform_checks: 266 # Set index to scan for faster lookup 267 ms_df = ( 268 self._ms_unprocessed[ms_level] 269 .copy() 270 .set_index("scan", drop=False) 271 .sort_index() 272 ) 273 my_ms_df = ms_df.loc[scan_list] 274 # Check that all scan numbers are in the ms_df 275 if not all(np.isin(scan_list, ms_df.index)): 276 raise ValueError( 277 "Not all scan numbers found in the unprocessed mass spectra dictionary" 278 ) 279 else: 280 my_ms_df = ( 281 pd.DataFrame({"scan": scan_list}) 282 .set_index("scan") 283 .join(self._ms_unprocessed[ms_level], how="left") 284 ) 285 286 if use_parser: 287 ms_list = [ 288 self.spectra_parser.get_mass_spectrum_from_scan( 289 x, spectrum_mode=spectrum_mode, auto_process=False 290 ) 291 for x in scan_list 292 ] 293 ms_mz = [x._mz_exp for x in ms_list] 294 ms_int = [x._abundance for x in ms_list] 295 my_ms_df = [] 296 for i in np.arange(len(ms_mz)): 297 my_ms_df.append( 298 pd.DataFrame( 299 {"mz": ms_mz[i], "intensity": ms_int[i], "scan": scan_list[i]} 300 ) 301 ) 302 my_ms_df = pd.concat(my_ms_df) 303 304 if not self.check_if_grid(my_ms_df): 305 my_ms_df = self.grid_data(my_ms_df) 306 307 my_ms_ave = my_ms_df.groupby("mz")["intensity"].sum().reset_index() 308 309 ms = ms_from_array_profile( 310 my_ms_ave.mz, 311 my_ms_ave.intensity, 312 self.file_location, 313 polarity=polarity, 314 auto_process=False, 315 ) 316 317 # Set the mass spectrum parameters, auto-process if auto_process is True, and add to the dataset 318 if ms is not None: 319 if ms_params is not None: 320 ms.parameters = ms_params 321 ms.scan_number = apex_scan 322 if auto_process: 323 ms.process_mass_spec() 324 return ms 325 326 def find_mass_features(self, ms_level=1, grid=True): 327 """Find mass features within an LCMSBase object 328 329 Note that this is a wrapper function that calls the find_mass_features_ph function, but can be extended to support other peak picking methods in the future. 330 331 Parameters 332 ---------- 333 ms_level : int, optional 334 The MS level to use for peak picking Default is 1. 
335 grid : bool, optional 336 If True, will regrid the data before running the persistent homology calculations (after checking if the data is gridded), 337 used for persistent homology peak picking for profile data only. Default is True. 338 339 Raises 340 ------ 341 ValueError 342 If no MS level data is found on the object. 343 If persistent homology peak picking is attempted on non-profile mode data. 344 If data is not gridded and grid is False. 345 If peak picking method is not implemented. 346 347 Returns 348 ------- 349 None, but assigns the mass_features and eics attributes to the object. 350 351 """ 352 pp_method = self.parameters.lc_ms.peak_picking_method 353 354 if pp_method == "persistent homology": 355 msx_scan_df = self.scan_df[self.scan_df["ms_level"] == ms_level] 356 if all(msx_scan_df["ms_format"] == "profile"): 357 self.find_mass_features_ph(ms_level=ms_level, grid=grid) 358 self.cluster_mass_features(drop_children=True, sort_by="persistence") 359 else: 360 raise ValueError( 361 "MS{} scans are not profile mode, which is required for persistent homology peak picking.".format( 362 ms_level 363 ) 364 ) 365 elif pp_method == "centroided_persistent_homology": 366 msx_scan_df = self.scan_df[self.scan_df["ms_level"] == ms_level] 367 if all(msx_scan_df["ms_format"] == "centroid"): 368 self.find_mass_features_ph_centroid(ms_level=ms_level) 369 else: 370 raise ValueError( 371 "MS{} scans are not centroid mode, which is required for persistent homology centroided peak picking.".format( 372 ms_level 373 ) 374 ) 375 else: 376 raise ValueError("Peak picking method not implemented") 377 378 def integrate_mass_features( 379 self, drop_if_fail=True, drop_duplicates=True, ms_level=1 380 ): 381 """Integrate mass features and extract EICs. 382 383 Populates the _eics attribute on the LCMSBase object for each unique mz in the mass_features dataframe and adds data (start_scan, final_scan, area) to the mass_features attribute. 384 385 Parameters 386 ---------- 387 drop_if_fail : bool, optional 388 Whether to drop mass features if the EIC limit calculations fail. 389 Default is True. 390 drop_duplicates : bool, optional 391 Whether to mass features that appear to be duplicates 392 (i.e., mz is similar to another mass feature and limits of the EIC are similar or encapsulating). 393 Default is True. 394 ms_level : int, optional 395 The MS level to use. Default is 1. 396 397 Raises 398 ------ 399 ValueError 400 If no mass features are found. 401 If no MS level data is found for the given MS level (either in data or in the scan data) 402 403 Returns 404 ------- 405 None, but populates the eics attribute on the LCMSBase object and adds data (start_scan, final_scan, area) to the mass_features attribute. 406 407 Notes 408 ----- 409 drop_if_fail is useful for discarding mass features that do not have good shapes, usually due to a detection on a shoulder of a peak or a noisy region (especially if minimal smoothing is used during mass feature detection). 410 """ 411 # Check if there is data 412 if ms_level in self._ms_unprocessed.keys(): 413 raw_data = self._ms_unprocessed[ms_level].copy() 414 else: 415 raise ValueError("No MS level " + str(ms_level) + " data found") 416 if self.mass_features is not None: 417 mf_df = self.mass_features_to_df().copy() 418 else: 419 raise ValueError( 420 "No mass features found, did you run find_mass_features() first?" 
421 ) 422 # Check if mass_spectrum exists on each mass feature 423 if not all( 424 [mf.mass_spectrum is not None for mf in self.mass_features.values()] 425 ): 426 raise ValueError( 427 "Mass spectrum must be associated with each mass feature, did you run add_associated_ms1() first?" 428 ) 429 430 # Subset scan data to only include correct ms_level 431 scan_df_sub = self.scan_df[ 432 self.scan_df["ms_level"] == int(ms_level) 433 ].reset_index(drop=True) 434 if scan_df_sub.empty: 435 raise ValueError("No MS level " + ms_level + " data found in scan data") 436 scan_df_sub = scan_df_sub[["scan", "scan_time"]].copy() 437 438 mzs_to_extract = np.unique(mf_df["mz"].values) 439 mzs_to_extract.sort() 440 441 # Get EICs for each unique mz in mass features list 442 for mz in mzs_to_extract: 443 mz_max = mz + self.parameters.lc_ms.eic_tolerance_ppm * mz / 1e6 444 mz_min = mz - self.parameters.lc_ms.eic_tolerance_ppm * mz / 1e6 445 raw_data_sub = raw_data[ 446 (raw_data["mz"] >= mz_min) & (raw_data["mz"] <= mz_max) 447 ].reset_index(drop=True) 448 raw_data_sub = ( 449 raw_data_sub.groupby(["scan"])["intensity"].sum().reset_index() 450 ) 451 raw_data_sub = scan_df_sub.merge(raw_data_sub, on="scan", how="left") 452 raw_data_sub["intensity"] = raw_data_sub["intensity"].fillna(0) 453 myEIC = EIC_Data( 454 scans=raw_data_sub["scan"].values, 455 time=raw_data_sub["scan_time"].values, 456 eic=raw_data_sub["intensity"].values, 457 ) 458 # Smooth EIC 459 smoothed_eic = self.smooth_tic(myEIC.eic) 460 smoothed_eic[smoothed_eic < 0] = 0 461 myEIC.eic_smoothed = smoothed_eic 462 self.eics[mz] = myEIC 463 464 # Get limits of mass features using EIC centroid detector and integrate 465 mf_df["area"] = np.nan 466 for idx, mass_feature in mf_df.iterrows(): 467 mz = mass_feature.mz 468 apex_scan = mass_feature.apex_scan 469 470 # Pull EIC data and find apex scan index 471 myEIC = self.eics[mz] 472 self.mass_features[idx]._eic_data = myEIC 473 apex_index = np.where(myEIC.scans == apex_scan)[0][0] 474 475 # Find left and right limits of peak using EIC centroid detector, add to EICData 476 centroid_eics = self.eic_centroid_detector( 477 myEIC.time, 478 myEIC.eic_smoothed, 479 mass_feature.intensity * 1.1, 480 apex_indexes=[int(apex_index)], 481 ) 482 l_a_r_scan_idx = [i for i in centroid_eics] 483 if len(l_a_r_scan_idx) > 0: 484 # Calculate number of consecutive scans with intensity > 0 and check if it is above the minimum consecutive scans 485 # Find the number of consecutive non-zero values in the EIC segment 486 mask = myEIC.eic[l_a_r_scan_idx[0][0]:l_a_r_scan_idx[0][2] + 1] > 0 487 # Find the longest run of consecutive True values 488 if np.any(mask): 489 # Find indices where mask changes value 490 diff = np.diff(np.concatenate(([0], mask.astype(int), [0]))) 491 starts = np.where(diff == 1)[0] 492 ends = np.where(diff == -1)[0] 493 consecutive_scans = (ends - starts).max() 494 else: 495 consecutive_scans = 0 496 if consecutive_scans < self.parameters.lc_ms.consecutive_scan_min: 497 self.mass_features.pop(idx) 498 continue 499 # Add start and final scan to mass_features and EICData 500 left_scan, right_scan = ( 501 myEIC.scans[l_a_r_scan_idx[0][0]], 502 myEIC.scans[l_a_r_scan_idx[0][2]], 503 ) 504 mf_scan_apex = [(left_scan, int(apex_scan), right_scan)] 505 myEIC.apexes = myEIC.apexes + mf_scan_apex 506 self.mass_features[idx].start_scan = left_scan 507 self.mass_features[idx].final_scan = right_scan 508 509 # Find area under peak using limits from EIC centroid detector, add to mass_features and EICData 510 area = 
np.trapz( 511 myEIC.eic_smoothed[l_a_r_scan_idx[0][0] : l_a_r_scan_idx[0][2] + 1], 512 myEIC.time[l_a_r_scan_idx[0][0] : l_a_r_scan_idx[0][2] + 1], 513 ) 514 mf_df.at[idx, "area"] = area 515 myEIC.areas = myEIC.areas + [area] 516 self.eics[mz] = myEIC 517 self.mass_features[idx]._area = area 518 else: 519 if drop_if_fail is True: 520 self.mass_features.pop(idx) 521 522 if drop_duplicates: 523 # Prepare mass feature dataframe 524 mf_df = self.mass_features_to_df().copy() 525 526 # For each mass feature, find all mass features within the clustering tolerance ppm and drop if their start and end times are within another mass feature 527 # Kepp the first mass fea 528 for idx, mass_feature in mf_df.iterrows(): 529 mz = mass_feature.mz 530 apex_scan = mass_feature.apex_scan 531 532 mf_df["mz_diff_ppm"] = np.abs(mf_df["mz"] - mz) / mz * 10**6 533 mf_df_sub = mf_df[ 534 mf_df["mz_diff_ppm"] 535 < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel 536 * 10**6 537 ].copy() 538 539 # For all mass features within the clustering tolerance, check if the start and end times are within the start and end times of the mass feature 540 for idx2, mass_feature2 in mf_df_sub.iterrows(): 541 if idx2 != idx: 542 if ( 543 mass_feature2.start_scan >= mass_feature.start_scan 544 and mass_feature2.final_scan <= mass_feature.final_scan 545 ): 546 if idx2 in self.mass_features.keys(): 547 self.mass_features.pop(idx2) 548 549 def find_c13_mass_features(self): 550 """Mark likely C13 isotopes and connect to monoisoitopic mass features. 551 552 Returns 553 ------- 554 None, but populates the monoisotopic_mf_id and isotopologue_type attributes to the indivual LCMSMassFeatures within the mass_features attribute of the LCMSBase object. 555 556 Raises 557 ------ 558 ValueError 559 If no mass features are found. 
560 """ 561 verbose = self.parameters.lc_ms.verbose_processing 562 if verbose: 563 print("evaluating mass features for C13 isotopes") 564 if self.mass_features is None: 565 raise ValueError("No mass features found, run find_mass_features() first") 566 567 # Data prep fo sparse distance matrix 568 dims = ["mz", "scan_time"] 569 mf_df = self.mass_features_to_df().copy() 570 # Drop mass features that have no area (these are likely to be noise) 571 mf_df = mf_df[mf_df["area"].notnull()] 572 mf_df["mf_id"] = mf_df.index.values 573 dims = ["mz", "scan_time"] 574 575 # Sort my ascending mz so we always get the monoisotopic mass first, regardless of the order/intensity of the mass features 576 mf_df = mf_df.sort_values(by=["mz"]).reset_index(drop=True).copy() 577 578 mz_diff = 1.003355 # C13-C12 mass difference 579 tol = [ 580 mf_df["mz"].median() 581 * self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel, 582 self.parameters.lc_ms.mass_feature_cluster_rt_tolerance * 0.5, 583 ] # mz, in relative; scan_time in minutes 584 585 # Compute inter-feature distances 586 distances = None 587 for i in range(len(dims)): 588 # Construct k-d tree 589 values = mf_df[dims[i]].values 590 tree = KDTree(values.reshape(-1, 1)) 591 592 max_tol = tol[i] 593 if dims[i] == "mz": 594 # Maximum absolute tolerance 595 max_tol = mz_diff + tol[i] 596 597 # Compute sparse distance matrix 598 # the larger the max_tol, the slower this operation is 599 sdm = tree.sparse_distance_matrix(tree, max_tol, output_type="coo_matrix") 600 601 # Only consider forward case, exclude diagonal 602 sdm = sparse.triu(sdm, k=1) 603 604 if dims[i] == "mz": 605 min_tol = mz_diff - tol[i] 606 # Get only the ones that are above the min tol 607 idx = sdm.data > min_tol 608 609 # Reconstruct sparse distance matrix 610 sdm = sparse.coo_matrix( 611 (sdm.data[idx], (sdm.row[idx], sdm.col[idx])), 612 shape=(len(values), len(values)), 613 ) 614 615 # Cast as binary matrix 616 sdm.data = np.ones_like(sdm.data) 617 618 # Stack distances 619 if distances is None: 620 distances = sdm 621 else: 622 distances = distances.multiply(sdm) 623 624 # Extract indices of within-tolerance points 625 distances = distances.tocoo() 626 pairs = np.stack((distances.row, distances.col), axis=1) # C12 to C13 pairs 627 628 # Turn pairs (which are index of mf_df) into mf_id and then into two dataframes to join to mf_df 629 pairs_mf = pairs.copy() 630 pairs_mf[:, 0] = mf_df.iloc[pairs[:, 0]].mf_id.values 631 pairs_mf[:, 1] = mf_df.iloc[pairs[:, 1]].mf_id.values 632 633 # Connect monoisotopic masses with isotopologes within mass_features 634 monos = np.setdiff1d(np.unique(pairs_mf[:, 0]), np.unique(pairs_mf[:, 1])) 635 for mono in monos: 636 self.mass_features[mono].monoisotopic_mf_id = mono 637 pairs_iso_df = pd.DataFrame(pairs_mf, columns=["parent", "child"]) 638 while not pairs_iso_df.empty: 639 pairs_iso_df = pairs_iso_df.set_index("parent", drop=False) 640 m1_isos = pairs_iso_df.loc[monos, "child"].unique() 641 for iso in m1_isos: 642 # Set monoisotopic_mf_id and isotopologue_type for isotopologues 643 parent = pairs_mf[pairs_mf[:, 1] == iso, 0] 644 if len(parent) > 1: 645 # Choose the parent that is closest in time to the isotopologue 646 parent_time = [self.mass_features[p].retention_time for p in parent] 647 time_diff = [ 648 np.abs(self.mass_features[iso].retention_time - x) 649 for x in parent_time 650 ] 651 parent = parent[np.argmin(time_diff)] 652 else: 653 parent = parent[0] 654 self.mass_features[iso].monoisotopic_mf_id = self.mass_features[ 655 parent 656 
].monoisotopic_mf_id 657 if self.mass_features[iso].monoisotopic_mf_id is not None: 658 mass_diff = ( 659 self.mass_features[iso].mz 660 - self.mass_features[ 661 self.mass_features[iso].monoisotopic_mf_id 662 ].mz 663 ) 664 self.mass_features[iso].isotopologue_type = "13C" + str( 665 int(round(mass_diff, 0)) 666 ) 667 668 # Drop the mono and iso from the pairs_iso_df 669 pairs_iso_df = pairs_iso_df.drop( 670 index=monos, errors="ignore" 671 ) # Drop pairs where the parent is a child that is a child of a root 672 pairs_iso_df = pairs_iso_df.set_index("child", drop=False) 673 pairs_iso_df = pairs_iso_df.drop(index=m1_isos, errors="ignore") 674 675 if not pairs_iso_df.empty: 676 # Get new monos, recognizing that these are just 13C isotopologues that are connected to other 13C isotopologues to repeat the process 677 monos = np.setdiff1d( 678 np.unique(pairs_iso_df.parent), np.unique(pairs_iso_df.child) 679 ) 680 if verbose: 681 # Report fraction of compounds annotated with isotopes 682 mf_df["c13_flag"] = np.where( 683 np.logical_or( 684 np.isin(mf_df["mf_id"], pairs_mf[:, 0]), 685 np.isin(mf_df["mf_id"], pairs_mf[:, 1]), 686 ), 687 1, 688 0, 689 ) 690 print( 691 str(round(len(mf_df[mf_df["c13_flag"] == 1]) / len(mf_df), ndigits=3)) 692 + " of mass features have or are C13 isotopes" 693 ) 694 695 def deconvolute_ms1_mass_features(self): 696 """Deconvolute MS1 mass features 697 698 Deconvolute mass features ms1 spectrum based on the correlation of all masses within a spectrum over the EIC of the mass features 699 700 Parameters 701 ---------- 702 None 703 704 Returns 705 ------- 706 None, but assigns the _ms_deconvoluted_idx, mass_spectrum_deconvoluted_parent, 707 and associated_mass_features_deconvoluted attributes to the mass features in the 708 mass_features attribute of the LCMSBase object. 709 710 Raises 711 ------ 712 ValueError 713 If no mass features are found, must run find_mass_features() first. 714 If no EICs are found, did you run integrate_mass_features() first? 715 716 """ 717 # Checks for set mass_features and eics 718 if self.mass_features is None: 719 raise ValueError( 720 "No mass features found, did you run find_mass_features() first?" 721 ) 722 723 if self.eics == {}: 724 raise ValueError( 725 "No EICs found, did you run integrate_mass_features() first?" 
726 ) 727 728 if 1 not in self._ms_unprocessed.keys(): 729 raise ValueError("No unprocessed MS1 spectra found.") 730 731 # Prep ms1 data 732 ms1_data = self._ms_unprocessed[1].copy() 733 ms1_data = ms1_data.set_index("scan") 734 735 # Prep mass feature summary 736 mass_feature_df = self.mass_features_to_df() 737 738 # Loop through each mass feature 739 for mf_id, mass_feature in self.mass_features.items(): 740 # Check that the mass_feature.mz attribute == the mz of the mass feature in the mass_feature_df 741 if mass_feature.mz != mass_feature.ms1_peak.mz_exp: 742 continue 743 744 # Get the left and right limits of the EIC of the mass feature 745 l_scan, _, r_scan = mass_feature._eic_data.apexes[0] 746 747 # Pull from the _ms1_unprocessed data the scan range of interest and sort by mz 748 ms1_data_sub = ms1_data.loc[l_scan:r_scan].copy() 749 ms1_data_sub = ms1_data_sub.sort_values(by=["mz"]).reset_index(drop=False) 750 751 # Get the centroided masses of the mass feature 752 mf_mspeak_mzs = mass_feature.mass_spectrum.mz_exp 753 754 # Find the closest mz in the ms1 data to the centroided masses of the mass feature 755 ms1_data_sub["mass_feature_mz"] = mf_mspeak_mzs[ 756 find_closest(mf_mspeak_mzs, ms1_data_sub.mz.values) 757 ] 758 759 # Drop rows with mz_diff > 0.01 between the mass feature mz and the ms1 data mz 760 ms1_data_sub["mz_diff_rel"] = ( 761 np.abs(ms1_data_sub["mass_feature_mz"] - ms1_data_sub["mz"]) 762 / ms1_data_sub["mz"] 763 ) 764 ms1_data_sub = ms1_data_sub[ 765 ms1_data_sub["mz_diff_rel"] 766 < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel 767 ].reset_index(drop=True) 768 769 # Group by mass_feature_mz and scan and sum intensity 770 ms1_data_sub_group = ( 771 ms1_data_sub.groupby(["mass_feature_mz", "scan"])["intensity"] 772 .sum() 773 .reset_index() 774 ) 775 776 # Calculate the correlation of the intensities of the mass feature and the ms1 data (set to 0 if no intensity) 777 corr = ( 778 ms1_data_sub_group.pivot( 779 index="scan", columns="mass_feature_mz", values="intensity" 780 ) 781 .fillna(0) 782 .corr() 783 ) 784 785 # Subset the correlation matrix to only include the masses of the mass feature and those with a correlation > 0.8 786 decon_corr_min = self.parameters.lc_ms.ms1_deconvolution_corr_min 787 788 # Try catch for KeyError in case the mass feature mz is not in the correlation matrix 789 try: 790 corr_subset = corr.loc[mass_feature.mz,] 791 except KeyError: 792 # If the mass feature mz is not in the correlation matrix, skip to the next mass feature 793 continue 794 795 corr_subset = corr_subset[corr_subset > decon_corr_min] 796 797 # Get the masses from the mass spectrum that are the result of the deconvolution 798 mzs_decon = corr_subset.index.values 799 800 # Get the indices of the mzs_decon in mass_feature.mass_spectrum.mz_exp and assign to the mass feature 801 mzs_decon_idx = [ 802 id 803 for id, mz in enumerate(mass_feature.mass_spectrum.mz_exp) 804 if mz in mzs_decon 805 ] 806 mass_feature._ms_deconvoluted_idx = mzs_decon_idx 807 808 # Check if the mass feature's ms1 peak is the largest in the deconvoluted mass spectrum 809 if ( 810 mass_feature.ms1_peak.abundance 811 == mass_feature.mass_spectrum.abundance[mzs_decon_idx].max() 812 ): 813 mass_feature.mass_spectrum_deconvoluted_parent = True 814 else: 815 mass_feature.mass_spectrum_deconvoluted_parent = False 816 817 # Check for other mass features that are in the deconvoluted mass spectrum and add the deconvoluted mass spectrum to the mass feature 818 # Subset mass_feature_df to only include 
mass features that are within the clustering tolerance 819 mass_feature_df_sub = mass_feature_df[ 820 abs(mass_feature.retention_time - mass_feature_df["scan_time"]) 821 < self.parameters.lc_ms.mass_feature_cluster_rt_tolerance 822 ].copy() 823 # Calculate the mz difference in ppm between the mass feature and the peaks in the deconvoluted mass spectrum 824 mass_feature_df_sub["mz_diff_ppm"] = [ 825 np.abs(mzs_decon - mz).min() / mz * 10**6 826 for mz in mass_feature_df_sub["mz"] 827 ] 828 # Subset mass_feature_df to only include mass features that are within 1 ppm of the deconvoluted masses 829 mfs_associated_decon = mass_feature_df_sub[ 830 mass_feature_df_sub["mz_diff_ppm"] 831 < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel * 10**6 832 ].index.values 833 834 mass_feature.associated_mass_features_deconvoluted = mfs_associated_decon 835 836 837class PHCalculations: 838 """Methods for performing calculations related to 2D peak picking via persistent homology on LCMS data. 839 840 Notes 841 ----- 842 This class is intended to be used as a mixin for the LCMSBase class. 843 844 Methods 845 ------- 846 * sparse_mean_filter(idx, V, radius=[0, 1, 1]). 847 Sparse implementation of a mean filter. 848 * embed_unique_indices(a). 849 Creates an array of indices, sorted by unique element. 850 * sparse_upper_star(idx, V). 851 Sparse implementation of an upper star filtration. 852 * check_if_grid(data). 853 Check if the data is gridded in mz space. 854 * grid_data(data). 855 Grid the data in the mz dimension. 856 * find_mass_features_ph(ms_level=1, grid=True). 857 Find mass features within an LCMSBase object using persistent homology. 858 * cluster_mass_features(drop_children=True). 859 Cluster regions of interest. 860 """ 861 862 @staticmethod 863 def sparse_mean_filter(idx, V, radius=[0, 1, 1]): 864 """Sparse implementation of a mean filter. 865 866 Parameters 867 ---------- 868 idx : :obj:`~numpy.array` 869 Edge indices for each dimension (MxN). 870 V : :obj:`~numpy.array` 871 Array of intensity data (Mx1). 872 radius : float or list 873 Radius of the sparse filter in each dimension. Values less than 874 zero indicate no connectivity in that dimension. 875 876 Returns 877 ------- 878 :obj:`~numpy.array` 879 Filtered intensities (Mx1). 880 881 Notes 882 ----- 883 This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos. 884 This is a static method. 885 """ 886 887 # Copy indices 888 idx = idx.copy().astype(V.dtype) 889 890 # Scale 891 for i, r in enumerate(radius): 892 # Increase inter-index distance 893 if r < 1: 894 idx[:, i] *= 2 895 896 # Do nothing 897 elif r == 1: 898 pass 899 900 # Decrease inter-index distance 901 else: 902 idx[:, i] /= r 903 904 # Connectivity matrix 905 cmat = KDTree(idx) 906 cmat = cmat.sparse_distance_matrix(cmat, 1, p=np.inf, output_type="coo_matrix") 907 cmat.setdiag(1) 908 909 # Pair indices 910 I, J = cmat.nonzero() 911 912 # Delete cmat 913 cmat_shape = cmat.shape 914 del cmat 915 916 # Sum over columns 917 V_sum = sparse.bsr_matrix( 918 (V[J], (I, I)), shape=cmat_shape, dtype=V.dtype 919 ).diagonal(0) 920 921 # Count over columns 922 V_count = sparse.bsr_matrix( 923 (np.ones_like(J), (I, I)), shape=cmat_shape, dtype=V.dtype 924 ).diagonal(0) 925 926 return V_sum / V_count 927 928 @staticmethod 929 def embed_unique_indices(a): 930 """Creates an array of indices, sorted by unique element. 931 932 Parameters 933 ---------- 934 a : :obj:`~numpy.array` 935 Array of unique elements (Mx1). 
936 937 Returns 938 ------- 939 :obj:`~numpy.array` 940 Array of indices (Mx1). 941 942 Notes 943 ----- 944 This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos 945 This is a static method. 946 """ 947 948 def count_tens(n): 949 # Count tens 950 ntens = (n - 1) // 10 951 952 while True: 953 ntens_test = (ntens + n - 1) // 10 954 955 if ntens_test == ntens: 956 return ntens 957 else: 958 ntens = ntens_test 959 960 def arange_exclude_10s(n): 961 # How many 10s will there be? 962 ntens = count_tens(n) 963 964 # Base array 965 arr = np.arange(0, n + ntens) 966 967 # Exclude 10s 968 arr = arr[(arr == 0) | (arr % 10 != 0)][:n] 969 970 return arr 971 972 # Creates an array of indices, sorted by unique element 973 idx_sort = np.argsort(a) 974 idx_unsort = np.argsort(idx_sort) 975 976 # Sorts records array so all unique elements are together 977 sorted_a = a[idx_sort] 978 979 # Returns the unique values, the index of the first occurrence, 980 # and the count for each element 981 vals, idx_start, count = np.unique( 982 sorted_a, return_index=True, return_counts=True 983 ) 984 985 # Splits the indices into separate arrays 986 splits = np.split(idx_sort, idx_start[1:]) 987 988 # Creates unique indices for each split 989 idx_unq = np.concatenate([arange_exclude_10s(len(x)) for x in splits]) 990 991 # Reorders according to input array 992 idx_unq = idx_unq[idx_unsort] 993 994 # Magnitude of each index 995 exp = np.log10( 996 idx_unq, where=idx_unq > 0, out=np.zeros_like(idx_unq, dtype=np.float64) 997 ) 998 idx_unq_mag = np.power(10, np.floor(exp) + 1) 999 1000 # Result 1001 return a + idx_unq / idx_unq_mag 1002 1003 @staticmethod 1004 def roll_up_dataframe( 1005 df : pd.DataFrame, 1006 sort_by : str, 1007 tol : list, 1008 relative : list, 1009 dims : list 1010 ): 1011 """Subset data by rolling up into apex in appropriate dimensions. 1012 1013 Parameters 1014 ---------- 1015 data : pd.DataFrame 1016 The input data containing "dims" columns and the "sort_by" column. 1017 sort_by : str 1018 The column to sort the data by, this will determine which mass features get rolled up into a parent mass feature 1019 (i.e., the mass feature with the highest value in the sort_by column). 1020 dims : list 1021 A list of dimension names (column names in the data DataFrame) to roll up the mass features by. 1022 tol : list 1023 A list of tolerances for each dimension. The length of the list must match the number of dimensions. 1024 The tolerances can be relative (as a fraction of the maximum value in that dimension) or absolute (in the units of that dimension). 1025 If relative is True, the tolerance will be multiplied by the maximum value in that dimension. 1026 If relative is False, the tolerance will be used as is. 1027 relative : list 1028 A list of booleans indicating whether the tolerance for each dimension is relative (True) or absolute (False). 1029 1030 Returns 1031 ------- 1032 pd.DataFrame 1033 A DataFrame with only the rolled up mass features, with the original index and columns. 1034 1035 1036 Raises 1037 ------ 1038 ValueError 1039 If the input data is not a pandas DataFrame. 1040 If the input data does not have columns for each of the dimensions in "dims". 1041 If the length of "dims", "tol", and "relative" do not match. 
1042 """ 1043 og_columns = df.columns.copy() 1044 1045 # Unindex the data, but keep the original index 1046 if df.index.name is not None: 1047 og_index = df.index.name 1048 else: 1049 og_index = "index" 1050 df = df.reset_index(drop=False) 1051 1052 # Sort data by sort_by column, and reindex 1053 df = df.sort_values(by=sort_by, ascending=False).reset_index(drop=True) 1054 1055 # Check that data is a DataFrame and has columns for each of the dims 1056 if not isinstance(df, pd.DataFrame): 1057 raise ValueError("Data must be a pandas DataFrame") 1058 for dim in dims: 1059 if dim not in df.columns: 1060 raise ValueError(f"Data must have a column for {dim}") 1061 if len(dims) != len(tol) or len(dims) != len(relative): 1062 raise ValueError( 1063 "Dimensions, tolerances, and relative flags must be the same length" 1064 ) 1065 1066 # Compute inter-feature distances 1067 distances = None 1068 for i in range(len(dims)): 1069 # Construct k-d tree 1070 values = df[dims[i]].values 1071 tree = KDTree(values.reshape(-1, 1)) 1072 1073 max_tol = tol[i] 1074 if relative[i] is True: 1075 # Maximum absolute tolerance 1076 max_tol = tol[i] * values.max() 1077 1078 # Compute sparse distance matrix 1079 # the larger the max_tol, the slower this operation is 1080 sdm = tree.sparse_distance_matrix(tree, max_tol, output_type="coo_matrix") 1081 1082 # Only consider forward case, exclude diagonal 1083 sdm = sparse.triu(sdm, k=1) 1084 1085 # Filter relative distances 1086 if relative[i] is True: 1087 # Compute relative distances 1088 rel_dists = sdm.data / values[sdm.row] # or col? 1089 1090 # Indices of relative distances less than tolerance 1091 idx = rel_dists <= tol[i] 1092 1093 # Reconstruct sparse distance matrix 1094 sdm = sparse.coo_matrix( 1095 (rel_dists[idx], (sdm.row[idx], sdm.col[idx])), 1096 shape=(len(values), len(values)), 1097 ) 1098 1099 # Cast as binary matrix 1100 sdm.data = np.ones_like(sdm.data) 1101 1102 # Stack distances 1103 if distances is None: 1104 distances = sdm 1105 else: 1106 distances = distances.multiply(sdm) 1107 1108 # Extract indices of within-tolerance points 1109 distances = distances.tocoo() 1110 pairs = np.stack((distances.row, distances.col), axis=1) 1111 pairs_df = pd.DataFrame(pairs, columns=["parent", "child"]) 1112 pairs_df = pairs_df.set_index("parent") 1113 1114 to_drop = [] 1115 while not pairs_df.empty: 1116 # Find root_parents and their children 1117 root_parents = np.setdiff1d( 1118 np.unique(pairs_df.index.values), np.unique(pairs_df.child.values) 1119 ) 1120 children_of_roots = pairs_df.loc[root_parents, "child"].unique() 1121 to_drop = np.append(to_drop, children_of_roots) 1122 1123 # Remove root_children as possible parents from pairs_df for next iteration 1124 pairs_df = pairs_df.drop(index=children_of_roots, errors="ignore") 1125 pairs_df = pairs_df.reset_index().set_index("child") 1126 # Remove root_children as possible children from pairs_df for next iteration 1127 pairs_df = pairs_df.drop(index=children_of_roots) 1128 1129 # Prepare for next iteration 1130 pairs_df = pairs_df.reset_index().set_index("parent") 1131 1132 # Drop mass features that are not cluster parents 1133 df_sub = df.drop(index=np.array(to_drop)) 1134 1135 # Set index back to og_index and only keep the columns that are in the original dataframe 1136 df_sub = df_sub.set_index(og_index) 1137 1138 # sort the dataframe by the original index 1139 df_sub = df_sub.sort_index() 1140 df_sub = df_sub[og_columns] 1141 1142 return df_sub 1143 1144 def sparse_upper_star(self, idx, V): 1145 """Sparse 
implementation of an upper star filtration. 1146 1147 Parameters 1148 ---------- 1149 idx : :obj:`~numpy.array` 1150 Edge indices for each dimension (MxN). 1151 V : :obj:`~numpy.array` 1152 Array of intensity data (Mx1). 1153 Returns 1154 ------- 1155 idx : :obj:`~numpy.array` 1156 Index of filtered points (Mx1). 1157 persistence : :obj:`~numpy.array` 1158 Persistence of each filtered point (Mx1). 1159 1160 Notes 1161 ----- 1162 This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos 1163 """ 1164 1165 # Invert 1166 V = -1 * V.copy().astype(int) 1167 1168 # Embed indices 1169 V = self.embed_unique_indices(V) 1170 1171 # Connectivity matrix 1172 cmat = KDTree(idx) 1173 cmat = cmat.sparse_distance_matrix(cmat, 1, p=np.inf, output_type="coo_matrix") 1174 cmat.setdiag(1) 1175 cmat = sparse.triu(cmat) 1176 1177 # Pairwise minimums 1178 I, J = cmat.nonzero() 1179 d = np.maximum(V[I], V[J]) 1180 1181 # Delete connectiity matrix 1182 cmat_shape = cmat.shape 1183 del cmat 1184 1185 # Sparse distance matrix 1186 sdm = sparse.coo_matrix((d, (I, J)), shape=cmat_shape) 1187 1188 # Delete pairwise mins 1189 del d, I, J 1190 1191 # Persistence homology 1192 ph = ripser(sdm, distance_matrix=True, maxdim=0)["dgms"][0] 1193 1194 # Bound death values 1195 ph[ph[:, 1] == np.inf, 1] = np.max(V) 1196 1197 # Construct tree to query against 1198 tree = KDTree(V.reshape((-1, 1))) 1199 1200 # Get the indexes of the first nearest neighbor by birth 1201 _, nn = tree.query(ph[:, 0].reshape((-1, 1)), k=1, workers=-1) 1202 1203 return nn, -(ph[:, 0] // 1 - ph[:, 1] // 1) 1204 1205 def check_if_grid(self, data): 1206 """Check if the data are gridded in mz space. 1207 1208 Parameters 1209 ---------- 1210 data : DataFrame 1211 DataFrame containing the mass spectrometry data. Needs to have mz and scan columns. 1212 1213 Returns 1214 ------- 1215 bool 1216 True if the data is gridded in the mz direction, False otherwise. 1217 1218 Notes 1219 ----- 1220 This function is used within the grid_data function and the find_mass_features function and is not intended to be called directly. 1221 """ 1222 # Calculate the difference between consecutive mz values in a single scan 1223 dat_check = data.copy().reset_index(drop=True) 1224 dat_check["mz_diff"] = np.abs(dat_check["mz"].diff()) 1225 mz_diff_min = ( 1226 dat_check.groupby("scan")["mz_diff"].min().min() 1227 ) # within each scan, what is the smallest mz difference between consecutive mz values 1228 1229 # Find the mininum mz difference between mz values in the data; regardless of scan 1230 dat_check_mz = dat_check[["mz"]].drop_duplicates().copy() 1231 dat_check_mz = dat_check_mz.sort_values(by=["mz"]).reset_index(drop=True) 1232 dat_check_mz["mz_diff"] = np.abs(dat_check_mz["mz"].diff()) 1233 1234 # Get minimum mz_diff between mz values in the data 1235 mz_diff_min_raw = dat_check_mz["mz_diff"].min() 1236 1237 # If the minimum mz difference between mz values in the data is less than the minimum mz difference between mz values within a single scan, then the data is not gridded 1238 if mz_diff_min_raw < mz_diff_min: 1239 return False 1240 else: 1241 return True 1242 1243 def grid_data(self, data, attempts=5): 1244 """Grid the data in the mz dimension. 1245 1246 Data must be gridded prior to persistent homology calculations and computing average mass spectrum 1247 1248 Parameters 1249 ---------- 1250 data : DataFrame 1251 The input data containing mz, scan, scan_time, and intensity columns. 
1252 attempts : int, optional 1253 The number of attempts to grid the data. Default is 5. 1254 1255 Returns 1256 ------- 1257 DataFrame 1258 The gridded data with mz, scan, scan_time, and intensity columns. 1259 1260 Raises 1261 ------ 1262 ValueError 1263 If gridding fails after the specified number of attempts. 1264 """ 1265 attempt_i = 0 1266 while attempt_i < attempts: 1267 attempt_i += 1 1268 data = self._grid_data(data) 1269 1270 if self.check_if_grid(data): 1271 return data 1272 1273 if not self.check_if_grid(data): 1274 raise ValueError( 1275 "Gridding failed after " 1276 + str(attempt_i) 1277 + " attempts. Please check the data." 1278 ) 1279 else: 1280 return data 1281 1282 def _grid_data(self, data): 1283 """Internal method to grid the data in the mz dimension. 1284 1285 Notes 1286 ----- 1287 This method is called by the grid_data method and should not be called directly. 1288 It will attempt to grid the data in the mz dimension by creating a grid of mz values based on the minimum mz difference within each scan, 1289 but it does not check if the data is already gridded or if the gridding is successful. 1290 1291 Parameters 1292 ---------- 1293 data : pd.DataFrame or pl.DataFrame 1294 The input data to grid. 1295 1296 Returns 1297 ------- 1298 pd.DataFrame or pl.DataFrame 1299 The data after attempting to grid it in the mz dimension. 1300 """ 1301 # Calculate the difference between consecutive mz values in a single scan for grid spacing 1302 data_w = data.copy().reset_index(drop=True) 1303 data_w["mz_diff"] = np.abs(data_w["mz"].diff()) 1304 mz_diff_min = data_w.groupby("scan")["mz_diff"].min().min() * 0.99999 1305 1306 # Need high intensity mz values first so they are parents in the output pairs stack 1307 dat_mz = data_w[["mz", "intensity"]].sort_values( 1308 by=["intensity"], ascending=False 1309 ) 1310 dat_mz = dat_mz[["mz"]].drop_duplicates().reset_index(drop=True).copy() 1311 1312 # Construct KD tree 1313 tree = KDTree(dat_mz.mz.values.reshape(-1, 1)) 1314 sdm = tree.sparse_distance_matrix(tree, mz_diff_min, output_type="coo_matrix") 1315 sdm = sparse.triu(sdm, k=1) 1316 sdm.data = np.ones_like(sdm.data) 1317 distances = sdm.tocoo() 1318 pairs = np.stack((distances.row, distances.col), axis=1) 1319 1320 # Cull pairs to just get root 1321 to_drop = [] 1322 while len(pairs) > 0: 1323 root_parents = np.setdiff1d(np.unique(pairs[:, 0]), np.unique(pairs[:, 1])) 1324 id_root_parents = np.isin(pairs[:, 0], root_parents) 1325 children_of_roots = np.unique(pairs[id_root_parents, 1]) 1326 to_drop = np.append(to_drop, children_of_roots) 1327 1328 # Set up pairs array for next iteration by removing pairs with children or parents already dropped 1329 pairs = pairs[~np.isin(pairs[:, 1], to_drop), :] 1330 pairs = pairs[~np.isin(pairs[:, 0], to_drop), :] 1331 dat_mz = dat_mz.reset_index(drop=True).drop(index=np.array(to_drop)) 1332 mz_dat_np = ( 1333 dat_mz[["mz"]] 1334 .sort_values(by=["mz"]) 1335 .reset_index(drop=True) 1336 .values.flatten() 1337 ) 1338 1339 # Sort data by mz and recast mz to nearest value in mz_dat_np 1340 data_w = data_w.sort_values(by=["mz"]).reset_index(drop=True).copy() 1341 data_w["mz_new"] = mz_dat_np[find_closest(mz_dat_np, data_w["mz"].values)] 1342 data_w["mz_diff"] = np.abs(data_w["mz"] - data_w["mz_new"]) 1343 1344 # Rename mz_new as mz; drop mz_diff; groupby scan and mz and sum intensity 1345 new_data_w = data_w.rename(columns={"mz": "mz_orig", "mz_new": "mz"}).copy() 1346 new_data_w = ( 1347 new_data_w.drop(columns=["mz_diff", "mz_orig"]) 1348 
.groupby(["scan", "mz"])["intensity"] 1349 .sum() 1350 .reset_index() 1351 ) 1352 new_data_w = ( 1353 new_data_w.sort_values(by=["scan", "mz"], ascending=[True, True]) 1354 .reset_index(drop=True) 1355 .copy() 1356 ) 1357 1358 return new_data_w 1359 1360 def find_mass_features_ph(self, ms_level=1, grid=True): 1361 """Find mass features within an LCMSBase object using persistent homology. 1362 1363 Assigns the mass_features attribute to the object (a dictionary of LCMSMassFeature objects, keyed by mass feature id) 1364 1365 Parameters 1366 ---------- 1367 ms_level : int, optional 1368 The MS level to use. Default is 1. 1369 grid : bool, optional 1370 If True, will regrid the data before running the persistent homology calculations (after checking if the data is gridded). Default is True. 1371 1372 Raises 1373 ------ 1374 ValueError 1375 If no MS level data is found on the object. 1376 If data is not gridded and grid is False. 1377 1378 Returns 1379 ------- 1380 None, but assigns the mass_features attribute to the object. 1381 1382 Notes 1383 ----- 1384 This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos 1385 """ 1386 # Check that ms_level is a key in self._ms_uprocessed 1387 if ms_level not in self._ms_unprocessed.keys(): 1388 raise ValueError( 1389 "No MS level " 1390 + str(ms_level) 1391 + " data found, did you instantiate with parser specific to MS level?" 1392 ) 1393 1394 # Get ms data 1395 data = self._ms_unprocessed[ms_level].copy() 1396 1397 # Drop rows with missing intensity values and reset index 1398 data = data.dropna(subset=["intensity"]).reset_index(drop=True) 1399 1400 # Threshold data 1401 dims = ["mz", "scan_time"] 1402 threshold = self.parameters.lc_ms.ph_inten_min_rel * data.intensity.max() 1403 data_thres = data[data["intensity"] > threshold].reset_index(drop=True).copy() 1404 1405 # Check if gridded, if not, grid 1406 gridded_mz = self.check_if_grid(data_thres) 1407 if gridded_mz is False: 1408 if grid is False: 1409 raise ValueError( 1410 "Data are not gridded in mz dimension, try reprocessing with a different params or grid data before running this function" 1411 ) 1412 else: 1413 data_thres = self.grid_data(data_thres) 1414 1415 # Add build factors and add scan_time 1416 data_thres = data_thres.merge(self.scan_df[["scan", "scan_time"]], on="scan") 1417 factors = { 1418 dim: pd.factorize(data_thres[dim], sort=True)[1].astype(np.float32) 1419 for dim in dims 1420 } # this is return a float64 index 1421 1422 # Build indexes 1423 index = { 1424 dim: np.searchsorted(factors[dim], data_thres[dim]).astype(np.float32) 1425 for dim in factors 1426 } 1427 1428 # Smooth data 1429 iterations = self.parameters.lc_ms.ph_smooth_it 1430 smooth_radius = [ 1431 self.parameters.lc_ms.ph_smooth_radius_mz, 1432 self.parameters.lc_ms.ph_smooth_radius_scan, 1433 ] # mz, scan_time smoothing radius (in steps) 1434 1435 index = np.vstack([index[dim] for dim in dims]).T 1436 V = data_thres["intensity"].values 1437 resid = np.inf 1438 for i in range(iterations): 1439 # Previous iteration 1440 V_prev = V.copy() 1441 resid_prev = resid 1442 V = self.sparse_mean_filter(index, V, radius=smooth_radius) 1443 1444 # Calculate residual with previous iteration 1445 resid = np.sqrt(np.mean(np.square(V - V_prev))) 1446 1447 # Evaluate convergence 1448 if i > 0: 1449 # Percent change in residual 1450 test = np.abs(resid - resid_prev) / resid_prev 1451 1452 # Exit criteria 1453 if test <= 0: 1454 break 1455 1456 # Overwrite values 1457 
data_thres["intensity"] = V 1458 1459 # Use persistent homology to find regions of interest 1460 pidx, pers = self.sparse_upper_star(index, V) 1461 pidx = pidx[pers > 1] 1462 pers = pers[pers > 1] 1463 1464 # Get peaks 1465 peaks = data_thres.iloc[pidx, :].reset_index(drop=True) 1466 1467 # Add persistence column 1468 peaks["persistence"] = pers 1469 mass_features = peaks.sort_values( 1470 by="persistence", ascending=False 1471 ).reset_index(drop=True) 1472 1473 # Filter by persistence threshold 1474 persistence_threshold = ( 1475 self.parameters.lc_ms.ph_persis_min_rel * data.intensity.max() 1476 ) 1477 mass_features = mass_features.loc[ 1478 mass_features["persistence"] > persistence_threshold, : 1479 ].reset_index(drop=True) 1480 1481 # Rename scan column to apex_scan 1482 mass_features = mass_features.rename( 1483 columns={"scan": "apex_scan", "scan_time": "retention_time"} 1484 ) 1485 1486 # Populate mass_features attribute 1487 self.mass_features = {} 1488 for row in mass_features.itertuples(): 1489 row_dict = mass_features.iloc[row.Index].to_dict() 1490 lcms_feature = LCMSMassFeature(self, **row_dict) 1491 self.mass_features[lcms_feature.id] = lcms_feature 1492 1493 if self.parameters.lc_ms.verbose_processing: 1494 print("Found " + str(len(mass_features)) + " initial mass features") 1495 1496 def find_mass_features_ph_centroid(self, ms_level=1): 1497 """Find mass features within an LCMSBase object using persistent homology-type approach but with centroided data. 1498 1499 Parameters 1500 ---------- 1501 ms_level : int, optional 1502 The MS level to use. Default is 1. 1503 1504 Raises 1505 ------ 1506 ValueError 1507 If no MS level data is found on the object. 1508 1509 Returns 1510 ------- 1511 None, but assigns the mass_features attribute to the object. 1512 """ 1513 # Check that ms_level is a key in self._ms_uprocessed 1514 if ms_level not in self._ms_unprocessed.keys(): 1515 raise ValueError( 1516 "No MS level " 1517 + str(ms_level) 1518 + " data found, did you instantiate with parser specific to MS level?" 
            )

        # Get ms data
        data = self._ms_unprocessed[ms_level].copy()

        # Drop rows with missing intensity values and reset index
        data = data.dropna(subset=["intensity"]).reset_index(drop=True)

        # Threshold data
        threshold = self.parameters.lc_ms.ph_inten_min_rel * data.intensity.max()
        data_thres = data[data["intensity"] > threshold].reset_index(drop=True).copy()
        data_thres['persistence'] = data_thres['intensity']

        # Use this as the starting point for the mass features, adding scan_time
        mf_df = data_thres
        mf_df = mf_df.merge(self.scan_df[["scan", "scan_time"]], on="scan")

        # Define tolerances and dimensions for rolling up
        tol = [
            self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
            self.parameters.lc_ms.mass_feature_cluster_rt_tolerance,
        ]
        relative = [True, False]
        dims = ["mz", "scan_time"]
        mf_df = self.roll_up_dataframe(
            df=mf_df,
            sort_by="persistence",
            dims=dims,
            tol=tol,
            relative=relative
        )

        # Rename scan column to apex_scan
        mass_features = mf_df.rename(
            columns={"scan": "apex_scan", "scan_time": "retention_time"}
        )
        # Sort by persistence and reset index
        mass_features = mass_features.sort_values(
            by="persistence", ascending=False
        ).reset_index(drop=True)

        # Populate mass_features attribute
        self.mass_features = {}
        for row in mass_features.itertuples():
            row_dict = mass_features.iloc[row.Index].to_dict()
            lcms_feature = LCMSMassFeature(self, **row_dict)
            self.mass_features[lcms_feature.id] = lcms_feature

        if self.parameters.lc_ms.verbose_processing:
            print("Found " + str(len(mass_features)) + " initial mass features")

    def cluster_mass_features(self, drop_children=True, sort_by="persistence"):
        """Cluster mass features.

        Based on their proximity in the mz and scan_time dimensions, prioritizes the mass features with the highest persistence.

        Parameters
        ----------
        drop_children : bool, optional
            Whether to drop the mass features that are not cluster parents. Default is True.
        sort_by : str, optional
            The column to sort the mass features by; this determines which mass features get rolled up into a parent mass feature. Default is "persistence".

        Raises
        ------
        ValueError
            If no mass features are found.
            If too many mass features are found.

        Returns
        -------
        None if drop_children is True, otherwise returns a list of mass feature ids that are not cluster parents.
        """
        verbose = self.parameters.lc_ms.verbose_processing

        if self.mass_features is None:
            raise ValueError("No mass features found, run find_mass_features() first")
        if len(self.mass_features) > 400000:
            raise ValueError(
                "Too many mass features of interest found, run find_mass_features() with a higher intensity threshold"
            )
        dims = ["mz", "scan_time"]
        mf_df_og = self.mass_features_to_df()
        mf_df = mf_df_og.copy()

        tol = [
            self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
            self.parameters.lc_ms.mass_feature_cluster_rt_tolerance,
        ]  # mz, in relative; scan_time in minutes
        relative = [True, False]

        # Roll up mass features based on their proximity in the declared dimensions
        mf_df_new = self.roll_up_dataframe(
            df=mf_df,
            sort_by=sort_by,
            dims=dims,
            tol=tol,
            relative=relative
        )

        mf_df["cluster_parent"] = np.where(
            np.isin(mf_df.index, mf_df_new.index), True, False
        )

        # Get mass feature ids of features that are not cluster parents
        cluster_daughters = mf_df[~mf_df["cluster_parent"]].index.values
        if drop_children is True:
            # Drop mass features that are not cluster parents from self
            self.mass_features = {
                k: v
                for k, v in self.mass_features.items()
                if k not in cluster_daughters
            }
        else:
            return cluster_daughters
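To make the roll-up/clustering step above concrete, here is a small self-contained sketch that exercises PHCalculations.roll_up_dataframe (a static method defined above) on synthetic data. The three-row feature table and the tolerance values are illustrative assumptions only, not CoreMS defaults.

import pandas as pd

from corems.mass_spectra.calc.lc_calc import PHCalculations

# Three synthetic features: the first two are near-duplicates in both mz
# (within 5 ppm) and scan_time (within 0.1 min); the third is unrelated.
features = pd.DataFrame(
    {
        "mz": [200.0000, 200.0005, 350.1000],
        "scan_time": [5.00, 5.02, 7.50],
        "persistence": [1000.0, 400.0, 800.0],
    }
)

rolled = PHCalculations.roll_up_dataframe(
    df=features,
    sort_by="persistence",      # the most persistent feature becomes the cluster parent
    dims=["mz", "scan_time"],
    tol=[5e-6, 0.1],            # illustrative tolerances: 5 ppm (relative), 0.1 min (absolute)
    relative=[True, False],
)

# The weaker near-duplicate (persistence 400) rolls up into its more intense
# neighbor, so only the two cluster parents remain in the returned DataFrame.
print(rolled)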
Find the index of closest value in A to each value in target.

Parameters
- A (numpy.array): The array to search (blueprint). A must be sorted.
- target (numpy.array): The array of values to search for. target must be sorted.

Returns
- numpy.array: The indices of the closest values in A to each value in target.
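A quick, self-contained usage sketch (not part of the module itself; the import path is assumed from the page title):

import numpy as np

from corems.mass_spectra.calc.lc_calc import find_closest

# Both arrays must be sorted ascending, as documented above.
A = np.array([100.0, 100.5, 101.0, 102.0])
target = np.array([100.1, 100.8, 101.9])

idx = find_closest(A, target)
print(idx)     # -> [0 2 3]
print(A[idx])  # -> the nearest value in A for each target: 100.0, 101.0, 102.0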
347 348 Returns 349 ------- 350 None, but assigns the mass_features and eics attributes to the object. 351 352 """ 353 pp_method = self.parameters.lc_ms.peak_picking_method 354 355 if pp_method == "persistent homology": 356 msx_scan_df = self.scan_df[self.scan_df["ms_level"] == ms_level] 357 if all(msx_scan_df["ms_format"] == "profile"): 358 self.find_mass_features_ph(ms_level=ms_level, grid=grid) 359 self.cluster_mass_features(drop_children=True, sort_by="persistence") 360 else: 361 raise ValueError( 362 "MS{} scans are not profile mode, which is required for persistent homology peak picking.".format( 363 ms_level 364 ) 365 ) 366 elif pp_method == "centroided_persistent_homology": 367 msx_scan_df = self.scan_df[self.scan_df["ms_level"] == ms_level] 368 if all(msx_scan_df["ms_format"] == "centroid"): 369 self.find_mass_features_ph_centroid(ms_level=ms_level) 370 else: 371 raise ValueError( 372 "MS{} scans are not centroid mode, which is required for persistent homology centroided peak picking.".format( 373 ms_level 374 ) 375 ) 376 else: 377 raise ValueError("Peak picking method not implemented") 378 379 def integrate_mass_features( 380 self, drop_if_fail=True, drop_duplicates=True, ms_level=1 381 ): 382 """Integrate mass features and extract EICs. 383 384 Populates the _eics attribute on the LCMSBase object for each unique mz in the mass_features dataframe and adds data (start_scan, final_scan, area) to the mass_features attribute. 385 386 Parameters 387 ---------- 388 drop_if_fail : bool, optional 389 Whether to drop mass features if the EIC limit calculations fail. 390 Default is True. 391 drop_duplicates : bool, optional 392 Whether to mass features that appear to be duplicates 393 (i.e., mz is similar to another mass feature and limits of the EIC are similar or encapsulating). 394 Default is True. 395 ms_level : int, optional 396 The MS level to use. Default is 1. 397 398 Raises 399 ------ 400 ValueError 401 If no mass features are found. 402 If no MS level data is found for the given MS level (either in data or in the scan data) 403 404 Returns 405 ------- 406 None, but populates the eics attribute on the LCMSBase object and adds data (start_scan, final_scan, area) to the mass_features attribute. 407 408 Notes 409 ----- 410 drop_if_fail is useful for discarding mass features that do not have good shapes, usually due to a detection on a shoulder of a peak or a noisy region (especially if minimal smoothing is used during mass feature detection). 411 """ 412 # Check if there is data 413 if ms_level in self._ms_unprocessed.keys(): 414 raw_data = self._ms_unprocessed[ms_level].copy() 415 else: 416 raise ValueError("No MS level " + str(ms_level) + " data found") 417 if self.mass_features is not None: 418 mf_df = self.mass_features_to_df().copy() 419 else: 420 raise ValueError( 421 "No mass features found, did you run find_mass_features() first?" 422 ) 423 # Check if mass_spectrum exists on each mass feature 424 if not all( 425 [mf.mass_spectrum is not None for mf in self.mass_features.values()] 426 ): 427 raise ValueError( 428 "Mass spectrum must be associated with each mass feature, did you run add_associated_ms1() first?" 
429 ) 430 431 # Subset scan data to only include correct ms_level 432 scan_df_sub = self.scan_df[ 433 self.scan_df["ms_level"] == int(ms_level) 434 ].reset_index(drop=True) 435 if scan_df_sub.empty: 436 raise ValueError("No MS level " + ms_level + " data found in scan data") 437 scan_df_sub = scan_df_sub[["scan", "scan_time"]].copy() 438 439 mzs_to_extract = np.unique(mf_df["mz"].values) 440 mzs_to_extract.sort() 441 442 # Get EICs for each unique mz in mass features list 443 for mz in mzs_to_extract: 444 mz_max = mz + self.parameters.lc_ms.eic_tolerance_ppm * mz / 1e6 445 mz_min = mz - self.parameters.lc_ms.eic_tolerance_ppm * mz / 1e6 446 raw_data_sub = raw_data[ 447 (raw_data["mz"] >= mz_min) & (raw_data["mz"] <= mz_max) 448 ].reset_index(drop=True) 449 raw_data_sub = ( 450 raw_data_sub.groupby(["scan"])["intensity"].sum().reset_index() 451 ) 452 raw_data_sub = scan_df_sub.merge(raw_data_sub, on="scan", how="left") 453 raw_data_sub["intensity"] = raw_data_sub["intensity"].fillna(0) 454 myEIC = EIC_Data( 455 scans=raw_data_sub["scan"].values, 456 time=raw_data_sub["scan_time"].values, 457 eic=raw_data_sub["intensity"].values, 458 ) 459 # Smooth EIC 460 smoothed_eic = self.smooth_tic(myEIC.eic) 461 smoothed_eic[smoothed_eic < 0] = 0 462 myEIC.eic_smoothed = smoothed_eic 463 self.eics[mz] = myEIC 464 465 # Get limits of mass features using EIC centroid detector and integrate 466 mf_df["area"] = np.nan 467 for idx, mass_feature in mf_df.iterrows(): 468 mz = mass_feature.mz 469 apex_scan = mass_feature.apex_scan 470 471 # Pull EIC data and find apex scan index 472 myEIC = self.eics[mz] 473 self.mass_features[idx]._eic_data = myEIC 474 apex_index = np.where(myEIC.scans == apex_scan)[0][0] 475 476 # Find left and right limits of peak using EIC centroid detector, add to EICData 477 centroid_eics = self.eic_centroid_detector( 478 myEIC.time, 479 myEIC.eic_smoothed, 480 mass_feature.intensity * 1.1, 481 apex_indexes=[int(apex_index)], 482 ) 483 l_a_r_scan_idx = [i for i in centroid_eics] 484 if len(l_a_r_scan_idx) > 0: 485 # Calculate number of consecutive scans with intensity > 0 and check if it is above the minimum consecutive scans 486 # Find the number of consecutive non-zero values in the EIC segment 487 mask = myEIC.eic[l_a_r_scan_idx[0][0]:l_a_r_scan_idx[0][2] + 1] > 0 488 # Find the longest run of consecutive True values 489 if np.any(mask): 490 # Find indices where mask changes value 491 diff = np.diff(np.concatenate(([0], mask.astype(int), [0]))) 492 starts = np.where(diff == 1)[0] 493 ends = np.where(diff == -1)[0] 494 consecutive_scans = (ends - starts).max() 495 else: 496 consecutive_scans = 0 497 if consecutive_scans < self.parameters.lc_ms.consecutive_scan_min: 498 self.mass_features.pop(idx) 499 continue 500 # Add start and final scan to mass_features and EICData 501 left_scan, right_scan = ( 502 myEIC.scans[l_a_r_scan_idx[0][0]], 503 myEIC.scans[l_a_r_scan_idx[0][2]], 504 ) 505 mf_scan_apex = [(left_scan, int(apex_scan), right_scan)] 506 myEIC.apexes = myEIC.apexes + mf_scan_apex 507 self.mass_features[idx].start_scan = left_scan 508 self.mass_features[idx].final_scan = right_scan 509 510 # Find area under peak using limits from EIC centroid detector, add to mass_features and EICData 511 area = np.trapz( 512 myEIC.eic_smoothed[l_a_r_scan_idx[0][0] : l_a_r_scan_idx[0][2] + 1], 513 myEIC.time[l_a_r_scan_idx[0][0] : l_a_r_scan_idx[0][2] + 1], 514 ) 515 mf_df.at[idx, "area"] = area 516 myEIC.areas = myEIC.areas + [area] 517 self.eics[mz] = myEIC 518 self.mass_features[idx]._area = 
area 519 else: 520 if drop_if_fail is True: 521 self.mass_features.pop(idx) 522 523 if drop_duplicates: 524 # Prepare mass feature dataframe 525 mf_df = self.mass_features_to_df().copy() 526 527 # For each mass feature, find all mass features within the clustering tolerance ppm and drop if their start and end times are within another mass feature 528 # Kepp the first mass fea 529 for idx, mass_feature in mf_df.iterrows(): 530 mz = mass_feature.mz 531 apex_scan = mass_feature.apex_scan 532 533 mf_df["mz_diff_ppm"] = np.abs(mf_df["mz"] - mz) / mz * 10**6 534 mf_df_sub = mf_df[ 535 mf_df["mz_diff_ppm"] 536 < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel 537 * 10**6 538 ].copy() 539 540 # For all mass features within the clustering tolerance, check if the start and end times are within the start and end times of the mass feature 541 for idx2, mass_feature2 in mf_df_sub.iterrows(): 542 if idx2 != idx: 543 if ( 544 mass_feature2.start_scan >= mass_feature.start_scan 545 and mass_feature2.final_scan <= mass_feature.final_scan 546 ): 547 if idx2 in self.mass_features.keys(): 548 self.mass_features.pop(idx2) 549 550 def find_c13_mass_features(self): 551 """Mark likely C13 isotopes and connect to monoisoitopic mass features. 552 553 Returns 554 ------- 555 None, but populates the monoisotopic_mf_id and isotopologue_type attributes to the indivual LCMSMassFeatures within the mass_features attribute of the LCMSBase object. 556 557 Raises 558 ------ 559 ValueError 560 If no mass features are found. 561 """ 562 verbose = self.parameters.lc_ms.verbose_processing 563 if verbose: 564 print("evaluating mass features for C13 isotopes") 565 if self.mass_features is None: 566 raise ValueError("No mass features found, run find_mass_features() first") 567 568 # Data prep fo sparse distance matrix 569 dims = ["mz", "scan_time"] 570 mf_df = self.mass_features_to_df().copy() 571 # Drop mass features that have no area (these are likely to be noise) 572 mf_df = mf_df[mf_df["area"].notnull()] 573 mf_df["mf_id"] = mf_df.index.values 574 dims = ["mz", "scan_time"] 575 576 # Sort my ascending mz so we always get the monoisotopic mass first, regardless of the order/intensity of the mass features 577 mf_df = mf_df.sort_values(by=["mz"]).reset_index(drop=True).copy() 578 579 mz_diff = 1.003355 # C13-C12 mass difference 580 tol = [ 581 mf_df["mz"].median() 582 * self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel, 583 self.parameters.lc_ms.mass_feature_cluster_rt_tolerance * 0.5, 584 ] # mz, in relative; scan_time in minutes 585 586 # Compute inter-feature distances 587 distances = None 588 for i in range(len(dims)): 589 # Construct k-d tree 590 values = mf_df[dims[i]].values 591 tree = KDTree(values.reshape(-1, 1)) 592 593 max_tol = tol[i] 594 if dims[i] == "mz": 595 # Maximum absolute tolerance 596 max_tol = mz_diff + tol[i] 597 598 # Compute sparse distance matrix 599 # the larger the max_tol, the slower this operation is 600 sdm = tree.sparse_distance_matrix(tree, max_tol, output_type="coo_matrix") 601 602 # Only consider forward case, exclude diagonal 603 sdm = sparse.triu(sdm, k=1) 604 605 if dims[i] == "mz": 606 min_tol = mz_diff - tol[i] 607 # Get only the ones that are above the min tol 608 idx = sdm.data > min_tol 609 610 # Reconstruct sparse distance matrix 611 sdm = sparse.coo_matrix( 612 (sdm.data[idx], (sdm.row[idx], sdm.col[idx])), 613 shape=(len(values), len(values)), 614 ) 615 616 # Cast as binary matrix 617 sdm.data = np.ones_like(sdm.data) 618 619 # Stack distances 620 if distances is 
None: 621 distances = sdm 622 else: 623 distances = distances.multiply(sdm) 624 625 # Extract indices of within-tolerance points 626 distances = distances.tocoo() 627 pairs = np.stack((distances.row, distances.col), axis=1) # C12 to C13 pairs 628 629 # Turn pairs (which are index of mf_df) into mf_id and then into two dataframes to join to mf_df 630 pairs_mf = pairs.copy() 631 pairs_mf[:, 0] = mf_df.iloc[pairs[:, 0]].mf_id.values 632 pairs_mf[:, 1] = mf_df.iloc[pairs[:, 1]].mf_id.values 633 634 # Connect monoisotopic masses with isotopologes within mass_features 635 monos = np.setdiff1d(np.unique(pairs_mf[:, 0]), np.unique(pairs_mf[:, 1])) 636 for mono in monos: 637 self.mass_features[mono].monoisotopic_mf_id = mono 638 pairs_iso_df = pd.DataFrame(pairs_mf, columns=["parent", "child"]) 639 while not pairs_iso_df.empty: 640 pairs_iso_df = pairs_iso_df.set_index("parent", drop=False) 641 m1_isos = pairs_iso_df.loc[monos, "child"].unique() 642 for iso in m1_isos: 643 # Set monoisotopic_mf_id and isotopologue_type for isotopologues 644 parent = pairs_mf[pairs_mf[:, 1] == iso, 0] 645 if len(parent) > 1: 646 # Choose the parent that is closest in time to the isotopologue 647 parent_time = [self.mass_features[p].retention_time for p in parent] 648 time_diff = [ 649 np.abs(self.mass_features[iso].retention_time - x) 650 for x in parent_time 651 ] 652 parent = parent[np.argmin(time_diff)] 653 else: 654 parent = parent[0] 655 self.mass_features[iso].monoisotopic_mf_id = self.mass_features[ 656 parent 657 ].monoisotopic_mf_id 658 if self.mass_features[iso].monoisotopic_mf_id is not None: 659 mass_diff = ( 660 self.mass_features[iso].mz 661 - self.mass_features[ 662 self.mass_features[iso].monoisotopic_mf_id 663 ].mz 664 ) 665 self.mass_features[iso].isotopologue_type = "13C" + str( 666 int(round(mass_diff, 0)) 667 ) 668 669 # Drop the mono and iso from the pairs_iso_df 670 pairs_iso_df = pairs_iso_df.drop( 671 index=monos, errors="ignore" 672 ) # Drop pairs where the parent is a child that is a child of a root 673 pairs_iso_df = pairs_iso_df.set_index("child", drop=False) 674 pairs_iso_df = pairs_iso_df.drop(index=m1_isos, errors="ignore") 675 676 if not pairs_iso_df.empty: 677 # Get new monos, recognizing that these are just 13C isotopologues that are connected to other 13C isotopologues to repeat the process 678 monos = np.setdiff1d( 679 np.unique(pairs_iso_df.parent), np.unique(pairs_iso_df.child) 680 ) 681 if verbose: 682 # Report fraction of compounds annotated with isotopes 683 mf_df["c13_flag"] = np.where( 684 np.logical_or( 685 np.isin(mf_df["mf_id"], pairs_mf[:, 0]), 686 np.isin(mf_df["mf_id"], pairs_mf[:, 1]), 687 ), 688 1, 689 0, 690 ) 691 print( 692 str(round(len(mf_df[mf_df["c13_flag"] == 1]) / len(mf_df), ndigits=3)) 693 + " of mass features have or are C13 isotopes" 694 ) 695 696 def deconvolute_ms1_mass_features(self): 697 """Deconvolute MS1 mass features 698 699 Deconvolute mass features ms1 spectrum based on the correlation of all masses within a spectrum over the EIC of the mass features 700 701 Parameters 702 ---------- 703 None 704 705 Returns 706 ------- 707 None, but assigns the _ms_deconvoluted_idx, mass_spectrum_deconvoluted_parent, 708 and associated_mass_features_deconvoluted attributes to the mass features in the 709 mass_features attribute of the LCMSBase object. 710 711 Raises 712 ------ 713 ValueError 714 If no mass features are found, must run find_mass_features() first. 715 If no EICs are found, did you run integrate_mass_features() first? 
716 717 """ 718 # Checks for set mass_features and eics 719 if self.mass_features is None: 720 raise ValueError( 721 "No mass features found, did you run find_mass_features() first?" 722 ) 723 724 if self.eics == {}: 725 raise ValueError( 726 "No EICs found, did you run integrate_mass_features() first?" 727 ) 728 729 if 1 not in self._ms_unprocessed.keys(): 730 raise ValueError("No unprocessed MS1 spectra found.") 731 732 # Prep ms1 data 733 ms1_data = self._ms_unprocessed[1].copy() 734 ms1_data = ms1_data.set_index("scan") 735 736 # Prep mass feature summary 737 mass_feature_df = self.mass_features_to_df() 738 739 # Loop through each mass feature 740 for mf_id, mass_feature in self.mass_features.items(): 741 # Check that the mass_feature.mz attribute == the mz of the mass feature in the mass_feature_df 742 if mass_feature.mz != mass_feature.ms1_peak.mz_exp: 743 continue 744 745 # Get the left and right limits of the EIC of the mass feature 746 l_scan, _, r_scan = mass_feature._eic_data.apexes[0] 747 748 # Pull from the _ms1_unprocessed data the scan range of interest and sort by mz 749 ms1_data_sub = ms1_data.loc[l_scan:r_scan].copy() 750 ms1_data_sub = ms1_data_sub.sort_values(by=["mz"]).reset_index(drop=False) 751 752 # Get the centroided masses of the mass feature 753 mf_mspeak_mzs = mass_feature.mass_spectrum.mz_exp 754 755 # Find the closest mz in the ms1 data to the centroided masses of the mass feature 756 ms1_data_sub["mass_feature_mz"] = mf_mspeak_mzs[ 757 find_closest(mf_mspeak_mzs, ms1_data_sub.mz.values) 758 ] 759 760 # Drop rows with mz_diff > 0.01 between the mass feature mz and the ms1 data mz 761 ms1_data_sub["mz_diff_rel"] = ( 762 np.abs(ms1_data_sub["mass_feature_mz"] - ms1_data_sub["mz"]) 763 / ms1_data_sub["mz"] 764 ) 765 ms1_data_sub = ms1_data_sub[ 766 ms1_data_sub["mz_diff_rel"] 767 < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel 768 ].reset_index(drop=True) 769 770 # Group by mass_feature_mz and scan and sum intensity 771 ms1_data_sub_group = ( 772 ms1_data_sub.groupby(["mass_feature_mz", "scan"])["intensity"] 773 .sum() 774 .reset_index() 775 ) 776 777 # Calculate the correlation of the intensities of the mass feature and the ms1 data (set to 0 if no intensity) 778 corr = ( 779 ms1_data_sub_group.pivot( 780 index="scan", columns="mass_feature_mz", values="intensity" 781 ) 782 .fillna(0) 783 .corr() 784 ) 785 786 # Subset the correlation matrix to only include the masses of the mass feature and those with a correlation > 0.8 787 decon_corr_min = self.parameters.lc_ms.ms1_deconvolution_corr_min 788 789 # Try catch for KeyError in case the mass feature mz is not in the correlation matrix 790 try: 791 corr_subset = corr.loc[mass_feature.mz,] 792 except KeyError: 793 # If the mass feature mz is not in the correlation matrix, skip to the next mass feature 794 continue 795 796 corr_subset = corr_subset[corr_subset > decon_corr_min] 797 798 # Get the masses from the mass spectrum that are the result of the deconvolution 799 mzs_decon = corr_subset.index.values 800 801 # Get the indices of the mzs_decon in mass_feature.mass_spectrum.mz_exp and assign to the mass feature 802 mzs_decon_idx = [ 803 id 804 for id, mz in enumerate(mass_feature.mass_spectrum.mz_exp) 805 if mz in mzs_decon 806 ] 807 mass_feature._ms_deconvoluted_idx = mzs_decon_idx 808 809 # Check if the mass feature's ms1 peak is the largest in the deconvoluted mass spectrum 810 if ( 811 mass_feature.ms1_peak.abundance 812 == mass_feature.mass_spectrum.abundance[mzs_decon_idx].max() 813 ): 814 
mass_feature.mass_spectrum_deconvoluted_parent = True 815 else: 816 mass_feature.mass_spectrum_deconvoluted_parent = False 817 818 # Check for other mass features that are in the deconvoluted mass spectrum and add the deconvoluted mass spectrum to the mass feature 819 # Subset mass_feature_df to only include mass features that are within the clustering tolerance 820 mass_feature_df_sub = mass_feature_df[ 821 abs(mass_feature.retention_time - mass_feature_df["scan_time"]) 822 < self.parameters.lc_ms.mass_feature_cluster_rt_tolerance 823 ].copy() 824 # Calculate the mz difference in ppm between the mass feature and the peaks in the deconvoluted mass spectrum 825 mass_feature_df_sub["mz_diff_ppm"] = [ 826 np.abs(mzs_decon - mz).min() / mz * 10**6 827 for mz in mass_feature_df_sub["mz"] 828 ] 829 # Subset mass_feature_df to only include mass features that are within 1 ppm of the deconvoluted masses 830 mfs_associated_decon = mass_feature_df_sub[ 831 mass_feature_df_sub["mz_diff_ppm"] 832 < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel * 10**6 833 ].index.values 834 835 mass_feature.associated_mass_features_deconvoluted = mfs_associated_decon
LCCalculations
Methods for performing LC calculations on mass spectra data.
Notes
This class is intended to be used as a mixin for the LCMSBase class.
Methods
- get_max_eic(eic_data). Returns the maximum EIC value from the given EIC data. A static method.
- smooth_tic(tic). Smooths the TIC data using the specified smoothing method and settings.
- eic_centroid_detector(rt, eic, max_eic). Performs EIC centroid detection on the given EIC data.
- find_nearest_scan(rt). Finds the nearest scan to the given retention time.
- get_average_mass_spectrum(scan_list, apex_scan, spectrum_mode="profile", ms_level=1, auto_process=True, use_parser=False, perform_checks=True, polarity=None). Returns an averaged mass spectrum object.
- find_mass_features(ms_level=1). Find regions of interest for a given MS level (default is MS1).
- integrate_mass_features(drop_if_fail=True, drop_duplicates=True, ms_level=1). Integrates mass features of interest and extracts their EICs.
- find_c13_mass_features(). Evaluate mass features and mark likely C13 isotopes.
- deconvolute_ms1_mass_features(). Deconvolute mass features' ms1 mass spectra.
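As a rough orientation, the methods listed above are usually called in sequence on an LCMSBase-derived object. The sketch below is illustrative only: lcms is a placeholder instance with scan data already loaded, and add_associated_ms1 is referenced by this module's error messages but defined elsewhere.

# Hypothetical end-to-end sequence; parameter tuning omitted.
lcms.find_mass_features(ms_level=1)       # locate regions of interest in MS1 data
lcms.add_associated_ms1()                 # attach an MS1 spectrum to each mass feature
lcms.integrate_mass_features(ms_level=1)  # extract EICs, set scan limits and areas
lcms.add_peak_metrics()                   # half-height width, tailing factor, dispersity index
lcms.find_c13_mass_features()             # flag likely 13C isotopologues
lcms.deconvolute_ms1_mass_features()      # correlate co-eluting MS1 masses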
get_max_eic(eic_data)
Returns the maximum EIC value from the given EIC data.
Notes
This is a static method.
Parameters
- eic_data (dict): A dictionary containing EIC data.
Returns
- float: The maximum EIC value.
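For illustration, a small sketch of the expected input shape: a dict whose values expose an "EIC" entry. The numbers are invented.

# Plain dicts are enough for illustration; each value only needs .get("EIC").
eic_data = {
    212.1: {"EIC": [0.0, 3.2e5, 8.9e5, 1.1e5]},
    388.3: {"EIC": [0.0, 5.0e4, 2.4e6, 7.0e5]},
}
print(LCCalculations.get_max_eic(eic_data))  # 2400000.0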
smooth_tic(tic)
Smooths the TIC or EIC data using the specified smoothing method and settings.
Parameters
- tic (numpy.ndarray): The TIC (or EIC) data to be smoothed.
Returns
- numpy.ndarray: The smoothed TIC data.
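The smoothing settings come from the lc_ms parameter group rather than from arguments. A hedged sketch of adjusting them before calling the method; the parameter names are taken from the code above, while the values, the "savgol" setting, and the lcms object are illustrative assumptions.

import numpy as np

lcms.parameters.lc_ms.smooth_method = "savgol"   # example setting, not necessarily the default
lcms.parameters.lc_ms.smooth_window = 5
lcms.parameters.lc_ms.savgol_pol_order = 2

raw_trace = np.array([0.0, 1.0e4, 8.0e4, 2.1e5, 9.0e4, 1.2e4, 0.0])
smoothed = lcms.smooth_tic(raw_trace)  # works for any 1-D TIC or EIC trace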
eic_centroid_detector(rt, eic, max_eic, apex_indexes=[])
Performs EIC centroid detection on the given EIC data.
Parameters
- rt (numpy.ndarray): The retention time data.
- eic (numpy.ndarray): The EIC data.
- max_eic (float): The maximum EIC value.
- apex_indexes (list, optional): The apexes of the EIC peaks. Defaults to [], which means that the apexes will be calculated by the function.
Returns
- generator: The left, apex, and right limit indexes for each detected peak, yielded as index triplets.
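To make the return shape concrete, a hedged sketch of consuming the detector output, mirroring how integrate_mass_features uses it; lcms, rt, and eic are placeholders for an LCMSBase object and aligned 1-D arrays.

peaks = lcms.eic_centroid_detector(rt, eic, max_eic=eic.max())
for limits in peaks:
    # Each item is indexable as (left, apex, right), as used by integrate_mass_features.
    left_idx, apex_idx, right_idx = limits[0], limits[1], limits[2]
    print(rt[left_idx], rt[apex_idx], rt[right_idx])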
find_nearest_scan(rt)
Finds the nearest scan to the given retention time.
Parameters
- rt (float): The retention time (in minutes) to find the nearest scan for.
Returns
- int: The scan number of the nearest scan.
add_peak_metrics()
Add peak metrics to the mass features.
This function calculates the peak metrics for each mass feature and adds them to the mass feature objects.
get_average_mass_spectrum(scan_list, apex_scan, spectrum_mode="profile", ms_level=1, auto_process=True, use_parser=False, perform_checks=True, polarity=None, ms_params=None)
Returns an averaged mass spectrum object.
Parameters
- scan_list (list): List of scan numbers to average.
- apex_scan (int): Number of the apex scan
- spectrum_mode (str, optional): The spectrum mode to use. Defaults to "profile". Note that only "profile" mode is supported for averaging.
- ms_level (int, optional): The MS level to use. Defaults to 1.
- auto_process (bool, optional): If True, the averaged mass spectrum will be auto-processed. Defaults to True.
- use_parser (bool, optional): If True, the mass spectra will be obtained from the parser. Defaults to False.
- perform_checks (bool, optional): If True, the function will check that the data are within the ms_unprocessed dictionary and are in the correct mode. Defaults to True. Only set to False if you are sure the data are in profile mode and, if not using the parser, are present in the ms_unprocessed dictionary, which must be indexed on scan.
- polarity (int, optional): The polarity of the mass spectra (1 or -1). If not set, the polarity will be determined from the dataset; passing -1 or 1 explicitly is fastest. Defaults to None.
- ms_params (MSParameters, optional): The mass spectrum parameters to use. If not set (None), the globally set parameters will be used. Defaults to None.
Returns
- MassSpectrumProfile: The averaged mass spectrum object.
Raises
- ValueError: If the spectrum mode is not "profile". If the MS level is not found in the unprocessed mass spectra dictionary. If not all scan numbers are found in the unprocessed mass spectra dictionary.
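A hedged sketch of averaging a few scans around an apex, using find_nearest_scan to anchor the window; lcms and the retention time are placeholders, and consecutive scan numbering is assumed purely for illustration.

apex_scan = lcms.find_nearest_scan(12.5)                 # scan closest to 12.5 min
scan_window = [apex_scan - 1, apex_scan, apex_scan + 1]  # assumes consecutive scan numbers

avg_ms = lcms.get_average_mass_spectrum(
    scan_list=scan_window,
    apex_scan=apex_scan,
    spectrum_mode="profile",
    ms_level=1,
    auto_process=True,
)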
find_mass_features(ms_level=1, grid=True)
Find mass features within an LCMSBase object.
Note that this is a wrapper function that calls the find_mass_features_ph function, but can be extended to support other peak picking methods in the future.
Parameters
- ms_level (int, optional): The MS level to use for peak picking. Default is 1.
- grid (bool, optional): If True, will regrid the data before running the persistent homology calculations (after checking if the data is gridded), used for persistent homology peak picking for profile data only. Default is True.
Raises
- ValueError: If no MS level data is found on the object. If persistent homology peak picking is attempted on non-profile mode data. If data is not gridded and grid is False. If peak picking method is not implemented.
Returns
- None, but assigns the mass_features and eics attributes to the object.
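A short sketch of selecting the peak-picking method before calling the wrapper; the two method strings are the ones accepted by the code above, and lcms is a placeholder.

# Profile-mode MS1 data: persistent homology on (re)gridded data.
lcms.parameters.lc_ms.peak_picking_method = "persistent homology"
lcms.find_mass_features(ms_level=1, grid=True)

# Centroided MS1 data would instead use:
# lcms.parameters.lc_ms.peak_picking_method = "centroided_persistent_homology"
# lcms.find_mass_features(ms_level=1)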
integrate_mass_features(drop_if_fail=True, drop_duplicates=True, ms_level=1)
Integrate mass features and extract EICs.
Populates the _eics attribute on the LCMSBase object for each unique mz in the mass_features dataframe and adds data (start_scan, final_scan, area) to the mass_features attribute.
Parameters
- drop_if_fail (bool, optional): Whether to drop mass features if the EIC limit calculations fail. Default is True.
- drop_duplicates (bool, optional): Whether to drop mass features that appear to be duplicates (i.e., the mz is similar to another mass feature and the limits of the EIC are similar or encapsulating). Default is True.
- ms_level (int, optional): The MS level to use. Default is 1.
Raises
- ValueError: If no mass features are found. If no MS level data is found for the given MS level (either in data or in the scan data)
Returns
- None, but populates the eics attribute on the LCMSBase object and adds data (start_scan, final_scan, area) to the mass_features attribute.
Notes
drop_if_fail is useful for discarding mass features that do not have good shapes, usually due to a detection on a shoulder of a peak or a noisy region (especially if minimal smoothing is used during mass feature detection).
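A hedged sketch of setting the EIC extraction tolerance and inspecting the results afterwards; the attribute names are those written by the code above, lcms is a placeholder, and the tolerance value is arbitrary.

lcms.parameters.lc_ms.eic_tolerance_ppm = 5   # m/z window for EIC extraction
lcms.integrate_mass_features(drop_if_fail=True, drop_duplicates=True, ms_level=1)

# Each surviving mass feature now carries its integration limits.
for mf_id, mf in lcms.mass_features.items():
    print(mf_id, mf.start_scan, mf.final_scan)

# EICs are keyed by the m/z they were extracted for.
some_mz = next(iter(lcms.eics))
print(lcms.eics[some_mz].areas)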
find_c13_mass_features()
Mark likely C13 isotopes and connect them to their monoisotopic mass features.
Returns
- None, but populates the monoisotopic_mf_id and isotopologue_type attributes of the individual LCMSMassFeatures within the mass_features attribute of the LCMSBase object.
Raises
- ValueError: If no mass features are found.
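A brief sketch of reading back the isotope annotations this method writes onto the mass features; lcms is a placeholder and the attribute names follow the code above.

lcms.find_c13_mass_features()

for mf_id, mf in lcms.mass_features.items():
    iso_type = getattr(mf, "isotopologue_type", None)   # e.g. "13C1", "13C2"
    if iso_type is not None:
        print(mf_id, iso_type, "->", mf.monoisotopic_mf_id)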
Deconvolute MS1 mass features
Deconvolute each mass feature's MS1 mass spectrum based on the correlation of all masses within the spectrum with the EIC of the mass feature.
Parameters
- None
Returns
- None, but assigns the _ms_deconvoluted_idx, mass_spectrum_deconvoluted_parent, and associated_mass_features_deconvoluted attributes to the mass features in the mass_features attribute of the LCMSBase object.
Raises
- ValueError: If no mass features are found (run find_mass_features() first) or if no EICs are found (run integrate_mass_features() first).
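To make the prerequisites above concrete, here is a minimal, hypothetical usage sketch. `lcms_obj` is an assumed name for an LCMSBase-derived object that mixes in LCCalculations; the upstream steps are the ones named in the Raises section, and the exact arguments depend on your data and parameters.

# Hypothetical workflow sketch; `lcms_obj` is an assumed LCMSBase-derived object.
lcms_obj.find_mass_features()             # populate lcms_obj.mass_features
lcms_obj.integrate_mass_features()        # attach EIC data to each mass feature
lcms_obj.deconvolute_ms1_mass_features()  # assigns the attributes listed above

# Inspect which mass features ended up as deconvolution parents
for mf_id, mf in lcms_obj.mass_features.items():
    if mf.mass_spectrum_deconvoluted_parent:
        print(mf_id, mf.associated_mass_features_deconvoluted)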
838class PHCalculations: 839 """Methods for performing calculations related to 2D peak picking via persistent homology on LCMS data. 840 841 Notes 842 ----- 843 This class is intended to be used as a mixin for the LCMSBase class. 844 845 Methods 846 ------- 847 * sparse_mean_filter(idx, V, radius=[0, 1, 1]). 848 Sparse implementation of a mean filter. 849 * embed_unique_indices(a). 850 Creates an array of indices, sorted by unique element. 851 * sparse_upper_star(idx, V). 852 Sparse implementation of an upper star filtration. 853 * check_if_grid(data). 854 Check if the data is gridded in mz space. 855 * grid_data(data). 856 Grid the data in the mz dimension. 857 * find_mass_features_ph(ms_level=1, grid=True). 858 Find mass features within an LCMSBase object using persistent homology. 859 * cluster_mass_features(drop_children=True). 860 Cluster regions of interest. 861 """ 862 863 @staticmethod 864 def sparse_mean_filter(idx, V, radius=[0, 1, 1]): 865 """Sparse implementation of a mean filter. 866 867 Parameters 868 ---------- 869 idx : :obj:`~numpy.array` 870 Edge indices for each dimension (MxN). 871 V : :obj:`~numpy.array` 872 Array of intensity data (Mx1). 873 radius : float or list 874 Radius of the sparse filter in each dimension. Values less than 875 zero indicate no connectivity in that dimension. 876 877 Returns 878 ------- 879 :obj:`~numpy.array` 880 Filtered intensities (Mx1). 881 882 Notes 883 ----- 884 This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos. 885 This is a static method. 886 """ 887 888 # Copy indices 889 idx = idx.copy().astype(V.dtype) 890 891 # Scale 892 for i, r in enumerate(radius): 893 # Increase inter-index distance 894 if r < 1: 895 idx[:, i] *= 2 896 897 # Do nothing 898 elif r == 1: 899 pass 900 901 # Decrease inter-index distance 902 else: 903 idx[:, i] /= r 904 905 # Connectivity matrix 906 cmat = KDTree(idx) 907 cmat = cmat.sparse_distance_matrix(cmat, 1, p=np.inf, output_type="coo_matrix") 908 cmat.setdiag(1) 909 910 # Pair indices 911 I, J = cmat.nonzero() 912 913 # Delete cmat 914 cmat_shape = cmat.shape 915 del cmat 916 917 # Sum over columns 918 V_sum = sparse.bsr_matrix( 919 (V[J], (I, I)), shape=cmat_shape, dtype=V.dtype 920 ).diagonal(0) 921 922 # Count over columns 923 V_count = sparse.bsr_matrix( 924 (np.ones_like(J), (I, I)), shape=cmat_shape, dtype=V.dtype 925 ).diagonal(0) 926 927 return V_sum / V_count 928 929 @staticmethod 930 def embed_unique_indices(a): 931 """Creates an array of indices, sorted by unique element. 932 933 Parameters 934 ---------- 935 a : :obj:`~numpy.array` 936 Array of unique elements (Mx1). 937 938 Returns 939 ------- 940 :obj:`~numpy.array` 941 Array of indices (Mx1). 942 943 Notes 944 ----- 945 This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos 946 This is a static method. 947 """ 948 949 def count_tens(n): 950 # Count tens 951 ntens = (n - 1) // 10 952 953 while True: 954 ntens_test = (ntens + n - 1) // 10 955 956 if ntens_test == ntens: 957 return ntens 958 else: 959 ntens = ntens_test 960 961 def arange_exclude_10s(n): 962 # How many 10s will there be? 
963 ntens = count_tens(n) 964 965 # Base array 966 arr = np.arange(0, n + ntens) 967 968 # Exclude 10s 969 arr = arr[(arr == 0) | (arr % 10 != 0)][:n] 970 971 return arr 972 973 # Creates an array of indices, sorted by unique element 974 idx_sort = np.argsort(a) 975 idx_unsort = np.argsort(idx_sort) 976 977 # Sorts records array so all unique elements are together 978 sorted_a = a[idx_sort] 979 980 # Returns the unique values, the index of the first occurrence, 981 # and the count for each element 982 vals, idx_start, count = np.unique( 983 sorted_a, return_index=True, return_counts=True 984 ) 985 986 # Splits the indices into separate arrays 987 splits = np.split(idx_sort, idx_start[1:]) 988 989 # Creates unique indices for each split 990 idx_unq = np.concatenate([arange_exclude_10s(len(x)) for x in splits]) 991 992 # Reorders according to input array 993 idx_unq = idx_unq[idx_unsort] 994 995 # Magnitude of each index 996 exp = np.log10( 997 idx_unq, where=idx_unq > 0, out=np.zeros_like(idx_unq, dtype=np.float64) 998 ) 999 idx_unq_mag = np.power(10, np.floor(exp) + 1) 1000 1001 # Result 1002 return a + idx_unq / idx_unq_mag 1003 1004 @staticmethod 1005 def roll_up_dataframe( 1006 df : pd.DataFrame, 1007 sort_by : str, 1008 tol : list, 1009 relative : list, 1010 dims : list 1011 ): 1012 """Subset data by rolling up into apex in appropriate dimensions. 1013 1014 Parameters 1015 ---------- 1016 data : pd.DataFrame 1017 The input data containing "dims" columns and the "sort_by" column. 1018 sort_by : str 1019 The column to sort the data by, this will determine which mass features get rolled up into a parent mass feature 1020 (i.e., the mass feature with the highest value in the sort_by column). 1021 dims : list 1022 A list of dimension names (column names in the data DataFrame) to roll up the mass features by. 1023 tol : list 1024 A list of tolerances for each dimension. The length of the list must match the number of dimensions. 1025 The tolerances can be relative (as a fraction of the maximum value in that dimension) or absolute (in the units of that dimension). 1026 If relative is True, the tolerance will be multiplied by the maximum value in that dimension. 1027 If relative is False, the tolerance will be used as is. 1028 relative : list 1029 A list of booleans indicating whether the tolerance for each dimension is relative (True) or absolute (False). 1030 1031 Returns 1032 ------- 1033 pd.DataFrame 1034 A DataFrame with only the rolled up mass features, with the original index and columns. 1035 1036 1037 Raises 1038 ------ 1039 ValueError 1040 If the input data is not a pandas DataFrame. 1041 If the input data does not have columns for each of the dimensions in "dims". 1042 If the length of "dims", "tol", and "relative" do not match. 
1043 """ 1044 og_columns = df.columns.copy() 1045 1046 # Unindex the data, but keep the original index 1047 if df.index.name is not None: 1048 og_index = df.index.name 1049 else: 1050 og_index = "index" 1051 df = df.reset_index(drop=False) 1052 1053 # Sort data by sort_by column, and reindex 1054 df = df.sort_values(by=sort_by, ascending=False).reset_index(drop=True) 1055 1056 # Check that data is a DataFrame and has columns for each of the dims 1057 if not isinstance(df, pd.DataFrame): 1058 raise ValueError("Data must be a pandas DataFrame") 1059 for dim in dims: 1060 if dim not in df.columns: 1061 raise ValueError(f"Data must have a column for {dim}") 1062 if len(dims) != len(tol) or len(dims) != len(relative): 1063 raise ValueError( 1064 "Dimensions, tolerances, and relative flags must be the same length" 1065 ) 1066 1067 # Compute inter-feature distances 1068 distances = None 1069 for i in range(len(dims)): 1070 # Construct k-d tree 1071 values = df[dims[i]].values 1072 tree = KDTree(values.reshape(-1, 1)) 1073 1074 max_tol = tol[i] 1075 if relative[i] is True: 1076 # Maximum absolute tolerance 1077 max_tol = tol[i] * values.max() 1078 1079 # Compute sparse distance matrix 1080 # the larger the max_tol, the slower this operation is 1081 sdm = tree.sparse_distance_matrix(tree, max_tol, output_type="coo_matrix") 1082 1083 # Only consider forward case, exclude diagonal 1084 sdm = sparse.triu(sdm, k=1) 1085 1086 # Filter relative distances 1087 if relative[i] is True: 1088 # Compute relative distances 1089 rel_dists = sdm.data / values[sdm.row] # or col? 1090 1091 # Indices of relative distances less than tolerance 1092 idx = rel_dists <= tol[i] 1093 1094 # Reconstruct sparse distance matrix 1095 sdm = sparse.coo_matrix( 1096 (rel_dists[idx], (sdm.row[idx], sdm.col[idx])), 1097 shape=(len(values), len(values)), 1098 ) 1099 1100 # Cast as binary matrix 1101 sdm.data = np.ones_like(sdm.data) 1102 1103 # Stack distances 1104 if distances is None: 1105 distances = sdm 1106 else: 1107 distances = distances.multiply(sdm) 1108 1109 # Extract indices of within-tolerance points 1110 distances = distances.tocoo() 1111 pairs = np.stack((distances.row, distances.col), axis=1) 1112 pairs_df = pd.DataFrame(pairs, columns=["parent", "child"]) 1113 pairs_df = pairs_df.set_index("parent") 1114 1115 to_drop = [] 1116 while not pairs_df.empty: 1117 # Find root_parents and their children 1118 root_parents = np.setdiff1d( 1119 np.unique(pairs_df.index.values), np.unique(pairs_df.child.values) 1120 ) 1121 children_of_roots = pairs_df.loc[root_parents, "child"].unique() 1122 to_drop = np.append(to_drop, children_of_roots) 1123 1124 # Remove root_children as possible parents from pairs_df for next iteration 1125 pairs_df = pairs_df.drop(index=children_of_roots, errors="ignore") 1126 pairs_df = pairs_df.reset_index().set_index("child") 1127 # Remove root_children as possible children from pairs_df for next iteration 1128 pairs_df = pairs_df.drop(index=children_of_roots) 1129 1130 # Prepare for next iteration 1131 pairs_df = pairs_df.reset_index().set_index("parent") 1132 1133 # Drop mass features that are not cluster parents 1134 df_sub = df.drop(index=np.array(to_drop)) 1135 1136 # Set index back to og_index and only keep the columns that are in the original dataframe 1137 df_sub = df_sub.set_index(og_index) 1138 1139 # sort the dataframe by the original index 1140 df_sub = df_sub.sort_index() 1141 df_sub = df_sub[og_columns] 1142 1143 return df_sub 1144 1145 def sparse_upper_star(self, idx, V): 1146 """Sparse 
implementation of an upper star filtration. 1147 1148 Parameters 1149 ---------- 1150 idx : :obj:`~numpy.array` 1151 Edge indices for each dimension (MxN). 1152 V : :obj:`~numpy.array` 1153 Array of intensity data (Mx1). 1154 Returns 1155 ------- 1156 idx : :obj:`~numpy.array` 1157 Index of filtered points (Mx1). 1158 persistence : :obj:`~numpy.array` 1159 Persistence of each filtered point (Mx1). 1160 1161 Notes 1162 ----- 1163 This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos 1164 """ 1165 1166 # Invert 1167 V = -1 * V.copy().astype(int) 1168 1169 # Embed indices 1170 V = self.embed_unique_indices(V) 1171 1172 # Connectivity matrix 1173 cmat = KDTree(idx) 1174 cmat = cmat.sparse_distance_matrix(cmat, 1, p=np.inf, output_type="coo_matrix") 1175 cmat.setdiag(1) 1176 cmat = sparse.triu(cmat) 1177 1178 # Pairwise minimums 1179 I, J = cmat.nonzero() 1180 d = np.maximum(V[I], V[J]) 1181 1182 # Delete connectiity matrix 1183 cmat_shape = cmat.shape 1184 del cmat 1185 1186 # Sparse distance matrix 1187 sdm = sparse.coo_matrix((d, (I, J)), shape=cmat_shape) 1188 1189 # Delete pairwise mins 1190 del d, I, J 1191 1192 # Persistence homology 1193 ph = ripser(sdm, distance_matrix=True, maxdim=0)["dgms"][0] 1194 1195 # Bound death values 1196 ph[ph[:, 1] == np.inf, 1] = np.max(V) 1197 1198 # Construct tree to query against 1199 tree = KDTree(V.reshape((-1, 1))) 1200 1201 # Get the indexes of the first nearest neighbor by birth 1202 _, nn = tree.query(ph[:, 0].reshape((-1, 1)), k=1, workers=-1) 1203 1204 return nn, -(ph[:, 0] // 1 - ph[:, 1] // 1) 1205 1206 def check_if_grid(self, data): 1207 """Check if the data are gridded in mz space. 1208 1209 Parameters 1210 ---------- 1211 data : DataFrame 1212 DataFrame containing the mass spectrometry data. Needs to have mz and scan columns. 1213 1214 Returns 1215 ------- 1216 bool 1217 True if the data is gridded in the mz direction, False otherwise. 1218 1219 Notes 1220 ----- 1221 This function is used within the grid_data function and the find_mass_features function and is not intended to be called directly. 1222 """ 1223 # Calculate the difference between consecutive mz values in a single scan 1224 dat_check = data.copy().reset_index(drop=True) 1225 dat_check["mz_diff"] = np.abs(dat_check["mz"].diff()) 1226 mz_diff_min = ( 1227 dat_check.groupby("scan")["mz_diff"].min().min() 1228 ) # within each scan, what is the smallest mz difference between consecutive mz values 1229 1230 # Find the mininum mz difference between mz values in the data; regardless of scan 1231 dat_check_mz = dat_check[["mz"]].drop_duplicates().copy() 1232 dat_check_mz = dat_check_mz.sort_values(by=["mz"]).reset_index(drop=True) 1233 dat_check_mz["mz_diff"] = np.abs(dat_check_mz["mz"].diff()) 1234 1235 # Get minimum mz_diff between mz values in the data 1236 mz_diff_min_raw = dat_check_mz["mz_diff"].min() 1237 1238 # If the minimum mz difference between mz values in the data is less than the minimum mz difference between mz values within a single scan, then the data is not gridded 1239 if mz_diff_min_raw < mz_diff_min: 1240 return False 1241 else: 1242 return True 1243 1244 def grid_data(self, data, attempts=5): 1245 """Grid the data in the mz dimension. 1246 1247 Data must be gridded prior to persistent homology calculations and computing average mass spectrum 1248 1249 Parameters 1250 ---------- 1251 data : DataFrame 1252 The input data containing mz, scan, scan_time, and intensity columns. 
1253 attempts : int, optional 1254 The number of attempts to grid the data. Default is 5. 1255 1256 Returns 1257 ------- 1258 DataFrame 1259 The gridded data with mz, scan, scan_time, and intensity columns. 1260 1261 Raises 1262 ------ 1263 ValueError 1264 If gridding fails after the specified number of attempts. 1265 """ 1266 attempt_i = 0 1267 while attempt_i < attempts: 1268 attempt_i += 1 1269 data = self._grid_data(data) 1270 1271 if self.check_if_grid(data): 1272 return data 1273 1274 if not self.check_if_grid(data): 1275 raise ValueError( 1276 "Gridding failed after " 1277 + str(attempt_i) 1278 + " attempts. Please check the data." 1279 ) 1280 else: 1281 return data 1282 1283 def _grid_data(self, data): 1284 """Internal method to grid the data in the mz dimension. 1285 1286 Notes 1287 ----- 1288 This method is called by the grid_data method and should not be called directly. 1289 It will attempt to grid the data in the mz dimension by creating a grid of mz values based on the minimum mz difference within each scan, 1290 but it does not check if the data is already gridded or if the gridding is successful. 1291 1292 Parameters 1293 ---------- 1294 data : pd.DataFrame or pl.DataFrame 1295 The input data to grid. 1296 1297 Returns 1298 ------- 1299 pd.DataFrame or pl.DataFrame 1300 The data after attempting to grid it in the mz dimension. 1301 """ 1302 # Calculate the difference between consecutive mz values in a single scan for grid spacing 1303 data_w = data.copy().reset_index(drop=True) 1304 data_w["mz_diff"] = np.abs(data_w["mz"].diff()) 1305 mz_diff_min = data_w.groupby("scan")["mz_diff"].min().min() * 0.99999 1306 1307 # Need high intensity mz values first so they are parents in the output pairs stack 1308 dat_mz = data_w[["mz", "intensity"]].sort_values( 1309 by=["intensity"], ascending=False 1310 ) 1311 dat_mz = dat_mz[["mz"]].drop_duplicates().reset_index(drop=True).copy() 1312 1313 # Construct KD tree 1314 tree = KDTree(dat_mz.mz.values.reshape(-1, 1)) 1315 sdm = tree.sparse_distance_matrix(tree, mz_diff_min, output_type="coo_matrix") 1316 sdm = sparse.triu(sdm, k=1) 1317 sdm.data = np.ones_like(sdm.data) 1318 distances = sdm.tocoo() 1319 pairs = np.stack((distances.row, distances.col), axis=1) 1320 1321 # Cull pairs to just get root 1322 to_drop = [] 1323 while len(pairs) > 0: 1324 root_parents = np.setdiff1d(np.unique(pairs[:, 0]), np.unique(pairs[:, 1])) 1325 id_root_parents = np.isin(pairs[:, 0], root_parents) 1326 children_of_roots = np.unique(pairs[id_root_parents, 1]) 1327 to_drop = np.append(to_drop, children_of_roots) 1328 1329 # Set up pairs array for next iteration by removing pairs with children or parents already dropped 1330 pairs = pairs[~np.isin(pairs[:, 1], to_drop), :] 1331 pairs = pairs[~np.isin(pairs[:, 0], to_drop), :] 1332 dat_mz = dat_mz.reset_index(drop=True).drop(index=np.array(to_drop)) 1333 mz_dat_np = ( 1334 dat_mz[["mz"]] 1335 .sort_values(by=["mz"]) 1336 .reset_index(drop=True) 1337 .values.flatten() 1338 ) 1339 1340 # Sort data by mz and recast mz to nearest value in mz_dat_np 1341 data_w = data_w.sort_values(by=["mz"]).reset_index(drop=True).copy() 1342 data_w["mz_new"] = mz_dat_np[find_closest(mz_dat_np, data_w["mz"].values)] 1343 data_w["mz_diff"] = np.abs(data_w["mz"] - data_w["mz_new"]) 1344 1345 # Rename mz_new as mz; drop mz_diff; groupby scan and mz and sum intensity 1346 new_data_w = data_w.rename(columns={"mz": "mz_orig", "mz_new": "mz"}).copy() 1347 new_data_w = ( 1348 new_data_w.drop(columns=["mz_diff", "mz_orig"]) 1349 
.groupby(["scan", "mz"])["intensity"] 1350 .sum() 1351 .reset_index() 1352 ) 1353 new_data_w = ( 1354 new_data_w.sort_values(by=["scan", "mz"], ascending=[True, True]) 1355 .reset_index(drop=True) 1356 .copy() 1357 ) 1358 1359 return new_data_w 1360 1361 def find_mass_features_ph(self, ms_level=1, grid=True): 1362 """Find mass features within an LCMSBase object using persistent homology. 1363 1364 Assigns the mass_features attribute to the object (a dictionary of LCMSMassFeature objects, keyed by mass feature id) 1365 1366 Parameters 1367 ---------- 1368 ms_level : int, optional 1369 The MS level to use. Default is 1. 1370 grid : bool, optional 1371 If True, will regrid the data before running the persistent homology calculations (after checking if the data is gridded). Default is True. 1372 1373 Raises 1374 ------ 1375 ValueError 1376 If no MS level data is found on the object. 1377 If data is not gridded and grid is False. 1378 1379 Returns 1380 ------- 1381 None, but assigns the mass_features attribute to the object. 1382 1383 Notes 1384 ----- 1385 This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos 1386 """ 1387 # Check that ms_level is a key in self._ms_uprocessed 1388 if ms_level not in self._ms_unprocessed.keys(): 1389 raise ValueError( 1390 "No MS level " 1391 + str(ms_level) 1392 + " data found, did you instantiate with parser specific to MS level?" 1393 ) 1394 1395 # Get ms data 1396 data = self._ms_unprocessed[ms_level].copy() 1397 1398 # Drop rows with missing intensity values and reset index 1399 data = data.dropna(subset=["intensity"]).reset_index(drop=True) 1400 1401 # Threshold data 1402 dims = ["mz", "scan_time"] 1403 threshold = self.parameters.lc_ms.ph_inten_min_rel * data.intensity.max() 1404 data_thres = data[data["intensity"] > threshold].reset_index(drop=True).copy() 1405 1406 # Check if gridded, if not, grid 1407 gridded_mz = self.check_if_grid(data_thres) 1408 if gridded_mz is False: 1409 if grid is False: 1410 raise ValueError( 1411 "Data are not gridded in mz dimension, try reprocessing with a different params or grid data before running this function" 1412 ) 1413 else: 1414 data_thres = self.grid_data(data_thres) 1415 1416 # Add build factors and add scan_time 1417 data_thres = data_thres.merge(self.scan_df[["scan", "scan_time"]], on="scan") 1418 factors = { 1419 dim: pd.factorize(data_thres[dim], sort=True)[1].astype(np.float32) 1420 for dim in dims 1421 } # this is return a float64 index 1422 1423 # Build indexes 1424 index = { 1425 dim: np.searchsorted(factors[dim], data_thres[dim]).astype(np.float32) 1426 for dim in factors 1427 } 1428 1429 # Smooth data 1430 iterations = self.parameters.lc_ms.ph_smooth_it 1431 smooth_radius = [ 1432 self.parameters.lc_ms.ph_smooth_radius_mz, 1433 self.parameters.lc_ms.ph_smooth_radius_scan, 1434 ] # mz, scan_time smoothing radius (in steps) 1435 1436 index = np.vstack([index[dim] for dim in dims]).T 1437 V = data_thres["intensity"].values 1438 resid = np.inf 1439 for i in range(iterations): 1440 # Previous iteration 1441 V_prev = V.copy() 1442 resid_prev = resid 1443 V = self.sparse_mean_filter(index, V, radius=smooth_radius) 1444 1445 # Calculate residual with previous iteration 1446 resid = np.sqrt(np.mean(np.square(V - V_prev))) 1447 1448 # Evaluate convergence 1449 if i > 0: 1450 # Percent change in residual 1451 test = np.abs(resid - resid_prev) / resid_prev 1452 1453 # Exit criteria 1454 if test <= 0: 1455 break 1456 1457 # Overwrite values 1458 
data_thres["intensity"] = V 1459 1460 # Use persistent homology to find regions of interest 1461 pidx, pers = self.sparse_upper_star(index, V) 1462 pidx = pidx[pers > 1] 1463 pers = pers[pers > 1] 1464 1465 # Get peaks 1466 peaks = data_thres.iloc[pidx, :].reset_index(drop=True) 1467 1468 # Add persistence column 1469 peaks["persistence"] = pers 1470 mass_features = peaks.sort_values( 1471 by="persistence", ascending=False 1472 ).reset_index(drop=True) 1473 1474 # Filter by persistence threshold 1475 persistence_threshold = ( 1476 self.parameters.lc_ms.ph_persis_min_rel * data.intensity.max() 1477 ) 1478 mass_features = mass_features.loc[ 1479 mass_features["persistence"] > persistence_threshold, : 1480 ].reset_index(drop=True) 1481 1482 # Rename scan column to apex_scan 1483 mass_features = mass_features.rename( 1484 columns={"scan": "apex_scan", "scan_time": "retention_time"} 1485 ) 1486 1487 # Populate mass_features attribute 1488 self.mass_features = {} 1489 for row in mass_features.itertuples(): 1490 row_dict = mass_features.iloc[row.Index].to_dict() 1491 lcms_feature = LCMSMassFeature(self, **row_dict) 1492 self.mass_features[lcms_feature.id] = lcms_feature 1493 1494 if self.parameters.lc_ms.verbose_processing: 1495 print("Found " + str(len(mass_features)) + " initial mass features") 1496 1497 def find_mass_features_ph_centroid(self, ms_level=1): 1498 """Find mass features within an LCMSBase object using persistent homology-type approach but with centroided data. 1499 1500 Parameters 1501 ---------- 1502 ms_level : int, optional 1503 The MS level to use. Default is 1. 1504 1505 Raises 1506 ------ 1507 ValueError 1508 If no MS level data is found on the object. 1509 1510 Returns 1511 ------- 1512 None, but assigns the mass_features attribute to the object. 1513 """ 1514 # Check that ms_level is a key in self._ms_uprocessed 1515 if ms_level not in self._ms_unprocessed.keys(): 1516 raise ValueError( 1517 "No MS level " 1518 + str(ms_level) 1519 + " data found, did you instantiate with parser specific to MS level?" 
1520 ) 1521 1522 # Get ms data 1523 data = self._ms_unprocessed[ms_level].copy() 1524 1525 # Drop rows with missing intensity values and reset index 1526 data = data.dropna(subset=["intensity"]).reset_index(drop=True) 1527 1528 # Threshold data 1529 threshold = self.parameters.lc_ms.ph_inten_min_rel * data.intensity.max() 1530 data_thres = data[data["intensity"] > threshold].reset_index(drop=True).copy() 1531 data_thres['persistence'] = data_thres['intensity'] 1532 1533 # Use this as the starting point for the mass features, adding scan_time 1534 mf_df = data_thres 1535 mf_df = mf_df.merge(self.scan_df[["scan", "scan_time"]], on="scan") 1536 1537 # Define tolerances and dimensions for rolling up 1538 tol = [ 1539 self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel, 1540 self.parameters.lc_ms.mass_feature_cluster_rt_tolerance, 1541 ] 1542 relative = [True, False] 1543 dims = ["mz", "scan_time"] 1544 1545 mf_df = self.roll_up_dataframe( 1546 df=mf_df, 1547 sort_by="persistence", 1548 dims=dims, 1549 tol=tol, 1550 relative=relative 1551 ) 1552 1553 # Rename scan column to apex_scan 1554 mass_features = mf_df.rename( 1555 columns={"scan": "apex_scan", "scan_time": "retention_time"} 1556 ) 1557 # Sort by persistence and reset index 1558 mass_features = mass_features.sort_values( 1559 by="persistence", ascending=False 1560 ).reset_index(drop=True) 1561 1562 # Populate mass_features attribute 1563 self.mass_features = {} 1564 for row in mass_features.itertuples(): 1565 row_dict = mass_features.iloc[row.Index].to_dict() 1566 lcms_feature = LCMSMassFeature(self, **row_dict) 1567 self.mass_features[lcms_feature.id] = lcms_feature 1568 1569 if self.parameters.lc_ms.verbose_processing: 1570 print("Found " + str(len(mass_features)) + " initial mass features") 1571 1572 def cluster_mass_features(self, drop_children=True, sort_by="persistence"): 1573 """Cluster mass features 1574 1575 Based on their proximity in the mz and scan_time dimensions, prioritizes the mass features with the highest persistence. 1576 1577 Parameters 1578 ---------- 1579 drop_children : bool, optional 1580 Whether to drop the mass features that are not cluster parents. Default is True. 1581 sort_by : str, optional 1582 The column to sort the mass features by, this will determine which mass features get rolled up into a parent mass feature. Default is "persistence". 1583 1584 Raises 1585 ------ 1586 ValueError 1587 If no mass features are found. 1588 If too many mass features are found. 1589 1590 Returns 1591 ------- 1592 None if drop_children is True, otherwise returns a list of mass feature ids that are not cluster parents.
1593 """ 1594 verbose = self.parameters.lc_ms.verbose_processing 1595 1596 if self.mass_features is None: 1597 raise ValueError("No mass features found, run find_mass_features() first") 1598 if len(self.mass_features) > 400000: 1599 raise ValueError( 1600 "Too many mass features of interest found, run find_mass_features() with a higher intensity threshold" 1601 ) 1602 dims = ["mz", "scan_time"] 1603 mf_df_og = self.mass_features_to_df() 1604 mf_df = mf_df_og.copy() 1605 1606 tol = [ 1607 self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel, 1608 self.parameters.lc_ms.mass_feature_cluster_rt_tolerance, 1609 ] # mz, in relative; scan_time in minutes 1610 relative = [True, False] 1611 1612 # Roll up mass features based on their proximity in the declared dimensions 1613 mf_df_new = self.roll_up_dataframe( 1614 df=mf_df, 1615 sort_by=sort_by, 1616 dims=dims, 1617 tol=tol, 1618 relative=relative 1619 ) 1620 1621 mf_df["cluster_parent"] = np.where( 1622 np.isin(mf_df.index, mf_df_new.index), True, False 1623 ) 1624 1625 # get mass feature ids of features that are not cluster parents 1626 cluster_daughters = mf_df[~mf_df["cluster_parent"]].index.values 1627 if drop_children is True: 1628 # Drop mass features that are not cluster parents from self 1629 self.mass_features = { 1630 k: v 1631 for k, v in self.mass_features.items() 1632 if k not in cluster_daughters 1633 } 1634 else: 1635 return cluster_daughters
class PHCalculations
Methods for performing calculations related to 2D peak picking via persistent homology on LCMS data.
Notes
This class is intended to be used as a mixin for the LCMSBase class.
Methods
- sparse_mean_filter(idx, V, radius=[0, 1, 1]). Sparse implementation of a mean filter.
- embed_unique_indices(a). Creates an array of indices, sorted by unique element.
- roll_up_dataframe(df, sort_by, tol, relative, dims). Subset data by rolling up into apexes in the given dimensions.
- sparse_upper_star(idx, V). Sparse implementation of an upper star filtration.
- check_if_grid(data). Check if the data is gridded in mz space.
- grid_data(data). Grid the data in the mz dimension.
- find_mass_features_ph(ms_level=1, grid=True). Find mass features within an LCMSBase object using persistent homology.
- find_mass_features_ph_centroid(ms_level=1). Find mass features using a persistent homology-type approach on centroided data.
- cluster_mass_features(drop_children=True). Cluster regions of interest.
sparse_mean_filter(idx, V, radius=[0, 1, 1])
Sparse implementation of a mean filter.
Parameters
- idx (numpy.array): Edge indices for each dimension (MxN).
- V (numpy.array): Array of intensity data (Mx1).
- radius (float or list): Radius of the sparse filter in each dimension. Values less than zero indicate no connectivity in that dimension.
Returns
- numpy.array: Filtered intensities (Mx1).
Notes
This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos. This is a static method.
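As a quick illustration of the filter on a toy grid (a sketch, not part of the library), the static method can be called directly on the class. The index array, intensities, and radius below are made up for demonstration.

import numpy as np

from corems.mass_spectra.calc.lc_calc import PHCalculations

# A 2 x 3 grid of (mz step, scan step) coordinates with one intensity per point;
# radius=[1, 1] averages each point with its immediate neighbors in both dimensions.
idx = np.array([[0, 0], [0, 1], [0, 2], [1, 0], [1, 1], [1, 2]], dtype=float)
V = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
smoothed = PHCalculations.sparse_mean_filter(idx, V, radius=[1, 1])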
embed_unique_indices(a)
Creates an array of indices, sorted by unique element.
Parameters
- a (numpy.array): Array of unique elements (Mx1).
Returns
- numpy.array: Array of indices (Mx1).
Notes
This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos. This is a static method.
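A small sketch of what the embedding does: repeated values in the input pick up distinct fractional offsets, so every element becomes unique. The input array below is illustrative only.

import numpy as np

from corems.mass_spectra.calc.lc_calc import PHCalculations

a = np.array([5, 5, 5, 7, 7, 9])
embedded = PHCalculations.embed_unique_indices(a)
# embedded -> array([5. , 5.1, 5.2, 7. , 7.1, 9. ])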
roll_up_dataframe(df, sort_by, tol, relative, dims)
Subset data by rolling up into apex in appropriate dimensions.
Parameters
- df (pd.DataFrame): The input data containing the "dims" columns and the "sort_by" column.
- sort_by (str): The column to sort the data by; this determines which mass features get rolled up into a parent mass feature (i.e., the mass feature with the highest value in the sort_by column).
- dims (list): A list of dimension names (column names in the data DataFrame) to roll up the mass features by.
- tol (list): A list of tolerances for each dimension. The length of the list must match the number of dimensions. The tolerances can be relative (as a fraction of the maximum value in that dimension) or absolute (in the units of that dimension). If relative is True, the tolerance is multiplied by the maximum value in that dimension; if relative is False, the tolerance is used as is.
- relative (list): A list of booleans indicating whether the tolerance for each dimension is relative (True) or absolute (False).
Returns
- pd.DataFrame: A DataFrame with only the rolled-up mass features, with the original index and columns.
Raises
- ValueError: If the input data is not a pandas DataFrame. If the input data does not have columns for each of the dimensions in "dims". If the length of "dims", "tol", and "relative" do not match.
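The following sketch shows the roll-up on a tiny, made-up feature table: two features that nearly coincide in mz and scan_time collapse into the more intense one, while a distant feature survives on its own. The column values and tolerances are illustrative.

import pandas as pd

from corems.mass_spectra.calc.lc_calc import PHCalculations

features = pd.DataFrame(
    {
        "mz": [200.0000, 200.0001, 350.0000],
        "scan_time": [5.00, 5.01, 5.00],
        "intensity": [1e6, 1e4, 5e5],
    }
)
parents = PHCalculations.roll_up_dataframe(
    df=features,
    sort_by="intensity",
    dims=["mz", "scan_time"],
    tol=[5e-6, 0.2],        # ~5 ppm (relative) in mz, 0.2 min (absolute) in scan_time
    relative=[True, False],
)
# parents keeps the rows at index 0 and 2; the low-intensity near-duplicate at index 1 is rolled up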
sparse_upper_star(idx, V)
Sparse implementation of an upper star filtration.
Parameters
- idx (numpy.array): Edge indices for each dimension (MxN).
- V (numpy.array): Array of intensity data (Mx1).
Returns
- idx (numpy.array): Index of filtered points (Mx1).
- persistence (numpy.array): Persistence of each filtered point (Mx1).
Notes
This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos.
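A sketch of a direct call (this relies on the ripser dependency). `lcms_obj` is an assumed name for an LCMSBase-derived object that mixes in PHCalculations, and the tiny 1 x 5 grid below is illustrative only.

import numpy as np

# One row of five grid points with a single intensity maximum at position 2.
idx = np.array([[0, 0], [0, 1], [0, 2], [0, 3], [0, 4]], dtype=float)
V = np.array([10.0, 50.0, 400.0, 60.0, 15.0])
pidx, pers = lcms_obj.sparse_upper_star(idx, V)
# pidx indexes the points where connected components are born (local maxima of V);
# pers holds the persistence of each component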
check_if_grid(data)
Check if the data are gridded in mz space.
Parameters
- data (DataFrame): DataFrame containing the mass spectrometry data. Needs to have mz and scan columns.
Returns
- bool: True if the data are gridded in the mz direction, False otherwise.
Notes
This function is used within the grid_data function and the find_mass_features function and is not intended to be called directly.
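A small illustrative check (assuming a hypothetical `lcms_obj` that mixes in PHCalculations): the frame below shares one mz axis across scans, so it is reported as gridded.

import pandas as pd

profile = pd.DataFrame(
    {
        "scan": [1, 1, 2, 2],
        "mz": [100.0, 100.1, 100.0, 100.1],
        "intensity": [5.0, 6.0, 7.0, 8.0],
    }
)
lcms_obj.check_if_grid(profile)  # True: both scans use the same mz values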
grid_data(data, attempts=5)
Grid the data in the mz dimension.
Data must be gridded prior to persistent homology calculations and to computing an average mass spectrum.
Parameters
- data (DataFrame): The input data containing mz, scan, scan_time, and intensity columns.
- attempts (int, optional): The number of attempts to grid the data. Default is 5.
Returns
- DataFrame: The gridded data with mz, scan, scan_time, and intensity columns.
Raises
- ValueError: If gridding fails after the specified number of attempts.
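A sketch with made-up numbers (again using a hypothetical `lcms_obj`): the two scans below report almost identical mz values that differ only by a tiny offset, so gridding snaps them onto a shared mz axis defined by the more intense readings.

import pandas as pd

raw = pd.DataFrame(
    {
        "scan": [1, 1, 2, 2],
        "mz": [100.00000, 200.00000, 100.00005, 200.00004],
        "scan_time": [0.10, 0.10, 0.20, 0.20],
        "intensity": [1000.0, 800.0, 900.0, 700.0],
    }
)
gridded = lcms_obj.grid_data(raw)
# gridded["mz"].nunique() -> 2; both scans now share the same two mz values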
find_mass_features_ph(ms_level=1, grid=True)
Find mass features within an LCMSBase object using persistent homology.
Assigns the mass_features attribute to the object (a dictionary of LCMSMassFeature objects, keyed by mass feature id).
Parameters
- ms_level (int, optional): The MS level to use. Default is 1.
- grid (bool, optional): If True, will regrid the data before running the persistent homology calculations (after checking if the data is gridded). Default is True.
Raises
- ValueError: If no MS level data is found on the object. If data is not gridded and grid is False.
Returns
- None, but assigns the mass_features attribute to the object.
Notes
This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos.
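Putting the pieces together, a hedged end-to-end sketch, assuming `lcms_obj` is an LCMSBase object whose parser populated _ms_unprocessed[1] with profile-mode MS1 data:

# Find candidate features via persistent homology, then collapse near-duplicates.
lcms_obj.find_mass_features_ph(ms_level=1, grid=True)
lcms_obj.cluster_mass_features(drop_children=True, sort_by="persistence")
print(len(lcms_obj.mass_features), "mass features after clustering")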
find_mass_features_ph_centroid(ms_level=1)
Find mass features within an LCMSBase object using a persistent homology-type approach on centroided data.
Parameters
- ms_level (int, optional): The MS level to use. Default is 1.
Raises
- ValueError: If no MS level data is found on the object.
Returns
- None, but assigns the mass_features attribute to the object.
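The centroid variant follows the same pattern but expects centroided rather than profile MS1 data in _ms_unprocessed[1] (again a sketch with an assumed `lcms_obj`):

lcms_obj.find_mass_features_ph_centroid(ms_level=1)
lcms_obj.cluster_mass_features(drop_children=True, sort_by="persistence")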
cluster_mass_features(drop_children=True, sort_by="persistence")
Cluster mass features based on their proximity in the mz and scan_time dimensions, prioritizing the mass features with the highest persistence.
Parameters
- drop_children (bool, optional): Whether to drop the mass features that are not cluster parents. Default is True.
- sort_by (str, optional): The column to sort the mass features by; this determines which mass features get rolled up into a parent mass feature. Default is "persistence".
Raises
- ValueError: If no mass features are found. If too many mass features are found.
Returns
- None if drop_children is True, otherwise returns a list of mass feature ids that are not cluster parents.
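If you want to inspect what would be removed before committing, drop_children=False returns the child ids instead of pruning them (a sketch with an assumed `lcms_obj`):

children = lcms_obj.cluster_mass_features(drop_children=False, sort_by="persistence")
print(len(children), "mass features would be rolled up into a nearby parent")

# Prune them in place once satisfied
lcms_obj.cluster_mass_features(drop_children=True, sort_by="persistence")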