corems.mass_spectra.calc.lc_calc
1import numpy as np 2import pandas as pd 3from ripser import ripser 4from scipy import sparse 5from scipy.spatial import KDTree 6 7from corems.chroma_peak.factory.chroma_peak_classes import LCMSMassFeature 8from corems.mass_spectra.calc import SignalProcessing as sp 9from corems.mass_spectra.factory.chromat_data import EIC_Data 10from corems.mass_spectrum.input.numpyArray import ms_from_array_profile 11 12 13def find_closest(A, target): 14 """Find the index of closest value in A to each value in target. 15 16 Parameters 17 ---------- 18 A : :obj:`~numpy.array` 19 The array to search (blueprint). A must be sorted. 20 target : :obj:`~numpy.array` 21 The array of values to search for. target must be sorted. 22 23 Returns 24 ------- 25 :obj:`~numpy.array` 26 The indices of the closest values in A to each value in target. 27 """ 28 idx = A.searchsorted(target) 29 idx = np.clip(idx, 1, len(A) - 1) 30 left = A[idx - 1] 31 right = A[idx] 32 idx -= target - left < right - target 33 return idx 34 35 36class LCCalculations: 37 """Methods for performing LC calculations on mass spectra data. 38 39 Notes 40 ----- 41 This class is intended to be used as a mixin for the LCMSBase class. 42 43 Methods 44 ------- 45 * get_max_eic(eic_data). 46 Returns the maximum EIC value from the given EIC data. A static method. 47 * smooth_tic(tic). 48 Smooths the TIC data using the specified smoothing method and settings. 49 * eic_centroid_detector(rt, eic, max_eic). 50 Performs EIC centroid detection on the given EIC data. 51 * find_nearest_scan(rt). 52 Finds the nearest scan to the given retention time. 53 * get_average_mass_spectrum(scan_list, apex_scan, spectrum_mode="profile", ms_level=1, auto_process=True, use_parser=False, perform_checks=True, polarity=None). 54 Returns an averaged mass spectrum object. 55 * find_mass_features(ms_level=1). 56 Find regions of interest for a given MS level (default is MS1). 57 * integrate_mass_features(drop_if_fail=False, ms_level=1). 58 Integrate mass features of interest and extracts EICs. 59 * find_c13_mass_features(). 60 Evaluate mass features and mark likely C13 isotopes. 61 * deconvolute_ms1_mass_features(). 62 Deconvolute mass features' ms1 mass spectra. 63 """ 64 65 @staticmethod 66 def get_max_eic(eic_data: dict): 67 """Returns the maximum EIC value from the given EIC data. 68 69 Notes 70 ----- 71 This is a static method. 72 73 Parameters 74 ---------- 75 eic_data : dict 76 A dictionary containing EIC data. 77 78 Returns 79 ------- 80 float 81 The maximum EIC value. 82 """ 83 max_eic = 0 84 for eic_data in eic_data.values(): 85 ind_max_eic = max(eic_data.get("EIC")) 86 max_eic = ind_max_eic if ind_max_eic > max_eic else max_eic 87 88 return max_eic 89 90 def smooth_tic(self, tic): 91 """Smooths the TIC or EIC data using the specified smoothing method and settings. 92 93 Parameters 94 ---------- 95 tic : numpy.ndarray 96 The TIC (or EIC) data to be smoothed. 97 98 Returns 99 ------- 100 numpy.ndarray 101 The smoothed TIC data. 102 """ 103 implemented_smooth_method = self.parameters.lc_ms.implemented_smooth_method 104 105 pol_order = self.parameters.lc_ms.savgol_pol_order 106 107 window_len = self.parameters.lc_ms.smooth_window 108 109 window = self.parameters.lc_ms.smooth_method 110 111 return sp.smooth_signal( 112 tic, window_len, window, pol_order, implemented_smooth_method 113 ) 114 115 def eic_centroid_detector(self, rt, eic, max_eic, apex_indexes=[]): 116 """Performs EIC centroid detection on the given EIC data. 
117 118 Parameters 119 ---------- 120 rt : numpy.ndarray 121 The retention time data. 122 eic : numpy.ndarray 123 The EIC data. 124 max_eic : float 125 The maximum EIC value. 126 apex_indexes : list, optional 127 The apexes of the EIC peaks. Defaults to [], which means that the apexes will be calculated by the function. 128 129 Returns 130 ------- 131 numpy.ndarray 132 The indexes of left, apex, and right limits as a generator. 133 """ 134 135 max_prominence = self.parameters.lc_ms.peak_max_prominence_percent 136 137 max_height = self.parameters.lc_ms.peak_height_max_percent 138 139 signal_threshold = self.parameters.lc_ms.eic_signal_threshold 140 141 min_peak_datapoints = self.parameters.lc_ms.min_peak_datapoints 142 143 peak_derivative_threshold = self.parameters.lc_ms.peak_derivative_threshold 144 145 include_indexes = sp.peak_picking_first_derivative( 146 domain=rt, 147 signal=eic, 148 max_height=max_height, 149 max_prominence=max_prominence, 150 max_signal=max_eic, 151 min_peak_datapoints=min_peak_datapoints, 152 peak_derivative_threshold=peak_derivative_threshold, 153 signal_threshold=signal_threshold, 154 correct_baseline=False, 155 plot_res=False, 156 apex_indexes=apex_indexes, 157 ) 158 159 return include_indexes 160 161 def find_nearest_scan(self, rt): 162 """Finds the nearest scan to the given retention time. 163 164 Parameters 165 ---------- 166 rt : float 167 The retention time (in minutes) to find the nearest scan for. 168 169 Returns 170 ------- 171 int 172 The scan number of the nearest scan. 173 """ 174 array_rt = np.array(self.retention_time) 175 176 scan_index = (np.abs(array_rt - rt)).argmin() 177 178 real_scan = self.scans_number[scan_index] 179 180 return real_scan 181 182 def add_peak_metrics(self): 183 """Add peak metrics to the mass features. 184 185 This function calculates the peak metrics for each mass feature and adds them to the mass feature objects. 186 """ 187 # Check that at least some mass features have eic data 188 if not any([mf._eic_data is not None for mf in self.mass_features.values()]): 189 raise ValueError( 190 "No mass features have EIC data. Run integrate_mass_features first." 191 ) 192 193 for mass_feature in self.mass_features.values(): 194 # Check if the mass feature has been integrated 195 if mass_feature._eic_data is not None: 196 # Calculate peak metrics 197 mass_feature.calc_half_height_width() 198 mass_feature.calc_tailing_factor() 199 mass_feature.calc_dispersity_index() 200 201 def get_average_mass_spectrum( 202 self, 203 scan_list, 204 apex_scan, 205 spectrum_mode="profile", 206 ms_level=1, 207 auto_process=True, 208 use_parser=False, 209 perform_checks=True, 210 polarity=None, 211 ms_params=None, 212 ): 213 """Returns an averaged mass spectrum object 214 215 Parameters 216 ---------- 217 scan_list : list 218 List of scan numbers to average. 219 apex_scan : int 220 Number of the apex scan 221 spectrum_mode : str, optional 222 The spectrum mode to use. Defaults to "profile". Not that only "profile" mode is supported for averaging. 223 ms_level : int, optional 224 The MS level to use. Defaults to 1. 225 auto_process : bool, optional 226 If True, the averaged mass spectrum will be auto-processed. Defaults to True. 227 use_parser : bool, optional 228 If True, the mass spectra will be obtained from the parser. Defaults to False. 229 perform_checks : bool, optional 230 If True, the function will check if the data are within the ms_unprocessed dictionary and are the correct mode. Defaults to True. 
Only set to False if you are sure the data are profile, and (if not using the parser) are in the ms_unprocessed dictionary! ms_unprocessed dictionary also must be indexed on scan 231 polarity : int, optional 232 The polarity of the mass spectra (1 or -1). If not set, the polarity will be determined from the dataset. Defaults to None. (fastest if set to -1 or 1) 233 ms_params : MSParameters, optional 234 The mass spectrum parameters to use. If not set (None), the globally set parameters will be used. Defaults to None. 235 236 Returns 237 ------- 238 MassSpectrumProfile 239 The averaged mass spectrum object. 240 241 Raises 242 ------ 243 ValueError 244 If the spectrum mode is not "profile". 245 If the MS level is not found in the unprocessed mass spectra dictionary. 246 If not all scan numbers are found in the unprocessed mass spectra dictionary. 247 """ 248 if perform_checks: 249 if spectrum_mode != "profile": 250 raise ValueError("Averaging only supported for profile mode") 251 252 if polarity is None: 253 # set polarity to -1 if negative mode, 1 if positive mode (for mass spectrum creation) 254 if self.polarity == "negative": 255 polarity = -1 256 elif self.polarity == "positive": 257 polarity = 1 258 else: 259 raise ValueError( 260 "Polarity not set for dataset, must be a set containing either 'positive' or 'negative'" 261 ) 262 263 # if not using_parser, check that scan numbers are in _ms_unprocessed 264 if not use_parser: 265 if perform_checks: 266 # Set index to scan for faster lookup 267 ms_df = ( 268 self._ms_unprocessed[ms_level] 269 .copy() 270 .set_index("scan", drop=False) 271 .sort_index() 272 ) 273 my_ms_df = ms_df.loc[scan_list] 274 # Check that all scan numbers are in the ms_df 275 if not all(np.isin(scan_list, ms_df.index)): 276 raise ValueError( 277 "Not all scan numbers found in the unprocessed mass spectra dictionary" 278 ) 279 else: 280 my_ms_df = ( 281 pd.DataFrame({"scan": scan_list}) 282 .set_index("scan") 283 .join(self._ms_unprocessed[ms_level], how="left") 284 ) 285 286 if use_parser: 287 ms_list = [ 288 self.spectra_parser.get_mass_spectrum_from_scan( 289 x, spectrum_mode=spectrum_mode, auto_process=False 290 ) 291 for x in scan_list 292 ] 293 ms_mz = [x._mz_exp for x in ms_list] 294 ms_int = [x._abundance for x in ms_list] 295 my_ms_df = [] 296 for i in np.arange(len(ms_mz)): 297 my_ms_df.append( 298 pd.DataFrame( 299 {"mz": ms_mz[i], "intensity": ms_int[i], "scan": scan_list[i]} 300 ) 301 ) 302 my_ms_df = pd.concat(my_ms_df) 303 304 if not self.check_if_grid(my_ms_df): 305 my_ms_df = self.grid_data(my_ms_df) 306 307 my_ms_ave = my_ms_df.groupby("mz")["intensity"].sum().reset_index() 308 309 ms = ms_from_array_profile( 310 my_ms_ave.mz, 311 my_ms_ave.intensity, 312 self.file_location, 313 polarity=polarity, 314 auto_process=False, 315 ) 316 317 # Set the mass spectrum parameters, auto-process if auto_process is True, and add to the dataset 318 if ms is not None: 319 if ms_params is not None: 320 ms.parameters = ms_params 321 ms.scan_number = apex_scan 322 if auto_process: 323 ms.process_mass_spec() 324 return ms 325 326 def find_mass_features(self, ms_level=1, grid=True): 327 """Find mass features within an LCMSBase object 328 329 Note that this is a wrapper function that calls the find_mass_features_ph function, but can be extended to support other peak picking methods in the future. 330 331 Parameters 332 ---------- 333 ms_level : int, optional 334 The MS level to use for peak picking Default is 1. 
335 grid : bool, optional 336 If True, will regrid the data before running the persistent homology calculations (after checking if the data is gridded), 337 used for persistent homology peak picking for profile data only. Default is True. 338 339 Raises 340 ------ 341 ValueError 342 If no MS level data is found on the object. 343 If persistent homology peak picking is attempted on non-profile mode data. 344 If data is not gridded and grid is False. 345 If peak picking method is not implemented. 346 347 Returns 348 ------- 349 None, but assigns the mass_features and eics attributes to the object. 350 351 """ 352 pp_method = self.parameters.lc_ms.peak_picking_method 353 354 if pp_method == "persistent homology": 355 msx_scan_df = self.scan_df[self.scan_df["ms_level"] == ms_level] 356 if all(msx_scan_df["ms_format"] == "profile"): 357 self.find_mass_features_ph(ms_level=ms_level, grid=grid) 358 self.cluster_mass_features(drop_children=True, sort_by="persistence") 359 else: 360 raise ValueError( 361 "MS{} scans are not profile mode, which is required for persistent homology peak picking.".format( 362 ms_level 363 ) 364 ) 365 elif pp_method == "centroided_persistent_homology": 366 msx_scan_df = self.scan_df[self.scan_df["ms_level"] == ms_level] 367 if all(msx_scan_df["ms_format"] == "centroid"): 368 self.find_mass_features_ph_centroid(ms_level=ms_level) 369 else: 370 raise ValueError( 371 "MS{} scans are not centroid mode, which is required for persistent homology centroided peak picking.".format( 372 ms_level 373 ) 374 ) 375 else: 376 raise ValueError("Peak picking method not implemented") 377 378 def integrate_mass_features( 379 self, drop_if_fail=True, drop_duplicates=True, ms_level=1 380 ): 381 """Integrate mass features and extract EICs. 382 383 Populates the _eics attribute on the LCMSBase object for each unique mz in the mass_features dataframe and adds data (start_scan, final_scan, area) to the mass_features attribute. 384 385 Parameters 386 ---------- 387 drop_if_fail : bool, optional 388 Whether to drop mass features if the EIC limit calculations fail. 389 Default is True. 390 drop_duplicates : bool, optional 391 Whether to mass features that appear to be duplicates 392 (i.e., mz is similar to another mass feature and limits of the EIC are similar or encapsulating). 393 Default is True. 394 ms_level : int, optional 395 The MS level to use. Default is 1. 396 397 Raises 398 ------ 399 ValueError 400 If no mass features are found. 401 If no MS level data is found for the given MS level (either in data or in the scan data) 402 403 Returns 404 ------- 405 None, but populates the eics attribute on the LCMSBase object and adds data (start_scan, final_scan, area) to the mass_features attribute. 406 407 Notes 408 ----- 409 drop_if_fail is useful for discarding mass features that do not have good shapes, usually due to a detection on a shoulder of a peak or a noisy region (especially if minimal smoothing is used during mass feature detection). 410 """ 411 # Check if there is data 412 if ms_level in self._ms_unprocessed.keys(): 413 raw_data = self._ms_unprocessed[ms_level].copy() 414 else: 415 raise ValueError("No MS level " + str(ms_level) + " data found") 416 if self.mass_features is not None: 417 mf_df = self.mass_features_to_df().copy() 418 else: 419 raise ValueError( 420 "No mass features found, did you run find_mass_features() first?" 
421 ) 422 # Check if mass_spectrum exists on each mass feature 423 if not all( 424 [mf.mass_spectrum is not None for mf in self.mass_features.values()] 425 ): 426 raise ValueError( 427 "Mass spectrum must be associated with each mass feature, did you run add_associated_ms1() first?" 428 ) 429 430 # Subset scan data to only include correct ms_level 431 scan_df_sub = self.scan_df[ 432 self.scan_df["ms_level"] == int(ms_level) 433 ].reset_index(drop=True) 434 if scan_df_sub.empty: 435 raise ValueError("No MS level " + ms_level + " data found in scan data") 436 scan_df_sub = scan_df_sub[["scan", "scan_time"]].copy() 437 438 mzs_to_extract = np.unique(mf_df["mz"].values) 439 mzs_to_extract.sort() 440 441 # Get EICs for each unique mz in mass features list 442 for mz in mzs_to_extract: 443 mz_max = mz + self.parameters.lc_ms.eic_tolerance_ppm * mz / 1e6 444 mz_min = mz - self.parameters.lc_ms.eic_tolerance_ppm * mz / 1e6 445 raw_data_sub = raw_data[ 446 (raw_data["mz"] >= mz_min) & (raw_data["mz"] <= mz_max) 447 ].reset_index(drop=True) 448 raw_data_sub = ( 449 raw_data_sub.groupby(["scan"])["intensity"].sum().reset_index() 450 ) 451 raw_data_sub = scan_df_sub.merge(raw_data_sub, on="scan", how="left") 452 raw_data_sub["intensity"] = raw_data_sub["intensity"].fillna(0) 453 myEIC = EIC_Data( 454 scans=raw_data_sub["scan"].values, 455 time=raw_data_sub["scan_time"].values, 456 eic=raw_data_sub["intensity"].values, 457 ) 458 # Smooth EIC 459 smoothed_eic = self.smooth_tic(myEIC.eic) 460 smoothed_eic[smoothed_eic < 0] = 0 461 myEIC.eic_smoothed = smoothed_eic 462 self.eics[mz] = myEIC 463 464 # Get limits of mass features using EIC centroid detector and integrate 465 mf_df["area"] = np.nan 466 for idx, mass_feature in mf_df.iterrows(): 467 mz = mass_feature.mz 468 apex_scan = mass_feature.apex_scan 469 470 # Pull EIC data and find apex scan index 471 myEIC = self.eics[mz] 472 self.mass_features[idx]._eic_data = myEIC 473 apex_index = np.where(myEIC.scans == apex_scan)[0][0] 474 475 # Find left and right limits of peak using EIC centroid detector, add to EICData 476 centroid_eics = self.eic_centroid_detector( 477 myEIC.time, 478 myEIC.eic_smoothed, 479 mass_feature.intensity * 1.1, 480 apex_indexes=[int(apex_index)], 481 ) 482 l_a_r_scan_idx = [i for i in centroid_eics] 483 if len(l_a_r_scan_idx) > 0: 484 # Add start and final scan to mass_features and EICData 485 left_scan, right_scan = ( 486 myEIC.scans[l_a_r_scan_idx[0][0]], 487 myEIC.scans[l_a_r_scan_idx[0][2]], 488 ) 489 mf_scan_apex = [(left_scan, int(apex_scan), right_scan)] 490 myEIC.apexes = myEIC.apexes + mf_scan_apex 491 self.mass_features[idx].start_scan = left_scan 492 self.mass_features[idx].final_scan = right_scan 493 494 # Find area under peak using limits from EIC centroid detector, add to mass_features and EICData 495 area = np.trapz( 496 myEIC.eic_smoothed[l_a_r_scan_idx[0][0] : l_a_r_scan_idx[0][2] + 1], 497 myEIC.time[l_a_r_scan_idx[0][0] : l_a_r_scan_idx[0][2] + 1], 498 ) 499 mf_df.at[idx, "area"] = area 500 myEIC.areas = myEIC.areas + [area] 501 self.eics[mz] = myEIC 502 self.mass_features[idx]._area = area 503 else: 504 if drop_if_fail is True: 505 self.mass_features.pop(idx) 506 507 if drop_duplicates: 508 # Prepare mass feature dataframe 509 mf_df = self.mass_features_to_df().copy() 510 511 # For each mass feature, find all mass features within the clustering tolerance ppm and drop if their start and end times are within another mass feature 512 # Kepp the first mass fea 513 for idx, mass_feature in mf_df.iterrows(): 514 mz 
= mass_feature.mz 515 apex_scan = mass_feature.apex_scan 516 517 mf_df["mz_diff_ppm"] = np.abs(mf_df["mz"] - mz) / mz * 10**6 518 mf_df_sub = mf_df[ 519 mf_df["mz_diff_ppm"] 520 < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel 521 * 10**6 522 ].copy() 523 524 # For all mass features within the clustering tolerance, check if the start and end times are within the start and end times of the mass feature 525 for idx2, mass_feature2 in mf_df_sub.iterrows(): 526 if idx2 != idx: 527 if ( 528 mass_feature2.start_scan >= mass_feature.start_scan 529 and mass_feature2.final_scan <= mass_feature.final_scan 530 ): 531 if idx2 in self.mass_features.keys(): 532 self.mass_features.pop(idx2) 533 534 def find_c13_mass_features(self): 535 """Mark likely C13 isotopes and connect to monoisoitopic mass features. 536 537 Returns 538 ------- 539 None, but populates the monoisotopic_mf_id and isotopologue_type attributes to the indivual LCMSMassFeatures within the mass_features attribute of the LCMSBase object. 540 541 Raises 542 ------ 543 ValueError 544 If no mass features are found. 545 """ 546 verbose = self.parameters.lc_ms.verbose_processing 547 if verbose: 548 print("evaluating mass features for C13 isotopes") 549 if self.mass_features is None: 550 raise ValueError("No mass features found, run find_mass_features() first") 551 552 # Data prep fo sparse distance matrix 553 dims = ["mz", "scan_time"] 554 mf_df = self.mass_features_to_df().copy() 555 # Drop mass features that have no area (these are likely to be noise) 556 mf_df = mf_df[mf_df["area"].notnull()] 557 mf_df["mf_id"] = mf_df.index.values 558 dims = ["mz", "scan_time"] 559 560 # Sort my ascending mz so we always get the monoisotopic mass first, regardless of the order/intensity of the mass features 561 mf_df = mf_df.sort_values(by=["mz"]).reset_index(drop=True).copy() 562 563 mz_diff = 1.003355 # C13-C12 mass difference 564 tol = [ 565 mf_df["mz"].median() 566 * self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel, 567 self.parameters.lc_ms.mass_feature_cluster_rt_tolerance * 0.5, 568 ] # mz, in relative; scan_time in minutes 569 570 # Compute inter-feature distances 571 distances = None 572 for i in range(len(dims)): 573 # Construct k-d tree 574 values = mf_df[dims[i]].values 575 tree = KDTree(values.reshape(-1, 1)) 576 577 max_tol = tol[i] 578 if dims[i] == "mz": 579 # Maximum absolute tolerance 580 max_tol = mz_diff + tol[i] 581 582 # Compute sparse distance matrix 583 # the larger the max_tol, the slower this operation is 584 sdm = tree.sparse_distance_matrix(tree, max_tol, output_type="coo_matrix") 585 586 # Only consider forward case, exclude diagonal 587 sdm = sparse.triu(sdm, k=1) 588 589 if dims[i] == "mz": 590 min_tol = mz_diff - tol[i] 591 # Get only the ones that are above the min tol 592 idx = sdm.data > min_tol 593 594 # Reconstruct sparse distance matrix 595 sdm = sparse.coo_matrix( 596 (sdm.data[idx], (sdm.row[idx], sdm.col[idx])), 597 shape=(len(values), len(values)), 598 ) 599 600 # Cast as binary matrix 601 sdm.data = np.ones_like(sdm.data) 602 603 # Stack distances 604 if distances is None: 605 distances = sdm 606 else: 607 distances = distances.multiply(sdm) 608 609 # Extract indices of within-tolerance points 610 distances = distances.tocoo() 611 pairs = np.stack((distances.row, distances.col), axis=1) # C12 to C13 pairs 612 613 # Turn pairs (which are index of mf_df) into mf_id and then into two dataframes to join to mf_df 614 pairs_mf = pairs.copy() 615 pairs_mf[:, 0] = mf_df.iloc[pairs[:, 
0]].mf_id.values 616 pairs_mf[:, 1] = mf_df.iloc[pairs[:, 1]].mf_id.values 617 618 # Connect monoisotopic masses with isotopologes within mass_features 619 monos = np.setdiff1d(np.unique(pairs_mf[:, 0]), np.unique(pairs_mf[:, 1])) 620 for mono in monos: 621 self.mass_features[mono].monoisotopic_mf_id = mono 622 pairs_iso_df = pd.DataFrame(pairs_mf, columns=["parent", "child"]) 623 while not pairs_iso_df.empty: 624 pairs_iso_df = pairs_iso_df.set_index("parent", drop=False) 625 m1_isos = pairs_iso_df.loc[monos, "child"].unique() 626 for iso in m1_isos: 627 # Set monoisotopic_mf_id and isotopologue_type for isotopologues 628 parent = pairs_mf[pairs_mf[:, 1] == iso, 0] 629 if len(parent) > 1: 630 # Choose the parent that is closest in time to the isotopologue 631 parent_time = [self.mass_features[p].retention_time for p in parent] 632 time_diff = [ 633 np.abs(self.mass_features[iso].retention_time - x) 634 for x in parent_time 635 ] 636 parent = parent[np.argmin(time_diff)] 637 else: 638 parent = parent[0] 639 self.mass_features[iso].monoisotopic_mf_id = self.mass_features[ 640 parent 641 ].monoisotopic_mf_id 642 if self.mass_features[iso].monoisotopic_mf_id is not None: 643 mass_diff = ( 644 self.mass_features[iso].mz 645 - self.mass_features[ 646 self.mass_features[iso].monoisotopic_mf_id 647 ].mz 648 ) 649 self.mass_features[iso].isotopologue_type = "13C" + str( 650 int(round(mass_diff, 0)) 651 ) 652 653 # Drop the mono and iso from the pairs_iso_df 654 pairs_iso_df = pairs_iso_df.drop( 655 index=monos, errors="ignore" 656 ) # Drop pairs where the parent is a child that is a child of a root 657 pairs_iso_df = pairs_iso_df.set_index("child", drop=False) 658 pairs_iso_df = pairs_iso_df.drop(index=m1_isos, errors="ignore") 659 660 if not pairs_iso_df.empty: 661 # Get new monos, recognizing that these are just 13C isotopologues that are connected to other 13C isotopologues to repeat the process 662 monos = np.setdiff1d( 663 np.unique(pairs_iso_df.parent), np.unique(pairs_iso_df.child) 664 ) 665 if verbose: 666 # Report fraction of compounds annotated with isotopes 667 mf_df["c13_flag"] = np.where( 668 np.logical_or( 669 np.isin(mf_df["mf_id"], pairs_mf[:, 0]), 670 np.isin(mf_df["mf_id"], pairs_mf[:, 1]), 671 ), 672 1, 673 0, 674 ) 675 print( 676 str(round(len(mf_df[mf_df["c13_flag"] == 1]) / len(mf_df), ndigits=3)) 677 + " of mass features have or are C13 isotopes" 678 ) 679 680 def deconvolute_ms1_mass_features(self): 681 """Deconvolute MS1 mass features 682 683 Deconvolute mass features ms1 spectrum based on the correlation of all masses within a spectrum over the EIC of the mass features 684 685 Parameters 686 ---------- 687 None 688 689 Returns 690 ------- 691 None, but assigns the _ms_deconvoluted_idx, mass_spectrum_deconvoluted_parent, 692 and associated_mass_features_deconvoluted attributes to the mass features in the 693 mass_features attribute of the LCMSBase object. 694 695 Raises 696 ------ 697 ValueError 698 If no mass features are found, must run find_mass_features() first. 699 If no EICs are found, did you run integrate_mass_features() first? 700 701 """ 702 # Checks for set mass_features and eics 703 if self.mass_features is None: 704 raise ValueError( 705 "No mass features found, did you run find_mass_features() first?" 706 ) 707 708 if self.eics == {}: 709 raise ValueError( 710 "No EICs found, did you run integrate_mass_features() first?" 
711 ) 712 713 if 1 not in self._ms_unprocessed.keys(): 714 raise ValueError("No unprocessed MS1 spectra found.") 715 716 # Prep ms1 data 717 ms1_data = self._ms_unprocessed[1].copy() 718 ms1_data = ms1_data.set_index("scan") 719 720 # Prep mass feature summary 721 mass_feature_df = self.mass_features_to_df() 722 723 # Loop through each mass feature 724 for mf_id, mass_feature in self.mass_features.items(): 725 # Check that the mass_feature.mz attribute == the mz of the mass feature in the mass_feature_df 726 if mass_feature.mz != mass_feature.ms1_peak.mz_exp: 727 continue 728 729 # Get the left and right limits of the EIC of the mass feature 730 l_scan, _, r_scan = mass_feature._eic_data.apexes[0] 731 732 # Pull from the _ms1_unprocessed data the scan range of interest and sort by mz 733 ms1_data_sub = ms1_data.loc[l_scan:r_scan].copy() 734 ms1_data_sub = ms1_data_sub.sort_values(by=["mz"]).reset_index(drop=False) 735 736 # Get the centroided masses of the mass feature 737 mf_mspeak_mzs = mass_feature.mass_spectrum.mz_exp 738 739 # Find the closest mz in the ms1 data to the centroided masses of the mass feature 740 ms1_data_sub["mass_feature_mz"] = mf_mspeak_mzs[ 741 find_closest(mf_mspeak_mzs, ms1_data_sub.mz.values) 742 ] 743 744 # Drop rows with mz_diff > 0.01 between the mass feature mz and the ms1 data mz 745 ms1_data_sub["mz_diff_rel"] = ( 746 np.abs(ms1_data_sub["mass_feature_mz"] - ms1_data_sub["mz"]) 747 / ms1_data_sub["mz"] 748 ) 749 ms1_data_sub = ms1_data_sub[ 750 ms1_data_sub["mz_diff_rel"] 751 < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel 752 ].reset_index(drop=True) 753 754 # Group by mass_feature_mz and scan and sum intensity 755 ms1_data_sub_group = ( 756 ms1_data_sub.groupby(["mass_feature_mz", "scan"])["intensity"] 757 .sum() 758 .reset_index() 759 ) 760 761 # Calculate the correlation of the intensities of the mass feature and the ms1 data (set to 0 if no intensity) 762 corr = ( 763 ms1_data_sub_group.pivot( 764 index="scan", columns="mass_feature_mz", values="intensity" 765 ) 766 .fillna(0) 767 .corr() 768 ) 769 770 # Subset the correlation matrix to only include the masses of the mass feature and those with a correlation > 0.8 771 decon_corr_min = self.parameters.lc_ms.ms1_deconvolution_corr_min 772 corr_subset = corr.loc[mass_feature.mz,] 773 corr_subset = corr_subset[corr_subset > decon_corr_min] 774 775 # Get the masses from the mass spectrum that are the result of the deconvolution 776 mzs_decon = corr_subset.index.values 777 778 # Get the indices of the mzs_decon in mass_feature.mass_spectrum.mz_exp and assign to the mass feature 779 mzs_decon_idx = [ 780 id 781 for id, mz in enumerate(mass_feature.mass_spectrum.mz_exp) 782 if mz in mzs_decon 783 ] 784 mass_feature._ms_deconvoluted_idx = mzs_decon_idx 785 786 # Check if the mass feature's ms1 peak is the largest in the deconvoluted mass spectrum 787 if ( 788 mass_feature.ms1_peak.abundance 789 == mass_feature.mass_spectrum.abundance[mzs_decon_idx].max() 790 ): 791 mass_feature.mass_spectrum_deconvoluted_parent = True 792 else: 793 mass_feature.mass_spectrum_deconvoluted_parent = False 794 795 # Check for other mass features that are in the deconvoluted mass spectrum and add the deconvoluted mass spectrum to the mass feature 796 # Subset mass_feature_df to only include mass features that are within the clustering tolerance 797 mass_feature_df_sub = mass_feature_df[ 798 abs(mass_feature.retention_time - mass_feature_df["scan_time"]) 799 < self.parameters.lc_ms.mass_feature_cluster_rt_tolerance 800 
].copy() 801 # Calculate the mz difference in ppm between the mass feature and the peaks in the deconvoluted mass spectrum 802 mass_feature_df_sub["mz_diff_ppm"] = [ 803 np.abs(mzs_decon - mz).min() / mz * 10**6 804 for mz in mass_feature_df_sub["mz"] 805 ] 806 # Subset mass_feature_df to only include mass features that are within 1 ppm of the deconvoluted masses 807 mfs_associated_decon = mass_feature_df_sub[ 808 mass_feature_df_sub["mz_diff_ppm"] 809 < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel * 10**6 810 ].index.values 811 812 mass_feature.associated_mass_features_deconvoluted = mfs_associated_decon 813 814 815class PHCalculations: 816 """Methods for performing calculations related to 2D peak picking via persistent homology on LCMS data. 817 818 Notes 819 ----- 820 This class is intended to be used as a mixin for the LCMSBase class. 821 822 Methods 823 ------- 824 * sparse_mean_filter(idx, V, radius=[0, 1, 1]). 825 Sparse implementation of a mean filter. 826 * embed_unique_indices(a). 827 Creates an array of indices, sorted by unique element. 828 * sparse_upper_star(idx, V). 829 Sparse implementation of an upper star filtration. 830 * check_if_grid(data). 831 Check if the data is gridded in mz space. 832 * grid_data(data). 833 Grid the data in the mz dimension. 834 * find_mass_features_ph(ms_level=1, grid=True). 835 Find mass features within an LCMSBase object using persistent homology. 836 * cluster_mass_features(drop_children=True). 837 Cluster regions of interest. 838 """ 839 840 @staticmethod 841 def sparse_mean_filter(idx, V, radius=[0, 1, 1]): 842 """Sparse implementation of a mean filter. 843 844 Parameters 845 ---------- 846 idx : :obj:`~numpy.array` 847 Edge indices for each dimension (MxN). 848 V : :obj:`~numpy.array` 849 Array of intensity data (Mx1). 850 radius : float or list 851 Radius of the sparse filter in each dimension. Values less than 852 zero indicate no connectivity in that dimension. 853 854 Returns 855 ------- 856 :obj:`~numpy.array` 857 Filtered intensities (Mx1). 858 859 Notes 860 ----- 861 This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos. 862 This is a static method. 863 """ 864 865 # Copy indices 866 idx = idx.copy().astype(V.dtype) 867 868 # Scale 869 for i, r in enumerate(radius): 870 # Increase inter-index distance 871 if r < 1: 872 idx[:, i] *= 2 873 874 # Do nothing 875 elif r == 1: 876 pass 877 878 # Decrease inter-index distance 879 else: 880 idx[:, i] /= r 881 882 # Connectivity matrix 883 cmat = KDTree(idx) 884 cmat = cmat.sparse_distance_matrix(cmat, 1, p=np.inf, output_type="coo_matrix") 885 cmat.setdiag(1) 886 887 # Pair indices 888 I, J = cmat.nonzero() 889 890 # Delete cmat 891 cmat_shape = cmat.shape 892 del cmat 893 894 # Sum over columns 895 V_sum = sparse.bsr_matrix( 896 (V[J], (I, I)), shape=cmat_shape, dtype=V.dtype 897 ).diagonal(0) 898 899 # Count over columns 900 V_count = sparse.bsr_matrix( 901 (np.ones_like(J), (I, I)), shape=cmat_shape, dtype=V.dtype 902 ).diagonal(0) 903 904 return V_sum / V_count 905 906 @staticmethod 907 def embed_unique_indices(a): 908 """Creates an array of indices, sorted by unique element. 909 910 Parameters 911 ---------- 912 a : :obj:`~numpy.array` 913 Array of unique elements (Mx1). 914 915 Returns 916 ------- 917 :obj:`~numpy.array` 918 Array of indices (Mx1). 
919 920 Notes 921 ----- 922 This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos 923 This is a static method. 924 """ 925 926 def count_tens(n): 927 # Count tens 928 ntens = (n - 1) // 10 929 930 while True: 931 ntens_test = (ntens + n - 1) // 10 932 933 if ntens_test == ntens: 934 return ntens 935 else: 936 ntens = ntens_test 937 938 def arange_exclude_10s(n): 939 # How many 10s will there be? 940 ntens = count_tens(n) 941 942 # Base array 943 arr = np.arange(0, n + ntens) 944 945 # Exclude 10s 946 arr = arr[(arr == 0) | (arr % 10 != 0)][:n] 947 948 return arr 949 950 # Creates an array of indices, sorted by unique element 951 idx_sort = np.argsort(a) 952 idx_unsort = np.argsort(idx_sort) 953 954 # Sorts records array so all unique elements are together 955 sorted_a = a[idx_sort] 956 957 # Returns the unique values, the index of the first occurrence, 958 # and the count for each element 959 vals, idx_start, count = np.unique( 960 sorted_a, return_index=True, return_counts=True 961 ) 962 963 # Splits the indices into separate arrays 964 splits = np.split(idx_sort, idx_start[1:]) 965 966 # Creates unique indices for each split 967 idx_unq = np.concatenate([arange_exclude_10s(len(x)) for x in splits]) 968 969 # Reorders according to input array 970 idx_unq = idx_unq[idx_unsort] 971 972 # Magnitude of each index 973 exp = np.log10( 974 idx_unq, where=idx_unq > 0, out=np.zeros_like(idx_unq, dtype=np.float64) 975 ) 976 idx_unq_mag = np.power(10, np.floor(exp) + 1) 977 978 # Result 979 return a + idx_unq / idx_unq_mag 980 981 @staticmethod 982 def roll_up_dataframe( 983 df : pd.DataFrame, 984 sort_by : str, 985 tol : list, 986 relative : list, 987 dims : list 988 ): 989 """Subset data by rolling up into apex in appropriate dimensions. 990 991 Parameters 992 ---------- 993 data : pd.DataFrame 994 The input data containing "dims" columns and the "sort_by" column. 995 sort_by : str 996 The column to sort the data by, this will determine which mass features get rolled up into a parent mass feature 997 (i.e., the mass feature with the highest value in the sort_by column). 998 dims : list 999 A list of dimension names (column names in the data DataFrame) to roll up the mass features by. 1000 tol : list 1001 A list of tolerances for each dimension. The length of the list must match the number of dimensions. 1002 The tolerances can be relative (as a fraction of the maximum value in that dimension) or absolute (in the units of that dimension). 1003 If relative is True, the tolerance will be multiplied by the maximum value in that dimension. 1004 If relative is False, the tolerance will be used as is. 1005 relative : list 1006 A list of booleans indicating whether the tolerance for each dimension is relative (True) or absolute (False). 1007 1008 Returns 1009 ------- 1010 pd.DataFrame 1011 A DataFrame with only the rolled up mass features, with the original index and columns. 1012 1013 1014 Raises 1015 ------ 1016 ValueError 1017 If the input data is not a pandas DataFrame. 1018 If the input data does not have columns for each of the dimensions in "dims". 1019 If the length of "dims", "tol", and "relative" do not match. 
1020 """ 1021 og_columns = df.columns.copy() 1022 1023 # Unindex the data, but keep the original index 1024 if df.index.name is not None: 1025 og_index = df.index.name 1026 else: 1027 og_index = "index" 1028 df = df.reset_index(drop=False) 1029 1030 # Sort data by sort_by column, and reindex 1031 df = df.sort_values(by=sort_by, ascending=False).reset_index(drop=True) 1032 1033 # Check that data is a DataFrame and has columns for each of the dims 1034 if not isinstance(df, pd.DataFrame): 1035 raise ValueError("Data must be a pandas DataFrame") 1036 for dim in dims: 1037 if dim not in df.columns: 1038 raise ValueError(f"Data must have a column for {dim}") 1039 if len(dims) != len(tol) or len(dims) != len(relative): 1040 raise ValueError( 1041 "Dimensions, tolerances, and relative flags must be the same length" 1042 ) 1043 1044 # Compute inter-feature distances 1045 distances = None 1046 for i in range(len(dims)): 1047 # Construct k-d tree 1048 values = df[dims[i]].values 1049 tree = KDTree(values.reshape(-1, 1)) 1050 1051 max_tol = tol[i] 1052 if relative[i] is True: 1053 # Maximum absolute tolerance 1054 max_tol = tol[i] * values.max() 1055 1056 # Compute sparse distance matrix 1057 # the larger the max_tol, the slower this operation is 1058 sdm = tree.sparse_distance_matrix(tree, max_tol, output_type="coo_matrix") 1059 1060 # Only consider forward case, exclude diagonal 1061 sdm = sparse.triu(sdm, k=1) 1062 1063 # Filter relative distances 1064 if relative[i] is True: 1065 # Compute relative distances 1066 rel_dists = sdm.data / values[sdm.row] # or col? 1067 1068 # Indices of relative distances less than tolerance 1069 idx = rel_dists <= tol[i] 1070 1071 # Reconstruct sparse distance matrix 1072 sdm = sparse.coo_matrix( 1073 (rel_dists[idx], (sdm.row[idx], sdm.col[idx])), 1074 shape=(len(values), len(values)), 1075 ) 1076 1077 # Cast as binary matrix 1078 sdm.data = np.ones_like(sdm.data) 1079 1080 # Stack distances 1081 if distances is None: 1082 distances = sdm 1083 else: 1084 distances = distances.multiply(sdm) 1085 1086 # Extract indices of within-tolerance points 1087 distances = distances.tocoo() 1088 pairs = np.stack((distances.row, distances.col), axis=1) 1089 pairs_df = pd.DataFrame(pairs, columns=["parent", "child"]) 1090 pairs_df = pairs_df.set_index("parent") 1091 1092 to_drop = [] 1093 while not pairs_df.empty: 1094 # Find root_parents and their children 1095 root_parents = np.setdiff1d( 1096 np.unique(pairs_df.index.values), np.unique(pairs_df.child.values) 1097 ) 1098 children_of_roots = pairs_df.loc[root_parents, "child"].unique() 1099 to_drop = np.append(to_drop, children_of_roots) 1100 1101 # Remove root_children as possible parents from pairs_df for next iteration 1102 pairs_df = pairs_df.drop(index=children_of_roots, errors="ignore") 1103 pairs_df = pairs_df.reset_index().set_index("child") 1104 # Remove root_children as possible children from pairs_df for next iteration 1105 pairs_df = pairs_df.drop(index=children_of_roots) 1106 1107 # Prepare for next iteration 1108 pairs_df = pairs_df.reset_index().set_index("parent") 1109 1110 # Drop mass features that are not cluster parents 1111 df_sub = df.drop(index=np.array(to_drop)) 1112 1113 # Set index back to og_index and only keep the columns that are in the original dataframe 1114 df_sub = df_sub.set_index(og_index) 1115 1116 # sort the dataframe by the original index 1117 df_sub = df_sub.sort_index() 1118 df_sub = df_sub[og_columns] 1119 1120 return df_sub 1121 1122 def sparse_upper_star(self, idx, V): 1123 """Sparse 
implementation of an upper star filtration. 1124 1125 Parameters 1126 ---------- 1127 idx : :obj:`~numpy.array` 1128 Edge indices for each dimension (MxN). 1129 V : :obj:`~numpy.array` 1130 Array of intensity data (Mx1). 1131 Returns 1132 ------- 1133 idx : :obj:`~numpy.array` 1134 Index of filtered points (Mx1). 1135 persistence : :obj:`~numpy.array` 1136 Persistence of each filtered point (Mx1). 1137 1138 Notes 1139 ----- 1140 This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos 1141 """ 1142 1143 # Invert 1144 V = -1 * V.copy().astype(int) 1145 1146 # Embed indices 1147 V = self.embed_unique_indices(V) 1148 1149 # Connectivity matrix 1150 cmat = KDTree(idx) 1151 cmat = cmat.sparse_distance_matrix(cmat, 1, p=np.inf, output_type="coo_matrix") 1152 cmat.setdiag(1) 1153 cmat = sparse.triu(cmat) 1154 1155 # Pairwise minimums 1156 I, J = cmat.nonzero() 1157 d = np.maximum(V[I], V[J]) 1158 1159 # Delete connectiity matrix 1160 cmat_shape = cmat.shape 1161 del cmat 1162 1163 # Sparse distance matrix 1164 sdm = sparse.coo_matrix((d, (I, J)), shape=cmat_shape) 1165 1166 # Delete pairwise mins 1167 del d, I, J 1168 1169 # Persistence homology 1170 ph = ripser(sdm, distance_matrix=True, maxdim=0)["dgms"][0] 1171 1172 # Bound death values 1173 ph[ph[:, 1] == np.inf, 1] = np.max(V) 1174 1175 # Construct tree to query against 1176 tree = KDTree(V.reshape((-1, 1))) 1177 1178 # Get the indexes of the first nearest neighbor by birth 1179 _, nn = tree.query(ph[:, 0].reshape((-1, 1)), k=1, workers=-1) 1180 1181 return nn, -(ph[:, 0] // 1 - ph[:, 1] // 1) 1182 1183 def check_if_grid(self, data): 1184 """Check if the data are gridded in mz space. 1185 1186 Parameters 1187 ---------- 1188 data : DataFrame 1189 DataFrame containing the mass spectrometry data. Needs to have mz and scan columns. 1190 1191 Returns 1192 ------- 1193 bool 1194 True if the data is gridded in the mz direction, False otherwise. 1195 1196 Notes 1197 ----- 1198 This function is used within the grid_data function and the find_mass_features function and is not intended to be called directly. 1199 """ 1200 # Calculate the difference between consecutive mz values in a single scan 1201 dat_check = data.copy().reset_index(drop=True) 1202 dat_check["mz_diff"] = np.abs(dat_check["mz"].diff()) 1203 mz_diff_min = ( 1204 dat_check.groupby("scan")["mz_diff"].min().min() 1205 ) # within each scan, what is the smallest mz difference between consecutive mz values 1206 1207 # Find the mininum mz difference between mz values in the data; regardless of scan 1208 dat_check_mz = dat_check[["mz"]].drop_duplicates().copy() 1209 dat_check_mz = dat_check_mz.sort_values(by=["mz"]).reset_index(drop=True) 1210 dat_check_mz["mz_diff"] = np.abs(dat_check_mz["mz"].diff()) 1211 1212 # Get minimum mz_diff between mz values in the data 1213 mz_diff_min_raw = dat_check_mz["mz_diff"].min() 1214 1215 # If the minimum mz difference between mz values in the data is less than the minimum mz difference between mz values within a single scan, then the data is not gridded 1216 if mz_diff_min_raw < mz_diff_min: 1217 return False 1218 else: 1219 return True 1220 1221 def grid_data(self, data): 1222 """Grid the data in the mz dimension. 1223 1224 Data must be gridded prior to persistent homology calculations. 1225 1226 Parameters 1227 ---------- 1228 data : DataFrame 1229 The input data containing mz, scan, scan_time, and intensity columns. 
1230 1231 Returns 1232 ------- 1233 DataFrame 1234 The gridded data with mz, scan, scan_time, and intensity columns. 1235 1236 Raises 1237 ------ 1238 ValueError 1239 If gridding fails. 1240 """ 1241 1242 # Calculate the difference between consecutive mz values in a single scan for grid spacing 1243 data_w = data.copy().reset_index(drop=True) 1244 data_w["mz_diff"] = np.abs(data_w["mz"].diff()) 1245 mz_diff_min = data_w.groupby("scan")["mz_diff"].min().min() * 0.99999 1246 1247 # Need high intensity mz values first so they are parents in the output pairs stack 1248 dat_mz = data_w[["mz", "intensity"]].sort_values( 1249 by=["intensity"], ascending=False 1250 ) 1251 dat_mz = dat_mz[["mz"]].drop_duplicates().reset_index(drop=True).copy() 1252 1253 # Construct KD tree 1254 tree = KDTree(dat_mz.mz.values.reshape(-1, 1)) 1255 sdm = tree.sparse_distance_matrix(tree, mz_diff_min, output_type="coo_matrix") 1256 sdm = sparse.triu(sdm, k=1) 1257 sdm.data = np.ones_like(sdm.data) 1258 distances = sdm.tocoo() 1259 pairs = np.stack((distances.row, distances.col), axis=1) 1260 1261 # Cull pairs to just get root 1262 to_drop = [] 1263 while len(pairs) > 0: 1264 root_parents = np.setdiff1d(np.unique(pairs[:, 0]), np.unique(pairs[:, 1])) 1265 id_root_parents = np.isin(pairs[:, 0], root_parents) 1266 children_of_roots = np.unique(pairs[id_root_parents, 1]) 1267 to_drop = np.append(to_drop, children_of_roots) 1268 1269 # Set up pairs array for next iteration by removing pairs with children or parents already dropped 1270 pairs = pairs[~np.isin(pairs[:, 1], to_drop), :] 1271 pairs = pairs[~np.isin(pairs[:, 0], to_drop), :] 1272 dat_mz = dat_mz.reset_index(drop=True).drop(index=np.array(to_drop)) 1273 mz_dat_np = ( 1274 dat_mz[["mz"]] 1275 .sort_values(by=["mz"]) 1276 .reset_index(drop=True) 1277 .values.flatten() 1278 ) 1279 1280 # Sort data by mz and recast mz to nearest value in mz_dat_np 1281 data_w = data_w.sort_values(by=["mz"]).reset_index(drop=True).copy() 1282 data_w["mz_new"] = mz_dat_np[find_closest(mz_dat_np, data_w["mz"].values)] 1283 data_w["mz_diff"] = np.abs(data_w["mz"] - data_w["mz_new"]) 1284 1285 # Rename mz_new as mz; drop mz_diff; groupby scan and mz and sum intensity 1286 new_data_w = data_w.rename(columns={"mz": "mz_orig", "mz_new": "mz"}).copy() 1287 new_data_w = ( 1288 new_data_w.drop(columns=["mz_diff", "mz_orig"]) 1289 .groupby(["scan", "mz"])["intensity"] 1290 .sum() 1291 .reset_index() 1292 ) 1293 new_data_w = ( 1294 new_data_w.sort_values(by=["scan", "mz"], ascending=[True, True]) 1295 .reset_index(drop=True) 1296 .copy() 1297 ) 1298 1299 # Check if grid worked and return 1300 if self.check_if_grid(new_data_w): 1301 return new_data_w 1302 else: 1303 raise ValueError("Gridding failed") 1304 1305 def find_mass_features_ph(self, ms_level=1, grid=True): 1306 """Find mass features within an LCMSBase object using persistent homology. 1307 1308 Assigns the mass_features attribute to the object (a dictionary of LCMSMassFeature objects, keyed by mass feature id) 1309 1310 Parameters 1311 ---------- 1312 ms_level : int, optional 1313 The MS level to use. Default is 1. 1314 grid : bool, optional 1315 If True, will regrid the data before running the persistent homology calculations (after checking if the data is gridded). Default is True. 1316 1317 Raises 1318 ------ 1319 ValueError 1320 If no MS level data is found on the object. 1321 If data is not gridded and grid is False. 1322 1323 Returns 1324 ------- 1325 None, but assigns the mass_features attribute to the object. 
1326 1327 Notes 1328 ----- 1329 This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos 1330 """ 1331 # Check that ms_level is a key in self._ms_uprocessed 1332 if ms_level not in self._ms_unprocessed.keys(): 1333 raise ValueError( 1334 "No MS level " 1335 + str(ms_level) 1336 + " data found, did you instantiate with parser specific to MS level?" 1337 ) 1338 1339 # Get ms data 1340 data = self._ms_unprocessed[ms_level].copy() 1341 1342 # Drop rows with missing intensity values and reset index 1343 data = data.dropna(subset=["intensity"]).reset_index(drop=True) 1344 1345 # Threshold data 1346 dims = ["mz", "scan_time"] 1347 threshold = self.parameters.lc_ms.ph_inten_min_rel * data.intensity.max() 1348 data_thres = data[data["intensity"] > threshold].reset_index(drop=True).copy() 1349 1350 # Check if gridded, if not, grid 1351 gridded_mz = self.check_if_grid(data_thres) 1352 if gridded_mz is False: 1353 if grid is False: 1354 raise ValueError( 1355 "Data are not gridded in mz dimension, try reprocessing with a different params or grid data before running this function" 1356 ) 1357 else: 1358 data_thres = self.grid_data(data_thres) 1359 1360 # Add build factors and add scan_time 1361 data_thres = data_thres.merge(self.scan_df[["scan", "scan_time"]], on="scan") 1362 factors = { 1363 dim: pd.factorize(data_thres[dim], sort=True)[1].astype(np.float32) 1364 for dim in dims 1365 } # this is return a float64 index 1366 1367 # Build indexes 1368 index = { 1369 dim: np.searchsorted(factors[dim], data_thres[dim]).astype(np.float32) 1370 for dim in factors 1371 } 1372 1373 # Smooth data 1374 iterations = self.parameters.lc_ms.ph_smooth_it 1375 smooth_radius = [ 1376 self.parameters.lc_ms.ph_smooth_radius_mz, 1377 self.parameters.lc_ms.ph_smooth_radius_scan, 1378 ] # mz, scan_time smoothing radius (in steps) 1379 1380 index = np.vstack([index[dim] for dim in dims]).T 1381 V = data_thres["intensity"].values 1382 resid = np.inf 1383 for i in range(iterations): 1384 # Previous iteration 1385 V_prev = V.copy() 1386 resid_prev = resid 1387 V = self.sparse_mean_filter(index, V, radius=smooth_radius) 1388 1389 # Calculate residual with previous iteration 1390 resid = np.sqrt(np.mean(np.square(V - V_prev))) 1391 1392 # Evaluate convergence 1393 if i > 0: 1394 # Percent change in residual 1395 test = np.abs(resid - resid_prev) / resid_prev 1396 1397 # Exit criteria 1398 if test <= 0: 1399 break 1400 1401 # Overwrite values 1402 data_thres["intensity"] = V 1403 1404 # Use persistent homology to find regions of interest 1405 pidx, pers = self.sparse_upper_star(index, V) 1406 pidx = pidx[pers > 1] 1407 pers = pers[pers > 1] 1408 1409 # Get peaks 1410 peaks = data_thres.iloc[pidx, :].reset_index(drop=True) 1411 1412 # Add persistence column 1413 peaks["persistence"] = pers 1414 mass_features = peaks.sort_values( 1415 by="persistence", ascending=False 1416 ).reset_index(drop=True) 1417 1418 # Filter by persistence threshold 1419 persistence_threshold = ( 1420 self.parameters.lc_ms.ph_persis_min_rel * data.intensity.max() 1421 ) 1422 mass_features = mass_features.loc[ 1423 mass_features["persistence"] > persistence_threshold, : 1424 ].reset_index(drop=True) 1425 1426 # Rename scan column to apex_scan 1427 mass_features = mass_features.rename( 1428 columns={"scan": "apex_scan", "scan_time": "retention_time"} 1429 ) 1430 1431 # Populate mass_features attribute 1432 self.mass_features = {} 1433 for row in mass_features.itertuples(): 1434 row_dict = 
mass_features.iloc[row.Index].to_dict() 1435 lcms_feature = LCMSMassFeature(self, **row_dict) 1436 self.mass_features[lcms_feature.id] = lcms_feature 1437 1438 if self.parameters.lc_ms.verbose_processing: 1439 print("Found " + str(len(mass_features)) + " initial mass features") 1440 1441 def find_mass_features_ph_centroid(self, ms_level=1): 1442 """Find mass features within an LCMSBase object using persistent homology-type approach but with centroided data. 1443 1444 Parameters 1445 ---------- 1446 ms_level : int, optional 1447 The MS level to use. Default is 1. 1448 1449 Raises 1450 ------ 1451 ValueError 1452 If no MS level data is found on the object. 1453 1454 Returns 1455 ------- 1456 None, but assigns the mass_features attribute to the object. 1457 """ 1458 # Check that ms_level is a key in self._ms_uprocessed 1459 if ms_level not in self._ms_unprocessed.keys(): 1460 raise ValueError( 1461 "No MS level " 1462 + str(ms_level) 1463 + " data found, did you instantiate with parser specific to MS level?" 1464 ) 1465 1466 # Get ms data 1467 data = self._ms_unprocessed[ms_level].copy() 1468 1469 # Drop rows with missing intensity values and reset index 1470 data = data.dropna(subset=["intensity"]).reset_index(drop=True) 1471 1472 # Threshold data 1473 threshold = self.parameters.lc_ms.ph_inten_min_rel * data.intensity.max() 1474 data_thres = data[data["intensity"] > threshold].reset_index(drop=True).copy() 1475 data_thres['persistence'] = data_thres['intensity'] 1476 1477 # Use this as the starting point for the mass features, adding scan_time 1478 mf_df = data_thres 1479 mf_df = mf_df.merge(self.scan_df[["scan", "scan_time"]], on="scan") 1480 1481 # Define tolerances and dimensions for rolling up 1482 tol = [ 1483 self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel, 1484 self.parameters.lc_ms.mass_feature_cluster_rt_tolerance, 1485 ] 1486 relative = [True, False] 1487 dims = ["mz", "scan_time"] 1488 print("here") 1489 mf_df = self.roll_up_dataframe( 1490 df=mf_df, 1491 sort_by="persistence", 1492 dims=dims, 1493 tol=tol, 1494 relative=relative 1495 ) 1496 1497 # Rename scan column to apex_scan 1498 mass_features = mf_df.rename( 1499 columns={"scan": "apex_scan", "scan_time": "retention_time"} 1500 ) 1501 # Sort my persistence and reset index 1502 mass_features = mass_features.sort_values( 1503 by="persistence", ascending=False 1504 ).reset_index(drop=True) 1505 1506 # Populate mass_features attribute 1507 self.mass_features = {} 1508 for row in mass_features.itertuples(): 1509 row_dict = mass_features.iloc[row.Index].to_dict() 1510 lcms_feature = LCMSMassFeature(self, **row_dict) 1511 self.mass_features[lcms_feature.id] = lcms_feature 1512 1513 if self.parameters.lc_ms.verbose_processing: 1514 print("Found " + str(len(mass_features)) + " initial mass features") 1515 1516 def cluster_mass_features(self, drop_children=True, sort_by="persistence"): 1517 """Cluster mass features 1518 1519 Based on their proximity in the mz and scan_time dimensions, priorizies the mass features with the highest persistence. 1520 1521 Parameters 1522 ---------- 1523 drop_children : bool, optional 1524 Whether to drop the mass features that are not cluster parents. Default is True. 1525 sort_by : str, optional 1526 The column to sort the mass features by, this will determine which mass features get rolled up into a parent mass feature. Default is "persistence". 1527 1528 Raises 1529 ------ 1530 ValueError 1531 If no mass features are found. 1532 If too many mass features are found. 
1533 1534 Returns 1535 ------- 1536 None if drop_children is True, otherwise returns a list of mass feature ids that are not cluster parents. 1537 """ 1538 verbose = self.parameters.lc_ms.verbose_processing 1539 1540 if self.mass_features is None: 1541 raise ValueError("No mass features found, run find_mass_features() first") 1542 if len(self.mass_features) > 400000: 1543 raise ValueError( 1544 "Too many mass featuers of interest found, run find_mass_features() with a higher intensity threshold" 1545 ) 1546 dims = ["mz", "scan_time"] 1547 mf_df_og = self.mass_features_to_df() 1548 mf_df = mf_df_og.copy() 1549 1550 tol = [ 1551 self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel, 1552 self.parameters.lc_ms.mass_feature_cluster_rt_tolerance, 1553 ] # mz, in relative; scan_time in minutes 1554 relative = [True, False] 1555 1556 # Roll up mass features based on their proximity in the declared dimensions 1557 mf_df_new = self.roll_up_dataframe( 1558 df=mf_df, 1559 sort_by=sort_by, 1560 dims=dims, 1561 tol=tol, 1562 relative=relative 1563 ) 1564 1565 mf_df["cluster_parent"] = np.where( 1566 np.isin(mf_df.index, mf_df_new.index), True, False 1567 ) 1568 1569 # get mass feature ids of features that are not cluster parents 1570 cluster_daughters = mf_df[~mf_df["cluster_parent"]].index.values 1571 if drop_children is True: 1572 # Drop mass features that are not cluster parents from self 1573 self.mass_features = { 1574 k: v 1575 for k, v in self.mass_features.items() 1576 if k not in cluster_daughters 1577 } 1578 else: 1579 return cluster_daughters
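The docstrings above describe a multi-step workflow in which LCCalculations and PHCalculations are mixed into LCMSBase. Below is a minimal usage sketch, not a definitive recipe: it assumes an LCMSBase object named lcms has already been created from a raw-data parser (that setup lives outside this module), that add_associated_ms1 is available on the object (it is only referenced here by the checks in integrate_mass_features), and that the parameter values are illustrative.

# Minimal sketch of the feature-finding workflow, assuming `lcms` is an
# LCMSBase object with MS1 data loaded (its construction is not part of this module).
lcms.parameters.lc_ms.peak_picking_method = "persistent homology"  # or "centroided_persistent_homology"

lcms.find_mass_features(ms_level=1, grid=True)   # detect mass features (assigns lcms.mass_features)
lcms.add_associated_ms1()                        # attach MS1 spectra to features; defined elsewhere on LCMSBase,
                                                 # referenced by the checks in integrate_mass_features
lcms.integrate_mass_features(drop_if_fail=True, drop_duplicates=True, ms_level=1)  # EICs, limits, areas
lcms.add_peak_metrics()                          # half-height width, tailing factor, dispersity index
lcms.find_c13_mass_features()                    # flag 13C isotopologues and link them to monoisotopic features
lcms.deconvolute_ms1_mass_features()             # correlate co-eluting masses over each feature's EIC window

mf_df = lcms.mass_features_to_df()               # summary table of the detected mass features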
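As a quick reference for the module-level helper, a self-contained example of find_closest with small, arbitrary arrays (both inputs must be sorted, as the docstring notes):

import numpy as np
from corems.mass_spectra.calc.lc_calc import find_closest

A = np.array([10.0, 20.0, 30.0, 40.0])   # sorted array to search (the "blueprint")
target = np.array([11.0, 29.0, 36.0])    # sorted query values

idx = find_closest(A, target)
# idx is array([0, 2, 3]), so A[idx] gives array([10., 30., 40.]),
# i.e. the closest value in A for each entry of target.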
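integrate_mass_features extracts one EIC per unique feature m/z using a symmetric part-per-million window taken from parameters.lc_ms.eic_tolerance_ppm. A short worked example of that window calculation, with illustrative numbers:

mz = 400.0                 # illustrative feature m/z
eic_tolerance_ppm = 5.0    # illustrative value for parameters.lc_ms.eic_tolerance_ppm

half_width = eic_tolerance_ppm * mz / 1e6            # 0.002 m/z
mz_min, mz_max = mz - half_width, mz + half_width    # (399.998, 400.002)
# All raw MS1 signal with mz_min <= mz <= mz_max is summed per scan to build the EIC.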
LCCalculations: Methods for performing LC calculations on mass spectra data.
Notes
This class is intended to be used as a mixin for the LCMSBase class.
Methods
- get_max_eic(eic_data). Returns the maximum EIC value from the given EIC data. A static method.
- smooth_tic(tic). Smooths the TIC data using the specified smoothing method and settings.
- eic_centroid_detector(rt, eic, max_eic). Performs EIC centroid detection on the given EIC data.
- find_nearest_scan(rt). Finds the nearest scan to the given retention time.
- get_average_mass_spectrum(scan_list, apex_scan, spectrum_mode="profile", ms_level=1, auto_process=True, use_parser=False, perform_checks=True, polarity=None). Returns an averaged mass spectrum object.
- find_mass_features(ms_level=1). Find regions of interest for a given MS level (default is MS1).
- integrate_mass_features(drop_if_fail=True, drop_duplicates=True, ms_level=1). Integrate mass features of interest and extract EICs.
- find_c13_mass_features(). Evaluate mass features and mark likely C13 isotopes.
- deconvolute_ms1_mass_features(). Deconvolute mass features' ms1 mass spectra.
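A hedged sketch of how these methods are typically chained. The `lcms` object is assumed: an LCMSBase instance (the class this mixin targets) with raw scan data loaded and lc_ms parameters configured; its construction is not shown here.

```python
# `lcms` is an assumed, pre-loaded LCMSBase instance that mixes in LCCalculations.
lcms.find_mass_features(ms_level=1)        # detect regions of interest (mass features)
# ...associate an MS1 spectrum with each mass feature before integrating
# (e.g. via add_associated_ms1(), defined elsewhere on the LCMS object)...
lcms.integrate_mass_features(ms_level=1)   # extract EICs, set start/final scans and areas
lcms.add_peak_metrics()                    # half-height width, tailing factor, dispersity index
lcms.find_c13_mass_features()              # flag likely 13C isotopologues
lcms.deconvolute_ms1_mass_features()       # deconvolute MS1 spectra over the feature EICs
```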
66 @staticmethod 67 def get_max_eic(eic_data: dict): 68 """Returns the maximum EIC value from the given EIC data. 69 70 Notes 71 ----- 72 This is a static method. 73 74 Parameters 75 ---------- 76 eic_data : dict 77 A dictionary containing EIC data. 78 79 Returns 80 ------- 81 float 82 The maximum EIC value. 83 """ 84 max_eic = 0 85 for eic_data in eic_data.values(): 86 ind_max_eic = max(eic_data.get("EIC")) 87 max_eic = ind_max_eic if ind_max_eic > max_eic else max_eic 88 89 return max_eic
Returns the maximum EIC value from the given EIC data.
Notes
This is a static method.
Parameters
- eic_data (dict): A dictionary of EIC data; each value must expose an "EIC" entry holding the intensity trace.
Returns
- float: The maximum EIC value.
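A minimal sketch with plain dictionaries standing in for real EIC data (in practice the values come from the EIC extraction step); only the "EIC" entry of each value is used:

```python
# Assumes corems is installed; get_max_eic is a static method, so no
# LCMSBase instance is needed for this sketch.
from corems.mass_spectra.calc.lc_calc import LCCalculations

eic_data = {
    271.18: {"EIC": [0.0, 1.2e5, 3.4e5, 2.1e5]},
    305.22: {"EIC": [0.0, 5.6e4, 9.8e4]},
}

print(LCCalculations.get_max_eic(eic_data))  # 340000.0
```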
91 def smooth_tic(self, tic): 92 """Smooths the TIC or EIC data using the specified smoothing method and settings. 93 94 Parameters 95 ---------- 96 tic : numpy.ndarray 97 The TIC (or EIC) data to be smoothed. 98 99 Returns 100 ------- 101 numpy.ndarray 102 The smoothed TIC data. 103 """ 104 implemented_smooth_method = self.parameters.lc_ms.implemented_smooth_method 105 106 pol_order = self.parameters.lc_ms.savgol_pol_order 107 108 window_len = self.parameters.lc_ms.smooth_window 109 110 window = self.parameters.lc_ms.smooth_method 111 112 return sp.smooth_signal( 113 tic, window_len, window, pol_order, implemented_smooth_method 114 )
Smooths the TIC or EIC data using the specified smoothing method and settings.
Parameters
- tic (numpy.ndarray): The TIC (or EIC) data to be smoothed.
Returns
- numpy.ndarray: The smoothed TIC data.
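A usage sketch for smooth_tic; `lcms` is an assumed, configured LCMSBase instance, and the synthetic trace stands in for a real TIC or EIC:

```python
import numpy as np

# The smoothing method, window length, and Savitzky-Golay polynomial order
# are read from lcms.parameters.lc_ms, so the result depends on that configuration.
rt = np.linspace(0, 10, 300)
noisy_trace = 1e5 * np.exp(-((rt - 5.0) ** 2) / 0.5) + np.random.normal(0, 1e3, rt.size)
smoothed = lcms.smooth_tic(noisy_trace)
```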
116 def eic_centroid_detector(self, rt, eic, max_eic, apex_indexes=[]): 117 """Performs EIC centroid detection on the given EIC data. 118 119 Parameters 120 ---------- 121 rt : numpy.ndarray 122 The retention time data. 123 eic : numpy.ndarray 124 The EIC data. 125 max_eic : float 126 The maximum EIC value. 127 apex_indexes : list, optional 128 The apexes of the EIC peaks. Defaults to [], which means that the apexes will be calculated by the function. 129 130 Returns 131 ------- 132 numpy.ndarray 133 The indexes of left, apex, and right limits as a generator. 134 """ 135 136 max_prominence = self.parameters.lc_ms.peak_max_prominence_percent 137 138 max_height = self.parameters.lc_ms.peak_height_max_percent 139 140 signal_threshold = self.parameters.lc_ms.eic_signal_threshold 141 142 min_peak_datapoints = self.parameters.lc_ms.min_peak_datapoints 143 144 peak_derivative_threshold = self.parameters.lc_ms.peak_derivative_threshold 145 146 include_indexes = sp.peak_picking_first_derivative( 147 domain=rt, 148 signal=eic, 149 max_height=max_height, 150 max_prominence=max_prominence, 151 max_signal=max_eic, 152 min_peak_datapoints=min_peak_datapoints, 153 peak_derivative_threshold=peak_derivative_threshold, 154 signal_threshold=signal_threshold, 155 correct_baseline=False, 156 plot_res=False, 157 apex_indexes=apex_indexes, 158 ) 159 160 return include_indexes
Performs EIC centroid detection on the given EIC data.
Parameters
- rt (numpy.ndarray): The retention time data.
- eic (numpy.ndarray): The EIC data.
- max_eic (float): The maximum EIC value.
- apex_indexes (list, optional): The apexes of the EIC peaks. Defaults to [], which means that the apexes will be calculated by the function.
Returns
- generator: The indexes of the left, apex, and right limits for each detected peak, yielded one peak at a time.
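A sketch of consuming the returned limits; `lcms` is an assumed, configured LCMSBase instance and the arrays are synthetic (in the pipeline they come from an extracted EIC):

```python
import numpy as np

rt = np.linspace(0, 10, 500)                   # retention time, minutes
eic = 1e6 * np.exp(-((rt - 5.0) ** 2) / 0.05)  # single synthetic peak
# max_eic is normally the maximum EIC value across the dataset (see get_max_eic);
# the local maximum is used here only to keep the sketch self-contained.
limits = lcms.eic_centroid_detector(rt, eic, max_eic=float(eic.max()))
for lims in limits:
    left, apex, right = lims[0], lims[1], lims[2]
    print(rt[left], rt[apex], rt[right])       # peak start, apex, and end times
```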
162 def find_nearest_scan(self, rt): 163 """Finds the nearest scan to the given retention time. 164 165 Parameters 166 ---------- 167 rt : float 168 The retention time (in minutes) to find the nearest scan for. 169 170 Returns 171 ------- 172 int 173 The scan number of the nearest scan. 174 """ 175 array_rt = np.array(self.retention_time) 176 177 scan_index = (np.abs(array_rt - rt)).argmin() 178 179 real_scan = self.scans_number[scan_index] 180 181 return real_scan
Finds the nearest scan to the given retention time.
Parameters
- rt (float): The retention time (in minutes) to find the nearest scan for.
Returns
- int: The scan number of the nearest scan.
183 def add_peak_metrics(self): 184 """Add peak metrics to the mass features. 185 186 This function calculates the peak metrics for each mass feature and adds them to the mass feature objects. 187 """ 188 # Check that at least some mass features have eic data 189 if not any([mf._eic_data is not None for mf in self.mass_features.values()]): 190 raise ValueError( 191 "No mass features have EIC data. Run integrate_mass_features first." 192 ) 193 194 for mass_feature in self.mass_features.values(): 195 # Check if the mass feature has been integrated 196 if mass_feature._eic_data is not None: 197 # Calculate peak metrics 198 mass_feature.calc_half_height_width() 199 mass_feature.calc_tailing_factor() 200 mass_feature.calc_dispersity_index()
Add peak metrics to the mass features.
This function calculates the peak metrics (half-height width, tailing factor, and dispersity index) for each integrated mass feature and adds them to the mass feature objects; integrate_mass_features() must be run first so that EIC data are available.
202 def get_average_mass_spectrum( 203 self, 204 scan_list, 205 apex_scan, 206 spectrum_mode="profile", 207 ms_level=1, 208 auto_process=True, 209 use_parser=False, 210 perform_checks=True, 211 polarity=None, 212 ms_params=None, 213 ): 214 """Returns an averaged mass spectrum object 215 216 Parameters 217 ---------- 218 scan_list : list 219 List of scan numbers to average. 220 apex_scan : int 221 Number of the apex scan 222 spectrum_mode : str, optional 223 The spectrum mode to use. Defaults to "profile". Not that only "profile" mode is supported for averaging. 224 ms_level : int, optional 225 The MS level to use. Defaults to 1. 226 auto_process : bool, optional 227 If True, the averaged mass spectrum will be auto-processed. Defaults to True. 228 use_parser : bool, optional 229 If True, the mass spectra will be obtained from the parser. Defaults to False. 230 perform_checks : bool, optional 231 If True, the function will check if the data are within the ms_unprocessed dictionary and are the correct mode. Defaults to True. Only set to False if you are sure the data are profile, and (if not using the parser) are in the ms_unprocessed dictionary! ms_unprocessed dictionary also must be indexed on scan 232 polarity : int, optional 233 The polarity of the mass spectra (1 or -1). If not set, the polarity will be determined from the dataset. Defaults to None. (fastest if set to -1 or 1) 234 ms_params : MSParameters, optional 235 The mass spectrum parameters to use. If not set (None), the globally set parameters will be used. Defaults to None. 236 237 Returns 238 ------- 239 MassSpectrumProfile 240 The averaged mass spectrum object. 241 242 Raises 243 ------ 244 ValueError 245 If the spectrum mode is not "profile". 246 If the MS level is not found in the unprocessed mass spectra dictionary. 247 If not all scan numbers are found in the unprocessed mass spectra dictionary. 
248 """ 249 if perform_checks: 250 if spectrum_mode != "profile": 251 raise ValueError("Averaging only supported for profile mode") 252 253 if polarity is None: 254 # set polarity to -1 if negative mode, 1 if positive mode (for mass spectrum creation) 255 if self.polarity == "negative": 256 polarity = -1 257 elif self.polarity == "positive": 258 polarity = 1 259 else: 260 raise ValueError( 261 "Polarity not set for dataset, must be a set containing either 'positive' or 'negative'" 262 ) 263 264 # if not using_parser, check that scan numbers are in _ms_unprocessed 265 if not use_parser: 266 if perform_checks: 267 # Set index to scan for faster lookup 268 ms_df = ( 269 self._ms_unprocessed[ms_level] 270 .copy() 271 .set_index("scan", drop=False) 272 .sort_index() 273 ) 274 my_ms_df = ms_df.loc[scan_list] 275 # Check that all scan numbers are in the ms_df 276 if not all(np.isin(scan_list, ms_df.index)): 277 raise ValueError( 278 "Not all scan numbers found in the unprocessed mass spectra dictionary" 279 ) 280 else: 281 my_ms_df = ( 282 pd.DataFrame({"scan": scan_list}) 283 .set_index("scan") 284 .join(self._ms_unprocessed[ms_level], how="left") 285 ) 286 287 if use_parser: 288 ms_list = [ 289 self.spectra_parser.get_mass_spectrum_from_scan( 290 x, spectrum_mode=spectrum_mode, auto_process=False 291 ) 292 for x in scan_list 293 ] 294 ms_mz = [x._mz_exp for x in ms_list] 295 ms_int = [x._abundance for x in ms_list] 296 my_ms_df = [] 297 for i in np.arange(len(ms_mz)): 298 my_ms_df.append( 299 pd.DataFrame( 300 {"mz": ms_mz[i], "intensity": ms_int[i], "scan": scan_list[i]} 301 ) 302 ) 303 my_ms_df = pd.concat(my_ms_df) 304 305 if not self.check_if_grid(my_ms_df): 306 my_ms_df = self.grid_data(my_ms_df) 307 308 my_ms_ave = my_ms_df.groupby("mz")["intensity"].sum().reset_index() 309 310 ms = ms_from_array_profile( 311 my_ms_ave.mz, 312 my_ms_ave.intensity, 313 self.file_location, 314 polarity=polarity, 315 auto_process=False, 316 ) 317 318 # Set the mass spectrum parameters, auto-process if auto_process is True, and add to the dataset 319 if ms is not None: 320 if ms_params is not None: 321 ms.parameters = ms_params 322 ms.scan_number = apex_scan 323 if auto_process: 324 ms.process_mass_spec() 325 return ms
Returns an averaged mass spectrum object
Parameters
- scan_list (list): List of scan numbers to average.
- apex_scan (int): Number of the apex scan
- spectrum_mode (str, optional): The spectrum mode to use. Defaults to "profile". Note that only "profile" mode is supported for averaging.
- ms_level (int, optional): The MS level to use. Defaults to 1.
- auto_process (bool, optional): If True, the averaged mass spectrum will be auto-processed. Defaults to True.
- use_parser (bool, optional): If True, the mass spectra will be obtained from the parser. Defaults to False.
- perform_checks (bool, optional): If True, the function will check that the data are in the ms_unprocessed dictionary and are the correct mode. Defaults to True. Only set to False if you are sure the data are profile mode and (if not using the parser) are present in the ms_unprocessed dictionary, which must also be indexed on scan.
- polarity (int, optional): The polarity of the mass spectra (1 or -1). If not set, the polarity will be determined from the dataset, which is slower. Defaults to None.
- ms_params (MSParameters, optional): The mass spectrum parameters to use. If not set (None), the globally set parameters will be used. Defaults to None.
Returns
- MassSpectrumProfile: The averaged mass spectrum object.
Raises
- ValueError: If the spectrum mode is not "profile". If the MS level is not found in the unprocessed mass spectra dictionary. If not all scan numbers are found in the unprocessed mass spectra dictionary.
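A usage sketch; `lcms` is an assumed, configured LCMSBase instance with profile MS1 data in its ms_unprocessed dictionary, and the retention time and scan offsets are illustrative:

```python
apex_scan = lcms.find_nearest_scan(rt=12.5)  # scan closest to 12.5 minutes
# Scan numbers are treated as consecutive here purely for brevity.
scans_to_average = [apex_scan - 2, apex_scan - 1, apex_scan, apex_scan + 1, apex_scan + 2]

ms_avg = lcms.get_average_mass_spectrum(
    scan_list=scans_to_average,
    apex_scan=apex_scan,
    spectrum_mode="profile",  # averaging is only supported for profile data
    ms_level=1,
    auto_process=True,
)
```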
327 def find_mass_features(self, ms_level=1, grid=True): 328 """Find mass features within an LCMSBase object 329 330 Note that this is a wrapper function that calls the find_mass_features_ph function, but can be extended to support other peak picking methods in the future. 331 332 Parameters 333 ---------- 334 ms_level : int, optional 335 The MS level to use for peak picking Default is 1. 336 grid : bool, optional 337 If True, will regrid the data before running the persistent homology calculations (after checking if the data is gridded), 338 used for persistent homology peak picking for profile data only. Default is True. 339 340 Raises 341 ------ 342 ValueError 343 If no MS level data is found on the object. 344 If persistent homology peak picking is attempted on non-profile mode data. 345 If data is not gridded and grid is False. 346 If peak picking method is not implemented. 347 348 Returns 349 ------- 350 None, but assigns the mass_features and eics attributes to the object. 351 352 """ 353 pp_method = self.parameters.lc_ms.peak_picking_method 354 355 if pp_method == "persistent homology": 356 msx_scan_df = self.scan_df[self.scan_df["ms_level"] == ms_level] 357 if all(msx_scan_df["ms_format"] == "profile"): 358 self.find_mass_features_ph(ms_level=ms_level, grid=grid) 359 self.cluster_mass_features(drop_children=True, sort_by="persistence") 360 else: 361 raise ValueError( 362 "MS{} scans are not profile mode, which is required for persistent homology peak picking.".format( 363 ms_level 364 ) 365 ) 366 elif pp_method == "centroided_persistent_homology": 367 msx_scan_df = self.scan_df[self.scan_df["ms_level"] == ms_level] 368 if all(msx_scan_df["ms_format"] == "centroid"): 369 self.find_mass_features_ph_centroid(ms_level=ms_level) 370 else: 371 raise ValueError( 372 "MS{} scans are not centroid mode, which is required for persistent homology centroided peak picking.".format( 373 ms_level 374 ) 375 ) 376 else: 377 raise ValueError("Peak picking method not implemented")
Find mass features within an LCMSBase object
Note that this is a wrapper function that calls the find_mass_features_ph function, but can be extended to support other peak picking methods in the future.
Parameters
- ms_level (int, optional): The MS level to use for peak picking. Default is 1.
- grid (bool, optional): If True, will regrid the data before running the persistent homology calculations (after checking if the data is gridded), used for persistent homology peak picking for profile data only. Default is True.
Raises
- ValueError: If no MS level data is found on the object. If persistent homology peak picking is attempted on non-profile mode data. If data is not gridded and grid is False. If peak picking method is not implemented.
Returns
- None, but assigns the mass_features and eics attributes to the object.
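A usage sketch; `lcms` is an assumed, configured LCMSBase instance. The peak picking method is read from the lc_ms parameters, and the two supported values are those handled in the source above:

```python
# "persistent homology" requires profile-mode scans at the chosen MS level;
# "centroided_persistent_homology" requires centroided scans.
lcms.parameters.lc_ms.peak_picking_method = "persistent homology"
lcms.find_mass_features(ms_level=1, grid=True)

print(len(lcms.mass_features))  # number of detected mass features
```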
379 def integrate_mass_features( 380 self, drop_if_fail=True, drop_duplicates=True, ms_level=1 381 ): 382 """Integrate mass features and extract EICs. 383 384 Populates the _eics attribute on the LCMSBase object for each unique mz in the mass_features dataframe and adds data (start_scan, final_scan, area) to the mass_features attribute. 385 386 Parameters 387 ---------- 388 drop_if_fail : bool, optional 389 Whether to drop mass features if the EIC limit calculations fail. 390 Default is True. 391 drop_duplicates : bool, optional 392 Whether to mass features that appear to be duplicates 393 (i.e., mz is similar to another mass feature and limits of the EIC are similar or encapsulating). 394 Default is True. 395 ms_level : int, optional 396 The MS level to use. Default is 1. 397 398 Raises 399 ------ 400 ValueError 401 If no mass features are found. 402 If no MS level data is found for the given MS level (either in data or in the scan data) 403 404 Returns 405 ------- 406 None, but populates the eics attribute on the LCMSBase object and adds data (start_scan, final_scan, area) to the mass_features attribute. 407 408 Notes 409 ----- 410 drop_if_fail is useful for discarding mass features that do not have good shapes, usually due to a detection on a shoulder of a peak or a noisy region (especially if minimal smoothing is used during mass feature detection). 411 """ 412 # Check if there is data 413 if ms_level in self._ms_unprocessed.keys(): 414 raw_data = self._ms_unprocessed[ms_level].copy() 415 else: 416 raise ValueError("No MS level " + str(ms_level) + " data found") 417 if self.mass_features is not None: 418 mf_df = self.mass_features_to_df().copy() 419 else: 420 raise ValueError( 421 "No mass features found, did you run find_mass_features() first?" 422 ) 423 # Check if mass_spectrum exists on each mass feature 424 if not all( 425 [mf.mass_spectrum is not None for mf in self.mass_features.values()] 426 ): 427 raise ValueError( 428 "Mass spectrum must be associated with each mass feature, did you run add_associated_ms1() first?" 
429 ) 430 431 # Subset scan data to only include correct ms_level 432 scan_df_sub = self.scan_df[ 433 self.scan_df["ms_level"] == int(ms_level) 434 ].reset_index(drop=True) 435 if scan_df_sub.empty: 436 raise ValueError("No MS level " + ms_level + " data found in scan data") 437 scan_df_sub = scan_df_sub[["scan", "scan_time"]].copy() 438 439 mzs_to_extract = np.unique(mf_df["mz"].values) 440 mzs_to_extract.sort() 441 442 # Get EICs for each unique mz in mass features list 443 for mz in mzs_to_extract: 444 mz_max = mz + self.parameters.lc_ms.eic_tolerance_ppm * mz / 1e6 445 mz_min = mz - self.parameters.lc_ms.eic_tolerance_ppm * mz / 1e6 446 raw_data_sub = raw_data[ 447 (raw_data["mz"] >= mz_min) & (raw_data["mz"] <= mz_max) 448 ].reset_index(drop=True) 449 raw_data_sub = ( 450 raw_data_sub.groupby(["scan"])["intensity"].sum().reset_index() 451 ) 452 raw_data_sub = scan_df_sub.merge(raw_data_sub, on="scan", how="left") 453 raw_data_sub["intensity"] = raw_data_sub["intensity"].fillna(0) 454 myEIC = EIC_Data( 455 scans=raw_data_sub["scan"].values, 456 time=raw_data_sub["scan_time"].values, 457 eic=raw_data_sub["intensity"].values, 458 ) 459 # Smooth EIC 460 smoothed_eic = self.smooth_tic(myEIC.eic) 461 smoothed_eic[smoothed_eic < 0] = 0 462 myEIC.eic_smoothed = smoothed_eic 463 self.eics[mz] = myEIC 464 465 # Get limits of mass features using EIC centroid detector and integrate 466 mf_df["area"] = np.nan 467 for idx, mass_feature in mf_df.iterrows(): 468 mz = mass_feature.mz 469 apex_scan = mass_feature.apex_scan 470 471 # Pull EIC data and find apex scan index 472 myEIC = self.eics[mz] 473 self.mass_features[idx]._eic_data = myEIC 474 apex_index = np.where(myEIC.scans == apex_scan)[0][0] 475 476 # Find left and right limits of peak using EIC centroid detector, add to EICData 477 centroid_eics = self.eic_centroid_detector( 478 myEIC.time, 479 myEIC.eic_smoothed, 480 mass_feature.intensity * 1.1, 481 apex_indexes=[int(apex_index)], 482 ) 483 l_a_r_scan_idx = [i for i in centroid_eics] 484 if len(l_a_r_scan_idx) > 0: 485 # Add start and final scan to mass_features and EICData 486 left_scan, right_scan = ( 487 myEIC.scans[l_a_r_scan_idx[0][0]], 488 myEIC.scans[l_a_r_scan_idx[0][2]], 489 ) 490 mf_scan_apex = [(left_scan, int(apex_scan), right_scan)] 491 myEIC.apexes = myEIC.apexes + mf_scan_apex 492 self.mass_features[idx].start_scan = left_scan 493 self.mass_features[idx].final_scan = right_scan 494 495 # Find area under peak using limits from EIC centroid detector, add to mass_features and EICData 496 area = np.trapz( 497 myEIC.eic_smoothed[l_a_r_scan_idx[0][0] : l_a_r_scan_idx[0][2] + 1], 498 myEIC.time[l_a_r_scan_idx[0][0] : l_a_r_scan_idx[0][2] + 1], 499 ) 500 mf_df.at[idx, "area"] = area 501 myEIC.areas = myEIC.areas + [area] 502 self.eics[mz] = myEIC 503 self.mass_features[idx]._area = area 504 else: 505 if drop_if_fail is True: 506 self.mass_features.pop(idx) 507 508 if drop_duplicates: 509 # Prepare mass feature dataframe 510 mf_df = self.mass_features_to_df().copy() 511 512 # For each mass feature, find all mass features within the clustering tolerance ppm and drop if their start and end times are within another mass feature 513 # Kepp the first mass fea 514 for idx, mass_feature in mf_df.iterrows(): 515 mz = mass_feature.mz 516 apex_scan = mass_feature.apex_scan 517 518 mf_df["mz_diff_ppm"] = np.abs(mf_df["mz"] - mz) / mz * 10**6 519 mf_df_sub = mf_df[ 520 mf_df["mz_diff_ppm"] 521 < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel 522 * 10**6 523 ].copy() 524 525 # For all 
mass features within the clustering tolerance, check if the start and end times are within the start and end times of the mass feature 526 for idx2, mass_feature2 in mf_df_sub.iterrows(): 527 if idx2 != idx: 528 if ( 529 mass_feature2.start_scan >= mass_feature.start_scan 530 and mass_feature2.final_scan <= mass_feature.final_scan 531 ): 532 if idx2 in self.mass_features.keys(): 533 self.mass_features.pop(idx2)
Integrate mass features and extract EICs.
Populates the _eics attribute on the LCMSBase object for each unique mz in the mass_features dataframe and adds data (start_scan, final_scan, area) to the mass_features attribute.
Parameters
- drop_if_fail (bool, optional): Whether to drop mass features if the EIC limit calculations fail. Default is True.
- drop_duplicates (bool, optional): Whether to drop mass features that appear to be duplicates (i.e., the mz is similar to another mass feature and the limits of the EIC are similar or encapsulating). Default is True.
- ms_level (int, optional): The MS level to use. Default is 1.
Raises
- ValueError: If no mass features are found. If no MS level data is found for the given MS level (either in the data or in the scan data).
Returns
- None, but populates the eics attribute on the LCMSBase object and adds data (start_scan, final_scan, area) to the mass_features attribute.
Notes
drop_if_fail is useful for discarding mass features that do not have good shapes, usually due to a detection on a shoulder of a peak or a noisy region (especially if minimal smoothing is used during mass feature detection).
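A usage sketch; `lcms` is an assumed LCMSBase instance on which find_mass_features() has been run and each mass feature already has an associated MS1 spectrum:

```python
lcms.integrate_mass_features(drop_if_fail=True, drop_duplicates=True, ms_level=1)

# After integration, EIC limits and areas are attached to each surviving feature
# and the smoothed EICs are stored on lcms.eics, keyed by mz.
for mf_id, mf in lcms.mass_features.items():
    print(mf_id, mf.start_scan, mf.final_scan)
```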
535 def find_c13_mass_features(self): 536 """Mark likely C13 isotopes and connect to monoisoitopic mass features. 537 538 Returns 539 ------- 540 None, but populates the monoisotopic_mf_id and isotopologue_type attributes to the indivual LCMSMassFeatures within the mass_features attribute of the LCMSBase object. 541 542 Raises 543 ------ 544 ValueError 545 If no mass features are found. 546 """ 547 verbose = self.parameters.lc_ms.verbose_processing 548 if verbose: 549 print("evaluating mass features for C13 isotopes") 550 if self.mass_features is None: 551 raise ValueError("No mass features found, run find_mass_features() first") 552 553 # Data prep fo sparse distance matrix 554 dims = ["mz", "scan_time"] 555 mf_df = self.mass_features_to_df().copy() 556 # Drop mass features that have no area (these are likely to be noise) 557 mf_df = mf_df[mf_df["area"].notnull()] 558 mf_df["mf_id"] = mf_df.index.values 559 dims = ["mz", "scan_time"] 560 561 # Sort my ascending mz so we always get the monoisotopic mass first, regardless of the order/intensity of the mass features 562 mf_df = mf_df.sort_values(by=["mz"]).reset_index(drop=True).copy() 563 564 mz_diff = 1.003355 # C13-C12 mass difference 565 tol = [ 566 mf_df["mz"].median() 567 * self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel, 568 self.parameters.lc_ms.mass_feature_cluster_rt_tolerance * 0.5, 569 ] # mz, in relative; scan_time in minutes 570 571 # Compute inter-feature distances 572 distances = None 573 for i in range(len(dims)): 574 # Construct k-d tree 575 values = mf_df[dims[i]].values 576 tree = KDTree(values.reshape(-1, 1)) 577 578 max_tol = tol[i] 579 if dims[i] == "mz": 580 # Maximum absolute tolerance 581 max_tol = mz_diff + tol[i] 582 583 # Compute sparse distance matrix 584 # the larger the max_tol, the slower this operation is 585 sdm = tree.sparse_distance_matrix(tree, max_tol, output_type="coo_matrix") 586 587 # Only consider forward case, exclude diagonal 588 sdm = sparse.triu(sdm, k=1) 589 590 if dims[i] == "mz": 591 min_tol = mz_diff - tol[i] 592 # Get only the ones that are above the min tol 593 idx = sdm.data > min_tol 594 595 # Reconstruct sparse distance matrix 596 sdm = sparse.coo_matrix( 597 (sdm.data[idx], (sdm.row[idx], sdm.col[idx])), 598 shape=(len(values), len(values)), 599 ) 600 601 # Cast as binary matrix 602 sdm.data = np.ones_like(sdm.data) 603 604 # Stack distances 605 if distances is None: 606 distances = sdm 607 else: 608 distances = distances.multiply(sdm) 609 610 # Extract indices of within-tolerance points 611 distances = distances.tocoo() 612 pairs = np.stack((distances.row, distances.col), axis=1) # C12 to C13 pairs 613 614 # Turn pairs (which are index of mf_df) into mf_id and then into two dataframes to join to mf_df 615 pairs_mf = pairs.copy() 616 pairs_mf[:, 0] = mf_df.iloc[pairs[:, 0]].mf_id.values 617 pairs_mf[:, 1] = mf_df.iloc[pairs[:, 1]].mf_id.values 618 619 # Connect monoisotopic masses with isotopologes within mass_features 620 monos = np.setdiff1d(np.unique(pairs_mf[:, 0]), np.unique(pairs_mf[:, 1])) 621 for mono in monos: 622 self.mass_features[mono].monoisotopic_mf_id = mono 623 pairs_iso_df = pd.DataFrame(pairs_mf, columns=["parent", "child"]) 624 while not pairs_iso_df.empty: 625 pairs_iso_df = pairs_iso_df.set_index("parent", drop=False) 626 m1_isos = pairs_iso_df.loc[monos, "child"].unique() 627 for iso in m1_isos: 628 # Set monoisotopic_mf_id and isotopologue_type for isotopologues 629 parent = pairs_mf[pairs_mf[:, 1] == iso, 0] 630 if len(parent) > 1: 631 # Choose the 
parent that is closest in time to the isotopologue 632 parent_time = [self.mass_features[p].retention_time for p in parent] 633 time_diff = [ 634 np.abs(self.mass_features[iso].retention_time - x) 635 for x in parent_time 636 ] 637 parent = parent[np.argmin(time_diff)] 638 else: 639 parent = parent[0] 640 self.mass_features[iso].monoisotopic_mf_id = self.mass_features[ 641 parent 642 ].monoisotopic_mf_id 643 if self.mass_features[iso].monoisotopic_mf_id is not None: 644 mass_diff = ( 645 self.mass_features[iso].mz 646 - self.mass_features[ 647 self.mass_features[iso].monoisotopic_mf_id 648 ].mz 649 ) 650 self.mass_features[iso].isotopologue_type = "13C" + str( 651 int(round(mass_diff, 0)) 652 ) 653 654 # Drop the mono and iso from the pairs_iso_df 655 pairs_iso_df = pairs_iso_df.drop( 656 index=monos, errors="ignore" 657 ) # Drop pairs where the parent is a child that is a child of a root 658 pairs_iso_df = pairs_iso_df.set_index("child", drop=False) 659 pairs_iso_df = pairs_iso_df.drop(index=m1_isos, errors="ignore") 660 661 if not pairs_iso_df.empty: 662 # Get new monos, recognizing that these are just 13C isotopologues that are connected to other 13C isotopologues to repeat the process 663 monos = np.setdiff1d( 664 np.unique(pairs_iso_df.parent), np.unique(pairs_iso_df.child) 665 ) 666 if verbose: 667 # Report fraction of compounds annotated with isotopes 668 mf_df["c13_flag"] = np.where( 669 np.logical_or( 670 np.isin(mf_df["mf_id"], pairs_mf[:, 0]), 671 np.isin(mf_df["mf_id"], pairs_mf[:, 1]), 672 ), 673 1, 674 0, 675 ) 676 print( 677 str(round(len(mf_df[mf_df["c13_flag"] == 1]) / len(mf_df), ndigits=3)) 678 + " of mass features have or are C13 isotopes" 679 )
Mark likely C13 isotopes and connect them to their monoisotopic mass features.
Returns
- None, but populates the monoisotopic_mf_id and isotopologue_type attributes of the individual LCMSMassFeatures within the mass_features attribute of the LCMSBase object.
Raises
- ValueError: If no mass features are found.
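A usage sketch; `lcms` is an assumed LCMSBase instance with integrated mass features (features without an area are ignored by this step):

```python
lcms.find_c13_mass_features()

# Isotopologues are labeled "13C1", "13C2", ... and point back to their
# monoisotopic feature; getattr is used defensively in this sketch.
for mf_id, mf in lcms.mass_features.items():
    iso_type = getattr(mf, "isotopologue_type", None)
    if iso_type is not None:
        print(mf_id, iso_type, "->", mf.monoisotopic_mf_id)
```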
681 def deconvolute_ms1_mass_features(self): 682 """Deconvolute MS1 mass features 683 684 Deconvolute mass features ms1 spectrum based on the correlation of all masses within a spectrum over the EIC of the mass features 685 686 Parameters 687 ---------- 688 None 689 690 Returns 691 ------- 692 None, but assigns the _ms_deconvoluted_idx, mass_spectrum_deconvoluted_parent, 693 and associated_mass_features_deconvoluted attributes to the mass features in the 694 mass_features attribute of the LCMSBase object. 695 696 Raises 697 ------ 698 ValueError 699 If no mass features are found, must run find_mass_features() first. 700 If no EICs are found, did you run integrate_mass_features() first? 701 702 """ 703 # Checks for set mass_features and eics 704 if self.mass_features is None: 705 raise ValueError( 706 "No mass features found, did you run find_mass_features() first?" 707 ) 708 709 if self.eics == {}: 710 raise ValueError( 711 "No EICs found, did you run integrate_mass_features() first?" 712 ) 713 714 if 1 not in self._ms_unprocessed.keys(): 715 raise ValueError("No unprocessed MS1 spectra found.") 716 717 # Prep ms1 data 718 ms1_data = self._ms_unprocessed[1].copy() 719 ms1_data = ms1_data.set_index("scan") 720 721 # Prep mass feature summary 722 mass_feature_df = self.mass_features_to_df() 723 724 # Loop through each mass feature 725 for mf_id, mass_feature in self.mass_features.items(): 726 # Check that the mass_feature.mz attribute == the mz of the mass feature in the mass_feature_df 727 if mass_feature.mz != mass_feature.ms1_peak.mz_exp: 728 continue 729 730 # Get the left and right limits of the EIC of the mass feature 731 l_scan, _, r_scan = mass_feature._eic_data.apexes[0] 732 733 # Pull from the _ms1_unprocessed data the scan range of interest and sort by mz 734 ms1_data_sub = ms1_data.loc[l_scan:r_scan].copy() 735 ms1_data_sub = ms1_data_sub.sort_values(by=["mz"]).reset_index(drop=False) 736 737 # Get the centroided masses of the mass feature 738 mf_mspeak_mzs = mass_feature.mass_spectrum.mz_exp 739 740 # Find the closest mz in the ms1 data to the centroided masses of the mass feature 741 ms1_data_sub["mass_feature_mz"] = mf_mspeak_mzs[ 742 find_closest(mf_mspeak_mzs, ms1_data_sub.mz.values) 743 ] 744 745 # Drop rows with mz_diff > 0.01 between the mass feature mz and the ms1 data mz 746 ms1_data_sub["mz_diff_rel"] = ( 747 np.abs(ms1_data_sub["mass_feature_mz"] - ms1_data_sub["mz"]) 748 / ms1_data_sub["mz"] 749 ) 750 ms1_data_sub = ms1_data_sub[ 751 ms1_data_sub["mz_diff_rel"] 752 < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel 753 ].reset_index(drop=True) 754 755 # Group by mass_feature_mz and scan and sum intensity 756 ms1_data_sub_group = ( 757 ms1_data_sub.groupby(["mass_feature_mz", "scan"])["intensity"] 758 .sum() 759 .reset_index() 760 ) 761 762 # Calculate the correlation of the intensities of the mass feature and the ms1 data (set to 0 if no intensity) 763 corr = ( 764 ms1_data_sub_group.pivot( 765 index="scan", columns="mass_feature_mz", values="intensity" 766 ) 767 .fillna(0) 768 .corr() 769 ) 770 771 # Subset the correlation matrix to only include the masses of the mass feature and those with a correlation > 0.8 772 decon_corr_min = self.parameters.lc_ms.ms1_deconvolution_corr_min 773 corr_subset = corr.loc[mass_feature.mz,] 774 corr_subset = corr_subset[corr_subset > decon_corr_min] 775 776 # Get the masses from the mass spectrum that are the result of the deconvolution 777 mzs_decon = corr_subset.index.values 778 779 # Get the indices of the mzs_decon in 
mass_feature.mass_spectrum.mz_exp and assign to the mass feature 780 mzs_decon_idx = [ 781 id 782 for id, mz in enumerate(mass_feature.mass_spectrum.mz_exp) 783 if mz in mzs_decon 784 ] 785 mass_feature._ms_deconvoluted_idx = mzs_decon_idx 786 787 # Check if the mass feature's ms1 peak is the largest in the deconvoluted mass spectrum 788 if ( 789 mass_feature.ms1_peak.abundance 790 == mass_feature.mass_spectrum.abundance[mzs_decon_idx].max() 791 ): 792 mass_feature.mass_spectrum_deconvoluted_parent = True 793 else: 794 mass_feature.mass_spectrum_deconvoluted_parent = False 795 796 # Check for other mass features that are in the deconvoluted mass spectrum and add the deconvoluted mass spectrum to the mass feature 797 # Subset mass_feature_df to only include mass features that are within the clustering tolerance 798 mass_feature_df_sub = mass_feature_df[ 799 abs(mass_feature.retention_time - mass_feature_df["scan_time"]) 800 < self.parameters.lc_ms.mass_feature_cluster_rt_tolerance 801 ].copy() 802 # Calculate the mz difference in ppm between the mass feature and the peaks in the deconvoluted mass spectrum 803 mass_feature_df_sub["mz_diff_ppm"] = [ 804 np.abs(mzs_decon - mz).min() / mz * 10**6 805 for mz in mass_feature_df_sub["mz"] 806 ] 807 # Subset mass_feature_df to only include mass features that are within 1 ppm of the deconvoluted masses 808 mfs_associated_decon = mass_feature_df_sub[ 809 mass_feature_df_sub["mz_diff_ppm"] 810 < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel * 10**6 811 ].index.values 812 813 mass_feature.associated_mass_features_deconvoluted = mfs_associated_decon
deconvolute_ms1_mass_features()
Deconvolute MS1 mass features.
Deconvolutes each mass feature's MS1 spectrum based on the correlation of all masses within the spectrum over the EIC of the mass feature.
Parameters
- None
Returns
- None, but assigns the _ms_deconvoluted_idx, mass_spectrum_deconvoluted_parent, and associated_mass_features_deconvoluted attributes to the mass features in the mass_features attribute of the LCMSBase object.
Raises
- ValueError: If no mass features are found (run find_mass_features() first) or if no EICs are found (run integrate_mass_features() first).
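The core of the deconvolution is the pivot-and-correlate step: scan-by-scan intensities are pivoted into a scan x m/z table, each m/z trace is correlated against the mass feature's own m/z trace, and only masses whose correlation exceeds ms1_deconvolution_corr_min are kept. The following is a minimal, self-contained sketch of that step on synthetic data; the traces, the 0.8 cutoff, and the parent m/z are made up for illustration and are not part of CoreMS.

import pandas as pd

# Synthetic long-format MS1 data: three m/z traces over five scans.
# m/z 100.0 and 150.0 co-elute (highly correlated); m/z 200.0 does not.
ms1 = pd.DataFrame(
    {
        "scan": [1, 2, 3, 4, 5] * 3,
        "mz": [100.0] * 5 + [150.0] * 5 + [200.0] * 5,
        "intensity": [10, 50, 100, 40, 5,   # parent trace
                      8, 45, 90, 35, 4,     # co-eluting trace
                      60, 55, 50, 45, 40],  # unrelated trace
    }
)

# Pivot to a scan x m/z matrix and correlate traces, as deconvolute_ms1_mass_features does
corr = ms1.pivot(index="scan", columns="mz", values="intensity").fillna(0).corr()

parent_mz = 100.0   # the mass feature's own m/z
corr_min = 0.8      # stand-in for parameters.lc_ms.ms1_deconvolution_corr_min
corr_subset = corr.loc[parent_mz]
mzs_decon = corr_subset[corr_subset > corr_min].index.values
print(mzs_decon)    # expected: [100. 150.]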
816class PHCalculations: 817 """Methods for performing calculations related to 2D peak picking via persistent homology on LCMS data. 818 819 Notes 820 ----- 821 This class is intended to be used as a mixin for the LCMSBase class. 822 823 Methods 824 ------- 825 * sparse_mean_filter(idx, V, radius=[0, 1, 1]). 826 Sparse implementation of a mean filter. 827 * embed_unique_indices(a). 828 Creates an array of indices, sorted by unique element. 829 * sparse_upper_star(idx, V). 830 Sparse implementation of an upper star filtration. 831 * check_if_grid(data). 832 Check if the data is gridded in mz space. 833 * grid_data(data). 834 Grid the data in the mz dimension. 835 * find_mass_features_ph(ms_level=1, grid=True). 836 Find mass features within an LCMSBase object using persistent homology. 837 * cluster_mass_features(drop_children=True). 838 Cluster regions of interest. 839 """ 840 841 @staticmethod 842 def sparse_mean_filter(idx, V, radius=[0, 1, 1]): 843 """Sparse implementation of a mean filter. 844 845 Parameters 846 ---------- 847 idx : :obj:`~numpy.array` 848 Edge indices for each dimension (MxN). 849 V : :obj:`~numpy.array` 850 Array of intensity data (Mx1). 851 radius : float or list 852 Radius of the sparse filter in each dimension. Values less than 853 zero indicate no connectivity in that dimension. 854 855 Returns 856 ------- 857 :obj:`~numpy.array` 858 Filtered intensities (Mx1). 859 860 Notes 861 ----- 862 This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos. 863 This is a static method. 864 """ 865 866 # Copy indices 867 idx = idx.copy().astype(V.dtype) 868 869 # Scale 870 for i, r in enumerate(radius): 871 # Increase inter-index distance 872 if r < 1: 873 idx[:, i] *= 2 874 875 # Do nothing 876 elif r == 1: 877 pass 878 879 # Decrease inter-index distance 880 else: 881 idx[:, i] /= r 882 883 # Connectivity matrix 884 cmat = KDTree(idx) 885 cmat = cmat.sparse_distance_matrix(cmat, 1, p=np.inf, output_type="coo_matrix") 886 cmat.setdiag(1) 887 888 # Pair indices 889 I, J = cmat.nonzero() 890 891 # Delete cmat 892 cmat_shape = cmat.shape 893 del cmat 894 895 # Sum over columns 896 V_sum = sparse.bsr_matrix( 897 (V[J], (I, I)), shape=cmat_shape, dtype=V.dtype 898 ).diagonal(0) 899 900 # Count over columns 901 V_count = sparse.bsr_matrix( 902 (np.ones_like(J), (I, I)), shape=cmat_shape, dtype=V.dtype 903 ).diagonal(0) 904 905 return V_sum / V_count 906 907 @staticmethod 908 def embed_unique_indices(a): 909 """Creates an array of indices, sorted by unique element. 910 911 Parameters 912 ---------- 913 a : :obj:`~numpy.array` 914 Array of unique elements (Mx1). 915 916 Returns 917 ------- 918 :obj:`~numpy.array` 919 Array of indices (Mx1). 920 921 Notes 922 ----- 923 This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos 924 This is a static method. 925 """ 926 927 def count_tens(n): 928 # Count tens 929 ntens = (n - 1) // 10 930 931 while True: 932 ntens_test = (ntens + n - 1) // 10 933 934 if ntens_test == ntens: 935 return ntens 936 else: 937 ntens = ntens_test 938 939 def arange_exclude_10s(n): 940 # How many 10s will there be? 
941 ntens = count_tens(n) 942 943 # Base array 944 arr = np.arange(0, n + ntens) 945 946 # Exclude 10s 947 arr = arr[(arr == 0) | (arr % 10 != 0)][:n] 948 949 return arr 950 951 # Creates an array of indices, sorted by unique element 952 idx_sort = np.argsort(a) 953 idx_unsort = np.argsort(idx_sort) 954 955 # Sorts records array so all unique elements are together 956 sorted_a = a[idx_sort] 957 958 # Returns the unique values, the index of the first occurrence, 959 # and the count for each element 960 vals, idx_start, count = np.unique( 961 sorted_a, return_index=True, return_counts=True 962 ) 963 964 # Splits the indices into separate arrays 965 splits = np.split(idx_sort, idx_start[1:]) 966 967 # Creates unique indices for each split 968 idx_unq = np.concatenate([arange_exclude_10s(len(x)) for x in splits]) 969 970 # Reorders according to input array 971 idx_unq = idx_unq[idx_unsort] 972 973 # Magnitude of each index 974 exp = np.log10( 975 idx_unq, where=idx_unq > 0, out=np.zeros_like(idx_unq, dtype=np.float64) 976 ) 977 idx_unq_mag = np.power(10, np.floor(exp) + 1) 978 979 # Result 980 return a + idx_unq / idx_unq_mag 981 982 @staticmethod 983 def roll_up_dataframe( 984 df : pd.DataFrame, 985 sort_by : str, 986 tol : list, 987 relative : list, 988 dims : list 989 ): 990 """Subset data by rolling up into apex in appropriate dimensions. 991 992 Parameters 993 ---------- 994 data : pd.DataFrame 995 The input data containing "dims" columns and the "sort_by" column. 996 sort_by : str 997 The column to sort the data by, this will determine which mass features get rolled up into a parent mass feature 998 (i.e., the mass feature with the highest value in the sort_by column). 999 dims : list 1000 A list of dimension names (column names in the data DataFrame) to roll up the mass features by. 1001 tol : list 1002 A list of tolerances for each dimension. The length of the list must match the number of dimensions. 1003 The tolerances can be relative (as a fraction of the maximum value in that dimension) or absolute (in the units of that dimension). 1004 If relative is True, the tolerance will be multiplied by the maximum value in that dimension. 1005 If relative is False, the tolerance will be used as is. 1006 relative : list 1007 A list of booleans indicating whether the tolerance for each dimension is relative (True) or absolute (False). 1008 1009 Returns 1010 ------- 1011 pd.DataFrame 1012 A DataFrame with only the rolled up mass features, with the original index and columns. 1013 1014 1015 Raises 1016 ------ 1017 ValueError 1018 If the input data is not a pandas DataFrame. 1019 If the input data does not have columns for each of the dimensions in "dims". 1020 If the length of "dims", "tol", and "relative" do not match. 
1021 """ 1022 og_columns = df.columns.copy() 1023 1024 # Unindex the data, but keep the original index 1025 if df.index.name is not None: 1026 og_index = df.index.name 1027 else: 1028 og_index = "index" 1029 df = df.reset_index(drop=False) 1030 1031 # Sort data by sort_by column, and reindex 1032 df = df.sort_values(by=sort_by, ascending=False).reset_index(drop=True) 1033 1034 # Check that data is a DataFrame and has columns for each of the dims 1035 if not isinstance(df, pd.DataFrame): 1036 raise ValueError("Data must be a pandas DataFrame") 1037 for dim in dims: 1038 if dim not in df.columns: 1039 raise ValueError(f"Data must have a column for {dim}") 1040 if len(dims) != len(tol) or len(dims) != len(relative): 1041 raise ValueError( 1042 "Dimensions, tolerances, and relative flags must be the same length" 1043 ) 1044 1045 # Compute inter-feature distances 1046 distances = None 1047 for i in range(len(dims)): 1048 # Construct k-d tree 1049 values = df[dims[i]].values 1050 tree = KDTree(values.reshape(-1, 1)) 1051 1052 max_tol = tol[i] 1053 if relative[i] is True: 1054 # Maximum absolute tolerance 1055 max_tol = tol[i] * values.max() 1056 1057 # Compute sparse distance matrix 1058 # the larger the max_tol, the slower this operation is 1059 sdm = tree.sparse_distance_matrix(tree, max_tol, output_type="coo_matrix") 1060 1061 # Only consider forward case, exclude diagonal 1062 sdm = sparse.triu(sdm, k=1) 1063 1064 # Filter relative distances 1065 if relative[i] is True: 1066 # Compute relative distances 1067 rel_dists = sdm.data / values[sdm.row] # or col? 1068 1069 # Indices of relative distances less than tolerance 1070 idx = rel_dists <= tol[i] 1071 1072 # Reconstruct sparse distance matrix 1073 sdm = sparse.coo_matrix( 1074 (rel_dists[idx], (sdm.row[idx], sdm.col[idx])), 1075 shape=(len(values), len(values)), 1076 ) 1077 1078 # Cast as binary matrix 1079 sdm.data = np.ones_like(sdm.data) 1080 1081 # Stack distances 1082 if distances is None: 1083 distances = sdm 1084 else: 1085 distances = distances.multiply(sdm) 1086 1087 # Extract indices of within-tolerance points 1088 distances = distances.tocoo() 1089 pairs = np.stack((distances.row, distances.col), axis=1) 1090 pairs_df = pd.DataFrame(pairs, columns=["parent", "child"]) 1091 pairs_df = pairs_df.set_index("parent") 1092 1093 to_drop = [] 1094 while not pairs_df.empty: 1095 # Find root_parents and their children 1096 root_parents = np.setdiff1d( 1097 np.unique(pairs_df.index.values), np.unique(pairs_df.child.values) 1098 ) 1099 children_of_roots = pairs_df.loc[root_parents, "child"].unique() 1100 to_drop = np.append(to_drop, children_of_roots) 1101 1102 # Remove root_children as possible parents from pairs_df for next iteration 1103 pairs_df = pairs_df.drop(index=children_of_roots, errors="ignore") 1104 pairs_df = pairs_df.reset_index().set_index("child") 1105 # Remove root_children as possible children from pairs_df for next iteration 1106 pairs_df = pairs_df.drop(index=children_of_roots) 1107 1108 # Prepare for next iteration 1109 pairs_df = pairs_df.reset_index().set_index("parent") 1110 1111 # Drop mass features that are not cluster parents 1112 df_sub = df.drop(index=np.array(to_drop)) 1113 1114 # Set index back to og_index and only keep the columns that are in the original dataframe 1115 df_sub = df_sub.set_index(og_index) 1116 1117 # sort the dataframe by the original index 1118 df_sub = df_sub.sort_index() 1119 df_sub = df_sub[og_columns] 1120 1121 return df_sub 1122 1123 def sparse_upper_star(self, idx, V): 1124 """Sparse 
implementation of an upper star filtration. 1125 1126 Parameters 1127 ---------- 1128 idx : :obj:`~numpy.array` 1129 Edge indices for each dimension (MxN). 1130 V : :obj:`~numpy.array` 1131 Array of intensity data (Mx1). 1132 Returns 1133 ------- 1134 idx : :obj:`~numpy.array` 1135 Index of filtered points (Mx1). 1136 persistence : :obj:`~numpy.array` 1137 Persistence of each filtered point (Mx1). 1138 1139 Notes 1140 ----- 1141 This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos 1142 """ 1143 1144 # Invert 1145 V = -1 * V.copy().astype(int) 1146 1147 # Embed indices 1148 V = self.embed_unique_indices(V) 1149 1150 # Connectivity matrix 1151 cmat = KDTree(idx) 1152 cmat = cmat.sparse_distance_matrix(cmat, 1, p=np.inf, output_type="coo_matrix") 1153 cmat.setdiag(1) 1154 cmat = sparse.triu(cmat) 1155 1156 # Pairwise minimums 1157 I, J = cmat.nonzero() 1158 d = np.maximum(V[I], V[J]) 1159 1160 # Delete connectiity matrix 1161 cmat_shape = cmat.shape 1162 del cmat 1163 1164 # Sparse distance matrix 1165 sdm = sparse.coo_matrix((d, (I, J)), shape=cmat_shape) 1166 1167 # Delete pairwise mins 1168 del d, I, J 1169 1170 # Persistence homology 1171 ph = ripser(sdm, distance_matrix=True, maxdim=0)["dgms"][0] 1172 1173 # Bound death values 1174 ph[ph[:, 1] == np.inf, 1] = np.max(V) 1175 1176 # Construct tree to query against 1177 tree = KDTree(V.reshape((-1, 1))) 1178 1179 # Get the indexes of the first nearest neighbor by birth 1180 _, nn = tree.query(ph[:, 0].reshape((-1, 1)), k=1, workers=-1) 1181 1182 return nn, -(ph[:, 0] // 1 - ph[:, 1] // 1) 1183 1184 def check_if_grid(self, data): 1185 """Check if the data are gridded in mz space. 1186 1187 Parameters 1188 ---------- 1189 data : DataFrame 1190 DataFrame containing the mass spectrometry data. Needs to have mz and scan columns. 1191 1192 Returns 1193 ------- 1194 bool 1195 True if the data is gridded in the mz direction, False otherwise. 1196 1197 Notes 1198 ----- 1199 This function is used within the grid_data function and the find_mass_features function and is not intended to be called directly. 1200 """ 1201 # Calculate the difference between consecutive mz values in a single scan 1202 dat_check = data.copy().reset_index(drop=True) 1203 dat_check["mz_diff"] = np.abs(dat_check["mz"].diff()) 1204 mz_diff_min = ( 1205 dat_check.groupby("scan")["mz_diff"].min().min() 1206 ) # within each scan, what is the smallest mz difference between consecutive mz values 1207 1208 # Find the mininum mz difference between mz values in the data; regardless of scan 1209 dat_check_mz = dat_check[["mz"]].drop_duplicates().copy() 1210 dat_check_mz = dat_check_mz.sort_values(by=["mz"]).reset_index(drop=True) 1211 dat_check_mz["mz_diff"] = np.abs(dat_check_mz["mz"].diff()) 1212 1213 # Get minimum mz_diff between mz values in the data 1214 mz_diff_min_raw = dat_check_mz["mz_diff"].min() 1215 1216 # If the minimum mz difference between mz values in the data is less than the minimum mz difference between mz values within a single scan, then the data is not gridded 1217 if mz_diff_min_raw < mz_diff_min: 1218 return False 1219 else: 1220 return True 1221 1222 def grid_data(self, data): 1223 """Grid the data in the mz dimension. 1224 1225 Data must be gridded prior to persistent homology calculations. 1226 1227 Parameters 1228 ---------- 1229 data : DataFrame 1230 The input data containing mz, scan, scan_time, and intensity columns. 
1231 1232 Returns 1233 ------- 1234 DataFrame 1235 The gridded data with mz, scan, scan_time, and intensity columns. 1236 1237 Raises 1238 ------ 1239 ValueError 1240 If gridding fails. 1241 """ 1242 1243 # Calculate the difference between consecutive mz values in a single scan for grid spacing 1244 data_w = data.copy().reset_index(drop=True) 1245 data_w["mz_diff"] = np.abs(data_w["mz"].diff()) 1246 mz_diff_min = data_w.groupby("scan")["mz_diff"].min().min() * 0.99999 1247 1248 # Need high intensity mz values first so they are parents in the output pairs stack 1249 dat_mz = data_w[["mz", "intensity"]].sort_values( 1250 by=["intensity"], ascending=False 1251 ) 1252 dat_mz = dat_mz[["mz"]].drop_duplicates().reset_index(drop=True).copy() 1253 1254 # Construct KD tree 1255 tree = KDTree(dat_mz.mz.values.reshape(-1, 1)) 1256 sdm = tree.sparse_distance_matrix(tree, mz_diff_min, output_type="coo_matrix") 1257 sdm = sparse.triu(sdm, k=1) 1258 sdm.data = np.ones_like(sdm.data) 1259 distances = sdm.tocoo() 1260 pairs = np.stack((distances.row, distances.col), axis=1) 1261 1262 # Cull pairs to just get root 1263 to_drop = [] 1264 while len(pairs) > 0: 1265 root_parents = np.setdiff1d(np.unique(pairs[:, 0]), np.unique(pairs[:, 1])) 1266 id_root_parents = np.isin(pairs[:, 0], root_parents) 1267 children_of_roots = np.unique(pairs[id_root_parents, 1]) 1268 to_drop = np.append(to_drop, children_of_roots) 1269 1270 # Set up pairs array for next iteration by removing pairs with children or parents already dropped 1271 pairs = pairs[~np.isin(pairs[:, 1], to_drop), :] 1272 pairs = pairs[~np.isin(pairs[:, 0], to_drop), :] 1273 dat_mz = dat_mz.reset_index(drop=True).drop(index=np.array(to_drop)) 1274 mz_dat_np = ( 1275 dat_mz[["mz"]] 1276 .sort_values(by=["mz"]) 1277 .reset_index(drop=True) 1278 .values.flatten() 1279 ) 1280 1281 # Sort data by mz and recast mz to nearest value in mz_dat_np 1282 data_w = data_w.sort_values(by=["mz"]).reset_index(drop=True).copy() 1283 data_w["mz_new"] = mz_dat_np[find_closest(mz_dat_np, data_w["mz"].values)] 1284 data_w["mz_diff"] = np.abs(data_w["mz"] - data_w["mz_new"]) 1285 1286 # Rename mz_new as mz; drop mz_diff; groupby scan and mz and sum intensity 1287 new_data_w = data_w.rename(columns={"mz": "mz_orig", "mz_new": "mz"}).copy() 1288 new_data_w = ( 1289 new_data_w.drop(columns=["mz_diff", "mz_orig"]) 1290 .groupby(["scan", "mz"])["intensity"] 1291 .sum() 1292 .reset_index() 1293 ) 1294 new_data_w = ( 1295 new_data_w.sort_values(by=["scan", "mz"], ascending=[True, True]) 1296 .reset_index(drop=True) 1297 .copy() 1298 ) 1299 1300 # Check if grid worked and return 1301 if self.check_if_grid(new_data_w): 1302 return new_data_w 1303 else: 1304 raise ValueError("Gridding failed") 1305 1306 def find_mass_features_ph(self, ms_level=1, grid=True): 1307 """Find mass features within an LCMSBase object using persistent homology. 1308 1309 Assigns the mass_features attribute to the object (a dictionary of LCMSMassFeature objects, keyed by mass feature id) 1310 1311 Parameters 1312 ---------- 1313 ms_level : int, optional 1314 The MS level to use. Default is 1. 1315 grid : bool, optional 1316 If True, will regrid the data before running the persistent homology calculations (after checking if the data is gridded). Default is True. 1317 1318 Raises 1319 ------ 1320 ValueError 1321 If no MS level data is found on the object. 1322 If data is not gridded and grid is False. 1323 1324 Returns 1325 ------- 1326 None, but assigns the mass_features attribute to the object. 
1327 1328 Notes 1329 ----- 1330 This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos 1331 """ 1332 # Check that ms_level is a key in self._ms_uprocessed 1333 if ms_level not in self._ms_unprocessed.keys(): 1334 raise ValueError( 1335 "No MS level " 1336 + str(ms_level) 1337 + " data found, did you instantiate with parser specific to MS level?" 1338 ) 1339 1340 # Get ms data 1341 data = self._ms_unprocessed[ms_level].copy() 1342 1343 # Drop rows with missing intensity values and reset index 1344 data = data.dropna(subset=["intensity"]).reset_index(drop=True) 1345 1346 # Threshold data 1347 dims = ["mz", "scan_time"] 1348 threshold = self.parameters.lc_ms.ph_inten_min_rel * data.intensity.max() 1349 data_thres = data[data["intensity"] > threshold].reset_index(drop=True).copy() 1350 1351 # Check if gridded, if not, grid 1352 gridded_mz = self.check_if_grid(data_thres) 1353 if gridded_mz is False: 1354 if grid is False: 1355 raise ValueError( 1356 "Data are not gridded in mz dimension, try reprocessing with a different params or grid data before running this function" 1357 ) 1358 else: 1359 data_thres = self.grid_data(data_thres) 1360 1361 # Add build factors and add scan_time 1362 data_thres = data_thres.merge(self.scan_df[["scan", "scan_time"]], on="scan") 1363 factors = { 1364 dim: pd.factorize(data_thres[dim], sort=True)[1].astype(np.float32) 1365 for dim in dims 1366 } # this is return a float64 index 1367 1368 # Build indexes 1369 index = { 1370 dim: np.searchsorted(factors[dim], data_thres[dim]).astype(np.float32) 1371 for dim in factors 1372 } 1373 1374 # Smooth data 1375 iterations = self.parameters.lc_ms.ph_smooth_it 1376 smooth_radius = [ 1377 self.parameters.lc_ms.ph_smooth_radius_mz, 1378 self.parameters.lc_ms.ph_smooth_radius_scan, 1379 ] # mz, scan_time smoothing radius (in steps) 1380 1381 index = np.vstack([index[dim] for dim in dims]).T 1382 V = data_thres["intensity"].values 1383 resid = np.inf 1384 for i in range(iterations): 1385 # Previous iteration 1386 V_prev = V.copy() 1387 resid_prev = resid 1388 V = self.sparse_mean_filter(index, V, radius=smooth_radius) 1389 1390 # Calculate residual with previous iteration 1391 resid = np.sqrt(np.mean(np.square(V - V_prev))) 1392 1393 # Evaluate convergence 1394 if i > 0: 1395 # Percent change in residual 1396 test = np.abs(resid - resid_prev) / resid_prev 1397 1398 # Exit criteria 1399 if test <= 0: 1400 break 1401 1402 # Overwrite values 1403 data_thres["intensity"] = V 1404 1405 # Use persistent homology to find regions of interest 1406 pidx, pers = self.sparse_upper_star(index, V) 1407 pidx = pidx[pers > 1] 1408 pers = pers[pers > 1] 1409 1410 # Get peaks 1411 peaks = data_thres.iloc[pidx, :].reset_index(drop=True) 1412 1413 # Add persistence column 1414 peaks["persistence"] = pers 1415 mass_features = peaks.sort_values( 1416 by="persistence", ascending=False 1417 ).reset_index(drop=True) 1418 1419 # Filter by persistence threshold 1420 persistence_threshold = ( 1421 self.parameters.lc_ms.ph_persis_min_rel * data.intensity.max() 1422 ) 1423 mass_features = mass_features.loc[ 1424 mass_features["persistence"] > persistence_threshold, : 1425 ].reset_index(drop=True) 1426 1427 # Rename scan column to apex_scan 1428 mass_features = mass_features.rename( 1429 columns={"scan": "apex_scan", "scan_time": "retention_time"} 1430 ) 1431 1432 # Populate mass_features attribute 1433 self.mass_features = {} 1434 for row in mass_features.itertuples(): 1435 row_dict = 
mass_features.iloc[row.Index].to_dict() 1436 lcms_feature = LCMSMassFeature(self, **row_dict) 1437 self.mass_features[lcms_feature.id] = lcms_feature 1438 1439 if self.parameters.lc_ms.verbose_processing: 1440 print("Found " + str(len(mass_features)) + " initial mass features") 1441 1442 def find_mass_features_ph_centroid(self, ms_level=1): 1443 """Find mass features within an LCMSBase object using persistent homology-type approach but with centroided data. 1444 1445 Parameters 1446 ---------- 1447 ms_level : int, optional 1448 The MS level to use. Default is 1. 1449 1450 Raises 1451 ------ 1452 ValueError 1453 If no MS level data is found on the object. 1454 1455 Returns 1456 ------- 1457 None, but assigns the mass_features attribute to the object. 1458 """ 1459 # Check that ms_level is a key in self._ms_uprocessed 1460 if ms_level not in self._ms_unprocessed.keys(): 1461 raise ValueError( 1462 "No MS level " 1463 + str(ms_level) 1464 + " data found, did you instantiate with parser specific to MS level?" 1465 ) 1466 1467 # Get ms data 1468 data = self._ms_unprocessed[ms_level].copy() 1469 1470 # Drop rows with missing intensity values and reset index 1471 data = data.dropna(subset=["intensity"]).reset_index(drop=True) 1472 1473 # Threshold data 1474 threshold = self.parameters.lc_ms.ph_inten_min_rel * data.intensity.max() 1475 data_thres = data[data["intensity"] > threshold].reset_index(drop=True).copy() 1476 data_thres['persistence'] = data_thres['intensity'] 1477 1478 # Use this as the starting point for the mass features, adding scan_time 1479 mf_df = data_thres 1480 mf_df = mf_df.merge(self.scan_df[["scan", "scan_time"]], on="scan") 1481 1482 # Define tolerances and dimensions for rolling up 1483 tol = [ 1484 self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel, 1485 self.parameters.lc_ms.mass_feature_cluster_rt_tolerance, 1486 ] 1487 relative = [True, False] 1488 dims = ["mz", "scan_time"] 1489 print("here") 1490 mf_df = self.roll_up_dataframe( 1491 df=mf_df, 1492 sort_by="persistence", 1493 dims=dims, 1494 tol=tol, 1495 relative=relative 1496 ) 1497 1498 # Rename scan column to apex_scan 1499 mass_features = mf_df.rename( 1500 columns={"scan": "apex_scan", "scan_time": "retention_time"} 1501 ) 1502 # Sort my persistence and reset index 1503 mass_features = mass_features.sort_values( 1504 by="persistence", ascending=False 1505 ).reset_index(drop=True) 1506 1507 # Populate mass_features attribute 1508 self.mass_features = {} 1509 for row in mass_features.itertuples(): 1510 row_dict = mass_features.iloc[row.Index].to_dict() 1511 lcms_feature = LCMSMassFeature(self, **row_dict) 1512 self.mass_features[lcms_feature.id] = lcms_feature 1513 1514 if self.parameters.lc_ms.verbose_processing: 1515 print("Found " + str(len(mass_features)) + " initial mass features") 1516 1517 def cluster_mass_features(self, drop_children=True, sort_by="persistence"): 1518 """Cluster mass features 1519 1520 Based on their proximity in the mz and scan_time dimensions, priorizies the mass features with the highest persistence. 1521 1522 Parameters 1523 ---------- 1524 drop_children : bool, optional 1525 Whether to drop the mass features that are not cluster parents. Default is True. 1526 sort_by : str, optional 1527 The column to sort the mass features by, this will determine which mass features get rolled up into a parent mass feature. Default is "persistence". 1528 1529 Raises 1530 ------ 1531 ValueError 1532 If no mass features are found. 1533 If too many mass features are found. 
1534 1535 Returns 1536 ------- 1537 None if drop_children is True, otherwise returns a list of mass feature ids that are not cluster parents. 1538 """ 1539 verbose = self.parameters.lc_ms.verbose_processing 1540 1541 if self.mass_features is None: 1542 raise ValueError("No mass features found, run find_mass_features() first") 1543 if len(self.mass_features) > 400000: 1544 raise ValueError( 1545 "Too many mass featuers of interest found, run find_mass_features() with a higher intensity threshold" 1546 ) 1547 dims = ["mz", "scan_time"] 1548 mf_df_og = self.mass_features_to_df() 1549 mf_df = mf_df_og.copy() 1550 1551 tol = [ 1552 self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel, 1553 self.parameters.lc_ms.mass_feature_cluster_rt_tolerance, 1554 ] # mz, in relative; scan_time in minutes 1555 relative = [True, False] 1556 1557 # Roll up mass features based on their proximity in the declared dimensions 1558 mf_df_new = self.roll_up_dataframe( 1559 df=mf_df, 1560 sort_by=sort_by, 1561 dims=dims, 1562 tol=tol, 1563 relative=relative 1564 ) 1565 1566 mf_df["cluster_parent"] = np.where( 1567 np.isin(mf_df.index, mf_df_new.index), True, False 1568 ) 1569 1570 # get mass feature ids of features that are not cluster parents 1571 cluster_daughters = mf_df[~mf_df["cluster_parent"]].index.values 1572 if drop_children is True: 1573 # Drop mass features that are not cluster parents from self 1574 self.mass_features = { 1575 k: v 1576 for k, v in self.mass_features.items() 1577 if k not in cluster_daughters 1578 } 1579 else: 1580 return cluster_daughters
class PHCalculations
Methods for performing calculations related to 2D peak picking via persistent homology on LCMS data.
Notes
This class is intended to be used as a mixin for the LCMSBase class.
Methods
- sparse_mean_filter(idx, V, radius=[0, 1, 1]): Sparse implementation of a mean filter.
- embed_unique_indices(a): Creates an array of indices, sorted by unique element.
- sparse_upper_star(idx, V): Sparse implementation of an upper star filtration.
- roll_up_dataframe(df, sort_by, tol, relative, dims): Subset data by rolling up into apex in appropriate dimensions.
- check_if_grid(data): Check if the data are gridded in mz space.
- grid_data(data): Grid the data in the mz dimension.
- find_mass_features_ph(ms_level=1, grid=True): Find mass features within an LCMSBase object using persistent homology.
- find_mass_features_ph_centroid(ms_level=1): Find mass features using a persistent homology-style roll-up of centroided data.
- cluster_mass_features(drop_children=True, sort_by="persistence"): Cluster regions of interest.
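Because PHCalculations is used as a mixin on LCMSBase, these methods are normally called on a parsed LC-MS object rather than on the class itself. A minimal workflow sketch follows; lcms_obj is a placeholder for an LCMSBase-derived object that has already been populated by one of the CoreMS LC-MS parsers, so this is illustrative rather than a complete, runnable script.

# `lcms_obj` is assumed to be an LCMSBase-derived object with MS1 data loaded.

# 2D peak picking via persistent homology on the unprocessed MS1 data
lcms_obj.find_mass_features_ph(ms_level=1, grid=True)

# Collapse features that are close in m/z and scan_time, keeping the most persistent
lcms_obj.cluster_mass_features(drop_children=True, sort_by="persistence")

# Extract EICs / integrate the features, then deconvolute their MS1 spectra
lcms_obj.integrate_mass_features()
lcms_obj.deconvolute_ms1_mass_features()

print(len(lcms_obj.mass_features), "mass features after clustering")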
sparse_mean_filter(idx, V, radius=[0, 1, 1])
Sparse implementation of a mean filter.
Parameters
- idx (numpy.array): Edge indices for each dimension (MxN).
- V (numpy.array): Array of intensity data (Mx1).
- radius (float or list): Radius of the sparse filter in each dimension. Values less than zero indicate no connectivity in that dimension.
Returns
- numpy.array: Filtered intensities (Mx1).
Notes
This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos. This is a static method.
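To see what the filter computes, here is a small, self-contained sketch (a toy example, not from the package or its tests) that applies sparse_mean_filter to a single row of five grid points; each value is replaced by the mean of itself and its neighbours within one index step.

import numpy as np
from corems.mass_spectra.calc.lc_calc import PHCalculations

# A single row of 5 grid points: index columns are (mz index, scan index)
idx = np.array([[0, 0], [0, 1], [0, 2], [0, 3], [0, 4]])
V = np.array([0.0, 0.0, 10.0, 0.0, 0.0])

# radius=[1, 1]: connectivity of one step in each of the two dimensions
smoothed = PHCalculations.sparse_mean_filter(idx, V, radius=[1, 1])
print(smoothed)  # the spike at index 2 is averaged with its immediate neighbours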
embed_unique_indices(a)
Creates an array of indices, sorted by unique element.
Parameters
- a (numpy.array): Array of unique elements (Mx1).
Returns
- numpy.array: Array of indices (Mx1).
Notes
This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos. This is a static method.
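A quick way to see what the embedding does is to run it on a tiny array with repeated values (a toy example, not from the package): duplicates are made unique by appending a small fractional index, so that downstream nearest-neighbour lookups can tell them apart.

import numpy as np
from corems.mass_spectra.calc.lc_calc import PHCalculations

a = np.array([5, 5, 5, 7])
print(PHCalculations.embed_unique_indices(a))
# -> [5.  5.1 5.2 7. ]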
roll_up_dataframe(df, sort_by, tol, relative, dims)
Subset data by rolling up into apex in appropriate dimensions.
Parameters
- df (pd.DataFrame): The input data containing the "dims" columns and the "sort_by" column.
- sort_by (str): The column to sort the data by; this determines which mass features get rolled up into a parent mass feature (i.e., the mass feature with the highest value in the sort_by column).
- dims (list): A list of dimension names (column names in the DataFrame) to roll up the mass features by.
- tol (list): A list of tolerances for each dimension. The length of the list must match the number of dimensions. Tolerances can be relative (as a fraction of the maximum value in that dimension) or absolute (in the units of that dimension); if relative is True the tolerance is multiplied by the maximum value in that dimension, otherwise it is used as is.
- relative (list): A list of booleans indicating whether the tolerance for each dimension is relative (True) or absolute (False).
Returns
- pd.DataFrame: A DataFrame with only the rolled-up mass features, with the original index and columns.
Raises
- ValueError: If the input data is not a pandas DataFrame, if it does not have a column for each dimension in "dims", or if the lengths of "dims", "tol", and "relative" do not match.
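roll_up_dataframe is the generic clustering step shared by cluster_mass_features and find_mass_features_ph_centroid. Because it is a static method, it can be demonstrated on a small, self-contained DataFrame; the tolerance values below are purely illustrative.

import pandas as pd
from corems.mass_spectra.calc.lc_calc import PHCalculations

# Three candidate features: the first two are within tolerance of each other
# in both m/z (relative) and scan_time (absolute); the third is far away.
features = pd.DataFrame(
    {
        "mz": [200.0000, 200.0002, 350.0000],
        "scan_time": [5.00, 5.05, 12.00],
        "persistence": [1000.0, 400.0, 800.0],
    }
)

rolled = PHCalculations.roll_up_dataframe(
    df=features,
    sort_by="persistence",
    dims=["mz", "scan_time"],
    tol=[5e-6, 0.3],       # 5 ppm in m/z, 0.3 min in scan_time (illustrative values)
    relative=[True, False],
)
print(rolled)
# The 200.0002 feature is rolled up into the more persistent 200.0000 feature.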
sparse_upper_star(idx, V)
Sparse implementation of an upper star filtration.
Parameters
- idx (numpy.array): Edge indices for each dimension (MxN).
- V (numpy.array): Array of intensity data (Mx1).
Returns
- idx (numpy.array): Index of filtered points (Mx1).
- persistence (numpy.array): Persistence of each filtered point (Mx1).
Notes
This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos
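sparse_upper_star couples the grid connectivity with ripser's 0-dimensional persistent homology, so each surviving point corresponds to a local maximum and its persistence measures how far it rises above the saddle that merges it with a taller neighbour. A toy sketch follows; it requires the ripser package and uses a bare PHCalculations() instance purely for illustration, since the method only relies on embed_unique_indices.

import numpy as np
from corems.mass_spectra.calc.lc_calc import PHCalculations

# A single row of grid points with two local maxima (at indices 1 and 4).
idx = np.array([[0, 0], [0, 1], [0, 2], [0, 3], [0, 4], [0, 5]])
V = np.array([1, 9, 2, 1, 6, 1])

pidx, pers = PHCalculations().sparse_upper_star(idx, V)
print(pidx, pers)  # indices of the surviving maxima and their persistence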
check_if_grid(data)
Check if the data are gridded in mz space.
Parameters
- data (DataFrame): DataFrame containing the mass spectrometry data. Needs to have mz and scan columns.
Returns
- bool: True if the data is gridded in the mz direction, False otherwise.
Notes
This function is used within the grid_data function and the find_mass_features function and is not intended to be called directly.
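Although the check is normally called internally, a toy illustration makes the criterion concrete: the data are "gridded" when no two m/z values across the whole dataset sit closer together than the finest m/z spacing seen within a single scan.

import pandas as pd
from corems.mass_spectra.calc.lc_calc import PHCalculations

ph = PHCalculations()

# Two scans sharing the same m/z axis -> gridded
gridded = pd.DataFrame(
    {"scan": [1, 1, 2, 2], "mz": [100.0, 100.01, 100.0, 100.01], "intensity": [5, 6, 7, 8]}
)
print(ph.check_if_grid(gridded))    # True

# Second scan's m/z values fall between the first scan's values -> not gridded
ungridded = pd.DataFrame(
    {"scan": [1, 1, 2, 2], "mz": [100.0, 100.01, 100.002, 100.012], "intensity": [5, 6, 7, 8]}
)
print(ph.check_if_grid(ungridded))  # False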
grid_data(data)
Grid the data in the mz dimension. Data must be gridded prior to persistent homology calculations.
Parameters
- data (DataFrame): The input data containing mz, scan, scan_time, and intensity columns.
Returns
- DataFrame: The gridded data with mz, scan, scan_time, and intensity columns.
Raises
- ValueError: If gridding fails.
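Continuing the toy example, grid_data collapses near-identical m/z values onto a common axis so that check_if_grid passes afterwards. The frame below carries only scan, mz, and intensity columns, which is enough for the gridding step itself; it is illustrative only.

import pandas as pd
from corems.mass_spectra.calc.lc_calc import PHCalculations

ph = PHCalculations()

# m/z values of scan 2 are slightly offset from scan 1, so the data are not gridded
raw = pd.DataFrame(
    {
        "scan": [1, 1, 2, 2],
        "mz": [100.0, 100.01, 100.0001, 100.0101],
        "intensity": [5.0, 6.0, 7.0, 8.0],
    }
)
print(ph.check_if_grid(raw))        # False

regridded = ph.grid_data(raw)
print(ph.check_if_grid(regridded))  # True
print(regridded)                    # nearby m/z values collapsed onto a common axis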
find_mass_features_ph(ms_level=1, grid=True)
Find mass features within an LCMSBase object using persistent homology.
Assigns the mass_features attribute to the object (a dictionary of LCMSMassFeature objects, keyed by mass feature id).
Parameters
- ms_level (int, optional): The MS level to use. Default is 1.
- grid (bool, optional): If True, will regrid the data before running the persistent homology calculations (after checking if the data is gridded). Default is True.
Raises
- ValueError: If no MS level data is found on the object. If data is not gridded and grid is False.
Returns
- None, but assigns the mass_features attribute to the object.
Notes
This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos
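The behaviour of find_mass_features_ph is governed by the lc_ms parameters referenced in the source above. The sketch below shows how they might be set before peak picking; lcms_obj is again a placeholder for a parsed LCMSBase-derived object with MS1 data in _ms_unprocessed, and the numeric values are illustrative only.

# Parameters read by find_mass_features_ph (placeholder values):
lcms_obj.parameters.lc_ms.ph_inten_min_rel = 0.001    # relative intensity threshold
lcms_obj.parameters.lc_ms.ph_persis_min_rel = 0.001   # relative persistence threshold
lcms_obj.parameters.lc_ms.ph_smooth_it = 7            # smoothing iterations
lcms_obj.parameters.lc_ms.ph_smooth_radius_mz = 1     # smoothing radius in m/z grid steps
lcms_obj.parameters.lc_ms.ph_smooth_radius_scan = 1   # smoothing radius in scan steps

lcms_obj.find_mass_features_ph(ms_level=1, grid=True)
for mf_id, mf in list(lcms_obj.mass_features.items())[:5]:
    print(mf_id, mf.mz, mf.retention_time)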
find_mass_features_ph_centroid(ms_level=1)
Find mass features within an LCMSBase object using a persistent homology-type approach but with centroided data.
Parameters
- ms_level (int, optional): The MS level to use. Default is 1.
Raises
- ValueError: If no MS level data is found on the object.
Returns
- None, but assigns the mass_features attribute to the object.
cluster_mass_features(drop_children=True, sort_by="persistence")
Cluster mass features based on their proximity in the mz and scan_time dimensions, prioritizing the mass features with the highest persistence.
Parameters
- drop_children (bool, optional): Whether to drop the mass features that are not cluster parents. Default is True.
- sort_by (str, optional): The column to sort the mass features by; this determines which mass features get rolled up into a parent mass feature. Default is "persistence".
Raises
- ValueError: If no mass features are found. If too many mass features are found.
Returns
- None if drop_children is True, otherwise returns a list of mass feature ids that are not cluster parents.
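When the rolled-up children should be inspected rather than discarded, cluster_mass_features can return their ids instead of dropping them. A sketch, with lcms_obj a placeholder for an object on which mass features have already been found and the tolerance values purely illustrative.

# Clustering tolerances used by cluster_mass_features (placeholder values):
lcms_obj.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel = 5e-6  # 5 ppm
lcms_obj.parameters.lc_ms.mass_feature_cluster_rt_tolerance = 0.2       # minutes

# Keep the children and inspect them instead of dropping them outright
child_ids = lcms_obj.cluster_mass_features(drop_children=False, sort_by="persistence")
print(len(child_ids), "features would be rolled up into more persistent parents")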