corems.encapsulation.factory.processingSetting
__author__ = "Yuri E. Corilo"
__date__ = "Jul 02, 2019"

import dataclasses
import os
import types
from typing import List, Dict, Union, get_args, get_origin

from corems.encapsulation.constant import Atoms, Labels

# GC-MS spectral library used when neither the caller nor the environment names one.
_DEFAULT_GCMS_DATABASE_URL = "sqlite:///db/pnnl_lowres_gcms_compounds.sqlite"


def _concrete_types(annotation):
    """Return the runtime classes accepted by a dataclass field annotation.

    Parameters
    ----------
    annotation : object
        The value stored in ``dataclasses.Field.type``.

    Returns
    -------
    tuple
        Classes usable with ``isinstance`` and callable as constructors.
        ``typing.Dict``/``typing.List`` resolve to ``dict``/``list`` and unions
        such as ``list | tuple`` resolve to each of their members, in order.
        An empty tuple means the annotation cannot be enforced (for example a
        string annotation).
    """
    origin = get_origin(annotation)
    if origin is Union or origin is types.UnionType:
        members = []
        for member in get_args(annotation):
            members.extend(_concrete_types(member))
        return tuple(members)
    if origin is not None:
        # typing.Dict -> dict, typing.List -> list, list[int] -> list
        annotation = origin
    return (annotation,) if isinstance(annotation, type) else ()


def _enforce_field_types(settings):
    """Coerce every dataclass field of ``settings`` to its annotated type, in place.

    A value that is already an instance of the annotated type is left alone.
    Otherwise it is passed to the constructor of the first accepted class, so
    ``float`` fields given ``5`` become ``5.0`` and ``list | tuple`` fields given
    a ``range`` become a ``list``.

    Parameters
    ----------
    settings : object
        A dataclass instance.

    Raises
    ------
    TypeError, ValueError
        Propagated from the type constructor when the value cannot be converted.
    """
    for field in dataclasses.fields(settings):
        accepted = _concrete_types(field.type)
        if not accepted:
            # Nothing we can check against; keep the value as supplied.
            continue
        current = getattr(settings, field.name)
        if not isinstance(current, accepted):
            setattr(settings, field.name, accepted[0](current))


@dataclasses.dataclass
class TransientSetting:
    """Transient processing settings class

    Attributes
    ----------
    implemented_apodization_function : tuple
        Available apodization functions
    apodization_method : str
        Apodization function to use. Hanning is a good default for Fourier transform magnitude mode.
        For absorption mode processing, Half-Sine or Half-Kaiser may be more appropriate.
    number_of_truncations : int
        How many times to truncate the transient prior to Fourier transform
    number_of_zero_fills : int
        How many times to zero fill the transient prior to Fourier transform.
    next_power_of_two : bool
        If True, zero fill to the next power of two after the new length of len(transient)+(number_of_zero_fills*len(transient)).
    kaiser_beta : float
        Beta parameter for Kaiser or Half-Kaiser apodisation function. 0 is rectangular, 5 is similar to Hamming,
        6 is similar to hanning, and 8.6 is similar to Blackman (from numpy docs)

    """

    implemented_apodization_function: tuple = (
        "Hamming",
        "Hanning",
        "Blackman",
        "Full-Sine",
        "Half-Sine",
        "Kaiser",
        "Half-Kaiser",
        "Rectangle",
    )
    apodization_method: str = "Hanning"
    number_of_truncations: int = 0
    number_of_zero_fills: int = 1
    next_power_of_two: bool = False
    kaiser_beta: float = 8.6

    def __post_init__(self):
        # enforce datatype
        _enforce_field_types(self)


@dataclasses.dataclass
class DataInputSetting:
    """Data input settings class

    Attributes
    ----------
    header_translate : dict
        Dictionary with the header labels to be translated to the corems labels.
        For example, {'m/z':'m/z', 'Resolving Power':'Resolving Power', 'Abundance':'Abundance' , 'S/N':'S/N'}
        Entries passed to the constructor are merged over the built-in defaults,
        so they can add new labels or replace a default translation.
    """

    # add to this dict the VALUES to match your labels, THE ORDER WON'T MATTER
    # "column_translate" : {"m/z":"m/z", "Resolving Power":"Resolving Power", "Abundance":"Abundance" , "S/N":"S/N"}
    header_translate: dict = dataclasses.field(default_factory=dict)

    def __post_init__(self):
        # Keep whatever the caller supplied; previously it was thrown away here.
        caller_labels = dict(self.header_translate or {})
        self.header_translate = {
            "m/z": Labels.mz,
            "mOz": Labels.mz,
            "Mass": Labels.mz,
            "Resolving Power": Labels.rp,
            "Res.": Labels.rp,
            "resolution": Labels.rp,
            "Intensity": Labels.abundance,
            "Peak Height": Labels.abundance,
            "I": Labels.abundance,
            "Abundance": Labels.abundance,
            "abs_abu": Labels.abundance,
            "Signal/Noise": Labels.s2n,
            "S/N": Labels.s2n,
            "sn": Labels.s2n,
        }
        # Caller entries win over the defaults on key collisions.
        self.header_translate.update(caller_labels)

    def add_mz_label(self, label):
        """Add a label to the header_translate dictionary to be translated to the corems label for mz."""
        self.header_translate[label] = Labels.mz

    def add_peak_height_label(self, label):
        """Add a label to the header_translate dictionary to be translated to the corems label for peak height."""

        self.header_translate[label] = Labels.abundance

    def add_sn_label(self, label):
        """Add a label to the header_translate dictionary to be translated to the corems label for signal to noise."""
        self.header_translate[label] = Labels.s2n

    def add_resolving_power_label(self, label):
        """Add a label to the header_translate dictionary to be translated to the corems label for resolving power."""
        self.header_translate[label] = Labels.rp


@dataclasses.dataclass
class LiquidChromatographSetting:
    """Liquid chromatograph processing settings class

    The settings ``correct_eic_baseline``, ``eic_buffer_time``, ``ph_smooth_it``,
    ``ph_smooth_radius_mz``, ``ph_smooth_radius_scan``, ``ph_inten_min_rel`` and
    ``ph_persis_min_rel`` are keyword-only constructor arguments.

    Attributes
    ----------
    scans : list or tuple, optional
        List of select scan to average or a tuple containing the range to average.
        Default is (-1, -1).
    eic_tolerance_ppm : float, optional
        Mass tolerance in ppm for extracted ion chromatogram peak detection.
        Default is 5.
    correct_eic_baseline : bool, optional
        If True, correct the baseline of the extracted ion chromatogram.
        Default is True.
    smooth_window : int, optional
        Window size for smoothing the ion chromatogram (extracted or total).
        Default is 5.
    smooth_method : str, optional
        Smoothing method to use. See implemented_smooth_method for options.
        Default is 'savgol'.
    implemented_smooth_method : tuple, optional
        Smoothing methods that can be implemented.
        Default is ('savgol', 'hanning', 'blackman', 'bartlett', 'flat', 'boxcar').
    savgol_pol_order : int, optional
        Polynomial order for Savitzky-Golay smoothing.
        Default is 2.
    consecutive_scan_min : int, optional
        Minimum number of consecutive scans to consider for peak detection.
        Default is 0 for backwards compatibility, but a value of 3 is recommended.
    peak_height_max_percent : float, optional
        1-100 % used for baseline detection use 0.1 for second_derivative and 10 for other methods.
        Default is 10.
    peak_max_prominence_percent : float, optional
        1-100 % used for baseline detection.
        Default is 1.
    peak_derivative_threshold : float, optional
        Threshold for defining derivative crossing.
        Default is 0.0005.
    min_peak_datapoints : float, optional
        Minimum data point to define a chromatographic peak.
        Default is 5.
    noise_threshold_method : str, optional
        Method for detecting noise threshold.
        Default is 'manual_relative_abundance'.
    noise_threshold_methods_implemented : tuple, optional
        Methods for detected noise threshold that can be implemented.
        Default is ('auto_relative_abundance', 'manual_relative_abundance', 'second_derivative').
    peak_height_min_percent : float, optional
        0-100 % used for peak detection.
        Default is 0.1.
    eic_signal_threshold : float, optional
        0-100 % used for extracted ion chromatogram peak detection.
        Default is 0.01.
    eic_buffer_time : float, optional
        Buffer time to add to the start and end of the plot of the extracted ion chromatogram, in minutes.
        Default is 1.5.
    dispersity_index_window : float, optional
        Dispersity index window size, in minutes.
        Default is 3.0.
    noise_window_factor : float, optional
        Factor to determine noise estimation window size relative to peak width.
        Larger values use wider windows for noise estimation.
        For example, a value of 2.0 uses a window size equal to twice the peak width
        (depending on its start and end scans) on each side.
        Called within the LCMSMassFeature.calc_noise_score() method.
        Default is 2.0.
    remove_redundant_mass_features : bool, optional
        If True, remove redundant mass features that are likely contaminants based on
        their m/z values and scan frequency.
        Especially useful for HILIC data where signals do not return to baseline between peaks
        or for data with significant background noise.
        Called within the LC_Calculations.find_mass_features() method.
        Default is False.
    redundant_scan_frequency_min : float, optional
        Minimum fraction of scans that must contain the m/z to be considered a likely
        noise/contaminant when using remove_redundant_mass_features.
        Default is 0.1 (10% of scans).
    redundant_feature_retain_n : int, optional
        Number of features to retain in each group when using remove_redundant_mass_features.
        Default is 3.
    remove_mass_features_by_peak_metrics : bool, optional
        If True, remove mass features based on their peak metrics such as S/N, Gaussian similarity,
        dispersity index, and noise score.
        Called within the LC_Calculations.add_peak_metrics() method.
        Default is False.
    mass_feature_attribute_filter_dict : dict, optional
        Dictionary specifying filtering criteria for mass feature attributes.
        Each key is an attribute name, and each value is a dict with 'value' and 'operator' keys.

        Structure: {attribute_name: {'value': threshold, 'operator': comparison}}

        Available operators:
        - '>' or 'greater': Keep features where attribute > threshold
        - '<' or 'less': Keep features where attribute < threshold
        - '>=' or 'greater_equal': Keep features where attribute >= threshold
        - '<=' or 'less_equal': Keep features where attribute <= threshold

        Examples:
        {
            'noise_score_max': {'value': 0.5, 'operator': '>'},  # Keep if noise_score_max > 0.5
            'dispersity_index': {'value': 0.1, 'operator': '<'},  # Keep if dispersity_index < 0.1
            'gaussian_similarity': {'value': 0.7, 'operator': '>='}  # Keep if gaussian_similarity >= 0.7
        }

        Available attributes include: 'noise_score', 'noise_score_min', 'noise_score_max',
        'gaussian_similarity', 'tailing_factor', 'dispersity_index', 'half_height_width', 'intensity'.
        Default is {"noise_score_max": {"value": 0.8, "operator": ">="},"noise_score_min": {"value": 0.5, "operator": ">="}},
    peak_picking_method : str, optional
        Peak picking method to use. See implemented_peak_picking_methods for options.
        Default is 'persistent homology'.
    implemented_peak_picking_methods : tuple, optional
        Peak picking methods that can be implemented.
        Default is ('persistent homology', 'centroided_persistent_homology').
    ph_smooth_it : int, optional
        Number of iterations to use for smoothing prior to finding mass features.
        Used only for "persistent homology" peak picking method.
        Called within the PHCalculations.find_mass_features_ph() method.
        Default is 1.
    ph_smooth_radius_mz : int, optional
        Radius in m/z steps (not daltons) for smoothing prior to finding mass features.
        Used only for "persistent homology" peak picking method.
        Called within the PHCalculations.find_mass_features_ph() method.
        Default is 0.
    ph_smooth_radius_scan : int, optional
        Radius in scan steps for smoothing prior to finding mass features.
        Used only for "persistent homology" peak picking method.
        Called within the PHCalculations.find_mass_features_ph() method.
        Default is 1.
    ph_inten_min_rel : float, optional
        Relative minimum intensity to use for finding mass features for persistent homology.
        Used only for "persistent homology" peak picking method.
        Calculated as a fraction of the maximum intensity of the unprocessed profile data (mz, scan).
        Called within the PH_Calculations.find_mass_features() method.
        Default is 0.001.
    ph_persis_min_rel : float, optional
        Relative minimum persistence for retaining mass features.
        Used for both "persistent homology" and "centroided_persistent_homology" peak picking methods.
        Calculated as a fraction of the maximum intensity of the unprocessed profile data (mz, scan).
        Should be greater than or equal to ph_inten_min_rel.
        Called within the PH_Calculations.find_mass_features() method.
        Default is 0.001.
    mass_feature_cluster_mz_tolerance_rel : float, optional
        Relative m/z tolerance to use for clustering mass features.
        Used for both "persistent homology" and "centroided_persistent_homology"
        peak picking methods.
        Called with the PHCalculations.cluster_mass_features() and the
        LCCalculations.deconvolute_ms1_mass_features() methods.
        Default is 5e-6 (5 ppm).
    mass_feature_cluster_rt_tolerance : float, optional
        Retention time tolerance to use for clustering mass features, in minutes.
        Used for both "persistent homology" and "centroided_persistent_homology"
        peak picking methods.
        Called with the PHCalculations.cluster_mass_features() and the
        LCCalculations.deconvolute_ms1_mass_features() methods.
        Default is 0.3.
    ms1_scans_to_average : int, optional
        Number of MS1 scans to average for mass-feature associated m/zs.
        Called within the LCMSBase.add_associated_ms1() method.
        Default is 1.
    ms1_deconvolution_corr_min : float, optional
        Minimum correlation to use for deconvoluting MS1 mass features.
        Called within the LCCalculations.deconvolute_ms1_mass_features() method.
        Default is 0.8.
    ms2_dda_rt_tolerance : float, optional
        Retention time tolerance to use for associating MS2 spectra to mass features, in minutes.
        Called within the LCMSBase.add_associated_ms2_dda() method.
        Default is 0.15.
    ms2_dda_mz_tolerance : float, optional
        Mass tolerance to use for associating MS2 spectra to mass features.
        Called within the LCMSBase.add_associated_ms2_dda() method.
        Default is 0.05.
    ms2_min_fe_score : float, optional
        Minimum flash entropy for retaining MS2 annotations.
        Called within the LCMSSpectralSearch.fe_search() method.
        Default is 0.2.
    search_as_lipids : bool, optional
        If True, prepare the database for lipid searching.
        Called within the LCMSSpectralSearch.fe_prep_search_db() method.
        Default is False.
    include_fragment_types : bool, optional
        If True, include fragment types in the database.
        Called within the LCMSSpectralSearch.fe_search() and related methods.
        Default is False.
    export_profile_spectra : bool, optional
        If True, export profile spectra data.
        Default is False.
    export_eics : bool, optional
        If True, export extracted ion chromatograms.
        Default is True.
    export_unprocessed_ms1 : bool, optional
        If True, export unprocessed MS1 data.
        Default is False.
    export_only_relevant_mass_spectra : bool, optional
        If True, export only mass spectra associated with detected mass features:
        specifically the apex MS1 scan for each mass feature and the best MS2 scan
        for each mass feature (if available). If False, export all mass spectra.
        This parameter reduces HDF5 file size by excluding non-feature spectra.
        Default is False (backwards compatible - exports all spectra).
    verbose_processing : bool, optional
        If True, print verbose processing information.
        Default is True.
    """

    scans: list | tuple = (-1, -1)

    # Parameters used for generating EICs and performing 1D peak picking and EIC/TIC smoothing
    eic_tolerance_ppm: float = 5
    # Annotated so it is a real dataclass field (constructor argument, asdict, fields);
    # keyword-only so the positional order of the pre-existing arguments is unchanged.
    correct_eic_baseline: bool = dataclasses.field(default=True, kw_only=True)
    smooth_window: int = 5
    smooth_method: str = "savgol"
    implemented_smooth_method: tuple = (
        "savgol",
        "hanning",
        "blackman",
        "bartlett",
        "flat",
        "boxcar",
    )
    savgol_pol_order: int = 2
    consecutive_scan_min: int = 0
    peak_height_max_percent: float = 10
    peak_max_prominence_percent: float = 1
    peak_derivative_threshold: float = 0.0005
    min_peak_datapoints: float = 5
    noise_threshold_method: str = "manual_relative_abundance"
    noise_threshold_methods_implemented: tuple = (
        "auto_relative_abundance",
        "manual_relative_abundance",
        "second_derivative",
    )
    peak_height_min_percent: float = 0.1
    eic_signal_threshold: float = 0.01
    eic_buffer_time: float = dataclasses.field(default=1.5, kw_only=True)
    dispersity_index_window: float = 3.0  # minutes
    noise_window_factor: float = 2.0  # times the peak width for determining SN for EIC

    # Parameters used for filtering mass features after peak picking
    remove_redundant_mass_features: bool = False
    redundant_scan_frequency_min: float = 0.1
    redundant_feature_retain_n: int = 3
    remove_mass_features_by_peak_metrics: bool = False
    # note that this is a dictionary of dictionaries and set in __post_init__ instead of here
    mass_feature_attribute_filter_dict: Dict = dataclasses.field(default_factory=dict)

    # Parameters used for 2D peak picking
    peak_picking_method: str = "persistent homology"
    implemented_peak_picking_methods: tuple = (
        "persistent homology",
        "centroided_persistent_homology",
    )

    # Parameters used in persistent homology calculations
    # (keyword-only for the same reason as correct_eic_baseline above)
    ph_smooth_it: int = dataclasses.field(default=1, kw_only=True)
    ph_smooth_radius_mz: int = dataclasses.field(default=0, kw_only=True)
    ph_smooth_radius_scan: int = dataclasses.field(default=1, kw_only=True)
    ph_inten_min_rel: float = dataclasses.field(default=0.001, kw_only=True)
    ph_persis_min_rel: float = dataclasses.field(default=0.001, kw_only=True)

    # Parameters used to cluster mass features
    mass_feature_cluster_mz_tolerance_rel: float = 5e-6
    mass_feature_cluster_rt_tolerance: float = 0.3

    # Parameters used in associating MS1 and MS2 spectra to LCMS mass features and deconvoluting MS1 mass features
    ms1_scans_to_average: int = 1
    ms1_deconvolution_corr_min: float = 0.8
    ms2_dda_rt_tolerance: float = 0.15
    ms2_dda_mz_tolerance: float = 0.05

    # Parameters used for flash entropy searching and database preparation
    ms2_min_fe_score: float = 0.2
    search_as_lipids: bool = False
    include_fragment_types: bool = False

    # Parameters used for saving the data
    export_profile_spectra: bool = False
    export_eics: bool = True
    export_unprocessed_ms1: bool = False
    export_only_relevant_mass_spectra: bool = False

    # Parameters used for verbose processing
    verbose_processing: bool = True

    def __post_init__(self):
        # Set default values for mass_feature_attribute_filter_dict if empty
        if not self.mass_feature_attribute_filter_dict:
            self.mass_feature_attribute_filter_dict = {
                "noise_score_max": {"value": 0.8, "operator": ">="},
                "noise_score_min": {"value": 0.5, "operator": ">="},
            }

        # enforce datatype
        _enforce_field_types(self)


@dataclasses.dataclass
class MassSpectrumSetting:
    """Mass spectrum processing settings class

    Attributes
    ----------
    noise_threshold_method : str, optional
        Method for detecting noise threshold. Default is 'log'.
    noise_threshold_methods_implemented : tuple, optional
        Methods for detected noise threshold that can be implemented. Default is ('minima', 'signal_noise', 'relative_abundance', 'absolute_abundance', 'log').
    noise_threshold_min_std : int, optional
        Minimum value for noise thresholding when using 'minima' noise threshold method. Default is 6.
    noise_threshold_min_s2n : float, optional
        Minimum value for noise thresholding when using 'signal_noise' noise threshold method. Default is 4.
    noise_threshold_min_relative_abundance : float, optional
        Minimum value for noise thresholding when using 'relative_abundance' noise threshold method. Note that this is a percentage value. Default is 6 (6%).
    noise_threshold_absolute_abundance : float, optional
        Minimum value for noise thresholding when using 'absolute_abundance' noise threshold method. Default is 1_000_000.
    noise_threshold_log_nsigma : int, optional
        Number of standard deviations to use when using 'log' noise threshold method. Default is 6.
    noise_threshold_log_nsigma_corr_factor : float, optional
        Correction factor for log noise threshold method. Default is 0.463.
    noise_threshold_log_nsigma_bins : int, optional
        Number of bins to use for histogram when using 'log' noise threshold method. Default is 500.
    noise_min_mz : float, optional
        Minimum m/z to use for noise thresholding. Default is 50.0.
    noise_max_mz : float, optional
        Maximum m/z to use for noise thresholding. Default is 1200.0.
    min_picking_mz : float, optional
        Minimum m/z to use for peak picking. Default is 50.0.
    max_picking_mz : float, optional
        Maximum m/z to use for peak picking. Default is 1200.0.
    picking_point_extrapolate : int, optional
        How many data points (in each direction) to extrapolate the mz axis and 0 pad the abundance axis. Default is 3.
        Recommend 3 for reduced profile data or if peak picking faults
    calib_minimize_method : str, optional
        Minimization method to use for calibration. Default is 'Powell'.
    calib_pol_order : int, optional
        Polynomial order to use for calibration. Default is 2.
    max_calib_ppm_error : float, optional
        Maximum ppm error to use for calibration. Default is 1.0.
    min_calib_ppm_error : float, optional
        Minimum ppm error to use for calibration. Default is -1.0.
    calib_sn_threshold : float, optional
        Signal to noise threshold to use for calibration. Default is 2.0.
    calibration_ref_match_method: string, optional
        Method for matching reference masses with measured masses for recalibration. Default is 'legacy'.
    calibration_ref_match_method_implemented : tuple, optional
        Reference matching methods that can be selected. Default is ('legacy', 'merged').
    calibration_ref_match_tolerance: float, optional
        If using the new method for calibration reference mass matching, this tolerance is the initial matching tolerance. Default is 0.003
    calibration_ref_match_std_raw_error_limit : float, optional
        Default is 1.5. NOTE(review): presumably a limit on the standard deviation of the
        raw error during reference matching; confirm against the calibration code.
    do_calibration : bool, optional
        If True, perform calibration. Default is True.
    verbose_processing : bool, optional
        If True, print verbose processing information. Default is True.
    """

    noise_threshold_method: str = "log"

    noise_threshold_methods_implemented: tuple = (
        "minima",
        "signal_noise",
        "relative_abundance",
        "absolute_abundance",
        "log",
    )

    noise_threshold_min_std: int = 6  # when using 'minima' method

    noise_threshold_min_s2n: float = 4  # when using 'signal_noise' method

    noise_threshold_min_relative_abundance: float = (
        6  # from 0-100, when using 'relative_abundance' method
    )

    noise_threshold_absolute_abundance: float = (
        1_000_000  # when using 'absolute_abundance' method
    )

    noise_threshold_log_nsigma: int = 6  # when using 'log' method
    noise_threshold_log_nsigma_corr_factor: float = 0.463  # mFT is 0.463, aFT is 1.0
    noise_threshold_log_nsigma_bins: int = 500  # bins for the histogram for the noise

    noise_min_mz: float = 50.0
    noise_max_mz: float = 1200.0

    min_picking_mz: float = 50.0
    max_picking_mz: float = 1200.0

    # How many data points (in each direction) to extrapolate the mz axis and 0 pad the abundance axis
    # This will fix peak picking at spectrum limit issues
    #  0 to keep normal behaviour, typical value 3 to fix
    picking_point_extrapolate: int = 3

    calib_minimize_method: str = "Powell"
    calib_pol_order: int = 2
    max_calib_ppm_error: float = 1.0
    min_calib_ppm_error: float = -1.0
    calib_sn_threshold: float = 2.0
    calibration_ref_match_method: str = "legacy"
    calibration_ref_match_method_implemented: tuple = ("legacy", "merged")
    calibration_ref_match_tolerance: float = 0.003
    calibration_ref_match_std_raw_error_limit: float = 1.5
    # calib_ref_mzs: list = [0]

    do_calibration: bool = True
    verbose_processing: bool = True

    def __post_init__(self):
        # enforce datatype
        _enforce_field_types(self)


@dataclasses.dataclass
class MassSpecPeakSetting:
    """Mass spectrum peak processing settings class

    Attributes
    ----------
    kendrick_base : Dict, optional
        Dictionary specifying the elements and their counts in the Kendrick base.
        Defaults to {'C': 1, 'H': 2}.
    kendrick_rounding_method : str, optional
        Method for calculating the nominal Kendrick mass. Valid values are 'floor', 'ceil', or 'round'.
        Defaults to 'floor'.
    implemented_kendrick_rounding_methods : tuple
        Tuple of valid rounding methods for calculating the nominal Kendrick mass.
        Defaults to ('floor', 'ceil', 'round').
    peak_derivative_threshold : float, optional
        Threshold for defining derivative crossing. Should be a value between 0 and 1.
        Defaults to 0.0.
    peak_min_prominence_percent : float, optional
        Minimum prominence percentage used for peak detection. Should be a value between 1 and 100.
        Defaults to 0.1.
    min_peak_datapoints : float, optional
        Minimum number of data points used for peak detection. Should be a value between 0 and infinity.
        Defaults to 5.
    peak_max_prominence_percent : float, optional
        Maximum prominence percentage used for baseline detection. Should be a value between 1 and 100.
        Defaults to 0.1.
    peak_height_max_percent : float, optional
        Maximum height percentage used for baseline detection. Should be a value between 1 and 100.
        Defaults to 10.
    legacy_resolving_power : bool, optional
        Flag indicating whether to use the legacy (CoreMS v1) resolving power calculation.
        Defaults to True.
    legacy_centroid_polyfit : bool, optional
        Use legacy (numpy polyfit) to fit centroid
        Default false.
    """

    kendrick_base: Dict = dataclasses.field(default_factory=dict)

    kendrick_rounding_method: str = "floor"  # 'floor', 'ceil' or 'round' are valid methods for calculating nominal kendrick mass

    implemented_kendrick_rounding_methods: tuple = ("floor", "ceil", "round")

    peak_derivative_threshold: float = 0.0  # define derivative crossing threshold 0-1

    peak_min_prominence_percent: float = 0.1  # 1-100 % used for peak detection

    min_peak_datapoints: float = 5  # 0-inf used for peak detection

    peak_max_prominence_percent: float = 0.1  # 1-100 % used for baseline detection

    peak_height_max_percent: float = 10  # 1-100 % used for baseline detection

    legacy_resolving_power: bool = (
        True  # Use the legacy (CoreMS v1) resolving power calculation (True)
    )

    legacy_centroid_polyfit: bool = False

    def __post_init__(self):
        # default to CH2
        if not self.kendrick_base:
            self.kendrick_base = {"C": 1, "H": 2}
        # enforce datatype
        _enforce_field_types(self)


@dataclasses.dataclass
class GasChromatographSetting:
    """Gas chromatograph processing settings class

    Attributes
    ----------
    use_deconvolution : bool, optional
        If True, use deconvolution. Default is False.
    implemented_smooth_method : tuple, optional
        Smoothing methods that can be implemented. Default is ('savgol', 'hanning', 'blackman', 'bartlett', 'flat', 'boxcar').
    smooth_window : int, optional
        Window size for smoothing the ion chromatogram. Default is 5.
    smooth_method : str, optional
        Smoothing method to use. Default is 'savgol'. Other options are 'hanning', 'blackman', 'bartlett', 'flat', 'boxcar'.
    savgol_pol_order : int, optional
        Polynomial order for Savitzky-Golay smoothing. Default is 2.
    peak_derivative_threshold : float, optional
        Threshold for defining derivative crossing. Should be a value between 0 and 1.
        Defaults to 0.0005.
    peak_height_max_percent : float, optional
        Maximum height percentage used for baseline detection. Should be a value between 1 and 100.
        Defaults to 10.
    peak_max_prominence_percent : float, optional
        Maximum prominence percentage used for baseline detection. Should be a value between 1 and 100.
        Defaults to 1.
    min_peak_datapoints : float, optional
        Minimum number of data points used for peak detection. Should be a value between 0 and infinity.
        Defaults to 5.
    max_peak_width : float, optional
        Maximum peak width used for peak detection. Should be a value between 0 and infinity.
        Defaults to 0.1.
    noise_threshold_method : str, optional
        Method for detecting noise threshold. Default is 'manual_relative_abundance'.
    noise_threshold_methods_implemented : tuple, optional
        Methods for detected noise threshold that can be implemented. Default is ('auto_relative_abundance', 'manual_relative_abundance', 'second_derivative').
    std_noise_threshold : int, optional
        Default is 3.
    peak_height_min_percent : float, optional
        0-100 % used for peak detection. Default is 0.1.
    peak_min_prominence_percent : float, optional
        0-100 % used for peak detection. Default is 0.1.
    eic_signal_threshold : float, optional
        0-100 % used for extracted ion chromatogram peak detection. Default is 0.01.
    max_rt_distance : float, optional
        Maximum distance allowance for hierarchical cluster, in minutes. Default is 0.025.
    verbose_processing : bool, optional
        If True, print verbose processing information. Default is True.
    """

    use_deconvolution: bool = False

    implemented_smooth_method: tuple = (
        "savgol",
        "hanning",
        "blackman",
        "bartlett",
        "flat",
        "boxcar",
    )

    smooth_window: int = 5

    smooth_method: str = "savgol"

    savgol_pol_order: int = 2

    peak_derivative_threshold: float = 0.0005

    peak_height_max_percent: float = 10  # 1-100 % used for baseline detection use 0.1 for second_derivative and 10 for other methods

    peak_max_prominence_percent: float = 1  # 1-100 % used for baseline detection

    min_peak_datapoints: float = 5

    max_peak_width: float = 0.1

    noise_threshold_method: str = "manual_relative_abundance"

    noise_threshold_methods_implemented: tuple = (
        "auto_relative_abundance",
        "manual_relative_abundance",
        "second_derivative",
    )

    std_noise_threshold: int = 3

    peak_height_min_percent: float = 0.1  # 0-100 % used for peak detection

    peak_min_prominence_percent: float = 0.1  # 0-100 % used for peak detection

    eic_signal_threshold: float = (
        0.01  # 0-100 % used for extracted ion chromatogram peak detection
    )

    max_rt_distance: float = (
        0.025  # minutes, max distance allowance hierarchical cluster
    )

    verbose_processing: bool = True

    def __post_init__(self):
        # enforce datatype
        _enforce_field_types(self)


@dataclasses.dataclass
class CompoundSearchSettings:
    """Settings for compound search

    Attributes
    ----------
    url_database : str, optional
        URL for the database. A value passed to the constructor is used as given.
        When none is passed, the SPECTRAL_GCMS_DATABASE_URL environment variable is used
        if set, otherwise 'sqlite:///db/pnnl_lowres_gcms_compounds.sqlite'.
    ri_search_range : float, optional
        Retention index search range. Default is 35.
    rt_search_range : float, optional
        Retention time search range, in minutes. Default is 1.0.
    correlation_threshold : float, optional
        Threshold for correlation for spectral similarity. Default is 0.5.
    score_threshold : float, optional
        Threshold for composite score. Default is 0.0.
    ri_spacing : float, optional
        Retention index spacing. Default is 200.
    ri_std : float, optional
        Retention index standard deviation. Default is 3.
    ri_calibration_compound_names : list, optional
        List of compound names to use for retention index calibration. A non-empty list passed
        to the constructor is kept. Default is ['Methyl Caprylate', 'Methyl Caprate', 'Methyl Pelargonate', 'Methyl Laurate', 'Methyl Myristate', 'Methyl Palmitate', 'Methyl Stearate', 'Methyl Eicosanoate', 'Methyl Docosanoate', 'Methyl Linocerate', 'Methyl Hexacosanoate', 'Methyl Octacosanoate', 'Methyl Triacontanoate'].
    exploratory_mode : bool, optional
        If True, calculate and export all spectral similarity methods. Default is False.
    score_methods : tuple, optional
        Default is ('highest_sim_score', 'highest_ss').
    output_score_method : str, optional
        Default is 'All'.

    """

    # No credentials belong in source: deployments supply the real URL through the
    # constructor or the SPECTRAL_GCMS_DATABASE_URL environment variable.
    url_database: str = _DEFAULT_GCMS_DATABASE_URL

    ri_search_range: float = 35

    rt_search_range: float = 1.0  # used for retention index calibration

    correlation_threshold: float = 0.5  # used for calibration, spectral similarity

    score_threshold: float = 0.0

    ri_spacing: float = 200

    ri_std: float = 3  # in standard deviation

    ri_calibration_compound_names: List = dataclasses.field(default_factory=list)

    # calculates and export all spectral similarity methods
    exploratory_mode: bool = False

    score_methods: tuple = ("highest_sim_score", "highest_ss")

    output_score_method: str = "All"

    def __post_init__(self):
        # Resolve the database URL only when the caller did not choose one;
        # an explicit constructor argument must not be overwritten.
        if not self.url_database or self.url_database == _DEFAULT_GCMS_DATABASE_URL:
            self.url_database = os.getenv(
                "SPECTRAL_GCMS_DATABASE_URL",
                _DEFAULT_GCMS_DATABASE_URL,
            )

        # enforce datatype
        _enforce_field_types(self)

        # Fall back to the built-in calibration standards only when none were supplied.
        if not self.ri_calibration_compound_names:
            self.ri_calibration_compound_names = [
                "Methyl Caprylate",
                "Methyl Caprate",
                "Methyl Pelargonate",
                "Methyl Laurate",
                "Methyl Myristate",
                "Methyl Palmitate",
                "Methyl Stearate",
                "Methyl Eicosanoate",
                "Methyl Docosanoate",
                "Methyl Linocerate",
                "Methyl Hexacosanoate",
                "Methyl Octacosanoate",
                "Methyl Triacontanoate",
            ]


class MolecularLookupDictSettings:
    """Settings for molecular searching

    These are used to generate the database entries, do not change.

    Attributes
    ----------
    usedAtoms : dict, optional
        Dictionary of atoms and ranges. Default is {'C': (1, 90), 'H': (4, 200), 'O': (0, 12), 'N': (0, 0), 'S': (0, 0), 'P': (0, 0), 'Cl': (0, 0)}.
    min_mz : float, optional
        Minimum m/z to use for searching. Default is 50.0.
    max_mz : float, optional
        Maximum m/z to use for searching. Default is 1200.0.
    min_dbe : float, optional
        Minimum double bond equivalent to use for searching. Default is 0.
    max_dbe : float, optional
        Maximum double bond equivalent to use for searching. Default is 50.
    use_pah_line_rule : bool, optional
        If True, use the PAH line rule. Default is False.
    isRadical : bool, optional
        If True, search for radical ions. Default is True.
    isProtonated : bool, optional
        If True, search for protonated ions. Default is True.
    url_database : str, optional
        URL for the database. Default is None.
    db_jobs : int, optional
        Number of jobs to use for database queries. Default is 1.
    used_atom_valences : dict, optional
        Dictionary of atoms and valences. Default is {'C': 4, '13C': 4, 'H': 1, 'O': 2, '18O': 2, 'N': 3, 'S': 2, '34S': 2, 'P': 3, 'Cl': 1, '37Cl': 1, 'Br': 1, 'Na': 1, 'F': 1, 'K': 0}.

    """

    ### DO NOT CHANGE IT!
These are used to generate the database entries 815 816 ### DO change when creating a new application database 817 818 ### FOR search settings runtime and database query check use the MolecularFormulaSearchSettings class below 819 820 ### C, H, N, O, S and P atoms are ALWAYS needed at usedAtoms 821 ### if you don't want to include one of those atoms set the max and min at 0 822 ### you can include any atom listed at Atoms class inside encapsulation.settings.constants module 823 ### make sure to include the selected covalence at the used_atoms_valences when adding new atoms 824 ### NOTE : Adducts atoms have zero covalence 825 ### NOTE : Not using static variable because this class is distributed using multiprocessing 826 def __init__(self): 827 self.usedAtoms = { 828 "C": (1, 90), 829 "H": (4, 200), 830 "O": (0, 12), 831 "N": (0, 0), 832 "S": (0, 0), 833 "P": (0, 0), 834 "Cl": (0, 0), 835 } 836 837 self.min_mz = 50 838 839 self.max_mz = 1200 840 841 self.min_dbe = 0 842 843 self.max_dbe = 50 844 845 # overwrites the dbe limits above to DBE = (C + heteroatoms) * 0.9 846 self.use_pah_line_rule = False 847 848 self.isRadical = True 849 850 self.isProtonated = True 851 852 self.url_database = None 853 854 self.db_jobs = 1 855 856 self.used_atom_valences = { 857 "C": 4, 858 "13C": 4, 859 "H": 1, 860 "O": 2, 861 "18O": 2, 862 "N": 3, 863 "S": 2, 864 "34S": 2, 865 "P": 3, 866 "Cl": 1, 867 "37Cl": 1, 868 "Br": 1, 869 "Na": 1, 870 "F": 1, 871 "K": 0, 872 } 873 874 875@dataclasses.dataclass 876class MolecularFormulaSearchSettings: 877 """Settings for molecular searching 878 879 Attributes 880 ---------- 881 use_isotopologue_filter : bool, optional 882 If True, use isotopologue filter. Default is False. 883 isotopologue_filter_threshold : float, optional 884 Threshold for isotopologue filter. Default is 33. 885 isotopologue_filter_atoms : tuple, optional 886 Tuple of atoms to use for isotopologue filter. Default is ('Cl', 'Br'). 
887 use_runtime_kendrick_filter : bool, optional 888 If True, use runtime Kendrick filter. Default is False. 889 use_min_peaks_filter : bool, optional 890 If True, use minimum peaks filter. Default is True. 891 min_peaks_per_class : int, optional 892 Minimum number of peaks per class. Default is 15. 893 url_database : str, optional 894 URL for the database. Default is 'postgresql+psycopg2://coremsappdb:coremsapppnnl@localhost:5432/coremsapp'. 895 db_jobs : int, optional 896 Number of jobs to use for database queries. Default is 3. 897 db_chunk_size : int, optional 898 Chunk size to use for database queries. Default is 300. 899 ion_charge : int, optional 900 Ion charge. Default is -1. 901 min_hc_filter : float, optional 902 Minimum hydrogen to carbon ratio. Default is 0.3. 903 max_hc_filter : float, optional 904 Maximum hydrogen to carbon ratio. Default is 3. 905 min_oc_filter : float, optional 906 Minimum oxygen to carbon ratio. Default is 0.0. 907 max_oc_filter : float, optional 908 Maximum oxygen to carbon ratio. Default is 1.2. 909 min_op_filter : float, optional 910 Minimum oxygen to phosphorous ratio. Default is 2. 911 use_pah_line_rule : bool, optional 912 If True, use the PAH line rule. Default is False. 913 min_dbe : float, optional 914 Minimum double bond equivalent to use for searching. Default is 0. 915 max_dbe : float, optional 916 Maximum double bond equivalent to use for searching. Default is 40. 917 mz_error_score_weight : float, optional 918 Weight for m/z error score to contribute to composite score. Default is 0.6. 919 isotopologue_score_weight : float, optional 920 Weight for isotopologue score to contribute to composite score. Default is 0.4. 921 adduct_atoms_neg : tuple, optional 922 Tuple of atoms to use in negative polarity. Default is ('Cl', 'Br'). 923 adduct_atoms_pos : tuple, optional 924 Tuple of atoms to use in positive polarity. Default is ('Na', 'K'). 
925 score_methods : tuple, optional 926 Tuple of score method that can be implemented. 927 Default is ('S_P_lowest_error', 'N_S_P_lowest_error', 'lowest_error', 'prob_score', 'air_filter_error', 'water_filter_error', 'earth_filter_error'). 928 score_method : str, optional 929 Score method to use. Default is 'prob_score'. Options are 'S_P_lowest_error', 'N_S_P_lowest_error', 'lowest_error', 'prob_score', 'air_filter_error', 'water_filter_error', 'earth_filter_error'. 930 output_min_score : float, optional 931 Minimum score for output. Default is 0.1. 932 output_score_method : str, optional 933 Score method to use for output. Default is 'All Candidates'. 934 isRadical : bool, optional 935 If True, search for radical ions. Default is False. 936 isProtonated : bool, optional 937 If True, search for protonated ions. Default is True. 938 isAdduct : bool, optional 939 If True, search for adduct ions. Default is False. 940 usedAtoms : dict, optional 941 Dictionary of atoms and ranges. Default is {'C': (1, 90), 'H': (4, 200), 'O': (0, 12), 'N': (0, 0), 'S': (0, 0), 'P': (0, 0), 'Cl': (0, 0)}. 942 ion_types_excluded : list, optional 943 List of ion types to exclude from molecular id search, commonly ['[M+CH3COO]-]'] or ['[M+COOH]-'] depending on mobile phase content. Default is []. 944 ionization_type : str, optional 945 Ionization type. Default is 'ESI'. 946 min_ppm_error : float, optional 947 Minimum ppm error. Default is -10.0. 948 max_ppm_error : float, optional 949 Maximum ppm error. Default is 10.0. 950 min_abun_error : float, optional 951 Minimum abundance error for isotolopologue search. Default is -100.0. 952 max_abun_error : float, optional 953 Maximum abundance error for isotolopologue search. Default is 100.0. 954 mz_error_range : float, optional 955 m/z error range. Default is 1.5. 956 error_method : str, optional 957 Error method. Default is 'None'. Options are 'distance', 'lowest', 'symmetrical','average' 'None'. 
958 mz_error_average : float, optional 959 m/z error average. Default is 0.0. 960 used_atom_valences : dict, optional 961 Dictionary of atoms and valences. Default is {'C': 4, '13C': 4, 'H': 1, 'O': 2, '18O': 2, 'N': 3, 'S': 2, '34S': 2, 'P': 3, 'Cl': 1, '37Cl': 1, 'Br': 1, 'Na': 1, 'F': 1, 'K': 0}. 962 verbose_processing: bool, optional 963 If True, print verbose processing information. Default is True. 964 """ 965 966 verbose_processing: bool = True 967 968 use_isotopologue_filter: bool = False 969 970 isotopologue_filter_threshold: float = 33 971 972 isotopologue_filter_atoms: tuple = ("Cl", "Br") 973 974 use_runtime_kendrick_filter: bool = False 975 976 use_min_peaks_filter: bool = True 977 978 min_peaks_per_class: int = 15 979 980 url_database: str = ( 981 "postgresql+psycopg2://coremsappdb:coremsapppnnl@localhost:5432/coremsapp" 982 ) 983 984 db_jobs: int = 3 985 986 db_chunk_size: int = 300 987 988 # query setting======== 989 ion_charge: int = -1 990 991 min_hc_filter: float = 0.3 992 993 max_hc_filter: float = 3 994 995 min_oc_filter: float = 0.0 996 997 max_oc_filter: float = 1.2 998 999 min_op_filter: float = 2 1000 1001 use_pah_line_rule: bool = False 1002 1003 min_dbe: float = 0 1004 1005 max_dbe: float = 40 1006 1007 mz_error_score_weight: float = 0.6 1008 1009 isotopologue_score_weight: float = 0.4 1010 1011 # look for close shell ions [M + Adduct]+ only considers metal set in the list adduct_atoms 1012 adduct_atoms_neg: tuple = ("Cl", "Br") 1013 1014 adduct_atoms_pos: tuple = ("Na", "K") 1015 1016 score_methods: tuple = ( 1017 "S_P_lowest_error", 1018 "N_S_P_lowest_error", 1019 "lowest_error", 1020 "prob_score", 1021 "air_filter_error", 1022 "water_filter_error", 1023 "earth_filter_error", 1024 ) 1025 1026 score_method: str = "prob_score" 1027 1028 output_min_score: float = 0.1 1029 1030 output_score_method: str = "All Candidates" 1031 1032 # depending on the polarity mode it looks for [M].+ , [M].- 1033 # query and automatically compile add entry if 
it doesn't exist 1034 1035 isRadical: bool = False 1036 1037 # depending on the polarity mode it looks for [M + H]+ , [M - H]+ 1038 # query and automatically compile and push options if it doesn't exist 1039 isProtonated: bool = True 1040 1041 isAdduct: bool = False 1042 1043 usedAtoms: dict = dataclasses.field(default_factory=dict) 1044 ion_types_excluded: list = dataclasses.field(default_factory=list) 1045 1046 # search setting ======== 1047 1048 ionization_type: str = "ESI" 1049 1050 # empirically set / needs optimization 1051 min_ppm_error: float = -10.0 # ppm 1052 1053 # empirically set / needs optimization 1054 max_ppm_error: float = 10.0 # ppm 1055 1056 # empirically set / needs optimization set for isotopologue search 1057 min_abun_error: float = -100.0 # percentage 1058 1059 # empirically set / needs optimization set for isotopologue search 1060 max_abun_error: float = 100.0 # percentage 1061 1062 # empirically set / needs optimization 1063 mz_error_range: float = 1.5 1064 1065 # 'distance', 'lowest', 'symmetrical','average' 'None' 1066 error_method: str = "None" 1067 1068 mz_error_average: float = 0.0 1069 1070 # used_atom_valences: {'C': 4, 'H':1, etc} = dataclasses.field(default_factory=dict) 1071 used_atom_valences: dict = dataclasses.field(default_factory=dict) 1072 1073 def __post_init__(self): 1074 if not self.url_database or self.url_database == "": 1075 self.url_database = os.getenv( 1076 "COREMS_DATABASE_URL", "sqlite:///db/molformula.db" 1077 ) 1078 # enforce datatype 1079 for field in dataclasses.fields(self): 1080 value = getattr(self, field.name) 1081 if not isinstance(value, field.type): 1082 value = field.type(value) 1083 setattr(self, field.name, value) 1084 1085 # enforce C and H if either do not exists 1086 if "C" not in self.usedAtoms.keys(): 1087 self.usedAtoms["C"] = (1, 100) 1088 if "H" not in self.usedAtoms.keys(): 1089 self.usedAtoms["H"] = (1, 200) 1090 1091 # add cummon values 1092 current_used_atoms = 
self.used_atom_valences.keys() 1093 1094 for atom in Atoms.atoms_covalence.keys(): 1095 if atom not in current_used_atoms: 1096 covalence = Atoms.atoms_covalence.get(atom) 1097 1098 if isinstance(covalence, int): 1099 self.used_atom_valences[atom] = covalence 1100 1101 else: 1102 # will get the first number of all possible covalances, which should be the most commum 1103 self.used_atom_valences[atom] = covalence[0]
@dataclasses.dataclass
class TransientSetting:
    """Transient processing settings class

    Values passed to the constructor are coerced to the annotated type of
    each field in ``__post_init__``. The coercion runs only at construction
    time; attributes assigned afterwards are stored as given.

    Attributes
    ----------
    implemented_apodization_function : tuple
        Available apodization functions
    apodization_method : str
        Apodization function to use. Hanning is a good default for Fourier transform magnitude mode.
        For absorption mode processing, Half-Sine or Half-Kaiser may be more appropriate.
    number_of_truncations : int
        How many times to truncate the transient prior to Fourier transform
    number_of_zero_fills : int
        How many times to zero fill the transient prior to Fourier transform.
    next_power_of_two : bool
        If True, zero fill to the next power of two after the new length of len(transient)+(number_of_zero_fills*len(transient)).
    kaiser_beta : float
        Beta parameter for Kaiser or Half-Kaiser apodisation function. 0 is rectangular, 5 is similar to Hamming,
        6 is similar to hanning, and 8.6 is similar to Blackman (from numpy docs)

    """

    implemented_apodization_function: tuple = (
        "Hamming",
        "Hanning",
        "Blackman",
        "Full-Sine",
        "Half-Sine",
        "Kaiser",
        "Half-Kaiser",
        "Rectangle",
    )
    apodization_method: str = "Hanning"
    number_of_truncations: int = 0
    number_of_zero_fills: int = 1
    next_power_of_two: bool = False
    kaiser_beta: float = 8.6

    @staticmethod
    def _coerce(value, expected):
        """Return ``value`` converted to ``expected`` unless it already is one.

        Strings destined for a ``bool`` field are parsed instead of being
        passed to ``bool()``, because ``bool("False")`` is ``True``. The
        strings treated as false are the ones ``configparser`` recognises
        ("0", "no", "false", "off", case-insensitive) plus the empty string;
        any other string keeps its plain truthiness.
        """
        if isinstance(value, expected):
            return value
        if expected is bool and isinstance(value, str):
            return value.strip().lower() not in ("", "0", "no", "false", "off")
        return expected(value)

    def __post_init__(self):
        # enforce datatype
        for field in dataclasses.fields(self):
            current = getattr(self, field.name)
            setattr(self, field.name, self._coerce(current, field.type))
Transient processing settings class
Attributes
- implemented_apodization_function (tuple): Available apodization functions
- apodization_method (str): Apodization function to use. Hanning is a good default for Fourier transform magnitude mode. For absorption mode processing, Half-Sine or Half-Kaiser may be more appropriate.
- number_of_truncations (int): How many times to truncate the transient prior to Fourier transform
- number_of_zero_fills (int): How many times to zero fill the transient prior to Fourier transform.
- next_power_of_two (bool): If True, zero fill to the next power of two after the new length of len(transient)+(number_of_zero_fills*len(transient)).
- kaiser_beta (float): Beta parameter for Kaiser or Half-Kaiser apodisation function. 0 is rectangular, 5 is similar to Hamming, 6 is similar to hanning, and 8.6 is similar to Blackman (from numpy docs)
@dataclasses.dataclass
class DataInputSetting:
    """Data input settings class

    The built-in header translations are always installed. Entries passed
    through the ``header_translate`` constructor argument are added on top
    of them and win when a key is present in both.

    Attributes
    ----------
    header_translate : dict
        Dictionary with the header labels to be translated to the corems labels. For example, {'m/z':'m/z', 'Resolving Power':'Resolving Power', 'Abundance':'Abundance' , 'S/N':'S/N'}
    """

    # add to this dict the VALUES to match your labels, THE ORDER WON'T MATTER
    # "column_translate" : {"m/z":"m/z", "Resolving Power":"Resolving Power", "Abundance":"Abundance" , "S/N":"S/N"}
    header_translate: dict = dataclasses.field(default_factory=dict)

    def __post_init__(self):
        translations = {
            "m/z": Labels.mz,
            "mOz": Labels.mz,
            "Mass": Labels.mz,
            "Resolving Power": Labels.rp,
            "Res.": Labels.rp,
            "resolution": Labels.rp,
            "Intensity": Labels.abundance,
            "Peak Height": Labels.abundance,
            "I": Labels.abundance,
            "Abundance": Labels.abundance,
            "abs_abu": Labels.abundance,
            "Signal/Noise": Labels.s2n,
            "S/N": Labels.s2n,
            "sn": Labels.s2n,
        }
        # Keep whatever the caller supplied instead of discarding it; a falsy
        # value (empty dict, None) leaves the defaults untouched.
        translations.update(self.header_translate or {})
        self.header_translate = translations

    def add_mz_label(self, label):
        """Add a label to the header_translate dictionary to be translated to the corems label for mz."""
        self.header_translate[label] = Labels.mz

    def add_peak_height_label(self, label):
        """Add a label to the header_translate dictionary to be translated to the corems label for peak height."""
        self.header_translate[label] = Labels.abundance

    def add_sn_label(self, label):
        """Add a label to the header_translate dictionary to be translated to the corems label for signal to noise."""
        self.header_translate[label] = Labels.s2n

    def add_resolving_power_label(self, label):
        """Add a label to the header_translate dictionary to be translated to the corems label for resolving power."""
        self.header_translate[label] = Labels.rp
Data input settings class
Attributes
- header_translate (dict): Dictionary with the header labels to be translated to the corems labels. For example, {'m/z':'m/z', 'Resolving Power':'Resolving Power', 'Abundance':'Abundance' , 'S/N':'S/N'}
def add_mz_label(self, label):
    """Register ``label`` as a header name that maps to the corems m/z label.

    Parameters
    ----------
    label : str
        Column header, as it appears in the input data, to store in
        ``header_translate`` with the value ``Labels.mz``.
    """
    self.header_translate.update({label: Labels.mz})
Add a label to the header_translate dictionary to be translated to the corems label for mz.
def add_peak_height_label(self, label):
    """Register ``label`` as a header name that maps to the corems peak height label.

    Parameters
    ----------
    label : str
        Column header, as it appears in the input data, to store in
        ``header_translate`` with the value ``Labels.abundance``.
    """
    self.header_translate.update({label: Labels.abundance})
Add a label to the header_translate dictionary to be translated to the corems label for peak height.
def add_sn_label(self, label):
    """Register ``label`` as a header name that maps to the corems signal to noise label.

    Parameters
    ----------
    label : str
        Column header, as it appears in the input data, to store in
        ``header_translate`` with the value ``Labels.s2n``.
    """
    self.header_translate.update({label: Labels.s2n})
Add a label to the header_translate dictionary to be translated to the corems label for signal to noise.
def add_resolving_power_label(self, label):
    """Register ``label`` as a header name that maps to the corems resolving power label.

    Parameters
    ----------
    label : str
        Column header, as it appears in the input data, to store in
        ``header_translate`` with the value ``Labels.rp``.
    """
    self.header_translate.update({label: Labels.rp})
Add a label to the header_translate dictionary to be translated to the corems label for resolving power.
110@dataclasses.dataclass 111class LiquidChromatographSetting: 112 """Liquid chromatograph processing settings class 113 114 Attributes 115 ---------- 116 scans : list or tuple, optional 117 List of select scan to average or a tuple containing the range to average. 118 Default is (-1, -1). 119 eic_tolerance_ppm : float, optional 120 Mass tolerance in ppm for extracted ion chromatogram peak detection. 121 Default is 5. 122 correct_eic_baseline : bool, optional 123 If True, correct the baseline of the extracted ion chromatogram. 124 Default is True. 125 smooth_window : int, optional 126 Window size for smoothing the ion chromatogram (extracted or total). 127 Default is 5. 128 smooth_method : str, optional 129 Smoothing method to use. See implemented_smooth_method for options. 130 Default is 'savgol'. 131 implemented_smooth_method : tuple, optional 132 Smoothing methods that can be implemented. 133 Default is ('savgol', 'hanning', 'blackman', 'bartlett', 'flat', 'boxcar'). 134 savgol_pol_order : int, optional 135 Polynomial order for Savitzky-Golay smoothing. 136 Default is 2. 137 consecutive_scan_min : int, optional 138 Minimum number of consecutive scans to consider for peak detection. 139 Default is 0 for backwards compatibility, but a value of 3 is recommended. 140 peak_height_max_percent : float, optional 141 1-100 % used for baseline detection use 0.1 for second_derivative and 10 for other methods. 142 Default is 10. 143 peak_max_prominence_percent : float, optional 144 1-100 % used for baseline detection. 145 Default is 1. 146 peak_derivative_threshold : float, optional 147 Threshold for defining derivative crossing. 148 Default is 0.0005. 149 min_peak_datapoints : float, optional 150 minimum data point to define a chromatografic peak. 151 Default is 5. 152 noise_threshold_method : str, optional 153 Method for detecting noise threshold. 154 Default is 'manual_relative_abundance'. 
155 noise_threshold_methods_implemented : tuple, optional 156 Methods for detected noise threshold that can be implemented. 157 Default is ('auto_relative_abundance', 'manual_relative_abundance', 'second_derivative'). 158 peak_height_min_percent : float, optional 159 0-100 % used for peak detection. 160 Default is 0.1. 161 eic_signal_threshold : float, optional 162 0-100 % used for extracted ion chromatogram peak detection. 163 Default is 0.01. 164 eic_buffer_time : float, optional 165 Buffer time to add to the start and end of the plot of the extracted ion chromatogram, in minutes. 166 Default is 1.5. 167 dispersity_index_window : float, optional 168 Dispersity index window size, in minutes. 169 Default is 3.0. 170 noise_window_factor : float, optional 171 Factor to determine noise estimation window size relative to peak width. 172 Larger values use wider windows for noise estimation. 173 For example, a value of 2.0 uses a window size equal to twice the peak width 174 (depending on it's start and end scans) on each side. 175 Called within the LCMSMassFeature.calc_noise_score() method. 176 Default is 2.0. 177 remove_redundant_mass_features : bool, optional 178 If True, remove redundant mass features that are likely contaminants based on 179 their m/z values and scan frequency. 180 Especially useful for HILIC data where signals do not return to baseline between peaks 181 or for data with significant background noise. 182 Called within the LC_Calculations.find_mass_features() method. 183 Default is False. 184 redundant_scan_frequency_min : float, optional 185 Minimum fraction of scans that must contain the m/z to be considered a likely 186 noise/contaminant when using remove_redundant_mass_features. 187 Default is 0.1 (10% of scans). 188 redundant_feature_retain_n : int, optional 189 Number of features to retain in each group when using remove_redundant_mass_features. 190 Default is 3. 
191 remove_mass_features_by_peak_metrics : bool, optional 192 If True, remove mass features based on their peak metrics such as S/N, Gaussian similarity, 193 dispersity index, and noise score. 194 Called within the LC_Calculations.add_peak_metrics() method. 195 Default is False. 196 mass_feature_attribute_filter_dict : dict, optional 197 Dictionary specifying filtering criteria for mass feature attributes. 198 Each key is an attribute name, and each value is a dict with 'value' and 'operator' keys. 199 200 Structure: {attribute_name: {'value': threshold, 'operator': comparison}} 201 202 Available operators: 203 - '>' or 'greater': Keep features where attribute > threshold 204 - '<' or 'less': Keep features where attribute < threshold 205 - '>=' or 'greater_equal': Keep features where attribute >= threshold 206 - '<=' or 'less_equal': Keep features where attribute <= threshold 207 208 Examples: 209 { 210 'noise_score_max': {'value': 0.5, 'operator': '>'}, # Keep if noise_score_max > 0.5 211 'dispersity_index': {'value': 0.1, 'operator': '<'}, # Keep if dispersity_index < 0.1 212 'gaussian_similarity': {'value': 0.7, 'operator': '>='} # Keep if gaussian_similarity >= 0.7 213 } 214 215 Available attributes include: 'noise_score', 'noise_score_min', 'noise_score_max', 216 'gaussian_similarity', 'tailing_factor', 'dispersity_index', 'half_height_width', 'intensity'. 217 Default is {"noise_score_max": {"value": 0.8, "operator": ">="},"noise_score_min": {"value": 0.5, "operator": ">="}}, 218 peak_picking_method : str, optional 219 Peak picking method to use. See implemented_peak_picking_methods for options. 220 Default is 'persistent homology'. 221 implemented_peak_picking_methods : tuple, optional 222 Peak picking methods that can be implemented. 223 Default is ('persistent homology', 'centroided_persistent_homology'). 224 ph_smooth_it : int, optional 225 Number of iterations to use for smoothing prior to finding mass features. 
226 Used only for "persistent homology" peak picking method. 227 Called within the PHCalculations.find_mass_features_ph() method. 228 Default is 1. 229 ph_smooth_radius_mz : int, optional 230 Radius in m/z steps (not daltons) for smoothing prior to finding mass features. 231 Used only for "persistent homology" peak picking method. 232 Called within the PHCalculations.find_mass_features_ph() method. 233 Default is 0. 234 ph_smooth_radius_scan : int, optional 235 Radius in scan steps for smoothing prior to finding mass features. 236 Used only for "persistent homology" peak picking method. 237 Called within the PHCalculations.find_mass_features_ph() method. 238 Default is 1. 239 ph_inten_min_rel : float, optional 240 Relative minimum intensity to use for finding mass features for persistent homology. 241 Used only for "persistent homology" peak picking method. 242 Calculated as a fraction of the maximum intensity of the unprocessed profile data (mz, scan). 243 Called within the PH_Calculations.find_mass_features() method. 244 Default is 0.001. 245 ph_persis_min_rel : float, optional 246 Relative minimum persistence for retaining mass features. 247 Used for both "persistent homology" and "centroided_persistent_homology" peak picking methods. 248 Calculated as a fraction of the maximum intensity of the unprocessed profile data (mz, scan). 249 Should be greater to or equal to ph_inten_min_rel. 250 Called within the PH_Calculations.find_mass_features() method. 251 Default is 0.001. 252 mass_feature_cluster_mz_tolerance_rel : float, optional 253 Relative m/z tolerance to use for clustering mass features. 254 Used for both "persistent homology" and "centroided_persistent_homology" 255 peak picking methods. 256 Called with the PHCalculations.cluster_mass_features() and the 257 LCCalculations.deconvolute_ms1_mass_features() methods. 258 Default is 5e-6 (5 ppm). 
259 mass_feature_cluster_rt_tolerance : float, optional 260 Retention time tolerance to use for clustering mass features, in minutes. 261 Used for both "persistent homology" and "centroided_persistent_homology" 262 peak picking methods. 263 Called with the PHCalculations.cluster_mass_features() and the 264 LCCalculations.deconvolute_ms1_mass_features() methods. 265 Default is 0.3. 266 ms1_scans_to_average : int, optional 267 Number of MS1 scans to average for mass-feature associated m/zs. 268 Called within the LCMSBase.add_associated_ms1() method. 269 Default is 1. 270 ms1_deconvolution_corr_min : float, optional 271 Minimum correlation to use for deconvoluting MS1 mass features. 272 Called within the LCCalculations.deconvolute_ms1_mass_features() method. 273 Default is 0.8. 274 ms2_dda_rt_tolerance : float, optional 275 Retention time tolerance to use for associating MS2 spectra to mass features, in minutes. 276 Called within the LCMSBase.add_associated_ms2_dda() method. 277 Default is 0.15. 278 ms2_dda_mz_tolerance : float, optional 279 Mass tolerance to use for associating MS2 spectra to mass features. 280 Called within the LCMSBase.add_associated_ms2_dda() method. 281 Default is 0.05. 282 ms2_min_fe_score : float, optional 283 Minimum flash entropy for retaining MS2 annotations. 284 Called within the LCMSSpectralSearch.fe_search() method. 285 Default is 0.2. 286 search_as_lipids : bool, optional 287 If True, prepare the database for lipid searching. 288 Called within the LCMSSpectralSearch.fe_prep_search_db() method. 289 Default is False. 290 include_fragment_types : bool, optional 291 If True, include fragment types in the database. 292 Called within the LCMSSpectralSearch.fe_search() and related methods. 293 Default is False. 294 export_profile_spectra : bool, optional 295 If True, export profile spectra data. 296 Default is False. 297 export_eics : bool, optional 298 If True, export extracted ion chromatograms. 299 Default is True. 
300 export_unprocessed_ms1 : bool, optional 301 If True, export unprocessed MS1 data. 302 Default is False. 303 export_only_relevant_mass_spectra : bool, optional 304 If True, export only mass spectra associated with detected mass features: 305 specifically the apex MS1 scan for each mass feature and the best MS2 scan 306 for each mass feature (if available). If False, export all mass spectra. 307 This parameter reduces HDF5 file size by excluding non-feature spectra. 308 Default is False (backwards compatible - exports all spectra). 309 verbose_processing : bool, optional 310 If True, print verbose processing information. 311 Default is True. 312 """ 313 314 scans: list | tuple = (-1, -1) 315 316 # Parameters used for generating EICs and performing 1D peak picking and EIC/TIC smoothing 317 eic_tolerance_ppm: float = 5 318 correct_eic_baseline = True 319 smooth_window: int = 5 320 smooth_method: str = "savgol" 321 implemented_smooth_method: tuple = ( 322 "savgol", 323 "hanning", 324 "blackman", 325 "bartlett", 326 "flat", 327 "boxcar", 328 ) 329 savgol_pol_order: int = 2 330 consecutive_scan_min: int = 0 331 peak_height_max_percent: float = 10 332 peak_max_prominence_percent: float = 1 333 peak_derivative_threshold: float = 0.0005 334 min_peak_datapoints: float = 5 335 noise_threshold_method: str = "manual_relative_abundance" 336 noise_threshold_methods_implemented: tuple = ( 337 "auto_relative_abundance", 338 "manual_relative_abundance", 339 "second_derivative", 340 ) 341 peak_height_min_percent: float = 0.1 342 eic_signal_threshold: float = 0.01 343 eic_buffer_time = 1.5 344 dispersity_index_window: float = 3.0 # minutes 345 noise_window_factor: float = 2.0 # times the peak width for detemining SN for EIC 346 347 # Parameters used for filtering mass features after peak picking 348 remove_redundant_mass_features: bool = False 349 redundant_scan_frequency_min: float = 0.1 350 redundant_feature_retain_n: int = 3 351 remove_mass_features_by_peak_metrics: bool = False 
352 # note that this is a dictionary of dictionaries and set in __post_init__ instead of here 353 mass_feature_attribute_filter_dict: Dict = dataclasses.field(default_factory=dict) 354 355 # Parameters used for 2D peak picking 356 peak_picking_method: str = "persistent homology" 357 implemented_peak_picking_methods: tuple = ( 358 "persistent homology", 359 "centroided_persistent_homology", 360 ) 361 362 # Parameters used in persistent homology calculations 363 ph_smooth_it = 1 364 ph_smooth_radius_mz = 0 365 ph_smooth_radius_scan = 1 366 ph_inten_min_rel = 0.001 367 ph_persis_min_rel = 0.001 368 369 # Parameters used to cluster mass features 370 mass_feature_cluster_mz_tolerance_rel: float = 5e-6 371 mass_feature_cluster_rt_tolerance: float = 0.3 372 373 # Parameters used in associating MS1 and MS2 spectra to LCMS mass features and deconvoluting MS1 mass features 374 ms1_scans_to_average: int = 1 375 ms1_deconvolution_corr_min: float = 0.8 376 ms2_dda_rt_tolerance: float = 0.15 377 ms2_dda_mz_tolerance: float = 0.05 378 379 # Parameters used for flash entropy searching and database preparation 380 ms2_min_fe_score: float = 0.2 381 search_as_lipids: bool = False 382 include_fragment_types: bool = False 383 384 # Parameters used for saving the data 385 export_profile_spectra: bool = False 386 export_eics: bool = True 387 export_unprocessed_ms1: bool = False 388 export_only_relevant_mass_spectra: bool = False 389 390 # Parameters used for verbose processing 391 verbose_processing: bool = True 392 393 def __post_init__(self): 394 # Set default values for mass_feature_attribute_filter_dict if empty 395 if not self.mass_feature_attribute_filter_dict: 396 self.mass_feature_attribute_filter_dict = { 397 "noise_score_max": {"value": 0.8, "operator": ">="}, 398 "noise_score_min": {"value": 0.5, "operator": ">="}, 399 } 400 401 # enforce datatype 402 for field in dataclasses.fields(self): 403 value = getattr(self, field.name) 404 if not isinstance(value, field.type): 405 
value = field.type(value) 406 setattr(self, field.name, value)
Liquid chromatograph processing settings class
Attributes
- scans (list or tuple, optional): List of selected scans to average or a tuple containing the range to average. Default is (-1, -1).
- eic_tolerance_ppm (float, optional): Mass tolerance in ppm for extracted ion chromatogram peak detection. Default is 5.
- correct_eic_baseline (bool, optional): If True, correct the baseline of the extracted ion chromatogram. Default is True.
- smooth_window (int, optional): Window size for smoothing the ion chromatogram (extracted or total). Default is 5.
- smooth_method (str, optional): Smoothing method to use. See implemented_smooth_method for options. Default is 'savgol'.
- implemented_smooth_method (tuple, optional): Smoothing methods that can be implemented. Default is ('savgol', 'hanning', 'blackman', 'bartlett', 'flat', 'boxcar').
- savgol_pol_order (int, optional): Polynomial order for Savitzky-Golay smoothing. Default is 2.
- consecutive_scan_min (int, optional): Minimum number of consecutive scans to consider for peak detection. Default is 0 for backwards compatibility, but a value of 3 is recommended.
- peak_height_max_percent (float, optional): 1-100 % used for baseline detection use 0.1 for second_derivative and 10 for other methods. Default is 10.
- peak_max_prominence_percent (float, optional): 1-100 % used for baseline detection. Default is 1.
- peak_derivative_threshold (float, optional): Threshold for defining derivative crossing. Default is 0.0005.
- min_peak_datapoints (float, optional): Minimum number of data points needed to define a chromatographic peak. Default is 5.
- noise_threshold_method (str, optional): Method for detecting noise threshold. Default is 'manual_relative_abundance'.
- noise_threshold_methods_implemented (tuple, optional): Methods for detecting the noise threshold that are implemented. Default is ('auto_relative_abundance', 'manual_relative_abundance', 'second_derivative').
- peak_height_min_percent (float, optional): 0-100 % used for peak detection. Default is 0.1.
- eic_signal_threshold (float, optional): 0-100 % used for extracted ion chromatogram peak detection. Default is 0.01.
- eic_buffer_time (float, optional): Buffer time to add to the start and end of the plot of the extracted ion chromatogram, in minutes. Default is 1.5.
- dispersity_index_window (float, optional): Dispersity index window size, in minutes. Default is 3.0.
- noise_window_factor (float, optional): Factor to determine noise estimation window size relative to peak width. Larger values use wider windows for noise estimation. For example, a value of 2.0 uses a window size equal to twice the peak width (depending on its start and end scans) on each side. Called within the LCMSMassFeature.calc_noise_score() method. Default is 2.0.
- remove_redundant_mass_features (bool, optional): If True, remove redundant mass features that are likely contaminants based on their m/z values and scan frequency. Especially useful for HILIC data where signals do not return to baseline between peaks or for data with significant background noise. Called within the LC_Calculations.find_mass_features() method. Default is False.
- redundant_scan_frequency_min (float, optional): Minimum fraction of scans that must contain the m/z to be considered a likely noise/contaminant when using remove_redundant_mass_features. Default is 0.1 (10% of scans).
- redundant_feature_retain_n (int, optional): Number of features to retain in each group when using remove_redundant_mass_features. Default is 3.
- remove_mass_features_by_peak_metrics (bool, optional): If True, remove mass features based on their peak metrics such as S/N, Gaussian similarity, dispersity index, and noise score. Called within the LC_Calculations.add_peak_metrics() method. Default is False.
- mass_feature_attribute_filter_dict (dict, optional): Dictionary specifying filtering criteria for mass feature attributes. Each key is an attribute name, and each value is a dict with 'value' and 'operator' keys.
Structure: {attribute_name: {'value': threshold, 'operator': comparison}}
Available operators:
- '>' or 'greater': Keep features where attribute > threshold
- '<' or 'less': Keep features where attribute < threshold
- '>=' or 'greater_equal': Keep features where attribute >= threshold
- '<=' or 'less_equal': Keep features where attribute <= threshold
Examples: { 'noise_score_max': {'value': 0.5, 'operator': '>'}, # Keep if noise_score_max > 0.5 'dispersity_index': {'value': 0.1, 'operator': '<'}, # Keep if dispersity_index < 0.1 'gaussian_similarity': {'value': 0.7, 'operator': '>='} # Keep if gaussian_similarity >= 0.7 }
Available attributes include: 'noise_score', 'noise_score_min', 'noise_score_max', 'gaussian_similarity', 'tailing_factor', 'dispersity_index', 'half_height_width', 'intensity'. Default is {"noise_score_max": {"value": 0.8, "operator": ">="}, "noise_score_min": {"value": 0.5, "operator": ">="}}.
- peak_picking_method (str, optional): Peak picking method to use. See implemented_peak_picking_methods for options. Default is 'persistent homology'.
- implemented_peak_picking_methods (tuple, optional): Peak picking methods that can be implemented. Default is ('persistent homology', 'centroided_persistent_homology').
- ph_smooth_it (int, optional): Number of iterations to use for smoothing prior to finding mass features. Used only for "persistent homology" peak picking method. Called within the PHCalculations.find_mass_features_ph() method. Default is 1.
- ph_smooth_radius_mz (int, optional): Radius in m/z steps (not daltons) for smoothing prior to finding mass features. Used only for "persistent homology" peak picking method. Called within the PHCalculations.find_mass_features_ph() method. Default is 0.
- ph_smooth_radius_scan (int, optional): Radius in scan steps for smoothing prior to finding mass features. Used only for "persistent homology" peak picking method. Called within the PHCalculations.find_mass_features_ph() method. Default is 1.
- ph_inten_min_rel (float, optional): Relative minimum intensity to use for finding mass features for persistent homology. Used only for "persistent homology" peak picking method. Calculated as a fraction of the maximum intensity of the unprocessed profile data (mz, scan). Called within the PH_Calculations.find_mass_features() method. Default is 0.001.
- ph_persis_min_rel (float, optional): Relative minimum persistence for retaining mass features. Used for both "persistent homology" and "centroided_persistent_homology" peak picking methods. Calculated as a fraction of the maximum intensity of the unprocessed profile data (mz, scan). Should be greater than or equal to ph_inten_min_rel. Called within the PH_Calculations.find_mass_features() method. Default is 0.001.
- mass_feature_cluster_mz_tolerance_rel (float, optional): Relative m/z tolerance to use for clustering mass features. Used for both "persistent homology" and "centroided_persistent_homology" peak picking methods. Called within the PHCalculations.cluster_mass_features() and the LCCalculations.deconvolute_ms1_mass_features() methods. Default is 5e-6 (5 ppm).
- mass_feature_cluster_rt_tolerance (float, optional): Retention time tolerance to use for clustering mass features, in minutes. Used for both "persistent homology" and "centroided_persistent_homology" peak picking methods. Called within the PHCalculations.cluster_mass_features() and the LCCalculations.deconvolute_ms1_mass_features() methods. Default is 0.3.
- ms1_scans_to_average (int, optional): Number of MS1 scans to average for mass-feature associated m/zs. Called within the LCMSBase.add_associated_ms1() method. Default is 1.
- ms1_deconvolution_corr_min (float, optional): Minimum correlation to use for deconvoluting MS1 mass features. Called within the LCCalculations.deconvolute_ms1_mass_features() method. Default is 0.8.
- ms2_dda_rt_tolerance (float, optional): Retention time tolerance to use for associating MS2 spectra to mass features, in minutes. Called within the LCMSBase.add_associated_ms2_dda() method. Default is 0.15.
- ms2_dda_mz_tolerance (float, optional): Mass tolerance to use for associating MS2 spectra to mass features. Called within the LCMSBase.add_associated_ms2_dda() method. Default is 0.05.
- ms2_min_fe_score (float, optional): Minimum flash entropy for retaining MS2 annotations. Called within the LCMSSpectralSearch.fe_search() method. Default is 0.2.
- search_as_lipids (bool, optional): If True, prepare the database for lipid searching. Called within the LCMSSpectralSearch.fe_prep_search_db() method. Default is False.
- include_fragment_types (bool, optional): If True, include fragment types in the database. Called within the LCMSSpectralSearch.fe_search() and related methods. Default is False.
- export_profile_spectra (bool, optional): If True, export profile spectra data. Default is False.
- export_eics (bool, optional): If True, export extracted ion chromatograms. Default is True.
- export_unprocessed_ms1 (bool, optional): If True, export unprocessed MS1 data. Default is False.
- export_only_relevant_mass_spectra (bool, optional): If True, export only mass spectra associated with detected mass features: specifically the apex MS1 scan for each mass feature and the best MS2 scan for each mass feature (if available). If False, export all mass spectra. This parameter reduces HDF5 file size by excluding non-feature spectra. Default is False (backwards compatible - exports all spectra).
- verbose_processing (bool, optional): If True, print verbose processing information. Default is True.
@dataclasses.dataclass
class MassSpectrumSetting:
    """Settings for noise thresholding, peak picking and calibration of a mass spectrum.

    Attributes
    ----------
    noise_threshold_method : str, optional
        Name of the noise threshold method to apply. Default is 'log'.
    noise_threshold_methods_implemented : tuple, optional
        Noise threshold methods that are implemented. Default is
        ('minima', 'signal_noise', 'relative_abundance', 'absolute_abundance', 'log').
    noise_threshold_min_std : int, optional
        Minimum value for the 'minima' noise threshold method. Default is 6.
    noise_threshold_min_s2n : float, optional
        Minimum value for the 'signal_noise' noise threshold method. Default is 4.
    noise_threshold_min_relative_abundance : float, optional
        Minimum value for the 'relative_abundance' noise threshold method,
        expressed as a percentage (0-100). Default is 6 (6%).
    noise_threshold_absolute_abundance : float, optional
        Minimum value for the 'absolute_abundance' noise threshold method.
        Default is 1_000_000.
    noise_threshold_log_nsigma : int, optional
        Number of standard deviations used by the 'log' noise threshold method.
        Default is 6.
    noise_threshold_log_nsigma_corr_factor : float, optional
        Correction factor for the 'log' noise threshold method
        (0.463 for mFT, 1.0 for aFT). Default is 0.463.
    noise_threshold_log_nsigma_bins : int, optional
        Number of histogram bins used by the 'log' noise threshold method.
        Default is 500.
    noise_min_mz : float, optional
        Lower m/z bound for noise thresholding. Default is 50.0.
    noise_max_mz : float, optional
        Upper m/z bound for noise thresholding. Default is 1200.0.
    min_picking_mz : float, optional
        Lower m/z bound for peak picking. Default is 50.0.
    max_picking_mz : float, optional
        Upper m/z bound for peak picking. Default is 1200.0.
    picking_point_extrapolate : int, optional
        Number of data points (in each direction) by which the m/z axis is
        extrapolated and the abundance axis zero padded. Use 0 to keep the
        normal behaviour; 3 is the typical value. Default is 3.
    calib_minimize_method : str, optional
        Minimization method used for calibration. Default is 'Powell'.
    calib_pol_order : int, optional
        Polynomial order used for calibration. Default is 2.
    max_calib_ppm_error : float, optional
        Maximum ppm error used for calibration. Default is 1.0.
    min_calib_ppm_error : float, optional
        Minimum ppm error used for calibration. Default is -1.0.
    calib_sn_threshold : float, optional
        Signal to noise threshold used for calibration. Default is 2.0.
    calibration_ref_match_method : str, optional
        Method used to match reference masses with measured masses for
        recalibration. Default is 'legacy'.
    calibration_ref_match_method_implemented : tuple, optional
        Reference matching methods that are implemented.
        Default is ('legacy', 'merged').
    calibration_ref_match_tolerance : float, optional
        Initial matching tolerance when the new reference matching method is
        used. Default is 0.003.
    calibration_ref_match_std_raw_error_limit : float, optional
        Default is 1.5. (Presumably a limit on the standard deviation of the
        raw error during reference matching; confirm against the calibration code.)
    do_calibration : bool, optional
        If True, perform calibration. Default is True.
    verbose_processing : bool, optional
        If True, print verbose processing information. Default is True.
    """

    # --- noise thresholding -------------------------------------------------
    noise_threshold_method: str = "log"
    noise_threshold_methods_implemented: tuple = (
        "minima",
        "signal_noise",
        "relative_abundance",
        "absolute_abundance",
        "log",
    )
    # 'minima' method
    noise_threshold_min_std: int = 6
    # 'signal_noise' method
    noise_threshold_min_s2n: float = 4
    # 'relative_abundance' method, value in the 0-100 range
    noise_threshold_min_relative_abundance: float = 6
    # 'absolute_abundance' method
    noise_threshold_absolute_abundance: float = 1_000_000
    # 'log' method
    noise_threshold_log_nsigma: int = 6
    # mFT is 0.463, aFT is 1.0
    noise_threshold_log_nsigma_corr_factor: float = 0.463
    # number of bins of the noise histogram
    noise_threshold_log_nsigma_bins: int = 500

    noise_min_mz: float = 50.0
    noise_max_mz: float = 1200.0

    # --- peak picking ---------------------------------------------------------
    min_picking_mz: float = 50.0
    max_picking_mz: float = 1200.0
    # Data points (in each direction) used to extrapolate the mz axis and
    # zero pad the abundance axis; fixes peak picking issues at the spectrum
    # limits. 0 keeps the normal behaviour, 3 is the typical value.
    picking_point_extrapolate: int = 3

    # --- calibration ----------------------------------------------------------
    calib_minimize_method: str = "Powell"
    calib_pol_order: int = 2
    max_calib_ppm_error: float = 1.0
    min_calib_ppm_error: float = -1.0
    calib_sn_threshold: float = 2.0
    calibration_ref_match_method: str = "legacy"
    calibration_ref_match_method_implemented: tuple = ("legacy", "merged")
    calibration_ref_match_tolerance: float = 0.003
    calibration_ref_match_std_raw_error_limit: float = 1.5
    # calib_ref_mzs: list = [0]

    do_calibration: bool = True
    verbose_processing: bool = True

    def __post_init__(self):
        """Coerce every field to its annotated type."""
        for spec in dataclasses.fields(self):
            current = getattr(self, spec.name)
            # Values already of the declared type are written back untouched.
            coerced = current if isinstance(current, spec.type) else spec.type(current)
            setattr(self, spec.name, coerced)
Mass spectrum processing settings class
Attributes
- noise_threshold_method (str, optional): Method for detecting noise threshold. Default is 'log'.
- noise_threshold_methods_implemented (tuple, optional): Methods for detecting the noise threshold that are implemented. Default is ('minima', 'signal_noise', 'relative_abundance', 'absolute_abundance', 'log').
- noise_threshold_min_std (int, optional): Minimum value for noise thresholding when using 'minima' noise threshold method. Default is 6.
- noise_threshold_min_s2n (float, optional): Minimum value for noise thresholding when using 'signal_noise' noise threshold method. Default is 4.
- noise_threshold_min_relative_abundance (float, optional): Minimum value for noise thresholding when using 'relative_abundance' noise threshold method. Note that this is a percentage value. Default is 6 (6%).
- noise_threshold_absolute_abundance (float, optional): Minimum value for noise thresholding when using 'absolute_abundance' noise threshold method. Default is 1_000_000.
- noise_threshold_log_nsigma (int, optional): Number of standard deviations to use when using 'log' noise threshold method. Default is 6.
- noise_threshold_log_nsigma_corr_factor (float, optional): Correction factor for log noise threshold method. Default is 0.463.
- noise_threshold_log_nsigma_bins (int, optional): Number of bins to use for histogram when using 'log' noise threshold method. Default is 500.
- noise_min_mz (float, optional): Minimum m/z to use for noise thresholding. Default is 50.0.
- noise_max_mz (float, optional): Maximum m/z to use for noise thresholding. Default is 1200.0.
- min_picking_mz (float, optional): Minimum m/z to use for peak picking. Default is 50.0.
- max_picking_mz (float, optional): Maximum m/z to use for peak picking. Default is 1200.0.
- picking_point_extrapolate (int, optional): How many data points (in each direction) to extrapolate the mz axis and 0 pad the abundance axis. Default is 3. A value of 3 is recommended for reduced profile data or if peak picking fails.
- calib_minimize_method (str, optional): Minimization method to use for calibration. Default is 'Powell'.
- calib_pol_order (int, optional): Polynomial order to use for calibration. Default is 2.
- max_calib_ppm_error (float, optional): Maximum ppm error to use for calibration. Default is 1.0.
- min_calib_ppm_error (float, optional): Minimum ppm error to use for calibration. Default is -1.0.
- calib_sn_threshold (float, optional): Signal to noise threshold to use for calibration. Default is 2.0.
- calibration_ref_match_method (string, optional): Method for matching reference masses with measured masses for recalibration. Default is 'legacy'.
- calibration_ref_match_tolerance (float, optional): If using the new method for calibration reference mass matching, this tolerance is the initial matching tolerance. Default is 0.003
- do_calibration (bool, optional): If True, perform calibration. Default is True.
- verbose_processing (bool, optional): If True, print verbose processing information. Default is True.
@dataclasses.dataclass
class MassSpecPeakSetting:
    """Settings for processing individual mass spectrum peaks.

    Attributes
    ----------
    kendrick_base : Dict, optional
        Elements and their counts forming the Kendrick base.
        An empty value is replaced by {'C': 1, 'H': 2}.
    kendrick_rounding_method : str, optional
        How the nominal Kendrick mass is computed: 'floor', 'ceil' or 'round'.
        Defaults to 'floor'.
    implemented_kendrick_rounding_methods : tuple
        Valid rounding methods for the nominal Kendrick mass.
        Defaults to ('floor', 'ceil', 'round').
    peak_derivative_threshold : float, optional
        Threshold defining a derivative crossing, between 0 and 1.
        Defaults to 0.0.
    peak_min_prominence_percent : float, optional
        Minimum prominence percentage used for peak detection.
        Defaults to 0.1.
    min_peak_datapoints : float, optional
        Minimum number of data points used for peak detection (0 to infinity).
        Defaults to 5.
    peak_max_prominence_percent : float, optional
        Maximum prominence percentage used for baseline detection.
        Defaults to 0.1.
    peak_height_max_percent : float, optional
        Maximum height percentage used for baseline detection.
        Defaults to 10.
    legacy_resolving_power : bool, optional
        If True, use the legacy (CoreMS v1) resolving power calculation.
        Defaults to True.
    legacy_centroid_polyfit : bool, optional
        If True, use the legacy (numpy polyfit) centroid fit.
        Defaults to False.
    """

    # filled with the CH2 base in __post_init__ when left empty
    kendrick_base: Dict = dataclasses.field(default_factory=dict)

    # 'floor', 'ceil' or 'round' are valid methods for the nominal kendrick mass
    kendrick_rounding_method: str = "floor"
    implemented_kendrick_rounding_methods: tuple = ("floor", "ceil", "round")

    # derivative crossing threshold, 0-1
    peak_derivative_threshold: float = 0.0

    # 1-100 %, used for peak detection
    peak_min_prominence_percent: float = 0.1
    # 0-inf, used for peak detection
    min_peak_datapoints: float = 5

    # 1-100 %, used for baseline detection
    peak_max_prominence_percent: float = 0.1
    # 1-100 %, used for baseline detection
    peak_height_max_percent: float = 10

    # True selects the legacy (CoreMS v1) resolving power calculation
    legacy_resolving_power: bool = True
    legacy_centroid_polyfit: bool = False

    def __post_init__(self):
        """Apply the CH2 default Kendrick base, then coerce fields to their types."""
        # An empty/falsy base falls back to CH2.
        self.kendrick_base = self.kendrick_base or {"C": 1, "H": 2}

        for spec in dataclasses.fields(self):
            current = getattr(self, spec.name)
            coerced = current if isinstance(current, spec.type) else spec.type(current)
            setattr(self, spec.name, coerced)
Mass spectrum peak processing settings class
Attributes
- kendrick_base (Dict, optional): Dictionary specifying the elements and their counts in the Kendrick base. Defaults to {'C': 1, 'H': 2}.
- kendrick_rounding_method (str, optional): Method for calculating the nominal Kendrick mass. Valid values are 'floor', 'ceil', or 'round'. Defaults to 'floor'.
- implemented_kendrick_rounding_methods (tuple): Tuple of valid rounding methods for calculating the nominal Kendrick mass. Defaults to ('floor', 'ceil', 'round').
- peak_derivative_threshold (float, optional): Threshold for defining derivative crossing. Should be a value between 0 and 1. Defaults to 0.0.
- peak_min_prominence_percent (float, optional): Minimum prominence percentage used for peak detection. Should be a value between 1 and 100. Defaults to 0.1.
- min_peak_datapoints (float, optional): Minimum number of data points used for peak detection. Should be a value between 0 and infinity. Defaults to 5.
- peak_max_prominence_percent (float, optional): Maximum prominence percentage used for baseline detection. Should be a value between 1 and 100. Defaults to 0.1.
- peak_height_max_percent (float, optional): Maximum height percentage used for baseline detection. Should be a value between 1 and 100. Defaults to 10.
- legacy_resolving_power (bool, optional): Flag indicating whether to use the legacy (CoreMS v1) resolving power calculation. Defaults to True.
- legacy_centroid_polyfit (bool, optional): If True, use the legacy (numpy polyfit) method to fit the centroid. Defaults to False.
596@dataclasses.dataclass 597class GasChromatographSetting: 598 """Gas chromatograph processing settings class 599 600 Attributes 601 ---------- 602 use_deconvolution : bool, optional 603 If True, use deconvolution. Default is False. 604 implemented_smooth_method : tuple, optional 605 Smoothing methods that can be implemented. Default is ('savgol', 'hanning', 'blackman', 'bartlett', 'flat', 'boxcar'). 606 smooth_window : int, optional 607 Window size for smoothing the ion chromatogram. Default is 5. 608 smooth_method : str, optional 609 Smoothing method to use. Default is 'savgol'. Other options are 'hanning', 'blackman', 'bartlett', 'flat', 'boxcar'. 610 savgol_pol_order : int, optional 611 Polynomial order for Savitzky-Golay smoothing. Default is 2. 612 peak_derivative_threshold : float, optional 613 Threshold for defining derivative crossing. Should be a value between 0 and 1. 614 Defaults to 0.0005. 615 peak_height_max_percent : float, optional 616 Maximum height percentage used for baseline detection. Should be a value between 1 and 100. 617 Defaults to 10. 618 peak_max_prominence_percent : float, optional 619 Maximum prominence percentage used for baseline detection. Should be a value between 1 and 100. 620 Defaults to 1. 621 min_peak_datapoints : float, optional 622 Minimum number of data points used for peak detection. Should be a value between 0 and infinity. 623 Defaults to 5. 624 max_peak_width : float, optional 625 Maximum peak width used for peak detection. Should be a value between 0 and infinity. 626 Defaults to 0.1. 627 noise_threshold_method : str, optional 628 Method for detecting noise threshold. Default is 'manual_relative_abundance'. 629 noise_threshold_methods_implemented : tuple, optional 630 Methods for detected noise threshold that can be implemented. Default is ('auto_relative_abundance', 'manual_relative_abundance', 'second_derivative'). 631 std_noise_threshold : int, optional 632 Default is 3. 
633 peak_height_min_percent : float, optional 634 0-100 % used for peak detection. Default is 0.1. 635 peak_min_prominence_percent : float, optional 636 0-100 % used for peak detection. Default is 0.1. 637 eic_signal_threshold : float, optional 638 0-100 % used for extracted ion chromatogram peak detection. Default is 0.01. 639 max_rt_distance : float, optional 640 Maximum distance allowance for hierarchical cluster, in minutes. Default is 0.025. 641 verbose_processing : bool, optional 642 If True, print verbose processing information. Default is True. 643 """ 644 645 use_deconvolution: bool = False 646 647 implemented_smooth_method: tuple = ( 648 "savgol", 649 "hanning", 650 "blackman", 651 "bartlett", 652 "flat", 653 "boxcar", 654 ) 655 656 smooth_window: int = 5 657 658 smooth_method: str = "savgol" 659 660 savgol_pol_order: int = 2 661 662 peak_derivative_threshold: float = 0.0005 663 664 peak_height_max_percent: float = 10 # 1-100 % used for baseline detection use 0.1 for second_derivative and 10 for other methods 665 666 peak_max_prominence_percent: float = 1 # 1-100 % used for baseline detection 667 668 min_peak_datapoints: float = 5 669 670 max_peak_width: float = 0.1 671 672 noise_threshold_method: str = "manual_relative_abundance" 673 674 noise_threshold_methods_implemented: tuple = ( 675 "auto_relative_abundance", 676 "manual_relative_abundance", 677 "second_derivative", 678 ) 679 680 std_noise_threshold: int = 3 681 682 peak_height_min_percent: float = 0.1 # 0-100 % used for peak detection 683 684 peak_min_prominence_percent: float = 0.1 # 0-100 % used for peak detection 685 686 eic_signal_threshold: float = ( 687 0.01 # 0-100 % used for extracted ion chromatogram peak detection 688 ) 689 690 max_rt_distance: float = ( 691 0.025 # minutes, max distance allowance hierarchical clutter 692 ) 693 694 verbose_processing: bool = True 695 696 def __post_init__(self): 697 # enforce datatype 698 for field in dataclasses.fields(self): 699 value = getattr(self, 
field.name) 700 if not isinstance(value, field.type): 701 value = field.type(value) 702 setattr(self, field.name, value)
Gas chromatograph processing settings class
Attributes
- use_deconvolution (bool, optional): If True, use deconvolution. Default is False.
- implemented_smooth_method (tuple, optional): Smoothing methods that can be implemented. Default is ('savgol', 'hanning', 'blackman', 'bartlett', 'flat', 'boxcar').
- smooth_window (int, optional): Window size for smoothing the ion chromatogram. Default is 5.
- smooth_method (str, optional): Smoothing method to use. Default is 'savgol'. Other options are 'hanning', 'blackman', 'bartlett', 'flat', 'boxcar'.
- savgol_pol_order (int, optional): Polynomial order for Savitzky-Golay smoothing. Default is 2.
- peak_derivative_threshold (float, optional): Threshold for defining derivative crossing. Should be a value between 0 and 1. Defaults to 0.0005.
- peak_height_max_percent (float, optional): Maximum height percentage used for baseline detection. Should be a value between 1 and 100. Defaults to 10.
- peak_max_prominence_percent (float, optional): Maximum prominence percentage used for baseline detection. Should be a value between 1 and 100. Defaults to 1.
- min_peak_datapoints (float, optional): Minimum number of data points used for peak detection. Should be a value between 0 and infinity. Defaults to 5.
- max_peak_width (float, optional): Maximum peak width used for peak detection. Should be a value between 0 and infinity. Defaults to 0.1.
- noise_threshold_method (str, optional): Method for detecting noise threshold. Default is 'manual_relative_abundance'.
- noise_threshold_methods_implemented (tuple, optional): Methods for detecting the noise threshold that are implemented. Default is ('auto_relative_abundance', 'manual_relative_abundance', 'second_derivative').
- std_noise_threshold (int, optional): Default is 3.
- peak_height_min_percent (float, optional): 0-100 % used for peak detection. Default is 0.1.
- peak_min_prominence_percent (float, optional): 0-100 % used for peak detection. Default is 0.1.
- eic_signal_threshold (float, optional): 0-100 % used for extracted ion chromatogram peak detection. Default is 0.01.
- max_rt_distance (float, optional): Maximum distance allowance for hierarchical cluster, in minutes. Default is 0.025.
- verbose_processing (bool, optional): If True, print verbose processing information. Default is True.
@dataclasses.dataclass
class CompoundSearchSettings:
    """Settings for compound search

    Attributes
    ----------
    url_database : str, optional
        URL for the database. When left at its default, the value of the
        SPECTRAL_GCMS_DATABASE_URL environment variable is used if it is set.
        An explicitly supplied URL is kept as given.
        Default is 'sqlite:///db/pnnl_lowres_gcms_compounds.sqlite'.
    ri_search_range : float, optional
        Retention index search range. Default is 35.
    rt_search_range : float, optional
        Retention time search range, in minutes; used for retention index
        calibration. Default is 1.0.
    correlation_threshold : float, optional
        Threshold for correlation for spectral similarity; used for
        calibration. Default is 0.5.
    score_threshold : float, optional
        Threshold for composite score. Default is 0.0.
    ri_spacing : float, optional
        Retention index spacing. Default is 200.
    ri_std : float, optional
        Retention index standard deviation. Default is 3.
    ri_calibration_compound_names : list, optional
        List of compound names to use for retention index calibration.
        An empty list is replaced by the thirteen methyl ester standards
        ['Methyl Caprylate', 'Methyl Caprate', 'Methyl Pelargonate',
        'Methyl Laurate', 'Methyl Myristate', 'Methyl Palmitate',
        'Methyl Stearate', 'Methyl Eicosanoate', 'Methyl Docosanoate',
        'Methyl Linocerate', 'Methyl Hexacosanoate', 'Methyl Octacosanoate',
        'Methyl Triacontanoate'].
    exploratory_mode : bool, optional
        If True, calculate and export all spectral similarity methods.
        Default is False.
    score_methods : tuple, optional
        Default is ('highest_sim_score', 'highest_ss').
    output_score_method : str, optional
        Default is 'All'.

    """

    # No credentials are embedded here: the effective default has always been
    # the environment variable or this sqlite URL (see __post_init__).
    url_database: str = "sqlite:///db/pnnl_lowres_gcms_compounds.sqlite"

    ri_search_range: float = 35

    rt_search_range: float = 1.0  # used for retention index calibration

    correlation_threshold: float = 0.5  # used for calibration, spectral similarity

    score_threshold: float = 0.0

    ri_spacing: float = 200

    ri_std: float = 3  # in standard deviation

    ri_calibration_compound_names: List = dataclasses.field(default_factory=list)

    # calculates and export all spectral similarity methods
    exploratory_mode: bool = False

    score_methods: tuple = ("highest_sim_score", "highest_ss")

    output_score_method: str = "All"

    def __post_init__(self):
        """Resolve defaults that depend on the environment, then enforce datatypes."""
        # Only consult the environment when the caller did not choose a
        # database; an explicit url_database argument must not be discarded.
        if self.url_database == type(self).url_database:
            self.url_database = os.getenv(
                "SPECTRAL_GCMS_DATABASE_URL",
                self.url_database,
            )

        # Fill in the standard calibration compounds only when none were
        # supplied, mirroring how the other settings classes treat their
        # mutable defaults.
        if not self.ri_calibration_compound_names:
            self.ri_calibration_compound_names = [
                "Methyl Caprylate",
                "Methyl Caprate",
                "Methyl Pelargonate",
                "Methyl Laurate",
                "Methyl Myristate",
                "Methyl Palmitate",
                "Methyl Stearate",
                "Methyl Eicosanoate",
                "Methyl Docosanoate",
                "Methyl Linocerate",
                "Methyl Hexacosanoate",
                "Methyl Octacosanoate",
                "Methyl Triacontanoate",
            ]

        # enforce datatype
        for field in dataclasses.fields(self):
            value = getattr(self, field.name)
            if not isinstance(value, field.type):
                value = field.type(value)
            setattr(self, field.name, value)
Settings for compound search
Attributes
- url_database (str, optional): URL for the database. Default is 'sqlite:///db/pnnl_lowres_gcms_compounds.sqlite'.
- ri_search_range (float, optional): Retention index search range. Default is 35.
- rt_search_range (float, optional): Retention time search range, in minutes. Default is 1.0.
- correlation_threshold (float, optional): Threshold for correlation for spectral similarity. Default is 0.5.
- score_threshold (float, optional): Threshold for composite score. Default is 0.0.
- ri_spacing (float, optional): Retention index spacing. Default is 200.
- ri_std (float, optional): Retention index standard deviation. Default is 3.
- ri_calibration_compound_names (list, optional): List of compound names to use for retention index calibration. Default is ['Methyl Caprylate', 'Methyl Caprate', 'Methyl Pelargonate', 'Methyl Laurate', 'Methyl Myristate', 'Methyl Palmitate', 'Methyl Stearate', 'Methyl Eicosanoate', 'Methyl Docosanoate', 'Methyl Linocerate', 'Methyl Hexacosanoate', 'Methyl Octacosanoate', 'Methyl Triacontanoate'].
class MolecularLookupDictSettings:
    """Settings used to generate the molecular formula database entries.

    These values define what gets written to the database; leave them
    untouched for an existing database and only adjust them when creating
    a new application database. Runtime search and database query options
    belong to the MolecularFormulaSearchSettings class.

    Attributes
    ----------
    usedAtoms : dict
        Atom symbol mapped to a (min, max) count range. Default is
        {'C': (1, 90), 'H': (4, 200), 'O': (0, 12), 'N': (0, 0), 'S': (0, 0), 'P': (0, 0), 'Cl': (0, 0)}.
    min_mz : int
        Minimum m/z to use for searching. Default is 50.
    max_mz : int
        Maximum m/z to use for searching. Default is 1200.
    min_dbe : int
        Minimum double bond equivalent to use for searching. Default is 0.
    max_dbe : int
        Maximum double bond equivalent to use for searching. Default is 50.
    use_pah_line_rule : bool
        If True, use the PAH line rule. Default is False.
    isRadical : bool
        If True, search for radical ions. Default is True.
    isProtonated : bool
        If True, search for protonated ions. Default is True.
    url_database : str or None
        URL for the database. Default is None.
    db_jobs : int
        Number of jobs to use for database queries. Default is 1.
    used_atom_valences : dict
        Atom symbol mapped to its valence. Default is
        {'C': 4, '13C': 4, 'H': 1, 'O': 2, '18O': 2, 'N': 3, 'S': 2, '34S': 2, 'P': 3, 'Cl': 1, '37Cl': 1, 'Br': 1, 'Na': 1, 'F': 1, 'K': 0}.

    """

    # NOTE: the values below drive database generation.
    #   * Do NOT change them for an existing database.
    #   * DO change them when creating a new application database.
    #   * For runtime search settings and database query checks use the
    #     MolecularFormulaSearchSettings class instead.
    #
    # C, H, N, O, S and P must ALWAYS be present in usedAtoms; to leave one of
    # them out of the search set both its min and max to 0.
    # Any atom listed in the Atoms class (corems.encapsulation.constant) may be
    # added, as long as its selected covalence is also added to
    # used_atom_valences.
    # NOTE: adduct atoms have zero covalence.
    # NOTE: instance attributes are used instead of static (class-level)
    # variables because this class is distributed using multiprocessing.
    def __init__(self) -> None:
        # heteroatoms that are switched off by default all share a (0, 0) range
        atom_ranges = {"C": (1, 90), "H": (4, 200), "O": (0, 12)}
        atom_ranges.update(dict.fromkeys(("N", "S", "P", "Cl"), (0, 0)))
        self.usedAtoms = atom_ranges

        self.min_mz, self.max_mz = 50, 1200
        self.min_dbe, self.max_dbe = 0, 50

        # overwrites the dbe limits above to DBE = (C + heteroatoms) * 0.9
        self.use_pah_line_rule = False

        self.isRadical = self.isProtonated = True

        self.url_database = None
        self.db_jobs = 1

        # one row per element, isotopes next to their parent element
        self.used_atom_valences = {
            "C": 4, "13C": 4,
            "H": 1,
            "O": 2, "18O": 2,
            "N": 3,
            "S": 2, "34S": 2,
            "P": 3,
            "Cl": 1, "37Cl": 1,
            "Br": 1,
            "Na": 1,
            "F": 1,
            "K": 0,
        }
Settings for molecular searching
These are used to generate the database entries, do not change.
Attributes
- usedAtoms (dict, optional): Dictionary of atoms and ranges. Default is {'C': (1, 90), 'H': (4, 200), 'O': (0, 12), 'N': (0, 0), 'S': (0, 0), 'P': (0, 0), 'Cl': (0, 0)}.
- min_mz (float, optional): Minimum m/z to use for searching. Default is 50.0.
- max_mz (float, optional): Maximum m/z to use for searching. Default is 1200.0.
- min_dbe (float, optional): Minimum double bond equivalent to use for searching. Default is 0.
- max_dbe (float, optional): Maximum double bond equivalent to use for searching. Default is 50.
- use_pah_line_rule (bool, optional): If True, use the PAH line rule. Default is False.
- isRadical (bool, optional): If True, search for radical ions. Default is True.
- isProtonated (bool, optional): If True, search for protonated ions. Default is True.
- url_database (str, optional): URL for the database. Default is None.
- db_jobs (int, optional): Number of jobs to use for database queries. Default is 1.
- used_atom_valences (dict, optional): Dictionary of atoms and valences. Default is {'C': 4, '13C': 4, 'H': 1, 'O': 2, '18O': 2, 'N': 3, 'S': 2, '34S': 2, 'P': 3, 'Cl': 1, '37Cl': 1, 'Br': 1, 'Na': 1, 'F': 1, 'K': 0}.
@dataclasses.dataclass
class MolecularFormulaSearchSettings:
    """Settings for molecular searching

    Attributes
    ----------
    use_isotopologue_filter : bool, optional
        If True, use isotopologue filter. Default is False.
    isotopologue_filter_threshold : float, optional
        Threshold for isotopologue filter. Default is 33.
    isotopologue_filter_atoms : tuple, optional
        Tuple of atoms to use for isotopologue filter. Default is ('Cl', 'Br').
    use_runtime_kendrick_filter : bool, optional
        If True, use runtime Kendrick filter. Default is False.
    use_min_peaks_filter : bool, optional
        If True, use minimum peaks filter. Default is True.
    min_peaks_per_class : int, optional
        Minimum number of peaks per class. Default is 15.
    url_database : str, optional
        URL for the database. Default is 'postgresql+psycopg2://coremsappdb:coremsapppnnl@localhost:5432/coremsapp'.
        When an empty value (None or '') is given, the COREMS_DATABASE_URL environment variable is used,
        falling back to 'sqlite:///db/molformula.db'.
    db_jobs : int, optional
        Number of jobs to use for database queries. Default is 3.
    db_chunk_size : int, optional
        Chunk size to use for database queries. Default is 300.
    ion_charge : int, optional
        Ion charge. Default is -1.
    min_hc_filter : float, optional
        Minimum hydrogen to carbon ratio. Default is 0.3.
    max_hc_filter : float, optional
        Maximum hydrogen to carbon ratio. Default is 3.
    min_oc_filter : float, optional
        Minimum oxygen to carbon ratio. Default is 0.0.
    max_oc_filter : float, optional
        Maximum oxygen to carbon ratio. Default is 1.2.
    min_op_filter : float, optional
        Minimum oxygen to phosphorous ratio. Default is 2.
    use_pah_line_rule : bool, optional
        If True, use the PAH line rule. Default is False.
    min_dbe : float, optional
        Minimum double bond equivalent to use for searching. Default is 0.
    max_dbe : float, optional
        Maximum double bond equivalent to use for searching. Default is 40.
    mz_error_score_weight : float, optional
        Weight for m/z error score to contribute to composite score. Default is 0.6.
    isotopologue_score_weight : float, optional
        Weight for isotopologue score to contribute to composite score. Default is 0.4.
    adduct_atoms_neg : tuple, optional
        Tuple of atoms to use in negative polarity. Default is ('Cl', 'Br').
    adduct_atoms_pos : tuple, optional
        Tuple of atoms to use in positive polarity. Default is ('Na', 'K').
    score_methods : tuple, optional
        Tuple of score method that can be implemented.
        Default is ('S_P_lowest_error', 'N_S_P_lowest_error', 'lowest_error', 'prob_score', 'air_filter_error', 'water_filter_error', 'earth_filter_error').
    score_method : str, optional
        Score method to use. Default is 'prob_score'. Options are 'S_P_lowest_error', 'N_S_P_lowest_error', 'lowest_error', 'prob_score', 'air_filter_error', 'water_filter_error', 'earth_filter_error'.
    output_min_score : float, optional
        Minimum score for output. Default is 0.1.
    output_score_method : str, optional
        Score method to use for output. Default is 'All Candidates'.
    isRadical : bool, optional
        If True, search for radical ions. Default is False.
    isProtonated : bool, optional
        If True, search for protonated ions. Default is True.
    isAdduct : bool, optional
        If True, search for adduct ions. Default is False.
    usedAtoms : dict, optional
        Dictionary of atoms and ranges. Default is an empty dict; after initialization
        'C': (1, 100) and 'H': (1, 200) are added whenever those keys are missing.
    ion_types_excluded : list, optional
        List of ion types to exclude from molecular id search, commonly ['[M+CH3COO]-'] or ['[M+COOH]-'] depending on mobile phase content. Default is [].
    ionization_type : str, optional
        Ionization type. Default is 'ESI'.
    min_ppm_error : float, optional
        Minimum ppm error. Default is -10.0.
    max_ppm_error : float, optional
        Maximum ppm error. Default is 10.0.
    min_abun_error : float, optional
        Minimum abundance error for isotopologue search. Default is -100.0.
    max_abun_error : float, optional
        Maximum abundance error for isotopologue search. Default is 100.0.
    mz_error_range : float, optional
        m/z error range. Default is 1.5.
    error_method : str, optional
        Error method. Default is 'None'. Options are 'distance', 'lowest', 'symmetrical', 'average', 'None'.
    mz_error_average : float, optional
        m/z error average. Default is 0.0.
    used_atom_valences : dict, optional
        Dictionary of atoms and valences. Default is an empty dict; after initialization every atom
        of Atoms.atoms_covalence that is not already a key is added with its covalence
        (the first listed value when the covalence is not a single int).
    verbose_processing: bool, optional
        If True, print verbose processing information. Default is True.
    """

    # NOTE: field order defines the positional order of the generated __init__;
    # do not reorder.
    verbose_processing: bool = True

    use_isotopologue_filter: bool = False

    isotopologue_filter_threshold: float = 33

    isotopologue_filter_atoms: tuple = ("Cl", "Br")

    use_runtime_kendrick_filter: bool = False

    use_min_peaks_filter: bool = True

    min_peaks_per_class: int = 15

    # NOTE(review): this default embeds database credentials in source;
    # consider supplying the URL through COREMS_DATABASE_URL instead.
    url_database: str = (
        "postgresql+psycopg2://coremsappdb:coremsapppnnl@localhost:5432/coremsapp"
    )

    db_jobs: int = 3

    db_chunk_size: int = 300

    # query setting========
    ion_charge: int = -1

    min_hc_filter: float = 0.3

    max_hc_filter: float = 3

    min_oc_filter: float = 0.0

    max_oc_filter: float = 1.2

    min_op_filter: float = 2

    use_pah_line_rule: bool = False

    min_dbe: float = 0

    max_dbe: float = 40

    mz_error_score_weight: float = 0.6

    isotopologue_score_weight: float = 0.4

    # look for close shell ions [M + Adduct]+ only considers metal set in the list adduct_atoms
    adduct_atoms_neg: tuple = ("Cl", "Br")

    adduct_atoms_pos: tuple = ("Na", "K")

    score_methods: tuple = (
        "S_P_lowest_error",
        "N_S_P_lowest_error",
        "lowest_error",
        "prob_score",
        "air_filter_error",
        "water_filter_error",
        "earth_filter_error",
    )

    score_method: str = "prob_score"

    output_min_score: float = 0.1

    output_score_method: str = "All Candidates"

    # depending on the polarity mode it looks for [M].+ , [M].-
    # query and automatically compile add entry if it doesn't exist

    isRadical: bool = False

    # depending on the polarity mode it looks for [M + H]+ , [M - H]-
    # query and automatically compile and push options if it doesn't exist
    isProtonated: bool = True

    isAdduct: bool = False

    usedAtoms: dict = dataclasses.field(default_factory=dict)
    ion_types_excluded: list = dataclasses.field(default_factory=list)

    # search setting ========

    ionization_type: str = "ESI"

    # empirically set / needs optimization
    min_ppm_error: float = -10.0  # ppm

    # empirically set / needs optimization
    max_ppm_error: float = 10.0  # ppm

    # empirically set / needs optimization set for isotopologue search
    min_abun_error: float = -100.0  # percentage

    # empirically set / needs optimization set for isotopologue search
    max_abun_error: float = 100.0  # percentage

    # empirically set / needs optimization
    mz_error_range: float = 1.5

    # 'distance', 'lowest', 'symmetrical', 'average', 'None'
    error_method: str = "None"

    mz_error_average: float = 0.0

    # maps atom symbol to valence, e.g. {'C': 4, 'H': 1}
    used_atom_valences: dict = dataclasses.field(default_factory=dict)

    def __post_init__(self):
        """Normalize the field values right after dataclass initialization.

        Resolves an empty database URL, coerces every field to its annotated
        type, guarantees C and H ranges in ``usedAtoms`` and completes
        ``used_atom_valences`` from ``Atoms.atoms_covalence``.
        """
        # None and "" are both falsy, so a single truthiness test covers them
        if not self.url_database:
            self.url_database = os.getenv(
                "COREMS_DATABASE_URL", "sqlite:///db/molformula.db"
            )

        # enforce datatype: cast any value that is not an instance of the
        # annotated type by calling that type on it
        for spec in dataclasses.fields(self):
            value = getattr(self, spec.name)
            if not isinstance(value, spec.type):
                setattr(self, spec.name, spec.type(value))

        # enforce C and H if either does not exist
        self.usedAtoms.setdefault("C", (1, 100))
        self.usedAtoms.setdefault("H", (1, 200))

        # add common values: fill in the covalence of every known atom the
        # caller did not provide explicitly
        for atom, covalence in Atoms.atoms_covalence.items():
            if atom in self.used_atom_valences:
                continue
            if isinstance(covalence, int):
                self.used_atom_valences[atom] = covalence
            else:
                # take the first of all possible covalences, which should be
                # the most common one
                self.used_atom_valences[atom] = covalence[0]
Settings for molecular searching
Attributes
- use_isotopologue_filter (bool, optional): If True, use isotopologue filter. Default is False.
- isotopologue_filter_threshold (float, optional): Threshold for isotopologue filter. Default is 33.
- isotopologue_filter_atoms (tuple, optional): Tuple of atoms to use for isotopologue filter. Default is ('Cl', 'Br').
- use_runtime_kendrick_filter (bool, optional): If True, use runtime Kendrick filter. Default is False.
- use_min_peaks_filter (bool, optional): If True, use minimum peaks filter. Default is True.
- min_peaks_per_class (int, optional): Minimum number of peaks per class. Default is 15.
- url_database (str, optional): URL for the database. Default is 'postgresql+psycopg2://coremsappdb:coremsapppnnl@localhost:5432/coremsapp'.
- db_jobs (int, optional): Number of jobs to use for database queries. Default is 3.
- db_chunk_size (int, optional): Chunk size to use for database queries. Default is 300.
- ion_charge (int, optional): Ion charge. Default is -1.
- min_hc_filter (float, optional): Minimum hydrogen to carbon ratio. Default is 0.3.
- max_hc_filter (float, optional): Maximum hydrogen to carbon ratio. Default is 3.
- min_oc_filter (float, optional): Minimum oxygen to carbon ratio. Default is 0.0.
- max_oc_filter (float, optional): Maximum oxygen to carbon ratio. Default is 1.2.
- min_op_filter (float, optional): Minimum oxygen to phosphorous ratio. Default is 2.
- use_pah_line_rule (bool, optional): If True, use the PAH line rule. Default is False.
- min_dbe (float, optional): Minimum double bond equivalent to use for searching. Default is 0.
- max_dbe (float, optional): Maximum double bond equivalent to use for searching. Default is 40.
- mz_error_score_weight (float, optional): Weight for m/z error score to contribute to composite score. Default is 0.6.
- isotopologue_score_weight (float, optional): Weight for isotopologue score to contribute to composite score. Default is 0.4.
- adduct_atoms_neg (tuple, optional): Tuple of atoms to use in negative polarity. Default is ('Cl', 'Br').
- adduct_atoms_pos (tuple, optional): Tuple of atoms to use in positive polarity. Default is ('Na', 'K').
- score_methods (tuple, optional): Tuple of score method that can be implemented. Default is ('S_P_lowest_error', 'N_S_P_lowest_error', 'lowest_error', 'prob_score', 'air_filter_error', 'water_filter_error', 'earth_filter_error').
- score_method (str, optional): Score method to use. Default is 'prob_score'. Options are 'S_P_lowest_error', 'N_S_P_lowest_error', 'lowest_error', 'prob_score', 'air_filter_error', 'water_filter_error', 'earth_filter_error'.
- output_min_score (float, optional): Minimum score for output. Default is 0.1.
- output_score_method (str, optional): Score method to use for output. Default is 'All Candidates'.
- isRadical (bool, optional): If True, search for radical ions. Default is False.
- isProtonated (bool, optional): If True, search for protonated ions. Default is True.
- isAdduct (bool, optional): If True, search for adduct ions. Default is False.
- usedAtoms (dict, optional): Dictionary of atoms and ranges. Default is an empty dict; after initialization 'C': (1, 100) and 'H': (1, 200) are added whenever those keys are missing.
- ion_types_excluded (list, optional): List of ion types to exclude from molecular id search, commonly ['[M+CH3COO]-'] or ['[M+COOH]-'] depending on mobile phase content. Default is [].
- ionization_type (str, optional): Ionization type. Default is 'ESI'.
- min_ppm_error (float, optional): Minimum ppm error. Default is -10.0.
- max_ppm_error (float, optional): Maximum ppm error. Default is 10.0.
- min_abun_error (float, optional): Minimum abundance error for isotopologue search. Default is -100.0.
- max_abun_error (float, optional): Maximum abundance error for isotopologue search. Default is 100.0.
- mz_error_range (float, optional): m/z error range. Default is 1.5.
- error_method (str, optional): Error method. Default is 'None'. Options are 'distance', 'lowest', 'symmetrical', 'average', 'None'.
- mz_error_average (float, optional): m/z error average. Default is 0.0.
- used_atom_valences (dict, optional): Dictionary of atoms and valences. Default is an empty dict; after initialization every atom of Atoms.atoms_covalence that is not already a key is added with its covalence (the first listed value when the covalence is not a single int).
- verbose_processing (bool, optional): If True, print verbose processing information. Default is True.