corems.encapsulation.factory.processingSetting

  1__author__ = "Yuri E. Corilo"
  2__date__ = "Jul 02, 2019"
  3
  4import dataclasses
  5import os
  6from typing import List, Dict
  7
  8from corems.encapsulation.constant import Atoms, Labels
  9
 10
 11@dataclasses.dataclass
 12class TransientSetting:
 13    """Transient processing settings class
 14
 15    Attributes
 16    ----------
 17    implemented_apodization_function : tuple
 18        Available apodization functions
 19    apodization_method : str
 20        Apodization function to use. Hanning is a good default for Fourier transform magnitude mode. For absorption mode processing, Half-Sine or Half-Kaiser may be more appropriate.
 21    number_of_truncations : int
 22        How many times to truncate the transient prior to Fourier transform
 23    number_of_zero_fills : int
 24        How many times to zero fille the transient prior to Fourier transform.
 25    next_power_of_two : bool
 26        If True, zero fill to the next power of two after the new length of len(transient)+(number_of_zero_fills*len(transient)).
 27    kaiser_beta : float
 28        Beta parameter for Kaiser or Half-Kaiser apodisation function. 0 is rectangular,  5 is similar to Hamming,
 29        6 is similar to hanning, and 8.6 is similar to Blackman (from numpy docs)
 30
 31    """
 32
 33    implemented_apodization_function: tuple = (
 34        "Hamming",
 35        "Hanning",
 36        "Blackman",
 37        "Full-Sine",
 38        "Half-Sine",
 39        "Kaiser",
 40        "Half-Kaiser",
 41    )
 42    apodization_method: str = "Hanning"
 43    number_of_truncations: int = 0
 44    number_of_zero_fills: int = 1
 45    next_power_of_two: bool = False
 46    kaiser_beta: float = 8.6
 47
 48    def __post_init__(self):
 49        # enforce datatype
 50        for field in dataclasses.fields(self):
 51            value = getattr(self, field.name)
 52            if not isinstance(value, field.type):
 53                value = field.type(value)
 54                setattr(self, field.name, value)
 55
 56
 57@dataclasses.dataclass
 58class DataInputSetting:
 59    """Data input settings class
 60
 61    Attributes
 62    ----------
 63    header_translate : dict
 64        Dictionary with the header labels to be translated to the corems labels. For example, {'m/z':'m/z', 'Resolving Power':'Resolving Power', 'Abundance':'Abundance' , 'S/N':'S/N'}
 65    """
 66
 67    # add to this dict the VALUES to match your labels, THE ORDER WON"T MATTER
 68    # "column_translate" : {"m/z":"m/z", "Resolving Power":"Resolving Power", "Abundance":"Abundance" , "S/N":"S/N"}
 69    header_translate: dict = dataclasses.field(default_factory=dict)
 70
 71    def __post_init__(self):
 72        self.header_translate = {
 73            "m/z": Labels.mz,
 74            "mOz": Labels.mz,
 75            "Mass": Labels.mz,
 76            "Resolving Power": Labels.rp,
 77            "Res.": Labels.rp,
 78            "resolution": Labels.rp,
 79            "Intensity": Labels.abundance,
 80            "Peak Height": Labels.abundance,
 81            "I": Labels.abundance,
 82            "Abundance": Labels.abundance,
 83            "abs_abu": Labels.abundance,
 84            "Signal/Noise": Labels.s2n,
 85            "S/N": Labels.s2n,
 86            "sn": Labels.s2n,
 87        }
 88
 89    def add_mz_label(self, label):
 90        """Add a label to the header_translate dictionary to be translated to the corems label for mz."""
 91        self.header_translate[label] = Labels.mz
 92
 93    def add_peak_height_label(self, label):
 94        """Add a label to the header_translate dictionary to be translated to the corems label for peak height."""
 95
 96        self.header_translate[label] = Labels.abundance
 97
 98    def add_sn_label(self, label):
 99        """Add a label to the header_translate dictionary to be translated to the corems label for signal to noise."""
100        self.header_translate[label] = Labels.s2n
101
102    def add_resolving_power_label(self, label):
103        """Add a label to the header_translate dictionary to be translated to the corems label for resolving power."""
104        self.header_translate[label] = Labels.rp
105
106
107@dataclasses.dataclass
108class LiquidChromatographSetting:
109    """Liquid chromatograph processing settings class
110
111    Attributes
112    ----------
113    scans : list or tuple, optional
114        List of select scan to average or a tuple containing the range to average. Default is (0, 1).
115    eic_tolerance_ppm : float, optional
116        Mass tolerance in ppm for extracted ion chromatogram peak detection. Default is 5.
117    correct_eic_baseline : bool, optional
118        If True, correct the baseline of the extracted ion chromatogram. Default is True.
119    smooth_window : int, optional
120        Window size for smoothing the ion chromatogram (extracted or total). Default is 5.
121    smooth_method : str, optional
122        Smoothing method to use. Default is 'savgol'. Other options are 'hanning', 'blackman', 'bartlett', 'flat', 'boxcar'.
123    implemented_smooth_method : tuple, optional
124        Smoothing methods that can be implemented. Values are ('savgol', 'hanning', 'blackman', 'bartlett', 'flat', 'boxcar').
125    savgol_pol_order : int, optional
126        Polynomial order for Savitzky-Golay smoothing. Default is 2.
127    peak_height_max_percent : float, optional
128        1-100 % used for baseline detection use 0.1 for second_derivative and 10 for other methods. Default is 10.
129    peak_max_prominence_percent : float, optional
130        1-100 % used for baseline detection. Default is 1.
131    peak_derivative_threshold : float, optional
132        Threshold for defining derivative crossing. Default is 0.0005.
133    min_peak_datapoints : float, optional
134        minimum data point to define a chromatografic peak. Default is 5.
135    noise_threshold_method : str, optional
136        Method for detecting noise threshold. Default is 'manual_relative_abundance'.
137    noise_threshold_methods_implemented : tuple, optional
138        Methods for detected noise threshold that can be implemented. Default is ('auto_relative_abundance', 'manual_relative_abundance', 'second_derivative').
139    peak_height_min_percent : float, optional
140        0-100 % used for peak detection. Default is 0.1.
141    eic_signal_threshold : float, optional
142        0-100 % used for extracted ion chromatogram peak detection. Default is 0.01.
143    eic_buffer_time : float, optional
144        Buffer time to add to the start and end of the plot of the extracted ion chromatogram, in minutes. Default is 1.5.
145    ph_smooth_it : int, optional
146        Number of iterations to use for smoothing prior to finding mass features.
147        Called within the PHCalculations.find_mass_features_ph() method. Default is 7.
148    ph_smooth_radius_mz : int, optional
149        Radius in m/z steps (not daltons) for smoothing prior to finding mass features.
150        Called within the PHCalculations.find_mass_features_ph() method. Default is 0.
151    ph_smooth_radius_scan : int, optional
152        Radius in scan steps for smoothing prior to finding mass features.
153        Called within the PHCalculations.find_mass_features_ph() method. Default is 3.
154    ph_inten_min_rel : int, optional
155        Relative minimum intensity to use for finding mass features.
156        Calculated as a fraction of the maximum intensity of the unprocessed profile data (mz, scan).
157        Called within the PH_Calculations.find_mass_features() method. Default is 0.001.
158    ph_persis_min_rel : int, optional
159        Relative minimum persistence for retaining mass features.
160        Calculated as a fraction of the maximum intensity of the unprocessed profile data (mz, scan).
161        Should be greater to or equal to ph_inten_min_rel.
162        Called within the PH_Calculations.find_mass_features() method. Default is 0.001.
163    mass_feature_cluster_mz_tolerance_rel : float, optional
164        Relative m/z tolerance to use for clustering mass features.
165        Called with the PHCalculations.cluster_mass_features() and the LCCalculations.deconvolute_ms1_mass_features() methods.
166        Default is 5E-6 (5 ppm).
167    mass_feature_cluster_rt_tolerance : float, optional
168        Retention time tolerance to use for clustering mass features, in minutes.
169        Called with the PHCalculations.cluster_mass_features() and the LCCalculations.deconvolute_ms1_mass_features() methods.
170        Default is 0.2.
171    ms1_scans_to_average : int, optional
172        Number of MS1 scans to average for mass-feature associated m/zs.
173        Called within the LCMSBase.add_associated_ms1() method. Default is 1.
174    ms1_deconvolution_corr_min : float, optional
175        Minimum correlation to use for deconvoluting MS1 mass features.
176        Called within the LCCalculations.deconvolute_ms1_mass_features() method.
177        Default is 0.8.
178    ms2_dda_rt_tolerance : float, optional
179        Retention time tolerance to use for associating MS2 spectra to mass features, in minutes. Called within the LCMSBase.add_associated_ms2_dda() method. Default is 0.15.
180    ms2_dda_mz_tolerance : float, optional
181        Mass tolerance to use for associating MS2 spectra to mass features. Called within the LCMSBase.add_associated_ms2_dda() method. Default is 0.05.
182    ms2_min_fe_score : float, optional
183        Minimum flash entropy for retaining MS2 annotations. Called within the LCMSSpectralSearch.fe_search() method. Default is 0.2.
184    search_as_lipids : bool, optional
185        If True, prepare the database for lipid searching. Called within the LCMSSpectralSearch.fe_prep_search_db() method. Default is False.
186    include_fragment_types : bool, optional
187        If True, include fragment types in the database. Called within the LCMSSpectralSearch.fe_search() and related methods. Default is False.
188    verbose_processing : bool, optional
189        If True, print verbose processing information. Default is True.
190    """
191
192    scans: list | tuple = (-1, -1)
193
194    # Parameters used for generating EICs and performing 1D peak picking and EIC/TIC smoothing
195    eic_tolerance_ppm: float = 5
196    correct_eic_baseline = True
197    smooth_window: int = 5
198    smooth_method: str = "savgol"
199    implemented_smooth_method: tuple = (
200        "savgol",
201        "hanning",
202        "blackman",
203        "bartlett",
204        "flat",
205        "boxcar",
206    )
207    savgol_pol_order: int = 2
208    peak_height_max_percent: float = 10
209    peak_max_prominence_percent: float = 1
210    peak_derivative_threshold: float = 0.0005
211    min_peak_datapoints: float = 5
212    noise_threshold_method: str = "manual_relative_abundance"
213    noise_threshold_methods_implemented: tuple = (
214        "auto_relative_abundance",
215        "manual_relative_abundance",
216        "second_derivative",
217    )
218    peak_height_min_percent: float = 0.1
219    eic_signal_threshold: float = 0.01
220    eic_buffer_time = 1.5
221
222    # Parameters used for 2D peak picking
223    peak_picking_method: str = "persistent homology"
224    implemented_peak_picking_methods: tuple = ("persistent homology",)
225
226    # Parameters used in persistent homology calculations
227    ph_smooth_it = 1
228    ph_smooth_radius_mz = 0
229    ph_smooth_radius_scan = 1
230    ph_inten_min_rel = 0.001
231    ph_persis_min_rel = 0.001
232
233    # Parameters used to cluster mass features
234    mass_feature_cluster_mz_tolerance_rel: float = 5e-6
235    mass_feature_cluster_rt_tolerance: float = 0.3
236
237    # Parameters used in associating MS1 and MS2 spectra to LCMS mass features and deconvoluting MS1 mass features
238    ms1_scans_to_average: int = 1
239    ms1_deconvolution_corr_min: float = 0.8
240    ms2_dda_rt_tolerance: float = 0.15
241    ms2_dda_mz_tolerance: float = 0.05
242
243    # Parameters used for flash entropy searching and database preparation
244    ms2_min_fe_score: float = 0.2
245    search_as_lipids: bool = False
246    include_fragment_types: bool = False
247
248    # Parameters used for saving the data
249    export_profile_spectra: bool = False
250    export_eics: bool = True
251    export_unprocessed_ms1: bool = False
252
253    # Parameters used for verbose processing
254    verbose_processing: bool = True
255
256    def __post_init__(self):
257        # enforce datatype
258        for field in dataclasses.fields(self):
259            value = getattr(self, field.name)
260            if not isinstance(value, field.type):
261                value = field.type(value)
262                setattr(self, field.name, value)
263
264
@dataclasses.dataclass
class MassSpectrumSetting:
    """Settings that control mass spectrum processing.

    Attributes
    ----------
    noise_threshold_method : str, optional
        Noise threshold detection method. Default is 'log'.
    noise_threshold_methods_implemented : tuple, optional
        Noise threshold detection methods that can be selected.
        Default is ('minima', 'signal_noise', 'relative_abundance', 'absolute_abundance', 'log').
    noise_threshold_min_std : int, optional
        Minimum value for noise thresholding with the 'minima' method. Default is 6.
    noise_threshold_min_s2n : float, optional
        Minimum value for noise thresholding with the 'signal_noise' method. Default is 4.
    noise_threshold_min_relative_abundance : float, optional
        Minimum value for noise thresholding with the 'relative_abundance' method.
        This is a percentage (0-100). Default is 6 (6%).
    noise_threshold_absolute_abundance : float, optional
        Minimum value for noise thresholding with the 'absolute_abundance' method. Default is 1_000_000.
    noise_threshold_log_nsigma : int, optional
        Number of standard deviations used by the 'log' method. Default is 6.
    noise_threshold_log_nsigma_corr_factor : float, optional
        Correction factor used by the 'log' method (mFT is 0.463, aFT is 1.0). Default is 0.463.
    noise_threshold_log_nsigma_bins : int, optional
        Number of histogram bins used by the 'log' method. Default is 500.
    noise_min_mz : float, optional
        Minimum m/z used for noise thresholding. Default is 50.0.
    noise_max_mz : float, optional
        Maximum m/z used for noise thresholding. Default is 1200.0.
    min_picking_mz : float, optional
        Minimum m/z used for peak picking. Default is 50.0.
    max_picking_mz : float, optional
        Maximum m/z used for peak picking. Default is 1200.0.
    picking_point_extrapolate : int, optional
        Number of data points (in each direction) by which the mz axis is extrapolated and the
        abundance axis is 0 padded. Default is 3. 3 is recommended for reduced profile data or
        if peak picking faults; 0 keeps the normal behaviour.
    calib_minimize_method : str, optional
        Minimization method used for calibration. Default is 'Powell'.
    calib_pol_order : int, optional
        Polynomial order used for calibration. Default is 2.
    max_calib_ppm_error : float, optional
        Maximum ppm error used for calibration. Default is 1.0.
    min_calib_ppm_error : float, optional
        Minimum ppm error used for calibration. Default is -1.0.
    calib_sn_threshold : float, optional
        Signal to noise threshold used for calibration. Default is 2.0.
    calibration_ref_match_method : str, optional
        Method for matching reference masses with measured masses for recalibration. Default is 'legacy'.
    calibration_ref_match_method_implemented : tuple, optional
        Reference matching methods that can be selected. Default is ('legacy', 'merged').
    calibration_ref_match_tolerance : float, optional
        Initial matching tolerance when the new calibration reference mass matching method is used.
        Default is 0.003.
    calibration_ref_match_std_raw_error_limit : float, optional
        Default is 1.5.
    do_calibration : bool, optional
        If True, perform calibration. Default is True.
    verbose_processing : bool, optional
        If True, print verbose processing information. Default is True.
    """

    # --- noise thresholding -------------------------------------------------
    noise_threshold_method: str = "log"
    noise_threshold_methods_implemented: tuple = ("minima", "signal_noise", "relative_abundance", "absolute_abundance", "log")
    # 'minima' method
    noise_threshold_min_std: int = 6
    # 'signal_noise' method
    noise_threshold_min_s2n: float = 4
    # 'relative_abundance' method, expressed on a 0-100 scale
    noise_threshold_min_relative_abundance: float = 6
    # 'absolute_abundance' method
    noise_threshold_absolute_abundance: float = 1_000_000
    # 'log' method: n sigma, correction factor (mFT 0.463, aFT 1.0), histogram bins
    noise_threshold_log_nsigma: int = 6
    noise_threshold_log_nsigma_corr_factor: float = 0.463
    noise_threshold_log_nsigma_bins: int = 500
    noise_min_mz: float = 50.0
    noise_max_mz: float = 1200.0

    # --- peak picking -------------------------------------------------------
    min_picking_mz: float = 50.0
    max_picking_mz: float = 1200.0
    # Data points (each direction) to extrapolate the mz axis / 0 pad the abundance axis;
    # fixes peak picking at the spectrum limits. 0 keeps normal behaviour, 3 is typical.
    picking_point_extrapolate: int = 3

    # --- calibration --------------------------------------------------------
    calib_minimize_method: str = "Powell"
    calib_pol_order: int = 2
    max_calib_ppm_error: float = 1.0
    min_calib_ppm_error: float = -1.0
    calib_sn_threshold: float = 2.0
    calibration_ref_match_method: str = "legacy"
    calibration_ref_match_method_implemented: tuple = ("legacy", "merged")
    calibration_ref_match_tolerance: float = 0.003
    calibration_ref_match_std_raw_error_limit: float = 1.5
    do_calibration: bool = True

    verbose_processing: bool = True

    def __post_init__(self):
        """Cast every field whose value is not an instance of its annotated type."""
        for spec in dataclasses.fields(self):
            current = getattr(self, spec.name)
            if isinstance(current, spec.type):
                continue
            setattr(self, spec.name, spec.type(current))
378
379
@dataclasses.dataclass
class MassSpecPeakSetting:
    """Mass spectrum peak processing settings class

    Attributes
    ----------
    kendrick_base : Dict, optional
        Dictionary specifying the elements and their counts in the Kendrick base.
        Defaults to {'C': 1, 'H': 2}. An empty or missing value also falls back to this default.
    kendrick_rounding_method : str, optional
        Method for calculating the nominal Kendrick mass. Valid values are 'floor', 'ceil', or 'round'.
        Defaults to 'floor'.
    implemented_kendrick_rounding_methods : tuple
        Tuple of valid rounding methods for calculating the nominal Kendrick mass.
        Defaults to ('floor', 'ceil', 'round').
    peak_derivative_threshold : float, optional
        Threshold for defining derivative crossing. Should be a value between 0 and 1.
        Defaults to 0.0.
    peak_min_prominence_percent : float, optional
        Minimum prominence percentage used for peak detection. Should be a value between 1 and 100.
        Defaults to 0.1.
    min_peak_datapoints : float, optional
        Minimum number of data points used for peak detection. Should be a value between 0 and infinity.
        Defaults to 5.
    peak_max_prominence_percent : float, optional
        Maximum prominence percentage used for baseline detection. Should be a value between 1 and 100.
        Defaults to 0.1.
    peak_height_max_percent : float, optional
        Maximum height percentage used for baseline detection. Should be a value between 1 and 100.
        Defaults to 10.
    legacy_resolving_power : bool, optional
        Flag indicating whether to use the legacy (CoreMS v1) resolving power calculation.
        Defaults to True.
    legacy_centroid_polyfit : bool, optional
        Use legacy (numpy polyfit) to fit centroid.
        Defaults to False.
    """

    kendrick_base: Dict = dataclasses.field(default_factory=dict)

    kendrick_rounding_method: str = "floor"  # 'floor', 'ceil' or 'round' are valid methods for calculating nominal kendrick mass

    implemented_kendrick_rounding_methods: tuple = ("floor", "ceil", "round")

    peak_derivative_threshold: float = 0.0  # define derivative crossing threshold 0-1

    peak_min_prominence_percent: float = 0.1  # 1-100 % used for peak detection

    min_peak_datapoints: float = 5  # 0-inf used for peak detection

    peak_max_prominence_percent: float = 0.1  # 1-100 % used for baseline detection

    peak_height_max_percent: float = 10  # 1-100 % used for baseline detection

    legacy_resolving_power: bool = (
        True  # Use the legacy (CoreMS v1) resolving power calculation (True)
    )

    legacy_centroid_polyfit: bool = False

    def __post_init__(self):
        """Apply the CH2 default Kendrick base and cast fields to their annotated types."""
        from typing import get_origin

        # default to CH2
        if not self.kendrick_base:
            self.kendrick_base = {"C": 1, "H": 2}
        # enforce datatype
        for field in dataclasses.fields(self):
            value = getattr(self, field.name)
            if isinstance(value, field.type):
                continue
            # typing aliases such as ``Dict`` cannot be instantiated; cast with the
            # runtime class behind the alias (``dict``). Plain types have no origin.
            caster = get_origin(field.type) or field.type
            setattr(self, field.name, caster(value))
450
451
@dataclasses.dataclass
class GasChromatographSetting:
    """Settings that control gas chromatograph processing.

    Attributes
    ----------
    use_deconvolution : bool, optional
        If True, use deconvolution. Default is False.
    implemented_smooth_method : tuple, optional
        Smoothing methods that can be selected.
        Default is ('savgol', 'hanning', 'blackman', 'bartlett', 'flat', 'boxcar').
    smooth_window : int, optional
        Window size used to smooth the ion chromatogram. Default is 5.
    smooth_method : str, optional
        Smoothing method. Default is 'savgol'; other options are 'hanning', 'blackman',
        'bartlett', 'flat', 'boxcar'.
    savgol_pol_order : int, optional
        Polynomial order for Savitzky-Golay smoothing. Default is 2.
    peak_derivative_threshold : float, optional
        Threshold for defining derivative crossing, a value between 0 and 1. Default is 0.0005.
    peak_height_max_percent : float, optional
        Maximum height percentage (1-100) used for baseline detection; use 0.1 for
        second_derivative and 10 for other methods. Default is 10.
    peak_max_prominence_percent : float, optional
        Maximum prominence percentage (1-100) used for baseline detection. Default is 1.
    min_peak_datapoints : float, optional
        Minimum number of data points used for peak detection (0 to infinity). Default is 5.
    max_peak_width : float, optional
        Maximum peak width used for peak detection (0 to infinity). Default is 0.1.
    noise_threshold_method : str, optional
        Noise threshold detection method. Default is 'manual_relative_abundance'.
    noise_threshold_methods_implemented : tuple, optional
        Noise threshold detection methods that can be selected.
        Default is ('auto_relative_abundance', 'manual_relative_abundance', 'second_derivative').
    std_noise_threshold : int, optional
        Default is 3.
    peak_height_min_percent : float, optional
        0-100 % used for peak detection. Default is 0.1.
    peak_min_prominence_percent : float, optional
        0-100 % used for peak detection. Default is 0.1.
    eic_signal_threshold : float, optional
        0-100 % used for extracted ion chromatogram peak detection. Default is 0.01.
    max_rt_distance : float, optional
        Maximum distance allowance for hierarchical cluster, in minutes. Default is 0.025.
    verbose_processing : bool, optional
        If True, print verbose processing information. Default is True.
    """

    use_deconvolution: bool = False

    # --- smoothing ----------------------------------------------------------
    implemented_smooth_method: tuple = ("savgol", "hanning", "blackman", "bartlett", "flat", "boxcar")
    smooth_window: int = 5
    smooth_method: str = "savgol"
    savgol_pol_order: int = 2

    # --- baseline / peak shape ----------------------------------------------
    peak_derivative_threshold: float = 0.0005
    # 1-100 %, baseline detection: 0.1 for second_derivative, 10 for other methods
    peak_height_max_percent: float = 10
    # 1-100 %, baseline detection
    peak_max_prominence_percent: float = 1
    min_peak_datapoints: float = 5
    max_peak_width: float = 0.1

    # --- noise --------------------------------------------------------------
    noise_threshold_method: str = "manual_relative_abundance"
    noise_threshold_methods_implemented: tuple = ("auto_relative_abundance", "manual_relative_abundance", "second_derivative")
    std_noise_threshold: int = 3

    # --- peak detection (all 0-100 %) ---------------------------------------
    peak_height_min_percent: float = 0.1
    peak_min_prominence_percent: float = 0.1
    # extracted ion chromatogram peak detection
    eic_signal_threshold: float = 0.01

    # minutes, maximum distance allowance for hierarchical clustering
    max_rt_distance: float = 0.025

    verbose_processing: bool = True

    def __post_init__(self):
        """Cast every field whose value is not an instance of its annotated type."""
        for spec in dataclasses.fields(self):
            current = getattr(self, spec.name)
            if isinstance(current, spec.type):
                continue
            setattr(self, spec.name, spec.type(current))
559
560
@dataclasses.dataclass
class CompoundSearchSettings:
    """Settings for compound search

    Attributes
    ----------
    url_database : str, optional
        URL for the database. When left empty (the default) it is read from the
        SPECTRAL_GCMS_DATABASE_URL environment variable, falling back to
        'sqlite:///db/pnnl_lowres_gcms_compounds.sqlite'. A value passed explicitly is kept as is.
    ri_search_range : float, optional
        Retention index search range. Default is 35.
    rt_search_range : float, optional
        Retention time search range, in minutes. Used for retention index calibration. Default is 1.0.
    correlation_threshold : float, optional
        Threshold for correlation for spectral similarity. Default is 0.5.
    score_threshold : float, optional
        Threshold for composite score. Default is 0.0.
    ri_spacing : float, optional
        Retention index spacing. Default is 200.
    ri_std : float, optional
        Retention index standard deviation. Default is 3.
    ri_calibration_compound_names : list, optional
        List of compound names to use for retention index calibration. When left empty, defaults to ['Methyl Caprylate', 'Methyl Caprate', 'Methyl Pelargonate', 'Methyl Laurate', 'Methyl Myristate', 'Methyl Palmitate', 'Methyl Stearate', 'Methyl Eicosanoate', 'Methyl Docosanoate', 'Methyl Linocerate', 'Methyl Hexacosanoate', 'Methyl Octacosanoate', 'Methyl Triacontanoate'].
    exploratory_mode : bool, optional
        If True, calculates and exports all spectral similarity methods. Default is False.
    score_methods : tuple, optional
        Default is ('highest_sim_score', 'highest_ss').
    output_score_method : str, optional
        Default is 'All'.

    """

    # No connection string (and no credentials) is hard-coded here; an empty value
    # means "resolve from the environment / built-in SQLite default" in __post_init__.
    url_database: str = ""

    ri_search_range: float = 35

    rt_search_range: float = 1.0  # used for retention index calibration

    correlation_threshold: float = 0.5  # used for calibration, spectral similarity

    score_threshold: float = 0.0

    ri_spacing: float = 200

    ri_std: float = 3  # in standard deviation

    ri_calibration_compound_names: List = dataclasses.field(default_factory=list)

    # calculates and export all spectral similarity methods
    exploratory_mode: bool = False

    score_methods: tuple = ("highest_sim_score", "highest_ss")

    output_score_method: str = "All"

    def __post_init__(self):
        """Resolve empty defaults, then cast fields to their annotated types."""
        from typing import get_origin

        # Only fall back to the environment / SQLite default when the caller did not
        # supply a URL; an explicit value used to be overwritten unconditionally.
        if not self.url_database:
            self.url_database = os.getenv(
                "SPECTRAL_GCMS_DATABASE_URL",
                "sqlite:///db/pnnl_lowres_gcms_compounds.sqlite",
            )

        # Likewise keep a caller-supplied calibration list instead of discarding it.
        if not self.ri_calibration_compound_names:
            self.ri_calibration_compound_names = [
                "Methyl Caprylate",
                "Methyl Caprate",
                "Methyl Pelargonate",
                "Methyl Laurate",
                "Methyl Myristate",
                "Methyl Palmitate",
                "Methyl Stearate",
                "Methyl Eicosanoate",
                "Methyl Docosanoate",
                "Methyl Linocerate",
                "Methyl Hexacosanoate",
                "Methyl Octacosanoate",
                "Methyl Triacontanoate",
            ]

        # enforce datatype
        for field in dataclasses.fields(self):
            value = getattr(self, field.name)
            if isinstance(value, field.type):
                continue
            # typing aliases such as ``List`` cannot be instantiated; cast with the
            # runtime class behind the alias (``list``). Plain types have no origin.
            caster = get_origin(field.type) or field.type
            setattr(self, field.name, caster(value))
637
638
class MolecularLookupDictSettings:
    """Settings used to generate the molecular formula database entries.

    These values drive database generation; do not change them for an existing database.

    Attributes
    ----------
    usedAtoms : dict, optional
        Dictionary of atoms and ranges. Default is {'C': (1, 90), 'H': (4, 200), 'O': (0, 12), 'N': (0, 0), 'S': (0, 0), 'P': (0, 0), 'Cl': (0, 0)}.
    min_mz : float, optional
        Minimum m/z to use for searching. Default is 50.
    max_mz : float, optional
        Maximum m/z to use for searching. Default is 1200.
    min_dbe : float, optional
        Minimum double bond equivalent to use for searching. Default is 0.
    max_dbe : float, optional
        Maximum double bond equivalent to use for searching. Default is 50.
    use_pah_line_rule : bool, optional
        If True, use the PAH line rule. Default is False.
    isRadical : bool, optional
        If True, search for radical ions. Default is True.
    isProtonated : bool, optional
        If True, search for protonated ions. Default is True.
    url_database : str, optional
        URL for the database. Default is None.
    db_jobs : int, optional
        Number of jobs to use for database queries. Default is 1.
    used_atom_valences : dict, optional
        Dictionary of atoms and valences. Default is {'C': 4, '13C': 4, 'H': 1, 'O': 2, '18O': 2, 'N': 3, 'S': 2, '34S': 2, 'P': 3, 'Cl': 1, '37Cl': 1, 'Br': 1, 'Na': 1, 'F': 1, 'K': 0}.

    """

    # Guidance for maintainers:
    #   * DO NOT change these values for an existing database: they are used to generate
    #     the database entries. DO change them when creating a new application database.
    #   * For runtime search settings and database query checks, use the
    #     MolecularFormulaSearchSettings class below.
    #   * C, H, N, O, S and P atoms are ALWAYS needed in usedAtoms; to leave one of them
    #     out, set both its max and min to 0.
    #   * Any atom listed in the Atoms class inside the encapsulation.settings.constants
    #     module can be included; when adding a new atom, also add the selected covalence
    #     to used_atom_valences.
    #   * NOTE: adduct atoms have zero covalence.
    #   * NOTE: static (class-level) variables are not used because this class is
    #     distributed using multiprocessing.
    def __init__(self):
        self.usedAtoms = dict(
            C=(1, 90),
            H=(4, 200),
            O=(0, 12),
            N=(0, 0),
            S=(0, 0),
            P=(0, 0),
            Cl=(0, 0),
        )

        self.min_mz, self.max_mz = 50, 1200
        self.min_dbe, self.max_dbe = 0, 50

        # overwrites the dbe limits above to DBE = (C + heteroatoms) * 0.9
        self.use_pah_line_rule = False

        self.isRadical = self.isProtonated = True

        self.url_database = None
        self.db_jobs = 1

        self.used_atom_valences = dict(
            [
                ("C", 4),
                ("13C", 4),
                ("H", 1),
                ("O", 2),
                ("18O", 2),
                ("N", 3),
                ("S", 2),
                ("34S", 2),
                ("P", 3),
                ("Cl", 1),
                ("37Cl", 1),
                ("Br", 1),
                ("Na", 1),
                ("F", 1),
                ("K", 0),
            ]
        )
730
731
@dataclasses.dataclass
class MolecularFormulaSearchSettings:
    """Runtime settings for the molecular formula search and database query.

    Attributes
    ----------
    verbose_processing : bool, optional
        If True, print verbose processing information. Default is True.
    use_isotopologue_filter : bool, optional
        If True, use isotopologue filter. Default is False.
    isotopologue_filter_threshold : float, optional
        Threshold for isotopologue filter. Default is 33.
    isotopologue_filter_atoms : tuple, optional
        Atoms to use for isotopologue filter. Default is ('Cl', 'Br').
    use_runtime_kendrick_filter : bool, optional
        If True, use runtime Kendrick filter. Default is False.
    use_min_peaks_filter : bool, optional
        If True, use minimum peaks filter. Default is True.
    min_peaks_per_class : int, optional
        Minimum number of peaks per class. Default is 15.
    url_database : str, optional
        URL for the database. Default is
        'postgresql+psycopg2://coremsappdb:coremsapppnnl@localhost:5432/coremsapp'.
        When an empty value (None or '') is supplied, the COREMS_DATABASE_URL
        environment variable is used, falling back to
        'sqlite:///db/molformula.db'.
    db_jobs : int, optional
        Number of jobs to use for database queries. Default is 3.
    db_chunk_size : int, optional
        Chunk size to use for database queries. Default is 300.
    ion_charge : int, optional
        Ion charge. Default is -1.
    min_hc_filter : float, optional
        Minimum hydrogen to carbon ratio. Default is 0.3.
    max_hc_filter : float, optional
        Maximum hydrogen to carbon ratio. Default is 3.
    min_oc_filter : float, optional
        Minimum oxygen to carbon ratio. Default is 0.0.
    max_oc_filter : float, optional
        Maximum oxygen to carbon ratio. Default is 1.2.
    min_op_filter : float, optional
        Minimum oxygen to phosphorous ratio. Default is 2.
    use_pah_line_rule : bool, optional
        If True, use the PAH line rule. Default is False.
    min_dbe : float, optional
        Minimum double bond equivalent to use for searching. Default is 0.
    max_dbe : float, optional
        Maximum double bond equivalent to use for searching. Default is 40.
    mz_error_score_weight : float, optional
        Weight of the m/z error score in the composite score. Default is 0.6.
    isotopologue_score_weight : float, optional
        Weight of the isotopologue score in the composite score.
        Default is 0.4.
    adduct_atoms_neg : tuple, optional
        Adduct atoms to use in negative polarity. Default is ('Cl', 'Br').
    adduct_atoms_pos : tuple, optional
        Adduct atoms to use in positive polarity. Default is ('Na', 'K').
    score_methods : tuple, optional
        Score methods that can be implemented. Default is
        ('S_P_lowest_error', 'N_S_P_lowest_error', 'lowest_error',
        'prob_score', 'air_filter_error', 'water_filter_error',
        'earth_filter_error').
    score_method : str, optional
        Score method to use, one of `score_methods`. Default is 'prob_score'.
    output_min_score : float, optional
        Minimum score for output. Default is 0.1.
    output_score_method : str, optional
        Score method to use for output. Default is 'All Candidates'.
    isRadical : bool, optional
        If True, search for radical ions. Default is False.
    isProtonated : bool, optional
        If True, search for protonated ions. Default is True.
    isAdduct : bool, optional
        If True, search for adduct ions. Default is False.
    usedAtoms : dict, optional
        Dictionary of atoms and ranges. Starts empty; on construction 'C' is
        set to (1, 100) and 'H' to (1, 200) whenever they are missing, so the
        effective default is {'C': (1, 100), 'H': (1, 200)}.
    ion_types_excluded : list, optional
        Ion types to exclude from molecular id search, commonly
        ['[M+CH3COO]-]'] or ['[M+COOH]-'] depending on mobile phase content.
        Default is [].
    ionization_type : str, optional
        Ionization type. Default is 'ESI'.
    min_ppm_error : float, optional
        Minimum ppm error. Default is -10.0.
    max_ppm_error : float, optional
        Maximum ppm error. Default is 10.0.
    min_abun_error : float, optional
        Minimum abundance error for isotopologue search, in percent.
        Default is -100.0.
    max_abun_error : float, optional
        Maximum abundance error for isotopologue search, in percent.
        Default is 100.0.
    mz_error_range : float, optional
        m/z error range. Default is 1.5.
    error_method : str, optional
        Error method. Default is 'None'. Options are 'distance', 'lowest',
        'symmetrical', 'average', 'None'.
    mz_error_average : float, optional
        m/z error average. Default is 0.0.
    used_atom_valences : dict, optional
        Dictionary of atoms and valences. Starts empty; on construction every
        atom of `Atoms.atoms_covalence` that is not already present is added
        with its covalence (the first one when several are listed).
    """

    verbose_processing: bool = True
    use_isotopologue_filter: bool = False
    isotopologue_filter_threshold: float = 33
    isotopologue_filter_atoms: tuple = ("Cl", "Br")
    use_runtime_kendrick_filter: bool = False
    use_min_peaks_filter: bool = True
    min_peaks_per_class: int = 15

    # NOTE(review): this default looks like it embeds credentials for a local
    # development database - confirm it is safe to ship as a default.
    url_database: str = "postgresql+psycopg2://coremsappdb:coremsapppnnl@localhost:5432/coremsapp"
    db_jobs: int = 3
    db_chunk_size: int = 300

    # ---- query settings ----
    ion_charge: int = -1
    min_hc_filter: float = 0.3
    max_hc_filter: float = 3
    min_oc_filter: float = 0.0
    max_oc_filter: float = 1.2
    min_op_filter: float = 2
    use_pah_line_rule: bool = False
    min_dbe: float = 0
    max_dbe: float = 40
    mz_error_score_weight: float = 0.6
    isotopologue_score_weight: float = 0.4

    # Closed-shell ions [M + Adduct]+ are only looked up for the atoms
    # listed in the adduct tuples.
    adduct_atoms_neg: tuple = ("Cl", "Br")
    adduct_atoms_pos: tuple = ("Na", "K")

    score_methods: tuple = (
        "S_P_lowest_error",
        "N_S_P_lowest_error",
        "lowest_error",
        "prob_score",
        "air_filter_error",
        "water_filter_error",
        "earth_filter_error",
    )
    score_method: str = "prob_score"
    output_min_score: float = 0.1
    output_score_method: str = "All Candidates"

    # Depending on the polarity mode it looks for [M].+ , [M].-
    # (queries and automatically compiles/adds the entry if it doesn't exist).
    isRadical: bool = False

    # Depending on the polarity mode it looks for [M + H]+ , [M - H]+
    # (queries and automatically compiles/pushes options if missing).
    isProtonated: bool = True

    isAdduct: bool = False

    usedAtoms: dict = dataclasses.field(default_factory=dict)
    ion_types_excluded: list = dataclasses.field(default_factory=list)

    # ---- search settings ----
    ionization_type: str = "ESI"

    # The error windows below are empirically set and need optimization.
    min_ppm_error: float = -10.0  # ppm
    max_ppm_error: float = 10.0  # ppm
    min_abun_error: float = -100.0  # percentage, isotopologue search
    max_abun_error: float = 100.0  # percentage, isotopologue search
    mz_error_range: float = 1.5

    # 'distance', 'lowest', 'symmetrical', 'average', 'None'
    error_method: str = "None"
    mz_error_average: float = 0.0

    # e.g. {'C': 4, 'H': 1, ...}; completed in __post_init__
    used_atom_valences: dict = dataclasses.field(default_factory=dict)

    def __post_init__(self):
        """Resolve the database URL, coerce field types and fill atom defaults."""
        # An empty URL falls back to the environment, then to a local SQLite file.
        if not self.url_database:
            self.url_database = os.getenv(
                "COREMS_DATABASE_URL", "sqlite:///db/molformula.db"
            )

        # Cast every value whose type does not match its annotation.
        for spec in dataclasses.fields(self):
            current = getattr(self, spec.name)
            if isinstance(current, spec.type):
                continue
            setattr(self, spec.name, spec.type(current))

        # C and H are always required; add them when the caller left them out.
        self.usedAtoms.setdefault("C", (1, 100))
        self.usedAtoms.setdefault("H", (1, 200))

        # Complete the valence table with the common values from Atoms.
        for atom, covalence in Atoms.atoms_covalence.items():
            if atom in self.used_atom_valences:
                continue
            if not isinstance(covalence, int):
                # Several covalences are listed: take the first one, which
                # should be the most common.
                covalence = covalence[0]
            self.used_atom_valences[atom] = covalence
@dataclasses.dataclass
class TransientSetting:
@dataclasses.dataclass
class TransientSetting:
    """Settings that control how a transient is processed.

    Attributes
    ----------
    implemented_apodization_function : tuple
        Names of the apodization functions that are available.
    apodization_method : str
        Apodization function to apply. Hanning is a good default for Fourier
        transform magnitude mode; for absorption mode processing, Half-Sine
        or Half-Kaiser may be more appropriate.
    number_of_truncations : int
        How many times to truncate the transient prior to Fourier transform.
    number_of_zero_fills : int
        How many times to zero fill the transient prior to Fourier transform.
    next_power_of_two : bool
        If True, zero fill to the next power of two after the new length of
        len(transient)+(number_of_zero_fills*len(transient)).
    kaiser_beta : float
        Beta parameter for the Kaiser or Half-Kaiser apodization function.
        0 is rectangular, 5 is similar to Hamming, 6 is similar to Hanning,
        and 8.6 is similar to Blackman (from the numpy docs).
    """

    implemented_apodization_function: tuple = (
        "Hamming", "Hanning", "Blackman", "Full-Sine", "Half-Sine", "Kaiser", "Half-Kaiser",
    )
    apodization_method: str = "Hanning"
    number_of_truncations: int = 0
    number_of_zero_fills: int = 1
    next_power_of_two: bool = False
    kaiser_beta: float = 8.6

    def __post_init__(self):
        """Cast each value whose type does not match its field annotation."""
        for spec in dataclasses.fields(self):
            current = getattr(self, spec.name)
            if isinstance(current, spec.type):
                continue
            setattr(self, spec.name, spec.type(current))

Transient processing settings class

Attributes
  • implemented_apodization_function (tuple): Available apodization functions
  • apodization_method (str): Apodization function to use. Hanning is a good default for Fourier transform magnitude mode. For absorption mode processing, Half-Sine or Half-Kaiser may be more appropriate.
  • number_of_truncations (int): How many times to truncate the transient prior to Fourier transform
  • number_of_zero_fills (int): How many times to zero fill the transient prior to Fourier transform.
  • next_power_of_two (bool): If True, zero fill to the next power of two after the new length of len(transient)+(number_of_zero_fills*len(transient)).
  • kaiser_beta (float): Beta parameter for Kaiser or Half-Kaiser apodisation function. 0 is rectangular, 5 is similar to Hamming, 6 is similar to hanning, and 8.6 is similar to Blackman (from numpy docs)
TransientSetting( implemented_apodization_function: tuple = ('Hamming', 'Hanning', 'Blackman', 'Full-Sine', 'Half-Sine', 'Kaiser', 'Half-Kaiser'), apodization_method: str = 'Hanning', number_of_truncations: int = 0, number_of_zero_fills: int = 1, next_power_of_two: bool = False, kaiser_beta: float = 8.6)
implemented_apodization_function: tuple = ('Hamming', 'Hanning', 'Blackman', 'Full-Sine', 'Half-Sine', 'Kaiser', 'Half-Kaiser')
apodization_method: str = 'Hanning'
number_of_truncations: int = 0
number_of_zero_fills: int = 1
next_power_of_two: bool = False
kaiser_beta: float = 8.6
@dataclasses.dataclass
class DataInputSetting:
 58@dataclasses.dataclass
 59class DataInputSetting:
 60    """Data input settings class
 61
 62    Attributes
 63    ----------
 64    header_translate : dict
 65        Dictionary with the header labels to be translated to the corems labels. For example, {'m/z':'m/z', 'Resolving Power':'Resolving Power', 'Abundance':'Abundance' , 'S/N':'S/N'}
 66    """
 67
 68    # add to this dict the VALUES to match your labels, THE ORDER WON"T MATTER
 69    # "column_translate" : {"m/z":"m/z", "Resolving Power":"Resolving Power", "Abundance":"Abundance" , "S/N":"S/N"}
 70    header_translate: dict = dataclasses.field(default_factory=dict)
 71
 72    def __post_init__(self):
 73        self.header_translate = {
 74            "m/z": Labels.mz,
 75            "mOz": Labels.mz,
 76            "Mass": Labels.mz,
 77            "Resolving Power": Labels.rp,
 78            "Res.": Labels.rp,
 79            "resolution": Labels.rp,
 80            "Intensity": Labels.abundance,
 81            "Peak Height": Labels.abundance,
 82            "I": Labels.abundance,
 83            "Abundance": Labels.abundance,
 84            "abs_abu": Labels.abundance,
 85            "Signal/Noise": Labels.s2n,
 86            "S/N": Labels.s2n,
 87            "sn": Labels.s2n,
 88        }
 89
 90    def add_mz_label(self, label):
 91        """Add a label to the header_translate dictionary to be translated to the corems label for mz."""
 92        self.header_translate[label] = Labels.mz
 93
 94    def add_peak_height_label(self, label):
 95        """Add a label to the header_translate dictionary to be translated to the corems label for peak height."""
 96
 97        self.header_translate[label] = Labels.abundance
 98
 99    def add_sn_label(self, label):
100        """Add a label to the header_translate dictionary to be translated to the corems label for signal to noise."""
101        self.header_translate[label] = Labels.s2n
102
103    def add_resolving_power_label(self, label):
104        """Add a label to the header_translate dictionary to be translated to the corems label for resolving power."""
105        self.header_translate[label] = Labels.rp

Data input settings class

Attributes
  • header_translate (dict): Dictionary with the header labels to be translated to the corems labels. For example, {'m/z':'m/z', 'Resolving Power':'Resolving Power', 'Abundance':'Abundance' , 'S/N':'S/N'}
DataInputSetting(header_translate: dict = <factory>)
header_translate: dict
def add_mz_label(self, label):
90    def add_mz_label(self, label):
91        """Add a label to the header_translate dictionary to be translated to the corems label for mz."""
92        self.header_translate[label] = Labels.mz

Add a label to the header_translate dictionary to be translated to the corems label for mz.

def add_peak_height_label(self, label):
94    def add_peak_height_label(self, label):
95        """Add a label to the header_translate dictionary to be translated to the corems label for peak height."""
96
97        self.header_translate[label] = Labels.abundance

Add a label to the header_translate dictionary to be translated to the corems label for peak height.

def add_sn_label(self, label):
 99    def add_sn_label(self, label):
100        """Add a label to the header_translate dictionary to be translated to the corems label for signal to noise."""
101        self.header_translate[label] = Labels.s2n

Add a label to the header_translate dictionary to be translated to the corems label for signal to noise.

def add_resolving_power_label(self, label):
103    def add_resolving_power_label(self, label):
104        """Add a label to the header_translate dictionary to be translated to the corems label for resolving power."""
105        self.header_translate[label] = Labels.rp

Add a label to the header_translate dictionary to be translated to the corems label for resolving power.

@dataclasses.dataclass
class LiquidChromatographSetting:
108@dataclasses.dataclass
109class LiquidChromatographSetting:
110    """Liquid chromatograph processing settings class
111
112    Attributes
113    ----------
114    scans : list or tuple, optional
115        List of select scan to average or a tuple containing the range to average. Default is (0, 1).
116    eic_tolerance_ppm : float, optional
117        Mass tolerance in ppm for extracted ion chromatogram peak detection. Default is 5.
118    correct_eic_baseline : bool, optional
119        If True, correct the baseline of the extracted ion chromatogram. Default is True.
120    smooth_window : int, optional
121        Window size for smoothing the ion chromatogram (extracted or total). Default is 5.
122    smooth_method : str, optional
123        Smoothing method to use. Default is 'savgol'. Other options are 'hanning', 'blackman', 'bartlett', 'flat', 'boxcar'.
124    implemented_smooth_method : tuple, optional
125        Smoothing methods that can be implemented. Values are ('savgol', 'hanning', 'blackman', 'bartlett', 'flat', 'boxcar').
126    savgol_pol_order : int, optional
127        Polynomial order for Savitzky-Golay smoothing. Default is 2.
128    peak_height_max_percent : float, optional
129        1-100 % used for baseline detection use 0.1 for second_derivative and 10 for other methods. Default is 10.
130    peak_max_prominence_percent : float, optional
131        1-100 % used for baseline detection. Default is 1.
132    peak_derivative_threshold : float, optional
133        Threshold for defining derivative crossing. Default is 0.0005.
134    min_peak_datapoints : float, optional
135        minimum data point to define a chromatografic peak. Default is 5.
136    noise_threshold_method : str, optional
137        Method for detecting noise threshold. Default is 'manual_relative_abundance'.
138    noise_threshold_methods_implemented : tuple, optional
139        Methods for detected noise threshold that can be implemented. Default is ('auto_relative_abundance', 'manual_relative_abundance', 'second_derivative').
140    peak_height_min_percent : float, optional
141        0-100 % used for peak detection. Default is 0.1.
142    eic_signal_threshold : float, optional
143        0-100 % used for extracted ion chromatogram peak detection. Default is 0.01.
144    eic_buffer_time : float, optional
145        Buffer time to add to the start and end of the plot of the extracted ion chromatogram, in minutes. Default is 1.5.
146    ph_smooth_it : int, optional
147        Number of iterations to use for smoothing prior to finding mass features.
148        Called within the PHCalculations.find_mass_features_ph() method. Default is 7.
149    ph_smooth_radius_mz : int, optional
150        Radius in m/z steps (not daltons) for smoothing prior to finding mass features.
151        Called within the PHCalculations.find_mass_features_ph() method. Default is 0.
152    ph_smooth_radius_scan : int, optional
153        Radius in scan steps for smoothing prior to finding mass features.
154        Called within the PHCalculations.find_mass_features_ph() method. Default is 3.
155    ph_inten_min_rel : int, optional
156        Relative minimum intensity to use for finding mass features.
157        Calculated as a fraction of the maximum intensity of the unprocessed profile data (mz, scan).
158        Called within the PH_Calculations.find_mass_features() method. Default is 0.001.
159    ph_persis_min_rel : int, optional
160        Relative minimum persistence for retaining mass features.
161        Calculated as a fraction of the maximum intensity of the unprocessed profile data (mz, scan).
162        Should be greater to or equal to ph_inten_min_rel.
163        Called within the PH_Calculations.find_mass_features() method. Default is 0.001.
164    mass_feature_cluster_mz_tolerance_rel : float, optional
165        Relative m/z tolerance to use for clustering mass features.
166        Called with the PHCalculations.cluster_mass_features() and the LCCalculations.deconvolute_ms1_mass_features() methods.
167        Default is 5E-6 (5 ppm).
168    mass_feature_cluster_rt_tolerance : float, optional
169        Retention time tolerance to use for clustering mass features, in minutes.
170        Called with the PHCalculations.cluster_mass_features() and the LCCalculations.deconvolute_ms1_mass_features() methods.
171        Default is 0.2.
172    ms1_scans_to_average : int, optional
173        Number of MS1 scans to average for mass-feature associated m/zs.
174        Called within the LCMSBase.add_associated_ms1() method. Default is 1.
175    ms1_deconvolution_corr_min : float, optional
176        Minimum correlation to use for deconvoluting MS1 mass features.
177        Called within the LCCalculations.deconvolute_ms1_mass_features() method.
178        Default is 0.8.
179    ms2_dda_rt_tolerance : float, optional
180        Retention time tolerance to use for associating MS2 spectra to mass features, in minutes. Called within the LCMSBase.add_associated_ms2_dda() method. Default is 0.15.
181    ms2_dda_mz_tolerance : float, optional
182        Mass tolerance to use for associating MS2 spectra to mass features. Called within the LCMSBase.add_associated_ms2_dda() method. Default is 0.05.
183    ms2_min_fe_score : float, optional
184        Minimum flash entropy for retaining MS2 annotations. Called within the LCMSSpectralSearch.fe_search() method. Default is 0.2.
185    search_as_lipids : bool, optional
186        If True, prepare the database for lipid searching. Called within the LCMSSpectralSearch.fe_prep_search_db() method. Default is False.
187    include_fragment_types : bool, optional
188        If True, include fragment types in the database. Called within the LCMSSpectralSearch.fe_search() and related methods. Default is False.
189    verbose_processing : bool, optional
190        If True, print verbose processing information. Default is True.
191    """
192
193    scans: list | tuple = (-1, -1)
194
195    # Parameters used for generating EICs and performing 1D peak picking and EIC/TIC smoothing
196    eic_tolerance_ppm: float = 5
197    correct_eic_baseline = True
198    smooth_window: int = 5
199    smooth_method: str = "savgol"
200    implemented_smooth_method: tuple = (
201        "savgol",
202        "hanning",
203        "blackman",
204        "bartlett",
205        "flat",
206        "boxcar",
207    )
208    savgol_pol_order: int = 2
209    peak_height_max_percent: float = 10
210    peak_max_prominence_percent: float = 1
211    peak_derivative_threshold: float = 0.0005
212    min_peak_datapoints: float = 5
213    noise_threshold_method: str = "manual_relative_abundance"
214    noise_threshold_methods_implemented: tuple = (
215        "auto_relative_abundance",
216        "manual_relative_abundance",
217        "second_derivative",
218    )
219    peak_height_min_percent: float = 0.1
220    eic_signal_threshold: float = 0.01
221    eic_buffer_time = 1.5
222
223    # Parameters used for 2D peak picking
224    peak_picking_method: str = "persistent homology"
225    implemented_peak_picking_methods: tuple = ("persistent homology",)
226
227    # Parameters used in persistent homology calculations
228    ph_smooth_it = 1
229    ph_smooth_radius_mz = 0
230    ph_smooth_radius_scan = 1
231    ph_inten_min_rel = 0.001
232    ph_persis_min_rel = 0.001
233
234    # Parameters used to cluster mass features
235    mass_feature_cluster_mz_tolerance_rel: float = 5e-6
236    mass_feature_cluster_rt_tolerance: float = 0.3
237
238    # Parameters used in associating MS1 and MS2 spectra to LCMS mass features and deconvoluting MS1 mass features
239    ms1_scans_to_average: int = 1
240    ms1_deconvolution_corr_min: float = 0.8
241    ms2_dda_rt_tolerance: float = 0.15
242    ms2_dda_mz_tolerance: float = 0.05
243
244    # Parameters used for flash entropy searching and database preparation
245    ms2_min_fe_score: float = 0.2
246    search_as_lipids: bool = False
247    include_fragment_types: bool = False
248
249    # Parameters used for saving the data
250    export_profile_spectra: bool = False
251    export_eics: bool = True
252    export_unprocessed_ms1: bool = False
253
254    # Parameters used for verbose processing
255    verbose_processing: bool = True
256
257    def __post_init__(self):
258        # enforce datatype
259        for field in dataclasses.fields(self):
260            value = getattr(self, field.name)
261            if not isinstance(value, field.type):
262                value = field.type(value)
263                setattr(self, field.name, value)

Liquid chromatograph processing settings class

Attributes
  • scans (list or tuple, optional): List of select scan to average or a tuple containing the range to average. Default is (-1, -1).
  • eic_tolerance_ppm (float, optional): Mass tolerance in ppm for extracted ion chromatogram peak detection. Default is 5.
  • correct_eic_baseline (bool, optional): If True, correct the baseline of the extracted ion chromatogram. Default is True.
  • smooth_window (int, optional): Window size for smoothing the ion chromatogram (extracted or total). Default is 5.
  • smooth_method (str, optional): Smoothing method to use. Default is 'savgol'. Other options are 'hanning', 'blackman', 'bartlett', 'flat', 'boxcar'.
  • implemented_smooth_method (tuple, optional): Smoothing methods that can be implemented. Values are ('savgol', 'hanning', 'blackman', 'bartlett', 'flat', 'boxcar').
  • savgol_pol_order (int, optional): Polynomial order for Savitzky-Golay smoothing. Default is 2.
  • peak_height_max_percent (float, optional): 1-100 % used for baseline detection use 0.1 for second_derivative and 10 for other methods. Default is 10.
  • peak_max_prominence_percent (float, optional): 1-100 % used for baseline detection. Default is 1.
  • peak_derivative_threshold (float, optional): Threshold for defining derivative crossing. Default is 0.0005.
  • min_peak_datapoints (float, optional): Minimum number of data points to define a chromatographic peak. Default is 5.
  • noise_threshold_method (str, optional): Method for detecting noise threshold. Default is 'manual_relative_abundance'.
  • noise_threshold_methods_implemented (tuple, optional): Methods for detected noise threshold that can be implemented. Default is ('auto_relative_abundance', 'manual_relative_abundance', 'second_derivative').
  • peak_height_min_percent (float, optional): 0-100 % used for peak detection. Default is 0.1.
  • eic_signal_threshold (float, optional): 0-100 % used for extracted ion chromatogram peak detection. Default is 0.01.
  • eic_buffer_time (float, optional): Buffer time to add to the start and end of the plot of the extracted ion chromatogram, in minutes. Default is 1.5.
  • ph_smooth_it (int, optional): Number of iterations to use for smoothing prior to finding mass features. Called within the PHCalculations.find_mass_features_ph() method. Default is 1.
  • ph_smooth_radius_mz (int, optional): Radius in m/z steps (not daltons) for smoothing prior to finding mass features. Called within the PHCalculations.find_mass_features_ph() method. Default is 0.
  • ph_smooth_radius_scan (int, optional): Radius in scan steps for smoothing prior to finding mass features. Called within the PHCalculations.find_mass_features_ph() method. Default is 1.
  • ph_inten_min_rel (float, optional): Relative minimum intensity to use for finding mass features. Calculated as a fraction of the maximum intensity of the unprocessed profile data (mz, scan). Called within the PH_Calculations.find_mass_features() method. Default is 0.001.
  • ph_persis_min_rel (float, optional): Relative minimum persistence for retaining mass features. Calculated as a fraction of the maximum intensity of the unprocessed profile data (mz, scan). Should be greater than or equal to ph_inten_min_rel. Called within the PH_Calculations.find_mass_features() method. Default is 0.001.
  • mass_feature_cluster_mz_tolerance_rel (float, optional): Relative m/z tolerance to use for clustering mass features. Called with the PHCalculations.cluster_mass_features() and the LCCalculations.deconvolute_ms1_mass_features() methods. Default is 5E-6 (5 ppm).
  • mass_feature_cluster_rt_tolerance (float, optional): Retention time tolerance to use for clustering mass features, in minutes. Called with the PHCalculations.cluster_mass_features() and the LCCalculations.deconvolute_ms1_mass_features() methods. Default is 0.3.
  • ms1_scans_to_average (int, optional): Number of MS1 scans to average for mass-feature associated m/zs. Called within the LCMSBase.add_associated_ms1() method. Default is 1.
  • ms1_deconvolution_corr_min (float, optional): Minimum correlation to use for deconvoluting MS1 mass features. Called within the LCCalculations.deconvolute_ms1_mass_features() method. Default is 0.8.
  • ms2_dda_rt_tolerance (float, optional): Retention time tolerance to use for associating MS2 spectra to mass features, in minutes. Called within the LCMSBase.add_associated_ms2_dda() method. Default is 0.15.
  • ms2_dda_mz_tolerance (float, optional): Mass tolerance to use for associating MS2 spectra to mass features. Called within the LCMSBase.add_associated_ms2_dda() method. Default is 0.05.
  • ms2_min_fe_score (float, optional): Minimum flash entropy for retaining MS2 annotations. Called within the LCMSSpectralSearch.fe_search() method. Default is 0.2.
  • search_as_lipids (bool, optional): If True, prepare the database for lipid searching. Called within the LCMSSpectralSearch.fe_prep_search_db() method. Default is False.
  • include_fragment_types (bool, optional): If True, include fragment types in the database. Called within the LCMSSpectralSearch.fe_search() and related methods. Default is False.
  • verbose_processing (bool, optional): If True, print verbose processing information. Default is True.
LiquidChromatographSetting( scans: list | tuple = (-1, -1), eic_tolerance_ppm: float = 5, smooth_window: int = 5, smooth_method: str = 'savgol', implemented_smooth_method: tuple = ('savgol', 'hanning', 'blackman', 'bartlett', 'flat', 'boxcar'), savgol_pol_order: int = 2, peak_height_max_percent: float = 10, peak_max_prominence_percent: float = 1, peak_derivative_threshold: float = 0.0005, min_peak_datapoints: float = 5, noise_threshold_method: str = 'manual_relative_abundance', noise_threshold_methods_implemented: tuple = ('auto_relative_abundance', 'manual_relative_abundance', 'second_derivative'), peak_height_min_percent: float = 0.1, eic_signal_threshold: float = 0.01, peak_picking_method: str = 'persistent homology', implemented_peak_picking_methods: tuple = ('persistent homology',), mass_feature_cluster_mz_tolerance_rel: float = 5e-06, mass_feature_cluster_rt_tolerance: float = 0.3, ms1_scans_to_average: int = 1, ms1_deconvolution_corr_min: float = 0.8, ms2_dda_rt_tolerance: float = 0.15, ms2_dda_mz_tolerance: float = 0.05, ms2_min_fe_score: float = 0.2, search_as_lipids: bool = False, include_fragment_types: bool = False, export_profile_spectra: bool = False, export_eics: bool = True, export_unprocessed_ms1: bool = False, verbose_processing: bool = True)
scans: list | tuple = (-1, -1)
eic_tolerance_ppm: float = 5
correct_eic_baseline = True
smooth_window: int = 5
smooth_method: str = 'savgol'
implemented_smooth_method: tuple = ('savgol', 'hanning', 'blackman', 'bartlett', 'flat', 'boxcar')
savgol_pol_order: int = 2
peak_height_max_percent: float = 10
peak_max_prominence_percent: float = 1
peak_derivative_threshold: float = 0.0005
min_peak_datapoints: float = 5
noise_threshold_method: str = 'manual_relative_abundance'
noise_threshold_methods_implemented: tuple = ('auto_relative_abundance', 'manual_relative_abundance', 'second_derivative')
peak_height_min_percent: float = 0.1
eic_signal_threshold: float = 0.01
eic_buffer_time = 1.5
peak_picking_method: str = 'persistent homology'
implemented_peak_picking_methods: tuple = ('persistent homology',)
ph_smooth_it = 1
ph_smooth_radius_mz = 0
ph_smooth_radius_scan = 1
ph_inten_min_rel = 0.001
ph_persis_min_rel = 0.001
mass_feature_cluster_mz_tolerance_rel: float = 5e-06
mass_feature_cluster_rt_tolerance: float = 0.3
ms1_scans_to_average: int = 1
ms1_deconvolution_corr_min: float = 0.8
ms2_dda_rt_tolerance: float = 0.15
ms2_dda_mz_tolerance: float = 0.05
ms2_min_fe_score: float = 0.2
search_as_lipids: bool = False
include_fragment_types: bool = False
export_profile_spectra: bool = False
export_eics: bool = True
export_unprocessed_ms1: bool = False
verbose_processing: bool = True
@dataclasses.dataclass
class MassSpectrumSetting:
266@dataclasses.dataclass
267class MassSpectrumSetting:
268    """Mass spectrum processing settings class
269
270    Attributes
271    ----------
272    noise_threshold_method : str, optional
273        Method for detecting noise threshold. Default is 'log'.
274    noise_threshold_methods_implemented : tuple, optional
275        Methods for detecting the noise threshold that are implemented. Default is ('minima', 'signal_noise', 'relative_abundance', 'absolute_abundance', 'log').
276    noise_threshold_min_std : int, optional
277        Minimum value for noise thresholding when using 'minima' noise threshold method. Default is 6.
278    noise_threshold_min_s2n : float, optional
279        Minimum value for noise thresholding when using 'signal_noise' noise threshold method. Default is 4.
280    noise_threshold_min_relative_abundance : float, optional
281        Minimum value for noise thresholding when using 'relative_abundance' noise threshold method. Note that this is a percentage value. Default is 6 (6%).
282    noise_threshold_absolute_abundance : float, optional
283        Minimum value for noise thresholding when using 'absolute_abundance' noise threshold method. Default is 1_000_000.
284    noise_threshold_log_nsigma : int, optional
285        Number of standard deviations to use when using 'log' noise threshold method. Default is 6.
286    noise_threshold_log_nsigma_corr_factor : float, optional
287        Correction factor for log noise threshold method. Default is 0.463.
288    noise_threshold_log_nsigma_bins : int, optional
289        Number of bins to use for histogram when using 'log' noise threshold method. Default is 500.
290    noise_min_mz : float, optional
291        Minimum m/z to use for noise thresholding. Default is 50.0.
292    noise_max_mz : float, optional
293        Maximum m/z to use for noise thresholding. Default is 1200.0.
294    min_picking_mz : float, optional
295        Minimum m/z to use for peak picking. Default is 50.0.
296    max_picking_mz : float, optional
297        Maximum m/z to use for peak picking. Default is 1200.0.
298    picking_point_extrapolate : int, optional
299        How many data points (in each direction) to extrapolate the mz axis and 0 pad the abundance axis. Default is 3.
300        Recommend 3 for reduced profile data or if peak picking faults
301    calib_minimize_method : str, optional
302        Minimization method to use for calibration. Default is 'Powell'.
303    calib_pol_order : int, optional
304        Polynomial order to use for calibration. Default is 2.
305    max_calib_ppm_error : float, optional
306        Maximum ppm error to use for calibration. Default is 1.0.
307    min_calib_ppm_error : float, optional
308        Minimum ppm error to use for calibration. Default is -1.0.
309    calib_sn_threshold : float, optional
310        Signal to noise threshold to use for calibration. Default is 2.0.
311    calibration_ref_match_method: string, optional
312        Method for matching reference masses with measured masses for recalibration. Default is 'legacy'.
313    calibration_ref_match_tolerance: float, optional
314        If using the new method for calibration reference mass matching, this tolerance is the initial matching tolerance. Default is 0.003
315    do_calibration : bool, optional
316        If True, perform calibration. Default is True.
317    verbose_processing : bool, optional
318        If True, print verbose processing information. Default is True.
319    """
320
321    noise_threshold_method: str = "log"
322
323    noise_threshold_methods_implemented: tuple = (
324        "minima",
325        "signal_noise",
326        "relative_abundance",
327        "absolute_abundance",
328        "log",
329    )
330
331    noise_threshold_min_std: int = 6  # when using 'minima' method
332
333    noise_threshold_min_s2n: float = 4  # when using 'signal_noise' method
334
335    noise_threshold_min_relative_abundance: float = (
336        6  # from 0-100, when using 'relative_abundance' method
337    )
338
339    noise_threshold_absolute_abundance: float = (
340        1_000_000  # when using 'absolute_abundance' method
341    )
342
343    noise_threshold_log_nsigma: int = 6  # when using 'log' method
344    noise_threshold_log_nsigma_corr_factor: float = 0.463  # mFT is 0.463, aFT is 1.0
345    noise_threshold_log_nsigma_bins: int = 500  # bins for the histogram for the noise
346
347    noise_min_mz: float = 50.0
348    noise_max_mz: float = 1200.0
349
350    min_picking_mz: float = 50.0
351    max_picking_mz: float = 1200.0
352
353    # How many data points (in each direction) to extrapolate the mz axis and 0 pad the abundance axis
354    # This will fix peak picking at spectrum limit issues
355    #  0 to keep normal behaviour, typical value 3 to fix
356    picking_point_extrapolate: int = 3
357
358    calib_minimize_method: str = "Powell"
359    calib_pol_order: int = 2
360    max_calib_ppm_error: float = 1.0
361    min_calib_ppm_error: float = -1.0
362    calib_sn_threshold: float = 2.0
363    calibration_ref_match_method: str = "legacy"
364    calibration_ref_match_method_implemented: tuple = ("legacy", "merged")
365    calibration_ref_match_tolerance: float = 0.003
366    calibration_ref_match_std_raw_error_limit: float = 1.5
367    # calib_ref_mzs: list = [0]
368
369    do_calibration: bool = True
370    verbose_processing: bool = True
371
372    def __post_init__(self):
373        # enforce datatype
374        for field in dataclasses.fields(self):
375            value = getattr(self, field.name)
376            if not isinstance(value, field.type):
377                value = field.type(value)
378                setattr(self, field.name, value)

Mass spectrum processing settings class

Attributes
  • noise_threshold_method (str, optional): Method for detecting noise threshold. Default is 'log'.
  • noise_threshold_methods_implemented (tuple, optional): Methods for detecting the noise threshold that are implemented. Default is ('minima', 'signal_noise', 'relative_abundance', 'absolute_abundance', 'log').
  • noise_threshold_min_std (int, optional): Minimum value for noise thresholding when using 'minima' noise threshold method. Default is 6.
  • noise_threshold_min_s2n (float, optional): Minimum value for noise thresholding when using 'signal_noise' noise threshold method. Default is 4.
  • noise_threshold_min_relative_abundance (float, optional): Minimum value for noise thresholding when using 'relative_abundance' noise threshold method. Note that this is a percentage value. Default is 6 (6%).
  • noise_threshold_absolute_abundance (float, optional): Minimum value for noise thresholding when using 'absolute_abundance' noise threshold method. Default is 1_000_000.
  • noise_threshold_log_nsigma (int, optional): Number of standard deviations to use when using 'log' noise threshold method. Default is 6.
  • noise_threshold_log_nsigma_corr_factor (float, optional): Correction factor for log noise threshold method. Default is 0.463.
  • noise_threshold_log_nsigma_bins (int, optional): Number of bins to use for histogram when using 'log' noise threshold method. Default is 500.
  • noise_min_mz (float, optional): Minimum m/z to use for noise thresholding. Default is 50.0.
  • noise_max_mz (float, optional): Maximum m/z to use for noise thresholding. Default is 1200.0.
  • min_picking_mz (float, optional): Minimum m/z to use for peak picking. Default is 50.0.
  • max_picking_mz (float, optional): Maximum m/z to use for peak picking. Default is 1200.0.
  • picking_point_extrapolate (int, optional): How many data points (in each direction) to extrapolate the mz axis and 0 pad the abundance axis. Default is 3. Recommend 3 for reduced profile data or if peak picking faults
  • calib_minimize_method (str, optional): Minimization method to use for calibration. Default is 'Powell'.
  • calib_pol_order (int, optional): Polynomial order to use for calibration. Default is 2.
  • max_calib_ppm_error (float, optional): Maximum ppm error to use for calibration. Default is 1.0.
  • min_calib_ppm_error (float, optional): Minimum ppm error to use for calibration. Default is -1.0.
  • calib_sn_threshold (float, optional): Signal to noise threshold to use for calibration. Default is 2.0.
  • calibration_ref_match_method (string, optional): Method for matching reference masses with measured masses for recalibration. Default is 'legacy'.
  • calibration_ref_match_tolerance (float, optional): If using the new method for calibration reference mass matching, this tolerance is the initial matching tolerance. Default is 0.003
  • do_calibration (bool, optional): If True, perform calibration. Default is True.
  • verbose_processing (bool, optional): If True, print verbose processing information. Default is True.
MassSpectrumSetting( noise_threshold_method: str = 'log', noise_threshold_methods_implemented: tuple = ('minima', 'signal_noise', 'relative_abundance', 'absolute_abundance', 'log'), noise_threshold_min_std: int = 6, noise_threshold_min_s2n: float = 4, noise_threshold_min_relative_abundance: float = 6, noise_threshold_absolute_abundance: float = 1000000, noise_threshold_log_nsigma: int = 6, noise_threshold_log_nsigma_corr_factor: float = 0.463, noise_threshold_log_nsigma_bins: int = 500, noise_min_mz: float = 50.0, noise_max_mz: float = 1200.0, min_picking_mz: float = 50.0, max_picking_mz: float = 1200.0, picking_point_extrapolate: int = 3, calib_minimize_method: str = 'Powell', calib_pol_order: int = 2, max_calib_ppm_error: float = 1.0, min_calib_ppm_error: float = -1.0, calib_sn_threshold: float = 2.0, calibration_ref_match_method: str = 'legacy', calibration_ref_match_method_implemented: tuple = ('legacy', 'merged'), calibration_ref_match_tolerance: float = 0.003, calibration_ref_match_std_raw_error_limit: float = 1.5, do_calibration: bool = True, verbose_processing: bool = True)
noise_threshold_method: str = 'log'
noise_threshold_methods_implemented: tuple = ('minima', 'signal_noise', 'relative_abundance', 'absolute_abundance', 'log')
noise_threshold_min_std: int = 6
noise_threshold_min_s2n: float = 4
noise_threshold_min_relative_abundance: float = 6
noise_threshold_absolute_abundance: float = 1000000
noise_threshold_log_nsigma: int = 6
noise_threshold_log_nsigma_corr_factor: float = 0.463
noise_threshold_log_nsigma_bins: int = 500
noise_min_mz: float = 50.0
noise_max_mz: float = 1200.0
min_picking_mz: float = 50.0
max_picking_mz: float = 1200.0
picking_point_extrapolate: int = 3
calib_minimize_method: str = 'Powell'
calib_pol_order: int = 2
max_calib_ppm_error: float = 1.0
min_calib_ppm_error: float = -1.0
calib_sn_threshold: float = 2.0
calibration_ref_match_method: str = 'legacy'
calibration_ref_match_method_implemented: tuple = ('legacy', 'merged')
calibration_ref_match_tolerance: float = 0.003
calibration_ref_match_std_raw_error_limit: float = 1.5
do_calibration: bool = True
verbose_processing: bool = True
@dataclasses.dataclass
class MassSpecPeakSetting:
381@dataclasses.dataclass
382class MassSpecPeakSetting:
383    """Mass spectrum peak processing settings class
384
385    Attributes
386    ----------
387    kendrick_base : Dict, optional
388        Dictionary specifying the elements and their counts in the Kendrick base.
389        Defaults to {'C': 1, 'H': 2}.
390    kendrick_rounding_method : str, optional
391        Method for calculating the nominal Kendrick mass. Valid values are 'floor', 'ceil', or 'round'.
392        Defaults to 'floor'.
393    implemented_kendrick_rounding_methods : tuple
394        Tuple of valid rounding methods for calculating the nominal Kendrick mass.
395        Defaults to ('floor', 'ceil', 'round').
396    peak_derivative_threshold : float, optional
397        Threshold for defining derivative crossing. Should be a value between 0 and 1.
398        Defaults to 0.0.
399    peak_min_prominence_percent : float, optional
400        Minimum prominence percentage used for peak detection. Should be a value between 1 and 100.
401        Defaults to 0.1.
402    min_peak_datapoints : float, optional
403        Minimum number of data points used for peak detection. Should be a value between 0 and infinity.
404        Defaults to 5.
405    peak_max_prominence_percent : float, optional
406        Maximum prominence percentage used for baseline detection. Should be a value between 1 and 100.
407        Defaults to 0.1.
408    peak_height_max_percent : float, optional
409        Maximum height percentage used for baseline detection. Should be a value between 1 and 100.
410        Defaults to 10.
411    legacy_resolving_power : bool, optional
412        Flag indicating whether to use the legacy (CoreMS v1) resolving power calculation.
413        Defaults to True.
414    legacy_centroid_polyfit : bool, optional
415        If True, use the legacy (numpy polyfit) method to fit the centroid.
416        Defaults to False.
417    """
418
419    kendrick_base: Dict = dataclasses.field(default_factory=dict)
420
421    kendrick_rounding_method: str = "floor"  # 'floor', 'ceil' or 'round' are valid methods for calculating nominal kendrick mass
422
423    implemented_kendrick_rounding_methods: tuple = ("floor", "ceil", "round")
424
425    peak_derivative_threshold: float = 0.0  # define derivative crossing threshold 0-1
426
427    peak_min_prominence_percent: float = 0.1  # 1-100 % used for peak detection
428
429    min_peak_datapoints: float = 5  # 0-inf used for peak detection
430
431    peak_max_prominence_percent: float = 0.1  # 1-100 % used for baseline detection
432
433    peak_height_max_percent: float = 10  # 1-100 % used for baseline detection
434
435    legacy_resolving_power: bool = (
436        True  # Use the legacy (CoreMS v1) resolving power calculation (True)
437    )
438
439    legacy_centroid_polyfit: bool = False
440
441    def __post_init__(self):
442        # default to CH2
443        if not self.kendrick_base:
444            self.kendrick_base = {"C": 1, "H": 2}
445        # enforce datatype
446        for field in dataclasses.fields(self):
447            value = getattr(self, field.name)
448            if not isinstance(value, field.type):
449                value = field.type(value)
450                setattr(self, field.name, value)

Mass spectrum peak processing settings class

Attributes
  • kendrick_base (Dict, optional): Dictionary specifying the elements and their counts in the Kendrick base. Defaults to {'C': 1, 'H': 2}.
  • kendrick_rounding_method (str, optional): Method for calculating the nominal Kendrick mass. Valid values are 'floor', 'ceil', or 'round'. Defaults to 'floor'.
  • implemented_kendrick_rounding_methods (tuple): Tuple of valid rounding methods for calculating the nominal Kendrick mass. Defaults to ('floor', 'ceil', 'round').
  • peak_derivative_threshold (float, optional): Threshold for defining derivative crossing. Should be a value between 0 and 1. Defaults to 0.0.
  • peak_min_prominence_percent (float, optional): Minimum prominence percentage used for peak detection. Should be a value between 1 and 100. Defaults to 0.1.
  • min_peak_datapoints (float, optional): Minimum number of data points used for peak detection. Should be a value between 0 and infinity. Defaults to 5.
  • peak_max_prominence_percent (float, optional): Maximum prominence percentage used for baseline detection. Should be a value between 1 and 100. Defaults to 0.1.
  • peak_height_max_percent (float, optional): Maximum height percentage used for baseline detection. Should be a value between 1 and 100. Defaults to 10.
  • legacy_resolving_power (bool, optional): Flag indicating whether to use the legacy (CoreMS v1) resolving power calculation. Defaults to True.
  • legacy_centroid_polyfit (bool, optional): If True, use the legacy (numpy polyfit) method to fit the centroid. Defaults to False.
MassSpecPeakSetting( kendrick_base: Dict = <factory>, kendrick_rounding_method: str = 'floor', implemented_kendrick_rounding_methods: tuple = ('floor', 'ceil', 'round'), peak_derivative_threshold: float = 0.0, peak_min_prominence_percent: float = 0.1, min_peak_datapoints: float = 5, peak_max_prominence_percent: float = 0.1, peak_height_max_percent: float = 10, legacy_resolving_power: bool = True, legacy_centroid_polyfit: bool = False)
kendrick_base: Dict
kendrick_rounding_method: str = 'floor'
implemented_kendrick_rounding_methods: tuple = ('floor', 'ceil', 'round')
peak_derivative_threshold: float = 0.0
peak_min_prominence_percent: float = 0.1
min_peak_datapoints: float = 5
peak_max_prominence_percent: float = 0.1
peak_height_max_percent: float = 10
legacy_resolving_power: bool = True
legacy_centroid_polyfit: bool = False
@dataclasses.dataclass
class GasChromatographSetting:
453@dataclasses.dataclass
454class GasChromatographSetting:
455    """Gas chromatograph processing settings class
456
457    Attributes
458    ----------
459    use_deconvolution : bool, optional
460        If True, use deconvolution. Default is False.
461    implemented_smooth_method : tuple, optional
462        Smoothing methods that can be implemented. Default is ('savgol', 'hanning', 'blackman', 'bartlett', 'flat', 'boxcar').
463    smooth_window : int, optional
464        Window size for smoothing the ion chromatogram. Default is 5.
465    smooth_method : str, optional
466        Smoothing method to use. Default is 'savgol'. Other options are 'hanning', 'blackman', 'bartlett', 'flat', 'boxcar'.
467    savgol_pol_order : int, optional
468        Polynomial order for Savitzky-Golay smoothing. Default is 2.
469    peak_derivative_threshold : float, optional
470        Threshold for defining derivative crossing. Should be a value between 0 and 1.
471        Defaults to 0.0005.
472    peak_height_max_percent : float, optional
473        Maximum height percentage used for baseline detection. Should be a value between 1 and 100.
474        Defaults to 10.
475    peak_max_prominence_percent : float, optional
476        Maximum prominence percentage used for baseline detection. Should be a value between 1 and 100.
477        Defaults to 1.
478    min_peak_datapoints : float, optional
479        Minimum number of data points used for peak detection. Should be a value between 0 and infinity.
480        Defaults to 5.
481    max_peak_width : float, optional
482        Maximum peak width used for peak detection. Should be a value between 0 and infinity.
483        Defaults to 0.1.
484    noise_threshold_method : str, optional
485        Method for detecting noise threshold. Default is 'manual_relative_abundance'.
486    noise_threshold_methods_implemented : tuple, optional
487        Methods for detecting the noise threshold that are implemented. Default is ('auto_relative_abundance', 'manual_relative_abundance', 'second_derivative').
488    std_noise_threshold : int, optional
489        Default is 3.
490    peak_height_min_percent : float, optional
491        0-100 % used for peak detection. Default is 0.1.
492    peak_min_prominence_percent : float, optional
493        0-100 % used for peak detection. Default is 0.1.
494    eic_signal_threshold : float, optional
495        0-100 % used for extracted ion chromatogram peak detection. Default is 0.01.
496    max_rt_distance : float, optional
497        Maximum distance allowance for hierarchical cluster, in minutes. Default is 0.025.
498    verbose_processing : bool, optional
499        If True, print verbose processing information. Default is True.
500    """
501
502    use_deconvolution: bool = False
503
504    implemented_smooth_method: tuple = (
505        "savgol",
506        "hanning",
507        "blackman",
508        "bartlett",
509        "flat",
510        "boxcar",
511    )
512
513    smooth_window: int = 5
514
515    smooth_method: str = "savgol"
516
517    savgol_pol_order: int = 2
518
519    peak_derivative_threshold: float = 0.0005
520
521    peak_height_max_percent: float = 10  # 1-100 % used for baseline detection use 0.1 for second_derivative and 10 for other methods
522
523    peak_max_prominence_percent: float = 1  # 1-100 % used for baseline detection
524
525    min_peak_datapoints: float = 5
526
527    max_peak_width: float = 0.1
528
529    noise_threshold_method: str = "manual_relative_abundance"
530
531    noise_threshold_methods_implemented: tuple = (
532        "auto_relative_abundance",
533        "manual_relative_abundance",
534        "second_derivative",
535    )
536
537    std_noise_threshold: int = 3
538
539    peak_height_min_percent: float = 0.1  # 0-100 % used for peak detection
540
541    peak_min_prominence_percent: float = 0.1  # 0-100 % used for peak detection
542
543    eic_signal_threshold: float = (
544        0.01  # 0-100 % used for extracted ion chromatogram peak detection
545    )
546
547    max_rt_distance: float = (
548        0.025  # minutes, max distance allowance for hierarchical cluster
549    )
550
551    verbose_processing: bool = True
552
553    def __post_init__(self):
554        # enforce datatype
555        for field in dataclasses.fields(self):
556            value = getattr(self, field.name)
557            if not isinstance(value, field.type):
558                value = field.type(value)
559                setattr(self, field.name, value)

Gas chromatograph processing settings class

Attributes
  • use_deconvolution (bool, optional): If True, use deconvolution. Default is False.
  • implemented_smooth_method (tuple, optional): Smoothing methods that can be implemented. Default is ('savgol', 'hanning', 'blackman', 'bartlett', 'flat', 'boxcar').
  • smooth_window (int, optional): Window size for smoothing the ion chromatogram. Default is 5.
  • smooth_method (str, optional): Smoothing method to use. Default is 'savgol'. Other options are 'hanning', 'blackman', 'bartlett', 'flat', 'boxcar'.
  • savgol_pol_order (int, optional): Polynomial order for Savitzky-Golay smoothing. Default is 2.
  • peak_derivative_threshold (float, optional): Threshold for defining derivative crossing. Should be a value between 0 and 1. Defaults to 0.0005.
  • peak_height_max_percent (float, optional): Maximum height percentage used for baseline detection. Should be a value between 1 and 100. Defaults to 10.
  • peak_max_prominence_percent (float, optional): Maximum prominence percentage used for baseline detection. Should be a value between 1 and 100. Defaults to 1.
  • min_peak_datapoints (float, optional): Minimum number of data points used for peak detection. Should be a value between 0 and infinity. Defaults to 5.
  • max_peak_width (float, optional): Maximum peak width used for peak detection. Should be a value between 0 and infinity. Defaults to 0.1.
  • noise_threshold_method (str, optional): Method for detecting noise threshold. Default is 'manual_relative_abundance'.
  • noise_threshold_methods_implemented (tuple, optional): Methods for detecting the noise threshold that are implemented. Default is ('auto_relative_abundance', 'manual_relative_abundance', 'second_derivative').
  • std_noise_threshold (int, optional): Default is 3.
  • peak_height_min_percent (float, optional): 0-100 % used for peak detection. Default is 0.1.
  • peak_min_prominence_percent (float, optional): 0-100 % used for peak detection. Default is 0.1.
  • eic_signal_threshold (float, optional): 0-100 % used for extracted ion chromatogram peak detection. Default is 0.01.
  • max_rt_distance (float, optional): Maximum distance allowance for hierarchical cluster, in minutes. Default is 0.025.
  • verbose_processing (bool, optional): If True, print verbose processing information. Default is True.
GasChromatographSetting( use_deconvolution: bool = False, implemented_smooth_method: tuple = ('savgol', 'hanning', 'blackman', 'bartlett', 'flat', 'boxcar'), smooth_window: int = 5, smooth_method: str = 'savgol', savgol_pol_order: int = 2, peak_derivative_threshold: float = 0.0005, peak_height_max_percent: float = 10, peak_max_prominence_percent: float = 1, min_peak_datapoints: float = 5, max_peak_width: float = 0.1, noise_threshold_method: str = 'manual_relative_abundance', noise_threshold_methods_implemented: tuple = ('auto_relative_abundance', 'manual_relative_abundance', 'second_derivative'), std_noise_threshold: int = 3, peak_height_min_percent: float = 0.1, peak_min_prominence_percent: float = 0.1, eic_signal_threshold: float = 0.01, max_rt_distance: float = 0.025, verbose_processing: bool = True)
use_deconvolution: bool = False
implemented_smooth_method: tuple = ('savgol', 'hanning', 'blackman', 'bartlett', 'flat', 'boxcar')
smooth_window: int = 5
smooth_method: str = 'savgol'
savgol_pol_order: int = 2
peak_derivative_threshold: float = 0.0005
peak_height_max_percent: float = 10
peak_max_prominence_percent: float = 1
min_peak_datapoints: float = 5
max_peak_width: float = 0.1
noise_threshold_method: str = 'manual_relative_abundance'
noise_threshold_methods_implemented: tuple = ('auto_relative_abundance', 'manual_relative_abundance', 'second_derivative')
std_noise_threshold: int = 3
peak_height_min_percent: float = 0.1
peak_min_prominence_percent: float = 0.1
eic_signal_threshold: float = 0.01
max_rt_distance: float = 0.025
verbose_processing: bool = True
@dataclasses.dataclass
class CompoundSearchSettings:
562@dataclasses.dataclass
563class CompoundSearchSettings:
564    """Settings for compound search
565
566    Attributes
567    ----------
568    url_database : str, optional
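569        URL for the database. The value is always overwritten in __post_init__ with the SPECTRAL_GCMS_DATABASE_URL environment variable, falling back to 'sqlite:///db/pnnl_lowres_gcms_compounds.sqlite' when that variable is unset.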
570    ri_search_range : float, optional
571        Retention index search range. Default is 35.
572    rt_search_range : float, optional
573        Retention time search range, in minutes. Default is 1.0.
574    correlation_threshold : float, optional
575        Threshold for correlation for spectral similarity. Default is 0.5.
576    score_threshold : float, optional
577        Threshold for composite score. Default is 0.0.
578    ri_spacing : float, optional
579        Retention index spacing. Default is 200.
580    ri_std : float, optional
581        Retention index standard deviation. Default is 3.
582    ri_calibration_compound_names : list, optional
583        List of compound names to use for retention index calibration. Default is ['Methyl Caprylate', 'Methyl Caprate', 'Methyl Pelargonate', 'Methyl Laurate', 'Methyl Myristate', 'Methyl Palmitate', 'Methyl Stearate', 'Methyl Eicosanoate', 'Methyl Docosanoate', 'Methyl Linocerate', 'Methyl Hexacosanoate', 'Methyl Octacosanoate', 'Methyl Triacontanoate'].
584
585    """
586
587    url_database: str = "postgresql+psycopg2://coremsappdb:coremsapppnnl@localhost:5432/lowres"  # 'postgresql://postgres:labthomson0102@172.22.113.27:5432/GCMS' # 'sqlite:///db/pnnl_lowres_gcms_compounds.sqlite'
588
589    ri_search_range: float = 35
590
591    rt_search_range: float = 1.0  # used for retention index calibration
592
593    correlation_threshold: float = 0.5  # used for calibration, spectral similarity
594
595    score_threshold: float = 0.0
596
597    ri_spacing: float = 200
598
599    ri_std: float = 3  # in standard deviation
600
601    ri_calibration_compound_names: List = dataclasses.field(default_factory=list)
602
603    # calculates and export all spectral similarity methods
604    exploratory_mode: bool = False
605
606    score_methods: tuple = ("highest_sim_score", "highest_ss")
607
608    output_score_method: str = "All"
609
610    def __post_init__(self):
611        # enforce datatype
612        self.url_database = os.getenv(
613            "SPECTRAL_GCMS_DATABASE_URL",
614            "sqlite:///db/pnnl_lowres_gcms_compounds.sqlite",
615        )
616
617        for field in dataclasses.fields(self):
618            value = getattr(self, field.name)
619            if not isinstance(value, field.type):
620                value = field.type(value)
621                setattr(self, field.name, value)
622
623        self.ri_calibration_compound_names = [
624            "Methyl Caprylate",
625            "Methyl Caprate",
626            "Methyl Pelargonate",
627            "Methyl Laurate",
628            "Methyl Myristate",
629            "Methyl Palmitate",
630            "Methyl Stearate",
631            "Methyl Eicosanoate",
632            "Methyl Docosanoate",
633            "Methyl Linocerate",
634            "Methyl Hexacosanoate",
635            "Methyl Octacosanoate",
636            "Methyl Triacontanoate",
637        ]

Settings for compound search

Attributes
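  • url_database (str, optional): URL for the database. The value is always overwritten in __post_init__ with the SPECTRAL_GCMS_DATABASE_URL environment variable, falling back to 'sqlite:///db/pnnl_lowres_gcms_compounds.sqlite' when that variable is unset.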
  • ri_search_range (float, optional): Retention index search range. Default is 35.
  • rt_search_range (float, optional): Retention time search range, in minutes. Default is 1.0.
  • correlation_threshold (float, optional): Threshold for correlation for spectral similarity. Default is 0.5.
  • score_threshold (float, optional): Threshold for composite score. Default is 0.0.
  • ri_spacing (float, optional): Retention index spacing. Default is 200.
  • ri_std (float, optional): Retention index standard deviation. Default is 3.
  • ri_calibration_compound_names (list, optional): List of compound names to use for retention index calibration. Default is ['Methyl Caprylate', 'Methyl Caprate', 'Methyl Pelargonate', 'Methyl Laurate', 'Methyl Myristate', 'Methyl Palmitate', 'Methyl Stearate', 'Methyl Eicosanoate', 'Methyl Docosanoate', 'Methyl Linocerate', 'Methyl Hexacosanoate', 'Methyl Octacosanoate', 'Methyl Triacontanoate'].
CompoundSearchSettings( url_database: str = 'postgresql+psycopg2://coremsappdb:coremsapppnnl@localhost:5432/lowres', ri_search_range: float = 35, rt_search_range: float = 1.0, correlation_threshold: float = 0.5, score_threshold: float = 0.0, ri_spacing: float = 200, ri_std: float = 3, ri_calibration_compound_names: List = <factory>, exploratory_mode: bool = False, score_methods: tuple = ('highest_sim_score', 'highest_ss'), output_score_method: str = 'All')
url_database: str = 'postgresql+psycopg2://coremsappdb:coremsapppnnl@localhost:5432/lowres'
ri_search_range: float = 35
rt_search_range: float = 1.0
correlation_threshold: float = 0.5
score_threshold: float = 0.0
ri_spacing: float = 200
ri_std: float = 3
ri_calibration_compound_names: List
exploratory_mode: bool = False
score_methods: tuple = ('highest_sim_score', 'highest_ss')
output_score_method: str = 'All'
class MolecularLookupDictSettings:
class MolecularLookupDictSettings:
    """Settings used to generate the molecular formula database entries

    Leave these values untouched for an existing database; change them only
    when creating a new application database. For runtime search options and
    database query checks use the MolecularFormulaSearchSettings class.

    Attributes
    ----------
    usedAtoms : dict, optional
        Atom symbol mapped to a (min, max) range.
        Default is {'C': (1, 90), 'H': (4, 200), 'O': (0, 12), 'N': (0, 0), 'S': (0, 0), 'P': (0, 0), 'Cl': (0, 0)}.
    min_mz : float, optional
        Minimum m/z to use for searching. Default is 50.
    max_mz : float, optional
        Maximum m/z to use for searching. Default is 1200.
    min_dbe : float, optional
        Minimum double bond equivalent to use for searching. Default is 0.
    max_dbe : float, optional
        Maximum double bond equivalent to use for searching. Default is 50.
    use_pah_line_rule : bool, optional
        If True, use the PAH line rule. Default is False.
    isRadical : bool, optional
        If True, search for radical ions. Default is True.
    isProtonated : bool, optional
        If True, search for protonated ions. Default is True.
    url_database : str, optional
        URL for the database. Default is None.
    db_jobs : int, optional
        Number of jobs to use for database queries. Default is 1.
    used_atom_valences : dict, optional
        Atom symbol mapped to its valence.
        Default is {'C': 4, '13C': 4, 'H': 1, 'O': 2, '18O': 2, 'N': 3, 'S': 2, '34S': 2, 'P': 3, 'Cl': 1, '37Cl': 1, 'Br': 1, 'Na': 1, 'F': 1, 'K': 0}.

    """

    # Rules for editing the atom tables below:
    #   * C, H, N, O, S and P must ALWAYS be present in usedAtoms; to leave one
    #     of them out of the search set both its min and max to 0.
    #   * Any atom listed in the Atoms class of the
    #     encapsulation.settings.constants module may be added.
    #   * When adding an atom, also add its selected covalence to
    #     used_atom_valences.
    #   * NOTE: adduct atoms have zero covalence.
    #   * NOTE: everything is set per instance (no static/class variables)
    #     because this class is distributed using multiprocessing.
    def __init__(self):
        atom_ranges = (
            ("C", (1, 90)),
            ("H", (4, 200)),
            ("O", (0, 12)),
            ("N", (0, 0)),
            ("S", (0, 0)),
            ("P", (0, 0)),
            ("Cl", (0, 0)),
        )
        self.usedAtoms = dict(atom_ranges)

        # m/z window, then DBE window
        self.min_mz, self.max_mz = 50, 1200
        self.min_dbe, self.max_dbe = 0, 50

        # when enabled, replaces the DBE limits above with
        # DBE = (C + heteroatoms) * 0.9
        self.use_pah_line_rule = False

        self.isRadical = True
        self.isProtonated = True

        self.url_database = None
        self.db_jobs = 1

        atom_valences = (
            ("C", 4),
            ("13C", 4),
            ("H", 1),
            ("O", 2),
            ("18O", 2),
            ("N", 3),
            ("S", 2),
            ("34S", 2),
            ("P", 3),
            ("Cl", 1),
            ("37Cl", 1),
            ("Br", 1),
            ("Na", 1),
            ("F", 1),
            ("K", 0),
        )
        self.used_atom_valences = dict(atom_valences)

Settings for molecular searching

These are used to generate the database entries, do not change.

Attributes
  • usedAtoms (dict, optional): Dictionary of atoms and ranges. Default is {'C': (1, 90), 'H': (4, 200), 'O': (0, 12), 'N': (0, 0), 'S': (0, 0), 'P': (0, 0), 'Cl': (0, 0)}.
  • min_mz (float, optional): Minimum m/z to use for searching. Default is 50.0.
  • max_mz (float, optional): Maximum m/z to use for searching. Default is 1200.0.
  • min_dbe (float, optional): Minimum double bond equivalent to use for searching. Default is 0.
  • max_dbe (float, optional): Maximum double bond equivalent to use for searching. Default is 50.
  • use_pah_line_rule (bool, optional): If True, use the PAH line rule. Default is False.
  • isRadical (bool, optional): If True, search for radical ions. Default is True.
  • isProtonated (bool, optional): If True, search for protonated ions. Default is True.
  • url_database (str, optional): URL for the database. Default is None.
  • db_jobs (int, optional): Number of jobs to use for database queries. Default is 1.
  • used_atom_valences (dict, optional): Dictionary of atoms and valences. Default is {'C': 4, '13C': 4, 'H': 1, 'O': 2, '18O': 2, 'N': 3, 'S': 2, '34S': 2, 'P': 3, 'Cl': 1, '37Cl': 1, 'Br': 1, 'Na': 1, 'F': 1, 'K': 0}.
usedAtoms
min_mz
max_mz
min_dbe
max_dbe
use_pah_line_rule
isRadical
isProtonated
url_database
db_jobs
used_atom_valences
@dataclasses.dataclass
class MolecularFormulaSearchSettings:
@dataclasses.dataclass
class MolecularFormulaSearchSettings:
    """Runtime settings for the molecular formula search

    After initialisation, any value whose type does not match its annotation
    is converted by calling the annotated type on it.

    Attributes
    ----------
    verbose_processing : bool, optional
        If True, print verbose processing information. Default is True.
    use_isotopologue_filter : bool, optional
        If True, use isotopologue filter. Default is False.
    isotopologue_filter_threshold : float, optional
        Threshold for isotopologue filter. Default is 33.
    isotopologue_filter_atoms : tuple, optional
        Atoms to use for the isotopologue filter. Default is ('Cl', 'Br').
    use_runtime_kendrick_filter : bool, optional
        If True, use runtime Kendrick filter. Default is False.
    use_min_peaks_filter : bool, optional
        If True, use minimum peaks filter. Default is True.
    min_peaks_per_class : int, optional
        Minimum number of peaks per class. Default is 15.
    url_database : str, optional
        URL for the database.
        Default is 'postgresql+psycopg2://coremsappdb:coremsapppnnl@localhost:5432/coremsapp'.
        When an empty or None value is given, the COREMS_DATABASE_URL
        environment variable is used, falling back to 'sqlite:///db/molformula.db'.
    db_jobs : int, optional
        Number of jobs to use for database queries. Default is 3.
    db_chunk_size : int, optional
        Chunk size to use for database queries. Default is 300.
    ion_charge : int, optional
        Ion charge. Default is -1.
    min_hc_filter : float, optional
        Minimum hydrogen to carbon ratio. Default is 0.3.
    max_hc_filter : float, optional
        Maximum hydrogen to carbon ratio. Default is 3.
    min_oc_filter : float, optional
        Minimum oxygen to carbon ratio. Default is 0.0.
    max_oc_filter : float, optional
        Maximum oxygen to carbon ratio. Default is 1.2.
    min_op_filter : float, optional
        Minimum oxygen to phosphorus ratio. Default is 2.
    use_pah_line_rule : bool, optional
        If True, use the PAH line rule. Default is False.
    min_dbe : float, optional
        Minimum double bond equivalent to use for searching. Default is 0.
    max_dbe : float, optional
        Maximum double bond equivalent to use for searching. Default is 40.
    mz_error_score_weight : float, optional
        Weight of the m/z error score in the composite score. Default is 0.6.
    isotopologue_score_weight : float, optional
        Weight of the isotopologue score in the composite score. Default is 0.4.
    adduct_atoms_neg : tuple, optional
        Adduct atoms to use in negative polarity. Default is ('Cl', 'Br').
    adduct_atoms_pos : tuple, optional
        Adduct atoms to use in positive polarity. Default is ('Na', 'K').
    score_methods : tuple, optional
        Score methods that can be selected.
        Default is ('S_P_lowest_error', 'N_S_P_lowest_error', 'lowest_error', 'prob_score', 'air_filter_error', 'water_filter_error', 'earth_filter_error').
    score_method : str, optional
        Score method to use, one of the entries of score_methods. Default is 'prob_score'.
    output_min_score : float, optional
        Minimum score for output. Default is 0.1.
    output_score_method : str, optional
        Score method to use for output. Default is 'All Candidates'.
    isRadical : bool, optional
        If True, search for radical ions. Default is False.
    isProtonated : bool, optional
        If True, search for protonated ions. Default is True.
    isAdduct : bool, optional
        If True, search for adduct ions. Default is False.
    usedAtoms : dict, optional
        Dictionary of atoms and ranges. Default is an empty dict; 'C' is added
        as (1, 100) and 'H' as (1, 200) whenever they are missing.
    ion_types_excluded : list, optional
        Ion types to exclude from the molecular id search, commonly
        ['[M+CH3COO]-'] or ['[M+COOH]-'] depending on mobile phase content. Default is [].
    ionization_type : str, optional
        Ionization type. Default is 'ESI'.
    min_ppm_error : float, optional
        Minimum ppm error. Default is -10.0.
    max_ppm_error : float, optional
        Maximum ppm error. Default is 10.0.
    min_abun_error : float, optional
        Minimum abundance error (percentage) for isotopologue search. Default is -100.0.
    max_abun_error : float, optional
        Maximum abundance error (percentage) for isotopologue search. Default is 100.0.
    mz_error_range : float, optional
        m/z error range. Default is 1.5.
    error_method : str, optional
        Error method. Default is 'None'. Options are 'distance', 'lowest', 'symmetrical', 'average', 'None'.
    mz_error_average : float, optional
        m/z error average. Default is 0.0.
    used_atom_valences : dict, optional
        Dictionary of atoms and valences. Default is an empty dict; every atom
        of Atoms.atoms_covalence that is missing is filled in with its
        covalence (the first value when several are listed).
    """

    verbose_processing: bool = True

    # ---- filters ----
    use_isotopologue_filter: bool = False
    isotopologue_filter_threshold: float = 33
    isotopologue_filter_atoms: tuple = ("Cl", "Br")
    use_runtime_kendrick_filter: bool = False
    use_min_peaks_filter: bool = True
    min_peaks_per_class: int = 15

    # ---- database access ----
    url_database: str = (
        "postgresql+psycopg2://coremsappdb:coremsapppnnl@localhost:5432/coremsapp"
    )
    db_jobs: int = 3
    db_chunk_size: int = 300

    # ---- query settings ----
    ion_charge: int = -1
    min_hc_filter: float = 0.3
    max_hc_filter: float = 3
    min_oc_filter: float = 0.0
    max_oc_filter: float = 1.2
    min_op_filter: float = 2
    use_pah_line_rule: bool = False
    min_dbe: float = 0
    max_dbe: float = 40
    mz_error_score_weight: float = 0.6
    isotopologue_score_weight: float = 0.4

    # closed-shell [M + Adduct] ions only consider the atoms listed here
    adduct_atoms_neg: tuple = ("Cl", "Br")
    adduct_atoms_pos: tuple = ("Na", "K")

    score_methods: tuple = (
        "S_P_lowest_error",
        "N_S_P_lowest_error",
        "lowest_error",
        "prob_score",
        "air_filter_error",
        "water_filter_error",
        "earth_filter_error",
    )
    score_method: str = "prob_score"
    output_min_score: float = 0.1
    output_score_method: str = "All Candidates"

    # radical ions: [M].+ or [M].- depending on the polarity mode
    isRadical: bool = False
    # protonated / de-protonated ions: [M + H]+ or [M - H]- depending on the
    # polarity mode
    isProtonated: bool = True
    isAdduct: bool = False

    usedAtoms: dict = dataclasses.field(default_factory=dict)
    ion_types_excluded: list = dataclasses.field(default_factory=list)

    # ---- search settings ----
    ionization_type: str = "ESI"

    # the error windows below were set empirically and need optimization
    min_ppm_error: float = -10.0  # ppm
    max_ppm_error: float = 10.0  # ppm
    min_abun_error: float = -100.0  # percentage, isotopologue search
    max_abun_error: float = 100.0  # percentage, isotopologue search
    mz_error_range: float = 1.5

    # one of 'distance', 'lowest', 'symmetrical', 'average', 'None'
    error_method: str = "None"
    mz_error_average: float = 0.0

    # atom symbol -> valence, e.g. {'C': 4, 'H': 1}
    used_atom_valences: dict = dataclasses.field(default_factory=dict)

    def __post_init__(self):
        """Resolve the database URL, coerce field types and fill atom defaults."""
        # An empty/None URL falls back to the environment, then to local SQLite.
        if not self.url_database:
            fallback_url = os.getenv(
                "COREMS_DATABASE_URL", "sqlite:///db/molformula.db"
            )
            self.url_database = fallback_url

        # Coerce every value that does not already match its annotated type.
        for spec in dataclasses.fields(self):
            current = getattr(self, spec.name)
            if isinstance(current, spec.type):
                continue
            setattr(self, spec.name, spec.type(current))

        # C and H are always searched; add a range for each one if missing.
        self.usedAtoms.setdefault("C", (1, 100))
        self.usedAtoms.setdefault("H", (1, 200))

        # Fill in a valence for every known atom that was not supplied.
        for atom, covalence in Atoms.atoms_covalence.items():
            if atom in self.used_atom_valences:
                continue
            if isinstance(covalence, int):
                self.used_atom_valences[atom] = covalence
            else:
                # several covalences are listed: take the first one, which
                # should be the most common
                self.used_atom_valences[atom] = covalence[0]

Settings for molecular searching

Attributes
  • use_isotopologue_filter (bool, optional): If True, use isotopologue filter. Default is False.
  • isotopologue_filter_threshold (float, optional): Threshold for isotopologue filter. Default is 33.
  • isotopologue_filter_atoms (tuple, optional): Tuple of atoms to use for isotopologue filter. Default is ('Cl', 'Br').
  • use_runtime_kendrick_filter (bool, optional): If True, use runtime Kendrick filter. Default is False.
  • use_min_peaks_filter (bool, optional): If True, use minimum peaks filter. Default is True.
  • min_peaks_per_class (int, optional): Minimum number of peaks per class. Default is 15.
  • url_database (str, optional): URL for the database. Default is 'postgresql+psycopg2://coremsappdb:coremsapppnnl@localhost:5432/coremsapp'.
  • db_jobs (int, optional): Number of jobs to use for database queries. Default is 3.
  • db_chunk_size (int, optional): Chunk size to use for database queries. Default is 300.
  • ion_charge (int, optional): Ion charge. Default is -1.
  • min_hc_filter (float, optional): Minimum hydrogen to carbon ratio. Default is 0.3.
  • max_hc_filter (float, optional): Maximum hydrogen to carbon ratio. Default is 3.
  • min_oc_filter (float, optional): Minimum oxygen to carbon ratio. Default is 0.0.
  • max_oc_filter (float, optional): Maximum oxygen to carbon ratio. Default is 1.2.
  • min_op_filter (float, optional): Minimum oxygen to phosphorous ratio. Default is 2.
  • use_pah_line_rule (bool, optional): If True, use the PAH line rule. Default is False.
  • min_dbe (float, optional): Minimum double bond equivalent to use for searching. Default is 0.
  • max_dbe (float, optional): Maximum double bond equivalent to use for searching. Default is 40.
  • mz_error_score_weight (float, optional): Weight for m/z error score to contribute to composite score. Default is 0.6.
  • isotopologue_score_weight (float, optional): Weight for isotopologue score to contribute to composite score. Default is 0.4.
  • adduct_atoms_neg (tuple, optional): Tuple of atoms to use in negative polarity. Default is ('Cl', 'Br').
  • adduct_atoms_pos (tuple, optional): Tuple of atoms to use in positive polarity. Default is ('Na', 'K').
  • score_methods (tuple, optional): Tuple of score method that can be implemented. Default is ('S_P_lowest_error', 'N_S_P_lowest_error', 'lowest_error', 'prob_score', 'air_filter_error', 'water_filter_error', 'earth_filter_error').
  • score_method (str, optional): Score method to use. Default is 'prob_score'. Options are 'S_P_lowest_error', 'N_S_P_lowest_error', 'lowest_error', 'prob_score', 'air_filter_error', 'water_filter_error', 'earth_filter_error'.
  • output_min_score (float, optional): Minimum score for output. Default is 0.1.
  • output_score_method (str, optional): Score method to use for output. Default is 'All Candidates'.
  • isRadical (bool, optional): If True, search for radical ions. Default is False.
  • isProtonated (bool, optional): If True, search for protonated ions. Default is True.
  • isAdduct (bool, optional): If True, search for adduct ions. Default is False.
  • usedAtoms (dict, optional): Dictionary of atoms and ranges. Default is an empty dict; 'C' is added as (1, 100) and 'H' as (1, 200) whenever they are missing.
  • ion_types_excluded (list, optional): List of ion types to exclude from molecular id search, commonly ['[M+CH3COO]-'] or ['[M+COOH]-'] depending on mobile phase content. Default is [].
  • ionization_type (str, optional): Ionization type. Default is 'ESI'.
  • min_ppm_error (float, optional): Minimum ppm error. Default is -10.0.
  • max_ppm_error (float, optional): Maximum ppm error. Default is 10.0.
  • min_abun_error (float, optional): Minimum abundance error for isotopologue search. Default is -100.0.
  • max_abun_error (float, optional): Maximum abundance error for isotopologue search. Default is 100.0.
  • mz_error_range (float, optional): m/z error range. Default is 1.5.
  • error_method (str, optional): Error method. Default is 'None'. Options are 'distance', 'lowest', 'symmetrical', 'average', 'None'.
  • mz_error_average (float, optional): m/z error average. Default is 0.0.
  • used_atom_valences (dict, optional): Dictionary of atoms and valences. Default is an empty dict; every atom of Atoms.atoms_covalence that is missing is filled in with its covalence (the first value when several are listed).
  • verbose_processing (bool, optional): If True, print verbose processing information. Default is True.
MolecularFormulaSearchSettings( verbose_processing: bool = True, use_isotopologue_filter: bool = False, isotopologue_filter_threshold: float = 33, isotopologue_filter_atoms: tuple = ('Cl', 'Br'), use_runtime_kendrick_filter: bool = False, use_min_peaks_filter: bool = True, min_peaks_per_class: int = 15, url_database: str = 'postgresql+psycopg2://coremsappdb:coremsapppnnl@localhost:5432/coremsapp', db_jobs: int = 3, db_chunk_size: int = 300, ion_charge: int = -1, min_hc_filter: float = 0.3, max_hc_filter: float = 3, min_oc_filter: float = 0.0, max_oc_filter: float = 1.2, min_op_filter: float = 2, use_pah_line_rule: bool = False, min_dbe: float = 0, max_dbe: float = 40, mz_error_score_weight: float = 0.6, isotopologue_score_weight: float = 0.4, adduct_atoms_neg: tuple = ('Cl', 'Br'), adduct_atoms_pos: tuple = ('Na', 'K'), score_methods: tuple = ('S_P_lowest_error', 'N_S_P_lowest_error', 'lowest_error', 'prob_score', 'air_filter_error', 'water_filter_error', 'earth_filter_error'), score_method: str = 'prob_score', output_min_score: float = 0.1, output_score_method: str = 'All Candidates', isRadical: bool = False, isProtonated: bool = True, isAdduct: bool = False, usedAtoms: dict = <factory>, ion_types_excluded: list = <factory>, ionization_type: str = 'ESI', min_ppm_error: float = -10.0, max_ppm_error: float = 10.0, min_abun_error: float = -100.0, max_abun_error: float = 100.0, mz_error_range: float = 1.5, error_method: str = 'None', mz_error_average: float = 0.0, used_atom_valences: dict = <factory>)
verbose_processing: bool = True
use_isotopologue_filter: bool = False
isotopologue_filter_threshold: float = 33
isotopologue_filter_atoms: tuple = ('Cl', 'Br')
use_runtime_kendrick_filter: bool = False
use_min_peaks_filter: bool = True
min_peaks_per_class: int = 15
url_database: str = 'postgresql+psycopg2://coremsappdb:coremsapppnnl@localhost:5432/coremsapp'
db_jobs: int = 3
db_chunk_size: int = 300
ion_charge: int = -1
min_hc_filter: float = 0.3
max_hc_filter: float = 3
min_oc_filter: float = 0.0
max_oc_filter: float = 1.2
min_op_filter: float = 2
use_pah_line_rule: bool = False
min_dbe: float = 0
max_dbe: float = 40
mz_error_score_weight: float = 0.6
isotopologue_score_weight: float = 0.4
adduct_atoms_neg: tuple = ('Cl', 'Br')
adduct_atoms_pos: tuple = ('Na', 'K')
score_methods: tuple = ('S_P_lowest_error', 'N_S_P_lowest_error', 'lowest_error', 'prob_score', 'air_filter_error', 'water_filter_error', 'earth_filter_error')
score_method: str = 'prob_score'
output_min_score: float = 0.1
output_score_method: str = 'All Candidates'
isRadical: bool = False
isProtonated: bool = True
isAdduct: bool = False
usedAtoms: dict
ion_types_excluded: list
ionization_type: str = 'ESI'
min_ppm_error: float = -10.0
max_ppm_error: float = 10.0
min_abun_error: float = -100.0
max_abun_error: float = 100.0
mz_error_range: float = 1.5
error_method: str = 'None'
mz_error_average: float = 0.0
used_atom_valences: dict