corems.mass_spectra.input.mzml

  1from collections import defaultdict
  2from pathlib import Path
  3
  4import numpy as np
  5import pandas as pd
  6import pymzml
  7import datetime
  8
  9from corems.encapsulation.constant import Labels
 10from corems.encapsulation.factory.parameters import default_parameters
 11from corems.mass_spectra.factory.lc_class import LCMSBase, MassSpectraBase
 12from corems.mass_spectra.input.parserbase import SpectraParserInterface
 13from corems.mass_spectrum.factory.MassSpectrumClasses import (
 14    MassSpecCentroid,
 15    MassSpecProfile,
 16)
 17
 18
 19class MZMLSpectraParser(SpectraParserInterface):
 20    """A class for parsing mzml spectrometry data files into MassSpectraBase or LCMSBase objects
 21
 22    Parameters
 23    ----------
 24    file_location : str or Path
 25        The path to the RAW file to be parsed.
 26    analyzer : str, optional
 27        The type of mass analyzer used in the instrument. Default is "Unknown".
 28    instrument_label : str, optional
 29        The name of the instrument used to acquire the data. Default is "Unknown".
 30    sample_name : str, optional
 31        The name of the sample being analyzed. If not provided, the stem of the file_location path will be used.
 32
 33    Attributes
 34    ----------
 35    file_location : Path
 36        The path to the RAW file being parsed.
 37    analyzer : str
 38        The type of mass analyzer used in the instrument.
 39    instrument_label : str
 40        The name of the instrument used to acquire the data.
 41    sample_name : str
 42        The name of the sample being analyzed.
 43
 44    Methods
 45    -------
 46    * load().
 47        Load mzML file using pymzml.run.Reader and return the data as a numpy array.
 48    * run(spectra=True).
 49        Parses the mzml file and returns a dictionary of mass spectra dataframes and a scan metadata dataframe.
 50    * get_mass_spectrum_from_scan(scan_number, polarity, auto_process=True)
 51        Parses the mzml file and returns a MassSpecBase object from a single scan.
 52    * get_mass_spectra_obj().
 53        Parses the mzml file and instantiates a MassSpectraBase object.
 54    * get_lcms_obj().
 55        Parses the mzml file and instantiates an LCMSBase object.
 56    * get_instrument_info().
 57        Return instrument information from the mzML file.
 58    * get_creation_time().
 59        Return the creation time of the mzML file as a datetime object.
 60
 61    Inherits from ThermoBaseClass and SpectraParserInterface
 62    """
 63
 64    def __init__(
 65        self,
 66        file_location,
 67        analyzer="Unknown",
 68        instrument_label="Unknown",
 69        sample_name=None,
 70    ):
 71        # implementation details
 72        if isinstance(file_location, str):
 73            # if obj is a string it defaults to create a Path obj, pass the S3Path if needed
 74            file_location = Path(file_location)
 75        if not file_location.exists():
 76            raise FileExistsError("File does not exist: " + str(file_location))
 77        self.file_location = file_location
 78        self.analyzer = analyzer
 79        self.instrument_label = instrument_label
 80
 81        if sample_name:
 82            self.sample_name = sample_name
 83        else:
 84            self.sample_name = file_location.stem
 85
 86    def load(self):
 87        """
 88        Load mzML file using pymzml.run.Reader and return the data as a numpy array.
 89
 90        Returns
 91        -------
 92        numpy.ndarray
 93            The mass spectra data as a numpy array.
 94        """
 95        data = pymzml.run.Reader(self.file_location)
 96        return data
 97
 98    def get_scan_df(self, data):
 99        """
100        Return scan data as a pandas DataFrame.
101
102        Parameters
103        ----------
104        data : pymzml.run.Reader
105            The mass spectra data.
106
107        Returns
108        -------
109        pandas.DataFrame
110            A pandas DataFrame containing metadata for each scan, including scan number, MS level, polarity, and scan time.
111        """
112        # Scan dict
113        # instatinate scan dict, with empty lists of size of scans
114        n_scans = data.get_spectrum_count()
115        scan_dict = {
116            "scan": np.empty(n_scans, dtype=np.int32),
117            "scan_time": np.empty(n_scans, dtype=np.float32),
118            "ms_level": [None] * n_scans,
119            "polarity": [None] * n_scans,
120            "precursor_mz": [None] * n_scans,
121            "scan_text": [None] * n_scans,
122            "scan_window_lower": np.empty(n_scans, dtype=np.float32),
123            "scan_window_upper": np.empty(n_scans, dtype=np.float32),
124            "scan_precision": [None] * n_scans,
125            "tic": np.empty(n_scans, dtype=np.float32),
126            "ms_format": [None] * n_scans,
127        }
128
129        # First pass: loop through scans to get scan info
130        for i, spec in enumerate(data):
131            scan_dict["scan"][i] = spec.ID
132            scan_dict["ms_level"][i] = spec.ms_level
133            scan_dict["scan_precision"][i] = spec._measured_precision
134            scan_dict["tic"][i] = spec.TIC
135            if spec.selected_precursors:
136                scan_dict["precursor_mz"][i] = spec.selected_precursors[0].get(
137                    "mz", None
138                )
139            if spec["negative scan"] is not None:
140                scan_dict["polarity"][i] = "negative"
141            if spec["positive scan"] is not None:
142                scan_dict["polarity"][i] = "positive"
143            if spec["negative scan"] is not None and spec["positive scan"] is not None:
144                raise ValueError(
145                    "Error: scan {0} has both negative and positive polarity".format(
146                        spec.ID
147                    )
148                )
149
150            scan_dict["scan_time"][i] = spec.get("MS:1000016")
151            scan_dict["scan_text"][i] = spec.get("MS:1000512")
152            scan_dict["scan_window_lower"][i] = spec.get("MS:1000501")
153            scan_dict["scan_window_upper"][i] = spec.get("MS:1000500")
154            if spec.get("MS:1000128"):
155                scan_dict["ms_format"][i] = "profile"
156            elif spec.get("MS:1000127"):
157                scan_dict["ms_format"][i] = "centroid"
158            else:
159                scan_dict["ms_format"][i] = None
160
161        scan_df = pd.DataFrame(scan_dict)
162
163        return scan_df
164
165    def get_ms_raw(self, spectra, scan_df, data):
166        """Return a dictionary of mass spectra data as a pandas DataFrame.
167
168        Parameters
169        ----------
170        spectra : str
171            Which mass spectra data to include in the output.
172            Options: None, "ms1", "ms2", "all".
173        scan_df : pandas.DataFrame
174            Scan dataframe. Output from get_scan_df().
175        data : pymzml.run.Reader
176            The mass spectra data.
177
178        Returns
179        -------
180        dict
181            A dictionary containing the mass spectra data as pandas DataFrames, with keys corresponding to the MS level.
182
183        """
184        if spectra == "all":
185            scan_df_forspec = scan_df
186        elif spectra == "ms1":
187            scan_df_forspec = scan_df[scan_df.ms_level == 1]
188        elif spectra == "ms2":
189            scan_df_forspec = scan_df[scan_df.ms_level == 2]
190        else:
191            raise ValueError("spectra must be 'all', 'ms1', or 'ms2'")
192
193        # Result container
194        res = {}
195
196        # Row count container
197        counter = {}
198
199        # Column name container
200        cols = {}
201
202        # set at float32
203        dtype = np.float32
204
205        # First pass: get nrows
206        N = defaultdict(lambda: 0)
207        for i, spec in enumerate(data):
208            if i in scan_df_forspec.scan:
209                # Get ms level
210                level = "ms{}".format(spec.ms_level)
211
212                # Number of rows
213                N[level] += spec.mz.shape[0]
214
215        # Second pass: parse
216        for i, spec in enumerate(data):
217            if i in scan_df_forspec.scan:
218                # Number of rows
219                n = spec.mz.shape[0]
220
221                # No measurements
222                if n == 0:
223                    continue
224
225                # Dimension check
226                if len(spec.mz) != len(spec.i):
227                    # raise an error if the mz and intensity arrays are not the same length
228                    raise ValueError("m/z and intensity array dimension mismatch")
229
230                # Scan/frame info
231                id_dict = spec.id_dict
232
233                # Get ms level
234                level = "ms{}".format(spec.ms_level)
235
236                # Columns
237                cols[level] = list(id_dict.keys()) + ["mz", "intensity"]
238                m = len(cols[level])
239
240                # Subarray init
241                arr = np.empty((n, m), dtype=dtype)
242                inx = 0
243
244                # Populate scan/frame info
245                for k, v in id_dict.items():
246                    arr[:, inx] = v
247                    inx += 1
248
249                # Populate m/z
250                arr[:, inx] = spec.mz
251                inx += 1
252
253                # Populate intensity
254                arr[:, inx] = spec.i
255                inx += 1
256
257                # Initialize output container
258                if level not in res:
259                    res[level] = np.empty((N[level], m), dtype=dtype)
260                    counter[level] = 0
261
262                # Insert subarray
263                res[level][counter[level] : counter[level] + n, :] = arr
264                counter[level] += n
265
266        # Construct ms1 and ms2 mz dataframes
267        for level in res.keys():
268            res[level] = pd.DataFrame(res[level], columns=cols[level]).drop(
269                columns=["controllerType", "controllerNumber"],
270                axis=1,
271                inplace=False,
272            )
273
274        return res
275
276    def run(self, spectra="all", scan_df=None):
277        """Parse the mzML file and return a dictionary of spectra dataframes and a scan metadata dataframe.
278
279        Parameters
280        ----------
281        spectra : str, optional
282            Which mass spectra data to include in the output. Default is "all".
283            Other options: None, "ms1", "ms2".
284        scan_df : pandas.DataFrame, optional
285            Scan dataframe.  If not provided, the scan dataframe is created from the mzML file.
286
287        Returns
288        -------
289        tuple
290            A tuple containing two elements:
291            - A dictionary containing the mass spectra data as numpy arrays, with keys corresponding to the MS level.
292            - A pandas DataFrame containing metadata for each scan, including scan number, MS level, polarity, and scan time.
293        """
294
295        # Open file
296        data = self.load()
297
298        if scan_df is None:
299            scan_df = self.get_scan_df(data)
300
301        if spectra != "none":
302            res = self.get_ms_raw(spectra, scan_df, data)
303
304        else:
305            res = None
306
307        return res, scan_df
308
309    def get_mass_spectrum_from_scan(
310        self, scan_number, spectrum_mode, auto_process=True
311    ):
312        """Instatiate a mass spectrum object from the mzML file.
313
314        Parameters
315        ----------
316        scan_number : int
317            The scan number to be parsed.
318        spectrum_mode : str
319            The type of spectrum to instantiate.  Must be'profile' or 'centroid'.
320        polarity : int
321            The polarity of the scan.  Must be -1 or 1.
322        auto_process : bool, optional
323            If True, process the mass spectrum. Default is True.
324
325        Returns
326        -------
327        MassSpecProfile | MassSpecCentroid
328            The MassSpecProfile or MassSpecCentroid object containing the parsed mass spectrum.
329        """
330
331        def set_metadata(
332            scan_number: int,
333            polarity: int,
334            file_location: str,
335            label=Labels.thermo_profile,
336        ):
337            """
338            Set the output parameters for creating a MassSpecProfile or MassSpecCentroid object.
339
340            Parameters
341            ----------
342            scan_number : int
343                The scan number.
344            polarity : int
345                The polarity of the data.
346            file_location : str
347                The file location.
348            label : str, optional
349                The label for the mass spectrum. Default is Labels.thermo_profile.
350
351            Returns
352            -------
353            dict
354                The output parameters ready for creating a MassSpecProfile or MassSpecCentroid object.
355            """
356            d_params = default_parameters(file_location)
357            d_params["label"] = label
358            d_params["polarity"] = polarity
359            d_params["filename_path"] = file_location
360            d_params["scan_number"] = scan_number
361
362            return d_params
363
364        # Open file
365        data = self.load()
366
367        # Pluck out individual scan mz and intensity
368        spec = data[scan_number]
369
370        # Get polarity
371        if spec["negative scan"] is not None:
372            polarity = -1
373        elif spec["positive scan"] is not None:
374            polarity = 1
375
376        # Get mass spectrum
377        if spectrum_mode == "profile":
378            # Check if profile
379            if not spec.get("MS:1000128"):
380                raise ValueError("spectrum is not profile")
381            data_dict = {
382                Labels.mz: spec.mz,
383                Labels.abundance: spec.i,
384            }
385            d_params = set_metadata(
386                scan_number,
387                polarity,
388                self.file_location,
389                label=Labels.simulated_profile,
390            )
391            mass_spectrum_obj = mass_spectrum_obj = MassSpecProfile(
392                data_dict, d_params, auto_process=auto_process
393            )
394        elif spectrum_mode == "centroid":
395            # Check if centroided
396            if not spec.get("MS:1000127"):
397                raise ValueError("spectrum is not centroided")
398            data_dict = {
399                Labels.mz: spec.mz,
400                Labels.abundance: spec.i,
401                Labels.rp: [np.nan] * len(spec.mz),
402                Labels.s2n: [np.nan] * len(spec.i),
403            }
404            d_params = set_metadata(
405                scan_number, polarity, self.file_location, label=Labels.corems_centroid
406            )
407            mass_spectrum_obj = MassSpecCentroid(
408                data_dict, d_params, auto_process=auto_process
409            )
410
411        return mass_spectrum_obj
412
413    def get_mass_spectra_obj(self):
414        """Instatiate a MassSpectraBase object from the mzML file.
415
416
417        Returns
418        -------
419        MassSpectraBase
420            The MassSpectra object containing the parsed mass spectra.
421            The object is instatiated with the mzML file, analyzer, instrument, sample name, and scan dataframe.
422        """
423        _, scan_df = self.run(spectra=False)
424        mass_spectra_obj = MassSpectraBase(
425            self.file_location,
426            self.analyzer,
427            self.instrument_label,
428            self.sample_name,
429            self,
430        )
431        scan_df = scan_df.set_index("scan", drop=False)
432        mass_spectra_obj.scan_df = scan_df
433
434        return mass_spectra_obj
435
436    def get_lcms_obj(self, spectra="all"):
437        """Instatiates a LCMSBase object from the mzML file.
438
439        Parameters
440        ----------
441        spectra : str, optional
442            Which mass spectra data to include in the output. Default is all.  Other options: none, ms1, ms2.
443
444        Returns
445        -------
446        LCMSBase
447            LCMS object containing mass spectra data.
448            The object is instatiated with the mzML file, analyzer, instrument, sample name, scan dataframe,
449            and mz dataframe(s), as well as lists of scan numbers, retention times, and TICs.
450        """
451        _, scan_df = self.run(spectra="none")  # first run it to just get scan info
452        res, scan_df = self.run(
453            scan_df=scan_df, spectra=spectra
454        )  # second run to parse data
455        lcms_obj = LCMSBase(
456            self.file_location,
457            self.analyzer,
458            self.instrument_label,
459            self.sample_name,
460            self,
461        )
462        for key in res:
463            key_int = int(key.replace("ms", ""))
464            res[key] = res[key][res[key].intensity > 0]
465            res[key] = res[key].sort_values(by=["scan", "mz"]).reset_index(drop=True)
466            lcms_obj._ms_unprocessed[key_int] = res[key]
467        lcms_obj.scan_df = scan_df.set_index("scan", drop=False)
468        # Check if polarity is mixed
469        if len(set(scan_df.polarity)) > 1:
470            raise ValueError("Mixed polarities detected in scan data")
471        lcms_obj.polarity = scan_df.polarity[0]
472        lcms_obj._scans_number_list = list(scan_df.scan)
473        lcms_obj._retention_time_list = list(scan_df.scan_time)
474        lcms_obj._tic_list = list(scan_df.tic)
475
476        return lcms_obj
477
478    def get_instrument_info(self):
479        """
480        Return instrument information.
481
482        Returns
483        -------
484        dict
485            A dictionary with the keys 'model' and 'serial_number'.
486        """
487        # Load the pymzml data
488        data = self.load()
489        instrument_info = data.info.get('referenceable_param_group_list_element')[0]
490        cv_params = instrument_info.findall('{http://psi.hupo.org/ms/mzml}cvParam')
491
492        # Extract details from each cvParam
493        params = []
494        for param in cv_params:
495            accession = param.get('accession')  # Get 'accession' attribute
496            name = param.get('name')           # Get 'name' attribute
497            value = param.get('value')         # Get 'value' attribute
498            params.append({
499                'accession': accession,
500                'name': name,
501                'value': value
502            })
503
504        # Loop through params and try to find the relevant information
505        instrument_dict = {
506            'model': 'Unknown',
507            'serial_number': 'Unknown'
508        }
509
510        # Assuming there are only two paramters here - one is for the serial number (agnostic to the model) and the other is for the model
511        # If there are more than two, we raise an error
512        if len(params) < 2:
513            raise ValueError("Not enough parameters found in the instrument info, cannot parse.")
514        if len(params) > 2:
515            raise ValueError("Too many parameters found in the instrument info, cannot parse.")
516        for param in params:
517            if param['accession'] == 'MS:1000529':
518                instrument_dict['serial_number'] = param['value']
519            else:
520                instrument_dict['model'] = data.OT[param['accession']]     
521
522        return instrument_dict
523
524    def get_creation_time(self) -> datetime.datetime:
525        """
526        Return the creation time of the mzML file.
527        """
528        data = self.load()
529        write_time = data.info.get('start_time')
530        if write_time:
531            # Convert the write time to a datetime object
532            return datetime.datetime.strptime(write_time, "%Y-%m-%dT%H:%M:%SZ")
533        else:
534            raise ValueError("Creation time is not available in the mzML file. "
535                           "Please ensure the file contains the 'start_time' information.")
class MZMLSpectraParser(corems.mass_spectra.input.parserbase.SpectraParserInterface):
 20class MZMLSpectraParser(SpectraParserInterface):
 21    """A class for parsing mzml spectrometry data files into MassSpectraBase or LCMSBase objects
 22
 23    Parameters
 24    ----------
 25    file_location : str or Path
 26        The path to the RAW file to be parsed.
 27    analyzer : str, optional
 28        The type of mass analyzer used in the instrument. Default is "Unknown".
 29    instrument_label : str, optional
 30        The name of the instrument used to acquire the data. Default is "Unknown".
 31    sample_name : str, optional
 32        The name of the sample being analyzed. If not provided, the stem of the file_location path will be used.
 33
 34    Attributes
 35    ----------
 36    file_location : Path
 37        The path to the RAW file being parsed.
 38    analyzer : str
 39        The type of mass analyzer used in the instrument.
 40    instrument_label : str
 41        The name of the instrument used to acquire the data.
 42    sample_name : str
 43        The name of the sample being analyzed.
 44
 45    Methods
 46    -------
 47    * load().
 48        Load mzML file using pymzml.run.Reader and return the data as a numpy array.
 49    * run(spectra=True).
 50        Parses the mzml file and returns a dictionary of mass spectra dataframes and a scan metadata dataframe.
 51    * get_mass_spectrum_from_scan(scan_number, polarity, auto_process=True)
 52        Parses the mzml file and returns a MassSpecBase object from a single scan.
 53    * get_mass_spectra_obj().
 54        Parses the mzml file and instantiates a MassSpectraBase object.
 55    * get_lcms_obj().
 56        Parses the mzml file and instantiates an LCMSBase object.
 57    * get_instrument_info().
 58        Return instrument information from the mzML file.
 59    * get_creation_time().
 60        Return the creation time of the mzML file as a datetime object.
 61
 62    Inherits from ThermoBaseClass and SpectraParserInterface
 63    """
 64
 65    def __init__(
 66        self,
 67        file_location,
 68        analyzer="Unknown",
 69        instrument_label="Unknown",
 70        sample_name=None,
 71    ):
 72        # implementation details
 73        if isinstance(file_location, str):
 74            # if obj is a string it defaults to create a Path obj, pass the S3Path if needed
 75            file_location = Path(file_location)
 76        if not file_location.exists():
 77            raise FileExistsError("File does not exist: " + str(file_location))
 78        self.file_location = file_location
 79        self.analyzer = analyzer
 80        self.instrument_label = instrument_label
 81
 82        if sample_name:
 83            self.sample_name = sample_name
 84        else:
 85            self.sample_name = file_location.stem
 86
 87    def load(self):
 88        """
 89        Load mzML file using pymzml.run.Reader and return the data as a numpy array.
 90
 91        Returns
 92        -------
 93        numpy.ndarray
 94            The mass spectra data as a numpy array.
 95        """
 96        data = pymzml.run.Reader(self.file_location)
 97        return data
 98
 99    def get_scan_df(self, data):
100        """
101        Return scan data as a pandas DataFrame.
102
103        Parameters
104        ----------
105        data : pymzml.run.Reader
106            The mass spectra data.
107
108        Returns
109        -------
110        pandas.DataFrame
111            A pandas DataFrame containing metadata for each scan, including scan number, MS level, polarity, and scan time.
112        """
113        # Scan dict
114        # instatinate scan dict, with empty lists of size of scans
115        n_scans = data.get_spectrum_count()
116        scan_dict = {
117            "scan": np.empty(n_scans, dtype=np.int32),
118            "scan_time": np.empty(n_scans, dtype=np.float32),
119            "ms_level": [None] * n_scans,
120            "polarity": [None] * n_scans,
121            "precursor_mz": [None] * n_scans,
122            "scan_text": [None] * n_scans,
123            "scan_window_lower": np.empty(n_scans, dtype=np.float32),
124            "scan_window_upper": np.empty(n_scans, dtype=np.float32),
125            "scan_precision": [None] * n_scans,
126            "tic": np.empty(n_scans, dtype=np.float32),
127            "ms_format": [None] * n_scans,
128        }
129
130        # First pass: loop through scans to get scan info
131        for i, spec in enumerate(data):
132            scan_dict["scan"][i] = spec.ID
133            scan_dict["ms_level"][i] = spec.ms_level
134            scan_dict["scan_precision"][i] = spec._measured_precision
135            scan_dict["tic"][i] = spec.TIC
136            if spec.selected_precursors:
137                scan_dict["precursor_mz"][i] = spec.selected_precursors[0].get(
138                    "mz", None
139                )
140            if spec["negative scan"] is not None:
141                scan_dict["polarity"][i] = "negative"
142            if spec["positive scan"] is not None:
143                scan_dict["polarity"][i] = "positive"
144            if spec["negative scan"] is not None and spec["positive scan"] is not None:
145                raise ValueError(
146                    "Error: scan {0} has both negative and positive polarity".format(
147                        spec.ID
148                    )
149                )
150
151            scan_dict["scan_time"][i] = spec.get("MS:1000016")
152            scan_dict["scan_text"][i] = spec.get("MS:1000512")
153            scan_dict["scan_window_lower"][i] = spec.get("MS:1000501")
154            scan_dict["scan_window_upper"][i] = spec.get("MS:1000500")
155            if spec.get("MS:1000128"):
156                scan_dict["ms_format"][i] = "profile"
157            elif spec.get("MS:1000127"):
158                scan_dict["ms_format"][i] = "centroid"
159            else:
160                scan_dict["ms_format"][i] = None
161
162        scan_df = pd.DataFrame(scan_dict)
163
164        return scan_df
165
166    def get_ms_raw(self, spectra, scan_df, data):
167        """Return a dictionary of mass spectra data as a pandas DataFrame.
168
169        Parameters
170        ----------
171        spectra : str
172            Which mass spectra data to include in the output.
173            Options: None, "ms1", "ms2", "all".
174        scan_df : pandas.DataFrame
175            Scan dataframe. Output from get_scan_df().
176        data : pymzml.run.Reader
177            The mass spectra data.
178
179        Returns
180        -------
181        dict
182            A dictionary containing the mass spectra data as pandas DataFrames, with keys corresponding to the MS level.
183
184        """
185        if spectra == "all":
186            scan_df_forspec = scan_df
187        elif spectra == "ms1":
188            scan_df_forspec = scan_df[scan_df.ms_level == 1]
189        elif spectra == "ms2":
190            scan_df_forspec = scan_df[scan_df.ms_level == 2]
191        else:
192            raise ValueError("spectra must be 'all', 'ms1', or 'ms2'")
193
194        # Result container
195        res = {}
196
197        # Row count container
198        counter = {}
199
200        # Column name container
201        cols = {}
202
203        # set at float32
204        dtype = np.float32
205
206        # First pass: get nrows
207        N = defaultdict(lambda: 0)
208        for i, spec in enumerate(data):
209            if i in scan_df_forspec.scan:
210                # Get ms level
211                level = "ms{}".format(spec.ms_level)
212
213                # Number of rows
214                N[level] += spec.mz.shape[0]
215
216        # Second pass: parse
217        for i, spec in enumerate(data):
218            if i in scan_df_forspec.scan:
219                # Number of rows
220                n = spec.mz.shape[0]
221
222                # No measurements
223                if n == 0:
224                    continue
225
226                # Dimension check
227                if len(spec.mz) != len(spec.i):
228                    # raise an error if the mz and intensity arrays are not the same length
229                    raise ValueError("m/z and intensity array dimension mismatch")
230
231                # Scan/frame info
232                id_dict = spec.id_dict
233
234                # Get ms level
235                level = "ms{}".format(spec.ms_level)
236
237                # Columns
238                cols[level] = list(id_dict.keys()) + ["mz", "intensity"]
239                m = len(cols[level])
240
241                # Subarray init
242                arr = np.empty((n, m), dtype=dtype)
243                inx = 0
244
245                # Populate scan/frame info
246                for k, v in id_dict.items():
247                    arr[:, inx] = v
248                    inx += 1
249
250                # Populate m/z
251                arr[:, inx] = spec.mz
252                inx += 1
253
254                # Populate intensity
255                arr[:, inx] = spec.i
256                inx += 1
257
258                # Initialize output container
259                if level not in res:
260                    res[level] = np.empty((N[level], m), dtype=dtype)
261                    counter[level] = 0
262
263                # Insert subarray
264                res[level][counter[level] : counter[level] + n, :] = arr
265                counter[level] += n
266
267        # Construct ms1 and ms2 mz dataframes
268        for level in res.keys():
269            res[level] = pd.DataFrame(res[level], columns=cols[level]).drop(
270                columns=["controllerType", "controllerNumber"],
271                axis=1,
272                inplace=False,
273            )
274
275        return res
276
277    def run(self, spectra="all", scan_df=None):
278        """Parse the mzML file and return a dictionary of spectra dataframes and a scan metadata dataframe.
279
280        Parameters
281        ----------
282        spectra : str, optional
283            Which mass spectra data to include in the output. Default is "all".
284            Other options: None, "ms1", "ms2".
285        scan_df : pandas.DataFrame, optional
286            Scan dataframe.  If not provided, the scan dataframe is created from the mzML file.
287
288        Returns
289        -------
290        tuple
291            A tuple containing two elements:
292            - A dictionary containing the mass spectra data as numpy arrays, with keys corresponding to the MS level.
293            - A pandas DataFrame containing metadata for each scan, including scan number, MS level, polarity, and scan time.
294        """
295
296        # Open file
297        data = self.load()
298
299        if scan_df is None:
300            scan_df = self.get_scan_df(data)
301
302        if spectra != "none":
303            res = self.get_ms_raw(spectra, scan_df, data)
304
305        else:
306            res = None
307
308        return res, scan_df
309
310    def get_mass_spectrum_from_scan(
311        self, scan_number, spectrum_mode, auto_process=True
312    ):
313        """Instatiate a mass spectrum object from the mzML file.
314
315        Parameters
316        ----------
317        scan_number : int
318            The scan number to be parsed.
319        spectrum_mode : str
320            The type of spectrum to instantiate.  Must be'profile' or 'centroid'.
321        polarity : int
322            The polarity of the scan.  Must be -1 or 1.
323        auto_process : bool, optional
324            If True, process the mass spectrum. Default is True.
325
326        Returns
327        -------
328        MassSpecProfile | MassSpecCentroid
329            The MassSpecProfile or MassSpecCentroid object containing the parsed mass spectrum.
330        """
331
332        def set_metadata(
333            scan_number: int,
334            polarity: int,
335            file_location: str,
336            label=Labels.thermo_profile,
337        ):
338            """
339            Set the output parameters for creating a MassSpecProfile or MassSpecCentroid object.
340
341            Parameters
342            ----------
343            scan_number : int
344                The scan number.
345            polarity : int
346                The polarity of the data.
347            file_location : str
348                The file location.
349            label : str, optional
350                The label for the mass spectrum. Default is Labels.thermo_profile.
351
352            Returns
353            -------
354            dict
355                The output parameters ready for creating a MassSpecProfile or MassSpecCentroid object.
356            """
357            d_params = default_parameters(file_location)
358            d_params["label"] = label
359            d_params["polarity"] = polarity
360            d_params["filename_path"] = file_location
361            d_params["scan_number"] = scan_number
362
363            return d_params
364
365        # Open file
366        data = self.load()
367
368        # Pluck out individual scan mz and intensity
369        spec = data[scan_number]
370
371        # Get polarity
372        if spec["negative scan"] is not None:
373            polarity = -1
374        elif spec["positive scan"] is not None:
375            polarity = 1
376
377        # Get mass spectrum
378        if spectrum_mode == "profile":
379            # Check if profile
380            if not spec.get("MS:1000128"):
381                raise ValueError("spectrum is not profile")
382            data_dict = {
383                Labels.mz: spec.mz,
384                Labels.abundance: spec.i,
385            }
386            d_params = set_metadata(
387                scan_number,
388                polarity,
389                self.file_location,
390                label=Labels.simulated_profile,
391            )
392            mass_spectrum_obj = mass_spectrum_obj = MassSpecProfile(
393                data_dict, d_params, auto_process=auto_process
394            )
395        elif spectrum_mode == "centroid":
396            # Check if centroided
397            if not spec.get("MS:1000127"):
398                raise ValueError("spectrum is not centroided")
399            data_dict = {
400                Labels.mz: spec.mz,
401                Labels.abundance: spec.i,
402                Labels.rp: [np.nan] * len(spec.mz),
403                Labels.s2n: [np.nan] * len(spec.i),
404            }
405            d_params = set_metadata(
406                scan_number, polarity, self.file_location, label=Labels.corems_centroid
407            )
408            mass_spectrum_obj = MassSpecCentroid(
409                data_dict, d_params, auto_process=auto_process
410            )
411
412        return mass_spectrum_obj
413
414    def get_mass_spectra_obj(self):
415        """Instatiate a MassSpectraBase object from the mzML file.
416
417
418        Returns
419        -------
420        MassSpectraBase
421            The MassSpectra object containing the parsed mass spectra.
422            The object is instatiated with the mzML file, analyzer, instrument, sample name, and scan dataframe.
423        """
424        _, scan_df = self.run(spectra=False)
425        mass_spectra_obj = MassSpectraBase(
426            self.file_location,
427            self.analyzer,
428            self.instrument_label,
429            self.sample_name,
430            self,
431        )
432        scan_df = scan_df.set_index("scan", drop=False)
433        mass_spectra_obj.scan_df = scan_df
434
435        return mass_spectra_obj
436
437    def get_lcms_obj(self, spectra="all"):
438        """Instatiates a LCMSBase object from the mzML file.
439
440        Parameters
441        ----------
442        spectra : str, optional
443            Which mass spectra data to include in the output. Default is all.  Other options: none, ms1, ms2.
444
445        Returns
446        -------
447        LCMSBase
448            LCMS object containing mass spectra data.
449            The object is instatiated with the mzML file, analyzer, instrument, sample name, scan dataframe,
450            and mz dataframe(s), as well as lists of scan numbers, retention times, and TICs.
451        """
452        _, scan_df = self.run(spectra="none")  # first run it to just get scan info
453        res, scan_df = self.run(
454            scan_df=scan_df, spectra=spectra
455        )  # second run to parse data
456        lcms_obj = LCMSBase(
457            self.file_location,
458            self.analyzer,
459            self.instrument_label,
460            self.sample_name,
461            self,
462        )
463        for key in res:
464            key_int = int(key.replace("ms", ""))
465            res[key] = res[key][res[key].intensity > 0]
466            res[key] = res[key].sort_values(by=["scan", "mz"]).reset_index(drop=True)
467            lcms_obj._ms_unprocessed[key_int] = res[key]
468        lcms_obj.scan_df = scan_df.set_index("scan", drop=False)
469        # Check if polarity is mixed
470        if len(set(scan_df.polarity)) > 1:
471            raise ValueError("Mixed polarities detected in scan data")
472        lcms_obj.polarity = scan_df.polarity[0]
473        lcms_obj._scans_number_list = list(scan_df.scan)
474        lcms_obj._retention_time_list = list(scan_df.scan_time)
475        lcms_obj._tic_list = list(scan_df.tic)
476
477        return lcms_obj
478
479    def get_instrument_info(self):
480        """
481        Return instrument information.
482
483        Returns
484        -------
485        dict
486            A dictionary with the keys 'model' and 'serial_number'.
487        """
488        # Load the pymzml data
489        data = self.load()
490        instrument_info = data.info.get('referenceable_param_group_list_element')[0]
491        cv_params = instrument_info.findall('{http://psi.hupo.org/ms/mzml}cvParam')
492
493        # Extract details from each cvParam
494        params = []
495        for param in cv_params:
496            accession = param.get('accession')  # Get 'accession' attribute
497            name = param.get('name')           # Get 'name' attribute
498            value = param.get('value')         # Get 'value' attribute
499            params.append({
500                'accession': accession,
501                'name': name,
502                'value': value
503            })
504
505        # Loop through params and try to find the relevant information
506        instrument_dict = {
507            'model': 'Unknown',
508            'serial_number': 'Unknown'
509        }
510
511        # Assuming there are only two paramters here - one is for the serial number (agnostic to the model) and the other is for the model
512        # If there are more than two, we raise an error
513        if len(params) < 2:
514            raise ValueError("Not enough parameters found in the instrument info, cannot parse.")
515        if len(params) > 2:
516            raise ValueError("Too many parameters found in the instrument info, cannot parse.")
517        for param in params:
518            if param['accession'] == 'MS:1000529':
519                instrument_dict['serial_number'] = param['value']
520            else:
521                instrument_dict['model'] = data.OT[param['accession']]     
522
523        return instrument_dict
524
525    def get_creation_time(self) -> datetime.datetime:
526        """
527        Return the creation time of the mzML file.
528        """
529        data = self.load()
530        write_time = data.info.get('start_time')
531        if write_time:
532            # Convert the write time to a datetime object
533            return datetime.datetime.strptime(write_time, "%Y-%m-%dT%H:%M:%SZ")
534        else:
535            raise ValueError("Creation time is not available in the mzML file. "
536                           "Please ensure the file contains the 'start_time' information.")

A class for parsing mzml spectrometry data files into MassSpectraBase or LCMSBase objects

Parameters
  • file_location (str or Path): The path to the RAW file to be parsed.
  • analyzer (str, optional): The type of mass analyzer used in the instrument. Default is "Unknown".
  • instrument_label (str, optional): The name of the instrument used to acquire the data. Default is "Unknown".
  • sample_name (str, optional): The name of the sample being analyzed. If not provided, the stem of the file_location path will be used.
Attributes
  • file_location (Path): The path to the RAW file being parsed.
  • analyzer (str): The type of mass analyzer used in the instrument.
  • instrument_label (str): The name of the instrument used to acquire the data.
  • sample_name (str): The name of the sample being analyzed.
Methods
  • load(). Load mzML file using pymzml.run.Reader and return the data as a numpy array.
  • run(spectra=True). Parses the mzml file and returns a dictionary of mass spectra dataframes and a scan metadata dataframe.
  • get_mass_spectrum_from_scan(scan_number, polarity, auto_process=True) Parses the mzml file and returns a MassSpecBase object from a single scan.
  • get_mass_spectra_obj(). Parses the mzml file and instantiates a MassSpectraBase object.
  • get_lcms_obj(). Parses the mzml file and instantiates an LCMSBase object.
  • get_instrument_info(). Return instrument information from the mzML file.
  • get_creation_time(). Return the creation time of the mzML file as a datetime object.

Inherits from ThermoBaseClass and SpectraParserInterface

MZMLSpectraParser( file_location, analyzer='Unknown', instrument_label='Unknown', sample_name=None)
65    def __init__(
66        self,
67        file_location,
68        analyzer="Unknown",
69        instrument_label="Unknown",
70        sample_name=None,
71    ):
72        # implementation details
73        if isinstance(file_location, str):
74            # if obj is a string it defaults to create a Path obj, pass the S3Path if needed
75            file_location = Path(file_location)
76        if not file_location.exists():
77            raise FileExistsError("File does not exist: " + str(file_location))
78        self.file_location = file_location
79        self.analyzer = analyzer
80        self.instrument_label = instrument_label
81
82        if sample_name:
83            self.sample_name = sample_name
84        else:
85            self.sample_name = file_location.stem
file_location
analyzer
instrument_label
def load(self):
87    def load(self):
88        """
89        Load mzML file using pymzml.run.Reader and return the data as a numpy array.
90
91        Returns
92        -------
93        numpy.ndarray
94            The mass spectra data as a numpy array.
95        """
96        data = pymzml.run.Reader(self.file_location)
97        return data

Load mzML file using pymzml.run.Reader and return the data as a numpy array.

Returns
  • numpy.ndarray: The mass spectra data as a numpy array.
def get_scan_df(self, data):
 99    def get_scan_df(self, data):
100        """
101        Return scan data as a pandas DataFrame.
102
103        Parameters
104        ----------
105        data : pymzml.run.Reader
106            The mass spectra data.
107
108        Returns
109        -------
110        pandas.DataFrame
111            A pandas DataFrame containing metadata for each scan, including scan number, MS level, polarity, and scan time.
112        """
113        # Scan dict
114        # instatinate scan dict, with empty lists of size of scans
115        n_scans = data.get_spectrum_count()
116        scan_dict = {
117            "scan": np.empty(n_scans, dtype=np.int32),
118            "scan_time": np.empty(n_scans, dtype=np.float32),
119            "ms_level": [None] * n_scans,
120            "polarity": [None] * n_scans,
121            "precursor_mz": [None] * n_scans,
122            "scan_text": [None] * n_scans,
123            "scan_window_lower": np.empty(n_scans, dtype=np.float32),
124            "scan_window_upper": np.empty(n_scans, dtype=np.float32),
125            "scan_precision": [None] * n_scans,
126            "tic": np.empty(n_scans, dtype=np.float32),
127            "ms_format": [None] * n_scans,
128        }
129
130        # First pass: loop through scans to get scan info
131        for i, spec in enumerate(data):
132            scan_dict["scan"][i] = spec.ID
133            scan_dict["ms_level"][i] = spec.ms_level
134            scan_dict["scan_precision"][i] = spec._measured_precision
135            scan_dict["tic"][i] = spec.TIC
136            if spec.selected_precursors:
137                scan_dict["precursor_mz"][i] = spec.selected_precursors[0].get(
138                    "mz", None
139                )
140            if spec["negative scan"] is not None:
141                scan_dict["polarity"][i] = "negative"
142            if spec["positive scan"] is not None:
143                scan_dict["polarity"][i] = "positive"
144            if spec["negative scan"] is not None and spec["positive scan"] is not None:
145                raise ValueError(
146                    "Error: scan {0} has both negative and positive polarity".format(
147                        spec.ID
148                    )
149                )
150
151            scan_dict["scan_time"][i] = spec.get("MS:1000016")
152            scan_dict["scan_text"][i] = spec.get("MS:1000512")
153            scan_dict["scan_window_lower"][i] = spec.get("MS:1000501")
154            scan_dict["scan_window_upper"][i] = spec.get("MS:1000500")
155            if spec.get("MS:1000128"):
156                scan_dict["ms_format"][i] = "profile"
157            elif spec.get("MS:1000127"):
158                scan_dict["ms_format"][i] = "centroid"
159            else:
160                scan_dict["ms_format"][i] = None
161
162        scan_df = pd.DataFrame(scan_dict)
163
164        return scan_df

Return scan data as a pandas DataFrame.

Parameters
  • data (pymzml.run.Reader): The mass spectra data.
Returns
  • pandas.DataFrame: A pandas DataFrame containing metadata for each scan, including scan number, MS level, polarity, and scan time.
def get_ms_raw(self, spectra, scan_df, data):
166    def get_ms_raw(self, spectra, scan_df, data):
167        """Return a dictionary of mass spectra data as a pandas DataFrame.
168
169        Parameters
170        ----------
171        spectra : str
172            Which mass spectra data to include in the output.
173            Options: None, "ms1", "ms2", "all".
174        scan_df : pandas.DataFrame
175            Scan dataframe. Output from get_scan_df().
176        data : pymzml.run.Reader
177            The mass spectra data.
178
179        Returns
180        -------
181        dict
182            A dictionary containing the mass spectra data as pandas DataFrames, with keys corresponding to the MS level.
183
184        """
185        if spectra == "all":
186            scan_df_forspec = scan_df
187        elif spectra == "ms1":
188            scan_df_forspec = scan_df[scan_df.ms_level == 1]
189        elif spectra == "ms2":
190            scan_df_forspec = scan_df[scan_df.ms_level == 2]
191        else:
192            raise ValueError("spectra must be 'all', 'ms1', or 'ms2'")
193
194        # Result container
195        res = {}
196
197        # Row count container
198        counter = {}
199
200        # Column name container
201        cols = {}
202
203        # set at float32
204        dtype = np.float32
205
206        # First pass: get nrows
207        N = defaultdict(lambda: 0)
208        for i, spec in enumerate(data):
209            if i in scan_df_forspec.scan:
210                # Get ms level
211                level = "ms{}".format(spec.ms_level)
212
213                # Number of rows
214                N[level] += spec.mz.shape[0]
215
216        # Second pass: parse
217        for i, spec in enumerate(data):
218            if i in scan_df_forspec.scan:
219                # Number of rows
220                n = spec.mz.shape[0]
221
222                # No measurements
223                if n == 0:
224                    continue
225
226                # Dimension check
227                if len(spec.mz) != len(spec.i):
228                    # raise an error if the mz and intensity arrays are not the same length
229                    raise ValueError("m/z and intensity array dimension mismatch")
230
231                # Scan/frame info
232                id_dict = spec.id_dict
233
234                # Get ms level
235                level = "ms{}".format(spec.ms_level)
236
237                # Columns
238                cols[level] = list(id_dict.keys()) + ["mz", "intensity"]
239                m = len(cols[level])
240
241                # Subarray init
242                arr = np.empty((n, m), dtype=dtype)
243                inx = 0
244
245                # Populate scan/frame info
246                for k, v in id_dict.items():
247                    arr[:, inx] = v
248                    inx += 1
249
250                # Populate m/z
251                arr[:, inx] = spec.mz
252                inx += 1
253
254                # Populate intensity
255                arr[:, inx] = spec.i
256                inx += 1
257
258                # Initialize output container
259                if level not in res:
260                    res[level] = np.empty((N[level], m), dtype=dtype)
261                    counter[level] = 0
262
263                # Insert subarray
264                res[level][counter[level] : counter[level] + n, :] = arr
265                counter[level] += n
266
267        # Construct ms1 and ms2 mz dataframes
268        for level in res.keys():
269            res[level] = pd.DataFrame(res[level], columns=cols[level]).drop(
270                columns=["controllerType", "controllerNumber"],
271                axis=1,
272                inplace=False,
273            )
274
275        return res

Return a dictionary of mass spectra data as a pandas DataFrame.

Parameters
  • spectra (str): Which mass spectra data to include in the output. Options: None, "ms1", "ms2", "all".
  • scan_df (pandas.DataFrame): Scan dataframe. Output from get_scan_df().
  • data (pymzml.run.Reader): The mass spectra data.
Returns
  • dict: A dictionary containing the mass spectra data as pandas DataFrames, with keys corresponding to the MS level.
def run(self, spectra='all', scan_df=None):
277    def run(self, spectra="all", scan_df=None):
278        """Parse the mzML file and return a dictionary of spectra dataframes and a scan metadata dataframe.
279
280        Parameters
281        ----------
282        spectra : str, optional
283            Which mass spectra data to include in the output. Default is "all".
284            Other options: None, "ms1", "ms2".
285        scan_df : pandas.DataFrame, optional
286            Scan dataframe.  If not provided, the scan dataframe is created from the mzML file.
287
288        Returns
289        -------
290        tuple
291            A tuple containing two elements:
292            - A dictionary containing the mass spectra data as numpy arrays, with keys corresponding to the MS level.
293            - A pandas DataFrame containing metadata for each scan, including scan number, MS level, polarity, and scan time.
294        """
295
296        # Open file
297        data = self.load()
298
299        if scan_df is None:
300            scan_df = self.get_scan_df(data)
301
302        if spectra != "none":
303            res = self.get_ms_raw(spectra, scan_df, data)
304
305        else:
306            res = None
307
308        return res, scan_df

Parse the mzML file and return a dictionary of spectra dataframes and a scan metadata dataframe.

Parameters
  • spectra (str, optional): Which mass spectra data to include in the output. Default is "all". Other options: None, "ms1", "ms2".
  • scan_df (pandas.DataFrame, optional): Scan dataframe. If not provided, the scan dataframe is created from the mzML file.
Returns
  • tuple: A tuple containing two elements:
    • A dictionary containing the mass spectra data as numpy arrays, with keys corresponding to the MS level.
    • A pandas DataFrame containing metadata for each scan, including scan number, MS level, polarity, and scan time.
def get_mass_spectrum_from_scan(self, scan_number, spectrum_mode, auto_process=True):
310    def get_mass_spectrum_from_scan(
311        self, scan_number, spectrum_mode, auto_process=True
312    ):
313        """Instatiate a mass spectrum object from the mzML file.
314
315        Parameters
316        ----------
317        scan_number : int
318            The scan number to be parsed.
319        spectrum_mode : str
320            The type of spectrum to instantiate.  Must be'profile' or 'centroid'.
321        polarity : int
322            The polarity of the scan.  Must be -1 or 1.
323        auto_process : bool, optional
324            If True, process the mass spectrum. Default is True.
325
326        Returns
327        -------
328        MassSpecProfile | MassSpecCentroid
329            The MassSpecProfile or MassSpecCentroid object containing the parsed mass spectrum.
330        """
331
332        def set_metadata(
333            scan_number: int,
334            polarity: int,
335            file_location: str,
336            label=Labels.thermo_profile,
337        ):
338            """
339            Set the output parameters for creating a MassSpecProfile or MassSpecCentroid object.
340
341            Parameters
342            ----------
343            scan_number : int
344                The scan number.
345            polarity : int
346                The polarity of the data.
347            file_location : str
348                The file location.
349            label : str, optional
350                The label for the mass spectrum. Default is Labels.thermo_profile.
351
352            Returns
353            -------
354            dict
355                The output parameters ready for creating a MassSpecProfile or MassSpecCentroid object.
356            """
357            d_params = default_parameters(file_location)
358            d_params["label"] = label
359            d_params["polarity"] = polarity
360            d_params["filename_path"] = file_location
361            d_params["scan_number"] = scan_number
362
363            return d_params
364
365        # Open file
366        data = self.load()
367
368        # Pluck out individual scan mz and intensity
369        spec = data[scan_number]
370
371        # Get polarity
372        if spec["negative scan"] is not None:
373            polarity = -1
374        elif spec["positive scan"] is not None:
375            polarity = 1
376
377        # Get mass spectrum
378        if spectrum_mode == "profile":
379            # Check if profile
380            if not spec.get("MS:1000128"):
381                raise ValueError("spectrum is not profile")
382            data_dict = {
383                Labels.mz: spec.mz,
384                Labels.abundance: spec.i,
385            }
386            d_params = set_metadata(
387                scan_number,
388                polarity,
389                self.file_location,
390                label=Labels.simulated_profile,
391            )
392            mass_spectrum_obj = mass_spectrum_obj = MassSpecProfile(
393                data_dict, d_params, auto_process=auto_process
394            )
395        elif spectrum_mode == "centroid":
396            # Check if centroided
397            if not spec.get("MS:1000127"):
398                raise ValueError("spectrum is not centroided")
399            data_dict = {
400                Labels.mz: spec.mz,
401                Labels.abundance: spec.i,
402                Labels.rp: [np.nan] * len(spec.mz),
403                Labels.s2n: [np.nan] * len(spec.i),
404            }
405            d_params = set_metadata(
406                scan_number, polarity, self.file_location, label=Labels.corems_centroid
407            )
408            mass_spectrum_obj = MassSpecCentroid(
409                data_dict, d_params, auto_process=auto_process
410            )
411
412        return mass_spectrum_obj

Instatiate a mass spectrum object from the mzML file.

Parameters
  • scan_number (int): The scan number to be parsed.
  • spectrum_mode (str): The type of spectrum to instantiate. Must be'profile' or 'centroid'.
  • polarity (int): The polarity of the scan. Must be -1 or 1.
  • auto_process (bool, optional): If True, process the mass spectrum. Default is True.
Returns
  • MassSpecProfile | MassSpecCentroid: The MassSpecProfile or MassSpecCentroid object containing the parsed mass spectrum.
def get_mass_spectra_obj(self):
414    def get_mass_spectra_obj(self):
415        """Instatiate a MassSpectraBase object from the mzML file.
416
417
418        Returns
419        -------
420        MassSpectraBase
421            The MassSpectra object containing the parsed mass spectra.
422            The object is instatiated with the mzML file, analyzer, instrument, sample name, and scan dataframe.
423        """
424        _, scan_df = self.run(spectra=False)
425        mass_spectra_obj = MassSpectraBase(
426            self.file_location,
427            self.analyzer,
428            self.instrument_label,
429            self.sample_name,
430            self,
431        )
432        scan_df = scan_df.set_index("scan", drop=False)
433        mass_spectra_obj.scan_df = scan_df
434
435        return mass_spectra_obj

Instatiate a MassSpectraBase object from the mzML file.

Returns
  • MassSpectraBase: The MassSpectra object containing the parsed mass spectra. The object is instatiated with the mzML file, analyzer, instrument, sample name, and scan dataframe.
def get_lcms_obj(self, spectra='all'):
437    def get_lcms_obj(self, spectra="all"):
438        """Instatiates a LCMSBase object from the mzML file.
439
440        Parameters
441        ----------
442        spectra : str, optional
443            Which mass spectra data to include in the output. Default is all.  Other options: none, ms1, ms2.
444
445        Returns
446        -------
447        LCMSBase
448            LCMS object containing mass spectra data.
449            The object is instatiated with the mzML file, analyzer, instrument, sample name, scan dataframe,
450            and mz dataframe(s), as well as lists of scan numbers, retention times, and TICs.
451        """
452        _, scan_df = self.run(spectra="none")  # first run it to just get scan info
453        res, scan_df = self.run(
454            scan_df=scan_df, spectra=spectra
455        )  # second run to parse data
456        lcms_obj = LCMSBase(
457            self.file_location,
458            self.analyzer,
459            self.instrument_label,
460            self.sample_name,
461            self,
462        )
463        for key in res:
464            key_int = int(key.replace("ms", ""))
465            res[key] = res[key][res[key].intensity > 0]
466            res[key] = res[key].sort_values(by=["scan", "mz"]).reset_index(drop=True)
467            lcms_obj._ms_unprocessed[key_int] = res[key]
468        lcms_obj.scan_df = scan_df.set_index("scan", drop=False)
469        # Check if polarity is mixed
470        if len(set(scan_df.polarity)) > 1:
471            raise ValueError("Mixed polarities detected in scan data")
472        lcms_obj.polarity = scan_df.polarity[0]
473        lcms_obj._scans_number_list = list(scan_df.scan)
474        lcms_obj._retention_time_list = list(scan_df.scan_time)
475        lcms_obj._tic_list = list(scan_df.tic)
476
477        return lcms_obj

Instatiates a LCMSBase object from the mzML file.

Parameters
  • spectra (str, optional): Which mass spectra data to include in the output. Default is all. Other options: none, ms1, ms2.
Returns
  • LCMSBase: LCMS object containing mass spectra data. The object is instatiated with the mzML file, analyzer, instrument, sample name, scan dataframe, and mz dataframe(s), as well as lists of scan numbers, retention times, and TICs.
def get_instrument_info(self):
479    def get_instrument_info(self):
480        """
481        Return instrument information.
482
483        Returns
484        -------
485        dict
486            A dictionary with the keys 'model' and 'serial_number'.
487        """
488        # Load the pymzml data
489        data = self.load()
490        instrument_info = data.info.get('referenceable_param_group_list_element')[0]
491        cv_params = instrument_info.findall('{http://psi.hupo.org/ms/mzml}cvParam')
492
493        # Extract details from each cvParam
494        params = []
495        for param in cv_params:
496            accession = param.get('accession')  # Get 'accession' attribute
497            name = param.get('name')           # Get 'name' attribute
498            value = param.get('value')         # Get 'value' attribute
499            params.append({
500                'accession': accession,
501                'name': name,
502                'value': value
503            })
504
505        # Loop through params and try to find the relevant information
506        instrument_dict = {
507            'model': 'Unknown',
508            'serial_number': 'Unknown'
509        }
510
511        # Assuming there are only two paramters here - one is for the serial number (agnostic to the model) and the other is for the model
512        # If there are more than two, we raise an error
513        if len(params) < 2:
514            raise ValueError("Not enough parameters found in the instrument info, cannot parse.")
515        if len(params) > 2:
516            raise ValueError("Too many parameters found in the instrument info, cannot parse.")
517        for param in params:
518            if param['accession'] == 'MS:1000529':
519                instrument_dict['serial_number'] = param['value']
520            else:
521                instrument_dict['model'] = data.OT[param['accession']]     
522
523        return instrument_dict

Return instrument information.

Returns
  • dict: A dictionary with the keys 'model' and 'serial_number'.
def get_creation_time(self) -> datetime.datetime:
525    def get_creation_time(self) -> datetime.datetime:
526        """
527        Return the creation time of the mzML file.
528        """
529        data = self.load()
530        write_time = data.info.get('start_time')
531        if write_time:
532            # Convert the write time to a datetime object
533            return datetime.datetime.strptime(write_time, "%Y-%m-%dT%H:%M:%SZ")
534        else:
535            raise ValueError("Creation time is not available in the mzML file. "
536                           "Please ensure the file contains the 'start_time' information.")

Return the creation time of the mzML file.