corems.mass_spectra.input.mzml

  1from collections import defaultdict
  2from pathlib import Path
  3
  4import numpy as np
  5import pandas as pd
  6import pymzml
  7
  8from corems.encapsulation.constant import Labels
  9from corems.encapsulation.factory.parameters import default_parameters
 10from corems.mass_spectra.factory.lc_class import LCMSBase, MassSpectraBase
 11from corems.mass_spectra.input.parserbase import SpectraParserInterface
 12from corems.mass_spectrum.factory.MassSpectrumClasses import (
 13    MassSpecCentroid,
 14    MassSpecProfile,
 15)
 16
 17
 18class MZMLSpectraParser(SpectraParserInterface):
 19    """A class for parsing mzml spectrometry data files into MassSpectraBase or LCMSBase objects
 20
 21    Parameters
 22    ----------
 23    file_location : str or Path
 24        The path to the RAW file to be parsed.
 25    analyzer : str, optional
 26        The type of mass analyzer used in the instrument. Default is "Unknown".
 27    instrument_label : str, optional
 28        The name of the instrument used to acquire the data. Default is "Unknown".
 29    sample_name : str, optional
 30        The name of the sample being analyzed. If not provided, the stem of the file_location path will be used.
 31
 32    Attributes
 33    ----------
 34    file_location : Path
 35        The path to the RAW file being parsed.
 36    analyzer : str
 37        The type of mass analyzer used in the instrument.
 38    instrument_label : str
 39        The name of the instrument used to acquire the data.
 40    sample_name : str
 41        The name of the sample being analyzed.
 42
 43    Methods
 44    -------
 45    * load().
 46        Load mzML file using pymzml.run.Reader and return the data as a numpy array.
 47    * run(spectra=True).
 48        Parses the mzml file and returns a dictionary of mass spectra dataframes and a scan metadata dataframe.
 49    * get_mass_spectrum_from_scan(scan_number, polarity, auto_process=True)
 50        Parses the mzml file and returns a MassSpecBase object from a single scan.
 51    * get_mass_spectra_obj().
 52        Parses the mzml file and instantiates a MassSpectraBase object.
 53    * get_lcms_obj().
 54        Parses the mzml file and instantiates an LCMSBase object.
 55
 56    Inherits from ThermoBaseClass and SpectraParserInterface
 57    """
 58
 59    def __init__(
 60        self,
 61        file_location,
 62        analyzer="Unknown",
 63        instrument_label="Unknown",
 64        sample_name=None,
 65    ):
 66        # implementation details
 67        if isinstance(file_location, str):
 68            # if obj is a string it defaults to create a Path obj, pass the S3Path if needed
 69            file_location = Path(file_location)
 70        if not file_location.exists():
 71            raise FileExistsError("File does not exist: " + str(file_location))
 72        self.file_location = file_location
 73        self.analyzer = analyzer
 74        self.instrument_label = instrument_label
 75
 76        if sample_name:
 77            self.sample_name = sample_name
 78        else:
 79            self.sample_name = file_location.stem
 80
 81    def load(self):
 82        """
 83        Load mzML file using pymzml.run.Reader and return the data as a numpy array.
 84
 85        Returns
 86        -------
 87        numpy.ndarray
 88            The mass spectra data as a numpy array.
 89        """
 90        data = pymzml.run.Reader(self.file_location)
 91        return data
 92
 93    def get_scan_df(self, data):
 94        """
 95        Return scan data as a pandas DataFrame.
 96
 97        Parameters
 98        ----------
 99        data : pymzml.run.Reader
100            The mass spectra data.
101
102        Returns
103        -------
104        pandas.DataFrame
105            A pandas DataFrame containing metadata for each scan, including scan number, MS level, polarity, and scan time.
106        """
107        # Scan dict
108        # instatinate scan dict, with empty lists of size of scans
109        n_scans = data.get_spectrum_count()
110        scan_dict = {
111            "scan": np.empty(n_scans, dtype=np.int32),
112            "scan_time": np.empty(n_scans, dtype=np.float32),
113            "ms_level": [None] * n_scans,
114            "polarity": [None] * n_scans,
115            "precursor_mz": [None] * n_scans,
116            "scan_text": [None] * n_scans,
117            "scan_window_lower": np.empty(n_scans, dtype=np.float32),
118            "scan_window_upper": np.empty(n_scans, dtype=np.float32),
119            "scan_precision": [None] * n_scans,
120            "tic": np.empty(n_scans, dtype=np.float32),
121            "ms_format": [None] * n_scans,
122        }
123
124        # First pass: loop through scans to get scan info
125        for i, spec in enumerate(data):
126            scan_dict["scan"][i] = spec.ID
127            scan_dict["ms_level"][i] = spec.ms_level
128            scan_dict["scan_precision"][i] = spec._measured_precision
129            scan_dict["tic"][i] = spec.TIC
130            if spec.selected_precursors:
131                scan_dict["precursor_mz"][i] = spec.selected_precursors[0].get(
132                    "mz", None
133                )
134            if spec["negative scan"] is not None:
135                scan_dict["polarity"][i] = "negative"
136            if spec["positive scan"] is not None:
137                scan_dict["polarity"][i] = "positive"
138            if spec["negative scan"] is not None and spec["positive scan"] is not None:
139                raise ValueError(
140                    "Error: scan {0} has both negative and positive polarity".format(
141                        spec.ID
142                    )
143                )
144
145            scan_dict["scan_time"][i] = spec.get("MS:1000016")
146            scan_dict["scan_text"][i] = spec.get("MS:1000512")
147            scan_dict["scan_window_lower"][i] = spec.get("MS:1000501")
148            scan_dict["scan_window_upper"][i] = spec.get("MS:1000500")
149            if spec.get("MS:1000128"):
150                scan_dict["ms_format"][i] = "profile"
151            elif spec.get("MS:1000127"):
152                scan_dict["ms_format"][i] = "centroid"
153            else:
154                scan_dict["ms_format"][i] = None
155
156        scan_df = pd.DataFrame(scan_dict)
157
158        return scan_df
159
160    def get_ms_raw(self, spectra, scan_df, data):
161        """Return a dictionary of mass spectra data as a pandas DataFrame.
162
163        Parameters
164        ----------
165        spectra : str
166            Which mass spectra data to include in the output.
167            Options: None, "ms1", "ms2", "all".
168        scan_df : pandas.DataFrame
169            Scan dataframe. Output from get_scan_df().
170        data : pymzml.run.Reader
171            The mass spectra data.
172
173        Returns
174        -------
175        dict
176            A dictionary containing the mass spectra data as pandas DataFrames, with keys corresponding to the MS level.
177
178        """
179        if spectra == "all":
180            scan_df_forspec = scan_df
181        elif spectra == "ms1":
182            scan_df_forspec = scan_df[scan_df.ms_level == 1]
183        elif spectra == "ms2":
184            scan_df_forspec = scan_df[scan_df.ms_level == 2]
185        else:
186            raise ValueError("spectra must be 'all', 'ms1', or 'ms2'")
187
188        # Result container
189        res = {}
190
191        # Row count container
192        counter = {}
193
194        # Column name container
195        cols = {}
196
197        # set at float32
198        dtype = np.float32
199
200        # First pass: get nrows
201        N = defaultdict(lambda: 0)
202        for i, spec in enumerate(data):
203            if i in scan_df_forspec.scan:
204                # Get ms level
205                level = "ms{}".format(spec.ms_level)
206
207                # Number of rows
208                N[level] += spec.mz.shape[0]
209
210        # Second pass: parse
211        for i, spec in enumerate(data):
212            if i in scan_df_forspec.scan:
213                # Number of rows
214                n = spec.mz.shape[0]
215
216                # No measurements
217                if n == 0:
218                    continue
219
220                # Dimension check
221                if len(spec.mz) != len(spec.i):
222                    # raise an error if the mz and intensity arrays are not the same length
223                    raise ValueError("m/z and intensity array dimension mismatch")
224
225                # Scan/frame info
226                id_dict = spec.id_dict
227
228                # Get ms level
229                level = "ms{}".format(spec.ms_level)
230
231                # Columns
232                cols[level] = list(id_dict.keys()) + ["mz", "intensity"]
233                m = len(cols[level])
234
235                # Subarray init
236                arr = np.empty((n, m), dtype=dtype)
237                inx = 0
238
239                # Populate scan/frame info
240                for k, v in id_dict.items():
241                    arr[:, inx] = v
242                    inx += 1
243
244                # Populate m/z
245                arr[:, inx] = spec.mz
246                inx += 1
247
248                # Populate intensity
249                arr[:, inx] = spec.i
250                inx += 1
251
252                # Initialize output container
253                if level not in res:
254                    res[level] = np.empty((N[level], m), dtype=dtype)
255                    counter[level] = 0
256
257                # Insert subarray
258                res[level][counter[level] : counter[level] + n, :] = arr
259                counter[level] += n
260
261        # Construct ms1 and ms2 mz dataframes
262        for level in res.keys():
263            res[level] = pd.DataFrame(res[level], columns=cols[level]).drop(
264                columns=["controllerType", "controllerNumber"],
265                axis=1,
266                inplace=False,
267            )
268
269        return res
270
271    def run(self, spectra="all", scan_df=None):
272        """Parse the mzML file and return a dictionary of spectra dataframes and a scan metadata dataframe.
273
274        Parameters
275        ----------
276        spectra : str, optional
277            Which mass spectra data to include in the output. Default is "all".
278            Other options: None, "ms1", "ms2".
279        scan_df : pandas.DataFrame, optional
280            Scan dataframe.  If not provided, the scan dataframe is created from the mzML file.
281
282        Returns
283        -------
284        tuple
285            A tuple containing two elements:
286            - A dictionary containing the mass spectra data as numpy arrays, with keys corresponding to the MS level.
287            - A pandas DataFrame containing metadata for each scan, including scan number, MS level, polarity, and scan time.
288        """
289
290        # Open file
291        data = self.load()
292
293        if scan_df is None:
294            scan_df = self.get_scan_df(data)
295
296        if spectra != "none":
297            res = self.get_ms_raw(spectra, scan_df, data)
298
299        else:
300            res = None
301
302        return res, scan_df
303
304    def get_mass_spectrum_from_scan(
305        self, scan_number, spectrum_mode, auto_process=True
306    ):
307        """Instatiate a mass spectrum object from the mzML file.
308
309        Parameters
310        ----------
311        scan_number : int
312            The scan number to be parsed.
313        spectrum_mode : str
314            The type of spectrum to instantiate.  Must be'profile' or 'centroid'.
315        polarity : int
316            The polarity of the scan.  Must be -1 or 1.
317        auto_process : bool, optional
318            If True, process the mass spectrum. Default is True.
319
320        Returns
321        -------
322        MassSpecProfile | MassSpecCentroid
323            The MassSpecProfile or MassSpecCentroid object containing the parsed mass spectrum.
324        """
325
326        def set_metadata(
327            scan_number: int,
328            polarity: int,
329            file_location: str,
330            label=Labels.thermo_profile,
331        ):
332            """
333            Set the output parameters for creating a MassSpecProfile or MassSpecCentroid object.
334
335            Parameters
336            ----------
337            scan_number : int
338                The scan number.
339            polarity : int
340                The polarity of the data.
341            file_location : str
342                The file location.
343            label : str, optional
344                The label for the mass spectrum. Default is Labels.thermo_profile.
345
346            Returns
347            -------
348            dict
349                The output parameters ready for creating a MassSpecProfile or MassSpecCentroid object.
350            """
351            d_params = default_parameters(file_location)
352            d_params["label"] = label
353            d_params["polarity"] = polarity
354            d_params["filename_path"] = file_location
355            d_params["scan_number"] = scan_number
356
357            return d_params
358
359        # Open file
360        data = self.load()
361
362        # Pluck out individual scan mz and intensity
363        spec = data[scan_number]
364
365        # Get polarity
366        if spec["negative scan"] is not None:
367            polarity = -1
368        elif spec["positive scan"] is not None:
369            polarity = 1
370
371        # Get mass spectrum
372        if spectrum_mode == "profile":
373            # Check if profile
374            if not spec.get("MS:1000128"):
375                raise ValueError("spectrum is not profile")
376            data_dict = {
377                Labels.mz: spec.mz,
378                Labels.abundance: spec.i,
379            }
380            d_params = set_metadata(
381                scan_number,
382                polarity,
383                self.file_location,
384                label=Labels.simulated_profile,
385            )
386            mass_spectrum_obj = mass_spectrum_obj = MassSpecProfile(
387                data_dict, d_params, auto_process=auto_process
388            )
389        elif spectrum_mode == "centroid":
390            # Check if centroided
391            if not spec.get("MS:1000127"):
392                raise ValueError("spectrum is not centroided")
393            data_dict = {
394                Labels.mz: spec.mz,
395                Labels.abundance: spec.i,
396                Labels.rp: [np.nan] * len(spec.mz),
397                Labels.s2n: [np.nan] * len(spec.i),
398            }
399            d_params = set_metadata(
400                scan_number, polarity, self.file_location, label=Labels.corems_centroid
401            )
402            mass_spectrum_obj = MassSpecCentroid(
403                data_dict, d_params, auto_process=auto_process
404            )
405
406        return mass_spectrum_obj
407
408    def get_mass_spectra_obj(self):
409        """Instatiate a MassSpectraBase object from the mzML file.
410
411
412        Returns
413        -------
414        MassSpectraBase
415            The MassSpectra object containing the parsed mass spectra.
416            The object is instatiated with the mzML file, analyzer, instrument, sample name, and scan dataframe.
417        """
418        _, scan_df = self.run(spectra=False)
419        mass_spectra_obj = MassSpectraBase(
420            self.file_location,
421            self.analyzer,
422            self.instrument_label,
423            self.sample_name,
424            self,
425        )
426        scan_df = scan_df.set_index("scan", drop=False)
427        mass_spectra_obj.scan_df = scan_df
428
429        return mass_spectra_obj
430
431    def get_lcms_obj(self, spectra="all"):
432        """Instatiates a LCMSBase object from the mzML file.
433
434        Parameters
435        ----------
436        spectra : str, optional
437            Which mass spectra data to include in the output. Default is all.  Other options: none, ms1, ms2.
438
439        Returns
440        -------
441        LCMSBase
442            LCMS object containing mass spectra data.
443            The object is instatiated with the mzML file, analyzer, instrument, sample name, scan dataframe,
444            and mz dataframe(s), as well as lists of scan numbers, retention times, and TICs.
445        """
446        _, scan_df = self.run(spectra="none")  # first run it to just get scan info
447        res, scan_df = self.run(
448            scan_df=scan_df, spectra=spectra
449        )  # second run to parse data
450        lcms_obj = LCMSBase(
451            self.file_location,
452            self.analyzer,
453            self.instrument_label,
454            self.sample_name,
455            self,
456        )
457        for key in res:
458            key_int = int(key.replace("ms", ""))
459            res[key] = res[key][res[key].intensity > 0]
460            res[key] = res[key].sort_values(by=["scan", "mz"]).reset_index(drop=True)
461            lcms_obj._ms_unprocessed[key_int] = res[key]
462        lcms_obj.scan_df = scan_df.set_index("scan", drop=False)
463        # Check if polarity is mixed
464        if len(set(scan_df.polarity)) > 1:
465            raise ValueError("Mixed polarities detected in scan data")
466        lcms_obj.polarity = scan_df.polarity[0]
467        lcms_obj._scans_number_list = list(scan_df.scan)
468        lcms_obj._retention_time_list = list(scan_df.scan_time)
469        lcms_obj._tic_list = list(scan_df.tic)
470
471        return lcms_obj
class MZMLSpectraParser(corems.mass_spectra.input.parserbase.SpectraParserInterface):
 19class MZMLSpectraParser(SpectraParserInterface):
 20    """A class for parsing mzml spectrometry data files into MassSpectraBase or LCMSBase objects
 21
 22    Parameters
 23    ----------
 24    file_location : str or Path
 25        The path to the RAW file to be parsed.
 26    analyzer : str, optional
 27        The type of mass analyzer used in the instrument. Default is "Unknown".
 28    instrument_label : str, optional
 29        The name of the instrument used to acquire the data. Default is "Unknown".
 30    sample_name : str, optional
 31        The name of the sample being analyzed. If not provided, the stem of the file_location path will be used.
 32
 33    Attributes
 34    ----------
 35    file_location : Path
 36        The path to the RAW file being parsed.
 37    analyzer : str
 38        The type of mass analyzer used in the instrument.
 39    instrument_label : str
 40        The name of the instrument used to acquire the data.
 41    sample_name : str
 42        The name of the sample being analyzed.
 43
 44    Methods
 45    -------
 46    * load().
 47        Load mzML file using pymzml.run.Reader and return the data as a numpy array.
 48    * run(spectra=True).
 49        Parses the mzml file and returns a dictionary of mass spectra dataframes and a scan metadata dataframe.
 50    * get_mass_spectrum_from_scan(scan_number, polarity, auto_process=True)
 51        Parses the mzml file and returns a MassSpecBase object from a single scan.
 52    * get_mass_spectra_obj().
 53        Parses the mzml file and instantiates a MassSpectraBase object.
 54    * get_lcms_obj().
 55        Parses the mzml file and instantiates an LCMSBase object.
 56
 57    Inherits from ThermoBaseClass and SpectraParserInterface
 58    """
 59
 60    def __init__(
 61        self,
 62        file_location,
 63        analyzer="Unknown",
 64        instrument_label="Unknown",
 65        sample_name=None,
 66    ):
 67        # implementation details
 68        if isinstance(file_location, str):
 69            # if obj is a string it defaults to create a Path obj, pass the S3Path if needed
 70            file_location = Path(file_location)
 71        if not file_location.exists():
 72            raise FileExistsError("File does not exist: " + str(file_location))
 73        self.file_location = file_location
 74        self.analyzer = analyzer
 75        self.instrument_label = instrument_label
 76
 77        if sample_name:
 78            self.sample_name = sample_name
 79        else:
 80            self.sample_name = file_location.stem
 81
 82    def load(self):
 83        """
 84        Load mzML file using pymzml.run.Reader and return the data as a numpy array.
 85
 86        Returns
 87        -------
 88        numpy.ndarray
 89            The mass spectra data as a numpy array.
 90        """
 91        data = pymzml.run.Reader(self.file_location)
 92        return data
 93
 94    def get_scan_df(self, data):
 95        """
 96        Return scan data as a pandas DataFrame.
 97
 98        Parameters
 99        ----------
100        data : pymzml.run.Reader
101            The mass spectra data.
102
103        Returns
104        -------
105        pandas.DataFrame
106            A pandas DataFrame containing metadata for each scan, including scan number, MS level, polarity, and scan time.
107        """
108        # Scan dict
109        # instatinate scan dict, with empty lists of size of scans
110        n_scans = data.get_spectrum_count()
111        scan_dict = {
112            "scan": np.empty(n_scans, dtype=np.int32),
113            "scan_time": np.empty(n_scans, dtype=np.float32),
114            "ms_level": [None] * n_scans,
115            "polarity": [None] * n_scans,
116            "precursor_mz": [None] * n_scans,
117            "scan_text": [None] * n_scans,
118            "scan_window_lower": np.empty(n_scans, dtype=np.float32),
119            "scan_window_upper": np.empty(n_scans, dtype=np.float32),
120            "scan_precision": [None] * n_scans,
121            "tic": np.empty(n_scans, dtype=np.float32),
122            "ms_format": [None] * n_scans,
123        }
124
125        # First pass: loop through scans to get scan info
126        for i, spec in enumerate(data):
127            scan_dict["scan"][i] = spec.ID
128            scan_dict["ms_level"][i] = spec.ms_level
129            scan_dict["scan_precision"][i] = spec._measured_precision
130            scan_dict["tic"][i] = spec.TIC
131            if spec.selected_precursors:
132                scan_dict["precursor_mz"][i] = spec.selected_precursors[0].get(
133                    "mz", None
134                )
135            if spec["negative scan"] is not None:
136                scan_dict["polarity"][i] = "negative"
137            if spec["positive scan"] is not None:
138                scan_dict["polarity"][i] = "positive"
139            if spec["negative scan"] is not None and spec["positive scan"] is not None:
140                raise ValueError(
141                    "Error: scan {0} has both negative and positive polarity".format(
142                        spec.ID
143                    )
144                )
145
146            scan_dict["scan_time"][i] = spec.get("MS:1000016")
147            scan_dict["scan_text"][i] = spec.get("MS:1000512")
148            scan_dict["scan_window_lower"][i] = spec.get("MS:1000501")
149            scan_dict["scan_window_upper"][i] = spec.get("MS:1000500")
150            if spec.get("MS:1000128"):
151                scan_dict["ms_format"][i] = "profile"
152            elif spec.get("MS:1000127"):
153                scan_dict["ms_format"][i] = "centroid"
154            else:
155                scan_dict["ms_format"][i] = None
156
157        scan_df = pd.DataFrame(scan_dict)
158
159        return scan_df
160
161    def get_ms_raw(self, spectra, scan_df, data):
162        """Return a dictionary of mass spectra data as a pandas DataFrame.
163
164        Parameters
165        ----------
166        spectra : str
167            Which mass spectra data to include in the output.
168            Options: None, "ms1", "ms2", "all".
169        scan_df : pandas.DataFrame
170            Scan dataframe. Output from get_scan_df().
171        data : pymzml.run.Reader
172            The mass spectra data.
173
174        Returns
175        -------
176        dict
177            A dictionary containing the mass spectra data as pandas DataFrames, with keys corresponding to the MS level.
178
179        """
180        if spectra == "all":
181            scan_df_forspec = scan_df
182        elif spectra == "ms1":
183            scan_df_forspec = scan_df[scan_df.ms_level == 1]
184        elif spectra == "ms2":
185            scan_df_forspec = scan_df[scan_df.ms_level == 2]
186        else:
187            raise ValueError("spectra must be 'all', 'ms1', or 'ms2'")
188
189        # Result container
190        res = {}
191
192        # Row count container
193        counter = {}
194
195        # Column name container
196        cols = {}
197
198        # set at float32
199        dtype = np.float32
200
201        # First pass: get nrows
202        N = defaultdict(lambda: 0)
203        for i, spec in enumerate(data):
204            if i in scan_df_forspec.scan:
205                # Get ms level
206                level = "ms{}".format(spec.ms_level)
207
208                # Number of rows
209                N[level] += spec.mz.shape[0]
210
211        # Second pass: parse
212        for i, spec in enumerate(data):
213            if i in scan_df_forspec.scan:
214                # Number of rows
215                n = spec.mz.shape[0]
216
217                # No measurements
218                if n == 0:
219                    continue
220
221                # Dimension check
222                if len(spec.mz) != len(spec.i):
223                    # raise an error if the mz and intensity arrays are not the same length
224                    raise ValueError("m/z and intensity array dimension mismatch")
225
226                # Scan/frame info
227                id_dict = spec.id_dict
228
229                # Get ms level
230                level = "ms{}".format(spec.ms_level)
231
232                # Columns
233                cols[level] = list(id_dict.keys()) + ["mz", "intensity"]
234                m = len(cols[level])
235
236                # Subarray init
237                arr = np.empty((n, m), dtype=dtype)
238                inx = 0
239
240                # Populate scan/frame info
241                for k, v in id_dict.items():
242                    arr[:, inx] = v
243                    inx += 1
244
245                # Populate m/z
246                arr[:, inx] = spec.mz
247                inx += 1
248
249                # Populate intensity
250                arr[:, inx] = spec.i
251                inx += 1
252
253                # Initialize output container
254                if level not in res:
255                    res[level] = np.empty((N[level], m), dtype=dtype)
256                    counter[level] = 0
257
258                # Insert subarray
259                res[level][counter[level] : counter[level] + n, :] = arr
260                counter[level] += n
261
262        # Construct ms1 and ms2 mz dataframes
263        for level in res.keys():
264            res[level] = pd.DataFrame(res[level], columns=cols[level]).drop(
265                columns=["controllerType", "controllerNumber"],
266                axis=1,
267                inplace=False,
268            )
269
270        return res
271
272    def run(self, spectra="all", scan_df=None):
273        """Parse the mzML file and return a dictionary of spectra dataframes and a scan metadata dataframe.
274
275        Parameters
276        ----------
277        spectra : str, optional
278            Which mass spectra data to include in the output. Default is "all".
279            Other options: None, "ms1", "ms2".
280        scan_df : pandas.DataFrame, optional
281            Scan dataframe.  If not provided, the scan dataframe is created from the mzML file.
282
283        Returns
284        -------
285        tuple
286            A tuple containing two elements:
287            - A dictionary containing the mass spectra data as numpy arrays, with keys corresponding to the MS level.
288            - A pandas DataFrame containing metadata for each scan, including scan number, MS level, polarity, and scan time.
289        """
290
291        # Open file
292        data = self.load()
293
294        if scan_df is None:
295            scan_df = self.get_scan_df(data)
296
297        if spectra != "none":
298            res = self.get_ms_raw(spectra, scan_df, data)
299
300        else:
301            res = None
302
303        return res, scan_df
304
305    def get_mass_spectrum_from_scan(
306        self, scan_number, spectrum_mode, auto_process=True
307    ):
308        """Instatiate a mass spectrum object from the mzML file.
309
310        Parameters
311        ----------
312        scan_number : int
313            The scan number to be parsed.
314        spectrum_mode : str
315            The type of spectrum to instantiate.  Must be'profile' or 'centroid'.
316        polarity : int
317            The polarity of the scan.  Must be -1 or 1.
318        auto_process : bool, optional
319            If True, process the mass spectrum. Default is True.
320
321        Returns
322        -------
323        MassSpecProfile | MassSpecCentroid
324            The MassSpecProfile or MassSpecCentroid object containing the parsed mass spectrum.
325        """
326
327        def set_metadata(
328            scan_number: int,
329            polarity: int,
330            file_location: str,
331            label=Labels.thermo_profile,
332        ):
333            """
334            Set the output parameters for creating a MassSpecProfile or MassSpecCentroid object.
335
336            Parameters
337            ----------
338            scan_number : int
339                The scan number.
340            polarity : int
341                The polarity of the data.
342            file_location : str
343                The file location.
344            label : str, optional
345                The label for the mass spectrum. Default is Labels.thermo_profile.
346
347            Returns
348            -------
349            dict
350                The output parameters ready for creating a MassSpecProfile or MassSpecCentroid object.
351            """
352            d_params = default_parameters(file_location)
353            d_params["label"] = label
354            d_params["polarity"] = polarity
355            d_params["filename_path"] = file_location
356            d_params["scan_number"] = scan_number
357
358            return d_params
359
360        # Open file
361        data = self.load()
362
363        # Pluck out individual scan mz and intensity
364        spec = data[scan_number]
365
366        # Get polarity
367        if spec["negative scan"] is not None:
368            polarity = -1
369        elif spec["positive scan"] is not None:
370            polarity = 1
371
372        # Get mass spectrum
373        if spectrum_mode == "profile":
374            # Check if profile
375            if not spec.get("MS:1000128"):
376                raise ValueError("spectrum is not profile")
377            data_dict = {
378                Labels.mz: spec.mz,
379                Labels.abundance: spec.i,
380            }
381            d_params = set_metadata(
382                scan_number,
383                polarity,
384                self.file_location,
385                label=Labels.simulated_profile,
386            )
387            mass_spectrum_obj = mass_spectrum_obj = MassSpecProfile(
388                data_dict, d_params, auto_process=auto_process
389            )
390        elif spectrum_mode == "centroid":
391            # Check if centroided
392            if not spec.get("MS:1000127"):
393                raise ValueError("spectrum is not centroided")
394            data_dict = {
395                Labels.mz: spec.mz,
396                Labels.abundance: spec.i,
397                Labels.rp: [np.nan] * len(spec.mz),
398                Labels.s2n: [np.nan] * len(spec.i),
399            }
400            d_params = set_metadata(
401                scan_number, polarity, self.file_location, label=Labels.corems_centroid
402            )
403            mass_spectrum_obj = MassSpecCentroid(
404                data_dict, d_params, auto_process=auto_process
405            )
406
407        return mass_spectrum_obj
408
409    def get_mass_spectra_obj(self):
410        """Instatiate a MassSpectraBase object from the mzML file.
411
412
413        Returns
414        -------
415        MassSpectraBase
416            The MassSpectra object containing the parsed mass spectra.
417            The object is instatiated with the mzML file, analyzer, instrument, sample name, and scan dataframe.
418        """
419        _, scan_df = self.run(spectra=False)
420        mass_spectra_obj = MassSpectraBase(
421            self.file_location,
422            self.analyzer,
423            self.instrument_label,
424            self.sample_name,
425            self,
426        )
427        scan_df = scan_df.set_index("scan", drop=False)
428        mass_spectra_obj.scan_df = scan_df
429
430        return mass_spectra_obj
431
432    def get_lcms_obj(self, spectra="all"):
433        """Instatiates a LCMSBase object from the mzML file.
434
435        Parameters
436        ----------
437        spectra : str, optional
438            Which mass spectra data to include in the output. Default is all.  Other options: none, ms1, ms2.
439
440        Returns
441        -------
442        LCMSBase
443            LCMS object containing mass spectra data.
444            The object is instatiated with the mzML file, analyzer, instrument, sample name, scan dataframe,
445            and mz dataframe(s), as well as lists of scan numbers, retention times, and TICs.
446        """
447        _, scan_df = self.run(spectra="none")  # first run it to just get scan info
448        res, scan_df = self.run(
449            scan_df=scan_df, spectra=spectra
450        )  # second run to parse data
451        lcms_obj = LCMSBase(
452            self.file_location,
453            self.analyzer,
454            self.instrument_label,
455            self.sample_name,
456            self,
457        )
458        for key in res:
459            key_int = int(key.replace("ms", ""))
460            res[key] = res[key][res[key].intensity > 0]
461            res[key] = res[key].sort_values(by=["scan", "mz"]).reset_index(drop=True)
462            lcms_obj._ms_unprocessed[key_int] = res[key]
463        lcms_obj.scan_df = scan_df.set_index("scan", drop=False)
464        # Check if polarity is mixed
465        if len(set(scan_df.polarity)) > 1:
466            raise ValueError("Mixed polarities detected in scan data")
467        lcms_obj.polarity = scan_df.polarity[0]
468        lcms_obj._scans_number_list = list(scan_df.scan)
469        lcms_obj._retention_time_list = list(scan_df.scan_time)
470        lcms_obj._tic_list = list(scan_df.tic)
471
472        return lcms_obj

A class for parsing mzml spectrometry data files into MassSpectraBase or LCMSBase objects

Parameters
  • file_location (str or Path): The path to the RAW file to be parsed.
  • analyzer (str, optional): The type of mass analyzer used in the instrument. Default is "Unknown".
  • instrument_label (str, optional): The name of the instrument used to acquire the data. Default is "Unknown".
  • sample_name (str, optional): The name of the sample being analyzed. If not provided, the stem of the file_location path will be used.
Attributes
  • file_location (Path): The path to the RAW file being parsed.
  • analyzer (str): The type of mass analyzer used in the instrument.
  • instrument_label (str): The name of the instrument used to acquire the data.
  • sample_name (str): The name of the sample being analyzed.
Methods
  • load(). Load mzML file using pymzml.run.Reader and return the data as a numpy array.
  • run(spectra=True). Parses the mzml file and returns a dictionary of mass spectra dataframes and a scan metadata dataframe.
  • get_mass_spectrum_from_scan(scan_number, polarity, auto_process=True) Parses the mzml file and returns a MassSpecBase object from a single scan.
  • get_mass_spectra_obj(). Parses the mzml file and instantiates a MassSpectraBase object.
  • get_lcms_obj(). Parses the mzml file and instantiates an LCMSBase object.

Inherits from ThermoBaseClass and SpectraParserInterface

MZMLSpectraParser( file_location, analyzer='Unknown', instrument_label='Unknown', sample_name=None)
60    def __init__(
61        self,
62        file_location,
63        analyzer="Unknown",
64        instrument_label="Unknown",
65        sample_name=None,
66    ):
67        # implementation details
68        if isinstance(file_location, str):
69            # if obj is a string it defaults to create a Path obj, pass the S3Path if needed
70            file_location = Path(file_location)
71        if not file_location.exists():
72            raise FileExistsError("File does not exist: " + str(file_location))
73        self.file_location = file_location
74        self.analyzer = analyzer
75        self.instrument_label = instrument_label
76
77        if sample_name:
78            self.sample_name = sample_name
79        else:
80            self.sample_name = file_location.stem
file_location
analyzer
instrument_label
def load(self):
82    def load(self):
83        """
84        Load mzML file using pymzml.run.Reader and return the data as a numpy array.
85
86        Returns
87        -------
88        numpy.ndarray
89            The mass spectra data as a numpy array.
90        """
91        data = pymzml.run.Reader(self.file_location)
92        return data

Load mzML file using pymzml.run.Reader and return the data as a numpy array.

Returns
  • numpy.ndarray: The mass spectra data as a numpy array.
def get_scan_df(self, data):
 94    def get_scan_df(self, data):
 95        """
 96        Return scan data as a pandas DataFrame.
 97
 98        Parameters
 99        ----------
100        data : pymzml.run.Reader
101            The mass spectra data.
102
103        Returns
104        -------
105        pandas.DataFrame
106            A pandas DataFrame containing metadata for each scan, including scan number, MS level, polarity, and scan time.
107        """
108        # Scan dict
109        # instatinate scan dict, with empty lists of size of scans
110        n_scans = data.get_spectrum_count()
111        scan_dict = {
112            "scan": np.empty(n_scans, dtype=np.int32),
113            "scan_time": np.empty(n_scans, dtype=np.float32),
114            "ms_level": [None] * n_scans,
115            "polarity": [None] * n_scans,
116            "precursor_mz": [None] * n_scans,
117            "scan_text": [None] * n_scans,
118            "scan_window_lower": np.empty(n_scans, dtype=np.float32),
119            "scan_window_upper": np.empty(n_scans, dtype=np.float32),
120            "scan_precision": [None] * n_scans,
121            "tic": np.empty(n_scans, dtype=np.float32),
122            "ms_format": [None] * n_scans,
123        }
124
125        # First pass: loop through scans to get scan info
126        for i, spec in enumerate(data):
127            scan_dict["scan"][i] = spec.ID
128            scan_dict["ms_level"][i] = spec.ms_level
129            scan_dict["scan_precision"][i] = spec._measured_precision
130            scan_dict["tic"][i] = spec.TIC
131            if spec.selected_precursors:
132                scan_dict["precursor_mz"][i] = spec.selected_precursors[0].get(
133                    "mz", None
134                )
135            if spec["negative scan"] is not None:
136                scan_dict["polarity"][i] = "negative"
137            if spec["positive scan"] is not None:
138                scan_dict["polarity"][i] = "positive"
139            if spec["negative scan"] is not None and spec["positive scan"] is not None:
140                raise ValueError(
141                    "Error: scan {0} has both negative and positive polarity".format(
142                        spec.ID
143                    )
144                )
145
146            scan_dict["scan_time"][i] = spec.get("MS:1000016")
147            scan_dict["scan_text"][i] = spec.get("MS:1000512")
148            scan_dict["scan_window_lower"][i] = spec.get("MS:1000501")
149            scan_dict["scan_window_upper"][i] = spec.get("MS:1000500")
150            if spec.get("MS:1000128"):
151                scan_dict["ms_format"][i] = "profile"
152            elif spec.get("MS:1000127"):
153                scan_dict["ms_format"][i] = "centroid"
154            else:
155                scan_dict["ms_format"][i] = None
156
157        scan_df = pd.DataFrame(scan_dict)
158
159        return scan_df

Return scan data as a pandas DataFrame.

Parameters
  • data (pymzml.run.Reader): The mass spectra data.
Returns
  • pandas.DataFrame: A pandas DataFrame containing metadata for each scan, including scan number, MS level, polarity, and scan time.
def get_ms_raw(self, spectra, scan_df, data):
161    def get_ms_raw(self, spectra, scan_df, data):
162        """Return a dictionary of mass spectra data as a pandas DataFrame.
163
164        Parameters
165        ----------
166        spectra : str
167            Which mass spectra data to include in the output.
168            Options: None, "ms1", "ms2", "all".
169        scan_df : pandas.DataFrame
170            Scan dataframe. Output from get_scan_df().
171        data : pymzml.run.Reader
172            The mass spectra data.
173
174        Returns
175        -------
176        dict
177            A dictionary containing the mass spectra data as pandas DataFrames, with keys corresponding to the MS level.
178
179        """
180        if spectra == "all":
181            scan_df_forspec = scan_df
182        elif spectra == "ms1":
183            scan_df_forspec = scan_df[scan_df.ms_level == 1]
184        elif spectra == "ms2":
185            scan_df_forspec = scan_df[scan_df.ms_level == 2]
186        else:
187            raise ValueError("spectra must be 'all', 'ms1', or 'ms2'")
188
189        # Result container
190        res = {}
191
192        # Row count container
193        counter = {}
194
195        # Column name container
196        cols = {}
197
198        # set at float32
199        dtype = np.float32
200
201        # First pass: get nrows
202        N = defaultdict(lambda: 0)
203        for i, spec in enumerate(data):
204            if i in scan_df_forspec.scan:
205                # Get ms level
206                level = "ms{}".format(spec.ms_level)
207
208                # Number of rows
209                N[level] += spec.mz.shape[0]
210
211        # Second pass: parse
212        for i, spec in enumerate(data):
213            if i in scan_df_forspec.scan:
214                # Number of rows
215                n = spec.mz.shape[0]
216
217                # No measurements
218                if n == 0:
219                    continue
220
221                # Dimension check
222                if len(spec.mz) != len(spec.i):
223                    # raise an error if the mz and intensity arrays are not the same length
224                    raise ValueError("m/z and intensity array dimension mismatch")
225
226                # Scan/frame info
227                id_dict = spec.id_dict
228
229                # Get ms level
230                level = "ms{}".format(spec.ms_level)
231
232                # Columns
233                cols[level] = list(id_dict.keys()) + ["mz", "intensity"]
234                m = len(cols[level])
235
236                # Subarray init
237                arr = np.empty((n, m), dtype=dtype)
238                inx = 0
239
240                # Populate scan/frame info
241                for k, v in id_dict.items():
242                    arr[:, inx] = v
243                    inx += 1
244
245                # Populate m/z
246                arr[:, inx] = spec.mz
247                inx += 1
248
249                # Populate intensity
250                arr[:, inx] = spec.i
251                inx += 1
252
253                # Initialize output container
254                if level not in res:
255                    res[level] = np.empty((N[level], m), dtype=dtype)
256                    counter[level] = 0
257
258                # Insert subarray
259                res[level][counter[level] : counter[level] + n, :] = arr
260                counter[level] += n
261
262        # Construct ms1 and ms2 mz dataframes
263        for level in res.keys():
264            res[level] = pd.DataFrame(res[level], columns=cols[level]).drop(
265                columns=["controllerType", "controllerNumber"],
266                axis=1,
267                inplace=False,
268            )
269
270        return res

Return a dictionary of mass spectra data as a pandas DataFrame.

Parameters
  • spectra (str): Which mass spectra data to include in the output. Options: None, "ms1", "ms2", "all".
  • scan_df (pandas.DataFrame): Scan dataframe. Output from get_scan_df().
  • data (pymzml.run.Reader): The mass spectra data.
Returns
  • dict: A dictionary containing the mass spectra data as pandas DataFrames, with keys corresponding to the MS level.
def run(self, spectra='all', scan_df=None):
272    def run(self, spectra="all", scan_df=None):
273        """Parse the mzML file and return a dictionary of spectra dataframes and a scan metadata dataframe.
274
275        Parameters
276        ----------
277        spectra : str, optional
278            Which mass spectra data to include in the output. Default is "all".
279            Other options: None, "ms1", "ms2".
280        scan_df : pandas.DataFrame, optional
281            Scan dataframe.  If not provided, the scan dataframe is created from the mzML file.
282
283        Returns
284        -------
285        tuple
286            A tuple containing two elements:
287            - A dictionary containing the mass spectra data as numpy arrays, with keys corresponding to the MS level.
288            - A pandas DataFrame containing metadata for each scan, including scan number, MS level, polarity, and scan time.
289        """
290
291        # Open file
292        data = self.load()
293
294        if scan_df is None:
295            scan_df = self.get_scan_df(data)
296
297        if spectra != "none":
298            res = self.get_ms_raw(spectra, scan_df, data)
299
300        else:
301            res = None
302
303        return res, scan_df

Parse the mzML file and return a dictionary of spectra dataframes and a scan metadata dataframe.

Parameters
  • spectra (str, optional): Which mass spectra data to include in the output. Default is "all". Other options: None, "ms1", "ms2".
  • scan_df (pandas.DataFrame, optional): Scan dataframe. If not provided, the scan dataframe is created from the mzML file.
Returns
  • tuple: A tuple containing two elements:
    • A dictionary containing the mass spectra data as numpy arrays, with keys corresponding to the MS level.
    • A pandas DataFrame containing metadata for each scan, including scan number, MS level, polarity, and scan time.
def get_mass_spectrum_from_scan(self, scan_number, spectrum_mode, auto_process=True):
305    def get_mass_spectrum_from_scan(
306        self, scan_number, spectrum_mode, auto_process=True
307    ):
308        """Instatiate a mass spectrum object from the mzML file.
309
310        Parameters
311        ----------
312        scan_number : int
313            The scan number to be parsed.
314        spectrum_mode : str
315            The type of spectrum to instantiate.  Must be'profile' or 'centroid'.
316        polarity : int
317            The polarity of the scan.  Must be -1 or 1.
318        auto_process : bool, optional
319            If True, process the mass spectrum. Default is True.
320
321        Returns
322        -------
323        MassSpecProfile | MassSpecCentroid
324            The MassSpecProfile or MassSpecCentroid object containing the parsed mass spectrum.
325        """
326
327        def set_metadata(
328            scan_number: int,
329            polarity: int,
330            file_location: str,
331            label=Labels.thermo_profile,
332        ):
333            """
334            Set the output parameters for creating a MassSpecProfile or MassSpecCentroid object.
335
336            Parameters
337            ----------
338            scan_number : int
339                The scan number.
340            polarity : int
341                The polarity of the data.
342            file_location : str
343                The file location.
344            label : str, optional
345                The label for the mass spectrum. Default is Labels.thermo_profile.
346
347            Returns
348            -------
349            dict
350                The output parameters ready for creating a MassSpecProfile or MassSpecCentroid object.
351            """
352            d_params = default_parameters(file_location)
353            d_params["label"] = label
354            d_params["polarity"] = polarity
355            d_params["filename_path"] = file_location
356            d_params["scan_number"] = scan_number
357
358            return d_params
359
360        # Open file
361        data = self.load()
362
363        # Pluck out individual scan mz and intensity
364        spec = data[scan_number]
365
366        # Get polarity
367        if spec["negative scan"] is not None:
368            polarity = -1
369        elif spec["positive scan"] is not None:
370            polarity = 1
371
372        # Get mass spectrum
373        if spectrum_mode == "profile":
374            # Check if profile
375            if not spec.get("MS:1000128"):
376                raise ValueError("spectrum is not profile")
377            data_dict = {
378                Labels.mz: spec.mz,
379                Labels.abundance: spec.i,
380            }
381            d_params = set_metadata(
382                scan_number,
383                polarity,
384                self.file_location,
385                label=Labels.simulated_profile,
386            )
387            mass_spectrum_obj = mass_spectrum_obj = MassSpecProfile(
388                data_dict, d_params, auto_process=auto_process
389            )
390        elif spectrum_mode == "centroid":
391            # Check if centroided
392            if not spec.get("MS:1000127"):
393                raise ValueError("spectrum is not centroided")
394            data_dict = {
395                Labels.mz: spec.mz,
396                Labels.abundance: spec.i,
397                Labels.rp: [np.nan] * len(spec.mz),
398                Labels.s2n: [np.nan] * len(spec.i),
399            }
400            d_params = set_metadata(
401                scan_number, polarity, self.file_location, label=Labels.corems_centroid
402            )
403            mass_spectrum_obj = MassSpecCentroid(
404                data_dict, d_params, auto_process=auto_process
405            )
406
407        return mass_spectrum_obj

Instatiate a mass spectrum object from the mzML file.

Parameters
  • scan_number (int): The scan number to be parsed.
  • spectrum_mode (str): The type of spectrum to instantiate. Must be'profile' or 'centroid'.
  • polarity (int): The polarity of the scan. Must be -1 or 1.
  • auto_process (bool, optional): If True, process the mass spectrum. Default is True.
Returns
  • MassSpecProfile | MassSpecCentroid: The MassSpecProfile or MassSpecCentroid object containing the parsed mass spectrum.
def get_mass_spectra_obj(self):
409    def get_mass_spectra_obj(self):
410        """Instatiate a MassSpectraBase object from the mzML file.
411
412
413        Returns
414        -------
415        MassSpectraBase
416            The MassSpectra object containing the parsed mass spectra.
417            The object is instatiated with the mzML file, analyzer, instrument, sample name, and scan dataframe.
418        """
419        _, scan_df = self.run(spectra=False)
420        mass_spectra_obj = MassSpectraBase(
421            self.file_location,
422            self.analyzer,
423            self.instrument_label,
424            self.sample_name,
425            self,
426        )
427        scan_df = scan_df.set_index("scan", drop=False)
428        mass_spectra_obj.scan_df = scan_df
429
430        return mass_spectra_obj

Instatiate a MassSpectraBase object from the mzML file.

Returns
  • MassSpectraBase: The MassSpectra object containing the parsed mass spectra. The object is instatiated with the mzML file, analyzer, instrument, sample name, and scan dataframe.
def get_lcms_obj(self, spectra='all'):
432    def get_lcms_obj(self, spectra="all"):
433        """Instatiates a LCMSBase object from the mzML file.
434
435        Parameters
436        ----------
437        spectra : str, optional
438            Which mass spectra data to include in the output. Default is all.  Other options: none, ms1, ms2.
439
440        Returns
441        -------
442        LCMSBase
443            LCMS object containing mass spectra data.
444            The object is instatiated with the mzML file, analyzer, instrument, sample name, scan dataframe,
445            and mz dataframe(s), as well as lists of scan numbers, retention times, and TICs.
446        """
447        _, scan_df = self.run(spectra="none")  # first run it to just get scan info
448        res, scan_df = self.run(
449            scan_df=scan_df, spectra=spectra
450        )  # second run to parse data
451        lcms_obj = LCMSBase(
452            self.file_location,
453            self.analyzer,
454            self.instrument_label,
455            self.sample_name,
456            self,
457        )
458        for key in res:
459            key_int = int(key.replace("ms", ""))
460            res[key] = res[key][res[key].intensity > 0]
461            res[key] = res[key].sort_values(by=["scan", "mz"]).reset_index(drop=True)
462            lcms_obj._ms_unprocessed[key_int] = res[key]
463        lcms_obj.scan_df = scan_df.set_index("scan", drop=False)
464        # Check if polarity is mixed
465        if len(set(scan_df.polarity)) > 1:
466            raise ValueError("Mixed polarities detected in scan data")
467        lcms_obj.polarity = scan_df.polarity[0]
468        lcms_obj._scans_number_list = list(scan_df.scan)
469        lcms_obj._retention_time_list = list(scan_df.scan_time)
470        lcms_obj._tic_list = list(scan_df.tic)
471
472        return lcms_obj

Instatiates a LCMSBase object from the mzML file.

Parameters
  • spectra (str, optional): Which mass spectra data to include in the output. Default is all. Other options: none, ms1, ms2.
Returns
  • LCMSBase: LCMS object containing mass spectra data. The object is instatiated with the mzML file, analyzer, instrument, sample name, scan dataframe, and mz dataframe(s), as well as lists of scan numbers, retention times, and TICs.