corems.mass_spectra.input.mzml

  1from collections import defaultdict
  2from pathlib import Path
  3
  4import numpy as np
  5import pandas as pd
  6import pymzml
  7import datetime
  8
  9from corems.encapsulation.constant import Labels
 10from corems.encapsulation.factory.parameters import default_parameters
 11from corems.mass_spectra.factory.lc_class import LCMSBase, MassSpectraBase
 12from corems.mass_spectra.input.parserbase import SpectraParserInterface
 13from corems.mass_spectrum.factory.MassSpectrumClasses import (
 14    MassSpecCentroid,
 15    MassSpecProfile,
 16)
 17
 18
 19class MZMLSpectraParser(SpectraParserInterface):
 20    """A class for parsing mzml spectrometry data files into MassSpectraBase or LCMSBase objects
 21
 22    Parameters
 23    ----------
 24    file_location : str or Path
 25        The path to the RAW file to be parsed.
 26    analyzer : str, optional
 27        The type of mass analyzer used in the instrument. Default is "Unknown".
 28    instrument_label : str, optional
 29        The name of the instrument used to acquire the data. Default is "Unknown".
 30    sample_name : str, optional
 31        The name of the sample being analyzed. If not provided, the stem of the file_location path will be used.
 32
 33    Attributes
 34    ----------
 35    file_location : Path
 36        The path to the RAW file being parsed.
 37    analyzer : str
 38        The type of mass analyzer used in the instrument.
 39    instrument_label : str
 40        The name of the instrument used to acquire the data.
 41    sample_name : str
 42        The name of the sample being analyzed.
 43
 44    Methods
 45    -------
 46    * load().
 47        Load mzML file using pymzml.run.Reader and return the data as a numpy array.
 48    * run(spectra=True).
 49        Parses the mzml file and returns a dictionary of mass spectra dataframes and a scan metadata dataframe.
 50    * get_mass_spectrum_from_scan(scan_number, polarity, auto_process=True)
 51        Parses the mzml file and returns a MassSpecBase object from a single scan.
 52    * get_mass_spectra_obj().
 53        Parses the mzml file and instantiates a MassSpectraBase object.
 54    * get_lcms_obj().
 55        Parses the mzml file and instantiates an LCMSBase object.
 56    * get_instrument_info().
 57        Return instrument information from the mzML file.
 58    * get_creation_time().
 59        Return the creation time of the mzML file as a datetime object.
 60
 61    Inherits from ThermoBaseClass and SpectraParserInterface
 62    """
 63
 64    def __init__(
 65        self,
 66        file_location,
 67        analyzer="Unknown",
 68        instrument_label="Unknown",
 69        sample_name=None,
 70    ):
 71        # implementation details
 72        if isinstance(file_location, str):
 73            # if obj is a string it defaults to create a Path obj, pass the S3Path if needed
 74            file_location = Path(file_location)
 75        if not file_location.exists():
 76            raise FileExistsError("File does not exist: " + str(file_location))
 77        self.file_location = file_location
 78        self.analyzer = analyzer
 79        self.instrument_label = instrument_label
 80
 81        if sample_name:
 82            self.sample_name = sample_name
 83        else:
 84            self.sample_name = file_location.stem
 85
 86    def load(self):
 87        """
 88        Load mzML file using pymzml.run.Reader and return the data as a numpy array.
 89
 90        Returns
 91        -------
 92        numpy.ndarray
 93            The mass spectra data as a numpy array.
 94        """
 95        data = pymzml.run.Reader(self.file_location)
 96        return data
 97
 98    def get_scan_df(self, data):
 99        """
100        Return scan data as a pandas DataFrame.
101
102        Parameters
103        ----------
104        data : pymzml.run.Reader
105            The mass spectra data.
106
107        Returns
108        -------
109        pandas.DataFrame
110            A pandas DataFrame containing metadata for each scan, including scan number, MS level, polarity, and scan time.
111        """
112        # Scan dict
113        # instatinate scan dict, with empty lists of size of scans
114        n_scans = data.get_spectrum_count()
115        scan_dict = {
116            "scan": np.empty(n_scans, dtype=np.int32),
117            "scan_time": np.empty(n_scans, dtype=np.float32),
118            "ms_level": [None] * n_scans,
119            "polarity": [None] * n_scans,
120            "precursor_mz": [None] * n_scans,
121            "scan_text": [None] * n_scans,
122            "scan_window_lower": np.empty(n_scans, dtype=np.float32),
123            "scan_window_upper": np.empty(n_scans, dtype=np.float32),
124            "scan_precision": [None] * n_scans,
125            "tic": np.empty(n_scans, dtype=np.float32),
126            "ms_format": [None] * n_scans,
127        }
128
129        # First pass: loop through scans to get scan info
130        for i, spec in enumerate(data):
131            scan_dict["scan"][i] = spec.ID
132            scan_dict["ms_level"][i] = spec.ms_level
133            scan_dict["scan_precision"][i] = spec._measured_precision
134            scan_dict["tic"][i] = spec.TIC
135            if spec.selected_precursors:
136                scan_dict["precursor_mz"][i] = spec.selected_precursors[0].get(
137                    "mz", None
138                )
139            if spec["negative scan"] is not None:
140                scan_dict["polarity"][i] = "negative"
141            if spec["positive scan"] is not None:
142                scan_dict["polarity"][i] = "positive"
143            if spec["negative scan"] is not None and spec["positive scan"] is not None:
144                raise ValueError(
145                    "Error: scan {0} has both negative and positive polarity".format(
146                        spec.ID
147                    )
148                )
149
150            scan_dict["scan_time"][i] = spec.get("MS:1000016")
151            scan_dict["scan_text"][i] = spec.get("MS:1000512")
152            scan_dict["scan_window_lower"][i] = spec.get("MS:1000501")
153            scan_dict["scan_window_upper"][i] = spec.get("MS:1000500")
154            if spec.get("MS:1000128"):
155                scan_dict["ms_format"][i] = "profile"
156            elif spec.get("MS:1000127"):
157                scan_dict["ms_format"][i] = "centroid"
158            else:
159                scan_dict["ms_format"][i] = None
160
161        scan_df = pd.DataFrame(scan_dict)
162
163        return scan_df
164
165    def get_ms_raw(self, spectra, scan_df, data):
166        """Return a dictionary of mass spectra data as a pandas DataFrame.
167
168        Parameters
169        ----------
170        spectra : str
171            Which mass spectra data to include in the output.
172            Options: None, "ms1", "ms2", "all".
173        scan_df : pandas.DataFrame
174            Scan dataframe. Output from get_scan_df().
175        data : pymzml.run.Reader
176            The mass spectra data.
177
178        Returns
179        -------
180        dict
181            A dictionary containing the mass spectra data as pandas DataFrames, with keys corresponding to the MS level.
182
183        """
184        if spectra == "all":
185            scan_df_forspec = scan_df
186        elif spectra == "ms1":
187            scan_df_forspec = scan_df[scan_df.ms_level == 1]
188        elif spectra == "ms2":
189            scan_df_forspec = scan_df[scan_df.ms_level == 2]
190        else:
191            raise ValueError("spectra must be 'all', 'ms1', or 'ms2'")
192
193        # Result container
194        res = {}
195
196        # Row count container
197        counter = {}
198
199        # Column name container
200        cols = {}
201
202        # set at float32
203        dtype = np.float32
204
205        # First pass: get nrows
206        N = defaultdict(lambda: 0)
207        for i, spec in enumerate(data):
208            if i in scan_df_forspec.scan:
209                # Get ms level
210                level = "ms{}".format(spec.ms_level)
211
212                # Number of rows
213                N[level] += spec.mz.shape[0]
214
215        # Second pass: parse
216        for i, spec in enumerate(data):
217            if i in scan_df_forspec.scan:
218                # Number of rows
219                n = spec.mz.shape[0]
220
221                # No measurements
222                if n == 0:
223                    continue
224
225                # Dimension check
226                if len(spec.mz) != len(spec.i):
227                    # raise an error if the mz and intensity arrays are not the same length
228                    raise ValueError("m/z and intensity array dimension mismatch")
229
230                # Scan/frame info
231                id_dict = spec.id_dict
232
233                # Get ms level
234                level = "ms{}".format(spec.ms_level)
235
236                # Columns
237                cols[level] = list(id_dict.keys()) + ["mz", "intensity"]
238                m = len(cols[level])
239
240                # Subarray init
241                arr = np.empty((n, m), dtype=dtype)
242                inx = 0
243
244                # Populate scan/frame info
245                for k, v in id_dict.items():
246                    arr[:, inx] = v
247                    inx += 1
248
249                # Populate m/z
250                arr[:, inx] = spec.mz
251                inx += 1
252
253                # Populate intensity
254                arr[:, inx] = spec.i
255                inx += 1
256
257                # Initialize output container
258                if level not in res:
259                    res[level] = np.empty((N[level], m), dtype=dtype)
260                    counter[level] = 0
261
262                # Insert subarray
263                res[level][counter[level] : counter[level] + n, :] = arr
264                counter[level] += n
265
266        # Construct ms1 and ms2 mz dataframes
267        for level in res.keys():
268            res[level] = pd.DataFrame(res[level], columns=cols[level]).drop(
269                columns=["controllerType", "controllerNumber"],
270                axis=1,
271                inplace=False,
272            )
273
274        return res
275
276    def run(self, spectra="all", scan_df=None):
277        """Parse the mzML file and return a dictionary of spectra dataframes and a scan metadata dataframe.
278
279        Parameters
280        ----------
281        spectra : str, optional
282            Which mass spectra data to include in the output. Default is "all".
283            Other options: None, "ms1", "ms2".
284        scan_df : pandas.DataFrame, optional
285            Scan dataframe.  If not provided, the scan dataframe is created from the mzML file.
286
287        Returns
288        -------
289        tuple
290            A tuple containing two elements:
291            - A dictionary containing the mass spectra data as numpy arrays, with keys corresponding to the MS level.
292            - A pandas DataFrame containing metadata for each scan, including scan number, MS level, polarity, and scan time.
293        """
294
295        # Open file
296        data = self.load()
297
298        if scan_df is None:
299            scan_df = self.get_scan_df(data)
300
301        if spectra != "none":
302            res = self.get_ms_raw(spectra, scan_df, data)
303
304        else:
305            res = None
306
307        return res, scan_df
308
309    def get_mass_spectrum_from_scan(
310        self, scan_number, spectrum_mode, auto_process=True
311    ):
312        """Instatiate a mass spectrum object from the mzML file.
313
314        Parameters
315        ----------
316        scan_number : int
317            The scan number to be parsed.
318        spectrum_mode : str
319            The type of spectrum to instantiate.  Must be'profile' or 'centroid'.
320        polarity : int
321            The polarity of the scan.  Must be -1 or 1.
322        auto_process : bool, optional
323            If True, process the mass spectrum. Default is True.
324
325        Returns
326        -------
327        MassSpecProfile | MassSpecCentroid
328            The MassSpecProfile or MassSpecCentroid object containing the parsed mass spectrum.
329        """
330        # Use the batch function and return the first result
331        result_list = self.get_mass_spectra_from_scan_list(
332            [scan_number], spectrum_mode, auto_process
333        )
334        return result_list[0] if result_list else None
335
336    def get_mass_spectra_from_scan_list(
337        self, scan_list, spectrum_mode, auto_process=True
338    ):
339        """Instatiate mass spectrum objects from the mzML file.
340
341        Parameters
342        ----------
343        scan_list : list of int
344            The scan numbers to be parsed.
345        spectrum_mode : str
346            The type of spectrum to instantiate.  Must be'profile' or 'centroid'.
347        auto_process : bool, optional
348            If True, process the mass spectrum. Default is True.
349
350        Returns
351        -------
352        list of MassSpecProfile | MassSpecCentroid
353            List of MassSpecProfile or MassSpecCentroid objects containing the parsed mass spectra.
354        """
355
356        def set_metadata(
357            scan_number: int,
358            polarity: int,
359            file_location: str,
360            label=Labels.thermo_profile,
361        ):
362            """
363            Set the output parameters for creating a MassSpecProfile or MassSpecCentroid object.
364
365            Parameters
366            ----------
367            scan_number : int
368                The scan number.
369            polarity : int
370                The polarity of the data.
371            file_location : str
372                The file location.
373            label : str, optional
374                The label for the mass spectrum. Default is Labels.thermo_profile.
375
376            Returns
377            -------
378            dict
379                The output parameters ready for creating a MassSpecProfile or MassSpecCentroid object.
380            """
381            d_params = default_parameters(file_location)
382            d_params["label"] = label
383            d_params["polarity"] = polarity
384            d_params["filename_path"] = file_location
385            d_params["scan_number"] = scan_number
386
387            return d_params
388
389        # Open file
390        data = self.load()
391
392        mass_spectrum_objects = []
393        
394        for scan_number in scan_list:
395            # Pluck out individual scan mz and intensity
396            spec = data[scan_number]
397
398            # Get polarity
399            if spec["negative scan"] is not None:
400                polarity = -1
401            elif spec["positive scan"] is not None:
402                polarity = 1
403
404            # Get mass spectrum
405            if spectrum_mode == "profile":
406                # Check if profile
407                if not spec.get("MS:1000128"):
408                    raise ValueError("spectrum is not profile")
409                data_dict = {
410                    Labels.mz: spec.mz,
411                    Labels.abundance: spec.i,
412                }
413                d_params = set_metadata(
414                    scan_number,
415                    polarity,
416                    self.file_location,
417                    label=Labels.simulated_profile,
418                )
419                mass_spectrum_obj = MassSpecProfile(
420                    data_dict, d_params, auto_process=auto_process
421                )
422            elif spectrum_mode == "centroid":
423                # Check if centroided
424                if not spec.get("MS:1000127"):
425                    raise ValueError("spectrum is not centroided")
426                data_dict = {
427                    Labels.mz: spec.mz,
428                    Labels.abundance: spec.i,
429                    Labels.rp: [np.nan] * len(spec.mz),
430                    Labels.s2n: [np.nan] * len(spec.i),
431                }
432                d_params = set_metadata(
433                    scan_number, polarity, self.file_location, label=Labels.corems_centroid
434                )
435                mass_spectrum_obj = MassSpecCentroid(
436                    data_dict, d_params, auto_process=auto_process
437                )
438            
439            mass_spectrum_objects.append(mass_spectrum_obj)
440
441        return mass_spectrum_objects
442
443    def get_mass_spectra_obj(self):
444        """Instatiate a MassSpectraBase object from the mzML file.
445
446
447        Returns
448        -------
449        MassSpectraBase
450            The MassSpectra object containing the parsed mass spectra.
451            The object is instatiated with the mzML file, analyzer, instrument, sample name, and scan dataframe.
452        """
453        _, scan_df = self.run(spectra=False)
454        mass_spectra_obj = MassSpectraBase(
455            self.file_location,
456            self.analyzer,
457            self.instrument_label,
458            self.sample_name,
459            self,
460        )
461        scan_df = scan_df.set_index("scan", drop=False)
462        mass_spectra_obj.scan_df = scan_df
463
464        return mass_spectra_obj
465
466    def get_lcms_obj(self, spectra="all"):
467        """Instatiates a LCMSBase object from the mzML file.
468
469        Parameters
470        ----------
471        spectra : str, optional
472            Which mass spectra data to include in the output. Default is all.  Other options: none, ms1, ms2.
473
474        Returns
475        -------
476        LCMSBase
477            LCMS object containing mass spectra data.
478            The object is instatiated with the mzML file, analyzer, instrument, sample name, scan dataframe,
479            and mz dataframe(s), as well as lists of scan numbers, retention times, and TICs.
480        """
481        _, scan_df = self.run(spectra="none")  # first run it to just get scan info
482        if spectra != "none":
483            res, scan_df = self.run(
484                scan_df=scan_df, spectra=spectra
485            )  # second run to parse data
486        lcms_obj = LCMSBase(
487            self.file_location,
488            self.analyzer,
489            self.instrument_label,
490            self.sample_name,
491            self,
492        )
493        if spectra != "none":
494            for key in res:
495                key_int = int(key.replace("ms", ""))
496                res[key] = res[key][res[key].intensity > 0]
497                res[key] = res[key].sort_values(by=["scan", "mz"]).reset_index(drop=True)
498                lcms_obj._ms_unprocessed[key_int] = res[key]
499        lcms_obj.scan_df = scan_df.set_index("scan", drop=False)
500        # Check if polarity is mixed
501        if len(set(scan_df.polarity)) > 1:
502            raise ValueError("Mixed polarities detected in scan data")
503        lcms_obj.polarity = scan_df.polarity[0]
504        lcms_obj._scans_number_list = list(scan_df.scan)
505        lcms_obj._retention_time_list = list(scan_df.scan_time)
506        lcms_obj._tic_list = list(scan_df.tic)
507
508        return lcms_obj
509
510    def get_instrument_info(self):
511        """
512        Return instrument information.
513
514        Returns
515        -------
516        dict
517            A dictionary with the keys 'model' and 'serial_number'.
518        """
519        # Load the pymzml data
520        data = self.load()
521        instrument_info = data.info.get('referenceable_param_group_list_element')[0]
522        cv_params = instrument_info.findall('{http://psi.hupo.org/ms/mzml}cvParam')
523
524        # Extract details from each cvParam
525        params = []
526        for param in cv_params:
527            accession = param.get('accession')  # Get 'accession' attribute
528            name = param.get('name')           # Get 'name' attribute
529            value = param.get('value')         # Get 'value' attribute
530            params.append({
531                'accession': accession,
532                'name': name,
533                'value': value
534            })
535
536        # Loop through params and try to find the relevant information
537        instrument_dict = {
538            'model': 'Unknown',
539            'serial_number': 'Unknown'
540        }
541
542        # Assuming there are only two paramters here - one is for the serial number (agnostic to the model) and the other is for the model
543        # If there are more than two, we raise an error
544        if len(params) < 2:
545            raise ValueError("Not enough parameters found in the instrument info, cannot parse.")
546        if len(params) > 2:
547            raise ValueError("Too many parameters found in the instrument info, cannot parse.")
548        for param in params:
549            if param['accession'] == 'MS:1000529':
550                instrument_dict['serial_number'] = param['value']
551            else:
552                instrument_dict['model'] = data.OT[param['accession']]     
553
554        return instrument_dict
555
556    def get_creation_time(self) -> datetime.datetime:
557        """
558        Return the creation time of the mzML file.
559        """
560        data = self.load()
561        write_time = data.info.get('start_time')
562        if write_time:
563            # Convert the write time to a datetime object
564            return datetime.datetime.strptime(write_time, "%Y-%m-%dT%H:%M:%SZ")
565        else:
566            raise ValueError("Creation time is not available in the mzML file. "
567                           "Please ensure the file contains the 'start_time' information.")
class MZMLSpectraParser(corems.mass_spectra.input.parserbase.SpectraParserInterface):
 20class MZMLSpectraParser(SpectraParserInterface):
 21    """A class for parsing mzml spectrometry data files into MassSpectraBase or LCMSBase objects
 22
 23    Parameters
 24    ----------
 25    file_location : str or Path
 26        The path to the RAW file to be parsed.
 27    analyzer : str, optional
 28        The type of mass analyzer used in the instrument. Default is "Unknown".
 29    instrument_label : str, optional
 30        The name of the instrument used to acquire the data. Default is "Unknown".
 31    sample_name : str, optional
 32        The name of the sample being analyzed. If not provided, the stem of the file_location path will be used.
 33
 34    Attributes
 35    ----------
 36    file_location : Path
 37        The path to the RAW file being parsed.
 38    analyzer : str
 39        The type of mass analyzer used in the instrument.
 40    instrument_label : str
 41        The name of the instrument used to acquire the data.
 42    sample_name : str
 43        The name of the sample being analyzed.
 44
 45    Methods
 46    -------
 47    * load().
 48        Load mzML file using pymzml.run.Reader and return the data as a numpy array.
 49    * run(spectra=True).
 50        Parses the mzml file and returns a dictionary of mass spectra dataframes and a scan metadata dataframe.
 51    * get_mass_spectrum_from_scan(scan_number, polarity, auto_process=True)
 52        Parses the mzml file and returns a MassSpecBase object from a single scan.
 53    * get_mass_spectra_obj().
 54        Parses the mzml file and instantiates a MassSpectraBase object.
 55    * get_lcms_obj().
 56        Parses the mzml file and instantiates an LCMSBase object.
 57    * get_instrument_info().
 58        Return instrument information from the mzML file.
 59    * get_creation_time().
 60        Return the creation time of the mzML file as a datetime object.
 61
 62    Inherits from ThermoBaseClass and SpectraParserInterface
 63    """
 64
 65    def __init__(
 66        self,
 67        file_location,
 68        analyzer="Unknown",
 69        instrument_label="Unknown",
 70        sample_name=None,
 71    ):
 72        # implementation details
 73        if isinstance(file_location, str):
 74            # if obj is a string it defaults to create a Path obj, pass the S3Path if needed
 75            file_location = Path(file_location)
 76        if not file_location.exists():
 77            raise FileExistsError("File does not exist: " + str(file_location))
 78        self.file_location = file_location
 79        self.analyzer = analyzer
 80        self.instrument_label = instrument_label
 81
 82        if sample_name:
 83            self.sample_name = sample_name
 84        else:
 85            self.sample_name = file_location.stem
 86
 87    def load(self):
 88        """
 89        Load mzML file using pymzml.run.Reader and return the data as a numpy array.
 90
 91        Returns
 92        -------
 93        numpy.ndarray
 94            The mass spectra data as a numpy array.
 95        """
 96        data = pymzml.run.Reader(self.file_location)
 97        return data
 98
 99    def get_scan_df(self, data):
100        """
101        Return scan data as a pandas DataFrame.
102
103        Parameters
104        ----------
105        data : pymzml.run.Reader
106            The mass spectra data.
107
108        Returns
109        -------
110        pandas.DataFrame
111            A pandas DataFrame containing metadata for each scan, including scan number, MS level, polarity, and scan time.
112        """
113        # Scan dict
114        # instatinate scan dict, with empty lists of size of scans
115        n_scans = data.get_spectrum_count()
116        scan_dict = {
117            "scan": np.empty(n_scans, dtype=np.int32),
118            "scan_time": np.empty(n_scans, dtype=np.float32),
119            "ms_level": [None] * n_scans,
120            "polarity": [None] * n_scans,
121            "precursor_mz": [None] * n_scans,
122            "scan_text": [None] * n_scans,
123            "scan_window_lower": np.empty(n_scans, dtype=np.float32),
124            "scan_window_upper": np.empty(n_scans, dtype=np.float32),
125            "scan_precision": [None] * n_scans,
126            "tic": np.empty(n_scans, dtype=np.float32),
127            "ms_format": [None] * n_scans,
128        }
129
130        # First pass: loop through scans to get scan info
131        for i, spec in enumerate(data):
132            scan_dict["scan"][i] = spec.ID
133            scan_dict["ms_level"][i] = spec.ms_level
134            scan_dict["scan_precision"][i] = spec._measured_precision
135            scan_dict["tic"][i] = spec.TIC
136            if spec.selected_precursors:
137                scan_dict["precursor_mz"][i] = spec.selected_precursors[0].get(
138                    "mz", None
139                )
140            if spec["negative scan"] is not None:
141                scan_dict["polarity"][i] = "negative"
142            if spec["positive scan"] is not None:
143                scan_dict["polarity"][i] = "positive"
144            if spec["negative scan"] is not None and spec["positive scan"] is not None:
145                raise ValueError(
146                    "Error: scan {0} has both negative and positive polarity".format(
147                        spec.ID
148                    )
149                )
150
151            scan_dict["scan_time"][i] = spec.get("MS:1000016")
152            scan_dict["scan_text"][i] = spec.get("MS:1000512")
153            scan_dict["scan_window_lower"][i] = spec.get("MS:1000501")
154            scan_dict["scan_window_upper"][i] = spec.get("MS:1000500")
155            if spec.get("MS:1000128"):
156                scan_dict["ms_format"][i] = "profile"
157            elif spec.get("MS:1000127"):
158                scan_dict["ms_format"][i] = "centroid"
159            else:
160                scan_dict["ms_format"][i] = None
161
162        scan_df = pd.DataFrame(scan_dict)
163
164        return scan_df
165
166    def get_ms_raw(self, spectra, scan_df, data):
167        """Return a dictionary of mass spectra data as a pandas DataFrame.
168
169        Parameters
170        ----------
171        spectra : str
172            Which mass spectra data to include in the output.
173            Options: None, "ms1", "ms2", "all".
174        scan_df : pandas.DataFrame
175            Scan dataframe. Output from get_scan_df().
176        data : pymzml.run.Reader
177            The mass spectra data.
178
179        Returns
180        -------
181        dict
182            A dictionary containing the mass spectra data as pandas DataFrames, with keys corresponding to the MS level.
183
184        """
185        if spectra == "all":
186            scan_df_forspec = scan_df
187        elif spectra == "ms1":
188            scan_df_forspec = scan_df[scan_df.ms_level == 1]
189        elif spectra == "ms2":
190            scan_df_forspec = scan_df[scan_df.ms_level == 2]
191        else:
192            raise ValueError("spectra must be 'all', 'ms1', or 'ms2'")
193
194        # Result container
195        res = {}
196
197        # Row count container
198        counter = {}
199
200        # Column name container
201        cols = {}
202
203        # set at float32
204        dtype = np.float32
205
206        # First pass: get nrows
207        N = defaultdict(lambda: 0)
208        for i, spec in enumerate(data):
209            if i in scan_df_forspec.scan:
210                # Get ms level
211                level = "ms{}".format(spec.ms_level)
212
213                # Number of rows
214                N[level] += spec.mz.shape[0]
215
216        # Second pass: parse
217        for i, spec in enumerate(data):
218            if i in scan_df_forspec.scan:
219                # Number of rows
220                n = spec.mz.shape[0]
221
222                # No measurements
223                if n == 0:
224                    continue
225
226                # Dimension check
227                if len(spec.mz) != len(spec.i):
228                    # raise an error if the mz and intensity arrays are not the same length
229                    raise ValueError("m/z and intensity array dimension mismatch")
230
231                # Scan/frame info
232                id_dict = spec.id_dict
233
234                # Get ms level
235                level = "ms{}".format(spec.ms_level)
236
237                # Columns
238                cols[level] = list(id_dict.keys()) + ["mz", "intensity"]
239                m = len(cols[level])
240
241                # Subarray init
242                arr = np.empty((n, m), dtype=dtype)
243                inx = 0
244
245                # Populate scan/frame info
246                for k, v in id_dict.items():
247                    arr[:, inx] = v
248                    inx += 1
249
250                # Populate m/z
251                arr[:, inx] = spec.mz
252                inx += 1
253
254                # Populate intensity
255                arr[:, inx] = spec.i
256                inx += 1
257
258                # Initialize output container
259                if level not in res:
260                    res[level] = np.empty((N[level], m), dtype=dtype)
261                    counter[level] = 0
262
263                # Insert subarray
264                res[level][counter[level] : counter[level] + n, :] = arr
265                counter[level] += n
266
267        # Construct ms1 and ms2 mz dataframes
268        for level in res.keys():
269            res[level] = pd.DataFrame(res[level], columns=cols[level]).drop(
270                columns=["controllerType", "controllerNumber"],
271                axis=1,
272                inplace=False,
273            )
274
275        return res
276
277    def run(self, spectra="all", scan_df=None):
278        """Parse the mzML file and return a dictionary of spectra dataframes and a scan metadata dataframe.
279
280        Parameters
281        ----------
282        spectra : str, optional
283            Which mass spectra data to include in the output. Default is "all".
284            Other options: None, "ms1", "ms2".
285        scan_df : pandas.DataFrame, optional
286            Scan dataframe.  If not provided, the scan dataframe is created from the mzML file.
287
288        Returns
289        -------
290        tuple
291            A tuple containing two elements:
292            - A dictionary containing the mass spectra data as numpy arrays, with keys corresponding to the MS level.
293            - A pandas DataFrame containing metadata for each scan, including scan number, MS level, polarity, and scan time.
294        """
295
296        # Open file
297        data = self.load()
298
299        if scan_df is None:
300            scan_df = self.get_scan_df(data)
301
302        if spectra != "none":
303            res = self.get_ms_raw(spectra, scan_df, data)
304
305        else:
306            res = None
307
308        return res, scan_df
309
310    def get_mass_spectrum_from_scan(
311        self, scan_number, spectrum_mode, auto_process=True
312    ):
313        """Instatiate a mass spectrum object from the mzML file.
314
315        Parameters
316        ----------
317        scan_number : int
318            The scan number to be parsed.
319        spectrum_mode : str
320            The type of spectrum to instantiate.  Must be'profile' or 'centroid'.
321        polarity : int
322            The polarity of the scan.  Must be -1 or 1.
323        auto_process : bool, optional
324            If True, process the mass spectrum. Default is True.
325
326        Returns
327        -------
328        MassSpecProfile | MassSpecCentroid
329            The MassSpecProfile or MassSpecCentroid object containing the parsed mass spectrum.
330        """
331        # Use the batch function and return the first result
332        result_list = self.get_mass_spectra_from_scan_list(
333            [scan_number], spectrum_mode, auto_process
334        )
335        return result_list[0] if result_list else None
336
337    def get_mass_spectra_from_scan_list(
338        self, scan_list, spectrum_mode, auto_process=True
339    ):
340        """Instatiate mass spectrum objects from the mzML file.
341
342        Parameters
343        ----------
344        scan_list : list of int
345            The scan numbers to be parsed.
346        spectrum_mode : str
347            The type of spectrum to instantiate.  Must be'profile' or 'centroid'.
348        auto_process : bool, optional
349            If True, process the mass spectrum. Default is True.
350
351        Returns
352        -------
353        list of MassSpecProfile | MassSpecCentroid
354            List of MassSpecProfile or MassSpecCentroid objects containing the parsed mass spectra.
355        """
356
357        def set_metadata(
358            scan_number: int,
359            polarity: int,
360            file_location: str,
361            label=Labels.thermo_profile,
362        ):
363            """
364            Set the output parameters for creating a MassSpecProfile or MassSpecCentroid object.
365
366            Parameters
367            ----------
368            scan_number : int
369                The scan number.
370            polarity : int
371                The polarity of the data.
372            file_location : str
373                The file location.
374            label : str, optional
375                The label for the mass spectrum. Default is Labels.thermo_profile.
376
377            Returns
378            -------
379            dict
380                The output parameters ready for creating a MassSpecProfile or MassSpecCentroid object.
381            """
382            d_params = default_parameters(file_location)
383            d_params["label"] = label
384            d_params["polarity"] = polarity
385            d_params["filename_path"] = file_location
386            d_params["scan_number"] = scan_number
387
388            return d_params
389
390        # Open file
391        data = self.load()
392
393        mass_spectrum_objects = []
394        
395        for scan_number in scan_list:
396            # Pluck out individual scan mz and intensity
397            spec = data[scan_number]
398
399            # Get polarity
400            if spec["negative scan"] is not None:
401                polarity = -1
402            elif spec["positive scan"] is not None:
403                polarity = 1
404
405            # Get mass spectrum
406            if spectrum_mode == "profile":
407                # Check if profile
408                if not spec.get("MS:1000128"):
409                    raise ValueError("spectrum is not profile")
410                data_dict = {
411                    Labels.mz: spec.mz,
412                    Labels.abundance: spec.i,
413                }
414                d_params = set_metadata(
415                    scan_number,
416                    polarity,
417                    self.file_location,
418                    label=Labels.simulated_profile,
419                )
420                mass_spectrum_obj = MassSpecProfile(
421                    data_dict, d_params, auto_process=auto_process
422                )
423            elif spectrum_mode == "centroid":
424                # Check if centroided
425                if not spec.get("MS:1000127"):
426                    raise ValueError("spectrum is not centroided")
427                data_dict = {
428                    Labels.mz: spec.mz,
429                    Labels.abundance: spec.i,
430                    Labels.rp: [np.nan] * len(spec.mz),
431                    Labels.s2n: [np.nan] * len(spec.i),
432                }
433                d_params = set_metadata(
434                    scan_number, polarity, self.file_location, label=Labels.corems_centroid
435                )
436                mass_spectrum_obj = MassSpecCentroid(
437                    data_dict, d_params, auto_process=auto_process
438                )
439            
440            mass_spectrum_objects.append(mass_spectrum_obj)
441
442        return mass_spectrum_objects
443
444    def get_mass_spectra_obj(self):
445        """Instatiate a MassSpectraBase object from the mzML file.
446
447
448        Returns
449        -------
450        MassSpectraBase
451            The MassSpectra object containing the parsed mass spectra.
452            The object is instatiated with the mzML file, analyzer, instrument, sample name, and scan dataframe.
453        """
454        _, scan_df = self.run(spectra=False)
455        mass_spectra_obj = MassSpectraBase(
456            self.file_location,
457            self.analyzer,
458            self.instrument_label,
459            self.sample_name,
460            self,
461        )
462        scan_df = scan_df.set_index("scan", drop=False)
463        mass_spectra_obj.scan_df = scan_df
464
465        return mass_spectra_obj
466
467    def get_lcms_obj(self, spectra="all"):
468        """Instatiates a LCMSBase object from the mzML file.
469
470        Parameters
471        ----------
472        spectra : str, optional
473            Which mass spectra data to include in the output. Default is all.  Other options: none, ms1, ms2.
474
475        Returns
476        -------
477        LCMSBase
478            LCMS object containing mass spectra data.
479            The object is instatiated with the mzML file, analyzer, instrument, sample name, scan dataframe,
480            and mz dataframe(s), as well as lists of scan numbers, retention times, and TICs.
481        """
482        _, scan_df = self.run(spectra="none")  # first run it to just get scan info
483        if spectra != "none":
484            res, scan_df = self.run(
485                scan_df=scan_df, spectra=spectra
486            )  # second run to parse data
487        lcms_obj = LCMSBase(
488            self.file_location,
489            self.analyzer,
490            self.instrument_label,
491            self.sample_name,
492            self,
493        )
494        if spectra != "none":
495            for key in res:
496                key_int = int(key.replace("ms", ""))
497                res[key] = res[key][res[key].intensity > 0]
498                res[key] = res[key].sort_values(by=["scan", "mz"]).reset_index(drop=True)
499                lcms_obj._ms_unprocessed[key_int] = res[key]
500        lcms_obj.scan_df = scan_df.set_index("scan", drop=False)
501        # Check if polarity is mixed
502        if len(set(scan_df.polarity)) > 1:
503            raise ValueError("Mixed polarities detected in scan data")
504        lcms_obj.polarity = scan_df.polarity[0]
505        lcms_obj._scans_number_list = list(scan_df.scan)
506        lcms_obj._retention_time_list = list(scan_df.scan_time)
507        lcms_obj._tic_list = list(scan_df.tic)
508
509        return lcms_obj
510
511    def get_instrument_info(self):
512        """
513        Return instrument information.
514
515        Returns
516        -------
517        dict
518            A dictionary with the keys 'model' and 'serial_number'.
519        """
520        # Load the pymzml data
521        data = self.load()
522        instrument_info = data.info.get('referenceable_param_group_list_element')[0]
523        cv_params = instrument_info.findall('{http://psi.hupo.org/ms/mzml}cvParam')
524
525        # Extract details from each cvParam
526        params = []
527        for param in cv_params:
528            accession = param.get('accession')  # Get 'accession' attribute
529            name = param.get('name')           # Get 'name' attribute
530            value = param.get('value')         # Get 'value' attribute
531            params.append({
532                'accession': accession,
533                'name': name,
534                'value': value
535            })
536
537        # Loop through params and try to find the relevant information
538        instrument_dict = {
539            'model': 'Unknown',
540            'serial_number': 'Unknown'
541        }
542
543        # Assuming there are only two paramters here - one is for the serial number (agnostic to the model) and the other is for the model
544        # If there are more than two, we raise an error
545        if len(params) < 2:
546            raise ValueError("Not enough parameters found in the instrument info, cannot parse.")
547        if len(params) > 2:
548            raise ValueError("Too many parameters found in the instrument info, cannot parse.")
549        for param in params:
550            if param['accession'] == 'MS:1000529':
551                instrument_dict['serial_number'] = param['value']
552            else:
553                instrument_dict['model'] = data.OT[param['accession']]     
554
555        return instrument_dict
556
557    def get_creation_time(self) -> datetime.datetime:
558        """
559        Return the creation time of the mzML file.
560        """
561        data = self.load()
562        write_time = data.info.get('start_time')
563        if write_time:
564            # Convert the write time to a datetime object
565            return datetime.datetime.strptime(write_time, "%Y-%m-%dT%H:%M:%SZ")
566        else:
567            raise ValueError("Creation time is not available in the mzML file. "
568                           "Please ensure the file contains the 'start_time' information.")

A class for parsing mzml spectrometry data files into MassSpectraBase or LCMSBase objects

Parameters
  • file_location (str or Path): The path to the RAW file to be parsed.
  • analyzer (str, optional): The type of mass analyzer used in the instrument. Default is "Unknown".
  • instrument_label (str, optional): The name of the instrument used to acquire the data. Default is "Unknown".
  • sample_name (str, optional): The name of the sample being analyzed. If not provided, the stem of the file_location path will be used.
Attributes
  • file_location (Path): The path to the RAW file being parsed.
  • analyzer (str): The type of mass analyzer used in the instrument.
  • instrument_label (str): The name of the instrument used to acquire the data.
  • sample_name (str): The name of the sample being analyzed.
Methods
  • load(). Load mzML file using pymzml.run.Reader and return the data as a numpy array.
  • run(spectra=True). Parses the mzml file and returns a dictionary of mass spectra dataframes and a scan metadata dataframe.
  • get_mass_spectrum_from_scan(scan_number, polarity, auto_process=True) Parses the mzml file and returns a MassSpecBase object from a single scan.
  • get_mass_spectra_obj(). Parses the mzml file and instantiates a MassSpectraBase object.
  • get_lcms_obj(). Parses the mzml file and instantiates an LCMSBase object.
  • get_instrument_info(). Return instrument information from the mzML file.
  • get_creation_time(). Return the creation time of the mzML file as a datetime object.

Inherits from ThermoBaseClass and SpectraParserInterface

MZMLSpectraParser( file_location, analyzer='Unknown', instrument_label='Unknown', sample_name=None)
65    def __init__(
66        self,
67        file_location,
68        analyzer="Unknown",
69        instrument_label="Unknown",
70        sample_name=None,
71    ):
72        # implementation details
73        if isinstance(file_location, str):
74            # if obj is a string it defaults to create a Path obj, pass the S3Path if needed
75            file_location = Path(file_location)
76        if not file_location.exists():
77            raise FileExistsError("File does not exist: " + str(file_location))
78        self.file_location = file_location
79        self.analyzer = analyzer
80        self.instrument_label = instrument_label
81
82        if sample_name:
83            self.sample_name = sample_name
84        else:
85            self.sample_name = file_location.stem
file_location
analyzer
instrument_label
def load(self):
87    def load(self):
88        """
89        Load mzML file using pymzml.run.Reader and return the data as a numpy array.
90
91        Returns
92        -------
93        numpy.ndarray
94            The mass spectra data as a numpy array.
95        """
96        data = pymzml.run.Reader(self.file_location)
97        return data

Load mzML file using pymzml.run.Reader and return the data as a numpy array.

Returns
  • numpy.ndarray: The mass spectra data as a numpy array.
def get_scan_df(self, data):
 99    def get_scan_df(self, data):
100        """
101        Return scan data as a pandas DataFrame.
102
103        Parameters
104        ----------
105        data : pymzml.run.Reader
106            The mass spectra data.
107
108        Returns
109        -------
110        pandas.DataFrame
111            A pandas DataFrame containing metadata for each scan, including scan number, MS level, polarity, and scan time.
112        """
113        # Scan dict
114        # instatinate scan dict, with empty lists of size of scans
115        n_scans = data.get_spectrum_count()
116        scan_dict = {
117            "scan": np.empty(n_scans, dtype=np.int32),
118            "scan_time": np.empty(n_scans, dtype=np.float32),
119            "ms_level": [None] * n_scans,
120            "polarity": [None] * n_scans,
121            "precursor_mz": [None] * n_scans,
122            "scan_text": [None] * n_scans,
123            "scan_window_lower": np.empty(n_scans, dtype=np.float32),
124            "scan_window_upper": np.empty(n_scans, dtype=np.float32),
125            "scan_precision": [None] * n_scans,
126            "tic": np.empty(n_scans, dtype=np.float32),
127            "ms_format": [None] * n_scans,
128        }
129
130        # First pass: loop through scans to get scan info
131        for i, spec in enumerate(data):
132            scan_dict["scan"][i] = spec.ID
133            scan_dict["ms_level"][i] = spec.ms_level
134            scan_dict["scan_precision"][i] = spec._measured_precision
135            scan_dict["tic"][i] = spec.TIC
136            if spec.selected_precursors:
137                scan_dict["precursor_mz"][i] = spec.selected_precursors[0].get(
138                    "mz", None
139                )
140            if spec["negative scan"] is not None:
141                scan_dict["polarity"][i] = "negative"
142            if spec["positive scan"] is not None:
143                scan_dict["polarity"][i] = "positive"
144            if spec["negative scan"] is not None and spec["positive scan"] is not None:
145                raise ValueError(
146                    "Error: scan {0} has both negative and positive polarity".format(
147                        spec.ID
148                    )
149                )
150
151            scan_dict["scan_time"][i] = spec.get("MS:1000016")
152            scan_dict["scan_text"][i] = spec.get("MS:1000512")
153            scan_dict["scan_window_lower"][i] = spec.get("MS:1000501")
154            scan_dict["scan_window_upper"][i] = spec.get("MS:1000500")
155            if spec.get("MS:1000128"):
156                scan_dict["ms_format"][i] = "profile"
157            elif spec.get("MS:1000127"):
158                scan_dict["ms_format"][i] = "centroid"
159            else:
160                scan_dict["ms_format"][i] = None
161
162        scan_df = pd.DataFrame(scan_dict)
163
164        return scan_df

Return scan data as a pandas DataFrame.

Parameters
  • data (pymzml.run.Reader): The mass spectra data.
Returns
  • pandas.DataFrame: A pandas DataFrame containing metadata for each scan, including scan number, MS level, polarity, and scan time.
def get_ms_raw(self, spectra, scan_df, data):
166    def get_ms_raw(self, spectra, scan_df, data):
167        """Return a dictionary of mass spectra data as a pandas DataFrame.
168
169        Parameters
170        ----------
171        spectra : str
172            Which mass spectra data to include in the output.
173            Options: None, "ms1", "ms2", "all".
174        scan_df : pandas.DataFrame
175            Scan dataframe. Output from get_scan_df().
176        data : pymzml.run.Reader
177            The mass spectra data.
178
179        Returns
180        -------
181        dict
182            A dictionary containing the mass spectra data as pandas DataFrames, with keys corresponding to the MS level.
183
184        """
185        if spectra == "all":
186            scan_df_forspec = scan_df
187        elif spectra == "ms1":
188            scan_df_forspec = scan_df[scan_df.ms_level == 1]
189        elif spectra == "ms2":
190            scan_df_forspec = scan_df[scan_df.ms_level == 2]
191        else:
192            raise ValueError("spectra must be 'all', 'ms1', or 'ms2'")
193
194        # Result container
195        res = {}
196
197        # Row count container
198        counter = {}
199
200        # Column name container
201        cols = {}
202
203        # set at float32
204        dtype = np.float32
205
206        # First pass: get nrows
207        N = defaultdict(lambda: 0)
208        for i, spec in enumerate(data):
209            if i in scan_df_forspec.scan:
210                # Get ms level
211                level = "ms{}".format(spec.ms_level)
212
213                # Number of rows
214                N[level] += spec.mz.shape[0]
215
216        # Second pass: parse
217        for i, spec in enumerate(data):
218            if i in scan_df_forspec.scan:
219                # Number of rows
220                n = spec.mz.shape[0]
221
222                # No measurements
223                if n == 0:
224                    continue
225
226                # Dimension check
227                if len(spec.mz) != len(spec.i):
228                    # raise an error if the mz and intensity arrays are not the same length
229                    raise ValueError("m/z and intensity array dimension mismatch")
230
231                # Scan/frame info
232                id_dict = spec.id_dict
233
234                # Get ms level
235                level = "ms{}".format(spec.ms_level)
236
237                # Columns
238                cols[level] = list(id_dict.keys()) + ["mz", "intensity"]
239                m = len(cols[level])
240
241                # Subarray init
242                arr = np.empty((n, m), dtype=dtype)
243                inx = 0
244
245                # Populate scan/frame info
246                for k, v in id_dict.items():
247                    arr[:, inx] = v
248                    inx += 1
249
250                # Populate m/z
251                arr[:, inx] = spec.mz
252                inx += 1
253
254                # Populate intensity
255                arr[:, inx] = spec.i
256                inx += 1
257
258                # Initialize output container
259                if level not in res:
260                    res[level] = np.empty((N[level], m), dtype=dtype)
261                    counter[level] = 0
262
263                # Insert subarray
264                res[level][counter[level] : counter[level] + n, :] = arr
265                counter[level] += n
266
267        # Construct ms1 and ms2 mz dataframes
268        for level in res.keys():
269            res[level] = pd.DataFrame(res[level], columns=cols[level]).drop(
270                columns=["controllerType", "controllerNumber"],
271                axis=1,
272                inplace=False,
273            )
274
275        return res

Return a dictionary of mass spectra data as a pandas DataFrame.

Parameters
  • spectra (str): Which mass spectra data to include in the output. Options: None, "ms1", "ms2", "all".
  • scan_df (pandas.DataFrame): Scan dataframe. Output from get_scan_df().
  • data (pymzml.run.Reader): The mass spectra data.
Returns
  • dict: A dictionary containing the mass spectra data as pandas DataFrames, with keys corresponding to the MS level.
def run(self, spectra='all', scan_df=None):
277    def run(self, spectra="all", scan_df=None):
278        """Parse the mzML file and return a dictionary of spectra dataframes and a scan metadata dataframe.
279
280        Parameters
281        ----------
282        spectra : str, optional
283            Which mass spectra data to include in the output. Default is "all".
284            Other options: None, "ms1", "ms2".
285        scan_df : pandas.DataFrame, optional
286            Scan dataframe.  If not provided, the scan dataframe is created from the mzML file.
287
288        Returns
289        -------
290        tuple
291            A tuple containing two elements:
292            - A dictionary containing the mass spectra data as numpy arrays, with keys corresponding to the MS level.
293            - A pandas DataFrame containing metadata for each scan, including scan number, MS level, polarity, and scan time.
294        """
295
296        # Open file
297        data = self.load()
298
299        if scan_df is None:
300            scan_df = self.get_scan_df(data)
301
302        if spectra != "none":
303            res = self.get_ms_raw(spectra, scan_df, data)
304
305        else:
306            res = None
307
308        return res, scan_df

Parse the mzML file and return a dictionary of spectra dataframes and a scan metadata dataframe.

Parameters
  • spectra (str, optional): Which mass spectra data to include in the output. Default is "all". Other options: None, "ms1", "ms2".
  • scan_df (pandas.DataFrame, optional): Scan dataframe. If not provided, the scan dataframe is created from the mzML file.
Returns
  • tuple: A tuple containing two elements:
    • A dictionary containing the mass spectra data as numpy arrays, with keys corresponding to the MS level.
    • A pandas DataFrame containing metadata for each scan, including scan number, MS level, polarity, and scan time.
def get_mass_spectrum_from_scan(self, scan_number, spectrum_mode, auto_process=True):
310    def get_mass_spectrum_from_scan(
311        self, scan_number, spectrum_mode, auto_process=True
312    ):
313        """Instatiate a mass spectrum object from the mzML file.
314
315        Parameters
316        ----------
317        scan_number : int
318            The scan number to be parsed.
319        spectrum_mode : str
320            The type of spectrum to instantiate.  Must be'profile' or 'centroid'.
321        polarity : int
322            The polarity of the scan.  Must be -1 or 1.
323        auto_process : bool, optional
324            If True, process the mass spectrum. Default is True.
325
326        Returns
327        -------
328        MassSpecProfile | MassSpecCentroid
329            The MassSpecProfile or MassSpecCentroid object containing the parsed mass spectrum.
330        """
331        # Use the batch function and return the first result
332        result_list = self.get_mass_spectra_from_scan_list(
333            [scan_number], spectrum_mode, auto_process
334        )
335        return result_list[0] if result_list else None

Instatiate a mass spectrum object from the mzML file.

Parameters
  • scan_number (int): The scan number to be parsed.
  • spectrum_mode (str): The type of spectrum to instantiate. Must be'profile' or 'centroid'.
  • polarity (int): The polarity of the scan. Must be -1 or 1.
  • auto_process (bool, optional): If True, process the mass spectrum. Default is True.
Returns
  • MassSpecProfile | MassSpecCentroid: The MassSpecProfile or MassSpecCentroid object containing the parsed mass spectrum.
def get_mass_spectra_from_scan_list(self, scan_list, spectrum_mode, auto_process=True):
337    def get_mass_spectra_from_scan_list(
338        self, scan_list, spectrum_mode, auto_process=True
339    ):
340        """Instatiate mass spectrum objects from the mzML file.
341
342        Parameters
343        ----------
344        scan_list : list of int
345            The scan numbers to be parsed.
346        spectrum_mode : str
347            The type of spectrum to instantiate.  Must be'profile' or 'centroid'.
348        auto_process : bool, optional
349            If True, process the mass spectrum. Default is True.
350
351        Returns
352        -------
353        list of MassSpecProfile | MassSpecCentroid
354            List of MassSpecProfile or MassSpecCentroid objects containing the parsed mass spectra.
355        """
356
357        def set_metadata(
358            scan_number: int,
359            polarity: int,
360            file_location: str,
361            label=Labels.thermo_profile,
362        ):
363            """
364            Set the output parameters for creating a MassSpecProfile or MassSpecCentroid object.
365
366            Parameters
367            ----------
368            scan_number : int
369                The scan number.
370            polarity : int
371                The polarity of the data.
372            file_location : str
373                The file location.
374            label : str, optional
375                The label for the mass spectrum. Default is Labels.thermo_profile.
376
377            Returns
378            -------
379            dict
380                The output parameters ready for creating a MassSpecProfile or MassSpecCentroid object.
381            """
382            d_params = default_parameters(file_location)
383            d_params["label"] = label
384            d_params["polarity"] = polarity
385            d_params["filename_path"] = file_location
386            d_params["scan_number"] = scan_number
387
388            return d_params
389
390        # Open file
391        data = self.load()
392
393        mass_spectrum_objects = []
394        
395        for scan_number in scan_list:
396            # Pluck out individual scan mz and intensity
397            spec = data[scan_number]
398
399            # Get polarity
400            if spec["negative scan"] is not None:
401                polarity = -1
402            elif spec["positive scan"] is not None:
403                polarity = 1
404
405            # Get mass spectrum
406            if spectrum_mode == "profile":
407                # Check if profile
408                if not spec.get("MS:1000128"):
409                    raise ValueError("spectrum is not profile")
410                data_dict = {
411                    Labels.mz: spec.mz,
412                    Labels.abundance: spec.i,
413                }
414                d_params = set_metadata(
415                    scan_number,
416                    polarity,
417                    self.file_location,
418                    label=Labels.simulated_profile,
419                )
420                mass_spectrum_obj = MassSpecProfile(
421                    data_dict, d_params, auto_process=auto_process
422                )
423            elif spectrum_mode == "centroid":
424                # Check if centroided
425                if not spec.get("MS:1000127"):
426                    raise ValueError("spectrum is not centroided")
427                data_dict = {
428                    Labels.mz: spec.mz,
429                    Labels.abundance: spec.i,
430                    Labels.rp: [np.nan] * len(spec.mz),
431                    Labels.s2n: [np.nan] * len(spec.i),
432                }
433                d_params = set_metadata(
434                    scan_number, polarity, self.file_location, label=Labels.corems_centroid
435                )
436                mass_spectrum_obj = MassSpecCentroid(
437                    data_dict, d_params, auto_process=auto_process
438                )
439            
440            mass_spectrum_objects.append(mass_spectrum_obj)
441
442        return mass_spectrum_objects

Instatiate mass spectrum objects from the mzML file.

Parameters
  • scan_list (list of int): The scan numbers to be parsed.
  • spectrum_mode (str): The type of spectrum to instantiate. Must be'profile' or 'centroid'.
  • auto_process (bool, optional): If True, process the mass spectrum. Default is True.
Returns
  • list of MassSpecProfile | MassSpecCentroid: List of MassSpecProfile or MassSpecCentroid objects containing the parsed mass spectra.
def get_mass_spectra_obj(self):
444    def get_mass_spectra_obj(self):
445        """Instatiate a MassSpectraBase object from the mzML file.
446
447
448        Returns
449        -------
450        MassSpectraBase
451            The MassSpectra object containing the parsed mass spectra.
452            The object is instatiated with the mzML file, analyzer, instrument, sample name, and scan dataframe.
453        """
454        _, scan_df = self.run(spectra=False)
455        mass_spectra_obj = MassSpectraBase(
456            self.file_location,
457            self.analyzer,
458            self.instrument_label,
459            self.sample_name,
460            self,
461        )
462        scan_df = scan_df.set_index("scan", drop=False)
463        mass_spectra_obj.scan_df = scan_df
464
465        return mass_spectra_obj

Instatiate a MassSpectraBase object from the mzML file.

Returns
  • MassSpectraBase: The MassSpectra object containing the parsed mass spectra. The object is instatiated with the mzML file, analyzer, instrument, sample name, and scan dataframe.
def get_lcms_obj(self, spectra='all'):
467    def get_lcms_obj(self, spectra="all"):
468        """Instatiates a LCMSBase object from the mzML file.
469
470        Parameters
471        ----------
472        spectra : str, optional
473            Which mass spectra data to include in the output. Default is all.  Other options: none, ms1, ms2.
474
475        Returns
476        -------
477        LCMSBase
478            LCMS object containing mass spectra data.
479            The object is instatiated with the mzML file, analyzer, instrument, sample name, scan dataframe,
480            and mz dataframe(s), as well as lists of scan numbers, retention times, and TICs.
481        """
482        _, scan_df = self.run(spectra="none")  # first run it to just get scan info
483        if spectra != "none":
484            res, scan_df = self.run(
485                scan_df=scan_df, spectra=spectra
486            )  # second run to parse data
487        lcms_obj = LCMSBase(
488            self.file_location,
489            self.analyzer,
490            self.instrument_label,
491            self.sample_name,
492            self,
493        )
494        if spectra != "none":
495            for key in res:
496                key_int = int(key.replace("ms", ""))
497                res[key] = res[key][res[key].intensity > 0]
498                res[key] = res[key].sort_values(by=["scan", "mz"]).reset_index(drop=True)
499                lcms_obj._ms_unprocessed[key_int] = res[key]
500        lcms_obj.scan_df = scan_df.set_index("scan", drop=False)
501        # Check if polarity is mixed
502        if len(set(scan_df.polarity)) > 1:
503            raise ValueError("Mixed polarities detected in scan data")
504        lcms_obj.polarity = scan_df.polarity[0]
505        lcms_obj._scans_number_list = list(scan_df.scan)
506        lcms_obj._retention_time_list = list(scan_df.scan_time)
507        lcms_obj._tic_list = list(scan_df.tic)
508
509        return lcms_obj

Instatiates a LCMSBase object from the mzML file.

Parameters
  • spectra (str, optional): Which mass spectra data to include in the output. Default is all. Other options: none, ms1, ms2.
Returns
  • LCMSBase: LCMS object containing mass spectra data. The object is instatiated with the mzML file, analyzer, instrument, sample name, scan dataframe, and mz dataframe(s), as well as lists of scan numbers, retention times, and TICs.
def get_instrument_info(self):
511    def get_instrument_info(self):
512        """
513        Return instrument information.
514
515        Returns
516        -------
517        dict
518            A dictionary with the keys 'model' and 'serial_number'.
519        """
520        # Load the pymzml data
521        data = self.load()
522        instrument_info = data.info.get('referenceable_param_group_list_element')[0]
523        cv_params = instrument_info.findall('{http://psi.hupo.org/ms/mzml}cvParam')
524
525        # Extract details from each cvParam
526        params = []
527        for param in cv_params:
528            accession = param.get('accession')  # Get 'accession' attribute
529            name = param.get('name')           # Get 'name' attribute
530            value = param.get('value')         # Get 'value' attribute
531            params.append({
532                'accession': accession,
533                'name': name,
534                'value': value
535            })
536
537        # Loop through params and try to find the relevant information
538        instrument_dict = {
539            'model': 'Unknown',
540            'serial_number': 'Unknown'
541        }
542
543        # Assuming there are only two paramters here - one is for the serial number (agnostic to the model) and the other is for the model
544        # If there are more than two, we raise an error
545        if len(params) < 2:
546            raise ValueError("Not enough parameters found in the instrument info, cannot parse.")
547        if len(params) > 2:
548            raise ValueError("Too many parameters found in the instrument info, cannot parse.")
549        for param in params:
550            if param['accession'] == 'MS:1000529':
551                instrument_dict['serial_number'] = param['value']
552            else:
553                instrument_dict['model'] = data.OT[param['accession']]     
554
555        return instrument_dict

Return instrument information.

Returns
  • dict: A dictionary with the keys 'model' and 'serial_number'.
def get_creation_time(self) -> datetime.datetime:
557    def get_creation_time(self) -> datetime.datetime:
558        """
559        Return the creation time of the mzML file.
560        """
561        data = self.load()
562        write_time = data.info.get('start_time')
563        if write_time:
564            # Convert the write time to a datetime object
565            return datetime.datetime.strptime(write_time, "%Y-%m-%dT%H:%M:%SZ")
566        else:
567            raise ValueError("Creation time is not available in the mzML file. "
568                           "Please ensure the file contains the 'start_time' information.")

Return the creation time of the mzML file.