corems.mass_spectra.input.corems_hdf5

  1__author__ = "Yuri E. Corilo"
  2__date__ = "Oct 29, 2019"
  3
  4
  5from threading import Thread
  6from pathlib import Path
  7
  8import pandas as pd
  9import warnings
 10
 11from corems.chroma_peak.factory.chroma_peak_classes import LCMSMassFeature
 12from corems.encapsulation.input.parameter_from_json import (
 13    load_and_set_json_parameters_lcms,
 14    load_and_set_toml_parameters_lcms,
 15)
 16from corems.mass_spectra.factory.lc_class import LCMSBase, MassSpectraBase
 17from corems.mass_spectra.factory.chromat_data import EIC_Data
 18from corems.mass_spectra.input.parserbase import SpectraParserInterface
 19from corems.mass_spectrum.input.coremsHDF5 import ReadCoreMSHDF_MassSpectrum
 20from corems.molecular_id.factory.spectrum_search_results import SpectrumSearchResults
 21from corems.mass_spectra.input.rawFileReader import ImportMassSpectraThermoMSFileReader
 22from corems.mass_spectra.input.mzml import MZMLSpectraParser
 23
 24
 25class ReadCoreMSHDFMassSpectra(
 26    SpectraParserInterface, ReadCoreMSHDF_MassSpectrum, Thread
 27):
 28    """Class to read CoreMS HDF5 files and populate a LCMS or MassSpectraBase object.
 29
 30    Parameters
 31    ----------
 32    file_location : str
 33        The location of the HDF5 file to read, including the suffix.
 34
 35    Attributes
 36    ----------
 37    file_location : str
 38        The location of the HDF5 file to read.
 39    h5pydata : h5py.File
 40        The HDF5 file object.
 41    scans : list
 42        A list of the location of individual mass spectra within the HDF5 file.
 43    scan_number_list : list
 44        A list of the scan numbers of the mass spectra within the HDF5 file.
 45    parameters_location : str
 46        The location of the parameters file (json or toml).
 47
 48    Methods
 49    -------
 50    * import_mass_spectra(mass_spectra).
 51        Imports all mass spectra from the HDF5 file onto the LCMS or MassSpectraBase object.
 52    * get_mass_spectrum_from_scan(scan_number).
 53        Return mass spectrum data object from scan number.
 54    * load().
 55        Placeholder method to meet the requirements of the SpectraParserInterface.
 56    * run(mass_spectra).
 57        Runs the importer functions to populate a LCMS or MassSpectraBase object.
 58    * import_scan_info(mass_spectra).
 59        Imports the scan info from the HDF5 file to populate the _scan_info attribute
 60        on the LCMS or MassSpectraBase object
 61    * import_ms_unprocessed(mass_spectra).
 62        Imports the unprocessed mass spectra from the HDF5 file to populate the
 63        _ms_unprocessed attribute on the LCMS or MassSpectraBase object
 64    * import_parameters(mass_spectra).
 65        Imports the parameters from the HDF5 file to populate the parameters
 66        attribute on the LCMS or MassSpectraBase object
 67    * import_mass_features(mass_spectra).
 68        Imports the mass features from the HDF5 file to populate the mass_features
 69        attribute on the LCMS or MassSpectraBase object
 70    * import_eics(mass_spectra).
 71        Imports the extracted ion chromatograms from the HDF5 file to populate the
 72        eics attribute on the LCMS or MassSpectraBase object
 73    * import_spectral_search_results(mass_spectra).
 74        Imports the spectral search results from the HDF5 file to populate the
 75        spectral_search_results attribute on the LCMS or MassSpectraBase object
 76    * get_mass_spectra_obj().
 77        Return mass spectra data object, populating the _ms list on the LCMS or
 78        MassSpectraBase object from the HDF5 file
 79    * get_lcms_obj().
 80        Return LCMSBase object, populating the majority of the attributes on the
 81        LCMS object from the HDF5 file
 82
 83    """
 84
 85    def __init__(self, file_location: str):
 86        Thread.__init__(self)
 87        ReadCoreMSHDF_MassSpectrum.__init__(self, file_location)
 88
 89        # override the scans attribute on ReadCoreMSHDF_MassSpectrum class to expect a nested location within the HDF5 file
 90        self.scans = [
 91            "mass_spectra/" + x for x in list(self.h5pydata["mass_spectra"].keys())
 92        ]
 93        self.scan_number_list = sorted(
 94            [int(float(i)) for i in list(self.h5pydata["mass_spectra"].keys())]
 95        )
 96
 97        # set the location of the parameters file (json or toml)
 98        add_files = [
 99            x
100            for x in self.file_location.parent.glob(
101                self.file_location.name.replace(".hdf5", ".*")
102            )
103            if x.suffix != ".hdf5"
104        ]
105        if len([x for x in add_files if x.suffix == ".json"]) > 0:
106            self.parameters_location = [x for x in add_files if x.suffix == ".json"][0]
107        elif len([x for x in add_files if x.suffix == ".toml"]) > 0:
108            self.parameters_location = [x for x in add_files if x.suffix == ".toml"][0]
109        else:
110            self.parameters_location = None
111
112    def get_mass_spectrum_from_scan(self, scan_number):
113        """Return mass spectrum data object from scan number."""
114        if scan_number in self.scan_number_list:
115            mass_spec = self.get_mass_spectrum(scan_number)
116            return mass_spec
117        else:
118            raise Exception("Scan number not found in HDF5 file.")
119
120    def get_mass_spectra_from_scan_list(
121        self, scan_list, spectrum_mode, auto_process=True
122    ):
123        """Return a list of mass spectrum data objects from a list of scan numbers.
124
125        Parameters
126        ----------
127        scan_list : list
128            A list of scan numbers to retrieve mass spectra for.
129        spectrum_mode : str
130            The spectrum mode to use when retrieving the mass spectra.
131            Note that this parameter is not used for CoreMS HDF5 files, as the spectra are already processed and only
132            centroided spectra are saved.
133        auto_process : bool
134            If True, automatically process the mass spectra when retrieving them.
135            Note that this parameter is not used for CoreMS HDF5 files, as the spectra are already processed and only
136            centroided spectra are saved.
137
138        Returns
139        -------
140        list
141            A list of mass spectrum data objects corresponding to the provided scan numbers.
142        """
143        mass_spectra_list = []
144        for scan_number in scan_list:
145            if scan_number in self.scan_number_list:
146                mass_spec = self.get_mass_spectrum_from_scan(scan_number)
147                mass_spectra_list.append(mass_spec)
148            else:
149                warnings.warn(f"Scan number {scan_number} not found in HDF5 file.")
150        return mass_spectra_list
151
152    def load(self) -> None:
153        """ """
154        pass
155
156    def get_ms_raw(self, spectra=None, scan_df=None) -> dict:
157        """ """
158        # Warn if spectra or scan_df are not None that they are not used for CoreMS HDF5 files and should be rerun after instantiation
159        if spectra is not None or scan_df is not None:
160            SyntaxWarning(
161                "get_ms_raw method for CoreMS HDF5 files can only access saved data, consider rerunning after instantiation."
162            )
163        ms_unprocessed = {}
164        dict_group_load = self.h5pydata["ms_unprocessed"]
165        dict_group_keys = dict_group_load.keys()
166        for k in dict_group_keys:
167            ms_up_int = dict_group_load[k][:]
168            ms_unprocessed[int(k)] = pd.DataFrame(
169                ms_up_int, columns=["scan", "mz", "intensity"]
170            )
171        return ms_unprocessed
172
173    def get_scan_df(self) -> pd.DataFrame:
174        scan_info = {}
175        dict_group_load = self.h5pydata["scan_info"]
176        dict_group_keys = dict_group_load.keys()
177        for k in dict_group_keys:
178            scan_info[k] = dict_group_load[k][:]
179        scan_df = pd.DataFrame(scan_info)
180        scan_df.set_index("scan", inplace=True, drop=False)
181        str_df = scan_df.select_dtypes([object])
182        str_df = str_df.stack().str.decode("utf-8").unstack()
183        for col in str_df:
184            scan_df[col] = str_df[col]
185        return scan_df
186
187    def run(self, mass_spectra, load_raw=True) -> None:
188        """Runs the importer functions to populate a LCMS or MassSpectraBase object.
189
190        Notes
191        -----
192        The following functions are run in order, if the HDF5 file contains the necessary data:
193        1. import_parameters(), which populates the parameters attribute on the LCMS or MassSpectraBase object.
194        2. import_mass_spectra(), which populates the _ms list on the LCMS or MassSpectraBase object.
195        3. import_scan_info(), which populates the _scan_info on the LCMS or MassSpectraBase object.
196        4. import_ms_unprocessed(), which populates the _ms_unprocessed attribute on the LCMS or MassSpectraBase object.
197        5. import_mass_features(), which populates the mass_features attribute on the LCMS or MassSpectraBase object.
198        6. import_eics(), which populates the eics attribute on the LCMS or MassSpectraBase object.
199        7. import_spectral_search_results(), which populates the spectral_search_results attribute on the LCMS or MassSpectraBase object.
200
201        Parameters
202        ----------
203        mass_spectra : LCMSBase or MassSpectraBase
204            The LCMS or MassSpectraBase object to populate with mass spectra, generally instantiated with only the file_location, analyzer, and instrument_label attributes.
205        load_raw : bool
206            If True, load raw data (unprocessed) from HDF5 files for overall lcms object and individual mass spectra. Default is True.
207        Returns
208        -------
209        None, but populates several attributes on the LCMS or MassSpectraBase object.
210
211        """
212        if self.parameters_location is not None:
213            # Populate the parameters attribute on the LCMS object
214            self.import_parameters(mass_spectra)
215
216        if "mass_spectra" in self.h5pydata:
217            # Populate the _ms list on the LCMS object
218            self.import_mass_spectra(mass_spectra, load_raw=load_raw)
219
220        if "scan_info" in self.h5pydata:
221            # Populate the _scan_info attribute on the LCMS object
222            self.import_scan_info(mass_spectra)
223
224        if "ms_unprocessed" in self.h5pydata and load_raw:
225            # Populate the _ms_unprocessed attribute on the LCMS object
226            self.import_ms_unprocessed(mass_spectra)
227
228        if "mass_features" in self.h5pydata:
229            # Populate the mass_features attribute on the LCMS object
230            self.import_mass_features(mass_spectra)
231
232        if "eics" in self.h5pydata:
233            # Populate the eics attribute on the LCMS object
234            self.import_eics(mass_spectra)
235
236        if "spectral_search_results" in self.h5pydata:
237            # Populate the spectral_search_results attribute on the LCMS object
238            self.import_spectral_search_results(mass_spectra)
239
240    def import_mass_spectra(self, mass_spectra, load_raw=True) -> None:
241        """Imports all mass spectra from the HDF5 file.
242
243        Parameters
244        ----------
245        mass_spectra : LCMSBase | MassSpectraBase
246            The MassSpectraBase or LCMSBase object to populate with mass spectra.
247        load_raw : bool
248            If True, load raw data (unprocessed) from HDF5 files for overall lcms object and individual mass spectra. Default
249
250        Returns
251        -------
252        None, but populates the '_ms' list on the LCMSBase or MassSpectraBase
253        object with mass spectra from the HDF5 file.
254        """
255        for scan_number in self.scan_number_list:
256            mass_spec = self.get_mass_spectrum(scan_number, load_raw=load_raw)
257            mass_spec.scan_number = scan_number
258            mass_spectra.add_mass_spectrum(mass_spec)
259
260    def import_scan_info(self, mass_spectra) -> None:
261        """Imports the scan info from the HDF5 file.
262
263        Parameters
264        ----------
265        lcms : LCMSBase | MassSpectraBase
266            The MassSpectraBase or LCMSBase objects
267
268        Returns
269        -------
270        None, but populates the 'scan_df' attribute on the LCMSBase or MassSpectraBase
271        object with a pandas DataFrame of the 'scan_info' from the HDF5 file.
272
273        """
274        scan_df = self.get_scan_df()
275        mass_spectra.scan_df = scan_df
276
277    def import_ms_unprocessed(self, mass_spectra) -> None:
278        """Imports the unprocessed mass spectra from the HDF5 file.
279
280        Parameters
281        ----------
282        lcms : LCMSBase | MassSpectraBase
283            The MassSpectraBase or LCMSBase objects
284
285        Returns
286        -------
287        None, but populates the '_ms_unprocessed' attribute on the LCMSBase or MassSpectraBase
288        object with a dictionary of the 'ms_unprocessed' from the HDF5 file.
289
290        """
291        ms_unprocessed = self.get_ms_raw()
292        mass_spectra._ms_unprocessed = ms_unprocessed
293
294    def import_parameters(self, mass_spectra) -> None:
295        """Imports the parameters from the HDF5 file.
296
297        Parameters
298        ----------
299        mass_spectra : LCMSBase | MassSpectraBase
300            The MassSpectraBase or LCMSBase object to populate with parameters.
301
302        Returns
303        -------
304        None, but populates the 'parameters' attribute on the LCMS or MassSpectraBase
305        object with a dictionary of the 'parameters' from the HDF5 file.
306
307        """
308        if ".json" == self.parameters_location.suffix:
309            load_and_set_json_parameters_lcms(mass_spectra, self.parameters_location)
310        if ".toml" == self.parameters_location.suffix:
311            load_and_set_toml_parameters_lcms(mass_spectra, self.parameters_location)
312        else:
313            raise Exception(
314                "Parameters file must be in JSON format, TOML format is not yet supported."
315            )
316
317    def import_mass_features(self, mass_spectra) -> None:
318        """Imports the mass features from the HDF5 file.
319
320        Parameters
321        ----------
322        mass_spectra : LCMSBase | MassSpectraBase
323            The MassSpectraBase or LCMSBase object to populate with mass features.
324
325        Returns
326        -------
327        None, but populates the 'mass_features' attribute on the LCMSBase or MassSpectraBase
328        object with a dictionary of the 'mass_features' from the HDF5 file.
329
330        """
331        dict_group_load = self.h5pydata["mass_features"]
332        dict_group_keys = dict_group_load.keys()
333        for k in dict_group_keys:
334            # Instantiate the MassFeature object
335            mass_feature = LCMSMassFeature(
336                mass_spectra,
337                mz=dict_group_load[k].attrs["_mz_exp"],
338                retention_time=dict_group_load[k].attrs["_retention_time"],
339                intensity=dict_group_load[k].attrs["_intensity"],
340                apex_scan=dict_group_load[k].attrs["_apex_scan"],
341                persistence=dict_group_load[k].attrs["_persistence"],
342                id=int(k),
343            )
344
345            # Populate additional attributes on the MassFeature object
346            for key in dict_group_load[k].attrs.keys() - {
347                "_mz_exp",
348                "_mz_cal",
349                "_retention_time",
350                "_intensity",
351                "_apex_scan",
352                "_persistence",
353            }:
354                setattr(mass_feature, key, dict_group_load[k].attrs[key])
355
356            # Populate attributes on MassFeature object that are lists
357            for key in dict_group_load[k].keys():
358                setattr(mass_feature, key, dict_group_load[k][key][:])
359                # Convert _noise_score from array to tuple
360                if key == "_noise_score":
361                    mass_feature._noise_score = tuple(mass_feature._noise_score)
362            mass_spectra.mass_features[int(k)] = mass_feature
363
364        # Associate mass features with ms1 and ms2 spectra, if available
365        for mf_id in mass_spectra.mass_features.keys():
366            if mass_spectra.mass_features[mf_id].apex_scan in mass_spectra._ms.keys():
367                mass_spectra.mass_features[mf_id].mass_spectrum = mass_spectra._ms[
368                    mass_spectra.mass_features[mf_id].apex_scan
369                ]
370            if mass_spectra.mass_features[mf_id].ms2_scan_numbers is not None:
371                for ms2_scan in mass_spectra.mass_features[mf_id].ms2_scan_numbers:
372                    if ms2_scan in mass_spectra._ms.keys():
373                        mass_spectra.mass_features[mf_id].ms2_mass_spectra[ms2_scan] = (
374                            mass_spectra._ms[ms2_scan]
375                        )
376
377    def import_eics(self, mass_spectra):
378        """Imports the extracted ion chromatograms from the HDF5 file.
379
380        Parameters
381        ----------
382        mass_spectra : LCMSBase | MassSpectraBase
383            The MassSpectraBase or LCMSBase object to populate with extracted ion chromatograms.
384
385        Returns
386        -------
387        None, but populates the 'eics' attribute on the LCMSBase or MassSpectraBase
388        object with a dictionary of the 'eics' from the HDF5 file.
389
390        """
391        dict_group_load = self.h5pydata["eics"]
392        dict_group_keys = dict_group_load.keys()
393        for k in dict_group_keys:
394            my_eic = EIC_Data(
395                scans=dict_group_load[k]["scans"][:],
396                time=dict_group_load[k]["time"][:],
397                eic=dict_group_load[k]["eic"][:],
398            )
399            for key in dict_group_load[k].keys():
400                if key not in ["scans", "time", "eic"]:
401                    setattr(my_eic, key, dict_group_load[k][key][:])
402                    # if key is apexes, convert to a tuple of a list
403                    if key == "apexes" and len(my_eic.apexes) > 0:
404                        my_eic.apexes = [tuple(x) for x in my_eic.apexes]
405            # Add to mass_spectra object
406            mass_spectra.eics[dict_group_load[k].attrs["mz"]] = my_eic
407
408        # Add to mass features
409        for idx in mass_spectra.mass_features.keys():
410            mz = mass_spectra.mass_features[idx].mz
411            if mz in mass_spectra.eics.keys():
412                mass_spectra.mass_features[idx]._eic_data = mass_spectra.eics[mz]
413
414    def import_spectral_search_results(self, mass_spectra):
415        """Imports the spectral search results from the HDF5 file.
416
417        Parameters
418        ----------
419        mass_spectra : LCMSBase | MassSpectraBase
420            The MassSpectraBase or LCMSBase object to populate with spectral search results.
421
422        Returns
423        -------
424        None, but populates the 'spectral_search_results' attribute on the LCMSBase or MassSpectraBase
425        object with a dictionary of the 'spectral_search_results' from the HDF5 file.
426
427        """
428        overall_results_dict = {}
429        ms2_results_load = self.h5pydata["spectral_search_results"]
430        for k in ms2_results_load.keys():
431            overall_results_dict[int(k)] = {}
432            for k2 in ms2_results_load[k].keys():
433                ms2_search_res = SpectrumSearchResults(
434                    query_spectrum=mass_spectra._ms[int(k)],
435                    precursor_mz=ms2_results_load[k][k2].attrs["precursor_mz"],
436                    spectral_similarity_search_results={},
437                )
438
439                for key in ms2_results_load[k][k2].keys() - {"precursor_mz"}:
440                    setattr(ms2_search_res, key, list(ms2_results_load[k][k2][key][:]))
441                overall_results_dict[int(k)][
442                    ms2_results_load[k][k2].attrs["precursor_mz"]
443                ] = ms2_search_res
444
445        # add to mass_spectra
446        mass_spectra.spectral_search_results.update(overall_results_dict)
447
448        # If there are mass features, associate the results with each mass feature
449        if len(mass_spectra.mass_features) > 0:
450            for mass_feature_id, mass_feature in mass_spectra.mass_features.items():
451                scan_ids = mass_feature.ms2_scan_numbers
452                for ms2_scan_id in scan_ids:
453                    precursor_mz = mass_feature.mz
454                    try:
455                        mass_spectra.spectral_search_results[ms2_scan_id][precursor_mz]
456                    except KeyError:
457                        pass
458                    else:
459                        mass_spectra.mass_features[
460                            mass_feature_id
461                        ].ms2_similarity_results.append(
462                            mass_spectra.spectral_search_results[ms2_scan_id][
463                                precursor_mz
464                            ]
465                        )
466
467    def get_mass_spectra_obj(self, load_raw=True) -> MassSpectraBase:
468        """
469        Return mass spectra data object, populating the _ms list on MassSpectraBase object from the HDF5 file.
470
471        Parameters
472        ----------
473        load_raw : bool
474            If True, load raw data (unprocessed) from HDF5 files for overall spectra object and individual mass spectra. Default is True.
475
476        """
477        # Instantiate the LCMS object
478        spectra_obj = MassSpectraBase(
479            file_location=self.file_location,
480            analyzer=self.analyzer,
481            instrument_label=self.instrument_label,
482            sample_name=self.sample_name,
483        )
484
485        # This will populate the _ms list on the LCMS or MassSpectraBase object
486        self.run(spectra_obj, load_raw=load_raw)
487
488        return spectra_obj
489
490    def get_lcms_obj(
491        self, load_raw=True, use_original_parser=True, raw_file_path=None
492    ) -> LCMSBase:
493        """
494        Return LCMSBase object, populating attributes on the LCMSBase object from the HDF5 file.
495
496        Parameters
497        ----------
498        load_raw : bool
499            If True, load raw data (unprocessed) from HDF5 files for overall lcms object and individual mass spectra. Default is True.
500        use_original_parser : bool
501            If True, use the original parser to populate the LCMS object. Default is True.
502        raw_file_path : str
503            The location of the raw file to parse if attempting to use original parser.
504            Default is None, which attempts to get the raw file path from the HDF5 file.
505            If the original file path has moved, this parameter can be used to specify the new location.
506        """
507        # Instantiate the LCMS object
508        lcms_obj = LCMSBase(
509            file_location=self.file_location,
510            analyzer=self.analyzer,
511            instrument_label=self.instrument_label,
512            sample_name=self.sample_name,
513        )
514
515        # This will populate the majority of the attributes on the LCMS object
516        self.run(lcms_obj, load_raw=load_raw)
517
518        # Set final attributes of the LCMS object
519        lcms_obj.polarity = self.h5pydata.attrs["polarity"]
520        lcms_obj._scans_number_list = list(lcms_obj.scan_df.scan)
521        lcms_obj._retention_time_list = list(lcms_obj.scan_df.scan_time)
522        lcms_obj._tic_list = list(lcms_obj.scan_df.tic)
523
524        # If use_original_parser is True, instantiate the original parser and populate the LCMS object
525        if use_original_parser:
526            lcms_obj = self.add_original_parser(lcms_obj, raw_file_path=raw_file_path)
527        else:
528            lcms_obj.spectra_parser_class = self.__class__
529
530        return lcms_obj
531
532    def get_raw_file_location(self):
533        """
534        Get the raw file location from the HDF5 file attributes.
535
536        Returns
537        -------
538        str
539            The raw file location.
540        """
541        if "original_file_location" in self.h5pydata.attrs:
542            return self.h5pydata.attrs["original_file_location"]
543        else:
544            return None
545
546    def add_original_parser(self, mass_spectra, raw_file_path=None):
547        """
548        Add the original parser to the mass spectra object.
549
550        Parameters
551        ----------
552        mass_spectra : MassSpectraBase | LCMSBase
553            The MassSpectraBase or LCMSBase object to add the original parser to.
554        raw_file_path : str
555            The location of the raw file to parse. Default is None, which attempts to get the raw file path from the HDF5 file.
556        """
557        # Get the original parser type
558        og_parser_type = self.h5pydata.attrs["parser_type"]
559
560        # If raw_file_path is None, get it from the HDF5 file attributes
561        if raw_file_path is None:
562            raw_file_path = self.get_raw_file_location()
563            if raw_file_path is None:
564                raise ValueError(
565                    "Raw file path not found in HDF5 file attributes, cannot instantiate original parser."
566                )
567
568        # Set the raw file path on the mass_spectra object so the parser knows where to find the raw file
569        mass_spectra.raw_file_location = raw_file_path
570
571        if og_parser_type == "ImportMassSpectraThermoMSFileReader":
572            # Check that the parser can be instantiated with the raw file path
573            parser = ImportMassSpectraThermoMSFileReader(raw_file_path)
574        elif og_parser_type == "MZMLSpectraParser":
575            # Check that the parser can be instantiated with the raw file path
576            parser = MZMLSpectraParser(raw_file_path)
577
578        # Set the spectra parser class on the mass_spectra object so the spectra_parser property can be used with the original parser
579        mass_spectra.spectra_parser_class = parser.__class__
580
581        return mass_spectra
582
583    def get_creation_time(self):
584        """
585        Raise a NotImplemented Warning, as creation time is not available in CoreMS HDF5 files and returning None.
586        """
587        warnings.warn(
588            "Creation time is not available in CoreMS HDF5 files, returning None."
589            "This should be accessed through the original parser.",
590        )
591        return None
592
593    def get_instrument_info(self):
594        """
595        Raise a NotImplemented Warning, as instrument info is not available in CoreMS HDF5 files and returning None.
596        """
597        warnings.warn(
598            "Instrument info is not available in CoreMS HDF5 files, returning None."
599            "This should be accessed through the original parser.",
600        )
601        return None
 26class ReadCoreMSHDFMassSpectra(
 27    SpectraParserInterface, ReadCoreMSHDF_MassSpectrum, Thread
 28):
 29    """Class to read CoreMS HDF5 files and populate a LCMS or MassSpectraBase object.
 30
 31    Parameters
 32    ----------
 33    file_location : str
 34        The location of the HDF5 file to read, including the suffix.
 35
 36    Attributes
 37    ----------
 38    file_location : str
 39        The location of the HDF5 file to read.
 40    h5pydata : h5py.File
 41        The HDF5 file object.
 42    scans : list
 43        A list of the location of individual mass spectra within the HDF5 file.
 44    scan_number_list : list
 45        A list of the scan numbers of the mass spectra within the HDF5 file.
 46    parameters_location : str
 47        The location of the parameters file (json or toml).
 48
 49    Methods
 50    -------
 51    * import_mass_spectra(mass_spectra).
 52        Imports all mass spectra from the HDF5 file onto the LCMS or MassSpectraBase object.
 53    * get_mass_spectrum_from_scan(scan_number).
 54        Return mass spectrum data object from scan number.
 55    * load().
 56        Placeholder method to meet the requirements of the SpectraParserInterface.
 57    * run(mass_spectra).
 58        Runs the importer functions to populate a LCMS or MassSpectraBase object.
 59    * import_scan_info(mass_spectra).
 60        Imports the scan info from the HDF5 file to populate the _scan_info attribute
 61        on the LCMS or MassSpectraBase object
 62    * import_ms_unprocessed(mass_spectra).
 63        Imports the unprocessed mass spectra from the HDF5 file to populate the
 64        _ms_unprocessed attribute on the LCMS or MassSpectraBase object
 65    * import_parameters(mass_spectra).
 66        Imports the parameters from the HDF5 file to populate the parameters
 67        attribute on the LCMS or MassSpectraBase object
 68    * import_mass_features(mass_spectra).
 69        Imports the mass features from the HDF5 file to populate the mass_features
 70        attribute on the LCMS or MassSpectraBase object
 71    * import_eics(mass_spectra).
 72        Imports the extracted ion chromatograms from the HDF5 file to populate the
 73        eics attribute on the LCMS or MassSpectraBase object
 74    * import_spectral_search_results(mass_spectra).
 75        Imports the spectral search results from the HDF5 file to populate the
 76        spectral_search_results attribute on the LCMS or MassSpectraBase object
 77    * get_mass_spectra_obj().
 78        Return mass spectra data object, populating the _ms list on the LCMS or
 79        MassSpectraBase object from the HDF5 file
 80    * get_lcms_obj().
 81        Return LCMSBase object, populating the majority of the attributes on the
 82        LCMS object from the HDF5 file
 83
 84    """
 85
 86    def __init__(self, file_location: str):
 87        Thread.__init__(self)
 88        ReadCoreMSHDF_MassSpectrum.__init__(self, file_location)
 89
 90        # override the scans attribute on ReadCoreMSHDF_MassSpectrum class to expect a nested location within the HDF5 file
 91        self.scans = [
 92            "mass_spectra/" + x for x in list(self.h5pydata["mass_spectra"].keys())
 93        ]
 94        self.scan_number_list = sorted(
 95            [int(float(i)) for i in list(self.h5pydata["mass_spectra"].keys())]
 96        )
 97
 98        # set the location of the parameters file (json or toml)
 99        add_files = [
100            x
101            for x in self.file_location.parent.glob(
102                self.file_location.name.replace(".hdf5", ".*")
103            )
104            if x.suffix != ".hdf5"
105        ]
106        if len([x for x in add_files if x.suffix == ".json"]) > 0:
107            self.parameters_location = [x for x in add_files if x.suffix == ".json"][0]
108        elif len([x for x in add_files if x.suffix == ".toml"]) > 0:
109            self.parameters_location = [x for x in add_files if x.suffix == ".toml"][0]
110        else:
111            self.parameters_location = None
112
113    def get_mass_spectrum_from_scan(self, scan_number):
114        """Return mass spectrum data object from scan number."""
115        if scan_number in self.scan_number_list:
116            mass_spec = self.get_mass_spectrum(scan_number)
117            return mass_spec
118        else:
119            raise Exception("Scan number not found in HDF5 file.")
120
121    def get_mass_spectra_from_scan_list(
122        self, scan_list, spectrum_mode, auto_process=True
123    ):
124        """Return a list of mass spectrum data objects from a list of scan numbers.
125
126        Parameters
127        ----------
128        scan_list : list
129            A list of scan numbers to retrieve mass spectra for.
130        spectrum_mode : str
131            The spectrum mode to use when retrieving the mass spectra.
132            Note that this parameter is not used for CoreMS HDF5 files, as the spectra are already processed and only
133            centroided spectra are saved.
134        auto_process : bool
135            If True, automatically process the mass spectra when retrieving them.
136            Note that this parameter is not used for CoreMS HDF5 files, as the spectra are already processed and only
137            centroided spectra are saved.
138
139        Returns
140        -------
141        list
142            A list of mass spectrum data objects corresponding to the provided scan numbers.
143        """
144        mass_spectra_list = []
145        for scan_number in scan_list:
146            if scan_number in self.scan_number_list:
147                mass_spec = self.get_mass_spectrum_from_scan(scan_number)
148                mass_spectra_list.append(mass_spec)
149            else:
150                warnings.warn(f"Scan number {scan_number} not found in HDF5 file.")
151        return mass_spectra_list
152
153    def load(self) -> None:
154        """ """
155        pass
156
157    def get_ms_raw(self, spectra=None, scan_df=None) -> dict:
158        """ """
159        # Warn if spectra or scan_df are not None that they are not used for CoreMS HDF5 files and should be rerun after instantiation
160        if spectra is not None or scan_df is not None:
161            SyntaxWarning(
162                "get_ms_raw method for CoreMS HDF5 files can only access saved data, consider rerunning after instantiation."
163            )
164        ms_unprocessed = {}
165        dict_group_load = self.h5pydata["ms_unprocessed"]
166        dict_group_keys = dict_group_load.keys()
167        for k in dict_group_keys:
168            ms_up_int = dict_group_load[k][:]
169            ms_unprocessed[int(k)] = pd.DataFrame(
170                ms_up_int, columns=["scan", "mz", "intensity"]
171            )
172        return ms_unprocessed
173
174    def get_scan_df(self) -> pd.DataFrame:
175        scan_info = {}
176        dict_group_load = self.h5pydata["scan_info"]
177        dict_group_keys = dict_group_load.keys()
178        for k in dict_group_keys:
179            scan_info[k] = dict_group_load[k][:]
180        scan_df = pd.DataFrame(scan_info)
181        scan_df.set_index("scan", inplace=True, drop=False)
182        str_df = scan_df.select_dtypes([object])
183        str_df = str_df.stack().str.decode("utf-8").unstack()
184        for col in str_df:
185            scan_df[col] = str_df[col]
186        return scan_df
187
188    def run(self, mass_spectra, load_raw=True) -> None:
189        """Runs the importer functions to populate a LCMS or MassSpectraBase object.
190
191        Notes
192        -----
193        The following functions are run in order, if the HDF5 file contains the necessary data:
194        1. import_parameters(), which populates the parameters attribute on the LCMS or MassSpectraBase object.
195        2. import_mass_spectra(), which populates the _ms list on the LCMS or MassSpectraBase object.
196        3. import_scan_info(), which populates the _scan_info on the LCMS or MassSpectraBase object.
197        4. import_ms_unprocessed(), which populates the _ms_unprocessed attribute on the LCMS or MassSpectraBase object.
198        5. import_mass_features(), which populates the mass_features attribute on the LCMS or MassSpectraBase object.
199        6. import_eics(), which populates the eics attribute on the LCMS or MassSpectraBase object.
200        7. import_spectral_search_results(), which populates the spectral_search_results attribute on the LCMS or MassSpectraBase object.
201
202        Parameters
203        ----------
204        mass_spectra : LCMSBase or MassSpectraBase
205            The LCMS or MassSpectraBase object to populate with mass spectra, generally instantiated with only the file_location, analyzer, and instrument_label attributes.
206        load_raw : bool
207            If True, load raw data (unprocessed) from HDF5 files for overall lcms object and individual mass spectra. Default is True.
208        Returns
209        -------
210        None, but populates several attributes on the LCMS or MassSpectraBase object.
211
212        """
213        if self.parameters_location is not None:
214            # Populate the parameters attribute on the LCMS object
215            self.import_parameters(mass_spectra)
216
217        if "mass_spectra" in self.h5pydata:
218            # Populate the _ms list on the LCMS object
219            self.import_mass_spectra(mass_spectra, load_raw=load_raw)
220
221        if "scan_info" in self.h5pydata:
222            # Populate the _scan_info attribute on the LCMS object
223            self.import_scan_info(mass_spectra)
224
225        if "ms_unprocessed" in self.h5pydata and load_raw:
226            # Populate the _ms_unprocessed attribute on the LCMS object
227            self.import_ms_unprocessed(mass_spectra)
228
229        if "mass_features" in self.h5pydata:
230            # Populate the mass_features attribute on the LCMS object
231            self.import_mass_features(mass_spectra)
232
233        if "eics" in self.h5pydata:
234            # Populate the eics attribute on the LCMS object
235            self.import_eics(mass_spectra)
236
237        if "spectral_search_results" in self.h5pydata:
238            # Populate the spectral_search_results attribute on the LCMS object
239            self.import_spectral_search_results(mass_spectra)
240
241    def import_mass_spectra(self, mass_spectra, load_raw=True) -> None:
242        """Imports all mass spectra from the HDF5 file.
243
244        Parameters
245        ----------
246        mass_spectra : LCMSBase | MassSpectraBase
247            The MassSpectraBase or LCMSBase object to populate with mass spectra.
248        load_raw : bool
249            If True, load raw data (unprocessed) from HDF5 files for overall lcms object and individual mass spectra. Default
250
251        Returns
252        -------
253        None, but populates the '_ms' list on the LCMSBase or MassSpectraBase
254        object with mass spectra from the HDF5 file.
255        """
256        for scan_number in self.scan_number_list:
257            mass_spec = self.get_mass_spectrum(scan_number, load_raw=load_raw)
258            mass_spec.scan_number = scan_number
259            mass_spectra.add_mass_spectrum(mass_spec)
260
261    def import_scan_info(self, mass_spectra) -> None:
262        """Imports the scan info from the HDF5 file.
263
264        Parameters
265        ----------
266        lcms : LCMSBase | MassSpectraBase
267            The MassSpectraBase or LCMSBase objects
268
269        Returns
270        -------
271        None, but populates the 'scan_df' attribute on the LCMSBase or MassSpectraBase
272        object with a pandas DataFrame of the 'scan_info' from the HDF5 file.
273
274        """
275        scan_df = self.get_scan_df()
276        mass_spectra.scan_df = scan_df
277
278    def import_ms_unprocessed(self, mass_spectra) -> None:
279        """Imports the unprocessed mass spectra from the HDF5 file.
280
281        Parameters
282        ----------
283        lcms : LCMSBase | MassSpectraBase
284            The MassSpectraBase or LCMSBase objects
285
286        Returns
287        -------
288        None, but populates the '_ms_unprocessed' attribute on the LCMSBase or MassSpectraBase
289        object with a dictionary of the 'ms_unprocessed' from the HDF5 file.
290
291        """
292        ms_unprocessed = self.get_ms_raw()
293        mass_spectra._ms_unprocessed = ms_unprocessed
294
295    def import_parameters(self, mass_spectra) -> None:
296        """Imports the parameters from the HDF5 file.
297
298        Parameters
299        ----------
300        mass_spectra : LCMSBase | MassSpectraBase
301            The MassSpectraBase or LCMSBase object to populate with parameters.
302
303        Returns
304        -------
305        None, but populates the 'parameters' attribute on the LCMS or MassSpectraBase
306        object with a dictionary of the 'parameters' from the HDF5 file.
307
308        """
309        if ".json" == self.parameters_location.suffix:
310            load_and_set_json_parameters_lcms(mass_spectra, self.parameters_location)
311        if ".toml" == self.parameters_location.suffix:
312            load_and_set_toml_parameters_lcms(mass_spectra, self.parameters_location)
313        else:
314            raise Exception(
315                "Parameters file must be in JSON format, TOML format is not yet supported."
316            )
317
318    def import_mass_features(self, mass_spectra) -> None:
319        """Imports the mass features from the HDF5 file.
320
321        Parameters
322        ----------
323        mass_spectra : LCMSBase | MassSpectraBase
324            The MassSpectraBase or LCMSBase object to populate with mass features.
325
326        Returns
327        -------
328        None, but populates the 'mass_features' attribute on the LCMSBase or MassSpectraBase
329        object with a dictionary of the 'mass_features' from the HDF5 file.
330
331        """
332        dict_group_load = self.h5pydata["mass_features"]
333        dict_group_keys = dict_group_load.keys()
334        for k in dict_group_keys:
335            # Instantiate the MassFeature object
336            mass_feature = LCMSMassFeature(
337                mass_spectra,
338                mz=dict_group_load[k].attrs["_mz_exp"],
339                retention_time=dict_group_load[k].attrs["_retention_time"],
340                intensity=dict_group_load[k].attrs["_intensity"],
341                apex_scan=dict_group_load[k].attrs["_apex_scan"],
342                persistence=dict_group_load[k].attrs["_persistence"],
343                id=int(k),
344            )
345
346            # Populate additional attributes on the MassFeature object
347            for key in dict_group_load[k].attrs.keys() - {
348                "_mz_exp",
349                "_mz_cal",
350                "_retention_time",
351                "_intensity",
352                "_apex_scan",
353                "_persistence",
354            }:
355                setattr(mass_feature, key, dict_group_load[k].attrs[key])
356
357            # Populate attributes on MassFeature object that are lists
358            for key in dict_group_load[k].keys():
359                setattr(mass_feature, key, dict_group_load[k][key][:])
360                # Convert _noise_score from array to tuple
361                if key == "_noise_score":
362                    mass_feature._noise_score = tuple(mass_feature._noise_score)
363            mass_spectra.mass_features[int(k)] = mass_feature
364
365        # Associate mass features with ms1 and ms2 spectra, if available
366        for mf_id in mass_spectra.mass_features.keys():
367            if mass_spectra.mass_features[mf_id].apex_scan in mass_spectra._ms.keys():
368                mass_spectra.mass_features[mf_id].mass_spectrum = mass_spectra._ms[
369                    mass_spectra.mass_features[mf_id].apex_scan
370                ]
371            if mass_spectra.mass_features[mf_id].ms2_scan_numbers is not None:
372                for ms2_scan in mass_spectra.mass_features[mf_id].ms2_scan_numbers:
373                    if ms2_scan in mass_spectra._ms.keys():
374                        mass_spectra.mass_features[mf_id].ms2_mass_spectra[ms2_scan] = (
375                            mass_spectra._ms[ms2_scan]
376                        )
377
378    def import_eics(self, mass_spectra):
379        """Imports the extracted ion chromatograms from the HDF5 file.
380
381        Parameters
382        ----------
383        mass_spectra : LCMSBase | MassSpectraBase
384            The MassSpectraBase or LCMSBase object to populate with extracted ion chromatograms.
385
386        Returns
387        -------
388        None, but populates the 'eics' attribute on the LCMSBase or MassSpectraBase
389        object with a dictionary of the 'eics' from the HDF5 file.
390
391        """
392        dict_group_load = self.h5pydata["eics"]
393        dict_group_keys = dict_group_load.keys()
394        for k in dict_group_keys:
395            my_eic = EIC_Data(
396                scans=dict_group_load[k]["scans"][:],
397                time=dict_group_load[k]["time"][:],
398                eic=dict_group_load[k]["eic"][:],
399            )
400            for key in dict_group_load[k].keys():
401                if key not in ["scans", "time", "eic"]:
402                    setattr(my_eic, key, dict_group_load[k][key][:])
403                    # if key is apexes, convert to a tuple of a list
404                    if key == "apexes" and len(my_eic.apexes) > 0:
405                        my_eic.apexes = [tuple(x) for x in my_eic.apexes]
406            # Add to mass_spectra object
407            mass_spectra.eics[dict_group_load[k].attrs["mz"]] = my_eic
408
409        # Add to mass features
410        for idx in mass_spectra.mass_features.keys():
411            mz = mass_spectra.mass_features[idx].mz
412            if mz in mass_spectra.eics.keys():
413                mass_spectra.mass_features[idx]._eic_data = mass_spectra.eics[mz]
414
415    def import_spectral_search_results(self, mass_spectra):
416        """Imports the spectral search results from the HDF5 file.
417
418        Parameters
419        ----------
420        mass_spectra : LCMSBase | MassSpectraBase
421            The MassSpectraBase or LCMSBase object to populate with spectral search results.
422
423        Returns
424        -------
425        None, but populates the 'spectral_search_results' attribute on the LCMSBase or MassSpectraBase
426        object with a dictionary of the 'spectral_search_results' from the HDF5 file.
427
428        """
429        overall_results_dict = {}
430        ms2_results_load = self.h5pydata["spectral_search_results"]
431        for k in ms2_results_load.keys():
432            overall_results_dict[int(k)] = {}
433            for k2 in ms2_results_load[k].keys():
434                ms2_search_res = SpectrumSearchResults(
435                    query_spectrum=mass_spectra._ms[int(k)],
436                    precursor_mz=ms2_results_load[k][k2].attrs["precursor_mz"],
437                    spectral_similarity_search_results={},
438                )
439
440                for key in ms2_results_load[k][k2].keys() - {"precursor_mz"}:
441                    setattr(ms2_search_res, key, list(ms2_results_load[k][k2][key][:]))
442                overall_results_dict[int(k)][
443                    ms2_results_load[k][k2].attrs["precursor_mz"]
444                ] = ms2_search_res
445
446        # add to mass_spectra
447        mass_spectra.spectral_search_results.update(overall_results_dict)
448
449        # If there are mass features, associate the results with each mass feature
450        if len(mass_spectra.mass_features) > 0:
451            for mass_feature_id, mass_feature in mass_spectra.mass_features.items():
452                scan_ids = mass_feature.ms2_scan_numbers
453                for ms2_scan_id in scan_ids:
454                    precursor_mz = mass_feature.mz
455                    try:
456                        mass_spectra.spectral_search_results[ms2_scan_id][precursor_mz]
457                    except KeyError:
458                        pass
459                    else:
460                        mass_spectra.mass_features[
461                            mass_feature_id
462                        ].ms2_similarity_results.append(
463                            mass_spectra.spectral_search_results[ms2_scan_id][
464                                precursor_mz
465                            ]
466                        )
467
468    def get_mass_spectra_obj(self, load_raw=True) -> MassSpectraBase:
469        """
470        Return mass spectra data object, populating the _ms list on MassSpectraBase object from the HDF5 file.
471
472        Parameters
473        ----------
474        load_raw : bool
475            If True, load raw data (unprocessed) from HDF5 files for overall spectra object and individual mass spectra. Default is True.
476
477        """
478        # Instantiate the LCMS object
479        spectra_obj = MassSpectraBase(
480            file_location=self.file_location,
481            analyzer=self.analyzer,
482            instrument_label=self.instrument_label,
483            sample_name=self.sample_name,
484        )
485
486        # This will populate the _ms list on the LCMS or MassSpectraBase object
487        self.run(spectra_obj, load_raw=load_raw)
488
489        return spectra_obj
490
491    def get_lcms_obj(
492        self, load_raw=True, use_original_parser=True, raw_file_path=None
493    ) -> LCMSBase:
494        """
495        Return LCMSBase object, populating attributes on the LCMSBase object from the HDF5 file.
496
497        Parameters
498        ----------
499        load_raw : bool
500            If True, load raw data (unprocessed) from HDF5 files for overall lcms object and individual mass spectra. Default is True.
501        use_original_parser : bool
502            If True, use the original parser to populate the LCMS object. Default is True.
503        raw_file_path : str
504            The location of the raw file to parse if attempting to use original parser.
505            Default is None, which attempts to get the raw file path from the HDF5 file.
506            If the original file path has moved, this parameter can be used to specify the new location.
507        """
508        # Instantiate the LCMS object
509        lcms_obj = LCMSBase(
510            file_location=self.file_location,
511            analyzer=self.analyzer,
512            instrument_label=self.instrument_label,
513            sample_name=self.sample_name,
514        )
515
516        # This will populate the majority of the attributes on the LCMS object
517        self.run(lcms_obj, load_raw=load_raw)
518
519        # Set final attributes of the LCMS object
520        lcms_obj.polarity = self.h5pydata.attrs["polarity"]
521        lcms_obj._scans_number_list = list(lcms_obj.scan_df.scan)
522        lcms_obj._retention_time_list = list(lcms_obj.scan_df.scan_time)
523        lcms_obj._tic_list = list(lcms_obj.scan_df.tic)
524
525        # If use_original_parser is True, instantiate the original parser and populate the LCMS object
526        if use_original_parser:
527            lcms_obj = self.add_original_parser(lcms_obj, raw_file_path=raw_file_path)
528        else:
529            lcms_obj.spectra_parser_class = self.__class__
530
531        return lcms_obj
532
533    def get_raw_file_location(self):
534        """
535        Get the raw file location from the HDF5 file attributes.
536
537        Returns
538        -------
539        str
540            The raw file location.
541        """
542        if "original_file_location" in self.h5pydata.attrs:
543            return self.h5pydata.attrs["original_file_location"]
544        else:
545            return None
546
547    def add_original_parser(self, mass_spectra, raw_file_path=None):
548        """
549        Add the original parser to the mass spectra object.
550
551        Parameters
552        ----------
553        mass_spectra : MassSpectraBase | LCMSBase
554            The MassSpectraBase or LCMSBase object to add the original parser to.
555        raw_file_path : str
556            The location of the raw file to parse. Default is None, which attempts to get the raw file path from the HDF5 file.
557        """
558        # Get the original parser type
559        og_parser_type = self.h5pydata.attrs["parser_type"]
560
561        # If raw_file_path is None, get it from the HDF5 file attributes
562        if raw_file_path is None:
563            raw_file_path = self.get_raw_file_location()
564            if raw_file_path is None:
565                raise ValueError(
566                    "Raw file path not found in HDF5 file attributes, cannot instantiate original parser."
567                )
568
569        # Set the raw file path on the mass_spectra object so the parser knows where to find the raw file
570        mass_spectra.raw_file_location = raw_file_path
571
572        if og_parser_type == "ImportMassSpectraThermoMSFileReader":
573            # Check that the parser can be instantiated with the raw file path
574            parser = ImportMassSpectraThermoMSFileReader(raw_file_path)
575        elif og_parser_type == "MZMLSpectraParser":
576            # Check that the parser can be instantiated with the raw file path
577            parser = MZMLSpectraParser(raw_file_path)
578
579        # Set the spectra parser class on the mass_spectra object so the spectra_parser property can be used with the original parser
580        mass_spectra.spectra_parser_class = parser.__class__
581
582        return mass_spectra
583
584    def get_creation_time(self):
585        """
586        Raise a NotImplemented Warning, as creation time is not available in CoreMS HDF5 files and returning None.
587        """
588        warnings.warn(
589            "Creation time is not available in CoreMS HDF5 files, returning None."
590            "This should be accessed through the original parser.",
591        )
592        return None
593
594    def get_instrument_info(self):
595        """
596        Raise a NotImplemented Warning, as instrument info is not available in CoreMS HDF5 files and returning None.
597        """
598        warnings.warn(
599            "Instrument info is not available in CoreMS HDF5 files, returning None."
600            "This should be accessed through the original parser.",
601        )
602        return None

Class to read CoreMS HDF5 files and populate a LCMS or MassSpectraBase object.

Parameters
  • file_location (str): The location of the HDF5 file to read, including the suffix.
Attributes
  • file_location (str): The location of the HDF5 file to read.
  • h5pydata (h5py.File): The HDF5 file object.
  • scans (list): A list of the location of individual mass spectra within the HDF5 file.
  • scan_number_list (list): A list of the scan numbers of the mass spectra within the HDF5 file.
  • parameters_location (str): The location of the parameters file (json or toml).
Methods
  • import_mass_spectra(mass_spectra). Imports all mass spectra from the HDF5 file onto the LCMS or MassSpectraBase object.
  • get_mass_spectrum_from_scan(scan_number). Return mass spectrum data object from scan number.
  • load(). Placeholder method to meet the requirements of the SpectraParserInterface.
  • run(mass_spectra). Runs the importer functions to populate a LCMS or MassSpectraBase object.
  • import_scan_info(mass_spectra). Imports the scan info from the HDF5 file to populate the scan_df attribute on the LCMS or MassSpectraBase object
  • import_ms_unprocessed(mass_spectra). Imports the unprocessed mass spectra from the HDF5 file to populate the _ms_unprocessed attribute on the LCMS or MassSpectraBase object
  • import_parameters(mass_spectra). Imports the parameters from the HDF5 file to populate the parameters attribute on the LCMS or MassSpectraBase object
  • import_mass_features(mass_spectra). Imports the mass features from the HDF5 file to populate the mass_features attribute on the LCMS or MassSpectraBase object
  • import_eics(mass_spectra). Imports the extracted ion chromatograms from the HDF5 file to populate the eics attribute on the LCMS or MassSpectraBase object
  • import_spectral_search_results(mass_spectra). Imports the spectral search results from the HDF5 file to populate the spectral_search_results attribute on the LCMS or MassSpectraBase object
  • get_mass_spectra_obj(). Return mass spectra data object, populating the _ms list on the LCMS or MassSpectraBase object from the HDF5 file
  • get_lcms_obj(). Return LCMSBase object, populating the majority of the attributes on the LCMS object from the HDF5 file
ReadCoreMSHDFMassSpectra(file_location: str)
 86    def __init__(self, file_location: str):
 87        Thread.__init__(self)
 88        ReadCoreMSHDF_MassSpectrum.__init__(self, file_location)
 89
 90        # override the scans attribute on ReadCoreMSHDF_MassSpectrum class to expect a nested location within the HDF5 file
 91        self.scans = [
 92            "mass_spectra/" + x for x in list(self.h5pydata["mass_spectra"].keys())
 93        ]
 94        self.scan_number_list = sorted(
 95            [int(float(i)) for i in list(self.h5pydata["mass_spectra"].keys())]
 96        )
 97
 98        # set the location of the parameters file (json or toml)
 99        add_files = [
100            x
101            for x in self.file_location.parent.glob(
102                self.file_location.name.replace(".hdf5", ".*")
103            )
104            if x.suffix != ".hdf5"
105        ]
106        if len([x for x in add_files if x.suffix == ".json"]) > 0:
107            self.parameters_location = [x for x in add_files if x.suffix == ".json"][0]
108        elif len([x for x in add_files if x.suffix == ".toml"]) > 0:
109            self.parameters_location = [x for x in add_files if x.suffix == ".toml"][0]
110        else:
111            self.parameters_location = None

This constructor should always be called with keyword arguments. Arguments are:

group should be None; reserved for future extension when a ThreadGroup class is implemented.

target is the callable object to be invoked by the run() method. Defaults to None, meaning nothing is called.

name is the thread name. By default, a unique name is constructed of the form "Thread-N" where N is a small decimal number.

args is the argument tuple for the target invocation. Defaults to ().

kwargs is a dictionary of keyword arguments for the target invocation. Defaults to {}.

If a subclass overrides the constructor, it must make sure to invoke the base class constructor (Thread.__init__()) before doing anything else to the thread.

scans
scan_number_list
def get_mass_spectrum_from_scan(self, scan_number):
113    def get_mass_spectrum_from_scan(self, scan_number):
114        """Return mass spectrum data object from scan number."""
115        if scan_number in self.scan_number_list:
116            mass_spec = self.get_mass_spectrum(scan_number)
117            return mass_spec
118        else:
119            raise Exception("Scan number not found in HDF5 file.")

Return mass spectrum data object from scan number.

def get_mass_spectra_from_scan_list(self, scan_list, spectrum_mode, auto_process=True):
121    def get_mass_spectra_from_scan_list(
122        self, scan_list, spectrum_mode, auto_process=True
123    ):
124        """Return a list of mass spectrum data objects from a list of scan numbers.
125
126        Parameters
127        ----------
128        scan_list : list
129            A list of scan numbers to retrieve mass spectra for.
130        spectrum_mode : str
131            The spectrum mode to use when retrieving the mass spectra.
132            Note that this parameter is not used for CoreMS HDF5 files, as the spectra are already processed and only
133            centroided spectra are saved.
134        auto_process : bool
135            If True, automatically process the mass spectra when retrieving them.
136            Note that this parameter is not used for CoreMS HDF5 files, as the spectra are already processed and only
137            centroided spectra are saved.
138
139        Returns
140        -------
141        list
142            A list of mass spectrum data objects corresponding to the provided scan numbers.
143        """
144        mass_spectra_list = []
145        for scan_number in scan_list:
146            if scan_number in self.scan_number_list:
147                mass_spec = self.get_mass_spectrum_from_scan(scan_number)
148                mass_spectra_list.append(mass_spec)
149            else:
150                warnings.warn(f"Scan number {scan_number} not found in HDF5 file.")
151        return mass_spectra_list

Return a list of mass spectrum data objects from a list of scan numbers.

Parameters
  • scan_list (list): A list of scan numbers to retrieve mass spectra for.
  • spectrum_mode (str): The spectrum mode to use when retrieving the mass spectra. Note that this parameter is not used for CoreMS HDF5 files, as the spectra are already processed and only centroided spectra are saved.
  • auto_process (bool): If True, automatically process the mass spectra when retrieving them. Note that this parameter is not used for CoreMS HDF5 files, as the spectra are already processed and only centroided spectra are saved.
Returns
  • list: A list of mass spectrum data objects corresponding to the provided scan numbers.
def load(self) -> None:
153    def load(self) -> None:
154        """ """
155        pass
def get_ms_raw(self, spectra=None, scan_df=None) -> dict:
157    def get_ms_raw(self, spectra=None, scan_df=None) -> dict:
158        """ """
159        # Warn if spectra or scan_df are not None that they are not used for CoreMS HDF5 files and should be rerun after instantiation
160        if spectra is not None or scan_df is not None:
161            SyntaxWarning(
162                "get_ms_raw method for CoreMS HDF5 files can only access saved data, consider rerunning after instantiation."
163            )
164        ms_unprocessed = {}
165        dict_group_load = self.h5pydata["ms_unprocessed"]
166        dict_group_keys = dict_group_load.keys()
167        for k in dict_group_keys:
168            ms_up_int = dict_group_load[k][:]
169            ms_unprocessed[int(k)] = pd.DataFrame(
170                ms_up_int, columns=["scan", "mz", "intensity"]
171            )
172        return ms_unprocessed
def get_scan_df(self) -> pandas.core.frame.DataFrame:
174    def get_scan_df(self) -> pd.DataFrame:
175        scan_info = {}
176        dict_group_load = self.h5pydata["scan_info"]
177        dict_group_keys = dict_group_load.keys()
178        for k in dict_group_keys:
179            scan_info[k] = dict_group_load[k][:]
180        scan_df = pd.DataFrame(scan_info)
181        scan_df.set_index("scan", inplace=True, drop=False)
182        str_df = scan_df.select_dtypes([object])
183        str_df = str_df.stack().str.decode("utf-8").unstack()
184        for col in str_df:
185            scan_df[col] = str_df[col]
186        return scan_df

Return scan data as a pandas DataFrame.

def run(self, mass_spectra, load_raw=True) -> None:
188    def run(self, mass_spectra, load_raw=True) -> None:
189        """Runs the importer functions to populate a LCMS or MassSpectraBase object.
190
191        Notes
192        -----
193        The following functions are run in order, if the HDF5 file contains the necessary data:
194        1. import_parameters(), which populates the parameters attribute on the LCMS or MassSpectraBase object.
195        2. import_mass_spectra(), which populates the _ms list on the LCMS or MassSpectraBase object.
196        3. import_scan_info(), which populates the _scan_info on the LCMS or MassSpectraBase object.
197        4. import_ms_unprocessed(), which populates the _ms_unprocessed attribute on the LCMS or MassSpectraBase object.
198        5. import_mass_features(), which populates the mass_features attribute on the LCMS or MassSpectraBase object.
199        6. import_eics(), which populates the eics attribute on the LCMS or MassSpectraBase object.
200        7. import_spectral_search_results(), which populates the spectral_search_results attribute on the LCMS or MassSpectraBase object.
201
202        Parameters
203        ----------
204        mass_spectra : LCMSBase or MassSpectraBase
205            The LCMS or MassSpectraBase object to populate with mass spectra, generally instantiated with only the file_location, analyzer, and instrument_label attributes.
206        load_raw : bool
207            If True, load raw data (unprocessed) from HDF5 files for overall lcms object and individual mass spectra. Default is True.
208        Returns
209        -------
210        None, but populates several attributes on the LCMS or MassSpectraBase object.
211
212        """
213        if self.parameters_location is not None:
214            # Populate the parameters attribute on the LCMS object
215            self.import_parameters(mass_spectra)
216
217        if "mass_spectra" in self.h5pydata:
218            # Populate the _ms list on the LCMS object
219            self.import_mass_spectra(mass_spectra, load_raw=load_raw)
220
221        if "scan_info" in self.h5pydata:
222            # Populate the _scan_info attribute on the LCMS object
223            self.import_scan_info(mass_spectra)
224
225        if "ms_unprocessed" in self.h5pydata and load_raw:
226            # Populate the _ms_unprocessed attribute on the LCMS object
227            self.import_ms_unprocessed(mass_spectra)
228
229        if "mass_features" in self.h5pydata:
230            # Populate the mass_features attribute on the LCMS object
231            self.import_mass_features(mass_spectra)
232
233        if "eics" in self.h5pydata:
234            # Populate the eics attribute on the LCMS object
235            self.import_eics(mass_spectra)
236
237        if "spectral_search_results" in self.h5pydata:
238            # Populate the spectral_search_results attribute on the LCMS object
239            self.import_spectral_search_results(mass_spectra)

Runs the importer functions to populate a LCMS or MassSpectraBase object.

Notes

The following functions are run in order, if the HDF5 file contains the necessary data:

  1. import_parameters(), which populates the parameters attribute on the LCMS or MassSpectraBase object.
  2. import_mass_spectra(), which populates the _ms list on the LCMS or MassSpectraBase object.
  3. import_scan_info(), which populates the _scan_info on the LCMS or MassSpectraBase object.
  4. import_ms_unprocessed(), which populates the _ms_unprocessed attribute on the LCMS or MassSpectraBase object.
  5. import_mass_features(), which populates the mass_features attribute on the LCMS or MassSpectraBase object.
  6. import_eics(), which populates the eics attribute on the LCMS or MassSpectraBase object.
  7. import_spectral_search_results(), which populates the spectral_search_results attribute on the LCMS or MassSpectraBase object.
Parameters
  • mass_spectra (LCMSBase or MassSpectraBase): The LCMS or MassSpectraBase object to populate with mass spectra, generally instantiated with only the file_location, analyzer, and instrument_label attributes.
  • load_raw (bool): If True, load raw data (unprocessed) from HDF5 files for overall lcms object and individual mass spectra. Default is True.
Returns
  • None, but populates several attributes on the LCMS or MassSpectraBase object.
def import_mass_spectra(self, mass_spectra, load_raw=True) -> None:
241    def import_mass_spectra(self, mass_spectra, load_raw=True) -> None:
242        """Imports all mass spectra from the HDF5 file.
243
244        Parameters
245        ----------
246        mass_spectra : LCMSBase | MassSpectraBase
247            The MassSpectraBase or LCMSBase object to populate with mass spectra.
248        load_raw : bool
249            If True, load raw data (unprocessed) from HDF5 files for overall lcms object and individual mass spectra. Default
250
251        Returns
252        -------
253        None, but populates the '_ms' list on the LCMSBase or MassSpectraBase
254        object with mass spectra from the HDF5 file.
255        """
256        for scan_number in self.scan_number_list:
257            mass_spec = self.get_mass_spectrum(scan_number, load_raw=load_raw)
258            mass_spec.scan_number = scan_number
259            mass_spectra.add_mass_spectrum(mass_spec)

Imports all mass spectra from the HDF5 file.

Parameters
  • mass_spectra (LCMSBase | MassSpectraBase): The MassSpectraBase or LCMSBase object to populate with mass spectra.
  • load_raw (bool): If True, load raw data (unprocessed) from HDF5 files for overall lcms object and individual mass spectra. Default is True.
Returns
  • None, but populates the '_ms' list on the LCMSBase or MassSpectraBase object with mass spectra from the HDF5 file.
def import_scan_info(self, mass_spectra) -> None:
261    def import_scan_info(self, mass_spectra) -> None:
262        """Imports the scan info from the HDF5 file.
263
264        Parameters
265        ----------
266        lcms : LCMSBase | MassSpectraBase
267            The MassSpectraBase or LCMSBase objects
268
269        Returns
270        -------
271        None, but populates the 'scan_df' attribute on the LCMSBase or MassSpectraBase
272        object with a pandas DataFrame of the 'scan_info' from the HDF5 file.
273
274        """
275        scan_df = self.get_scan_df()
276        mass_spectra.scan_df = scan_df

Imports the scan info from the HDF5 file.

Parameters
  • mass_spectra (LCMSBase | MassSpectraBase): The MassSpectraBase or LCMSBase object to populate with scan info.
Returns
  • None, but populates the 'scan_df' attribute on the LCMSBase or MassSpectraBase object with a pandas DataFrame of the 'scan_info' from the HDF5 file.
def import_ms_unprocessed(self, mass_spectra) -> None:
278    def import_ms_unprocessed(self, mass_spectra) -> None:
279        """Imports the unprocessed mass spectra from the HDF5 file.
280
281        Parameters
282        ----------
283        lcms : LCMSBase | MassSpectraBase
284            The MassSpectraBase or LCMSBase objects
285
286        Returns
287        -------
288        None, but populates the '_ms_unprocessed' attribute on the LCMSBase or MassSpectraBase
289        object with a dictionary of the 'ms_unprocessed' from the HDF5 file.
290
291        """
292        ms_unprocessed = self.get_ms_raw()
293        mass_spectra._ms_unprocessed = ms_unprocessed

Imports the unprocessed mass spectra from the HDF5 file.

Parameters
  • mass_spectra (LCMSBase | MassSpectraBase): The MassSpectraBase or LCMSBase object to populate with unprocessed mass spectra.
Returns
  • None, but populates the '_ms_unprocessed' attribute on the LCMSBase or MassSpectraBase object with a dictionary of the 'ms_unprocessed' from the HDF5 file.
def import_parameters(self, mass_spectra) -> None:
295    def import_parameters(self, mass_spectra) -> None:
296        """Imports the parameters from the HDF5 file.
297
298        Parameters
299        ----------
300        mass_spectra : LCMSBase | MassSpectraBase
301            The MassSpectraBase or LCMSBase object to populate with parameters.
302
303        Returns
304        -------
305        None, but populates the 'parameters' attribute on the LCMS or MassSpectraBase
306        object with a dictionary of the 'parameters' from the HDF5 file.
307
308        """
309        if ".json" == self.parameters_location.suffix:
310            load_and_set_json_parameters_lcms(mass_spectra, self.parameters_location)
311        if ".toml" == self.parameters_location.suffix:
312            load_and_set_toml_parameters_lcms(mass_spectra, self.parameters_location)
313        else:
314            raise Exception(
315                "Parameters file must be in JSON format, TOML format is not yet supported."
316            )

Imports the parameters from the parameters file (JSON or TOML) at parameters_location.

Parameters
  • mass_spectra (LCMSBase | MassSpectraBase): The MassSpectraBase or LCMSBase object to populate with parameters.
Returns
  • None, but populates the 'parameters' attribute on the LCMS or MassSpectraBase object with a dictionary of the 'parameters' from the parameters file.
def import_mass_features(self, mass_spectra) -> None:
318    def import_mass_features(self, mass_spectra) -> None:
319        """Imports the mass features from the HDF5 file.
320
321        Parameters
322        ----------
323        mass_spectra : LCMSBase | MassSpectraBase
324            The MassSpectraBase or LCMSBase object to populate with mass features.
325
326        Returns
327        -------
328        None, but populates the 'mass_features' attribute on the LCMSBase or MassSpectraBase
329        object with a dictionary of the 'mass_features' from the HDF5 file.
330
331        """
332        dict_group_load = self.h5pydata["mass_features"]
333        dict_group_keys = dict_group_load.keys()
334        for k in dict_group_keys:
335            # Instantiate the MassFeature object
336            mass_feature = LCMSMassFeature(
337                mass_spectra,
338                mz=dict_group_load[k].attrs["_mz_exp"],
339                retention_time=dict_group_load[k].attrs["_retention_time"],
340                intensity=dict_group_load[k].attrs["_intensity"],
341                apex_scan=dict_group_load[k].attrs["_apex_scan"],
342                persistence=dict_group_load[k].attrs["_persistence"],
343                id=int(k),
344            )
345
346            # Populate additional attributes on the MassFeature object
347            for key in dict_group_load[k].attrs.keys() - {
348                "_mz_exp",
349                "_mz_cal",
350                "_retention_time",
351                "_intensity",
352                "_apex_scan",
353                "_persistence",
354            }:
355                setattr(mass_feature, key, dict_group_load[k].attrs[key])
356
357            # Populate attributes on MassFeature object that are lists
358            for key in dict_group_load[k].keys():
359                setattr(mass_feature, key, dict_group_load[k][key][:])
360                # Convert _noise_score from array to tuple
361                if key == "_noise_score":
362                    mass_feature._noise_score = tuple(mass_feature._noise_score)
363            mass_spectra.mass_features[int(k)] = mass_feature
364
365        # Associate mass features with ms1 and ms2 spectra, if available
366        for mf_id in mass_spectra.mass_features.keys():
367            if mass_spectra.mass_features[mf_id].apex_scan in mass_spectra._ms.keys():
368                mass_spectra.mass_features[mf_id].mass_spectrum = mass_spectra._ms[
369                    mass_spectra.mass_features[mf_id].apex_scan
370                ]
371            if mass_spectra.mass_features[mf_id].ms2_scan_numbers is not None:
372                for ms2_scan in mass_spectra.mass_features[mf_id].ms2_scan_numbers:
373                    if ms2_scan in mass_spectra._ms.keys():
374                        mass_spectra.mass_features[mf_id].ms2_mass_spectra[ms2_scan] = (
375                            mass_spectra._ms[ms2_scan]
376                        )

Imports the mass features from the HDF5 file.

Parameters
  • mass_spectra (LCMSBase | MassSpectraBase): The MassSpectraBase or LCMSBase object to populate with mass features.
Returns
  • None, but populates the 'mass_features' attribute on the LCMSBase or MassSpectraBase object with a dictionary of the 'mass_features' from the HDF5 file.
def import_eics(self, mass_spectra):
378    def import_eics(self, mass_spectra):
379        """Imports the extracted ion chromatograms from the HDF5 file.
380
381        Parameters
382        ----------
383        mass_spectra : LCMSBase | MassSpectraBase
384            The MassSpectraBase or LCMSBase object to populate with extracted ion chromatograms.
385
386        Returns
387        -------
388        None, but populates the 'eics' attribute on the LCMSBase or MassSpectraBase
389        object with a dictionary of the 'eics' from the HDF5 file.
390
391        """
392        dict_group_load = self.h5pydata["eics"]
393        dict_group_keys = dict_group_load.keys()
394        for k in dict_group_keys:
395            my_eic = EIC_Data(
396                scans=dict_group_load[k]["scans"][:],
397                time=dict_group_load[k]["time"][:],
398                eic=dict_group_load[k]["eic"][:],
399            )
400            for key in dict_group_load[k].keys():
401                if key not in ["scans", "time", "eic"]:
402                    setattr(my_eic, key, dict_group_load[k][key][:])
403                    # if key is apexes, convert to a tuple of a list
404                    if key == "apexes" and len(my_eic.apexes) > 0:
405                        my_eic.apexes = [tuple(x) for x in my_eic.apexes]
406            # Add to mass_spectra object
407            mass_spectra.eics[dict_group_load[k].attrs["mz"]] = my_eic
408
409        # Add to mass features
410        for idx in mass_spectra.mass_features.keys():
411            mz = mass_spectra.mass_features[idx].mz
412            if mz in mass_spectra.eics.keys():
413                mass_spectra.mass_features[idx]._eic_data = mass_spectra.eics[mz]

Imports the extracted ion chromatograms from the HDF5 file.

Parameters
  • mass_spectra (LCMSBase | MassSpectraBase): The MassSpectraBase or LCMSBase object to populate with extracted ion chromatograms.
Returns
  • None, but populates the 'eics' attribute on the LCMSBase or MassSpectraBase object with a dictionary of the 'eics' from the HDF5 file.
def import_spectral_search_results(self, mass_spectra):
415    def import_spectral_search_results(self, mass_spectra):
416        """Imports the spectral search results from the HDF5 file.
417
418        Parameters
419        ----------
420        mass_spectra : LCMSBase | MassSpectraBase
421            The MassSpectraBase or LCMSBase object to populate with spectral search results.
422
423        Returns
424        -------
425        None, but populates the 'spectral_search_results' attribute on the LCMSBase or MassSpectraBase
426        object with a dictionary of the 'spectral_search_results' from the HDF5 file.
427
428        """
429        overall_results_dict = {}
430        ms2_results_load = self.h5pydata["spectral_search_results"]
431        for k in ms2_results_load.keys():
432            overall_results_dict[int(k)] = {}
433            for k2 in ms2_results_load[k].keys():
434                ms2_search_res = SpectrumSearchResults(
435                    query_spectrum=mass_spectra._ms[int(k)],
436                    precursor_mz=ms2_results_load[k][k2].attrs["precursor_mz"],
437                    spectral_similarity_search_results={},
438                )
439
440                for key in ms2_results_load[k][k2].keys() - {"precursor_mz"}:
441                    setattr(ms2_search_res, key, list(ms2_results_load[k][k2][key][:]))
442                overall_results_dict[int(k)][
443                    ms2_results_load[k][k2].attrs["precursor_mz"]
444                ] = ms2_search_res
445
446        # add to mass_spectra
447        mass_spectra.spectral_search_results.update(overall_results_dict)
448
449        # If there are mass features, associate the results with each mass feature
450        if len(mass_spectra.mass_features) > 0:
451            for mass_feature_id, mass_feature in mass_spectra.mass_features.items():
452                scan_ids = mass_feature.ms2_scan_numbers
453                for ms2_scan_id in scan_ids:
454                    precursor_mz = mass_feature.mz
455                    try:
456                        mass_spectra.spectral_search_results[ms2_scan_id][precursor_mz]
457                    except KeyError:
458                        pass
459                    else:
460                        mass_spectra.mass_features[
461                            mass_feature_id
462                        ].ms2_similarity_results.append(
463                            mass_spectra.spectral_search_results[ms2_scan_id][
464                                precursor_mz
465                            ]
466                        )

Imports the spectral search results from the HDF5 file.

Parameters
  • mass_spectra (LCMSBase | MassSpectraBase): The MassSpectraBase or LCMSBase object to populate with spectral search results.
Returns
  • None, but populates the 'spectral_search_results' attribute on the LCMSBase or MassSpectraBase object with a dictionary of the 'spectral_search_results' from the HDF5 file.
def get_mass_spectra_obj( self, load_raw=True) -> corems.mass_spectra.factory.lc_class.MassSpectraBase:
468    def get_mass_spectra_obj(self, load_raw=True) -> MassSpectraBase:
469        """
470        Return mass spectra data object, populating the _ms list on MassSpectraBase object from the HDF5 file.
471
472        Parameters
473        ----------
474        load_raw : bool
475            If True, load raw data (unprocessed) from HDF5 files for overall spectra object and individual mass spectra. Default is True.
476
477        """
478        # Instantiate the LCMS object
479        spectra_obj = MassSpectraBase(
480            file_location=self.file_location,
481            analyzer=self.analyzer,
482            instrument_label=self.instrument_label,
483            sample_name=self.sample_name,
484        )
485
486        # This will populate the _ms list on the LCMS or MassSpectraBase object
487        self.run(spectra_obj, load_raw=load_raw)
488
489        return spectra_obj

Return mass spectra data object, populating the _ms list on MassSpectraBase object from the HDF5 file.

Parameters
  • load_raw (bool): If True, load raw data (unprocessed) from HDF5 files for overall spectra object and individual mass spectra. Default is True.
def get_lcms_obj( self, load_raw=True, use_original_parser=True, raw_file_path=None) -> corems.mass_spectra.factory.lc_class.LCMSBase:
491    def get_lcms_obj(
492        self, load_raw=True, use_original_parser=True, raw_file_path=None
493    ) -> LCMSBase:
494        """
495        Return LCMSBase object, populating attributes on the LCMSBase object from the HDF5 file.
496
497        Parameters
498        ----------
499        load_raw : bool
500            If True, load raw data (unprocessed) from HDF5 files for overall lcms object and individual mass spectra. Default is True.
501        use_original_parser : bool
502            If True, use the original parser to populate the LCMS object. Default is True.
503        raw_file_path : str
504            The location of the raw file to parse if attempting to use original parser.
505            Default is None, which attempts to get the raw file path from the HDF5 file.
506            If the original file path has moved, this parameter can be used to specify the new location.
507        """
508        # Instantiate the LCMS object
509        lcms_obj = LCMSBase(
510            file_location=self.file_location,
511            analyzer=self.analyzer,
512            instrument_label=self.instrument_label,
513            sample_name=self.sample_name,
514        )
515
516        # This will populate the majority of the attributes on the LCMS object
517        self.run(lcms_obj, load_raw=load_raw)
518
519        # Set final attributes of the LCMS object
520        lcms_obj.polarity = self.h5pydata.attrs["polarity"]
521        lcms_obj._scans_number_list = list(lcms_obj.scan_df.scan)
522        lcms_obj._retention_time_list = list(lcms_obj.scan_df.scan_time)
523        lcms_obj._tic_list = list(lcms_obj.scan_df.tic)
524
525        # If use_original_parser is True, instantiate the original parser and populate the LCMS object
526        if use_original_parser:
527            lcms_obj = self.add_original_parser(lcms_obj, raw_file_path=raw_file_path)
528        else:
529            lcms_obj.spectra_parser_class = self.__class__
530
531        return lcms_obj

Return LCMSBase object, populating attributes on the LCMSBase object from the HDF5 file.

Parameters
  • load_raw (bool): If True, load raw data (unprocessed) from HDF5 files for overall lcms object and individual mass spectra. Default is True.
  • use_original_parser (bool): If True, use the original parser to populate the LCMS object. Default is True.
  • raw_file_path (str): The location of the raw file to parse if attempting to use original parser. Default is None, which attempts to get the raw file path from the HDF5 file. If the original file path has moved, this parameter can be used to specify the new location.
def get_raw_file_location(self):
533    def get_raw_file_location(self):
534        """
535        Get the raw file location from the HDF5 file attributes.
536
537        Returns
538        -------
539        str
540            The raw file location.
541        """
542        if "original_file_location" in self.h5pydata.attrs:
543            return self.h5pydata.attrs["original_file_location"]
544        else:
545            return None

Get the raw file location from the HDF5 file attributes.

Returns
  • str: The raw file location, or None if the HDF5 file has no 'original_file_location' attribute.
def add_original_parser(self, mass_spectra, raw_file_path=None):
547    def add_original_parser(self, mass_spectra, raw_file_path=None):
548        """
549        Add the original parser to the mass spectra object.
550
551        Parameters
552        ----------
553        mass_spectra : MassSpectraBase | LCMSBase
554            The MassSpectraBase or LCMSBase object to add the original parser to.
555        raw_file_path : str
556            The location of the raw file to parse. Default is None, which attempts to get the raw file path from the HDF5 file.
557        """
558        # Get the original parser type
559        og_parser_type = self.h5pydata.attrs["parser_type"]
560
561        # If raw_file_path is None, get it from the HDF5 file attributes
562        if raw_file_path is None:
563            raw_file_path = self.get_raw_file_location()
564            if raw_file_path is None:
565                raise ValueError(
566                    "Raw file path not found in HDF5 file attributes, cannot instantiate original parser."
567                )
568
569        # Set the raw file path on the mass_spectra object so the parser knows where to find the raw file
570        mass_spectra.raw_file_location = raw_file_path
571
572        if og_parser_type == "ImportMassSpectraThermoMSFileReader":
573            # Check that the parser can be instantiated with the raw file path
574            parser = ImportMassSpectraThermoMSFileReader(raw_file_path)
575        elif og_parser_type == "MZMLSpectraParser":
576            # Check that the parser can be instantiated with the raw file path
577            parser = MZMLSpectraParser(raw_file_path)
578
579        # Set the spectra parser class on the mass_spectra object so the spectra_parser property can be used with the original parser
580        mass_spectra.spectra_parser_class = parser.__class__
581
582        return mass_spectra

Add the original parser to the mass spectra object.

Parameters
  • mass_spectra (MassSpectraBase | LCMSBase): The MassSpectraBase or LCMSBase object to add the original parser to.
  • raw_file_path (str): The location of the raw file to parse. Default is None, which attempts to get the raw file path from the HDF5 file.
def get_creation_time(self):
584    def get_creation_time(self):
585        """
586        Raise a NotImplemented Warning, as creation time is not available in CoreMS HDF5 files and returning None.
587        """
588        warnings.warn(
589            "Creation time is not available in CoreMS HDF5 files, returning None."
590            "This should be accessed through the original parser.",
591        )
592        return None

Issue a warning that creation time is not available in CoreMS HDF5 files, and return None.

def get_instrument_info(self):
594    def get_instrument_info(self):
595        """
596        Raise a NotImplemented Warning, as instrument info is not available in CoreMS HDF5 files and returning None.
597        """
598        warnings.warn(
599            "Instrument info is not available in CoreMS HDF5 files, returning None."
600            "This should be accessed through the original parser.",
601        )
602        return None

Issue a warning that instrument info is not available in CoreMS HDF5 files, and return None.