corems.mass_spectra.input.corems_hdf5

  1__author__ = "Yuri E. Corilo"
  2__date__ = "Oct 29, 2019"
  3
  4
  5from threading import Thread
  6from pathlib import Path
  7
  8import pandas as pd
  9
 10from corems.chroma_peak.factory.chroma_peak_classes import LCMSMassFeature
 11from corems.encapsulation.input.parameter_from_json import (
 12    load_and_set_json_parameters_lcms,
 13    load_and_set_toml_parameters_lcms,
 14)
 15from corems.mass_spectra.factory.lc_class import LCMSBase, MassSpectraBase
 16from corems.mass_spectra.factory.chromat_data import EIC_Data
 17from corems.mass_spectra.input.parserbase import SpectraParserInterface
 18from corems.mass_spectrum.input.coremsHDF5 import ReadCoreMSHDF_MassSpectrum
 19from corems.molecular_id.factory.spectrum_search_results import SpectrumSearchResults
 20from corems.mass_spectra.input.rawFileReader import ImportMassSpectraThermoMSFileReader
 21from corems.mass_spectra.input.mzml import MZMLSpectraParser
 22
 23
 24class ReadCoreMSHDFMassSpectra(
 25    SpectraParserInterface, ReadCoreMSHDF_MassSpectrum, Thread
 26):
 27    """Class to read CoreMS HDF5 files and populate a LCMS or MassSpectraBase object.
 28
 29    Parameters
 30    ----------
 31    file_location : str
 32        The location of the HDF5 file to read, including the suffix.
 33
 34    Attributes
 35    ----------
 36    file_location : str
 37        The location of the HDF5 file to read.
 38    h5pydata : h5py.File
 39        The HDF5 file object.
 40    scans : list
 41        A list of the location of individual mass spectra within the HDF5 file.
 42    scan_number_list : list
 43        A list of the scan numbers of the mass spectra within the HDF5 file.
 44    parameters_location : str
 45        The location of the parameters file (json or toml).
 46
 47    Methods
 48    -------
 49    * import_mass_spectra(mass_spectra).
 50        Imports all mass spectra from the HDF5 file onto the LCMS or MassSpectraBase object.
 51    * get_mass_spectrum_from_scan(scan_number).
 52        Return mass spectrum data object from scan number.
 53    * load().
 54        Placeholder method to meet the requirements of the SpectraParserInterface.
 55    * run(mass_spectra).
 56        Runs the importer functions to populate a LCMS or MassSpectraBase object.
 57    * import_scan_info(mass_spectra).
 58        Imports the scan info from the HDF5 file to populate the _scan_info attribute
 59        on the LCMS or MassSpectraBase object
 60    * import_ms_unprocessed(mass_spectra).
 61        Imports the unprocessed mass spectra from the HDF5 file to populate the
 62        _ms_unprocessed attribute on the LCMS or MassSpectraBase object
 63    * import_parameters(mass_spectra).
 64        Imports the parameters from the HDF5 file to populate the parameters
 65        attribute on the LCMS or MassSpectraBase object
 66    * import_mass_features(mass_spectra).
 67        Imports the mass features from the HDF5 file to populate the mass_features
 68        attribute on the LCMS or MassSpectraBase object
 69    * import_eics(mass_spectra).
 70        Imports the extracted ion chromatograms from the HDF5 file to populate the
 71        eics attribute on the LCMS or MassSpectraBase object
 72    * import_spectral_search_results(mass_spectra).
 73        Imports the spectral search results from the HDF5 file to populate the
 74        spectral_search_results attribute on the LCMS or MassSpectraBase object
 75    * get_mass_spectra_obj().
 76        Return mass spectra data object, populating the _ms list on the LCMS or
 77        MassSpectraBase object from the HDF5 file
 78    * get_lcms_obj().
 79        Return LCMSBase object, populating the majority of the attributes on the
 80        LCMS object from the HDF5 file
 81
 82    """
 83
 84    def __init__(self, file_location: str):
 85        Thread.__init__(self)
 86        ReadCoreMSHDF_MassSpectrum.__init__(self, file_location)
 87
 88        # override the scans attribute on ReadCoreMSHDF_MassSpectrum class to expect a nested location within the HDF5 file
 89        self.scans = [
 90            "mass_spectra/" + x for x in list(self.h5pydata["mass_spectra"].keys())
 91        ]
 92        self.scan_number_list = sorted(
 93            [int(float(i)) for i in list(self.h5pydata["mass_spectra"].keys())]
 94        )
 95
 96        # set the location of the parameters file (json or toml)
 97        add_files = [
 98            x
 99            for x in self.file_location.parent.glob(
100                self.file_location.name.replace(".hdf5", ".*")
101            )
102            if x.suffix != ".hdf5"
103        ]
104        if len([x for x in add_files if x.suffix == ".json"]) > 0:
105            self.parameters_location = [x for x in add_files if x.suffix == ".json"][0]
106        elif len([x for x in add_files if x.suffix == ".toml"]) > 0:
107            self.parameters_location = [x for x in add_files if x.suffix == ".toml"][0]
108        else:
109            self.parameters_location = None
110
111    def get_mass_spectrum_from_scan(self, scan_number):
112        """Return mass spectrum data object from scan number."""
113        if scan_number in self.scan_number_list:
114            mass_spec = self.get_mass_spectrum(scan_number)
115            return mass_spec
116        else:
117            raise Exception("Scan number not found in HDF5 file.")
118
119    def load(self) -> None:
120        """ """
121        pass
122
123    def get_ms_raw(self, spectra=None, scan_df=None) -> dict:
124        """ """
125        # Warn if spectra or scan_df are not None that they are not used for CoreMS HDF5 files and should be rerun after instantiation
126        if spectra is not None or scan_df is not None:
127            SyntaxWarning(
128                "get_ms_raw method for CoreMS HDF5 files can only access saved data, consider rerunning after instantiation."
129            )
130        ms_unprocessed = {}
131        dict_group_load = self.h5pydata["ms_unprocessed"]
132        dict_group_keys = dict_group_load.keys()
133        for k in dict_group_keys:
134            ms_up_int = dict_group_load[k][:]
135            ms_unprocessed[int(k)] = pd.DataFrame(
136                ms_up_int, columns=["scan", "mz", "intensity"]
137            )
138        return ms_unprocessed
139
140    def get_scan_df(self) -> pd.DataFrame:
141        scan_info = {}
142        dict_group_load = self.h5pydata["scan_info"]
143        dict_group_keys = dict_group_load.keys()
144        for k in dict_group_keys:
145            scan_info[k] = dict_group_load[k][:]
146        scan_df = pd.DataFrame(scan_info)
147        scan_df.set_index("scan", inplace=True, drop=False)
148        str_df = scan_df.select_dtypes([object])
149        str_df = str_df.stack().str.decode("utf-8").unstack()
150        for col in str_df:
151            scan_df[col] = str_df[col]
152        return scan_df
153
154    def run(self, mass_spectra, load_raw=True) -> None:
155        """Runs the importer functions to populate a LCMS or MassSpectraBase object.
156
157        Notes
158        -----
159        The following functions are run in order, if the HDF5 file contains the necessary data:
160        1. import_parameters(), which populates the parameters attribute on the LCMS or MassSpectraBase object.
161        2. import_mass_spectra(), which populates the _ms list on the LCMS or MassSpectraBase object.
162        3. import_scan_info(), which populates the _scan_info on the LCMS or MassSpectraBase object.
163        4. import_ms_unprocessed(), which populates the _ms_unprocessed attribute on the LCMS or MassSpectraBase object.
164        5. import_mass_features(), which populates the mass_features attribute on the LCMS or MassSpectraBase object.
165        6. import_eics(), which populates the eics attribute on the LCMS or MassSpectraBase object.
166        7. import_spectral_search_results(), which populates the spectral_search_results attribute on the LCMS or MassSpectraBase object.
167
168        Parameters
169        ----------
170        mass_spectra : LCMSBase or MassSpectraBase
171            The LCMS or MassSpectraBase object to populate with mass spectra, generally instantiated with only the file_location, analyzer, and instrument_label attributes.
172        load_raw : bool
173            If True, load raw data (unprocessed) from HDF5 files for overall lcms object and individual mass spectra. Default is True.
174        Returns
175        -------
176        None, but populates several attributes on the LCMS or MassSpectraBase object.
177
178        """
179        if self.parameters_location is not None:
180            # Populate the parameters attribute on the LCMS object
181            self.import_parameters(mass_spectra)
182
183        if "mass_spectra" in self.h5pydata:
184            # Populate the _ms list on the LCMS object
185            self.import_mass_spectra(mass_spectra, load_raw=load_raw)
186
187        if "scan_info" in self.h5pydata:
188            # Populate the _scan_info attribute on the LCMS object
189            self.import_scan_info(mass_spectra)
190
191        if "ms_unprocessed" in self.h5pydata and load_raw:
192            # Populate the _ms_unprocessed attribute on the LCMS object
193            self.import_ms_unprocessed(mass_spectra)
194
195        if "mass_features" in self.h5pydata:
196            # Populate the mass_features attribute on the LCMS object
197            self.import_mass_features(mass_spectra)
198
199        if "eics" in self.h5pydata:
200            # Populate the eics attribute on the LCMS object
201            self.import_eics(mass_spectra)
202
203        if "spectral_search_results" in self.h5pydata:
204            # Populate the spectral_search_results attribute on the LCMS object
205            self.import_spectral_search_results(mass_spectra)
206
207    def import_mass_spectra(self, mass_spectra, load_raw=True) -> None:
208        """Imports all mass spectra from the HDF5 file.
209
210        Parameters
211        ----------
212        mass_spectra : LCMSBase | MassSpectraBase
213            The MassSpectraBase or LCMSBase object to populate with mass spectra.
214        load_raw : bool
215            If True, load raw data (unprocessed) from HDF5 files for overall lcms object and individual mass spectra. Default
216
217        Returns
218        -------
219        None, but populates the '_ms' list on the LCMSBase or MassSpectraBase
220        object with mass spectra from the HDF5 file.
221        """
222        for scan_number in self.scan_number_list:
223            mass_spec = self.get_mass_spectrum(scan_number, load_raw=load_raw)
224            mass_spec.scan_number = scan_number
225            mass_spectra.add_mass_spectrum(mass_spec)
226
227    def import_scan_info(self, mass_spectra) -> None:
228        """Imports the scan info from the HDF5 file.
229
230        Parameters
231        ----------
232        lcms : LCMSBase | MassSpectraBase
233            The MassSpectraBase or LCMSBase objects
234
235        Returns
236        -------
237        None, but populates the 'scan_df' attribute on the LCMSBase or MassSpectraBase
238        object with a pandas DataFrame of the 'scan_info' from the HDF5 file.
239
240        """
241        scan_df = self.get_scan_df()
242        mass_spectra.scan_df = scan_df
243
244    def import_ms_unprocessed(self, mass_spectra) -> None:
245        """Imports the unprocessed mass spectra from the HDF5 file.
246
247        Parameters
248        ----------
249        lcms : LCMSBase | MassSpectraBase
250            The MassSpectraBase or LCMSBase objects
251
252        Returns
253        -------
254        None, but populates the '_ms_unprocessed' attribute on the LCMSBase or MassSpectraBase
255        object with a dictionary of the 'ms_unprocessed' from the HDF5 file.
256
257        """
258        ms_unprocessed = self.get_ms_raw()
259        mass_spectra._ms_unprocessed = ms_unprocessed
260
261    def import_parameters(self, mass_spectra) -> None:
262        """Imports the parameters from the HDF5 file.
263
264        Parameters
265        ----------
266        mass_spectra : LCMSBase | MassSpectraBase
267            The MassSpectraBase or LCMSBase object to populate with parameters.
268
269        Returns
270        -------
271        None, but populates the 'parameters' attribute on the LCMS or MassSpectraBase
272        object with a dictionary of the 'parameters' from the HDF5 file.
273
274        """
275        if ".json" == self.parameters_location.suffix:
276            load_and_set_json_parameters_lcms(mass_spectra, self.parameters_location)
277        if ".toml" == self.parameters_location.suffix:
278            load_and_set_toml_parameters_lcms(mass_spectra, self.parameters_location)
279        else:
280            raise Exception(
281                "Parameters file must be in JSON format, TOML format is not yet supported."
282            )
283
284    def import_mass_features(self, mass_spectra) -> None:
285        """Imports the mass features from the HDF5 file.
286
287        Parameters
288        ----------
289        mass_spectra : LCMSBase | MassSpectraBase
290            The MassSpectraBase or LCMSBase object to populate with mass features.
291
292        Returns
293        -------
294        None, but populates the 'mass_features' attribute on the LCMSBase or MassSpectraBase
295        object with a dictionary of the 'mass_features' from the HDF5 file.
296
297        """
298        dict_group_load = self.h5pydata["mass_features"]
299        dict_group_keys = dict_group_load.keys()
300        for k in dict_group_keys:
301            # Instantiate the MassFeature object
302            mass_feature = LCMSMassFeature(
303                mass_spectra,
304                mz=dict_group_load[k].attrs["_mz_exp"],
305                retention_time=dict_group_load[k].attrs["_retention_time"],
306                intensity=dict_group_load[k].attrs["_intensity"],
307                apex_scan=dict_group_load[k].attrs["_apex_scan"],
308                persistence=dict_group_load[k].attrs["_persistence"],
309                id=int(k),
310            )
311
312            # Populate additional attributes on the MassFeature object
313            for key in dict_group_load[k].attrs.keys() - {
314                "_mz_exp",
315                "_mz_cal",
316                "_retention_time",
317                "_intensity",
318                "_apex_scan",
319                "_persistence",
320            }:
321                setattr(mass_feature, key, dict_group_load[k].attrs[key])
322
323            # Populate attributes on MassFeature object that are lists
324            for key in dict_group_load[k].keys():
325                setattr(mass_feature, key, dict_group_load[k][key][:])
326
327            mass_spectra.mass_features[int(k)] = mass_feature
328
329        # Associate mass features with ms1 and ms2 spectra, if available
330        for mf_id in mass_spectra.mass_features.keys():
331            if mass_spectra.mass_features[mf_id].apex_scan in mass_spectra._ms.keys():
332                mass_spectra.mass_features[mf_id].mass_spectrum = mass_spectra._ms[
333                    mass_spectra.mass_features[mf_id].apex_scan
334                ]
335            if mass_spectra.mass_features[mf_id].ms2_scan_numbers is not None:
336                for ms2_scan in mass_spectra.mass_features[mf_id].ms2_scan_numbers:
337                    if ms2_scan in mass_spectra._ms.keys():
338                        mass_spectra.mass_features[mf_id].ms2_mass_spectra[ms2_scan] = (
339                            mass_spectra._ms[ms2_scan]
340                        )
341
342    def import_eics(self, mass_spectra):
343        """Imports the extracted ion chromatograms from the HDF5 file.
344
345        Parameters
346        ----------
347        mass_spectra : LCMSBase | MassSpectraBase
348            The MassSpectraBase or LCMSBase object to populate with extracted ion chromatograms.
349
350        Returns
351        -------
352        None, but populates the 'eics' attribute on the LCMSBase or MassSpectraBase
353        object with a dictionary of the 'eics' from the HDF5 file.
354
355        """
356        dict_group_load = self.h5pydata["eics"]
357        dict_group_keys = dict_group_load.keys()
358        for k in dict_group_keys:
359            my_eic = EIC_Data(
360                scans=dict_group_load[k]["scans"][:],
361                time=dict_group_load[k]["time"][:],
362                eic=dict_group_load[k]["eic"][:],
363            )
364            for key in dict_group_load[k].keys():
365                if key not in ["scans", "time", "eic"]:
366                    setattr(my_eic, key, dict_group_load[k][key][:])
367                    # if key is apexes, convert to a tuple of a list
368                    if key == "apexes" and len(my_eic.apexes) > 0:
369                        my_eic.apexes = [tuple(x) for x in my_eic.apexes]
370            # Add to mass_spectra object
371            mass_spectra.eics[dict_group_load[k].attrs["mz"]] = my_eic
372
373        # Add to mass features
374        for idx in mass_spectra.mass_features.keys():
375            mz = mass_spectra.mass_features[idx].mz
376            if mz in mass_spectra.eics.keys():
377                mass_spectra.mass_features[idx]._eic_data = mass_spectra.eics[mz]
378
379    def import_spectral_search_results(self, mass_spectra):
380        """Imports the spectral search results from the HDF5 file.
381
382        Parameters
383        ----------
384        mass_spectra : LCMSBase | MassSpectraBase
385            The MassSpectraBase or LCMSBase object to populate with spectral search results.
386
387        Returns
388        -------
389        None, but populates the 'spectral_search_results' attribute on the LCMSBase or MassSpectraBase
390        object with a dictionary of the 'spectral_search_results' from the HDF5 file.
391
392        """
393        overall_results_dict = {}
394        ms2_results_load = self.h5pydata["spectral_search_results"]
395        for k in ms2_results_load.keys():
396            overall_results_dict[int(k)] = {}
397            for k2 in ms2_results_load[k].keys():
398                ms2_search_res = SpectrumSearchResults(
399                    query_spectrum=mass_spectra._ms[int(k)],
400                    precursor_mz=ms2_results_load[k][k2].attrs["precursor_mz"],
401                    spectral_similarity_search_results={},
402                )
403
404                for key in ms2_results_load[k][k2].keys() - {"precursor_mz"}:
405                    setattr(ms2_search_res, key, list(ms2_results_load[k][k2][key][:]))
406                overall_results_dict[int(k)][
407                    ms2_results_load[k][k2].attrs["precursor_mz"]
408                ] = ms2_search_res
409
410        # add to mass_spectra
411        mass_spectra.spectral_search_results.update(overall_results_dict)
412
413        # If there are mass features, associate the results with each mass feature
414        if len(mass_spectra.mass_features) > 0:
415            for mass_feature_id, mass_feature in mass_spectra.mass_features.items():
416                scan_ids = mass_feature.ms2_scan_numbers
417                for ms2_scan_id in scan_ids:
418                    precursor_mz = mass_feature.mz
419                    try:
420                        mass_spectra.spectral_search_results[ms2_scan_id][precursor_mz]
421                    except KeyError:
422                        pass
423                    else:
424                        mass_spectra.mass_features[
425                            mass_feature_id
426                        ].ms2_similarity_results.append(
427                            mass_spectra.spectral_search_results[ms2_scan_id][
428                                precursor_mz
429                            ]
430                        )
431
432    def get_mass_spectra_obj(self, load_raw=True) -> MassSpectraBase:
433        """
434        Return mass spectra data object, populating the _ms list on MassSpectraBase object from the HDF5 file.
435
436        Parameters
437        ----------
438        load_raw : bool
439            If True, load raw data (unprocessed) from HDF5 files for overall spectra object and individual mass spectra. Default is True.
440
441        """
442        # Instantiate the LCMS object
443        spectra_obj = MassSpectraBase(
444            file_location=self.file_location,
445            analyzer=self.analyzer,
446            instrument_label=self.instrument_label,
447            sample_name=self.sample_name,
448        )
449
450        # This will populate the _ms list on the LCMS or MassSpectraBase object
451        self.run(spectra_obj, load_raw=load_raw)
452
453        return spectra_obj
454
455    def get_lcms_obj(
456        self, load_raw=True, use_original_parser=True, raw_file_path=None
457    ) -> LCMSBase:
458        """
459        Return LCMSBase object, populating attributes on the LCMSBase object from the HDF5 file.
460
461        Parameters
462        ----------
463        load_raw : bool
464            If True, load raw data (unprocessed) from HDF5 files for overall lcms object and individual mass spectra. Default is True.
465        use_original_parser : bool
466            If True, use the original parser to populate the LCMS object. Default is True.
467        raw_file_path : str
468            The location of the raw file to parse if attempting to use original parser.
469            Default is None, which attempts to get the raw file path from the HDF5 file.
470            If the original file path has moved, this parameter can be used to specify the new location.
471        """
472        # Instantiate the LCMS object
473        lcms_obj = LCMSBase(
474            file_location=self.file_location,
475            analyzer=self.analyzer,
476            instrument_label=self.instrument_label,
477            sample_name=self.sample_name,
478        )
479
480        # This will populate the majority of the attributes on the LCMS object
481        self.run(lcms_obj, load_raw=load_raw)
482
483        # Set final attributes of the LCMS object
484        lcms_obj.polarity = self.h5pydata.attrs["polarity"]
485        lcms_obj._scans_number_list = list(lcms_obj.scan_df.scan)
486        lcms_obj._retention_time_list = list(lcms_obj.scan_df.scan_time)
487        lcms_obj._tic_list = list(lcms_obj.scan_df.tic)
488
489        # If use_original_parser is True, instantiate the original parser and populate the LCMS object
490        if use_original_parser:
491            lcms_obj = self.add_original_parser(lcms_obj, raw_file_path=raw_file_path)
492
493        return lcms_obj
494
495    def add_original_parser(self, mass_spectra, raw_file_path=None):
496        """
497        Add the original parser to the mass spectra object.
498
499        Parameters
500        ----------
501        mass_spectra : MassSpectraBase | LCMSBase
502            The MassSpectraBase or LCMSBase object to add the original parser to.
503        raw_file_path : str
504            The location of the raw file to parse. Default is None, which attempts to get the raw file path from the HDF5 file.
505        """
506        # Try to get the raw file path from the HDF5 file
507        if raw_file_path is None:
508            raw_file_path = self.h5pydata.attrs["original_file_location"]
509            # Check if og_file_location exists, if not raise an error
510            raw_file_path = self.h5pydata.attrs["original_file_location"]
511
512        raw_file_path = Path(raw_file_path)
513        if not raw_file_path.exists():
514            raise FileExistsError(
515                "File does not exist: " + str(raw_file_path),
516                ". Cannot use original parser for instatiating the lcms_obj.",
517            )
518
519        # Get the original parser type
520        og_parser_type = self.h5pydata.attrs["parser_type"]
521
522        if og_parser_type == "ImportMassSpectraThermoMSFileReader":
523            parser = ImportMassSpectraThermoMSFileReader(raw_file_path)
524        elif og_parser_type == "MZMLSpectraParser":
525            parser = MZMLSpectraParser(raw_file_path)
526
527        mass_spectra.spectra_parser_class = parser.__class__
528        mass_spectra.spectra_parser = parser
529
530        return mass_spectra
 25class ReadCoreMSHDFMassSpectra(
 26    SpectraParserInterface, ReadCoreMSHDF_MassSpectrum, Thread
 27):
 28    """Class to read CoreMS HDF5 files and populate a LCMS or MassSpectraBase object.
 29
 30    Parameters
 31    ----------
 32    file_location : str
 33        The location of the HDF5 file to read, including the suffix.
 34
 35    Attributes
 36    ----------
 37    file_location : str
 38        The location of the HDF5 file to read.
 39    h5pydata : h5py.File
 40        The HDF5 file object.
 41    scans : list
 42        A list of the location of individual mass spectra within the HDF5 file.
 43    scan_number_list : list
 44        A list of the scan numbers of the mass spectra within the HDF5 file.
 45    parameters_location : str
 46        The location of the parameters file (json or toml).
 47
 48    Methods
 49    -------
 50    * import_mass_spectra(mass_spectra).
 51        Imports all mass spectra from the HDF5 file onto the LCMS or MassSpectraBase object.
 52    * get_mass_spectrum_from_scan(scan_number).
 53        Return mass spectrum data object from scan number.
 54    * load().
 55        Placeholder method to meet the requirements of the SpectraParserInterface.
 56    * run(mass_spectra).
 57        Runs the importer functions to populate a LCMS or MassSpectraBase object.
 58    * import_scan_info(mass_spectra).
 59        Imports the scan info from the HDF5 file to populate the _scan_info attribute
 60        on the LCMS or MassSpectraBase object
 61    * import_ms_unprocessed(mass_spectra).
 62        Imports the unprocessed mass spectra from the HDF5 file to populate the
 63        _ms_unprocessed attribute on the LCMS or MassSpectraBase object
 64    * import_parameters(mass_spectra).
 65        Imports the parameters from the HDF5 file to populate the parameters
 66        attribute on the LCMS or MassSpectraBase object
 67    * import_mass_features(mass_spectra).
 68        Imports the mass features from the HDF5 file to populate the mass_features
 69        attribute on the LCMS or MassSpectraBase object
 70    * import_eics(mass_spectra).
 71        Imports the extracted ion chromatograms from the HDF5 file to populate the
 72        eics attribute on the LCMS or MassSpectraBase object
 73    * import_spectral_search_results(mass_spectra).
 74        Imports the spectral search results from the HDF5 file to populate the
 75        spectral_search_results attribute on the LCMS or MassSpectraBase object
 76    * get_mass_spectra_obj().
 77        Return mass spectra data object, populating the _ms list on the LCMS or
 78        MassSpectraBase object from the HDF5 file
 79    * get_lcms_obj().
 80        Return LCMSBase object, populating the majority of the attributes on the
 81        LCMS object from the HDF5 file
 82
 83    """
 84
 85    def __init__(self, file_location: str):
 86        Thread.__init__(self)
 87        ReadCoreMSHDF_MassSpectrum.__init__(self, file_location)
 88
 89        # override the scans attribute on ReadCoreMSHDF_MassSpectrum class to expect a nested location within the HDF5 file
 90        self.scans = [
 91            "mass_spectra/" + x for x in list(self.h5pydata["mass_spectra"].keys())
 92        ]
 93        self.scan_number_list = sorted(
 94            [int(float(i)) for i in list(self.h5pydata["mass_spectra"].keys())]
 95        )
 96
 97        # set the location of the parameters file (json or toml)
 98        add_files = [
 99            x
100            for x in self.file_location.parent.glob(
101                self.file_location.name.replace(".hdf5", ".*")
102            )
103            if x.suffix != ".hdf5"
104        ]
105        if len([x for x in add_files if x.suffix == ".json"]) > 0:
106            self.parameters_location = [x for x in add_files if x.suffix == ".json"][0]
107        elif len([x for x in add_files if x.suffix == ".toml"]) > 0:
108            self.parameters_location = [x for x in add_files if x.suffix == ".toml"][0]
109        else:
110            self.parameters_location = None
111
112    def get_mass_spectrum_from_scan(self, scan_number):
113        """Return mass spectrum data object from scan number."""
114        if scan_number in self.scan_number_list:
115            mass_spec = self.get_mass_spectrum(scan_number)
116            return mass_spec
117        else:
118            raise Exception("Scan number not found in HDF5 file.")
119
120    def load(self) -> None:
121        """ """
122        pass
123
124    def get_ms_raw(self, spectra=None, scan_df=None) -> dict:
125        """ """
126        # Warn if spectra or scan_df are not None that they are not used for CoreMS HDF5 files and should be rerun after instantiation
127        if spectra is not None or scan_df is not None:
128            SyntaxWarning(
129                "get_ms_raw method for CoreMS HDF5 files can only access saved data, consider rerunning after instantiation."
130            )
131        ms_unprocessed = {}
132        dict_group_load = self.h5pydata["ms_unprocessed"]
133        dict_group_keys = dict_group_load.keys()
134        for k in dict_group_keys:
135            ms_up_int = dict_group_load[k][:]
136            ms_unprocessed[int(k)] = pd.DataFrame(
137                ms_up_int, columns=["scan", "mz", "intensity"]
138            )
139        return ms_unprocessed
140
141    def get_scan_df(self) -> pd.DataFrame:
142        scan_info = {}
143        dict_group_load = self.h5pydata["scan_info"]
144        dict_group_keys = dict_group_load.keys()
145        for k in dict_group_keys:
146            scan_info[k] = dict_group_load[k][:]
147        scan_df = pd.DataFrame(scan_info)
148        scan_df.set_index("scan", inplace=True, drop=False)
149        str_df = scan_df.select_dtypes([object])
150        str_df = str_df.stack().str.decode("utf-8").unstack()
151        for col in str_df:
152            scan_df[col] = str_df[col]
153        return scan_df
154
155    def run(self, mass_spectra, load_raw=True) -> None:
156        """Runs the importer functions to populate a LCMS or MassSpectraBase object.
157
158        Notes
159        -----
160        The following functions are run in order, if the HDF5 file contains the necessary data:
161        1. import_parameters(), which populates the parameters attribute on the LCMS or MassSpectraBase object.
162        2. import_mass_spectra(), which populates the _ms list on the LCMS or MassSpectraBase object.
163        3. import_scan_info(), which populates the _scan_info on the LCMS or MassSpectraBase object.
164        4. import_ms_unprocessed(), which populates the _ms_unprocessed attribute on the LCMS or MassSpectraBase object.
165        5. import_mass_features(), which populates the mass_features attribute on the LCMS or MassSpectraBase object.
166        6. import_eics(), which populates the eics attribute on the LCMS or MassSpectraBase object.
167        7. import_spectral_search_results(), which populates the spectral_search_results attribute on the LCMS or MassSpectraBase object.
168
169        Parameters
170        ----------
171        mass_spectra : LCMSBase or MassSpectraBase
172            The LCMS or MassSpectraBase object to populate with mass spectra, generally instantiated with only the file_location, analyzer, and instrument_label attributes.
173        load_raw : bool
174            If True, load raw data (unprocessed) from HDF5 files for overall lcms object and individual mass spectra. Default is True.
175        Returns
176        -------
177        None, but populates several attributes on the LCMS or MassSpectraBase object.
178
179        """
180        if self.parameters_location is not None:
181            # Populate the parameters attribute on the LCMS object
182            self.import_parameters(mass_spectra)
183
184        if "mass_spectra" in self.h5pydata:
185            # Populate the _ms list on the LCMS object
186            self.import_mass_spectra(mass_spectra, load_raw=load_raw)
187
188        if "scan_info" in self.h5pydata:
189            # Populate the _scan_info attribute on the LCMS object
190            self.import_scan_info(mass_spectra)
191
192        if "ms_unprocessed" in self.h5pydata and load_raw:
193            # Populate the _ms_unprocessed attribute on the LCMS object
194            self.import_ms_unprocessed(mass_spectra)
195
196        if "mass_features" in self.h5pydata:
197            # Populate the mass_features attribute on the LCMS object
198            self.import_mass_features(mass_spectra)
199
200        if "eics" in self.h5pydata:
201            # Populate the eics attribute on the LCMS object
202            self.import_eics(mass_spectra)
203
204        if "spectral_search_results" in self.h5pydata:
205            # Populate the spectral_search_results attribute on the LCMS object
206            self.import_spectral_search_results(mass_spectra)
207
208    def import_mass_spectra(self, mass_spectra, load_raw=True) -> None:
209        """Imports all mass spectra from the HDF5 file.
210
211        Parameters
212        ----------
213        mass_spectra : LCMSBase | MassSpectraBase
214            The MassSpectraBase or LCMSBase object to populate with mass spectra.
215        load_raw : bool
216            If True, load raw data (unprocessed) from HDF5 files for overall lcms object and individual mass spectra. Default
217
218        Returns
219        -------
220        None, but populates the '_ms' list on the LCMSBase or MassSpectraBase
221        object with mass spectra from the HDF5 file.
222        """
223        for scan_number in self.scan_number_list:
224            mass_spec = self.get_mass_spectrum(scan_number, load_raw=load_raw)
225            mass_spec.scan_number = scan_number
226            mass_spectra.add_mass_spectrum(mass_spec)
227
228    def import_scan_info(self, mass_spectra) -> None:
229        """Imports the scan info from the HDF5 file.
230
231        Parameters
232        ----------
233        lcms : LCMSBase | MassSpectraBase
234            The MassSpectraBase or LCMSBase objects
235
236        Returns
237        -------
238        None, but populates the 'scan_df' attribute on the LCMSBase or MassSpectraBase
239        object with a pandas DataFrame of the 'scan_info' from the HDF5 file.
240
241        """
242        scan_df = self.get_scan_df()
243        mass_spectra.scan_df = scan_df
244
245    def import_ms_unprocessed(self, mass_spectra) -> None:
246        """Imports the unprocessed mass spectra from the HDF5 file.
247
248        Parameters
249        ----------
250        lcms : LCMSBase | MassSpectraBase
251            The MassSpectraBase or LCMSBase objects
252
253        Returns
254        -------
255        None, but populates the '_ms_unprocessed' attribute on the LCMSBase or MassSpectraBase
256        object with a dictionary of the 'ms_unprocessed' from the HDF5 file.
257
258        """
259        ms_unprocessed = self.get_ms_raw()
260        mass_spectra._ms_unprocessed = ms_unprocessed
261
262    def import_parameters(self, mass_spectra) -> None:
263        """Imports the parameters from the HDF5 file.
264
265        Parameters
266        ----------
267        mass_spectra : LCMSBase | MassSpectraBase
268            The MassSpectraBase or LCMSBase object to populate with parameters.
269
270        Returns
271        -------
272        None, but populates the 'parameters' attribute on the LCMS or MassSpectraBase
273        object with a dictionary of the 'parameters' from the HDF5 file.
274
275        """
276        if ".json" == self.parameters_location.suffix:
277            load_and_set_json_parameters_lcms(mass_spectra, self.parameters_location)
278        if ".toml" == self.parameters_location.suffix:
279            load_and_set_toml_parameters_lcms(mass_spectra, self.parameters_location)
280        else:
281            raise Exception(
282                "Parameters file must be in JSON format, TOML format is not yet supported."
283            )
284
285    def import_mass_features(self, mass_spectra) -> None:
286        """Imports the mass features from the HDF5 file.
287
288        Parameters
289        ----------
290        mass_spectra : LCMSBase | MassSpectraBase
291            The MassSpectraBase or LCMSBase object to populate with mass features.
292
293        Returns
294        -------
295        None, but populates the 'mass_features' attribute on the LCMSBase or MassSpectraBase
296        object with a dictionary of the 'mass_features' from the HDF5 file.
297
298        """
299        dict_group_load = self.h5pydata["mass_features"]
300        dict_group_keys = dict_group_load.keys()
301        for k in dict_group_keys:
302            # Instantiate the MassFeature object
303            mass_feature = LCMSMassFeature(
304                mass_spectra,
305                mz=dict_group_load[k].attrs["_mz_exp"],
306                retention_time=dict_group_load[k].attrs["_retention_time"],
307                intensity=dict_group_load[k].attrs["_intensity"],
308                apex_scan=dict_group_load[k].attrs["_apex_scan"],
309                persistence=dict_group_load[k].attrs["_persistence"],
310                id=int(k),
311            )
312
313            # Populate additional attributes on the MassFeature object
314            for key in dict_group_load[k].attrs.keys() - {
315                "_mz_exp",
316                "_mz_cal",
317                "_retention_time",
318                "_intensity",
319                "_apex_scan",
320                "_persistence",
321            }:
322                setattr(mass_feature, key, dict_group_load[k].attrs[key])
323
324            # Populate attributes on MassFeature object that are lists
325            for key in dict_group_load[k].keys():
326                setattr(mass_feature, key, dict_group_load[k][key][:])
327
328            mass_spectra.mass_features[int(k)] = mass_feature
329
330        # Associate mass features with ms1 and ms2 spectra, if available
331        for mf_id in mass_spectra.mass_features.keys():
332            if mass_spectra.mass_features[mf_id].apex_scan in mass_spectra._ms.keys():
333                mass_spectra.mass_features[mf_id].mass_spectrum = mass_spectra._ms[
334                    mass_spectra.mass_features[mf_id].apex_scan
335                ]
336            if mass_spectra.mass_features[mf_id].ms2_scan_numbers is not None:
337                for ms2_scan in mass_spectra.mass_features[mf_id].ms2_scan_numbers:
338                    if ms2_scan in mass_spectra._ms.keys():
339                        mass_spectra.mass_features[mf_id].ms2_mass_spectra[ms2_scan] = (
340                            mass_spectra._ms[ms2_scan]
341                        )
342
343    def import_eics(self, mass_spectra):
344        """Imports the extracted ion chromatograms from the HDF5 file.
345
346        Parameters
347        ----------
348        mass_spectra : LCMSBase | MassSpectraBase
349            The MassSpectraBase or LCMSBase object to populate with extracted ion chromatograms.
350
351        Returns
352        -------
353        None, but populates the 'eics' attribute on the LCMSBase or MassSpectraBase
354        object with a dictionary of the 'eics' from the HDF5 file.
355
356        """
357        dict_group_load = self.h5pydata["eics"]
358        dict_group_keys = dict_group_load.keys()
359        for k in dict_group_keys:
360            my_eic = EIC_Data(
361                scans=dict_group_load[k]["scans"][:],
362                time=dict_group_load[k]["time"][:],
363                eic=dict_group_load[k]["eic"][:],
364            )
365            for key in dict_group_load[k].keys():
366                if key not in ["scans", "time", "eic"]:
367                    setattr(my_eic, key, dict_group_load[k][key][:])
368                    # if key is apexes, convert to a tuple of a list
369                    if key == "apexes" and len(my_eic.apexes) > 0:
370                        my_eic.apexes = [tuple(x) for x in my_eic.apexes]
371            # Add to mass_spectra object
372            mass_spectra.eics[dict_group_load[k].attrs["mz"]] = my_eic
373
374        # Add to mass features
375        for idx in mass_spectra.mass_features.keys():
376            mz = mass_spectra.mass_features[idx].mz
377            if mz in mass_spectra.eics.keys():
378                mass_spectra.mass_features[idx]._eic_data = mass_spectra.eics[mz]
379
380    def import_spectral_search_results(self, mass_spectra):
381        """Imports the spectral search results from the HDF5 file.
382
383        Parameters
384        ----------
385        mass_spectra : LCMSBase | MassSpectraBase
386            The MassSpectraBase or LCMSBase object to populate with spectral search results.
387
388        Returns
389        -------
390        None, but populates the 'spectral_search_results' attribute on the LCMSBase or MassSpectraBase
391        object with a dictionary of the 'spectral_search_results' from the HDF5 file.
392
393        """
394        overall_results_dict = {}
395        ms2_results_load = self.h5pydata["spectral_search_results"]
396        for k in ms2_results_load.keys():
397            overall_results_dict[int(k)] = {}
398            for k2 in ms2_results_load[k].keys():
399                ms2_search_res = SpectrumSearchResults(
400                    query_spectrum=mass_spectra._ms[int(k)],
401                    precursor_mz=ms2_results_load[k][k2].attrs["precursor_mz"],
402                    spectral_similarity_search_results={},
403                )
404
405                for key in ms2_results_load[k][k2].keys() - {"precursor_mz"}:
406                    setattr(ms2_search_res, key, list(ms2_results_load[k][k2][key][:]))
407                overall_results_dict[int(k)][
408                    ms2_results_load[k][k2].attrs["precursor_mz"]
409                ] = ms2_search_res
410
411        # add to mass_spectra
412        mass_spectra.spectral_search_results.update(overall_results_dict)
413
414        # If there are mass features, associate the results with each mass feature
415        if len(mass_spectra.mass_features) > 0:
416            for mass_feature_id, mass_feature in mass_spectra.mass_features.items():
417                scan_ids = mass_feature.ms2_scan_numbers
418                for ms2_scan_id in scan_ids:
419                    precursor_mz = mass_feature.mz
420                    try:
421                        mass_spectra.spectral_search_results[ms2_scan_id][precursor_mz]
422                    except KeyError:
423                        pass
424                    else:
425                        mass_spectra.mass_features[
426                            mass_feature_id
427                        ].ms2_similarity_results.append(
428                            mass_spectra.spectral_search_results[ms2_scan_id][
429                                precursor_mz
430                            ]
431                        )
432
433    def get_mass_spectra_obj(self, load_raw=True) -> MassSpectraBase:
434        """
435        Return mass spectra data object, populating the _ms list on MassSpectraBase object from the HDF5 file.
436
437        Parameters
438        ----------
439        load_raw : bool
440            If True, load raw data (unprocessed) from HDF5 files for overall spectra object and individual mass spectra. Default is True.
441
442        """
443        # Instantiate the LCMS object
444        spectra_obj = MassSpectraBase(
445            file_location=self.file_location,
446            analyzer=self.analyzer,
447            instrument_label=self.instrument_label,
448            sample_name=self.sample_name,
449        )
450
451        # This will populate the _ms list on the LCMS or MassSpectraBase object
452        self.run(spectra_obj, load_raw=load_raw)
453
454        return spectra_obj
455
456    def get_lcms_obj(
457        self, load_raw=True, use_original_parser=True, raw_file_path=None
458    ) -> LCMSBase:
459        """
460        Return LCMSBase object, populating attributes on the LCMSBase object from the HDF5 file.
461
462        Parameters
463        ----------
464        load_raw : bool
465            If True, load raw data (unprocessed) from HDF5 files for overall lcms object and individual mass spectra. Default is True.
466        use_original_parser : bool
467            If True, use the original parser to populate the LCMS object. Default is True.
468        raw_file_path : str
469            The location of the raw file to parse if attempting to use original parser.
470            Default is None, which attempts to get the raw file path from the HDF5 file.
471            If the original file path has moved, this parameter can be used to specify the new location.
472        """
473        # Instantiate the LCMS object
474        lcms_obj = LCMSBase(
475            file_location=self.file_location,
476            analyzer=self.analyzer,
477            instrument_label=self.instrument_label,
478            sample_name=self.sample_name,
479        )
480
481        # This will populate the majority of the attributes on the LCMS object
482        self.run(lcms_obj, load_raw=load_raw)
483
484        # Set final attributes of the LCMS object
485        lcms_obj.polarity = self.h5pydata.attrs["polarity"]
486        lcms_obj._scans_number_list = list(lcms_obj.scan_df.scan)
487        lcms_obj._retention_time_list = list(lcms_obj.scan_df.scan_time)
488        lcms_obj._tic_list = list(lcms_obj.scan_df.tic)
489
490        # If use_original_parser is True, instantiate the original parser and populate the LCMS object
491        if use_original_parser:
492            lcms_obj = self.add_original_parser(lcms_obj, raw_file_path=raw_file_path)
493
494        return lcms_obj
495
496    def add_original_parser(self, mass_spectra, raw_file_path=None):
497        """
498        Add the original parser to the mass spectra object.
499
500        Parameters
501        ----------
502        mass_spectra : MassSpectraBase | LCMSBase
503            The MassSpectraBase or LCMSBase object to add the original parser to.
504        raw_file_path : str
505            The location of the raw file to parse. Default is None, which attempts to get the raw file path from the HDF5 file.
506        """
507        # Try to get the raw file path from the HDF5 file
508        if raw_file_path is None:
509            raw_file_path = self.h5pydata.attrs["original_file_location"]
510            # Check if og_file_location exists, if not raise an error
511            raw_file_path = self.h5pydata.attrs["original_file_location"]
512
513        raw_file_path = Path(raw_file_path)
514        if not raw_file_path.exists():
515            raise FileExistsError(
516                "File does not exist: " + str(raw_file_path),
517                ". Cannot use original parser for instatiating the lcms_obj.",
518            )
519
520        # Get the original parser type
521        og_parser_type = self.h5pydata.attrs["parser_type"]
522
523        if og_parser_type == "ImportMassSpectraThermoMSFileReader":
524            parser = ImportMassSpectraThermoMSFileReader(raw_file_path)
525        elif og_parser_type == "MZMLSpectraParser":
526            parser = MZMLSpectraParser(raw_file_path)
527
528        mass_spectra.spectra_parser_class = parser.__class__
529        mass_spectra.spectra_parser = parser
530
531        return mass_spectra

Class to read CoreMS HDF5 files and populate a LCMS or MassSpectraBase object.

Parameters
  • file_location (str): The location of the HDF5 file to read, including the suffix.
Attributes
  • file_location (str): The location of the HDF5 file to read.
  • h5pydata (h5py.File): The HDF5 file object.
  • scans (list): A list of the location of individual mass spectra within the HDF5 file.
  • scan_number_list (list): A list of the scan numbers of the mass spectra within the HDF5 file.
  • parameters_location (str): The location of the parameters file (json or toml).
Methods
  • import_mass_spectra(mass_spectra). Imports all mass spectra from the HDF5 file onto the LCMS or MassSpectraBase object.
  • get_mass_spectrum_from_scan(scan_number). Return mass spectrum data object from scan number.
  • load(). Placeholder method to meet the requirements of the SpectraParserInterface.
  • run(mass_spectra). Runs the importer functions to populate a LCMS or MassSpectraBase object.
  • import_scan_info(mass_spectra). Imports the scan info from the HDF5 file to populate the _scan_info attribute on the LCMS or MassSpectraBase object
  • import_ms_unprocessed(mass_spectra). Imports the unprocessed mass spectra from the HDF5 file to populate the _ms_unprocessed attribute on the LCMS or MassSpectraBase object
  • import_parameters(mass_spectra). Imports the parameters from the HDF5 file to populate the parameters attribute on the LCMS or MassSpectraBase object
  • import_mass_features(mass_spectra). Imports the mass features from the HDF5 file to populate the mass_features attribute on the LCMS or MassSpectraBase object
  • import_eics(mass_spectra). Imports the extracted ion chromatograms from the HDF5 file to populate the eics attribute on the LCMS or MassSpectraBase object
  • import_spectral_search_results(mass_spectra). Imports the spectral search results from the HDF5 file to populate the spectral_search_results attribute on the LCMS or MassSpectraBase object
  • get_mass_spectra_obj(). Return mass spectra data object, populating the _ms list on the LCMS or MassSpectraBase object from the HDF5 file
  • get_lcms_obj(). Return LCMSBase object, populating the majority of the attributes on the LCMS object from the HDF5 file
ReadCoreMSHDFMassSpectra(file_location: str)
 85    def __init__(self, file_location: str):
 86        Thread.__init__(self)
 87        ReadCoreMSHDF_MassSpectrum.__init__(self, file_location)
 88
 89        # override the scans attribute on ReadCoreMSHDF_MassSpectrum class to expect a nested location within the HDF5 file
 90        self.scans = [
 91            "mass_spectra/" + x for x in list(self.h5pydata["mass_spectra"].keys())
 92        ]
 93        self.scan_number_list = sorted(
 94            [int(float(i)) for i in list(self.h5pydata["mass_spectra"].keys())]
 95        )
 96
 97        # set the location of the parameters file (json or toml)
 98        add_files = [
 99            x
100            for x in self.file_location.parent.glob(
101                self.file_location.name.replace(".hdf5", ".*")
102            )
103            if x.suffix != ".hdf5"
104        ]
105        if len([x for x in add_files if x.suffix == ".json"]) > 0:
106            self.parameters_location = [x for x in add_files if x.suffix == ".json"][0]
107        elif len([x for x in add_files if x.suffix == ".toml"]) > 0:
108            self.parameters_location = [x for x in add_files if x.suffix == ".toml"][0]
109        else:
110            self.parameters_location = None

This constructor should always be called with keyword arguments. Arguments are:

group should be None; reserved for future extension when a ThreadGroup class is implemented.

target is the callable object to be invoked by the run() method. Defaults to None, meaning nothing is called.

name is the thread name. By default, a unique name is constructed of the form "Thread-N" where N is a small decimal number.

args is the argument tuple for the target invocation. Defaults to ().

kwargs is a dictionary of keyword arguments for the target invocation. Defaults to {}.

If a subclass overrides the constructor, it must make sure to invoke the base class constructor (Thread.__init__()) before doing anything else to the thread.

scans
scan_number_list
def get_mass_spectrum_from_scan(self, scan_number):
112    def get_mass_spectrum_from_scan(self, scan_number):
113        """Return mass spectrum data object from scan number."""
114        if scan_number in self.scan_number_list:
115            mass_spec = self.get_mass_spectrum(scan_number)
116            return mass_spec
117        else:
118            raise Exception("Scan number not found in HDF5 file.")

Return mass spectrum data object from scan number.

def load(self) -> None:
120    def load(self) -> None:
121        """ """
122        pass
def get_ms_raw(self, spectra=None, scan_df=None) -> dict:
124    def get_ms_raw(self, spectra=None, scan_df=None) -> dict:
125        """ """
126        # Warn if spectra or scan_df are not None that they are not used for CoreMS HDF5 files and should be rerun after instantiation
127        if spectra is not None or scan_df is not None:
128            SyntaxWarning(
129                "get_ms_raw method for CoreMS HDF5 files can only access saved data, consider rerunning after instantiation."
130            )
131        ms_unprocessed = {}
132        dict_group_load = self.h5pydata["ms_unprocessed"]
133        dict_group_keys = dict_group_load.keys()
134        for k in dict_group_keys:
135            ms_up_int = dict_group_load[k][:]
136            ms_unprocessed[int(k)] = pd.DataFrame(
137                ms_up_int, columns=["scan", "mz", "intensity"]
138            )
139        return ms_unprocessed
def get_scan_df(self) -> pandas.core.frame.DataFrame:
    def get_scan_df(self) -> pd.DataFrame:
        """Return the scan info stored in the HDF5 file as a pandas DataFrame.

        Every dataset in the "scan_info" group becomes one column. The frame
        is indexed by the "scan" column, which is also kept as a regular column.
        """
        scan_info = {}
        dict_group_load = self.h5pydata["scan_info"]
        dict_group_keys = dict_group_load.keys()
        for k in dict_group_keys:
            # Read the whole dataset into memory.
            scan_info[k] = dict_group_load[k][:]
        scan_df = pd.DataFrame(scan_info)
        scan_df.set_index("scan", inplace=True, drop=False)
        # Object-dtype columns are decoded as UTF-8 in one pass via a
        # stack/unstack round-trip (presumably they hold byte strings as read
        # from HDF5 -- TODO confirm).
        str_df = scan_df.select_dtypes([object])
        str_df = str_df.stack().str.decode("utf-8").unstack()
        # Write the decoded columns back over the originals.
        for col in str_df:
            scan_df[col] = str_df[col]
        return scan_df

Return scan data as a pandas DataFrame.

def run(self, mass_spectra, load_raw=True) -> None:
155    def run(self, mass_spectra, load_raw=True) -> None:
156        """Runs the importer functions to populate a LCMS or MassSpectraBase object.
157
158        Notes
159        -----
160        The following functions are run in order, if the HDF5 file contains the necessary data:
161        1. import_parameters(), which populates the parameters attribute on the LCMS or MassSpectraBase object.
162        2. import_mass_spectra(), which populates the _ms list on the LCMS or MassSpectraBase object.
163        3. import_scan_info(), which populates the _scan_info on the LCMS or MassSpectraBase object.
164        4. import_ms_unprocessed(), which populates the _ms_unprocessed attribute on the LCMS or MassSpectraBase object.
165        5. import_mass_features(), which populates the mass_features attribute on the LCMS or MassSpectraBase object.
166        6. import_eics(), which populates the eics attribute on the LCMS or MassSpectraBase object.
167        7. import_spectral_search_results(), which populates the spectral_search_results attribute on the LCMS or MassSpectraBase object.
168
169        Parameters
170        ----------
171        mass_spectra : LCMSBase or MassSpectraBase
172            The LCMS or MassSpectraBase object to populate with mass spectra, generally instantiated with only the file_location, analyzer, and instrument_label attributes.
173        load_raw : bool
174            If True, load raw data (unprocessed) from HDF5 files for overall lcms object and individual mass spectra. Default is True.
175        Returns
176        -------
177        None, but populates several attributes on the LCMS or MassSpectraBase object.
178
179        """
180        if self.parameters_location is not None:
181            # Populate the parameters attribute on the LCMS object
182            self.import_parameters(mass_spectra)
183
184        if "mass_spectra" in self.h5pydata:
185            # Populate the _ms list on the LCMS object
186            self.import_mass_spectra(mass_spectra, load_raw=load_raw)
187
188        if "scan_info" in self.h5pydata:
189            # Populate the _scan_info attribute on the LCMS object
190            self.import_scan_info(mass_spectra)
191
192        if "ms_unprocessed" in self.h5pydata and load_raw:
193            # Populate the _ms_unprocessed attribute on the LCMS object
194            self.import_ms_unprocessed(mass_spectra)
195
196        if "mass_features" in self.h5pydata:
197            # Populate the mass_features attribute on the LCMS object
198            self.import_mass_features(mass_spectra)
199
200        if "eics" in self.h5pydata:
201            # Populate the eics attribute on the LCMS object
202            self.import_eics(mass_spectra)
203
204        if "spectral_search_results" in self.h5pydata:
205            # Populate the spectral_search_results attribute on the LCMS object
206            self.import_spectral_search_results(mass_spectra)

Runs the importer functions to populate a LCMS or MassSpectraBase object.

Notes

The following functions are run in order, if the HDF5 file contains the necessary data:

  1. import_parameters(), which populates the parameters attribute on the LCMS or MassSpectraBase object.
  2. import_mass_spectra(), which populates the _ms list on the LCMS or MassSpectraBase object.
  3. import_scan_info(), which populates the _scan_info on the LCMS or MassSpectraBase object.
  4. import_ms_unprocessed(), which populates the _ms_unprocessed attribute on the LCMS or MassSpectraBase object.
  5. import_mass_features(), which populates the mass_features attribute on the LCMS or MassSpectraBase object.
  6. import_eics(), which populates the eics attribute on the LCMS or MassSpectraBase object.
  7. import_spectral_search_results(), which populates the spectral_search_results attribute on the LCMS or MassSpectraBase object.
Parameters
  • mass_spectra (LCMSBase or MassSpectraBase): The LCMS or MassSpectraBase object to populate with mass spectra, generally instantiated with only the file_location, analyzer, and instrument_label attributes.
  • load_raw (bool): If True, load raw data (unprocessed) from HDF5 files for overall lcms object and individual mass spectra. Default is True.
Returns
  • None, but populates several attributes on the LCMS or MassSpectraBase object.
def import_mass_spectra(self, mass_spectra, load_raw=True) -> None:
208    def import_mass_spectra(self, mass_spectra, load_raw=True) -> None:
209        """Imports all mass spectra from the HDF5 file.
210
211        Parameters
212        ----------
213        mass_spectra : LCMSBase | MassSpectraBase
214            The MassSpectraBase or LCMSBase object to populate with mass spectra.
215        load_raw : bool
216            If True, load raw data (unprocessed) from HDF5 files for overall lcms object and individual mass spectra. Default
217
218        Returns
219        -------
220        None, but populates the '_ms' list on the LCMSBase or MassSpectraBase
221        object with mass spectra from the HDF5 file.
222        """
223        for scan_number in self.scan_number_list:
224            mass_spec = self.get_mass_spectrum(scan_number, load_raw=load_raw)
225            mass_spec.scan_number = scan_number
226            mass_spectra.add_mass_spectrum(mass_spec)

Imports all mass spectra from the HDF5 file.

Parameters
  • mass_spectra (LCMSBase | MassSpectraBase): The MassSpectraBase or LCMSBase object to populate with mass spectra.
  • load_raw (bool): If True, load raw data (unprocessed) from HDF5 files for overall lcms object and individual mass spectra. Default is True.
Returns
  • None, but populates the '_ms' list on the LCMSBase or MassSpectraBase
  • object with mass spectra from the HDF5 file.
def import_scan_info(self, mass_spectra) -> None:
228    def import_scan_info(self, mass_spectra) -> None:
229        """Imports the scan info from the HDF5 file.
230
231        Parameters
232        ----------
233        lcms : LCMSBase | MassSpectraBase
234            The MassSpectraBase or LCMSBase objects
235
236        Returns
237        -------
238        None, but populates the 'scan_df' attribute on the LCMSBase or MassSpectraBase
239        object with a pandas DataFrame of the 'scan_info' from the HDF5 file.
240
241        """
242        scan_df = self.get_scan_df()
243        mass_spectra.scan_df = scan_df

Imports the scan info from the HDF5 file.

Parameters
  • mass_spectra (LCMSBase | MassSpectraBase): The MassSpectraBase or LCMSBase object to populate with the scan info.
Returns
  • None, but populates the 'scan_df' attribute on the LCMSBase or MassSpectraBase
  • object with a pandas DataFrame of the 'scan_info' from the HDF5 file.
def import_ms_unprocessed(self, mass_spectra) -> None:
245    def import_ms_unprocessed(self, mass_spectra) -> None:
246        """Imports the unprocessed mass spectra from the HDF5 file.
247
248        Parameters
249        ----------
250        lcms : LCMSBase | MassSpectraBase
251            The MassSpectraBase or LCMSBase objects
252
253        Returns
254        -------
255        None, but populates the '_ms_unprocessed' attribute on the LCMSBase or MassSpectraBase
256        object with a dictionary of the 'ms_unprocessed' from the HDF5 file.
257
258        """
259        ms_unprocessed = self.get_ms_raw()
260        mass_spectra._ms_unprocessed = ms_unprocessed

Imports the unprocessed mass spectra from the HDF5 file.

Parameters
  • mass_spectra (LCMSBase | MassSpectraBase): The MassSpectraBase or LCMSBase object to populate with the unprocessed mass spectra.
Returns
  • None, but populates the '_ms_unprocessed' attribute on the LCMSBase or MassSpectraBase
  • object with a dictionary of the 'ms_unprocessed' from the HDF5 file.
def import_parameters(self, mass_spectra) -> None:
262    def import_parameters(self, mass_spectra) -> None:
263        """Imports the parameters from the HDF5 file.
264
265        Parameters
266        ----------
267        mass_spectra : LCMSBase | MassSpectraBase
268            The MassSpectraBase or LCMSBase object to populate with parameters.
269
270        Returns
271        -------
272        None, but populates the 'parameters' attribute on the LCMS or MassSpectraBase
273        object with a dictionary of the 'parameters' from the HDF5 file.
274
275        """
276        if ".json" == self.parameters_location.suffix:
277            load_and_set_json_parameters_lcms(mass_spectra, self.parameters_location)
278        if ".toml" == self.parameters_location.suffix:
279            load_and_set_toml_parameters_lcms(mass_spectra, self.parameters_location)
280        else:
281            raise Exception(
282                "Parameters file must be in JSON format, TOML format is not yet supported."
283            )

Imports the parameters from the HDF5 file.

Parameters
  • mass_spectra (LCMSBase | MassSpectraBase): The MassSpectraBase or LCMSBase object to populate with parameters.
Returns
  • None, but populates the 'parameters' attribute on the LCMS or MassSpectraBase
  • object with a dictionary of the 'parameters' from the HDF5 file.
def import_mass_features(self, mass_spectra) -> None:
285    def import_mass_features(self, mass_spectra) -> None:
286        """Imports the mass features from the HDF5 file.
287
288        Parameters
289        ----------
290        mass_spectra : LCMSBase | MassSpectraBase
291            The MassSpectraBase or LCMSBase object to populate with mass features.
292
293        Returns
294        -------
295        None, but populates the 'mass_features' attribute on the LCMSBase or MassSpectraBase
296        object with a dictionary of the 'mass_features' from the HDF5 file.
297
298        """
299        dict_group_load = self.h5pydata["mass_features"]
300        dict_group_keys = dict_group_load.keys()
301        for k in dict_group_keys:
302            # Instantiate the MassFeature object
303            mass_feature = LCMSMassFeature(
304                mass_spectra,
305                mz=dict_group_load[k].attrs["_mz_exp"],
306                retention_time=dict_group_load[k].attrs["_retention_time"],
307                intensity=dict_group_load[k].attrs["_intensity"],
308                apex_scan=dict_group_load[k].attrs["_apex_scan"],
309                persistence=dict_group_load[k].attrs["_persistence"],
310                id=int(k),
311            )
312
313            # Populate additional attributes on the MassFeature object
314            for key in dict_group_load[k].attrs.keys() - {
315                "_mz_exp",
316                "_mz_cal",
317                "_retention_time",
318                "_intensity",
319                "_apex_scan",
320                "_persistence",
321            }:
322                setattr(mass_feature, key, dict_group_load[k].attrs[key])
323
324            # Populate attributes on MassFeature object that are lists
325            for key in dict_group_load[k].keys():
326                setattr(mass_feature, key, dict_group_load[k][key][:])
327
328            mass_spectra.mass_features[int(k)] = mass_feature
329
330        # Associate mass features with ms1 and ms2 spectra, if available
331        for mf_id in mass_spectra.mass_features.keys():
332            if mass_spectra.mass_features[mf_id].apex_scan in mass_spectra._ms.keys():
333                mass_spectra.mass_features[mf_id].mass_spectrum = mass_spectra._ms[
334                    mass_spectra.mass_features[mf_id].apex_scan
335                ]
336            if mass_spectra.mass_features[mf_id].ms2_scan_numbers is not None:
337                for ms2_scan in mass_spectra.mass_features[mf_id].ms2_scan_numbers:
338                    if ms2_scan in mass_spectra._ms.keys():
339                        mass_spectra.mass_features[mf_id].ms2_mass_spectra[ms2_scan] = (
340                            mass_spectra._ms[ms2_scan]
341                        )

Imports the mass features from the HDF5 file.

Parameters
  • mass_spectra (LCMSBase | MassSpectraBase): The MassSpectraBase or LCMSBase object to populate with mass features.
Returns
  • None, but populates the 'mass_features' attribute on the LCMSBase or MassSpectraBase object with a dictionary of the 'mass_features' from the HDF5 file.
def import_eics(self, mass_spectra):
343    def import_eics(self, mass_spectra):
344        """Imports the extracted ion chromatograms from the HDF5 file.
345
346        Parameters
347        ----------
348        mass_spectra : LCMSBase | MassSpectraBase
349            The MassSpectraBase or LCMSBase object to populate with extracted ion chromatograms.
350
351        Returns
352        -------
353        None, but populates the 'eics' attribute on the LCMSBase or MassSpectraBase
354        object with a dictionary of the 'eics' from the HDF5 file.
355
356        """
357        dict_group_load = self.h5pydata["eics"]
358        dict_group_keys = dict_group_load.keys()
359        for k in dict_group_keys:
360            my_eic = EIC_Data(
361                scans=dict_group_load[k]["scans"][:],
362                time=dict_group_load[k]["time"][:],
363                eic=dict_group_load[k]["eic"][:],
364            )
365            for key in dict_group_load[k].keys():
366                if key not in ["scans", "time", "eic"]:
367                    setattr(my_eic, key, dict_group_load[k][key][:])
368                    # if key is apexes, convert to a tuple of a list
369                    if key == "apexes" and len(my_eic.apexes) > 0:
370                        my_eic.apexes = [tuple(x) for x in my_eic.apexes]
371            # Add to mass_spectra object
372            mass_spectra.eics[dict_group_load[k].attrs["mz"]] = my_eic
373
374        # Add to mass features
375        for idx in mass_spectra.mass_features.keys():
376            mz = mass_spectra.mass_features[idx].mz
377            if mz in mass_spectra.eics.keys():
378                mass_spectra.mass_features[idx]._eic_data = mass_spectra.eics[mz]

Imports the extracted ion chromatograms from the HDF5 file.

Parameters
  • mass_spectra (LCMSBase | MassSpectraBase): The MassSpectraBase or LCMSBase object to populate with extracted ion chromatograms.
Returns
  • None, but populates the 'eics' attribute on the LCMSBase or MassSpectraBase object with a dictionary of the 'eics' from the HDF5 file.
def import_spectral_search_results(self, mass_spectra):
380    def import_spectral_search_results(self, mass_spectra):
381        """Imports the spectral search results from the HDF5 file.
382
383        Parameters
384        ----------
385        mass_spectra : LCMSBase | MassSpectraBase
386            The MassSpectraBase or LCMSBase object to populate with spectral search results.
387
388        Returns
389        -------
390        None, but populates the 'spectral_search_results' attribute on the LCMSBase or MassSpectraBase
391        object with a dictionary of the 'spectral_search_results' from the HDF5 file.
392
393        """
394        overall_results_dict = {}
395        ms2_results_load = self.h5pydata["spectral_search_results"]
396        for k in ms2_results_load.keys():
397            overall_results_dict[int(k)] = {}
398            for k2 in ms2_results_load[k].keys():
399                ms2_search_res = SpectrumSearchResults(
400                    query_spectrum=mass_spectra._ms[int(k)],
401                    precursor_mz=ms2_results_load[k][k2].attrs["precursor_mz"],
402                    spectral_similarity_search_results={},
403                )
404
405                for key in ms2_results_load[k][k2].keys() - {"precursor_mz"}:
406                    setattr(ms2_search_res, key, list(ms2_results_load[k][k2][key][:]))
407                overall_results_dict[int(k)][
408                    ms2_results_load[k][k2].attrs["precursor_mz"]
409                ] = ms2_search_res
410
411        # add to mass_spectra
412        mass_spectra.spectral_search_results.update(overall_results_dict)
413
414        # If there are mass features, associate the results with each mass feature
415        if len(mass_spectra.mass_features) > 0:
416            for mass_feature_id, mass_feature in mass_spectra.mass_features.items():
417                scan_ids = mass_feature.ms2_scan_numbers
418                for ms2_scan_id in scan_ids:
419                    precursor_mz = mass_feature.mz
420                    try:
421                        mass_spectra.spectral_search_results[ms2_scan_id][precursor_mz]
422                    except KeyError:
423                        pass
424                    else:
425                        mass_spectra.mass_features[
426                            mass_feature_id
427                        ].ms2_similarity_results.append(
428                            mass_spectra.spectral_search_results[ms2_scan_id][
429                                precursor_mz
430                            ]
431                        )

Imports the spectral search results from the HDF5 file.

Parameters
  • mass_spectra (LCMSBase | MassSpectraBase): The MassSpectraBase or LCMSBase object to populate with spectral search results.
Returns
  • None, but populates the 'spectral_search_results' attribute on the LCMSBase or MassSpectraBase object with a dictionary of the 'spectral_search_results' from the HDF5 file.
def get_mass_spectra_obj( self, load_raw=True) -> corems.mass_spectra.factory.lc_class.MassSpectraBase:
433    def get_mass_spectra_obj(self, load_raw=True) -> MassSpectraBase:
434        """
435        Return mass spectra data object, populating the _ms list on MassSpectraBase object from the HDF5 file.
436
437        Parameters
438        ----------
439        load_raw : bool
440            If True, load raw data (unprocessed) from HDF5 files for overall spectra object and individual mass spectra. Default is True.
441
442        """
443        # Instantiate the LCMS object
444        spectra_obj = MassSpectraBase(
445            file_location=self.file_location,
446            analyzer=self.analyzer,
447            instrument_label=self.instrument_label,
448            sample_name=self.sample_name,
449        )
450
451        # This will populate the _ms list on the LCMS or MassSpectraBase object
452        self.run(spectra_obj, load_raw=load_raw)
453
454        return spectra_obj

Return mass spectra data object, populating the _ms list on MassSpectraBase object from the HDF5 file.

Parameters
  • load_raw (bool): If True, load raw data (unprocessed) from HDF5 files for overall spectra object and individual mass spectra. Default is True.
def get_lcms_obj( self, load_raw=True, use_original_parser=True, raw_file_path=None) -> corems.mass_spectra.factory.lc_class.LCMSBase:
456    def get_lcms_obj(
457        self, load_raw=True, use_original_parser=True, raw_file_path=None
458    ) -> LCMSBase:
459        """
460        Return LCMSBase object, populating attributes on the LCMSBase object from the HDF5 file.
461
462        Parameters
463        ----------
464        load_raw : bool
465            If True, load raw data (unprocessed) from HDF5 files for overall lcms object and individual mass spectra. Default is True.
466        use_original_parser : bool
467            If True, use the original parser to populate the LCMS object. Default is True.
468        raw_file_path : str
469            The location of the raw file to parse if attempting to use original parser.
470            Default is None, which attempts to get the raw file path from the HDF5 file.
471            If the original file path has moved, this parameter can be used to specify the new location.
472        """
473        # Instantiate the LCMS object
474        lcms_obj = LCMSBase(
475            file_location=self.file_location,
476            analyzer=self.analyzer,
477            instrument_label=self.instrument_label,
478            sample_name=self.sample_name,
479        )
480
481        # This will populate the majority of the attributes on the LCMS object
482        self.run(lcms_obj, load_raw=load_raw)
483
484        # Set final attributes of the LCMS object
485        lcms_obj.polarity = self.h5pydata.attrs["polarity"]
486        lcms_obj._scans_number_list = list(lcms_obj.scan_df.scan)
487        lcms_obj._retention_time_list = list(lcms_obj.scan_df.scan_time)
488        lcms_obj._tic_list = list(lcms_obj.scan_df.tic)
489
490        # If use_original_parser is True, instantiate the original parser and populate the LCMS object
491        if use_original_parser:
492            lcms_obj = self.add_original_parser(lcms_obj, raw_file_path=raw_file_path)
493
494        return lcms_obj

Return LCMSBase object, populating attributes on the LCMSBase object from the HDF5 file.

Parameters
  • load_raw (bool): If True, load raw data (unprocessed) from HDF5 files for overall lcms object and individual mass spectra. Default is True.
  • use_original_parser (bool): If True, use the original parser to populate the LCMS object. Default is True.
  • raw_file_path (str): The location of the raw file to parse if attempting to use original parser. Default is None, which attempts to get the raw file path from the HDF5 file. If the original file path has moved, this parameter can be used to specify the new location.
def add_original_parser(self, mass_spectra, raw_file_path=None):
496    def add_original_parser(self, mass_spectra, raw_file_path=None):
497        """
498        Add the original parser to the mass spectra object.
499
500        Parameters
501        ----------
502        mass_spectra : MassSpectraBase | LCMSBase
503            The MassSpectraBase or LCMSBase object to add the original parser to.
504        raw_file_path : str
505            The location of the raw file to parse. Default is None, which attempts to get the raw file path from the HDF5 file.
506        """
507        # Try to get the raw file path from the HDF5 file
508        if raw_file_path is None:
509            raw_file_path = self.h5pydata.attrs["original_file_location"]
510            # Check if og_file_location exists, if not raise an error
511            raw_file_path = self.h5pydata.attrs["original_file_location"]
512
513        raw_file_path = Path(raw_file_path)
514        if not raw_file_path.exists():
515            raise FileExistsError(
516                "File does not exist: " + str(raw_file_path),
517                ". Cannot use original parser for instatiating the lcms_obj.",
518            )
519
520        # Get the original parser type
521        og_parser_type = self.h5pydata.attrs["parser_type"]
522
523        if og_parser_type == "ImportMassSpectraThermoMSFileReader":
524            parser = ImportMassSpectraThermoMSFileReader(raw_file_path)
525        elif og_parser_type == "MZMLSpectraParser":
526            parser = MZMLSpectraParser(raw_file_path)
527
528        mass_spectra.spectra_parser_class = parser.__class__
529        mass_spectra.spectra_parser = parser
530
531        return mass_spectra

Add the original parser to the mass spectra object.

Parameters
  • mass_spectra (MassSpectraBase | LCMSBase): The MassSpectraBase or LCMSBase object to add the original parser to.
  • raw_file_path (str): The location of the raw file to parse. Default is None, which attempts to get the raw file path from the HDF5 file.