corems.mass_spectra.input.corems_hdf5

  1__author__ = "Yuri E. Corilo"
  2__date__ = "Oct 29, 2019"
  3
  4
  5from threading import Thread
  6from pathlib import Path
  7
  8import pandas as pd
  9import warnings
 10
 11from corems.chroma_peak.factory.chroma_peak_classes import LCMSMassFeature
 12from corems.encapsulation.input.parameter_from_json import (
 13    load_and_set_json_parameters_lcms,
 14    load_and_set_toml_parameters_lcms,
 15)
 16from corems.mass_spectra.factory.lc_class import LCMSBase, MassSpectraBase
 17from corems.mass_spectra.factory.chromat_data import EIC_Data
 18from corems.mass_spectra.input.parserbase import SpectraParserInterface
 19from corems.mass_spectrum.input.coremsHDF5 import ReadCoreMSHDF_MassSpectrum
 20from corems.molecular_id.factory.spectrum_search_results import SpectrumSearchResults
 21from corems.mass_spectra.input.rawFileReader import ImportMassSpectraThermoMSFileReader
 22from corems.mass_spectra.input.mzml import MZMLSpectraParser
 23
 24
 25class ReadCoreMSHDFMassSpectra(
 26    SpectraParserInterface, ReadCoreMSHDF_MassSpectrum, Thread
 27):
 28    """Class to read CoreMS HDF5 files and populate a LCMS or MassSpectraBase object.
 29
 30    Parameters
 31    ----------
 32    file_location : str
 33        The location of the HDF5 file to read, including the suffix.
 34
 35    Attributes
 36    ----------
 37    file_location : str
 38        The location of the HDF5 file to read.
 39    h5pydata : h5py.File
 40        The HDF5 file object.
 41    scans : list
 42        A list of the location of individual mass spectra within the HDF5 file.
 43    scan_number_list : list
 44        A list of the scan numbers of the mass spectra within the HDF5 file.
 45    parameters_location : str
 46        The location of the parameters file (json or toml).
 47
 48    Methods
 49    -------
 50    * import_mass_spectra(mass_spectra).
 51        Imports all mass spectra from the HDF5 file onto the LCMS or MassSpectraBase object.
 52    * get_mass_spectrum_from_scan(scan_number).
 53        Return mass spectrum data object from scan number.
 54    * load().
 55        Placeholder method to meet the requirements of the SpectraParserInterface.
 56    * run(mass_spectra).
 57        Runs the importer functions to populate a LCMS or MassSpectraBase object.
 58    * import_scan_info(mass_spectra).
 59        Imports the scan info from the HDF5 file to populate the _scan_info attribute
 60        on the LCMS or MassSpectraBase object
 61    * import_ms_unprocessed(mass_spectra).
 62        Imports the unprocessed mass spectra from the HDF5 file to populate the
 63        _ms_unprocessed attribute on the LCMS or MassSpectraBase object
 64    * import_parameters(mass_spectra).
 65        Imports the parameters from the HDF5 file to populate the parameters
 66        attribute on the LCMS or MassSpectraBase object
 67    * import_mass_features(mass_spectra).
 68        Imports the mass features from the HDF5 file to populate the mass_features
 69        attribute on the LCMS or MassSpectraBase object
 70    * import_eics(mass_spectra).
 71        Imports the extracted ion chromatograms from the HDF5 file to populate the
 72        eics attribute on the LCMS or MassSpectraBase object
 73    * import_spectral_search_results(mass_spectra).
 74        Imports the spectral search results from the HDF5 file to populate the
 75        spectral_search_results attribute on the LCMS or MassSpectraBase object
 76    * get_mass_spectra_obj().
 77        Return mass spectra data object, populating the _ms list on the LCMS or
 78        MassSpectraBase object from the HDF5 file
 79    * get_lcms_obj().
 80        Return LCMSBase object, populating the majority of the attributes on the
 81        LCMS object from the HDF5 file
 82
 83    """
 84
 85    def __init__(self, file_location: str):
 86        Thread.__init__(self)
 87        ReadCoreMSHDF_MassSpectrum.__init__(self, file_location)
 88
 89        # override the scans attribute on ReadCoreMSHDF_MassSpectrum class to expect a nested location within the HDF5 file
 90        self.scans = [
 91            "mass_spectra/" + x for x in list(self.h5pydata["mass_spectra"].keys())
 92        ]
 93        self.scan_number_list = sorted(
 94            [int(float(i)) for i in list(self.h5pydata["mass_spectra"].keys())]
 95        )
 96
 97        # set the location of the parameters file (json or toml)
 98        add_files = [
 99            x
100            for x in self.file_location.parent.glob(
101                self.file_location.name.replace(".hdf5", ".*")
102            )
103            if x.suffix != ".hdf5"
104        ]
105        if len([x for x in add_files if x.suffix == ".json"]) > 0:
106            self.parameters_location = [x for x in add_files if x.suffix == ".json"][0]
107        elif len([x for x in add_files if x.suffix == ".toml"]) > 0:
108            self.parameters_location = [x for x in add_files if x.suffix == ".toml"][0]
109        else:
110            self.parameters_location = None
111
112    def get_mass_spectrum_from_scan(self, scan_number):
113        """Return mass spectrum data object from scan number."""
114        if scan_number in self.scan_number_list:
115            mass_spec = self.get_mass_spectrum(scan_number)
116            return mass_spec
117        else:
118            raise Exception("Scan number not found in HDF5 file.")
119
120    def load(self) -> None:
121        """ """
122        pass
123
124    def get_ms_raw(self, spectra=None, scan_df=None) -> dict:
125        """ """
126        # Warn if spectra or scan_df are not None that they are not used for CoreMS HDF5 files and should be rerun after instantiation
127        if spectra is not None or scan_df is not None:
128            SyntaxWarning(
129                "get_ms_raw method for CoreMS HDF5 files can only access saved data, consider rerunning after instantiation."
130            )
131        ms_unprocessed = {}
132        dict_group_load = self.h5pydata["ms_unprocessed"]
133        dict_group_keys = dict_group_load.keys()
134        for k in dict_group_keys:
135            ms_up_int = dict_group_load[k][:]
136            ms_unprocessed[int(k)] = pd.DataFrame(
137                ms_up_int, columns=["scan", "mz", "intensity"]
138            )
139        return ms_unprocessed
140
141    def get_scan_df(self) -> pd.DataFrame:
142        scan_info = {}
143        dict_group_load = self.h5pydata["scan_info"]
144        dict_group_keys = dict_group_load.keys()
145        for k in dict_group_keys:
146            scan_info[k] = dict_group_load[k][:]
147        scan_df = pd.DataFrame(scan_info)
148        scan_df.set_index("scan", inplace=True, drop=False)
149        str_df = scan_df.select_dtypes([object])
150        str_df = str_df.stack().str.decode("utf-8").unstack()
151        for col in str_df:
152            scan_df[col] = str_df[col]
153        return scan_df
154
155    def run(self, mass_spectra, load_raw=True) -> None:
156        """Runs the importer functions to populate a LCMS or MassSpectraBase object.
157
158        Notes
159        -----
160        The following functions are run in order, if the HDF5 file contains the necessary data:
161        1. import_parameters(), which populates the parameters attribute on the LCMS or MassSpectraBase object.
162        2. import_mass_spectra(), which populates the _ms list on the LCMS or MassSpectraBase object.
163        3. import_scan_info(), which populates the _scan_info on the LCMS or MassSpectraBase object.
164        4. import_ms_unprocessed(), which populates the _ms_unprocessed attribute on the LCMS or MassSpectraBase object.
165        5. import_mass_features(), which populates the mass_features attribute on the LCMS or MassSpectraBase object.
166        6. import_eics(), which populates the eics attribute on the LCMS or MassSpectraBase object.
167        7. import_spectral_search_results(), which populates the spectral_search_results attribute on the LCMS or MassSpectraBase object.
168
169        Parameters
170        ----------
171        mass_spectra : LCMSBase or MassSpectraBase
172            The LCMS or MassSpectraBase object to populate with mass spectra, generally instantiated with only the file_location, analyzer, and instrument_label attributes.
173        load_raw : bool
174            If True, load raw data (unprocessed) from HDF5 files for overall lcms object and individual mass spectra. Default is True.
175        Returns
176        -------
177        None, but populates several attributes on the LCMS or MassSpectraBase object.
178
179        """
180        if self.parameters_location is not None:
181            # Populate the parameters attribute on the LCMS object
182            self.import_parameters(mass_spectra)
183
184        if "mass_spectra" in self.h5pydata:
185            # Populate the _ms list on the LCMS object
186            self.import_mass_spectra(mass_spectra, load_raw=load_raw)
187
188        if "scan_info" in self.h5pydata:
189            # Populate the _scan_info attribute on the LCMS object
190            self.import_scan_info(mass_spectra)
191
192        if "ms_unprocessed" in self.h5pydata and load_raw:
193            # Populate the _ms_unprocessed attribute on the LCMS object
194            self.import_ms_unprocessed(mass_spectra)
195
196        if "mass_features" in self.h5pydata:
197            # Populate the mass_features attribute on the LCMS object
198            self.import_mass_features(mass_spectra)
199
200        if "eics" in self.h5pydata:
201            # Populate the eics attribute on the LCMS object
202            self.import_eics(mass_spectra)
203
204        if "spectral_search_results" in self.h5pydata:
205            # Populate the spectral_search_results attribute on the LCMS object
206            self.import_spectral_search_results(mass_spectra)
207
208    def import_mass_spectra(self, mass_spectra, load_raw=True) -> None:
209        """Imports all mass spectra from the HDF5 file.
210
211        Parameters
212        ----------
213        mass_spectra : LCMSBase | MassSpectraBase
214            The MassSpectraBase or LCMSBase object to populate with mass spectra.
215        load_raw : bool
216            If True, load raw data (unprocessed) from HDF5 files for overall lcms object and individual mass spectra. Default
217
218        Returns
219        -------
220        None, but populates the '_ms' list on the LCMSBase or MassSpectraBase
221        object with mass spectra from the HDF5 file.
222        """
223        for scan_number in self.scan_number_list:
224            mass_spec = self.get_mass_spectrum(scan_number, load_raw=load_raw)
225            mass_spec.scan_number = scan_number
226            mass_spectra.add_mass_spectrum(mass_spec)
227
228    def import_scan_info(self, mass_spectra) -> None:
229        """Imports the scan info from the HDF5 file.
230
231        Parameters
232        ----------
233        lcms : LCMSBase | MassSpectraBase
234            The MassSpectraBase or LCMSBase objects
235
236        Returns
237        -------
238        None, but populates the 'scan_df' attribute on the LCMSBase or MassSpectraBase
239        object with a pandas DataFrame of the 'scan_info' from the HDF5 file.
240
241        """
242        scan_df = self.get_scan_df()
243        mass_spectra.scan_df = scan_df
244
245    def import_ms_unprocessed(self, mass_spectra) -> None:
246        """Imports the unprocessed mass spectra from the HDF5 file.
247
248        Parameters
249        ----------
250        lcms : LCMSBase | MassSpectraBase
251            The MassSpectraBase or LCMSBase objects
252
253        Returns
254        -------
255        None, but populates the '_ms_unprocessed' attribute on the LCMSBase or MassSpectraBase
256        object with a dictionary of the 'ms_unprocessed' from the HDF5 file.
257
258        """
259        ms_unprocessed = self.get_ms_raw()
260        mass_spectra._ms_unprocessed = ms_unprocessed
261
262    def import_parameters(self, mass_spectra) -> None:
263        """Imports the parameters from the HDF5 file.
264
265        Parameters
266        ----------
267        mass_spectra : LCMSBase | MassSpectraBase
268            The MassSpectraBase or LCMSBase object to populate with parameters.
269
270        Returns
271        -------
272        None, but populates the 'parameters' attribute on the LCMS or MassSpectraBase
273        object with a dictionary of the 'parameters' from the HDF5 file.
274
275        """
276        if ".json" == self.parameters_location.suffix:
277            load_and_set_json_parameters_lcms(mass_spectra, self.parameters_location)
278        if ".toml" == self.parameters_location.suffix:
279            load_and_set_toml_parameters_lcms(mass_spectra, self.parameters_location)
280        else:
281            raise Exception(
282                "Parameters file must be in JSON format, TOML format is not yet supported."
283            )
284
285    def import_mass_features(self, mass_spectra) -> None:
286        """Imports the mass features from the HDF5 file.
287
288        Parameters
289        ----------
290        mass_spectra : LCMSBase | MassSpectraBase
291            The MassSpectraBase or LCMSBase object to populate with mass features.
292
293        Returns
294        -------
295        None, but populates the 'mass_features' attribute on the LCMSBase or MassSpectraBase
296        object with a dictionary of the 'mass_features' from the HDF5 file.
297
298        """
299        dict_group_load = self.h5pydata["mass_features"]
300        dict_group_keys = dict_group_load.keys()
301        for k in dict_group_keys:
302            # Instantiate the MassFeature object
303            mass_feature = LCMSMassFeature(
304                mass_spectra,
305                mz=dict_group_load[k].attrs["_mz_exp"],
306                retention_time=dict_group_load[k].attrs["_retention_time"],
307                intensity=dict_group_load[k].attrs["_intensity"],
308                apex_scan=dict_group_load[k].attrs["_apex_scan"],
309                persistence=dict_group_load[k].attrs["_persistence"],
310                id=int(k),
311            )
312
313            # Populate additional attributes on the MassFeature object
314            for key in dict_group_load[k].attrs.keys() - {
315                "_mz_exp",
316                "_mz_cal",
317                "_retention_time",
318                "_intensity",
319                "_apex_scan",
320                "_persistence",
321            }:
322                setattr(mass_feature, key, dict_group_load[k].attrs[key])
323
324            # Populate attributes on MassFeature object that are lists
325            for key in dict_group_load[k].keys():
326                setattr(mass_feature, key, dict_group_load[k][key][:])
327
328            mass_spectra.mass_features[int(k)] = mass_feature
329
330        # Associate mass features with ms1 and ms2 spectra, if available
331        for mf_id in mass_spectra.mass_features.keys():
332            if mass_spectra.mass_features[mf_id].apex_scan in mass_spectra._ms.keys():
333                mass_spectra.mass_features[mf_id].mass_spectrum = mass_spectra._ms[
334                    mass_spectra.mass_features[mf_id].apex_scan
335                ]
336            if mass_spectra.mass_features[mf_id].ms2_scan_numbers is not None:
337                for ms2_scan in mass_spectra.mass_features[mf_id].ms2_scan_numbers:
338                    if ms2_scan in mass_spectra._ms.keys():
339                        mass_spectra.mass_features[mf_id].ms2_mass_spectra[ms2_scan] = (
340                            mass_spectra._ms[ms2_scan]
341                        )
342
343    def import_eics(self, mass_spectra):
344        """Imports the extracted ion chromatograms from the HDF5 file.
345
346        Parameters
347        ----------
348        mass_spectra : LCMSBase | MassSpectraBase
349            The MassSpectraBase or LCMSBase object to populate with extracted ion chromatograms.
350
351        Returns
352        -------
353        None, but populates the 'eics' attribute on the LCMSBase or MassSpectraBase
354        object with a dictionary of the 'eics' from the HDF5 file.
355
356        """
357        dict_group_load = self.h5pydata["eics"]
358        dict_group_keys = dict_group_load.keys()
359        for k in dict_group_keys:
360            my_eic = EIC_Data(
361                scans=dict_group_load[k]["scans"][:],
362                time=dict_group_load[k]["time"][:],
363                eic=dict_group_load[k]["eic"][:],
364            )
365            for key in dict_group_load[k].keys():
366                if key not in ["scans", "time", "eic"]:
367                    setattr(my_eic, key, dict_group_load[k][key][:])
368                    # if key is apexes, convert to a tuple of a list
369                    if key == "apexes" and len(my_eic.apexes) > 0:
370                        my_eic.apexes = [tuple(x) for x in my_eic.apexes]
371            # Add to mass_spectra object
372            mass_spectra.eics[dict_group_load[k].attrs["mz"]] = my_eic
373
374        # Add to mass features
375        for idx in mass_spectra.mass_features.keys():
376            mz = mass_spectra.mass_features[idx].mz
377            if mz in mass_spectra.eics.keys():
378                mass_spectra.mass_features[idx]._eic_data = mass_spectra.eics[mz]
379
380    def import_spectral_search_results(self, mass_spectra):
381        """Imports the spectral search results from the HDF5 file.
382
383        Parameters
384        ----------
385        mass_spectra : LCMSBase | MassSpectraBase
386            The MassSpectraBase or LCMSBase object to populate with spectral search results.
387
388        Returns
389        -------
390        None, but populates the 'spectral_search_results' attribute on the LCMSBase or MassSpectraBase
391        object with a dictionary of the 'spectral_search_results' from the HDF5 file.
392
393        """
394        overall_results_dict = {}
395        ms2_results_load = self.h5pydata["spectral_search_results"]
396        for k in ms2_results_load.keys():
397            overall_results_dict[int(k)] = {}
398            for k2 in ms2_results_load[k].keys():
399                ms2_search_res = SpectrumSearchResults(
400                    query_spectrum=mass_spectra._ms[int(k)],
401                    precursor_mz=ms2_results_load[k][k2].attrs["precursor_mz"],
402                    spectral_similarity_search_results={},
403                )
404
405                for key in ms2_results_load[k][k2].keys() - {"precursor_mz"}:
406                    setattr(ms2_search_res, key, list(ms2_results_load[k][k2][key][:]))
407                overall_results_dict[int(k)][
408                    ms2_results_load[k][k2].attrs["precursor_mz"]
409                ] = ms2_search_res
410
411        # add to mass_spectra
412        mass_spectra.spectral_search_results.update(overall_results_dict)
413
414        # If there are mass features, associate the results with each mass feature
415        if len(mass_spectra.mass_features) > 0:
416            for mass_feature_id, mass_feature in mass_spectra.mass_features.items():
417                scan_ids = mass_feature.ms2_scan_numbers
418                for ms2_scan_id in scan_ids:
419                    precursor_mz = mass_feature.mz
420                    try:
421                        mass_spectra.spectral_search_results[ms2_scan_id][precursor_mz]
422                    except KeyError:
423                        pass
424                    else:
425                        mass_spectra.mass_features[
426                            mass_feature_id
427                        ].ms2_similarity_results.append(
428                            mass_spectra.spectral_search_results[ms2_scan_id][
429                                precursor_mz
430                            ]
431                        )
432
433    def get_mass_spectra_obj(self, load_raw=True) -> MassSpectraBase:
434        """
435        Return mass spectra data object, populating the _ms list on MassSpectraBase object from the HDF5 file.
436
437        Parameters
438        ----------
439        load_raw : bool
440            If True, load raw data (unprocessed) from HDF5 files for overall spectra object and individual mass spectra. Default is True.
441
442        """
443        # Instantiate the LCMS object
444        spectra_obj = MassSpectraBase(
445            file_location=self.file_location,
446            analyzer=self.analyzer,
447            instrument_label=self.instrument_label,
448            sample_name=self.sample_name,
449        )
450
451        # This will populate the _ms list on the LCMS or MassSpectraBase object
452        self.run(spectra_obj, load_raw=load_raw)
453
454        return spectra_obj
455
456    def get_lcms_obj(
457        self, load_raw=True, use_original_parser=True, raw_file_path=None
458    ) -> LCMSBase:
459        """
460        Return LCMSBase object, populating attributes on the LCMSBase object from the HDF5 file.
461
462        Parameters
463        ----------
464        load_raw : bool
465            If True, load raw data (unprocessed) from HDF5 files for overall lcms object and individual mass spectra. Default is True.
466        use_original_parser : bool
467            If True, use the original parser to populate the LCMS object. Default is True.
468        raw_file_path : str
469            The location of the raw file to parse if attempting to use original parser.
470            Default is None, which attempts to get the raw file path from the HDF5 file.
471            If the original file path has moved, this parameter can be used to specify the new location.
472        """
473        # Instantiate the LCMS object
474        lcms_obj = LCMSBase(
475            file_location=self.file_location,
476            analyzer=self.analyzer,
477            instrument_label=self.instrument_label,
478            sample_name=self.sample_name,
479        )
480
481        # This will populate the majority of the attributes on the LCMS object
482        self.run(lcms_obj, load_raw=load_raw)
483
484        # Set final attributes of the LCMS object
485        lcms_obj.polarity = self.h5pydata.attrs["polarity"]
486        lcms_obj._scans_number_list = list(lcms_obj.scan_df.scan)
487        lcms_obj._retention_time_list = list(lcms_obj.scan_df.scan_time)
488        lcms_obj._tic_list = list(lcms_obj.scan_df.tic)
489
490        # If use_original_parser is True, instantiate the original parser and populate the LCMS object
491        if use_original_parser:
492            lcms_obj = self.add_original_parser(lcms_obj, raw_file_path=raw_file_path)
493
494        return lcms_obj
495
496    def add_original_parser(self, mass_spectra, raw_file_path=None):
497        """
498        Add the original parser to the mass spectra object.
499
500        Parameters
501        ----------
502        mass_spectra : MassSpectraBase | LCMSBase
503            The MassSpectraBase or LCMSBase object to add the original parser to.
504        raw_file_path : str
505            The location of the raw file to parse. Default is None, which attempts to get the raw file path from the HDF5 file.
506        """
507        # Try to get the raw file path from the HDF5 file
508        if raw_file_path is None:
509            raw_file_path = self.h5pydata.attrs["original_file_location"]
510            # Check if og_file_location exists, if not raise an error
511            raw_file_path = self.h5pydata.attrs["original_file_location"]
512
513        raw_file_path = Path(raw_file_path)
514        if not raw_file_path.exists():
515            raise FileExistsError(
516                "File does not exist: " + str(raw_file_path),
517                ". Cannot use original parser for instatiating the lcms_obj.",
518            )
519
520        # Get the original parser type
521        og_parser_type = self.h5pydata.attrs["parser_type"]
522
523        if og_parser_type == "ImportMassSpectraThermoMSFileReader":
524            parser = ImportMassSpectraThermoMSFileReader(raw_file_path)
525        elif og_parser_type == "MZMLSpectraParser":
526            parser = MZMLSpectraParser(raw_file_path)
527
528        mass_spectra.spectra_parser_class = parser.__class__
529        mass_spectra.spectra_parser = parser
530
531        return mass_spectra
532    
533    def get_creation_time(self):
534        """
535        Raise a NotImplemented Warning, as creation time is not available in CoreMS HDF5 files and returning None.
536        """
537        warnings.warn(
538            "Creation time is not available in CoreMS HDF5 files, returning None." \
539            "This should be accessed through the original parser.",
540        )
541        return None
542    
543    def get_instrument_info(self):
544        """
545        Raise a NotImplemented Warning, as instrument info is not available in CoreMS HDF5 files and returning None.
546        """
547        warnings.warn(
548            "Instrument info is not available in CoreMS HDF5 files, returning None." \
549            "This should be accessed through the original parser.",
550        )
551        return None
 26class ReadCoreMSHDFMassSpectra(
 27    SpectraParserInterface, ReadCoreMSHDF_MassSpectrum, Thread
 28):
 29    """Class to read CoreMS HDF5 files and populate a LCMS or MassSpectraBase object.
 30
 31    Parameters
 32    ----------
 33    file_location : str
 34        The location of the HDF5 file to read, including the suffix.
 35
 36    Attributes
 37    ----------
 38    file_location : str
 39        The location of the HDF5 file to read.
 40    h5pydata : h5py.File
 41        The HDF5 file object.
 42    scans : list
 43        A list of the location of individual mass spectra within the HDF5 file.
 44    scan_number_list : list
 45        A list of the scan numbers of the mass spectra within the HDF5 file.
 46    parameters_location : str
 47        The location of the parameters file (json or toml).
 48
 49    Methods
 50    -------
 51    * import_mass_spectra(mass_spectra).
 52        Imports all mass spectra from the HDF5 file onto the LCMS or MassSpectraBase object.
 53    * get_mass_spectrum_from_scan(scan_number).
 54        Return mass spectrum data object from scan number.
 55    * load().
 56        Placeholder method to meet the requirements of the SpectraParserInterface.
 57    * run(mass_spectra).
 58        Runs the importer functions to populate a LCMS or MassSpectraBase object.
 59    * import_scan_info(mass_spectra).
 60        Imports the scan info from the HDF5 file to populate the _scan_info attribute
 61        on the LCMS or MassSpectraBase object
 62    * import_ms_unprocessed(mass_spectra).
 63        Imports the unprocessed mass spectra from the HDF5 file to populate the
 64        _ms_unprocessed attribute on the LCMS or MassSpectraBase object
 65    * import_parameters(mass_spectra).
 66        Imports the parameters from the HDF5 file to populate the parameters
 67        attribute on the LCMS or MassSpectraBase object
 68    * import_mass_features(mass_spectra).
 69        Imports the mass features from the HDF5 file to populate the mass_features
 70        attribute on the LCMS or MassSpectraBase object
 71    * import_eics(mass_spectra).
 72        Imports the extracted ion chromatograms from the HDF5 file to populate the
 73        eics attribute on the LCMS or MassSpectraBase object
 74    * import_spectral_search_results(mass_spectra).
 75        Imports the spectral search results from the HDF5 file to populate the
 76        spectral_search_results attribute on the LCMS or MassSpectraBase object
 77    * get_mass_spectra_obj().
 78        Return mass spectra data object, populating the _ms list on the LCMS or
 79        MassSpectraBase object from the HDF5 file
 80    * get_lcms_obj().
 81        Return LCMSBase object, populating the majority of the attributes on the
 82        LCMS object from the HDF5 file
 83
 84    """
 85
 86    def __init__(self, file_location: str):
 87        Thread.__init__(self)
 88        ReadCoreMSHDF_MassSpectrum.__init__(self, file_location)
 89
 90        # override the scans attribute on ReadCoreMSHDF_MassSpectrum class to expect a nested location within the HDF5 file
 91        self.scans = [
 92            "mass_spectra/" + x for x in list(self.h5pydata["mass_spectra"].keys())
 93        ]
 94        self.scan_number_list = sorted(
 95            [int(float(i)) for i in list(self.h5pydata["mass_spectra"].keys())]
 96        )
 97
 98        # set the location of the parameters file (json or toml)
 99        add_files = [
100            x
101            for x in self.file_location.parent.glob(
102                self.file_location.name.replace(".hdf5", ".*")
103            )
104            if x.suffix != ".hdf5"
105        ]
106        if len([x for x in add_files if x.suffix == ".json"]) > 0:
107            self.parameters_location = [x for x in add_files if x.suffix == ".json"][0]
108        elif len([x for x in add_files if x.suffix == ".toml"]) > 0:
109            self.parameters_location = [x for x in add_files if x.suffix == ".toml"][0]
110        else:
111            self.parameters_location = None
112
113    def get_mass_spectrum_from_scan(self, scan_number):
114        """Return mass spectrum data object from scan number."""
115        if scan_number in self.scan_number_list:
116            mass_spec = self.get_mass_spectrum(scan_number)
117            return mass_spec
118        else:
119            raise Exception("Scan number not found in HDF5 file.")
120
121    def load(self) -> None:
122        """ """
123        pass
124
125    def get_ms_raw(self, spectra=None, scan_df=None) -> dict:
126        """ """
127        # Warn if spectra or scan_df are not None that they are not used for CoreMS HDF5 files and should be rerun after instantiation
128        if spectra is not None or scan_df is not None:
129            SyntaxWarning(
130                "get_ms_raw method for CoreMS HDF5 files can only access saved data, consider rerunning after instantiation."
131            )
132        ms_unprocessed = {}
133        dict_group_load = self.h5pydata["ms_unprocessed"]
134        dict_group_keys = dict_group_load.keys()
135        for k in dict_group_keys:
136            ms_up_int = dict_group_load[k][:]
137            ms_unprocessed[int(k)] = pd.DataFrame(
138                ms_up_int, columns=["scan", "mz", "intensity"]
139            )
140        return ms_unprocessed
141
142    def get_scan_df(self) -> pd.DataFrame:
143        scan_info = {}
144        dict_group_load = self.h5pydata["scan_info"]
145        dict_group_keys = dict_group_load.keys()
146        for k in dict_group_keys:
147            scan_info[k] = dict_group_load[k][:]
148        scan_df = pd.DataFrame(scan_info)
149        scan_df.set_index("scan", inplace=True, drop=False)
150        str_df = scan_df.select_dtypes([object])
151        str_df = str_df.stack().str.decode("utf-8").unstack()
152        for col in str_df:
153            scan_df[col] = str_df[col]
154        return scan_df
155
156    def run(self, mass_spectra, load_raw=True) -> None:
157        """Runs the importer functions to populate a LCMS or MassSpectraBase object.
158
159        Notes
160        -----
161        The following functions are run in order, if the HDF5 file contains the necessary data:
162        1. import_parameters(), which populates the parameters attribute on the LCMS or MassSpectraBase object.
163        2. import_mass_spectra(), which populates the _ms list on the LCMS or MassSpectraBase object.
164        3. import_scan_info(), which populates the _scan_info on the LCMS or MassSpectraBase object.
165        4. import_ms_unprocessed(), which populates the _ms_unprocessed attribute on the LCMS or MassSpectraBase object.
166        5. import_mass_features(), which populates the mass_features attribute on the LCMS or MassSpectraBase object.
167        6. import_eics(), which populates the eics attribute on the LCMS or MassSpectraBase object.
168        7. import_spectral_search_results(), which populates the spectral_search_results attribute on the LCMS or MassSpectraBase object.
169
170        Parameters
171        ----------
172        mass_spectra : LCMSBase or MassSpectraBase
173            The LCMS or MassSpectraBase object to populate with mass spectra, generally instantiated with only the file_location, analyzer, and instrument_label attributes.
174        load_raw : bool
175            If True, load raw data (unprocessed) from HDF5 files for overall lcms object and individual mass spectra. Default is True.
176        Returns
177        -------
178        None, but populates several attributes on the LCMS or MassSpectraBase object.
179
180        """
181        if self.parameters_location is not None:
182            # Populate the parameters attribute on the LCMS object
183            self.import_parameters(mass_spectra)
184
185        if "mass_spectra" in self.h5pydata:
186            # Populate the _ms list on the LCMS object
187            self.import_mass_spectra(mass_spectra, load_raw=load_raw)
188
189        if "scan_info" in self.h5pydata:
190            # Populate the _scan_info attribute on the LCMS object
191            self.import_scan_info(mass_spectra)
192
193        if "ms_unprocessed" in self.h5pydata and load_raw:
194            # Populate the _ms_unprocessed attribute on the LCMS object
195            self.import_ms_unprocessed(mass_spectra)
196
197        if "mass_features" in self.h5pydata:
198            # Populate the mass_features attribute on the LCMS object
199            self.import_mass_features(mass_spectra)
200
201        if "eics" in self.h5pydata:
202            # Populate the eics attribute on the LCMS object
203            self.import_eics(mass_spectra)
204
205        if "spectral_search_results" in self.h5pydata:
206            # Populate the spectral_search_results attribute on the LCMS object
207            self.import_spectral_search_results(mass_spectra)
208
209    def import_mass_spectra(self, mass_spectra, load_raw=True) -> None:
210        """Imports all mass spectra from the HDF5 file.
211
212        Parameters
213        ----------
214        mass_spectra : LCMSBase | MassSpectraBase
215            The MassSpectraBase or LCMSBase object to populate with mass spectra.
216        load_raw : bool
217            If True, load raw data (unprocessed) from HDF5 files for overall lcms object and individual mass spectra. Default
218
219        Returns
220        -------
221        None, but populates the '_ms' list on the LCMSBase or MassSpectraBase
222        object with mass spectra from the HDF5 file.
223        """
224        for scan_number in self.scan_number_list:
225            mass_spec = self.get_mass_spectrum(scan_number, load_raw=load_raw)
226            mass_spec.scan_number = scan_number
227            mass_spectra.add_mass_spectrum(mass_spec)
228
229    def import_scan_info(self, mass_spectra) -> None:
230        """Imports the scan info from the HDF5 file.
231
232        Parameters
233        ----------
234        lcms : LCMSBase | MassSpectraBase
235            The MassSpectraBase or LCMSBase objects
236
237        Returns
238        -------
239        None, but populates the 'scan_df' attribute on the LCMSBase or MassSpectraBase
240        object with a pandas DataFrame of the 'scan_info' from the HDF5 file.
241
242        """
243        scan_df = self.get_scan_df()
244        mass_spectra.scan_df = scan_df
245
246    def import_ms_unprocessed(self, mass_spectra) -> None:
247        """Imports the unprocessed mass spectra from the HDF5 file.
248
249        Parameters
250        ----------
251        lcms : LCMSBase | MassSpectraBase
252            The MassSpectraBase or LCMSBase objects
253
254        Returns
255        -------
256        None, but populates the '_ms_unprocessed' attribute on the LCMSBase or MassSpectraBase
257        object with a dictionary of the 'ms_unprocessed' from the HDF5 file.
258
259        """
260        ms_unprocessed = self.get_ms_raw()
261        mass_spectra._ms_unprocessed = ms_unprocessed
262
263    def import_parameters(self, mass_spectra) -> None:
264        """Imports the parameters from the HDF5 file.
265
266        Parameters
267        ----------
268        mass_spectra : LCMSBase | MassSpectraBase
269            The MassSpectraBase or LCMSBase object to populate with parameters.
270
271        Returns
272        -------
273        None, but populates the 'parameters' attribute on the LCMS or MassSpectraBase
274        object with a dictionary of the 'parameters' from the HDF5 file.
275
276        """
277        if ".json" == self.parameters_location.suffix:
278            load_and_set_json_parameters_lcms(mass_spectra, self.parameters_location)
279        if ".toml" == self.parameters_location.suffix:
280            load_and_set_toml_parameters_lcms(mass_spectra, self.parameters_location)
281        else:
282            raise Exception(
283                "Parameters file must be in JSON format, TOML format is not yet supported."
284            )
285
286    def import_mass_features(self, mass_spectra) -> None:
287        """Imports the mass features from the HDF5 file.
288
289        Parameters
290        ----------
291        mass_spectra : LCMSBase | MassSpectraBase
292            The MassSpectraBase or LCMSBase object to populate with mass features.
293
294        Returns
295        -------
296        None, but populates the 'mass_features' attribute on the LCMSBase or MassSpectraBase
297        object with a dictionary of the 'mass_features' from the HDF5 file.
298
299        """
300        dict_group_load = self.h5pydata["mass_features"]
301        dict_group_keys = dict_group_load.keys()
302        for k in dict_group_keys:
303            # Instantiate the MassFeature object
304            mass_feature = LCMSMassFeature(
305                mass_spectra,
306                mz=dict_group_load[k].attrs["_mz_exp"],
307                retention_time=dict_group_load[k].attrs["_retention_time"],
308                intensity=dict_group_load[k].attrs["_intensity"],
309                apex_scan=dict_group_load[k].attrs["_apex_scan"],
310                persistence=dict_group_load[k].attrs["_persistence"],
311                id=int(k),
312            )
313
314            # Populate additional attributes on the MassFeature object
315            for key in dict_group_load[k].attrs.keys() - {
316                "_mz_exp",
317                "_mz_cal",
318                "_retention_time",
319                "_intensity",
320                "_apex_scan",
321                "_persistence",
322            }:
323                setattr(mass_feature, key, dict_group_load[k].attrs[key])
324
325            # Populate attributes on MassFeature object that are lists
326            for key in dict_group_load[k].keys():
327                setattr(mass_feature, key, dict_group_load[k][key][:])
328
329            mass_spectra.mass_features[int(k)] = mass_feature
330
331        # Associate mass features with ms1 and ms2 spectra, if available
332        for mf_id in mass_spectra.mass_features.keys():
333            if mass_spectra.mass_features[mf_id].apex_scan in mass_spectra._ms.keys():
334                mass_spectra.mass_features[mf_id].mass_spectrum = mass_spectra._ms[
335                    mass_spectra.mass_features[mf_id].apex_scan
336                ]
337            if mass_spectra.mass_features[mf_id].ms2_scan_numbers is not None:
338                for ms2_scan in mass_spectra.mass_features[mf_id].ms2_scan_numbers:
339                    if ms2_scan in mass_spectra._ms.keys():
340                        mass_spectra.mass_features[mf_id].ms2_mass_spectra[ms2_scan] = (
341                            mass_spectra._ms[ms2_scan]
342                        )
343
344    def import_eics(self, mass_spectra):
345        """Imports the extracted ion chromatograms from the HDF5 file.
346
347        Parameters
348        ----------
349        mass_spectra : LCMSBase | MassSpectraBase
350            The MassSpectraBase or LCMSBase object to populate with extracted ion chromatograms.
351
352        Returns
353        -------
354        None, but populates the 'eics' attribute on the LCMSBase or MassSpectraBase
355        object with a dictionary of the 'eics' from the HDF5 file.
356
357        """
358        dict_group_load = self.h5pydata["eics"]
359        dict_group_keys = dict_group_load.keys()
360        for k in dict_group_keys:
361            my_eic = EIC_Data(
362                scans=dict_group_load[k]["scans"][:],
363                time=dict_group_load[k]["time"][:],
364                eic=dict_group_load[k]["eic"][:],
365            )
366            for key in dict_group_load[k].keys():
367                if key not in ["scans", "time", "eic"]:
368                    setattr(my_eic, key, dict_group_load[k][key][:])
369                    # if key is apexes, convert to a tuple of a list
370                    if key == "apexes" and len(my_eic.apexes) > 0:
371                        my_eic.apexes = [tuple(x) for x in my_eic.apexes]
372            # Add to mass_spectra object
373            mass_spectra.eics[dict_group_load[k].attrs["mz"]] = my_eic
374
375        # Add to mass features
376        for idx in mass_spectra.mass_features.keys():
377            mz = mass_spectra.mass_features[idx].mz
378            if mz in mass_spectra.eics.keys():
379                mass_spectra.mass_features[idx]._eic_data = mass_spectra.eics[mz]
380
381    def import_spectral_search_results(self, mass_spectra):
382        """Imports the spectral search results from the HDF5 file.
383
384        Parameters
385        ----------
386        mass_spectra : LCMSBase | MassSpectraBase
387            The MassSpectraBase or LCMSBase object to populate with spectral search results.
388
389        Returns
390        -------
391        None, but populates the 'spectral_search_results' attribute on the LCMSBase or MassSpectraBase
392        object with a dictionary of the 'spectral_search_results' from the HDF5 file.
393
394        """
395        overall_results_dict = {}
396        ms2_results_load = self.h5pydata["spectral_search_results"]
397        for k in ms2_results_load.keys():
398            overall_results_dict[int(k)] = {}
399            for k2 in ms2_results_load[k].keys():
400                ms2_search_res = SpectrumSearchResults(
401                    query_spectrum=mass_spectra._ms[int(k)],
402                    precursor_mz=ms2_results_load[k][k2].attrs["precursor_mz"],
403                    spectral_similarity_search_results={},
404                )
405
406                for key in ms2_results_load[k][k2].keys() - {"precursor_mz"}:
407                    setattr(ms2_search_res, key, list(ms2_results_load[k][k2][key][:]))
408                overall_results_dict[int(k)][
409                    ms2_results_load[k][k2].attrs["precursor_mz"]
410                ] = ms2_search_res
411
412        # add to mass_spectra
413        mass_spectra.spectral_search_results.update(overall_results_dict)
414
415        # If there are mass features, associate the results with each mass feature
416        if len(mass_spectra.mass_features) > 0:
417            for mass_feature_id, mass_feature in mass_spectra.mass_features.items():
418                scan_ids = mass_feature.ms2_scan_numbers
419                for ms2_scan_id in scan_ids:
420                    precursor_mz = mass_feature.mz
421                    try:
422                        mass_spectra.spectral_search_results[ms2_scan_id][precursor_mz]
423                    except KeyError:
424                        pass
425                    else:
426                        mass_spectra.mass_features[
427                            mass_feature_id
428                        ].ms2_similarity_results.append(
429                            mass_spectra.spectral_search_results[ms2_scan_id][
430                                precursor_mz
431                            ]
432                        )
433
434    def get_mass_spectra_obj(self, load_raw=True) -> MassSpectraBase:
435        """
436        Return mass spectra data object, populating the _ms list on MassSpectraBase object from the HDF5 file.
437
438        Parameters
439        ----------
440        load_raw : bool
441            If True, load raw data (unprocessed) from HDF5 files for overall spectra object and individual mass spectra. Default is True.
442
443        """
444        # Instantiate the LCMS object
445        spectra_obj = MassSpectraBase(
446            file_location=self.file_location,
447            analyzer=self.analyzer,
448            instrument_label=self.instrument_label,
449            sample_name=self.sample_name,
450        )
451
452        # This will populate the _ms list on the LCMS or MassSpectraBase object
453        self.run(spectra_obj, load_raw=load_raw)
454
455        return spectra_obj
456
457    def get_lcms_obj(
458        self, load_raw=True, use_original_parser=True, raw_file_path=None
459    ) -> LCMSBase:
460        """
461        Return LCMSBase object, populating attributes on the LCMSBase object from the HDF5 file.
462
463        Parameters
464        ----------
465        load_raw : bool
466            If True, load raw data (unprocessed) from HDF5 files for overall lcms object and individual mass spectra. Default is True.
467        use_original_parser : bool
468            If True, use the original parser to populate the LCMS object. Default is True.
469        raw_file_path : str
470            The location of the raw file to parse if attempting to use original parser.
471            Default is None, which attempts to get the raw file path from the HDF5 file.
472            If the original file path has moved, this parameter can be used to specify the new location.
473        """
474        # Instantiate the LCMS object
475        lcms_obj = LCMSBase(
476            file_location=self.file_location,
477            analyzer=self.analyzer,
478            instrument_label=self.instrument_label,
479            sample_name=self.sample_name,
480        )
481
482        # This will populate the majority of the attributes on the LCMS object
483        self.run(lcms_obj, load_raw=load_raw)
484
485        # Set final attributes of the LCMS object
486        lcms_obj.polarity = self.h5pydata.attrs["polarity"]
487        lcms_obj._scans_number_list = list(lcms_obj.scan_df.scan)
488        lcms_obj._retention_time_list = list(lcms_obj.scan_df.scan_time)
489        lcms_obj._tic_list = list(lcms_obj.scan_df.tic)
490
491        # If use_original_parser is True, instantiate the original parser and populate the LCMS object
492        if use_original_parser:
493            lcms_obj = self.add_original_parser(lcms_obj, raw_file_path=raw_file_path)
494
495        return lcms_obj
496
497    def add_original_parser(self, mass_spectra, raw_file_path=None):
498        """
499        Add the original parser to the mass spectra object.
500
501        Parameters
502        ----------
503        mass_spectra : MassSpectraBase | LCMSBase
504            The MassSpectraBase or LCMSBase object to add the original parser to.
505        raw_file_path : str
506            The location of the raw file to parse. Default is None, which attempts to get the raw file path from the HDF5 file.
507        """
508        # Try to get the raw file path from the HDF5 file
509        if raw_file_path is None:
510            raw_file_path = self.h5pydata.attrs["original_file_location"]
511            # Check if og_file_location exists, if not raise an error
512            raw_file_path = self.h5pydata.attrs["original_file_location"]
513
514        raw_file_path = Path(raw_file_path)
515        if not raw_file_path.exists():
516            raise FileExistsError(
517                "File does not exist: " + str(raw_file_path),
518                ". Cannot use original parser for instatiating the lcms_obj.",
519            )
520
521        # Get the original parser type
522        og_parser_type = self.h5pydata.attrs["parser_type"]
523
524        if og_parser_type == "ImportMassSpectraThermoMSFileReader":
525            parser = ImportMassSpectraThermoMSFileReader(raw_file_path)
526        elif og_parser_type == "MZMLSpectraParser":
527            parser = MZMLSpectraParser(raw_file_path)
528
529        mass_spectra.spectra_parser_class = parser.__class__
530        mass_spectra.spectra_parser = parser
531
532        return mass_spectra
533    
534    def get_creation_time(self):
535        """
536        Raise a NotImplemented Warning, as creation time is not available in CoreMS HDF5 files and returning None.
537        """
538        warnings.warn(
539            "Creation time is not available in CoreMS HDF5 files, returning None." \
540            "This should be accessed through the original parser.",
541        )
542        return None
543    
544    def get_instrument_info(self):
545        """
546        Raise a NotImplemented Warning, as instrument info is not available in CoreMS HDF5 files and returning None.
547        """
548        warnings.warn(
549            "Instrument info is not available in CoreMS HDF5 files, returning None." \
550            "This should be accessed through the original parser.",
551        )
552        return None

Class to read CoreMS HDF5 files and populate a LCMS or MassSpectraBase object.

Parameters
  • file_location (str): The location of the HDF5 file to read, including the suffix.
Attributes
  • file_location (str): The location of the HDF5 file to read.
  • h5pydata (h5py.File): The HDF5 file object.
  • scans (list): A list of the location of individual mass spectra within the HDF5 file.
  • scan_number_list (list): A list of the scan numbers of the mass spectra within the HDF5 file.
  • parameters_location (str): The location of the parameters file (json or toml).
Methods
  • import_mass_spectra(mass_spectra). Imports all mass spectra from the HDF5 file onto the LCMS or MassSpectraBase object.
  • get_mass_spectrum_from_scan(scan_number). Return mass spectrum data object from scan number.
  • load(). Placeholder method to meet the requirements of the SpectraParserInterface.
  • run(mass_spectra). Runs the importer functions to populate a LCMS or MassSpectraBase object.
  • import_scan_info(mass_spectra). Imports the scan info from the HDF5 file to populate the _scan_info attribute on the LCMS or MassSpectraBase object
  • import_ms_unprocessed(mass_spectra). Imports the unprocessed mass spectra from the HDF5 file to populate the _ms_unprocessed attribute on the LCMS or MassSpectraBase object
  • import_parameters(mass_spectra). Imports the parameters from the HDF5 file to populate the parameters attribute on the LCMS or MassSpectraBase object
  • import_mass_features(mass_spectra). Imports the mass features from the HDF5 file to populate the mass_features attribute on the LCMS or MassSpectraBase object
  • import_eics(mass_spectra). Imports the extracted ion chromatograms from the HDF5 file to populate the eics attribute on the LCMS or MassSpectraBase object
  • import_spectral_search_results(mass_spectra). Imports the spectral search results from the HDF5 file to populate the spectral_search_results attribute on the LCMS or MassSpectraBase object
  • get_mass_spectra_obj(). Return mass spectra data object, populating the _ms list on the LCMS or MassSpectraBase object from the HDF5 file
  • get_lcms_obj(). Return LCMSBase object, populating the majority of the attributes on the LCMS object from the HDF5 file
ReadCoreMSHDFMassSpectra(file_location: str)
 86    def __init__(self, file_location: str):
 87        Thread.__init__(self)
 88        ReadCoreMSHDF_MassSpectrum.__init__(self, file_location)
 89
 90        # override the scans attribute on ReadCoreMSHDF_MassSpectrum class to expect a nested location within the HDF5 file
 91        self.scans = [
 92            "mass_spectra/" + x for x in list(self.h5pydata["mass_spectra"].keys())
 93        ]
 94        self.scan_number_list = sorted(
 95            [int(float(i)) for i in list(self.h5pydata["mass_spectra"].keys())]
 96        )
 97
 98        # set the location of the parameters file (json or toml)
 99        add_files = [
100            x
101            for x in self.file_location.parent.glob(
102                self.file_location.name.replace(".hdf5", ".*")
103            )
104            if x.suffix != ".hdf5"
105        ]
106        if len([x for x in add_files if x.suffix == ".json"]) > 0:
107            self.parameters_location = [x for x in add_files if x.suffix == ".json"][0]
108        elif len([x for x in add_files if x.suffix == ".toml"]) > 0:
109            self.parameters_location = [x for x in add_files if x.suffix == ".toml"][0]
110        else:
111            self.parameters_location = None

This constructor should always be called with keyword arguments. Arguments are:

group should be None; reserved for future extension when a ThreadGroup class is implemented.

target is the callable object to be invoked by the run() method. Defaults to None, meaning nothing is called.

name is the thread name. By default, a unique name is constructed of the form "Thread-N" where N is a small decimal number.

args is the argument tuple for the target invocation. Defaults to ().

kwargs is a dictionary of keyword arguments for the target invocation. Defaults to {}.

If a subclass overrides the constructor, it must make sure to invoke the base class constructor (Thread.__init__()) before doing anything else to the thread.

scans
scan_number_list
def get_mass_spectrum_from_scan(self, scan_number):
113    def get_mass_spectrum_from_scan(self, scan_number):
114        """Return mass spectrum data object from scan number."""
115        if scan_number in self.scan_number_list:
116            mass_spec = self.get_mass_spectrum(scan_number)
117            return mass_spec
118        else:
119            raise Exception("Scan number not found in HDF5 file.")

Return mass spectrum data object from scan number.

def load(self) -> None:
121    def load(self) -> None:
122        """ """
123        pass
def get_ms_raw(self, spectra=None, scan_df=None) -> dict:
125    def get_ms_raw(self, spectra=None, scan_df=None) -> dict:
126        """ """
127        # Warn if spectra or scan_df are not None that they are not used for CoreMS HDF5 files and should be rerun after instantiation
128        if spectra is not None or scan_df is not None:
129            SyntaxWarning(
130                "get_ms_raw method for CoreMS HDF5 files can only access saved data, consider rerunning after instantiation."
131            )
132        ms_unprocessed = {}
133        dict_group_load = self.h5pydata["ms_unprocessed"]
134        dict_group_keys = dict_group_load.keys()
135        for k in dict_group_keys:
136            ms_up_int = dict_group_load[k][:]
137            ms_unprocessed[int(k)] = pd.DataFrame(
138                ms_up_int, columns=["scan", "mz", "intensity"]
139            )
140        return ms_unprocessed
def get_scan_df(self) -> pandas.core.frame.DataFrame:
142    def get_scan_df(self) -> pd.DataFrame:
143        scan_info = {}
144        dict_group_load = self.h5pydata["scan_info"]
145        dict_group_keys = dict_group_load.keys()
146        for k in dict_group_keys:
147            scan_info[k] = dict_group_load[k][:]
148        scan_df = pd.DataFrame(scan_info)
149        scan_df.set_index("scan", inplace=True, drop=False)
150        str_df = scan_df.select_dtypes([object])
151        str_df = str_df.stack().str.decode("utf-8").unstack()
152        for col in str_df:
153            scan_df[col] = str_df[col]
154        return scan_df

Return scan data as a pandas DataFrame.

def run(self, mass_spectra, load_raw=True) -> None:
156    def run(self, mass_spectra, load_raw=True) -> None:
157        """Runs the importer functions to populate a LCMS or MassSpectraBase object.
158
159        Notes
160        -----
161        The following functions are run in order, if the HDF5 file contains the necessary data:
162        1. import_parameters(), which populates the parameters attribute on the LCMS or MassSpectraBase object.
163        2. import_mass_spectra(), which populates the _ms list on the LCMS or MassSpectraBase object.
164        3. import_scan_info(), which populates the _scan_info on the LCMS or MassSpectraBase object.
165        4. import_ms_unprocessed(), which populates the _ms_unprocessed attribute on the LCMS or MassSpectraBase object.
166        5. import_mass_features(), which populates the mass_features attribute on the LCMS or MassSpectraBase object.
167        6. import_eics(), which populates the eics attribute on the LCMS or MassSpectraBase object.
168        7. import_spectral_search_results(), which populates the spectral_search_results attribute on the LCMS or MassSpectraBase object.
169
170        Parameters
171        ----------
172        mass_spectra : LCMSBase or MassSpectraBase
173            The LCMS or MassSpectraBase object to populate with mass spectra, generally instantiated with only the file_location, analyzer, and instrument_label attributes.
174        load_raw : bool
175            If True, load raw data (unprocessed) from HDF5 files for overall lcms object and individual mass spectra. Default is True.
176        Returns
177        -------
178        None, but populates several attributes on the LCMS or MassSpectraBase object.
179
180        """
181        if self.parameters_location is not None:
182            # Populate the parameters attribute on the LCMS object
183            self.import_parameters(mass_spectra)
184
185        if "mass_spectra" in self.h5pydata:
186            # Populate the _ms list on the LCMS object
187            self.import_mass_spectra(mass_spectra, load_raw=load_raw)
188
189        if "scan_info" in self.h5pydata:
190            # Populate the _scan_info attribute on the LCMS object
191            self.import_scan_info(mass_spectra)
192
193        if "ms_unprocessed" in self.h5pydata and load_raw:
194            # Populate the _ms_unprocessed attribute on the LCMS object
195            self.import_ms_unprocessed(mass_spectra)
196
197        if "mass_features" in self.h5pydata:
198            # Populate the mass_features attribute on the LCMS object
199            self.import_mass_features(mass_spectra)
200
201        if "eics" in self.h5pydata:
202            # Populate the eics attribute on the LCMS object
203            self.import_eics(mass_spectra)
204
205        if "spectral_search_results" in self.h5pydata:
206            # Populate the spectral_search_results attribute on the LCMS object
207            self.import_spectral_search_results(mass_spectra)

Runs the importer functions to populate a LCMS or MassSpectraBase object.

Notes

The following functions are run in order, if the HDF5 file contains the necessary data:

  1. import_parameters(), which populates the parameters attribute on the LCMS or MassSpectraBase object.
  2. import_mass_spectra(), which populates the _ms list on the LCMS or MassSpectraBase object.
  3. import_scan_info(), which populates the _scan_info on the LCMS or MassSpectraBase object.
  4. import_ms_unprocessed(), which populates the _ms_unprocessed attribute on the LCMS or MassSpectraBase object.
  5. import_mass_features(), which populates the mass_features attribute on the LCMS or MassSpectraBase object.
  6. import_eics(), which populates the eics attribute on the LCMS or MassSpectraBase object.
  7. import_spectral_search_results(), which populates the spectral_search_results attribute on the LCMS or MassSpectraBase object.
Parameters
  • mass_spectra (LCMSBase or MassSpectraBase): The LCMS or MassSpectraBase object to populate with mass spectra, generally instantiated with only the file_location, analyzer, and instrument_label attributes.
  • load_raw (bool): If True, load raw data (unprocessed) from HDF5 files for overall lcms object and individual mass spectra. Default is True.
Returns
  • None, but populates several attributes on the LCMS or MassSpectraBase object.
def import_mass_spectra(self, mass_spectra, load_raw=True) -> None:
209    def import_mass_spectra(self, mass_spectra, load_raw=True) -> None:
210        """Imports all mass spectra from the HDF5 file.
211
212        Parameters
213        ----------
214        mass_spectra : LCMSBase | MassSpectraBase
215            The MassSpectraBase or LCMSBase object to populate with mass spectra.
216        load_raw : bool
217            If True, load raw data (unprocessed) from HDF5 files for overall lcms object and individual mass spectra. Default
218
219        Returns
220        -------
221        None, but populates the '_ms' list on the LCMSBase or MassSpectraBase
222        object with mass spectra from the HDF5 file.
223        """
224        for scan_number in self.scan_number_list:
225            mass_spec = self.get_mass_spectrum(scan_number, load_raw=load_raw)
226            mass_spec.scan_number = scan_number
227            mass_spectra.add_mass_spectrum(mass_spec)

Imports all mass spectra from the HDF5 file.

Parameters
  • mass_spectra (LCMSBase | MassSpectraBase): The MassSpectraBase or LCMSBase object to populate with mass spectra.
  • load_raw (bool): If True, load raw data (unprocessed) from HDF5 files for overall lcms object and individual mass spectra. Default is True.
Returns
  • None, but populates the '_ms' list on the LCMSBase or MassSpectraBase
  • object with mass spectra from the HDF5 file.
def import_scan_info(self, mass_spectra) -> None:
229    def import_scan_info(self, mass_spectra) -> None:
230        """Imports the scan info from the HDF5 file.
231
232        Parameters
233        ----------
234        lcms : LCMSBase | MassSpectraBase
235            The MassSpectraBase or LCMSBase objects
236
237        Returns
238        -------
239        None, but populates the 'scan_df' attribute on the LCMSBase or MassSpectraBase
240        object with a pandas DataFrame of the 'scan_info' from the HDF5 file.
241
242        """
243        scan_df = self.get_scan_df()
244        mass_spectra.scan_df = scan_df

Imports the scan info from the HDF5 file.

Parameters
  • mass_spectra (LCMSBase | MassSpectraBase): The MassSpectraBase or LCMSBase object to populate with the scan info.
Returns
  • None, but populates the 'scan_df' attribute on the LCMSBase or MassSpectraBase
  • object with a pandas DataFrame of the 'scan_info' from the HDF5 file.
def import_ms_unprocessed(self, mass_spectra) -> None:
246    def import_ms_unprocessed(self, mass_spectra) -> None:
247        """Imports the unprocessed mass spectra from the HDF5 file.
248
249        Parameters
250        ----------
251        lcms : LCMSBase | MassSpectraBase
252            The MassSpectraBase or LCMSBase objects
253
254        Returns
255        -------
256        None, but populates the '_ms_unprocessed' attribute on the LCMSBase or MassSpectraBase
257        object with a dictionary of the 'ms_unprocessed' from the HDF5 file.
258
259        """
260        ms_unprocessed = self.get_ms_raw()
261        mass_spectra._ms_unprocessed = ms_unprocessed

Imports the unprocessed mass spectra from the HDF5 file.

Parameters
  • mass_spectra (LCMSBase | MassSpectraBase): The MassSpectraBase or LCMSBase object to populate with the unprocessed mass spectra.
Returns
  • None, but populates the '_ms_unprocessed' attribute on the LCMSBase or MassSpectraBase
  • object with a dictionary of the 'ms_unprocessed' from the HDF5 file.
def import_parameters(self, mass_spectra) -> None:
263    def import_parameters(self, mass_spectra) -> None:
264        """Imports the parameters from the HDF5 file.
265
266        Parameters
267        ----------
268        mass_spectra : LCMSBase | MassSpectraBase
269            The MassSpectraBase or LCMSBase object to populate with parameters.
270
271        Returns
272        -------
273        None, but populates the 'parameters' attribute on the LCMS or MassSpectraBase
274        object with a dictionary of the 'parameters' from the HDF5 file.
275
276        """
277        if ".json" == self.parameters_location.suffix:
278            load_and_set_json_parameters_lcms(mass_spectra, self.parameters_location)
279        if ".toml" == self.parameters_location.suffix:
280            load_and_set_toml_parameters_lcms(mass_spectra, self.parameters_location)
281        else:
282            raise Exception(
283                "Parameters file must be in JSON format, TOML format is not yet supported."
284            )

Imports the parameters from the HDF5 file.

Parameters
  • mass_spectra (LCMSBase | MassSpectraBase): The MassSpectraBase or LCMSBase object to populate with parameters.
Returns
  • None, but populates the 'parameters' attribute on the LCMS or MassSpectraBase
  • object with a dictionary of the 'parameters' from the HDF5 file.
def import_mass_features(self, mass_spectra) -> None:
286    def import_mass_features(self, mass_spectra) -> None:
287        """Imports the mass features from the HDF5 file.
288
289        Parameters
290        ----------
291        mass_spectra : LCMSBase | MassSpectraBase
292            The MassSpectraBase or LCMSBase object to populate with mass features.
293
294        Returns
295        -------
296        None, but populates the 'mass_features' attribute on the LCMSBase or MassSpectraBase
297        object with a dictionary of the 'mass_features' from the HDF5 file.
298
299        """
300        dict_group_load = self.h5pydata["mass_features"]
301        dict_group_keys = dict_group_load.keys()
302        for k in dict_group_keys:
303            # Instantiate the MassFeature object
304            mass_feature = LCMSMassFeature(
305                mass_spectra,
306                mz=dict_group_load[k].attrs["_mz_exp"],
307                retention_time=dict_group_load[k].attrs["_retention_time"],
308                intensity=dict_group_load[k].attrs["_intensity"],
309                apex_scan=dict_group_load[k].attrs["_apex_scan"],
310                persistence=dict_group_load[k].attrs["_persistence"],
311                id=int(k),
312            )
313
314            # Populate additional attributes on the MassFeature object
315            for key in dict_group_load[k].attrs.keys() - {
316                "_mz_exp",
317                "_mz_cal",
318                "_retention_time",
319                "_intensity",
320                "_apex_scan",
321                "_persistence",
322            }:
323                setattr(mass_feature, key, dict_group_load[k].attrs[key])
324
325            # Populate attributes on MassFeature object that are lists
326            for key in dict_group_load[k].keys():
327                setattr(mass_feature, key, dict_group_load[k][key][:])
328
329            mass_spectra.mass_features[int(k)] = mass_feature
330
331        # Associate mass features with ms1 and ms2 spectra, if available
332        for mf_id in mass_spectra.mass_features.keys():
333            if mass_spectra.mass_features[mf_id].apex_scan in mass_spectra._ms.keys():
334                mass_spectra.mass_features[mf_id].mass_spectrum = mass_spectra._ms[
335                    mass_spectra.mass_features[mf_id].apex_scan
336                ]
337            if mass_spectra.mass_features[mf_id].ms2_scan_numbers is not None:
338                for ms2_scan in mass_spectra.mass_features[mf_id].ms2_scan_numbers:
339                    if ms2_scan in mass_spectra._ms.keys():
340                        mass_spectra.mass_features[mf_id].ms2_mass_spectra[ms2_scan] = (
341                            mass_spectra._ms[ms2_scan]
342                        )

Imports the mass features from the HDF5 file.

Parameters
  • mass_spectra (LCMSBase | MassSpectraBase): The MassSpectraBase or LCMSBase object to populate with mass features.
Returns
  • None, but populates the 'mass_features' attribute on the LCMSBase or MassSpectraBase
  • object with a dictionary of the 'mass_features' from the HDF5 file.
def import_eics(self, mass_spectra):
344    def import_eics(self, mass_spectra):
345        """Imports the extracted ion chromatograms from the HDF5 file.
346
347        Parameters
348        ----------
349        mass_spectra : LCMSBase | MassSpectraBase
350            The MassSpectraBase or LCMSBase object to populate with extracted ion chromatograms.
351
352        Returns
353        -------
354        None, but populates the 'eics' attribute on the LCMSBase or MassSpectraBase
355        object with a dictionary of the 'eics' from the HDF5 file.
356
357        """
358        dict_group_load = self.h5pydata["eics"]
359        dict_group_keys = dict_group_load.keys()
360        for k in dict_group_keys:
361            my_eic = EIC_Data(
362                scans=dict_group_load[k]["scans"][:],
363                time=dict_group_load[k]["time"][:],
364                eic=dict_group_load[k]["eic"][:],
365            )
366            for key in dict_group_load[k].keys():
367                if key not in ["scans", "time", "eic"]:
368                    setattr(my_eic, key, dict_group_load[k][key][:])
369                    # if key is apexes, convert to a tuple of a list
370                    if key == "apexes" and len(my_eic.apexes) > 0:
371                        my_eic.apexes = [tuple(x) for x in my_eic.apexes]
372            # Add to mass_spectra object
373            mass_spectra.eics[dict_group_load[k].attrs["mz"]] = my_eic
374
375        # Add to mass features
376        for idx in mass_spectra.mass_features.keys():
377            mz = mass_spectra.mass_features[idx].mz
378            if mz in mass_spectra.eics.keys():
379                mass_spectra.mass_features[idx]._eic_data = mass_spectra.eics[mz]

Imports the extracted ion chromatograms from the HDF5 file.

Parameters
  • mass_spectra (LCMSBase | MassSpectraBase): The MassSpectraBase or LCMSBase object to populate with extracted ion chromatograms.
Returns
  • None, but populates the 'eics' attribute on the LCMSBase or MassSpectraBase
  • object with a dictionary of the 'eics' from the HDF5 file.
def import_spectral_search_results(self, mass_spectra):
381    def import_spectral_search_results(self, mass_spectra):
382        """Imports the spectral search results from the HDF5 file.
383
384        Parameters
385        ----------
386        mass_spectra : LCMSBase | MassSpectraBase
387            The MassSpectraBase or LCMSBase object to populate with spectral search results.
388
389        Returns
390        -------
391        None, but populates the 'spectral_search_results' attribute on the LCMSBase or MassSpectraBase
392        object with a dictionary of the 'spectral_search_results' from the HDF5 file.
393
394        """
395        overall_results_dict = {}
396        ms2_results_load = self.h5pydata["spectral_search_results"]
397        for k in ms2_results_load.keys():
398            overall_results_dict[int(k)] = {}
399            for k2 in ms2_results_load[k].keys():
400                ms2_search_res = SpectrumSearchResults(
401                    query_spectrum=mass_spectra._ms[int(k)],
402                    precursor_mz=ms2_results_load[k][k2].attrs["precursor_mz"],
403                    spectral_similarity_search_results={},
404                )
405
406                for key in ms2_results_load[k][k2].keys() - {"precursor_mz"}:
407                    setattr(ms2_search_res, key, list(ms2_results_load[k][k2][key][:]))
408                overall_results_dict[int(k)][
409                    ms2_results_load[k][k2].attrs["precursor_mz"]
410                ] = ms2_search_res
411
412        # add to mass_spectra
413        mass_spectra.spectral_search_results.update(overall_results_dict)
414
415        # If there are mass features, associate the results with each mass feature
416        if len(mass_spectra.mass_features) > 0:
417            for mass_feature_id, mass_feature in mass_spectra.mass_features.items():
418                scan_ids = mass_feature.ms2_scan_numbers
419                for ms2_scan_id in scan_ids:
420                    precursor_mz = mass_feature.mz
421                    try:
422                        mass_spectra.spectral_search_results[ms2_scan_id][precursor_mz]
423                    except KeyError:
424                        pass
425                    else:
426                        mass_spectra.mass_features[
427                            mass_feature_id
428                        ].ms2_similarity_results.append(
429                            mass_spectra.spectral_search_results[ms2_scan_id][
430                                precursor_mz
431                            ]
432                        )

Imports the spectral search results from the HDF5 file.

Parameters
  • mass_spectra (LCMSBase | MassSpectraBase): The MassSpectraBase or LCMSBase object to populate with spectral search results.
Returns
  • None, but populates the 'spectral_search_results' attribute on the LCMSBase or MassSpectraBase
  • object with a dictionary of the 'spectral_search_results' from the HDF5 file.
def get_mass_spectra_obj( self, load_raw=True) -> corems.mass_spectra.factory.lc_class.MassSpectraBase:
434    def get_mass_spectra_obj(self, load_raw=True) -> MassSpectraBase:
435        """
436        Return mass spectra data object, populating the _ms list on MassSpectraBase object from the HDF5 file.
437
438        Parameters
439        ----------
440        load_raw : bool
441            If True, load raw data (unprocessed) from HDF5 files for overall spectra object and individual mass spectra. Default is True.
442
443        """
444        # Instantiate the LCMS object
445        spectra_obj = MassSpectraBase(
446            file_location=self.file_location,
447            analyzer=self.analyzer,
448            instrument_label=self.instrument_label,
449            sample_name=self.sample_name,
450        )
451
452        # This will populate the _ms list on the LCMS or MassSpectraBase object
453        self.run(spectra_obj, load_raw=load_raw)
454
455        return spectra_obj

Return mass spectra data object, populating the _ms list on MassSpectraBase object from the HDF5 file.

Parameters
  • load_raw (bool): If True, load raw data (unprocessed) from HDF5 files for overall spectra object and individual mass spectra. Default is True.
def get_lcms_obj( self, load_raw=True, use_original_parser=True, raw_file_path=None) -> corems.mass_spectra.factory.lc_class.LCMSBase:
457    def get_lcms_obj(
458        self, load_raw=True, use_original_parser=True, raw_file_path=None
459    ) -> LCMSBase:
460        """
461        Return LCMSBase object, populating attributes on the LCMSBase object from the HDF5 file.
462
463        Parameters
464        ----------
465        load_raw : bool
466            If True, load raw data (unprocessed) from HDF5 files for overall lcms object and individual mass spectra. Default is True.
467        use_original_parser : bool
468            If True, use the original parser to populate the LCMS object. Default is True.
469        raw_file_path : str
470            The location of the raw file to parse if attempting to use original parser.
471            Default is None, which attempts to get the raw file path from the HDF5 file.
472            If the original file path has moved, this parameter can be used to specify the new location.
473        """
474        # Instantiate the LCMS object
475        lcms_obj = LCMSBase(
476            file_location=self.file_location,
477            analyzer=self.analyzer,
478            instrument_label=self.instrument_label,
479            sample_name=self.sample_name,
480        )
481
482        # This will populate the majority of the attributes on the LCMS object
483        self.run(lcms_obj, load_raw=load_raw)
484
485        # Set final attributes of the LCMS object
486        lcms_obj.polarity = self.h5pydata.attrs["polarity"]
487        lcms_obj._scans_number_list = list(lcms_obj.scan_df.scan)
488        lcms_obj._retention_time_list = list(lcms_obj.scan_df.scan_time)
489        lcms_obj._tic_list = list(lcms_obj.scan_df.tic)
490
491        # If use_original_parser is True, instantiate the original parser and populate the LCMS object
492        if use_original_parser:
493            lcms_obj = self.add_original_parser(lcms_obj, raw_file_path=raw_file_path)
494
495        return lcms_obj

Return LCMSBase object, populating attributes on the LCMSBase object from the HDF5 file.

Parameters
  • load_raw (bool): If True, load raw data (unprocessed) from HDF5 files for overall lcms object and individual mass spectra. Default is True.
  • use_original_parser (bool): If True, use the original parser to populate the LCMS object. Default is True.
  • raw_file_path (str): The location of the raw file to parse if attempting to use original parser. Default is None, which attempts to get the raw file path from the HDF5 file. If the original file path has moved, this parameter can be used to specify the new location.
def add_original_parser(self, mass_spectra, raw_file_path=None):
497    def add_original_parser(self, mass_spectra, raw_file_path=None):
498        """
499        Add the original parser to the mass spectra object.
500
501        Parameters
502        ----------
503        mass_spectra : MassSpectraBase | LCMSBase
504            The MassSpectraBase or LCMSBase object to add the original parser to.
505        raw_file_path : str
506            The location of the raw file to parse. Default is None, which attempts to get the raw file path from the HDF5 file.
507        """
508        # Try to get the raw file path from the HDF5 file
509        if raw_file_path is None:
510            raw_file_path = self.h5pydata.attrs["original_file_location"]
511            # Check if og_file_location exists, if not raise an error
512            raw_file_path = self.h5pydata.attrs["original_file_location"]
513
514        raw_file_path = Path(raw_file_path)
515        if not raw_file_path.exists():
516            raise FileExistsError(
517                "File does not exist: " + str(raw_file_path),
518                ". Cannot use original parser for instatiating the lcms_obj.",
519            )
520
521        # Get the original parser type
522        og_parser_type = self.h5pydata.attrs["parser_type"]
523
524        if og_parser_type == "ImportMassSpectraThermoMSFileReader":
525            parser = ImportMassSpectraThermoMSFileReader(raw_file_path)
526        elif og_parser_type == "MZMLSpectraParser":
527            parser = MZMLSpectraParser(raw_file_path)
528
529        mass_spectra.spectra_parser_class = parser.__class__
530        mass_spectra.spectra_parser = parser
531
532        return mass_spectra

Add the original parser to the mass spectra object.

Parameters
  • mass_spectra (MassSpectraBase | LCMSBase): The MassSpectraBase or LCMSBase object to add the original parser to.
  • raw_file_path (str): The location of the raw file to parse. Default is None, which attempts to get the raw file path from the HDF5 file.
def get_creation_time(self):
534    def get_creation_time(self):
535        """
536        Raise a NotImplemented Warning, as creation time is not available in CoreMS HDF5 files and returning None.
537        """
538        warnings.warn(
539            "Creation time is not available in CoreMS HDF5 files, returning None." \
540            "This should be accessed through the original parser.",
541        )
542        return None

Issue a warning that creation time is not available in CoreMS HDF5 files, and return None.

def get_instrument_info(self):
544    def get_instrument_info(self):
545        """
546        Raise a NotImplemented Warning, as instrument info is not available in CoreMS HDF5 files and returning None.
547        """
548        warnings.warn(
549            "Instrument info is not available in CoreMS HDF5 files, returning None." \
550            "This should be accessed through the original parser.",
551        )
552        return None

Issue a warning that instrument info is not available in CoreMS HDF5 files, and return None.