corems.mass_spectrum.input.baseClass

  1__author__ = "Yuri E. Corilo"
  2__date__ = "Nov 11, 2019"
  3
  4from copy import deepcopy
  5from io import BytesIO
  6from pathlib import Path
  7
  8import chardet
  9from bs4 import BeautifulSoup
 10from pandas import read_csv, read_excel, read_pickle
 11from pandas.core.frame import DataFrame
 12from s3path import S3Path
 13
 14from corems.encapsulation.constant import Labels
 15from corems.encapsulation.factory.parameters import default_parameters
 16from corems.encapsulation.factory.processingSetting import DataInputSetting
 17from corems.encapsulation.input.parameter_from_json import (
 18    load_and_set_parameters_class,
 19    load_and_set_parameters_ms,
 20    load_and_set_toml_parameters_class,
 21)
 22
 23
 24class MassListBaseClass:
 25    """The MassListBaseClass object reads mass list data types and returns the mass spectrum obj
 26
 27    Parameters
 28    ----------
 29    file_location : Path or S3Path
 30        Full data path.
 31    isCentroid : bool, optional
 32        Determines the mass spectrum data structure. If set to True, it assumes centroid mode. If set to False, it assumes profile mode and attempts to peak pick. Default is True.
 33    analyzer : str, optional
 34        The analyzer used for the mass spectrum. Default is 'Unknown'.
 35    instrument_label : str, optional
 36        The label of the instrument used for the mass spectrum. Default is 'Unknown'.
 37    sample_name : str, optional
 38        The name of the sample. Default is None.
 39    header_lines : int, optional
 40        The number of lines to skip in the file, including the column labels line. Default is 0.
 41    isThermoProfile : bool, optional
 42        Determines the number of expected columns in the file. If set to True, only m/z and intensity columns are expected. Signal-to-noise ratio (S/N) and resolving power (RP) will be calculated based on the data. Default is False.
 43    headerless : bool, optional
 44        If True, assumes that there are no headers present in the file (e.g., a .xy file from Bruker) and assumes two columns: m/z and intensity. Default is False.
 45
 46    Attributes
 47    ----------
 48    parameters : DataInputSetting
 49        The data input settings for the mass spectrum.
 50    data_type : str
 51        The type of data in the file.
 52    delimiter : str
 53        The delimiter used to read text-based files.
 54
 55    Methods
 56    -------
 57    * set_parameter_from_toml(parameters_path). Sets the data input settings from a TOML file.
 58    * set_parameter_from_json(parameters_path). Sets the data input settings from a JSON file.
 59    * get_dataframe(). Reads the file and returns the data as a pandas DataFrame.
 60    * load_settings(mass_spec_obj, output_parameters). Loads the settings for the mass spectrum.
 61    * get_output_parameters(polarity, scan_index=0). Returns the output parameters for the mass spectrum.
 62    * clean_data_frame(dataframe). Cleans the data frame by removing columns that are not in the expected columns set.
 63
 64    """
 65
 66    def __init__(
 67        self,
 68        file_location: Path | S3Path,
 69        isCentroid: bool = True,
 70        analyzer: str = "Unknown",
 71        instrument_label: str = "Unknown",
 72        sample_name: str = None,
 73        header_lines: int = 0,
 74        isThermoProfile: bool = False,
 75        headerless: bool = False,
 76    ):
 77        self.file_location = (
 78            Path(file_location) if isinstance(file_location, str) else file_location
 79        )
 80
 81        if not self.file_location.exists():
 82            raise FileExistsError("File does not exist: %s" % file_location)
 83
 84        # (newline="\n")
 85
 86        self.header_lines = header_lines
 87
 88        if isThermoProfile:
 89            self._expected_columns = {Labels.mz, Labels.abundance}
 90
 91        else:
 92            self._expected_columns = {
 93                Labels.mz,
 94                Labels.abundance,
 95                Labels.s2n,
 96                Labels.rp,
 97            }
 98
 99        self._delimiter = None
100
101        self.isCentroid = isCentroid
102
103        self.isThermoProfile = isThermoProfile
104
105        self.headerless = headerless
106
107        self._data_type = None
108
109        self.analyzer = analyzer
110
111        self.instrument_label = instrument_label
112
113        self.sample_name = sample_name
114
115        self._parameters = deepcopy(DataInputSetting())
116
117    @property
118    def parameters(self):
119        return self._parameters
120
121    @parameters.setter
122    def parameters(self, instance_DataInputSetting):
123        self._parameters = instance_DataInputSetting
124
125    def set_parameter_from_toml(self, parameters_path):
126        self._parameters = load_and_set_toml_parameters_class(
127            "DataInput", self.parameters, parameters_path=parameters_path
128        )
129
130    def set_parameter_from_json(self, parameters_path):
131        self._parameters = load_and_set_parameters_class(
132            "DataInput", self.parameters, parameters_path=parameters_path
133        )
134
135    @property
136    def data_type(self):
137        return self._data_type
138
139    @data_type.setter
140    def data_type(self, data_type):
141        self._data_type = data_type
142
143    @property
144    def delimiter(self):
145        return self._delimiter
146
147    @delimiter.setter
148    def delimiter(self, delimiter):
149        self._delimiter = delimiter
150
151    def encoding_detector(self, file_location) -> str:
152        """
153        Detects the encoding of a file.
154
155        Parameters
156        --------
157        file_location : str
158            The location of the file to be analyzed.
159
160        Returns
161        --------
162        str
163            The detected encoding of the file.
164        """
165
166        with file_location.open("rb") as rawdata:
167            result = chardet.detect(rawdata.read(10000))
168        return result["encoding"]
169
170    def set_data_type(self):
171        """
172        Set the data type and delimiter based on the file extension.
173
174        Raises
175        ------
176        TypeError
177            If the data type could not be automatically recognized.
178        """
179        if self.file_location.suffix == ".csv":
180            self.data_type = "txt"
181            self.delimiter = ","
182        elif self.file_location.suffix == ".txt":
183            self.data_type = "txt"
184            self.delimiter = "\t"
185        elif self.file_location.suffix == ".tsv":
186            self.data_type = "txt"
187            self.delimiter = "\t"
188        elif self.file_location.suffix == ".xlsx":
189            self.data_type = "excel"
190        elif self.file_location.suffix == ".ascii":
191            self.data_type = "txt"
192            self.delimiter = "  "
193        elif self.file_location.suffix == ".pkl":
194            self.data_type = "dataframe"
195        elif self.file_location.suffix == ".pks":
196            self.data_type = "pks"
197            self.delimiter = "          "
198            self.header_lines = 9
199        elif self.file_location.suffix == ".xml":
200            self.data_type = "xml"
201            # self.delimiter = None
202            # self.header_lines = None
203        elif self.file_location.suffix == ".xy":
204            self.data_type = "txt"
205            self.delimiter = " "
206            self.header_lines = None
207        else:
208            raise TypeError(
209                "Data type could not be automatically recognized for %s; please set data type and delimiter manually."
210                % self.file_location.name
211            )
212
213    def get_dataframe(self) -> DataFrame:
214        """
215        Get the data as a pandas DataFrame.
216
217        Returns
218        -------
219        pandas.DataFrame
220            The data as a pandas DataFrame.
221
222        Raises
223        ------
224        TypeError
225            If the data type is not supported.
226        """
227
228        if not self.data_type or not self.delimiter:
229            self.set_data_type()
230
231        if isinstance(self.file_location, S3Path):
232            data = BytesIO(self.file_location.open("rb").read())
233        else:
234            data = self.file_location
235
236        if self.data_type == "txt":
237            if self.headerless:
238                dataframe = read_csv(
239                    data,
240                    skiprows=self.header_lines,
241                    delimiter=self.delimiter,
242                    header=None,
243                    names=["m/z", "I"],
244                    encoding=self.encoding_detector(self.file_location),
245                    engine="python",
246                )
247            else:
248                dataframe = read_csv(
249                    data,
250                    skiprows=self.header_lines,
251                    delimiter=self.delimiter,
252                    encoding=self.encoding_detector(self.file_location),
253                    engine="python",
254                )
255
256        elif self.data_type == "pks":
257            names = [
258                "m/z",
259                "I",
260                "Scaled Peak Height",
261                "Resolving Power",
262                "Frequency",
263                "S/N",
264            ]
265            clean_data = []
266            with self.file_location.open() as maglabfile:
267                for i in maglabfile.readlines()[8:-1]:
268                    clean_data.append(i.split())
269            dataframe = DataFrame(clean_data, columns=names)
270
271        elif self.data_type == "dataframe":
272            dataframe = read_pickle(data)
273
274        elif self.data_type == "excel":
275            dataframe = read_excel(data)
276
277        elif self.data_type == "xml":
278            dataframe = self.read_xml_peaks(data)
279
280        else:
281            raise TypeError("Data type %s is not supported" % self.data_type)
282
283        return dataframe
284
285    def load_settings(self, mass_spec_obj, output_parameters):
286        """
287        #TODO loading output parameters from json file is not functional
288        Load settings from a JSON file and apply them to the given mass_spec_obj.
289
290        Parameters
291        ----------
292        mass_spec_obj : MassSpec
293            The mass spectrum object to apply the settings to.
294
295        """
296        import json
297        import warnings
298
299        settings_file_path = self.file_location.with_suffix(".json")
300
301        if settings_file_path.exists():
302            self._parameters = load_and_set_parameters_class(
303                "DataInput", self._parameters, parameters_path=settings_file_path
304            )
305
306            load_and_set_parameters_ms(
307                mass_spec_obj, parameters_path=settings_file_path
308            )
309
310        else:
311            warnings.warn(
312                "auto settings loading is enabled but could not locate the file:  %s. Please load the settings manually"
313                % settings_file_path
314            )
315
316        # TODO this will load the setting from SettingCoreMS.json
317        # coreMSHFD5 overrides this function to import the attrs stored in the h5 file
318        # loaded_settings = {}
319        # loaded_settings['MoleculaSearch'] = self.get_scan_group_attr_data(scan_index,  time_index, 'MoleculaSearchSetting')
320        # loaded_settings['MassSpecPeak'] = self.get_scan_group_attr_data(scan_index,  time_index, 'MassSpecPeakSetting')
321
322        # loaded_settings['MassSpectrum'] = self.get_scan_group_attr_data(scan_index, time_index, 'MassSpectrumSetting')
323        # loaded_settings['Transient'] = self.get_scan_group_attr_data(scan_index, time_index, 'TransientSetting')
324
325    def get_output_parameters(self, polarity: int, scan_index: int = 0) -> dict:
326        """
327        Get the output parameters for the mass spectrum.
328
329        Parameters
330        ----------
331        polarity : int
332            The polarity of the mass spectrum +1 or -1.
333        scan_index : int, optional
334            The index of the scan. Default is 0.
335
336        Returns
337        -------
338        dict
339            A dictionary containing the output parameters.
340
341        """
342        from copy import deepcopy
343
344        output_parameters = default_parameters(self.file_location)
345
346        if self.isCentroid:
347            output_parameters["label"] = Labels.corems_centroid
348        else:
349            output_parameters["label"] = Labels.bruker_profile
350
351        output_parameters["analyzer"] = self.analyzer
352
353        output_parameters["instrument_label"] = self.instrument_label
354
355        output_parameters["sample_name"] = self.sample_name
356
357        output_parameters["Aterm"] = None
358
359        output_parameters["Bterm"] = None
360
361        output_parameters["Cterm"] = None
362
363        output_parameters["polarity"] = polarity
364
365        # scan_number and rt will be need to lc ms====
366
367        output_parameters["mobility_scan"] = 0
368
369        output_parameters["mobility_rt"] = 0
370
371        output_parameters["scan_number"] = scan_index
372
373        output_parameters["rt"] = 0
374
375        return output_parameters
376
377    def clean_data_frame(self, dataframe):
378        """
379        Clean the input dataframe by removing columns that are not expected.
380
381        Parameters
382        ----------
383        pandas.DataFrame
384            The input dataframe to be cleaned.
385
386        """
387
388        for column_name in dataframe.columns:
389            expected_column_name = self.parameters.header_translate.get(column_name)
390            if expected_column_name not in self._expected_columns:
391                del dataframe[column_name]
392
393    def check_columns(self, header_labels: list[str]):
394        """
395        Check if the given header labels match the expected columns.
396
397        Parameters
398        ----------
399        header_labels : list
400            The header labels to be checked.
401
402        Raises
403        ------
404        Exception
405            If any expected column is not found in the header labels.
406        """
407        found_label = set()
408
409        for label in header_labels:
410            if not label in self._expected_columns:
411                user_column_name = self.parameters.header_translate.get(label)
412                if user_column_name in self._expected_columns:
413                    found_label.add(user_column_name)
414            else:
415                found_label.add(label)
416
417        not_found = self._expected_columns - found_label
418
419        if len(not_found) > 0:
420            raise Exception(
421                "Please make sure to include the columns %s" % ", ".join(not_found)
422            )
423
424    def read_xml_peaks(self, data: str) -> DataFrame:
425        """
426        Read peaks from a Bruker .xml file and return a pandas DataFrame.
427
428        Parameters
429        ----------
430        data : str
431            The path to the .xml file.
432
433        Returns
434        -------
435        pandas.DataFrame
436            A DataFrame containing the peak data with columns: 'm/z', 'I', 'Resolving Power', 'Area', 'S/N', 'fwhm'.
437        """
438        from numpy import nan
439
440        with open(data, "r") as file:
441            content = file.readlines()
442            content = "".join(content)
443            bs_content = BeautifulSoup(content, features="xml")
444        peaks_xml = bs_content.find_all("pk")
445
446        # initialise lists of the peak variables
447        areas = []
448        fwhms = []
449        intensities = []
450        mzs = []
451        res = []
452        sn = []
453        # iterate through the peaks appending to each list
454        for peak in peaks_xml:
455            areas.append(
456                float(peak.get("a", nan))
457            )  # Use a default value if key 'a' is missing
458            fwhms.append(
459                float(peak.get("fwhm", nan))
460            )  # Use a default value if key 'fwhm' is missing
461            intensities.append(
462                float(peak.get("i", nan))
463            )  # Use a default value if key 'i' is missing
464            mzs.append(
465                float(peak.get("mz", nan))
466            )  # Use a default value if key 'mz' is missing
467            res.append(
468                float(peak.get("res", nan))
469            )  # Use a default value if key 'res' is missing
470            sn.append(
471                float(peak.get("sn", nan))
472            )  # Use a default value if key 'sn' is missing
473
474        # Compile pandas dataframe of these values
475        names = ["m/z", "I", "Resolving Power", "Area", "S/N", "fwhm"]
476        df = DataFrame(columns=names, dtype=float)
477        df["m/z"] = mzs
478        df["I"] = intensities
479        df["Resolving Power"] = res
480        df["Area"] = areas
481        df["S/N"] = sn
482        df["fwhm"] = fwhms
483        return df
484
485    def get_xml_polarity(self):
486        """
487        Get the polarity from an XML peaklist.
488
489        Returns
490        -------
491        int
492            The polarity of the XML peaklist. Returns -1 for negative polarity, +1 for positive polarity.
493
494        Raises
495        ------
496        Exception
497            If the data type is not XML peaklist in Bruker format or if the polarity is unhandled.
498        """
499
500        # Check its an actual xml
501        if not self.data_type or not self.delimiter:
502            self.set_data_type()
503
504        if isinstance(self.file_location, S3Path):
505            # data = self.file_location.open('rb').read()
506            data = BytesIO(self.file_location.open("rb").read())
507
508        else:
509            data = self.file_location
510
511        if self.data_type != "xml":
512            raise Exception("This function is only for XML peaklists (Bruker format)")
513
514        with open(data, "r") as file:
515            content = file.readlines()
516            content = "".join(content)
517            bs_content = BeautifulSoup(content, features="xml")
518        polarity = bs_content.find_all("ms_spectrum")[0]["polarity"]
519        if polarity == "-":
520            return -1
521        elif polarity == "+":
522            return +1
523        else:
524            raise Exception("Polarity %s unhandled" % polarity)
class MassListBaseClass:
 25class MassListBaseClass:
 26    """The MassListBaseClass object reads mass list data types and returns the mass spectrum obj
 27
 28    Parameters
 29    ----------
 30    file_location : Path or S3Path
 31        Full data path.
 32    isCentroid : bool, optional
 33        Determines the mass spectrum data structure. If set to True, it assumes centroid mode. If set to False, it assumes profile mode and attempts to peak pick. Default is True.
 34    analyzer : str, optional
 35        The analyzer used for the mass spectrum. Default is 'Unknown'.
 36    instrument_label : str, optional
 37        The label of the instrument used for the mass spectrum. Default is 'Unknown'.
 38    sample_name : str, optional
 39        The name of the sample. Default is None.
 40    header_lines : int, optional
 41        The number of lines to skip in the file, including the column labels line. Default is 0.
 42    isThermoProfile : bool, optional
 43        Determines the number of expected columns in the file. If set to True, only m/z and intensity columns are expected. Signal-to-noise ratio (S/N) and resolving power (RP) will be calculated based on the data. Default is False.
 44    headerless : bool, optional
 45        If True, assumes that there are no headers present in the file (e.g., a .xy file from Bruker) and assumes two columns: m/z and intensity. Default is False.
 46
 47    Attributes
 48    ----------
 49    parameters : DataInputSetting
 50        The data input settings for the mass spectrum.
 51    data_type : str
 52        The type of data in the file.
 53    delimiter : str
 54        The delimiter used to read text-based files.
 55
 56    Methods
 57    -------
 58    * set_parameter_from_toml(parameters_path). Sets the data input settings from a TOML file.
 59    * set_parameter_from_json(parameters_path). Sets the data input settings from a JSON file.
 60    * get_dataframe(). Reads the file and returns the data as a pandas DataFrame.
 61    * load_settings(mass_spec_obj, output_parameters). Loads the settings for the mass spectrum.
 62    * get_output_parameters(polarity, scan_index=0). Returns the output parameters for the mass spectrum.
 63    * clean_data_frame(dataframe). Cleans the data frame by removing columns that are not in the expected columns set.
 64
 65    """
 66
 67    def __init__(
 68        self,
 69        file_location: Path | S3Path,
 70        isCentroid: bool = True,
 71        analyzer: str = "Unknown",
 72        instrument_label: str = "Unknown",
 73        sample_name: str = None,
 74        header_lines: int = 0,
 75        isThermoProfile: bool = False,
 76        headerless: bool = False,
 77    ):
 78        self.file_location = (
 79            Path(file_location) if isinstance(file_location, str) else file_location
 80        )
 81
 82        if not self.file_location.exists():
 83            raise FileExistsError("File does not exist: %s" % file_location)
 84
 85        # (newline="\n")
 86
 87        self.header_lines = header_lines
 88
 89        if isThermoProfile:
 90            self._expected_columns = {Labels.mz, Labels.abundance}
 91
 92        else:
 93            self._expected_columns = {
 94                Labels.mz,
 95                Labels.abundance,
 96                Labels.s2n,
 97                Labels.rp,
 98            }
 99
100        self._delimiter = None
101
102        self.isCentroid = isCentroid
103
104        self.isThermoProfile = isThermoProfile
105
106        self.headerless = headerless
107
108        self._data_type = None
109
110        self.analyzer = analyzer
111
112        self.instrument_label = instrument_label
113
114        self.sample_name = sample_name
115
116        self._parameters = deepcopy(DataInputSetting())
117
118    @property
119    def parameters(self):
120        return self._parameters
121
122    @parameters.setter
123    def parameters(self, instance_DataInputSetting):
124        self._parameters = instance_DataInputSetting
125
126    def set_parameter_from_toml(self, parameters_path):
127        self._parameters = load_and_set_toml_parameters_class(
128            "DataInput", self.parameters, parameters_path=parameters_path
129        )
130
131    def set_parameter_from_json(self, parameters_path):
132        self._parameters = load_and_set_parameters_class(
133            "DataInput", self.parameters, parameters_path=parameters_path
134        )
135
136    @property
137    def data_type(self):
138        return self._data_type
139
140    @data_type.setter
141    def data_type(self, data_type):
142        self._data_type = data_type
143
144    @property
145    def delimiter(self):
146        return self._delimiter
147
148    @delimiter.setter
149    def delimiter(self, delimiter):
150        self._delimiter = delimiter
151
152    def encoding_detector(self, file_location) -> str:
153        """
154        Detects the encoding of a file.
155
156        Parameters
157        --------
158        file_location : str
159            The location of the file to be analyzed.
160
161        Returns
162        --------
163        str
164            The detected encoding of the file.
165        """
166
167        with file_location.open("rb") as rawdata:
168            result = chardet.detect(rawdata.read(10000))
169        return result["encoding"]
170
171    def set_data_type(self):
172        """
173        Set the data type and delimiter based on the file extension.
174
175        Raises
176        ------
177        TypeError
178            If the data type could not be automatically recognized.
179        """
180        if self.file_location.suffix == ".csv":
181            self.data_type = "txt"
182            self.delimiter = ","
183        elif self.file_location.suffix == ".txt":
184            self.data_type = "txt"
185            self.delimiter = "\t"
186        elif self.file_location.suffix == ".tsv":
187            self.data_type = "txt"
188            self.delimiter = "\t"
189        elif self.file_location.suffix == ".xlsx":
190            self.data_type = "excel"
191        elif self.file_location.suffix == ".ascii":
192            self.data_type = "txt"
193            self.delimiter = "  "
194        elif self.file_location.suffix == ".pkl":
195            self.data_type = "dataframe"
196        elif self.file_location.suffix == ".pks":
197            self.data_type = "pks"
198            self.delimiter = "          "
199            self.header_lines = 9
200        elif self.file_location.suffix == ".xml":
201            self.data_type = "xml"
202            # self.delimiter = None
203            # self.header_lines = None
204        elif self.file_location.suffix == ".xy":
205            self.data_type = "txt"
206            self.delimiter = " "
207            self.header_lines = None
208        else:
209            raise TypeError(
210                "Data type could not be automatically recognized for %s; please set data type and delimiter manually."
211                % self.file_location.name
212            )
213
214    def get_dataframe(self) -> DataFrame:
215        """
216        Get the data as a pandas DataFrame.
217
218        Returns
219        -------
220        pandas.DataFrame
221            The data as a pandas DataFrame.
222
223        Raises
224        ------
225        TypeError
226            If the data type is not supported.
227        """
228
229        if not self.data_type or not self.delimiter:
230            self.set_data_type()
231
232        if isinstance(self.file_location, S3Path):
233            data = BytesIO(self.file_location.open("rb").read())
234        else:
235            data = self.file_location
236
237        if self.data_type == "txt":
238            if self.headerless:
239                dataframe = read_csv(
240                    data,
241                    skiprows=self.header_lines,
242                    delimiter=self.delimiter,
243                    header=None,
244                    names=["m/z", "I"],
245                    encoding=self.encoding_detector(self.file_location),
246                    engine="python",
247                )
248            else:
249                dataframe = read_csv(
250                    data,
251                    skiprows=self.header_lines,
252                    delimiter=self.delimiter,
253                    encoding=self.encoding_detector(self.file_location),
254                    engine="python",
255                )
256
257        elif self.data_type == "pks":
258            names = [
259                "m/z",
260                "I",
261                "Scaled Peak Height",
262                "Resolving Power",
263                "Frequency",
264                "S/N",
265            ]
266            clean_data = []
267            with self.file_location.open() as maglabfile:
268                for i in maglabfile.readlines()[8:-1]:
269                    clean_data.append(i.split())
270            dataframe = DataFrame(clean_data, columns=names)
271
272        elif self.data_type == "dataframe":
273            dataframe = read_pickle(data)
274
275        elif self.data_type == "excel":
276            dataframe = read_excel(data)
277
278        elif self.data_type == "xml":
279            dataframe = self.read_xml_peaks(data)
280
281        else:
282            raise TypeError("Data type %s is not supported" % self.data_type)
283
284        return dataframe
285
286    def load_settings(self, mass_spec_obj, output_parameters):
287        """
288        #TODO loading output parameters from json file is not functional
289        Load settings from a JSON file and apply them to the given mass_spec_obj.
290
291        Parameters
292        ----------
293        mass_spec_obj : MassSpec
294            The mass spectrum object to apply the settings to.
295
296        """
297        import json
298        import warnings
299
300        settings_file_path = self.file_location.with_suffix(".json")
301
302        if settings_file_path.exists():
303            self._parameters = load_and_set_parameters_class(
304                "DataInput", self._parameters, parameters_path=settings_file_path
305            )
306
307            load_and_set_parameters_ms(
308                mass_spec_obj, parameters_path=settings_file_path
309            )
310
311        else:
312            warnings.warn(
313                "auto settings loading is enabled but could not locate the file:  %s. Please load the settings manually"
314                % settings_file_path
315            )
316
317        # TODO this will load the setting from SettingCoreMS.json
318        # coreMSHFD5 overrides this function to import the attrs stored in the h5 file
319        # loaded_settings = {}
320        # loaded_settings['MoleculaSearch'] = self.get_scan_group_attr_data(scan_index,  time_index, 'MoleculaSearchSetting')
321        # loaded_settings['MassSpecPeak'] = self.get_scan_group_attr_data(scan_index,  time_index, 'MassSpecPeakSetting')
322
323        # loaded_settings['MassSpectrum'] = self.get_scan_group_attr_data(scan_index, time_index, 'MassSpectrumSetting')
324        # loaded_settings['Transient'] = self.get_scan_group_attr_data(scan_index, time_index, 'TransientSetting')
325
326    def get_output_parameters(self, polarity: int, scan_index: int = 0) -> dict:
327        """
328        Get the output parameters for the mass spectrum.
329
330        Parameters
331        ----------
332        polarity : int
333            The polarity of the mass spectrum +1 or -1.
334        scan_index : int, optional
335            The index of the scan. Default is 0.
336
337        Returns
338        -------
339        dict
340            A dictionary containing the output parameters.
341
342        """
343        from copy import deepcopy
344
345        output_parameters = default_parameters(self.file_location)
346
347        if self.isCentroid:
348            output_parameters["label"] = Labels.corems_centroid
349        else:
350            output_parameters["label"] = Labels.bruker_profile
351
352        output_parameters["analyzer"] = self.analyzer
353
354        output_parameters["instrument_label"] = self.instrument_label
355
356        output_parameters["sample_name"] = self.sample_name
357
358        output_parameters["Aterm"] = None
359
360        output_parameters["Bterm"] = None
361
362        output_parameters["Cterm"] = None
363
364        output_parameters["polarity"] = polarity
365
366        # scan_number and rt will be need to lc ms====
367
368        output_parameters["mobility_scan"] = 0
369
370        output_parameters["mobility_rt"] = 0
371
372        output_parameters["scan_number"] = scan_index
373
374        output_parameters["rt"] = 0
375
376        return output_parameters
377
378    def clean_data_frame(self, dataframe):
379        """
380        Clean the input dataframe by removing columns that are not expected.
381
382        Parameters
383        ----------
384        pandas.DataFrame
385            The input dataframe to be cleaned.
386
387        """
388
389        for column_name in dataframe.columns:
390            expected_column_name = self.parameters.header_translate.get(column_name)
391            if expected_column_name not in self._expected_columns:
392                del dataframe[column_name]
393
394    def check_columns(self, header_labels: list[str]):
395        """
396        Check if the given header labels match the expected columns.
397
398        Parameters
399        ----------
400        header_labels : list
401            The header labels to be checked.
402
403        Raises
404        ------
405        Exception
406            If any expected column is not found in the header labels.
407        """
408        found_label = set()
409
410        for label in header_labels:
411            if not label in self._expected_columns:
412                user_column_name = self.parameters.header_translate.get(label)
413                if user_column_name in self._expected_columns:
414                    found_label.add(user_column_name)
415            else:
416                found_label.add(label)
417
418        not_found = self._expected_columns - found_label
419
420        if len(not_found) > 0:
421            raise Exception(
422                "Please make sure to include the columns %s" % ", ".join(not_found)
423            )
424
425    def read_xml_peaks(self, data: str) -> DataFrame:
426        """
427        Read peaks from a Bruker .xml file and return a pandas DataFrame.
428
429        Parameters
430        ----------
431        data : str
432            The path to the .xml file.
433
434        Returns
435        -------
436        pandas.DataFrame
437            A DataFrame containing the peak data with columns: 'm/z', 'I', 'Resolving Power', 'Area', 'S/N', 'fwhm'.
438        """
439        from numpy import nan
440
441        with open(data, "r") as file:
442            content = file.readlines()
443            content = "".join(content)
444            bs_content = BeautifulSoup(content, features="xml")
445        peaks_xml = bs_content.find_all("pk")
446
447        # initialise lists of the peak variables
448        areas = []
449        fwhms = []
450        intensities = []
451        mzs = []
452        res = []
453        sn = []
454        # iterate through the peaks appending to each list
455        for peak in peaks_xml:
456            areas.append(
457                float(peak.get("a", nan))
458            )  # Use a default value if key 'a' is missing
459            fwhms.append(
460                float(peak.get("fwhm", nan))
461            )  # Use a default value if key 'fwhm' is missing
462            intensities.append(
463                float(peak.get("i", nan))
464            )  # Use a default value if key 'i' is missing
465            mzs.append(
466                float(peak.get("mz", nan))
467            )  # Use a default value if key 'mz' is missing
468            res.append(
469                float(peak.get("res", nan))
470            )  # Use a default value if key 'res' is missing
471            sn.append(
472                float(peak.get("sn", nan))
473            )  # Use a default value if key 'sn' is missing
474
475        # Compile pandas dataframe of these values
476        names = ["m/z", "I", "Resolving Power", "Area", "S/N", "fwhm"]
477        df = DataFrame(columns=names, dtype=float)
478        df["m/z"] = mzs
479        df["I"] = intensities
480        df["Resolving Power"] = res
481        df["Area"] = areas
482        df["S/N"] = sn
483        df["fwhm"] = fwhms
484        return df
485
486    def get_xml_polarity(self):
487        """
488        Get the polarity from an XML peaklist.
489
490        Returns
491        -------
492        int
493            The polarity of the XML peaklist. Returns -1 for negative polarity, +1 for positive polarity.
494
495        Raises
496        ------
497        Exception
498            If the data type is not XML peaklist in Bruker format or if the polarity is unhandled.
499        """
500
501        # Check its an actual xml
502        if not self.data_type or not self.delimiter:
503            self.set_data_type()
504
505        if isinstance(self.file_location, S3Path):
506            # data = self.file_location.open('rb').read()
507            data = BytesIO(self.file_location.open("rb").read())
508
509        else:
510            data = self.file_location
511
512        if self.data_type != "xml":
513            raise Exception("This function is only for XML peaklists (Bruker format)")
514
515        with open(data, "r") as file:
516            content = file.readlines()
517            content = "".join(content)
518            bs_content = BeautifulSoup(content, features="xml")
519        polarity = bs_content.find_all("ms_spectrum")[0]["polarity"]
520        if polarity == "-":
521            return -1
522        elif polarity == "+":
523            return +1
524        else:
525            raise Exception("Polarity %s unhandled" % polarity)

The MassListBaseClass object reads mass list data types and returns the mass spectrum obj

Parameters
  • file_location (Path or S3Path): Full data path.
  • isCentroid (bool, optional): Determines the mass spectrum data structure. If set to True, it assumes centroid mode. If set to False, it assumes profile mode and attempts to peak pick. Default is True.
  • analyzer (str, optional): The analyzer used for the mass spectrum. Default is 'Unknown'.
  • instrument_label (str, optional): The label of the instrument used for the mass spectrum. Default is 'Unknown'.
  • sample_name (str, optional): The name of the sample. Default is None.
  • header_lines (int, optional): The number of lines to skip in the file, including the column labels line. Default is 0.
  • isThermoProfile (bool, optional): Determines the number of expected columns in the file. If set to True, only m/z and intensity columns are expected. Signal-to-noise ratio (S/N) and resolving power (RP) will be calculated based on the data. Default is False.
  • headerless (bool, optional): If True, assumes that there are no headers present in the file (e.g., a .xy file from Bruker) and assumes two columns: m/z and intensity. Default is False.
Attributes
  • parameters (DataInputSetting): The data input settings for the mass spectrum.
  • data_type (str): The type of data in the file.
  • delimiter (str): The delimiter used to read text-based files.
Methods
  • set_parameter_from_toml(parameters_path). Sets the data input settings from a TOML file.
  • set_parameter_from_json(parameters_path). Sets the data input settings from a JSON file.
  • get_dataframe(). Reads the file and returns the data as a pandas DataFrame.
  • load_settings(mass_spec_obj, output_parameters). Loads the settings for the mass spectrum.
  • get_output_parameters(polarity, scan_index=0). Returns the output parameters for the mass spectrum.
  • clean_data_frame(dataframe). Cleans the data frame by removing columns that are not in the expected columns set.
MassListBaseClass( file_location: pathlib.Path | s3path.S3Path, isCentroid: bool = True, analyzer: str = 'Unknown', instrument_label: str = 'Unknown', sample_name: str = None, header_lines: int = 0, isThermoProfile: bool = False, headerless: bool = False)
 67    def __init__(
 68        self,
 69        file_location: Path | S3Path,
 70        isCentroid: bool = True,
 71        analyzer: str = "Unknown",
 72        instrument_label: str = "Unknown",
 73        sample_name: str = None,
 74        header_lines: int = 0,
 75        isThermoProfile: bool = False,
 76        headerless: bool = False,
 77    ):
 78        self.file_location = (
 79            Path(file_location) if isinstance(file_location, str) else file_location
 80        )
 81
 82        if not self.file_location.exists():
 83            raise FileExistsError("File does not exist: %s" % file_location)
 84
 85        # (newline="\n")
 86
 87        self.header_lines = header_lines
 88
 89        if isThermoProfile:
 90            self._expected_columns = {Labels.mz, Labels.abundance}
 91
 92        else:
 93            self._expected_columns = {
 94                Labels.mz,
 95                Labels.abundance,
 96                Labels.s2n,
 97                Labels.rp,
 98            }
 99
100        self._delimiter = None
101
102        self.isCentroid = isCentroid
103
104        self.isThermoProfile = isThermoProfile
105
106        self.headerless = headerless
107
108        self._data_type = None
109
110        self.analyzer = analyzer
111
112        self.instrument_label = instrument_label
113
114        self.sample_name = sample_name
115
116        self._parameters = deepcopy(DataInputSetting())
file_location
header_lines
isCentroid
isThermoProfile
headerless
analyzer
instrument_label
sample_name
parameters
def set_parameter_from_toml(self, parameters_path):
126    def set_parameter_from_toml(self, parameters_path):
127        self._parameters = load_and_set_toml_parameters_class(
128            "DataInput", self.parameters, parameters_path=parameters_path
129        )
def set_parameter_from_json(self, parameters_path):
131    def set_parameter_from_json(self, parameters_path):
132        self._parameters = load_and_set_parameters_class(
133            "DataInput", self.parameters, parameters_path=parameters_path
134        )
data_type
delimiter
def encoding_detector(self, file_location) -> str:
152    def encoding_detector(self, file_location) -> str:
153        """
154        Detects the encoding of a file.
155
156        Parameters
157        --------
158        file_location : str
159            The location of the file to be analyzed.
160
161        Returns
162        --------
163        str
164            The detected encoding of the file.
165        """
166
167        with file_location.open("rb") as rawdata:
168            result = chardet.detect(rawdata.read(10000))
169        return result["encoding"]

Detects the encoding of a file.

Parameters
  • file_location (Path or S3Path): The location of the file to be analyzed; must support `.open("rb")`.
Returns
  • str: The detected encoding of the file.
def set_data_type(self):
171    def set_data_type(self):
172        """
173        Set the data type and delimiter based on the file extension.
174
175        Raises
176        ------
177        TypeError
178            If the data type could not be automatically recognized.
179        """
180        if self.file_location.suffix == ".csv":
181            self.data_type = "txt"
182            self.delimiter = ","
183        elif self.file_location.suffix == ".txt":
184            self.data_type = "txt"
185            self.delimiter = "\t"
186        elif self.file_location.suffix == ".tsv":
187            self.data_type = "txt"
188            self.delimiter = "\t"
189        elif self.file_location.suffix == ".xlsx":
190            self.data_type = "excel"
191        elif self.file_location.suffix == ".ascii":
192            self.data_type = "txt"
193            self.delimiter = "  "
194        elif self.file_location.suffix == ".pkl":
195            self.data_type = "dataframe"
196        elif self.file_location.suffix == ".pks":
197            self.data_type = "pks"
198            self.delimiter = "          "
199            self.header_lines = 9
200        elif self.file_location.suffix == ".xml":
201            self.data_type = "xml"
202            # self.delimiter = None
203            # self.header_lines = None
204        elif self.file_location.suffix == ".xy":
205            self.data_type = "txt"
206            self.delimiter = " "
207            self.header_lines = None
208        else:
209            raise TypeError(
210                "Data type could not be automatically recognized for %s; please set data type and delimiter manually."
211                % self.file_location.name
212            )

Set the data type and delimiter based on the file extension.

Raises
  • TypeError: If the data type could not be automatically recognized.
def get_dataframe(self) -> pandas.core.frame.DataFrame:
    def get_dataframe(self) -> DataFrame:
        """
        Get the data as a pandas DataFrame.

        Returns
        -------
        pandas.DataFrame
            The data as a pandas DataFrame.

        Raises
        ------
        TypeError
            If the data type is not supported.
        """

        # Infer data_type/delimiter from the file suffix if not yet set.
        # NOTE(review): for delimiter-less types (excel, xml, pkl) delimiter
        # stays falsy, so this re-runs on every call -- harmless but wasteful.
        if not self.data_type or not self.delimiter:
            self.set_data_type()

        # S3 objects are read fully into memory; pandas readers accept BytesIO.
        if isinstance(self.file_location, S3Path):
            data = BytesIO(self.file_location.open("rb").read())
        else:
            data = self.file_location

        if self.data_type == "txt":
            if self.headerless:
                # No header row present: assume two columns, m/z and intensity.
                dataframe = read_csv(
                    data,
                    skiprows=self.header_lines,
                    delimiter=self.delimiter,
                    header=None,
                    names=["m/z", "I"],
                    encoding=self.encoding_detector(self.file_location),
                    engine="python",
                )
            else:
                dataframe = read_csv(
                    data,
                    skiprows=self.header_lines,
                    delimiter=self.delimiter,
                    encoding=self.encoding_detector(self.file_location),
                    engine="python",
                )

        elif self.data_type == "pks":
            # MagLab .pks format: fixed column layout; the first 8 lines and
            # the trailing line are discarded below.
            names = [
                "m/z",
                "I",
                "Scaled Peak Height",
                "Resolving Power",
                "Frequency",
                "S/N",
            ]
            clean_data = []
            with self.file_location.open() as maglabfile:
                # NOTE(review): values are kept as strings here, not converted
                # to float -- confirm downstream consumers handle the dtypes.
                for i in maglabfile.readlines()[8:-1]:
                    clean_data.append(i.split())
            dataframe = DataFrame(clean_data, columns=names)

        elif self.data_type == "dataframe":
            dataframe = read_pickle(data)

        elif self.data_type == "excel":
            dataframe = read_excel(data)

        elif self.data_type == "xml":
            # Bruker XML peaklist, parsed with BeautifulSoup.
            dataframe = self.read_xml_peaks(data)

        else:
            raise TypeError("Data type %s is not supported" % self.data_type)

        return dataframe

Get the data as a pandas DataFrame.

Returns
  • pandas.DataFrame: The data as a pandas DataFrame.
Raises
  • TypeError: If the data type is not supported.
def load_settings(self, mass_spec_obj, output_parameters):
286    def load_settings(self, mass_spec_obj, output_parameters):
287        """
288        #TODO loading output parameters from json file is not functional
289        Load settings from a JSON file and apply them to the given mass_spec_obj.
290
291        Parameters
292        ----------
293        mass_spec_obj : MassSpec
294            The mass spectrum object to apply the settings to.
295
296        """
297        import json
298        import warnings
299
300        settings_file_path = self.file_location.with_suffix(".json")
301
302        if settings_file_path.exists():
303            self._parameters = load_and_set_parameters_class(
304                "DataInput", self._parameters, parameters_path=settings_file_path
305            )
306
307            load_and_set_parameters_ms(
308                mass_spec_obj, parameters_path=settings_file_path
309            )
310
311        else:
312            warnings.warn(
313                "auto settings loading is enabled but could not locate the file:  %s. Please load the settings manually"
314                % settings_file_path
315            )
316
317        # TODO this will load the setting from SettingCoreMS.json
318        # coreMSHFD5 overrides this function to import the attrs stored in the h5 file
319        # loaded_settings = {}
320        # loaded_settings['MoleculaSearch'] = self.get_scan_group_attr_data(scan_index,  time_index, 'MoleculaSearchSetting')
321        # loaded_settings['MassSpecPeak'] = self.get_scan_group_attr_data(scan_index,  time_index, 'MassSpecPeakSetting')
322
323        # loaded_settings['MassSpectrum'] = self.get_scan_group_attr_data(scan_index, time_index, 'MassSpectrumSetting')
324        # loaded_settings['Transient'] = self.get_scan_group_attr_data(scan_index, time_index, 'TransientSetting')

TODO loading output parameters from json file is not functional

Load settings from a JSON file and apply them to the given mass_spec_obj.

Parameters
  • mass_spec_obj (MassSpec): The mass spectrum object to apply the settings to.
def get_output_parameters(self, polarity: int, scan_index: int = 0) -> dict:
326    def get_output_parameters(self, polarity: int, scan_index: int = 0) -> dict:
327        """
328        Get the output parameters for the mass spectrum.
329
330        Parameters
331        ----------
332        polarity : int
333            The polarity of the mass spectrum +1 or -1.
334        scan_index : int, optional
335            The index of the scan. Default is 0.
336
337        Returns
338        -------
339        dict
340            A dictionary containing the output parameters.
341
342        """
343        from copy import deepcopy
344
345        output_parameters = default_parameters(self.file_location)
346
347        if self.isCentroid:
348            output_parameters["label"] = Labels.corems_centroid
349        else:
350            output_parameters["label"] = Labels.bruker_profile
351
352        output_parameters["analyzer"] = self.analyzer
353
354        output_parameters["instrument_label"] = self.instrument_label
355
356        output_parameters["sample_name"] = self.sample_name
357
358        output_parameters["Aterm"] = None
359
360        output_parameters["Bterm"] = None
361
362        output_parameters["Cterm"] = None
363
364        output_parameters["polarity"] = polarity
365
366        # scan_number and rt will be need to lc ms====
367
368        output_parameters["mobility_scan"] = 0
369
370        output_parameters["mobility_rt"] = 0
371
372        output_parameters["scan_number"] = scan_index
373
374        output_parameters["rt"] = 0
375
376        return output_parameters

Get the output parameters for the mass spectrum.

Parameters
  • polarity (int): The polarity of the mass spectrum +1 or -1.
  • scan_index (int, optional): The index of the scan. Default is 0.
Returns
  • dict: A dictionary containing the output parameters.
def clean_data_frame(self, dataframe):
378    def clean_data_frame(self, dataframe):
379        """
380        Clean the input dataframe by removing columns that are not expected.
381
382        Parameters
383        ----------
384        pandas.DataFrame
385            The input dataframe to be cleaned.
386
387        """
388
389        for column_name in dataframe.columns:
390            expected_column_name = self.parameters.header_translate.get(column_name)
391            if expected_column_name not in self._expected_columns:
392                del dataframe[column_name]

Clean the input dataframe by removing columns that are not expected.

Parameters
  • pandas.DataFrame: The input dataframe to be cleaned.
def check_columns(self, header_labels: list[str]):
394    def check_columns(self, header_labels: list[str]):
395        """
396        Check if the given header labels match the expected columns.
397
398        Parameters
399        ----------
400        header_labels : list
401            The header labels to be checked.
402
403        Raises
404        ------
405        Exception
406            If any expected column is not found in the header labels.
407        """
408        found_label = set()
409
410        for label in header_labels:
411            if not label in self._expected_columns:
412                user_column_name = self.parameters.header_translate.get(label)
413                if user_column_name in self._expected_columns:
414                    found_label.add(user_column_name)
415            else:
416                found_label.add(label)
417
418        not_found = self._expected_columns - found_label
419
420        if len(not_found) > 0:
421            raise Exception(
422                "Please make sure to include the columns %s" % ", ".join(not_found)
423            )

Check if the given header labels match the expected columns.

Parameters
  • header_labels (list): The header labels to be checked.
Raises
  • Exception: If any expected column is not found in the header labels.
def read_xml_peaks(self, data: str) -> pandas.core.frame.DataFrame:
425    def read_xml_peaks(self, data: str) -> DataFrame:
426        """
427        Read peaks from a Bruker .xml file and return a pandas DataFrame.
428
429        Parameters
430        ----------
431        data : str
432            The path to the .xml file.
433
434        Returns
435        -------
436        pandas.DataFrame
437            A DataFrame containing the peak data with columns: 'm/z', 'I', 'Resolving Power', 'Area', 'S/N', 'fwhm'.
438        """
439        from numpy import nan
440
441        with open(data, "r") as file:
442            content = file.readlines()
443            content = "".join(content)
444            bs_content = BeautifulSoup(content, features="xml")
445        peaks_xml = bs_content.find_all("pk")
446
447        # initialise lists of the peak variables
448        areas = []
449        fwhms = []
450        intensities = []
451        mzs = []
452        res = []
453        sn = []
454        # iterate through the peaks appending to each list
455        for peak in peaks_xml:
456            areas.append(
457                float(peak.get("a", nan))
458            )  # Use a default value if key 'a' is missing
459            fwhms.append(
460                float(peak.get("fwhm", nan))
461            )  # Use a default value if key 'fwhm' is missing
462            intensities.append(
463                float(peak.get("i", nan))
464            )  # Use a default value if key 'i' is missing
465            mzs.append(
466                float(peak.get("mz", nan))
467            )  # Use a default value if key 'mz' is missing
468            res.append(
469                float(peak.get("res", nan))
470            )  # Use a default value if key 'res' is missing
471            sn.append(
472                float(peak.get("sn", nan))
473            )  # Use a default value if key 'sn' is missing
474
475        # Compile pandas dataframe of these values
476        names = ["m/z", "I", "Resolving Power", "Area", "S/N", "fwhm"]
477        df = DataFrame(columns=names, dtype=float)
478        df["m/z"] = mzs
479        df["I"] = intensities
480        df["Resolving Power"] = res
481        df["Area"] = areas
482        df["S/N"] = sn
483        df["fwhm"] = fwhms
484        return df

Read peaks from a Bruker .xml file and return a pandas DataFrame.

Parameters
  • data (str): The path to the .xml file.
Returns
  • pandas.DataFrame: A DataFrame containing the peak data with columns: 'm/z', 'I', 'Resolving Power', 'Area', 'S/N', 'fwhm'.
def get_xml_polarity(self):
486    def get_xml_polarity(self):
487        """
488        Get the polarity from an XML peaklist.
489
490        Returns
491        -------
492        int
493            The polarity of the XML peaklist. Returns -1 for negative polarity, +1 for positive polarity.
494
495        Raises
496        ------
497        Exception
498            If the data type is not XML peaklist in Bruker format or if the polarity is unhandled.
499        """
500
501        # Check its an actual xml
502        if not self.data_type or not self.delimiter:
503            self.set_data_type()
504
505        if isinstance(self.file_location, S3Path):
506            # data = self.file_location.open('rb').read()
507            data = BytesIO(self.file_location.open("rb").read())
508
509        else:
510            data = self.file_location
511
512        if self.data_type != "xml":
513            raise Exception("This function is only for XML peaklists (Bruker format)")
514
515        with open(data, "r") as file:
516            content = file.readlines()
517            content = "".join(content)
518            bs_content = BeautifulSoup(content, features="xml")
519        polarity = bs_content.find_all("ms_spectrum")[0]["polarity"]
520        if polarity == "-":
521            return -1
522        elif polarity == "+":
523            return +1
524        else:
525            raise Exception("Polarity %s unhandled" % polarity)

Get the polarity from an XML peaklist.

Returns
  • int: The polarity of the XML peaklist. Returns -1 for negative polarity, +1 for positive polarity.
Raises
  • Exception: If the data type is not XML peaklist in Bruker format or if the polarity is unhandled.