corems.mass_spectra.input.mzml
1from collections import defaultdict 2from pathlib import Path 3 4import numpy as np 5import pandas as pd 6import pymzml 7import datetime 8 9from corems.encapsulation.constant import Labels 10from corems.encapsulation.factory.parameters import default_parameters 11from corems.mass_spectra.factory.lc_class import LCMSBase, MassSpectraBase 12from corems.mass_spectra.input.parserbase import SpectraParserInterface 13from corems.mass_spectrum.factory.MassSpectrumClasses import ( 14 MassSpecCentroid, 15 MassSpecProfile, 16) 17 18 19class MZMLSpectraParser(SpectraParserInterface): 20 """A class for parsing mzml spectrometry data files into MassSpectraBase or LCMSBase objects 21 22 Parameters 23 ---------- 24 file_location : str or Path 25 The path to the RAW file to be parsed. 26 analyzer : str, optional 27 The type of mass analyzer used in the instrument. Default is "Unknown". 28 instrument_label : str, optional 29 The name of the instrument used to acquire the data. Default is "Unknown". 30 sample_name : str, optional 31 The name of the sample being analyzed. If not provided, the stem of the file_location path will be used. 32 33 Attributes 34 ---------- 35 file_location : Path 36 The path to the RAW file being parsed. 37 analyzer : str 38 The type of mass analyzer used in the instrument. 39 instrument_label : str 40 The name of the instrument used to acquire the data. 41 sample_name : str 42 The name of the sample being analyzed. 43 44 Methods 45 ------- 46 * load(). 47 Load mzML file using pymzml.run.Reader and return the data as a numpy array. 48 * run(spectra=True). 49 Parses the mzml file and returns a dictionary of mass spectra dataframes and a scan metadata dataframe. 50 * get_mass_spectrum_from_scan(scan_number, polarity, auto_process=True) 51 Parses the mzml file and returns a MassSpecBase object from a single scan. 52 * get_mass_spectra_obj(). 53 Parses the mzml file and instantiates a MassSpectraBase object. 54 * get_lcms_obj(). 55 Parses the mzml file and instantiates an LCMSBase object. 56 * get_instrument_info(). 57 Return instrument information from the mzML file. 58 * get_creation_time(). 59 Return the creation time of the mzML file as a datetime object. 60 61 Inherits from ThermoBaseClass and SpectraParserInterface 62 """ 63 64 def __init__( 65 self, 66 file_location, 67 analyzer="Unknown", 68 instrument_label="Unknown", 69 sample_name=None, 70 ): 71 # implementation details 72 if isinstance(file_location, str): 73 # if obj is a string it defaults to create a Path obj, pass the S3Path if needed 74 file_location = Path(file_location) 75 if not file_location.exists(): 76 raise FileExistsError("File does not exist: " + str(file_location)) 77 self.file_location = file_location 78 self.analyzer = analyzer 79 self.instrument_label = instrument_label 80 81 if sample_name: 82 self.sample_name = sample_name 83 else: 84 self.sample_name = file_location.stem 85 86 def load(self): 87 """ 88 Load mzML file using pymzml.run.Reader and return the data as a numpy array. 89 90 Returns 91 ------- 92 numpy.ndarray 93 The mass spectra data as a numpy array. 94 """ 95 data = pymzml.run.Reader(self.file_location) 96 return data 97 98 def get_scan_df(self, data): 99 """ 100 Return scan data as a pandas DataFrame. 101 102 Parameters 103 ---------- 104 data : pymzml.run.Reader 105 The mass spectra data. 106 107 Returns 108 ------- 109 pandas.DataFrame 110 A pandas DataFrame containing metadata for each scan, including scan number, MS level, polarity, and scan time. 111 """ 112 # Scan dict 113 # instatinate scan dict, with empty lists of size of scans 114 n_scans = data.get_spectrum_count() 115 scan_dict = { 116 "scan": np.empty(n_scans, dtype=np.int32), 117 "scan_time": np.empty(n_scans, dtype=np.float32), 118 "ms_level": [None] * n_scans, 119 "polarity": [None] * n_scans, 120 "precursor_mz": [None] * n_scans, 121 "scan_text": [None] * n_scans, 122 "scan_window_lower": np.empty(n_scans, dtype=np.float32), 123 "scan_window_upper": np.empty(n_scans, dtype=np.float32), 124 "scan_precision": [None] * n_scans, 125 "tic": np.empty(n_scans, dtype=np.float32), 126 "ms_format": [None] * n_scans, 127 } 128 129 # First pass: loop through scans to get scan info 130 for i, spec in enumerate(data): 131 scan_dict["scan"][i] = spec.ID 132 scan_dict["ms_level"][i] = spec.ms_level 133 scan_dict["scan_precision"][i] = spec._measured_precision 134 scan_dict["tic"][i] = spec.TIC 135 if spec.selected_precursors: 136 scan_dict["precursor_mz"][i] = spec.selected_precursors[0].get( 137 "mz", None 138 ) 139 if spec["negative scan"] is not None: 140 scan_dict["polarity"][i] = "negative" 141 if spec["positive scan"] is not None: 142 scan_dict["polarity"][i] = "positive" 143 if spec["negative scan"] is not None and spec["positive scan"] is not None: 144 raise ValueError( 145 "Error: scan {0} has both negative and positive polarity".format( 146 spec.ID 147 ) 148 ) 149 150 scan_dict["scan_time"][i] = spec.get("MS:1000016") 151 scan_dict["scan_text"][i] = spec.get("MS:1000512") 152 scan_dict["scan_window_lower"][i] = spec.get("MS:1000501") 153 scan_dict["scan_window_upper"][i] = spec.get("MS:1000500") 154 if spec.get("MS:1000128"): 155 scan_dict["ms_format"][i] = "profile" 156 elif spec.get("MS:1000127"): 157 scan_dict["ms_format"][i] = "centroid" 158 else: 159 scan_dict["ms_format"][i] = None 160 161 scan_df = pd.DataFrame(scan_dict) 162 163 return scan_df 164 165 def get_ms_raw(self, spectra, scan_df, data): 166 """Return a dictionary of mass spectra data as a pandas DataFrame. 167 168 Parameters 169 ---------- 170 spectra : str 171 Which mass spectra data to include in the output. 172 Options: None, "ms1", "ms2", "all". 173 scan_df : pandas.DataFrame 174 Scan dataframe. Output from get_scan_df(). 175 data : pymzml.run.Reader 176 The mass spectra data. 177 178 Returns 179 ------- 180 dict 181 A dictionary containing the mass spectra data as pandas DataFrames, with keys corresponding to the MS level. 182 183 """ 184 if spectra == "all": 185 scan_df_forspec = scan_df 186 elif spectra == "ms1": 187 scan_df_forspec = scan_df[scan_df.ms_level == 1] 188 elif spectra == "ms2": 189 scan_df_forspec = scan_df[scan_df.ms_level == 2] 190 else: 191 raise ValueError("spectra must be 'all', 'ms1', or 'ms2'") 192 193 # Result container 194 res = {} 195 196 # Row count container 197 counter = {} 198 199 # Column name container 200 cols = {} 201 202 # set at float32 203 dtype = np.float32 204 205 # First pass: get nrows 206 N = defaultdict(lambda: 0) 207 for i, spec in enumerate(data): 208 if i in scan_df_forspec.scan: 209 # Get ms level 210 level = "ms{}".format(spec.ms_level) 211 212 # Number of rows 213 N[level] += spec.mz.shape[0] 214 215 # Second pass: parse 216 for i, spec in enumerate(data): 217 if i in scan_df_forspec.scan: 218 # Number of rows 219 n = spec.mz.shape[0] 220 221 # No measurements 222 if n == 0: 223 continue 224 225 # Dimension check 226 if len(spec.mz) != len(spec.i): 227 # raise an error if the mz and intensity arrays are not the same length 228 raise ValueError("m/z and intensity array dimension mismatch") 229 230 # Scan/frame info 231 id_dict = spec.id_dict 232 233 # Get ms level 234 level = "ms{}".format(spec.ms_level) 235 236 # Columns 237 cols[level] = list(id_dict.keys()) + ["mz", "intensity"] 238 m = len(cols[level]) 239 240 # Subarray init 241 arr = np.empty((n, m), dtype=dtype) 242 inx = 0 243 244 # Populate scan/frame info 245 for k, v in id_dict.items(): 246 arr[:, inx] = v 247 inx += 1 248 249 # Populate m/z 250 arr[:, inx] = spec.mz 251 inx += 1 252 253 # Populate intensity 254 arr[:, inx] = spec.i 255 inx += 1 256 257 # Initialize output container 258 if level not in res: 259 res[level] = np.empty((N[level], m), dtype=dtype) 260 counter[level] = 0 261 262 # Insert subarray 263 res[level][counter[level] : counter[level] + n, :] = arr 264 counter[level] += n 265 266 # Construct ms1 and ms2 mz dataframes 267 for level in res.keys(): 268 res[level] = pd.DataFrame(res[level], columns=cols[level]).drop( 269 columns=["controllerType", "controllerNumber"], 270 axis=1, 271 inplace=False, 272 ) 273 274 return res 275 276 def run(self, spectra="all", scan_df=None): 277 """Parse the mzML file and return a dictionary of spectra dataframes and a scan metadata dataframe. 278 279 Parameters 280 ---------- 281 spectra : str, optional 282 Which mass spectra data to include in the output. Default is "all". 283 Other options: None, "ms1", "ms2". 284 scan_df : pandas.DataFrame, optional 285 Scan dataframe. If not provided, the scan dataframe is created from the mzML file. 286 287 Returns 288 ------- 289 tuple 290 A tuple containing two elements: 291 - A dictionary containing the mass spectra data as numpy arrays, with keys corresponding to the MS level. 292 - A pandas DataFrame containing metadata for each scan, including scan number, MS level, polarity, and scan time. 293 """ 294 295 # Open file 296 data = self.load() 297 298 if scan_df is None: 299 scan_df = self.get_scan_df(data) 300 301 if spectra != "none": 302 res = self.get_ms_raw(spectra, scan_df, data) 303 304 else: 305 res = None 306 307 return res, scan_df 308 309 def get_mass_spectrum_from_scan( 310 self, scan_number, spectrum_mode, auto_process=True 311 ): 312 """Instatiate a mass spectrum object from the mzML file. 313 314 Parameters 315 ---------- 316 scan_number : int 317 The scan number to be parsed. 318 spectrum_mode : str 319 The type of spectrum to instantiate. Must be'profile' or 'centroid'. 320 polarity : int 321 The polarity of the scan. Must be -1 or 1. 322 auto_process : bool, optional 323 If True, process the mass spectrum. Default is True. 324 325 Returns 326 ------- 327 MassSpecProfile | MassSpecCentroid 328 The MassSpecProfile or MassSpecCentroid object containing the parsed mass spectrum. 329 """ 330 331 def set_metadata( 332 scan_number: int, 333 polarity: int, 334 file_location: str, 335 label=Labels.thermo_profile, 336 ): 337 """ 338 Set the output parameters for creating a MassSpecProfile or MassSpecCentroid object. 339 340 Parameters 341 ---------- 342 scan_number : int 343 The scan number. 344 polarity : int 345 The polarity of the data. 346 file_location : str 347 The file location. 348 label : str, optional 349 The label for the mass spectrum. Default is Labels.thermo_profile. 350 351 Returns 352 ------- 353 dict 354 The output parameters ready for creating a MassSpecProfile or MassSpecCentroid object. 355 """ 356 d_params = default_parameters(file_location) 357 d_params["label"] = label 358 d_params["polarity"] = polarity 359 d_params["filename_path"] = file_location 360 d_params["scan_number"] = scan_number 361 362 return d_params 363 364 # Open file 365 data = self.load() 366 367 # Pluck out individual scan mz and intensity 368 spec = data[scan_number] 369 370 # Get polarity 371 if spec["negative scan"] is not None: 372 polarity = -1 373 elif spec["positive scan"] is not None: 374 polarity = 1 375 376 # Get mass spectrum 377 if spectrum_mode == "profile": 378 # Check if profile 379 if not spec.get("MS:1000128"): 380 raise ValueError("spectrum is not profile") 381 data_dict = { 382 Labels.mz: spec.mz, 383 Labels.abundance: spec.i, 384 } 385 d_params = set_metadata( 386 scan_number, 387 polarity, 388 self.file_location, 389 label=Labels.simulated_profile, 390 ) 391 mass_spectrum_obj = mass_spectrum_obj = MassSpecProfile( 392 data_dict, d_params, auto_process=auto_process 393 ) 394 elif spectrum_mode == "centroid": 395 # Check if centroided 396 if not spec.get("MS:1000127"): 397 raise ValueError("spectrum is not centroided") 398 data_dict = { 399 Labels.mz: spec.mz, 400 Labels.abundance: spec.i, 401 Labels.rp: [np.nan] * len(spec.mz), 402 Labels.s2n: [np.nan] * len(spec.i), 403 } 404 d_params = set_metadata( 405 scan_number, polarity, self.file_location, label=Labels.corems_centroid 406 ) 407 mass_spectrum_obj = MassSpecCentroid( 408 data_dict, d_params, auto_process=auto_process 409 ) 410 411 return mass_spectrum_obj 412 413 def get_mass_spectra_obj(self): 414 """Instatiate a MassSpectraBase object from the mzML file. 415 416 417 Returns 418 ------- 419 MassSpectraBase 420 The MassSpectra object containing the parsed mass spectra. 421 The object is instatiated with the mzML file, analyzer, instrument, sample name, and scan dataframe. 422 """ 423 _, scan_df = self.run(spectra=False) 424 mass_spectra_obj = MassSpectraBase( 425 self.file_location, 426 self.analyzer, 427 self.instrument_label, 428 self.sample_name, 429 self, 430 ) 431 scan_df = scan_df.set_index("scan", drop=False) 432 mass_spectra_obj.scan_df = scan_df 433 434 return mass_spectra_obj 435 436 def get_lcms_obj(self, spectra="all"): 437 """Instatiates a LCMSBase object from the mzML file. 438 439 Parameters 440 ---------- 441 spectra : str, optional 442 Which mass spectra data to include in the output. Default is all. Other options: none, ms1, ms2. 443 444 Returns 445 ------- 446 LCMSBase 447 LCMS object containing mass spectra data. 448 The object is instatiated with the mzML file, analyzer, instrument, sample name, scan dataframe, 449 and mz dataframe(s), as well as lists of scan numbers, retention times, and TICs. 450 """ 451 _, scan_df = self.run(spectra="none") # first run it to just get scan info 452 res, scan_df = self.run( 453 scan_df=scan_df, spectra=spectra 454 ) # second run to parse data 455 lcms_obj = LCMSBase( 456 self.file_location, 457 self.analyzer, 458 self.instrument_label, 459 self.sample_name, 460 self, 461 ) 462 for key in res: 463 key_int = int(key.replace("ms", "")) 464 res[key] = res[key][res[key].intensity > 0] 465 res[key] = res[key].sort_values(by=["scan", "mz"]).reset_index(drop=True) 466 lcms_obj._ms_unprocessed[key_int] = res[key] 467 lcms_obj.scan_df = scan_df.set_index("scan", drop=False) 468 # Check if polarity is mixed 469 if len(set(scan_df.polarity)) > 1: 470 raise ValueError("Mixed polarities detected in scan data") 471 lcms_obj.polarity = scan_df.polarity[0] 472 lcms_obj._scans_number_list = list(scan_df.scan) 473 lcms_obj._retention_time_list = list(scan_df.scan_time) 474 lcms_obj._tic_list = list(scan_df.tic) 475 476 return lcms_obj 477 478 def get_instrument_info(self): 479 """ 480 Return instrument information. 481 482 Returns 483 ------- 484 dict 485 A dictionary with the keys 'model' and 'serial_number'. 486 """ 487 # Load the pymzml data 488 data = self.load() 489 instrument_info = data.info.get('referenceable_param_group_list_element')[0] 490 cv_params = instrument_info.findall('{http://psi.hupo.org/ms/mzml}cvParam') 491 492 # Extract details from each cvParam 493 params = [] 494 for param in cv_params: 495 accession = param.get('accession') # Get 'accession' attribute 496 name = param.get('name') # Get 'name' attribute 497 value = param.get('value') # Get 'value' attribute 498 params.append({ 499 'accession': accession, 500 'name': name, 501 'value': value 502 }) 503 504 # Loop through params and try to find the relevant information 505 instrument_dict = { 506 'model': 'Unknown', 507 'serial_number': 'Unknown' 508 } 509 510 # Assuming there are only two paramters here - one is for the serial number (agnostic to the model) and the other is for the model 511 # If there are more than two, we raise an error 512 if len(params) < 2: 513 raise ValueError("Not enough parameters found in the instrument info, cannot parse.") 514 if len(params) > 2: 515 raise ValueError("Too many parameters found in the instrument info, cannot parse.") 516 for param in params: 517 if param['accession'] == 'MS:1000529': 518 instrument_dict['serial_number'] = param['value'] 519 else: 520 instrument_dict['model'] = data.OT[param['accession']] 521 522 return instrument_dict 523 524 def get_creation_time(self) -> datetime.datetime: 525 """ 526 Return the creation time of the mzML file. 527 """ 528 data = self.load() 529 write_time = data.info.get('start_time') 530 if write_time: 531 # Convert the write time to a datetime object 532 return datetime.datetime.strptime(write_time, "%Y-%m-%dT%H:%M:%SZ") 533 else: 534 raise ValueError("Creation time is not available in the mzML file. " 535 "Please ensure the file contains the 'start_time' information.")
20class MZMLSpectraParser(SpectraParserInterface): 21 """A class for parsing mzml spectrometry data files into MassSpectraBase or LCMSBase objects 22 23 Parameters 24 ---------- 25 file_location : str or Path 26 The path to the RAW file to be parsed. 27 analyzer : str, optional 28 The type of mass analyzer used in the instrument. Default is "Unknown". 29 instrument_label : str, optional 30 The name of the instrument used to acquire the data. Default is "Unknown". 31 sample_name : str, optional 32 The name of the sample being analyzed. If not provided, the stem of the file_location path will be used. 33 34 Attributes 35 ---------- 36 file_location : Path 37 The path to the RAW file being parsed. 38 analyzer : str 39 The type of mass analyzer used in the instrument. 40 instrument_label : str 41 The name of the instrument used to acquire the data. 42 sample_name : str 43 The name of the sample being analyzed. 44 45 Methods 46 ------- 47 * load(). 48 Load mzML file using pymzml.run.Reader and return the data as a numpy array. 49 * run(spectra=True). 50 Parses the mzml file and returns a dictionary of mass spectra dataframes and a scan metadata dataframe. 51 * get_mass_spectrum_from_scan(scan_number, polarity, auto_process=True) 52 Parses the mzml file and returns a MassSpecBase object from a single scan. 53 * get_mass_spectra_obj(). 54 Parses the mzml file and instantiates a MassSpectraBase object. 55 * get_lcms_obj(). 56 Parses the mzml file and instantiates an LCMSBase object. 57 * get_instrument_info(). 58 Return instrument information from the mzML file. 59 * get_creation_time(). 60 Return the creation time of the mzML file as a datetime object. 61 62 Inherits from ThermoBaseClass and SpectraParserInterface 63 """ 64 65 def __init__( 66 self, 67 file_location, 68 analyzer="Unknown", 69 instrument_label="Unknown", 70 sample_name=None, 71 ): 72 # implementation details 73 if isinstance(file_location, str): 74 # if obj is a string it defaults to create a Path obj, pass the S3Path if needed 75 file_location = Path(file_location) 76 if not file_location.exists(): 77 raise FileExistsError("File does not exist: " + str(file_location)) 78 self.file_location = file_location 79 self.analyzer = analyzer 80 self.instrument_label = instrument_label 81 82 if sample_name: 83 self.sample_name = sample_name 84 else: 85 self.sample_name = file_location.stem 86 87 def load(self): 88 """ 89 Load mzML file using pymzml.run.Reader and return the data as a numpy array. 90 91 Returns 92 ------- 93 numpy.ndarray 94 The mass spectra data as a numpy array. 95 """ 96 data = pymzml.run.Reader(self.file_location) 97 return data 98 99 def get_scan_df(self, data): 100 """ 101 Return scan data as a pandas DataFrame. 102 103 Parameters 104 ---------- 105 data : pymzml.run.Reader 106 The mass spectra data. 107 108 Returns 109 ------- 110 pandas.DataFrame 111 A pandas DataFrame containing metadata for each scan, including scan number, MS level, polarity, and scan time. 112 """ 113 # Scan dict 114 # instatinate scan dict, with empty lists of size of scans 115 n_scans = data.get_spectrum_count() 116 scan_dict = { 117 "scan": np.empty(n_scans, dtype=np.int32), 118 "scan_time": np.empty(n_scans, dtype=np.float32), 119 "ms_level": [None] * n_scans, 120 "polarity": [None] * n_scans, 121 "precursor_mz": [None] * n_scans, 122 "scan_text": [None] * n_scans, 123 "scan_window_lower": np.empty(n_scans, dtype=np.float32), 124 "scan_window_upper": np.empty(n_scans, dtype=np.float32), 125 "scan_precision": [None] * n_scans, 126 "tic": np.empty(n_scans, dtype=np.float32), 127 "ms_format": [None] * n_scans, 128 } 129 130 # First pass: loop through scans to get scan info 131 for i, spec in enumerate(data): 132 scan_dict["scan"][i] = spec.ID 133 scan_dict["ms_level"][i] = spec.ms_level 134 scan_dict["scan_precision"][i] = spec._measured_precision 135 scan_dict["tic"][i] = spec.TIC 136 if spec.selected_precursors: 137 scan_dict["precursor_mz"][i] = spec.selected_precursors[0].get( 138 "mz", None 139 ) 140 if spec["negative scan"] is not None: 141 scan_dict["polarity"][i] = "negative" 142 if spec["positive scan"] is not None: 143 scan_dict["polarity"][i] = "positive" 144 if spec["negative scan"] is not None and spec["positive scan"] is not None: 145 raise ValueError( 146 "Error: scan {0} has both negative and positive polarity".format( 147 spec.ID 148 ) 149 ) 150 151 scan_dict["scan_time"][i] = spec.get("MS:1000016") 152 scan_dict["scan_text"][i] = spec.get("MS:1000512") 153 scan_dict["scan_window_lower"][i] = spec.get("MS:1000501") 154 scan_dict["scan_window_upper"][i] = spec.get("MS:1000500") 155 if spec.get("MS:1000128"): 156 scan_dict["ms_format"][i] = "profile" 157 elif spec.get("MS:1000127"): 158 scan_dict["ms_format"][i] = "centroid" 159 else: 160 scan_dict["ms_format"][i] = None 161 162 scan_df = pd.DataFrame(scan_dict) 163 164 return scan_df 165 166 def get_ms_raw(self, spectra, scan_df, data): 167 """Return a dictionary of mass spectra data as a pandas DataFrame. 168 169 Parameters 170 ---------- 171 spectra : str 172 Which mass spectra data to include in the output. 173 Options: None, "ms1", "ms2", "all". 174 scan_df : pandas.DataFrame 175 Scan dataframe. Output from get_scan_df(). 176 data : pymzml.run.Reader 177 The mass spectra data. 178 179 Returns 180 ------- 181 dict 182 A dictionary containing the mass spectra data as pandas DataFrames, with keys corresponding to the MS level. 183 184 """ 185 if spectra == "all": 186 scan_df_forspec = scan_df 187 elif spectra == "ms1": 188 scan_df_forspec = scan_df[scan_df.ms_level == 1] 189 elif spectra == "ms2": 190 scan_df_forspec = scan_df[scan_df.ms_level == 2] 191 else: 192 raise ValueError("spectra must be 'all', 'ms1', or 'ms2'") 193 194 # Result container 195 res = {} 196 197 # Row count container 198 counter = {} 199 200 # Column name container 201 cols = {} 202 203 # set at float32 204 dtype = np.float32 205 206 # First pass: get nrows 207 N = defaultdict(lambda: 0) 208 for i, spec in enumerate(data): 209 if i in scan_df_forspec.scan: 210 # Get ms level 211 level = "ms{}".format(spec.ms_level) 212 213 # Number of rows 214 N[level] += spec.mz.shape[0] 215 216 # Second pass: parse 217 for i, spec in enumerate(data): 218 if i in scan_df_forspec.scan: 219 # Number of rows 220 n = spec.mz.shape[0] 221 222 # No measurements 223 if n == 0: 224 continue 225 226 # Dimension check 227 if len(spec.mz) != len(spec.i): 228 # raise an error if the mz and intensity arrays are not the same length 229 raise ValueError("m/z and intensity array dimension mismatch") 230 231 # Scan/frame info 232 id_dict = spec.id_dict 233 234 # Get ms level 235 level = "ms{}".format(spec.ms_level) 236 237 # Columns 238 cols[level] = list(id_dict.keys()) + ["mz", "intensity"] 239 m = len(cols[level]) 240 241 # Subarray init 242 arr = np.empty((n, m), dtype=dtype) 243 inx = 0 244 245 # Populate scan/frame info 246 for k, v in id_dict.items(): 247 arr[:, inx] = v 248 inx += 1 249 250 # Populate m/z 251 arr[:, inx] = spec.mz 252 inx += 1 253 254 # Populate intensity 255 arr[:, inx] = spec.i 256 inx += 1 257 258 # Initialize output container 259 if level not in res: 260 res[level] = np.empty((N[level], m), dtype=dtype) 261 counter[level] = 0 262 263 # Insert subarray 264 res[level][counter[level] : counter[level] + n, :] = arr 265 counter[level] += n 266 267 # Construct ms1 and ms2 mz dataframes 268 for level in res.keys(): 269 res[level] = pd.DataFrame(res[level], columns=cols[level]).drop( 270 columns=["controllerType", "controllerNumber"], 271 axis=1, 272 inplace=False, 273 ) 274 275 return res 276 277 def run(self, spectra="all", scan_df=None): 278 """Parse the mzML file and return a dictionary of spectra dataframes and a scan metadata dataframe. 279 280 Parameters 281 ---------- 282 spectra : str, optional 283 Which mass spectra data to include in the output. Default is "all". 284 Other options: None, "ms1", "ms2". 285 scan_df : pandas.DataFrame, optional 286 Scan dataframe. If not provided, the scan dataframe is created from the mzML file. 287 288 Returns 289 ------- 290 tuple 291 A tuple containing two elements: 292 - A dictionary containing the mass spectra data as numpy arrays, with keys corresponding to the MS level. 293 - A pandas DataFrame containing metadata for each scan, including scan number, MS level, polarity, and scan time. 294 """ 295 296 # Open file 297 data = self.load() 298 299 if scan_df is None: 300 scan_df = self.get_scan_df(data) 301 302 if spectra != "none": 303 res = self.get_ms_raw(spectra, scan_df, data) 304 305 else: 306 res = None 307 308 return res, scan_df 309 310 def get_mass_spectrum_from_scan( 311 self, scan_number, spectrum_mode, auto_process=True 312 ): 313 """Instatiate a mass spectrum object from the mzML file. 314 315 Parameters 316 ---------- 317 scan_number : int 318 The scan number to be parsed. 319 spectrum_mode : str 320 The type of spectrum to instantiate. Must be'profile' or 'centroid'. 321 polarity : int 322 The polarity of the scan. Must be -1 or 1. 323 auto_process : bool, optional 324 If True, process the mass spectrum. Default is True. 325 326 Returns 327 ------- 328 MassSpecProfile | MassSpecCentroid 329 The MassSpecProfile or MassSpecCentroid object containing the parsed mass spectrum. 330 """ 331 332 def set_metadata( 333 scan_number: int, 334 polarity: int, 335 file_location: str, 336 label=Labels.thermo_profile, 337 ): 338 """ 339 Set the output parameters for creating a MassSpecProfile or MassSpecCentroid object. 340 341 Parameters 342 ---------- 343 scan_number : int 344 The scan number. 345 polarity : int 346 The polarity of the data. 347 file_location : str 348 The file location. 349 label : str, optional 350 The label for the mass spectrum. Default is Labels.thermo_profile. 351 352 Returns 353 ------- 354 dict 355 The output parameters ready for creating a MassSpecProfile or MassSpecCentroid object. 356 """ 357 d_params = default_parameters(file_location) 358 d_params["label"] = label 359 d_params["polarity"] = polarity 360 d_params["filename_path"] = file_location 361 d_params["scan_number"] = scan_number 362 363 return d_params 364 365 # Open file 366 data = self.load() 367 368 # Pluck out individual scan mz and intensity 369 spec = data[scan_number] 370 371 # Get polarity 372 if spec["negative scan"] is not None: 373 polarity = -1 374 elif spec["positive scan"] is not None: 375 polarity = 1 376 377 # Get mass spectrum 378 if spectrum_mode == "profile": 379 # Check if profile 380 if not spec.get("MS:1000128"): 381 raise ValueError("spectrum is not profile") 382 data_dict = { 383 Labels.mz: spec.mz, 384 Labels.abundance: spec.i, 385 } 386 d_params = set_metadata( 387 scan_number, 388 polarity, 389 self.file_location, 390 label=Labels.simulated_profile, 391 ) 392 mass_spectrum_obj = mass_spectrum_obj = MassSpecProfile( 393 data_dict, d_params, auto_process=auto_process 394 ) 395 elif spectrum_mode == "centroid": 396 # Check if centroided 397 if not spec.get("MS:1000127"): 398 raise ValueError("spectrum is not centroided") 399 data_dict = { 400 Labels.mz: spec.mz, 401 Labels.abundance: spec.i, 402 Labels.rp: [np.nan] * len(spec.mz), 403 Labels.s2n: [np.nan] * len(spec.i), 404 } 405 d_params = set_metadata( 406 scan_number, polarity, self.file_location, label=Labels.corems_centroid 407 ) 408 mass_spectrum_obj = MassSpecCentroid( 409 data_dict, d_params, auto_process=auto_process 410 ) 411 412 return mass_spectrum_obj 413 414 def get_mass_spectra_obj(self): 415 """Instatiate a MassSpectraBase object from the mzML file. 416 417 418 Returns 419 ------- 420 MassSpectraBase 421 The MassSpectra object containing the parsed mass spectra. 422 The object is instatiated with the mzML file, analyzer, instrument, sample name, and scan dataframe. 423 """ 424 _, scan_df = self.run(spectra=False) 425 mass_spectra_obj = MassSpectraBase( 426 self.file_location, 427 self.analyzer, 428 self.instrument_label, 429 self.sample_name, 430 self, 431 ) 432 scan_df = scan_df.set_index("scan", drop=False) 433 mass_spectra_obj.scan_df = scan_df 434 435 return mass_spectra_obj 436 437 def get_lcms_obj(self, spectra="all"): 438 """Instatiates a LCMSBase object from the mzML file. 439 440 Parameters 441 ---------- 442 spectra : str, optional 443 Which mass spectra data to include in the output. Default is all. Other options: none, ms1, ms2. 444 445 Returns 446 ------- 447 LCMSBase 448 LCMS object containing mass spectra data. 449 The object is instatiated with the mzML file, analyzer, instrument, sample name, scan dataframe, 450 and mz dataframe(s), as well as lists of scan numbers, retention times, and TICs. 451 """ 452 _, scan_df = self.run(spectra="none") # first run it to just get scan info 453 res, scan_df = self.run( 454 scan_df=scan_df, spectra=spectra 455 ) # second run to parse data 456 lcms_obj = LCMSBase( 457 self.file_location, 458 self.analyzer, 459 self.instrument_label, 460 self.sample_name, 461 self, 462 ) 463 for key in res: 464 key_int = int(key.replace("ms", "")) 465 res[key] = res[key][res[key].intensity > 0] 466 res[key] = res[key].sort_values(by=["scan", "mz"]).reset_index(drop=True) 467 lcms_obj._ms_unprocessed[key_int] = res[key] 468 lcms_obj.scan_df = scan_df.set_index("scan", drop=False) 469 # Check if polarity is mixed 470 if len(set(scan_df.polarity)) > 1: 471 raise ValueError("Mixed polarities detected in scan data") 472 lcms_obj.polarity = scan_df.polarity[0] 473 lcms_obj._scans_number_list = list(scan_df.scan) 474 lcms_obj._retention_time_list = list(scan_df.scan_time) 475 lcms_obj._tic_list = list(scan_df.tic) 476 477 return lcms_obj 478 479 def get_instrument_info(self): 480 """ 481 Return instrument information. 482 483 Returns 484 ------- 485 dict 486 A dictionary with the keys 'model' and 'serial_number'. 487 """ 488 # Load the pymzml data 489 data = self.load() 490 instrument_info = data.info.get('referenceable_param_group_list_element')[0] 491 cv_params = instrument_info.findall('{http://psi.hupo.org/ms/mzml}cvParam') 492 493 # Extract details from each cvParam 494 params = [] 495 for param in cv_params: 496 accession = param.get('accession') # Get 'accession' attribute 497 name = param.get('name') # Get 'name' attribute 498 value = param.get('value') # Get 'value' attribute 499 params.append({ 500 'accession': accession, 501 'name': name, 502 'value': value 503 }) 504 505 # Loop through params and try to find the relevant information 506 instrument_dict = { 507 'model': 'Unknown', 508 'serial_number': 'Unknown' 509 } 510 511 # Assuming there are only two paramters here - one is for the serial number (agnostic to the model) and the other is for the model 512 # If there are more than two, we raise an error 513 if len(params) < 2: 514 raise ValueError("Not enough parameters found in the instrument info, cannot parse.") 515 if len(params) > 2: 516 raise ValueError("Too many parameters found in the instrument info, cannot parse.") 517 for param in params: 518 if param['accession'] == 'MS:1000529': 519 instrument_dict['serial_number'] = param['value'] 520 else: 521 instrument_dict['model'] = data.OT[param['accession']] 522 523 return instrument_dict 524 525 def get_creation_time(self) -> datetime.datetime: 526 """ 527 Return the creation time of the mzML file. 528 """ 529 data = self.load() 530 write_time = data.info.get('start_time') 531 if write_time: 532 # Convert the write time to a datetime object 533 return datetime.datetime.strptime(write_time, "%Y-%m-%dT%H:%M:%SZ") 534 else: 535 raise ValueError("Creation time is not available in the mzML file. " 536 "Please ensure the file contains the 'start_time' information.")
A class for parsing mzml spectrometry data files into MassSpectraBase or LCMSBase objects
Parameters
- file_location (str or Path): The path to the RAW file to be parsed.
- analyzer (str, optional): The type of mass analyzer used in the instrument. Default is "Unknown".
- instrument_label (str, optional): The name of the instrument used to acquire the data. Default is "Unknown".
- sample_name (str, optional): The name of the sample being analyzed. If not provided, the stem of the file_location path will be used.
Attributes
- file_location (Path): The path to the RAW file being parsed.
- analyzer (str): The type of mass analyzer used in the instrument.
- instrument_label (str): The name of the instrument used to acquire the data.
- sample_name (str): The name of the sample being analyzed.
Methods
- load(). Load mzML file using pymzml.run.Reader and return the data as a numpy array.
- run(spectra=True). Parses the mzml file and returns a dictionary of mass spectra dataframes and a scan metadata dataframe.
- get_mass_spectrum_from_scan(scan_number, polarity, auto_process=True) Parses the mzml file and returns a MassSpecBase object from a single scan.
- get_mass_spectra_obj(). Parses the mzml file and instantiates a MassSpectraBase object.
- get_lcms_obj(). Parses the mzml file and instantiates an LCMSBase object.
- get_instrument_info(). Return instrument information from the mzML file.
- get_creation_time(). Return the creation time of the mzML file as a datetime object.
Inherits from ThermoBaseClass and SpectraParserInterface
MZMLSpectraParser( file_location, analyzer='Unknown', instrument_label='Unknown', sample_name=None)
65 def __init__( 66 self, 67 file_location, 68 analyzer="Unknown", 69 instrument_label="Unknown", 70 sample_name=None, 71 ): 72 # implementation details 73 if isinstance(file_location, str): 74 # if obj is a string it defaults to create a Path obj, pass the S3Path if needed 75 file_location = Path(file_location) 76 if not file_location.exists(): 77 raise FileExistsError("File does not exist: " + str(file_location)) 78 self.file_location = file_location 79 self.analyzer = analyzer 80 self.instrument_label = instrument_label 81 82 if sample_name: 83 self.sample_name = sample_name 84 else: 85 self.sample_name = file_location.stem
def
load(self):
87 def load(self): 88 """ 89 Load mzML file using pymzml.run.Reader and return the data as a numpy array. 90 91 Returns 92 ------- 93 numpy.ndarray 94 The mass spectra data as a numpy array. 95 """ 96 data = pymzml.run.Reader(self.file_location) 97 return data
Load mzML file using pymzml.run.Reader and return the data as a numpy array.
Returns
- numpy.ndarray: The mass spectra data as a numpy array.
def
get_scan_df(self, data):
99 def get_scan_df(self, data): 100 """ 101 Return scan data as a pandas DataFrame. 102 103 Parameters 104 ---------- 105 data : pymzml.run.Reader 106 The mass spectra data. 107 108 Returns 109 ------- 110 pandas.DataFrame 111 A pandas DataFrame containing metadata for each scan, including scan number, MS level, polarity, and scan time. 112 """ 113 # Scan dict 114 # instatinate scan dict, with empty lists of size of scans 115 n_scans = data.get_spectrum_count() 116 scan_dict = { 117 "scan": np.empty(n_scans, dtype=np.int32), 118 "scan_time": np.empty(n_scans, dtype=np.float32), 119 "ms_level": [None] * n_scans, 120 "polarity": [None] * n_scans, 121 "precursor_mz": [None] * n_scans, 122 "scan_text": [None] * n_scans, 123 "scan_window_lower": np.empty(n_scans, dtype=np.float32), 124 "scan_window_upper": np.empty(n_scans, dtype=np.float32), 125 "scan_precision": [None] * n_scans, 126 "tic": np.empty(n_scans, dtype=np.float32), 127 "ms_format": [None] * n_scans, 128 } 129 130 # First pass: loop through scans to get scan info 131 for i, spec in enumerate(data): 132 scan_dict["scan"][i] = spec.ID 133 scan_dict["ms_level"][i] = spec.ms_level 134 scan_dict["scan_precision"][i] = spec._measured_precision 135 scan_dict["tic"][i] = spec.TIC 136 if spec.selected_precursors: 137 scan_dict["precursor_mz"][i] = spec.selected_precursors[0].get( 138 "mz", None 139 ) 140 if spec["negative scan"] is not None: 141 scan_dict["polarity"][i] = "negative" 142 if spec["positive scan"] is not None: 143 scan_dict["polarity"][i] = "positive" 144 if spec["negative scan"] is not None and spec["positive scan"] is not None: 145 raise ValueError( 146 "Error: scan {0} has both negative and positive polarity".format( 147 spec.ID 148 ) 149 ) 150 151 scan_dict["scan_time"][i] = spec.get("MS:1000016") 152 scan_dict["scan_text"][i] = spec.get("MS:1000512") 153 scan_dict["scan_window_lower"][i] = spec.get("MS:1000501") 154 scan_dict["scan_window_upper"][i] = spec.get("MS:1000500") 155 if spec.get("MS:1000128"): 156 scan_dict["ms_format"][i] = "profile" 157 elif spec.get("MS:1000127"): 158 scan_dict["ms_format"][i] = "centroid" 159 else: 160 scan_dict["ms_format"][i] = None 161 162 scan_df = pd.DataFrame(scan_dict) 163 164 return scan_df
Return scan data as a pandas DataFrame.
Parameters
- data (pymzml.run.Reader): The mass spectra data.
Returns
- pandas.DataFrame: A pandas DataFrame containing metadata for each scan, including scan number, MS level, polarity, and scan time.
def
get_ms_raw(self, spectra, scan_df, data):
166 def get_ms_raw(self, spectra, scan_df, data): 167 """Return a dictionary of mass spectra data as a pandas DataFrame. 168 169 Parameters 170 ---------- 171 spectra : str 172 Which mass spectra data to include in the output. 173 Options: None, "ms1", "ms2", "all". 174 scan_df : pandas.DataFrame 175 Scan dataframe. Output from get_scan_df(). 176 data : pymzml.run.Reader 177 The mass spectra data. 178 179 Returns 180 ------- 181 dict 182 A dictionary containing the mass spectra data as pandas DataFrames, with keys corresponding to the MS level. 183 184 """ 185 if spectra == "all": 186 scan_df_forspec = scan_df 187 elif spectra == "ms1": 188 scan_df_forspec = scan_df[scan_df.ms_level == 1] 189 elif spectra == "ms2": 190 scan_df_forspec = scan_df[scan_df.ms_level == 2] 191 else: 192 raise ValueError("spectra must be 'all', 'ms1', or 'ms2'") 193 194 # Result container 195 res = {} 196 197 # Row count container 198 counter = {} 199 200 # Column name container 201 cols = {} 202 203 # set at float32 204 dtype = np.float32 205 206 # First pass: get nrows 207 N = defaultdict(lambda: 0) 208 for i, spec in enumerate(data): 209 if i in scan_df_forspec.scan: 210 # Get ms level 211 level = "ms{}".format(spec.ms_level) 212 213 # Number of rows 214 N[level] += spec.mz.shape[0] 215 216 # Second pass: parse 217 for i, spec in enumerate(data): 218 if i in scan_df_forspec.scan: 219 # Number of rows 220 n = spec.mz.shape[0] 221 222 # No measurements 223 if n == 0: 224 continue 225 226 # Dimension check 227 if len(spec.mz) != len(spec.i): 228 # raise an error if the mz and intensity arrays are not the same length 229 raise ValueError("m/z and intensity array dimension mismatch") 230 231 # Scan/frame info 232 id_dict = spec.id_dict 233 234 # Get ms level 235 level = "ms{}".format(spec.ms_level) 236 237 # Columns 238 cols[level] = list(id_dict.keys()) + ["mz", "intensity"] 239 m = len(cols[level]) 240 241 # Subarray init 242 arr = np.empty((n, m), dtype=dtype) 243 inx = 0 244 245 # Populate scan/frame info 246 for k, v in id_dict.items(): 247 arr[:, inx] = v 248 inx += 1 249 250 # Populate m/z 251 arr[:, inx] = spec.mz 252 inx += 1 253 254 # Populate intensity 255 arr[:, inx] = spec.i 256 inx += 1 257 258 # Initialize output container 259 if level not in res: 260 res[level] = np.empty((N[level], m), dtype=dtype) 261 counter[level] = 0 262 263 # Insert subarray 264 res[level][counter[level] : counter[level] + n, :] = arr 265 counter[level] += n 266 267 # Construct ms1 and ms2 mz dataframes 268 for level in res.keys(): 269 res[level] = pd.DataFrame(res[level], columns=cols[level]).drop( 270 columns=["controllerType", "controllerNumber"], 271 axis=1, 272 inplace=False, 273 ) 274 275 return res
Return a dictionary of mass spectra data as a pandas DataFrame.
Parameters
- spectra (str): Which mass spectra data to include in the output. Options: None, "ms1", "ms2", "all".
- scan_df (pandas.DataFrame): Scan dataframe. Output from get_scan_df().
- data (pymzml.run.Reader): The mass spectra data.
Returns
- dict: A dictionary containing the mass spectra data as pandas DataFrames, with keys corresponding to the MS level.
def
run(self, spectra='all', scan_df=None):
277 def run(self, spectra="all", scan_df=None): 278 """Parse the mzML file and return a dictionary of spectra dataframes and a scan metadata dataframe. 279 280 Parameters 281 ---------- 282 spectra : str, optional 283 Which mass spectra data to include in the output. Default is "all". 284 Other options: None, "ms1", "ms2". 285 scan_df : pandas.DataFrame, optional 286 Scan dataframe. If not provided, the scan dataframe is created from the mzML file. 287 288 Returns 289 ------- 290 tuple 291 A tuple containing two elements: 292 - A dictionary containing the mass spectra data as numpy arrays, with keys corresponding to the MS level. 293 - A pandas DataFrame containing metadata for each scan, including scan number, MS level, polarity, and scan time. 294 """ 295 296 # Open file 297 data = self.load() 298 299 if scan_df is None: 300 scan_df = self.get_scan_df(data) 301 302 if spectra != "none": 303 res = self.get_ms_raw(spectra, scan_df, data) 304 305 else: 306 res = None 307 308 return res, scan_df
Parse the mzML file and return a dictionary of spectra dataframes and a scan metadata dataframe.
Parameters
- spectra (str, optional): Which mass spectra data to include in the output. Default is "all". Other options: None, "ms1", "ms2".
- scan_df (pandas.DataFrame, optional): Scan dataframe. If not provided, the scan dataframe is created from the mzML file.
Returns
- tuple: A tuple containing two elements:
- A dictionary containing the mass spectra data as numpy arrays, with keys corresponding to the MS level.
- A pandas DataFrame containing metadata for each scan, including scan number, MS level, polarity, and scan time.
def
get_mass_spectrum_from_scan(self, scan_number, spectrum_mode, auto_process=True):
310 def get_mass_spectrum_from_scan( 311 self, scan_number, spectrum_mode, auto_process=True 312 ): 313 """Instatiate a mass spectrum object from the mzML file. 314 315 Parameters 316 ---------- 317 scan_number : int 318 The scan number to be parsed. 319 spectrum_mode : str 320 The type of spectrum to instantiate. Must be'profile' or 'centroid'. 321 polarity : int 322 The polarity of the scan. Must be -1 or 1. 323 auto_process : bool, optional 324 If True, process the mass spectrum. Default is True. 325 326 Returns 327 ------- 328 MassSpecProfile | MassSpecCentroid 329 The MassSpecProfile or MassSpecCentroid object containing the parsed mass spectrum. 330 """ 331 332 def set_metadata( 333 scan_number: int, 334 polarity: int, 335 file_location: str, 336 label=Labels.thermo_profile, 337 ): 338 """ 339 Set the output parameters for creating a MassSpecProfile or MassSpecCentroid object. 340 341 Parameters 342 ---------- 343 scan_number : int 344 The scan number. 345 polarity : int 346 The polarity of the data. 347 file_location : str 348 The file location. 349 label : str, optional 350 The label for the mass spectrum. Default is Labels.thermo_profile. 351 352 Returns 353 ------- 354 dict 355 The output parameters ready for creating a MassSpecProfile or MassSpecCentroid object. 356 """ 357 d_params = default_parameters(file_location) 358 d_params["label"] = label 359 d_params["polarity"] = polarity 360 d_params["filename_path"] = file_location 361 d_params["scan_number"] = scan_number 362 363 return d_params 364 365 # Open file 366 data = self.load() 367 368 # Pluck out individual scan mz and intensity 369 spec = data[scan_number] 370 371 # Get polarity 372 if spec["negative scan"] is not None: 373 polarity = -1 374 elif spec["positive scan"] is not None: 375 polarity = 1 376 377 # Get mass spectrum 378 if spectrum_mode == "profile": 379 # Check if profile 380 if not spec.get("MS:1000128"): 381 raise ValueError("spectrum is not profile") 382 data_dict = { 383 Labels.mz: spec.mz, 384 Labels.abundance: spec.i, 385 } 386 d_params = set_metadata( 387 scan_number, 388 polarity, 389 self.file_location, 390 label=Labels.simulated_profile, 391 ) 392 mass_spectrum_obj = mass_spectrum_obj = MassSpecProfile( 393 data_dict, d_params, auto_process=auto_process 394 ) 395 elif spectrum_mode == "centroid": 396 # Check if centroided 397 if not spec.get("MS:1000127"): 398 raise ValueError("spectrum is not centroided") 399 data_dict = { 400 Labels.mz: spec.mz, 401 Labels.abundance: spec.i, 402 Labels.rp: [np.nan] * len(spec.mz), 403 Labels.s2n: [np.nan] * len(spec.i), 404 } 405 d_params = set_metadata( 406 scan_number, polarity, self.file_location, label=Labels.corems_centroid 407 ) 408 mass_spectrum_obj = MassSpecCentroid( 409 data_dict, d_params, auto_process=auto_process 410 ) 411 412 return mass_spectrum_obj
Instatiate a mass spectrum object from the mzML file.
Parameters
- scan_number (int): The scan number to be parsed.
- spectrum_mode (str): The type of spectrum to instantiate. Must be'profile' or 'centroid'.
- polarity (int): The polarity of the scan. Must be -1 or 1.
- auto_process (bool, optional): If True, process the mass spectrum. Default is True.
Returns
- MassSpecProfile | MassSpecCentroid: The MassSpecProfile or MassSpecCentroid object containing the parsed mass spectrum.
def
get_mass_spectra_obj(self):
414 def get_mass_spectra_obj(self): 415 """Instatiate a MassSpectraBase object from the mzML file. 416 417 418 Returns 419 ------- 420 MassSpectraBase 421 The MassSpectra object containing the parsed mass spectra. 422 The object is instatiated with the mzML file, analyzer, instrument, sample name, and scan dataframe. 423 """ 424 _, scan_df = self.run(spectra=False) 425 mass_spectra_obj = MassSpectraBase( 426 self.file_location, 427 self.analyzer, 428 self.instrument_label, 429 self.sample_name, 430 self, 431 ) 432 scan_df = scan_df.set_index("scan", drop=False) 433 mass_spectra_obj.scan_df = scan_df 434 435 return mass_spectra_obj
Instatiate a MassSpectraBase object from the mzML file.
Returns
- MassSpectraBase: The MassSpectra object containing the parsed mass spectra. The object is instatiated with the mzML file, analyzer, instrument, sample name, and scan dataframe.
def
get_lcms_obj(self, spectra='all'):
437 def get_lcms_obj(self, spectra="all"): 438 """Instatiates a LCMSBase object from the mzML file. 439 440 Parameters 441 ---------- 442 spectra : str, optional 443 Which mass spectra data to include in the output. Default is all. Other options: none, ms1, ms2. 444 445 Returns 446 ------- 447 LCMSBase 448 LCMS object containing mass spectra data. 449 The object is instatiated with the mzML file, analyzer, instrument, sample name, scan dataframe, 450 and mz dataframe(s), as well as lists of scan numbers, retention times, and TICs. 451 """ 452 _, scan_df = self.run(spectra="none") # first run it to just get scan info 453 res, scan_df = self.run( 454 scan_df=scan_df, spectra=spectra 455 ) # second run to parse data 456 lcms_obj = LCMSBase( 457 self.file_location, 458 self.analyzer, 459 self.instrument_label, 460 self.sample_name, 461 self, 462 ) 463 for key in res: 464 key_int = int(key.replace("ms", "")) 465 res[key] = res[key][res[key].intensity > 0] 466 res[key] = res[key].sort_values(by=["scan", "mz"]).reset_index(drop=True) 467 lcms_obj._ms_unprocessed[key_int] = res[key] 468 lcms_obj.scan_df = scan_df.set_index("scan", drop=False) 469 # Check if polarity is mixed 470 if len(set(scan_df.polarity)) > 1: 471 raise ValueError("Mixed polarities detected in scan data") 472 lcms_obj.polarity = scan_df.polarity[0] 473 lcms_obj._scans_number_list = list(scan_df.scan) 474 lcms_obj._retention_time_list = list(scan_df.scan_time) 475 lcms_obj._tic_list = list(scan_df.tic) 476 477 return lcms_obj
Instatiates a LCMSBase object from the mzML file.
Parameters
- spectra (str, optional): Which mass spectra data to include in the output. Default is all. Other options: none, ms1, ms2.
Returns
- LCMSBase: LCMS object containing mass spectra data. The object is instatiated with the mzML file, analyzer, instrument, sample name, scan dataframe, and mz dataframe(s), as well as lists of scan numbers, retention times, and TICs.
def
get_instrument_info(self):
479 def get_instrument_info(self): 480 """ 481 Return instrument information. 482 483 Returns 484 ------- 485 dict 486 A dictionary with the keys 'model' and 'serial_number'. 487 """ 488 # Load the pymzml data 489 data = self.load() 490 instrument_info = data.info.get('referenceable_param_group_list_element')[0] 491 cv_params = instrument_info.findall('{http://psi.hupo.org/ms/mzml}cvParam') 492 493 # Extract details from each cvParam 494 params = [] 495 for param in cv_params: 496 accession = param.get('accession') # Get 'accession' attribute 497 name = param.get('name') # Get 'name' attribute 498 value = param.get('value') # Get 'value' attribute 499 params.append({ 500 'accession': accession, 501 'name': name, 502 'value': value 503 }) 504 505 # Loop through params and try to find the relevant information 506 instrument_dict = { 507 'model': 'Unknown', 508 'serial_number': 'Unknown' 509 } 510 511 # Assuming there are only two paramters here - one is for the serial number (agnostic to the model) and the other is for the model 512 # If there are more than two, we raise an error 513 if len(params) < 2: 514 raise ValueError("Not enough parameters found in the instrument info, cannot parse.") 515 if len(params) > 2: 516 raise ValueError("Too many parameters found in the instrument info, cannot parse.") 517 for param in params: 518 if param['accession'] == 'MS:1000529': 519 instrument_dict['serial_number'] = param['value'] 520 else: 521 instrument_dict['model'] = data.OT[param['accession']] 522 523 return instrument_dict
Return instrument information.
Returns
- dict: A dictionary with the keys 'model' and 'serial_number'.
def
get_creation_time(self) -> datetime.datetime:
525 def get_creation_time(self) -> datetime.datetime: 526 """ 527 Return the creation time of the mzML file. 528 """ 529 data = self.load() 530 write_time = data.info.get('start_time') 531 if write_time: 532 # Convert the write time to a datetime object 533 return datetime.datetime.strptime(write_time, "%Y-%m-%dT%H:%M:%SZ") 534 else: 535 raise ValueError("Creation time is not available in the mzML file. " 536 "Please ensure the file contains the 'start_time' information.")
Return the creation time of the mzML file.