corems.mass_spectra.input.mzml
1from collections import defaultdict 2from pathlib import Path 3 4import numpy as np 5import pandas as pd 6import pymzml 7import datetime 8 9from corems.encapsulation.constant import Labels 10from corems.encapsulation.factory.parameters import default_parameters 11from corems.mass_spectra.factory.lc_class import LCMSBase, MassSpectraBase 12from corems.mass_spectra.input.parserbase import SpectraParserInterface 13from corems.mass_spectrum.factory.MassSpectrumClasses import ( 14 MassSpecCentroid, 15 MassSpecProfile, 16) 17 18 19class MZMLSpectraParser(SpectraParserInterface): 20 """A class for parsing mzml spectrometry data files into MassSpectraBase or LCMSBase objects 21 22 Parameters 23 ---------- 24 file_location : str or Path 25 The path to the RAW file to be parsed. 26 analyzer : str, optional 27 The type of mass analyzer used in the instrument. Default is "Unknown". 28 instrument_label : str, optional 29 The name of the instrument used to acquire the data. Default is "Unknown". 30 sample_name : str, optional 31 The name of the sample being analyzed. If not provided, the stem of the file_location path will be used. 32 33 Attributes 34 ---------- 35 file_location : Path 36 The path to the RAW file being parsed. 37 analyzer : str 38 The type of mass analyzer used in the instrument. 39 instrument_label : str 40 The name of the instrument used to acquire the data. 41 sample_name : str 42 The name of the sample being analyzed. 43 44 Methods 45 ------- 46 * load(). 47 Load mzML file using pymzml.run.Reader and return the data as a numpy array. 48 * run(spectra=True). 49 Parses the mzml file and returns a dictionary of mass spectra dataframes and a scan metadata dataframe. 50 * get_mass_spectrum_from_scan(scan_number, polarity, auto_process=True) 51 Parses the mzml file and returns a MassSpecBase object from a single scan. 52 * get_mass_spectra_obj(). 53 Parses the mzml file and instantiates a MassSpectraBase object. 54 * get_lcms_obj(). 55 Parses the mzml file and instantiates an LCMSBase object. 56 * get_instrument_info(). 57 Return instrument information from the mzML file. 58 * get_creation_time(). 59 Return the creation time of the mzML file as a datetime object. 60 61 Inherits from ThermoBaseClass and SpectraParserInterface 62 """ 63 64 def __init__( 65 self, 66 file_location, 67 analyzer="Unknown", 68 instrument_label="Unknown", 69 sample_name=None, 70 ): 71 # implementation details 72 if isinstance(file_location, str): 73 # if obj is a string it defaults to create a Path obj, pass the S3Path if needed 74 file_location = Path(file_location) 75 if not file_location.exists(): 76 raise FileExistsError("File does not exist: " + str(file_location)) 77 self.file_location = file_location 78 self.analyzer = analyzer 79 self.instrument_label = instrument_label 80 81 if sample_name: 82 self.sample_name = sample_name 83 else: 84 self.sample_name = file_location.stem 85 86 def load(self): 87 """ 88 Load mzML file using pymzml.run.Reader and return the data as a numpy array. 89 90 Returns 91 ------- 92 numpy.ndarray 93 The mass spectra data as a numpy array. 94 """ 95 data = pymzml.run.Reader(self.file_location) 96 return data 97 98 def get_scan_df(self, data): 99 """ 100 Return scan data as a pandas DataFrame. 101 102 Parameters 103 ---------- 104 data : pymzml.run.Reader 105 The mass spectra data. 106 107 Returns 108 ------- 109 pandas.DataFrame 110 A pandas DataFrame containing metadata for each scan, including scan number, MS level, polarity, and scan time. 111 """ 112 # Scan dict 113 # instatinate scan dict, with empty lists of size of scans 114 n_scans = data.get_spectrum_count() 115 scan_dict = { 116 "scan": np.empty(n_scans, dtype=np.int32), 117 "scan_time": np.empty(n_scans, dtype=np.float32), 118 "ms_level": [None] * n_scans, 119 "polarity": [None] * n_scans, 120 "precursor_mz": [None] * n_scans, 121 "scan_text": [None] * n_scans, 122 "scan_window_lower": np.empty(n_scans, dtype=np.float32), 123 "scan_window_upper": np.empty(n_scans, dtype=np.float32), 124 "scan_precision": [None] * n_scans, 125 "tic": np.empty(n_scans, dtype=np.float32), 126 "ms_format": [None] * n_scans, 127 } 128 129 # First pass: loop through scans to get scan info 130 for i, spec in enumerate(data): 131 scan_dict["scan"][i] = spec.ID 132 scan_dict["ms_level"][i] = spec.ms_level 133 scan_dict["scan_precision"][i] = spec._measured_precision 134 scan_dict["tic"][i] = spec.TIC 135 if spec.selected_precursors: 136 scan_dict["precursor_mz"][i] = spec.selected_precursors[0].get( 137 "mz", None 138 ) 139 if spec["negative scan"] is not None: 140 scan_dict["polarity"][i] = "negative" 141 if spec["positive scan"] is not None: 142 scan_dict["polarity"][i] = "positive" 143 if spec["negative scan"] is not None and spec["positive scan"] is not None: 144 raise ValueError( 145 "Error: scan {0} has both negative and positive polarity".format( 146 spec.ID 147 ) 148 ) 149 150 scan_dict["scan_time"][i] = spec.get("MS:1000016") 151 scan_dict["scan_text"][i] = spec.get("MS:1000512") 152 scan_dict["scan_window_lower"][i] = spec.get("MS:1000501") 153 scan_dict["scan_window_upper"][i] = spec.get("MS:1000500") 154 if spec.get("MS:1000128"): 155 scan_dict["ms_format"][i] = "profile" 156 elif spec.get("MS:1000127"): 157 scan_dict["ms_format"][i] = "centroid" 158 else: 159 scan_dict["ms_format"][i] = None 160 161 scan_df = pd.DataFrame(scan_dict) 162 163 return scan_df 164 165 def get_ms_raw(self, spectra, scan_df, data): 166 """Return a dictionary of mass spectra data as a pandas DataFrame. 167 168 Parameters 169 ---------- 170 spectra : str 171 Which mass spectra data to include in the output. 172 Options: None, "ms1", "ms2", "all". 173 scan_df : pandas.DataFrame 174 Scan dataframe. Output from get_scan_df(). 175 data : pymzml.run.Reader 176 The mass spectra data. 177 178 Returns 179 ------- 180 dict 181 A dictionary containing the mass spectra data as pandas DataFrames, with keys corresponding to the MS level. 182 183 """ 184 if spectra == "all": 185 scan_df_forspec = scan_df 186 elif spectra == "ms1": 187 scan_df_forspec = scan_df[scan_df.ms_level == 1] 188 elif spectra == "ms2": 189 scan_df_forspec = scan_df[scan_df.ms_level == 2] 190 else: 191 raise ValueError("spectra must be 'all', 'ms1', or 'ms2'") 192 193 # Result container 194 res = {} 195 196 # Row count container 197 counter = {} 198 199 # Column name container 200 cols = {} 201 202 # set at float32 203 dtype = np.float32 204 205 # First pass: get nrows 206 N = defaultdict(lambda: 0) 207 for i, spec in enumerate(data): 208 if i in scan_df_forspec.scan: 209 # Get ms level 210 level = "ms{}".format(spec.ms_level) 211 212 # Number of rows 213 N[level] += spec.mz.shape[0] 214 215 # Second pass: parse 216 for i, spec in enumerate(data): 217 if i in scan_df_forspec.scan: 218 # Number of rows 219 n = spec.mz.shape[0] 220 221 # No measurements 222 if n == 0: 223 continue 224 225 # Dimension check 226 if len(spec.mz) != len(spec.i): 227 # raise an error if the mz and intensity arrays are not the same length 228 raise ValueError("m/z and intensity array dimension mismatch") 229 230 # Scan/frame info 231 id_dict = spec.id_dict 232 233 # Get ms level 234 level = "ms{}".format(spec.ms_level) 235 236 # Columns 237 cols[level] = list(id_dict.keys()) + ["mz", "intensity"] 238 m = len(cols[level]) 239 240 # Subarray init 241 arr = np.empty((n, m), dtype=dtype) 242 inx = 0 243 244 # Populate scan/frame info 245 for k, v in id_dict.items(): 246 arr[:, inx] = v 247 inx += 1 248 249 # Populate m/z 250 arr[:, inx] = spec.mz 251 inx += 1 252 253 # Populate intensity 254 arr[:, inx] = spec.i 255 inx += 1 256 257 # Initialize output container 258 if level not in res: 259 res[level] = np.empty((N[level], m), dtype=dtype) 260 counter[level] = 0 261 262 # Insert subarray 263 res[level][counter[level] : counter[level] + n, :] = arr 264 counter[level] += n 265 266 # Construct ms1 and ms2 mz dataframes 267 for level in res.keys(): 268 res[level] = pd.DataFrame(res[level], columns=cols[level]).drop( 269 columns=["controllerType", "controllerNumber"], 270 axis=1, 271 inplace=False, 272 ) 273 274 return res 275 276 def run(self, spectra="all", scan_df=None): 277 """Parse the mzML file and return a dictionary of spectra dataframes and a scan metadata dataframe. 278 279 Parameters 280 ---------- 281 spectra : str, optional 282 Which mass spectra data to include in the output. Default is "all". 283 Other options: None, "ms1", "ms2". 284 scan_df : pandas.DataFrame, optional 285 Scan dataframe. If not provided, the scan dataframe is created from the mzML file. 286 287 Returns 288 ------- 289 tuple 290 A tuple containing two elements: 291 - A dictionary containing the mass spectra data as numpy arrays, with keys corresponding to the MS level. 292 - A pandas DataFrame containing metadata for each scan, including scan number, MS level, polarity, and scan time. 293 """ 294 295 # Open file 296 data = self.load() 297 298 if scan_df is None: 299 scan_df = self.get_scan_df(data) 300 301 if spectra != "none": 302 res = self.get_ms_raw(spectra, scan_df, data) 303 304 else: 305 res = None 306 307 return res, scan_df 308 309 def get_mass_spectrum_from_scan( 310 self, scan_number, spectrum_mode, auto_process=True 311 ): 312 """Instatiate a mass spectrum object from the mzML file. 313 314 Parameters 315 ---------- 316 scan_number : int 317 The scan number to be parsed. 318 spectrum_mode : str 319 The type of spectrum to instantiate. Must be'profile' or 'centroid'. 320 polarity : int 321 The polarity of the scan. Must be -1 or 1. 322 auto_process : bool, optional 323 If True, process the mass spectrum. Default is True. 324 325 Returns 326 ------- 327 MassSpecProfile | MassSpecCentroid 328 The MassSpecProfile or MassSpecCentroid object containing the parsed mass spectrum. 329 """ 330 # Use the batch function and return the first result 331 result_list = self.get_mass_spectra_from_scan_list( 332 [scan_number], spectrum_mode, auto_process 333 ) 334 return result_list[0] if result_list else None 335 336 def get_mass_spectra_from_scan_list( 337 self, scan_list, spectrum_mode, auto_process=True 338 ): 339 """Instatiate mass spectrum objects from the mzML file. 340 341 Parameters 342 ---------- 343 scan_list : list of int 344 The scan numbers to be parsed. 345 spectrum_mode : str 346 The type of spectrum to instantiate. Must be'profile' or 'centroid'. 347 auto_process : bool, optional 348 If True, process the mass spectrum. Default is True. 349 350 Returns 351 ------- 352 list of MassSpecProfile | MassSpecCentroid 353 List of MassSpecProfile or MassSpecCentroid objects containing the parsed mass spectra. 354 """ 355 356 def set_metadata( 357 scan_number: int, 358 polarity: int, 359 file_location: str, 360 label=Labels.thermo_profile, 361 ): 362 """ 363 Set the output parameters for creating a MassSpecProfile or MassSpecCentroid object. 364 365 Parameters 366 ---------- 367 scan_number : int 368 The scan number. 369 polarity : int 370 The polarity of the data. 371 file_location : str 372 The file location. 373 label : str, optional 374 The label for the mass spectrum. Default is Labels.thermo_profile. 375 376 Returns 377 ------- 378 dict 379 The output parameters ready for creating a MassSpecProfile or MassSpecCentroid object. 380 """ 381 d_params = default_parameters(file_location) 382 d_params["label"] = label 383 d_params["polarity"] = polarity 384 d_params["filename_path"] = file_location 385 d_params["scan_number"] = scan_number 386 387 return d_params 388 389 # Open file 390 data = self.load() 391 392 mass_spectrum_objects = [] 393 394 for scan_number in scan_list: 395 # Pluck out individual scan mz and intensity 396 spec = data[scan_number] 397 398 # Get polarity 399 if spec["negative scan"] is not None: 400 polarity = -1 401 elif spec["positive scan"] is not None: 402 polarity = 1 403 404 # Get mass spectrum 405 if spectrum_mode == "profile": 406 # Check if profile 407 if not spec.get("MS:1000128"): 408 raise ValueError("spectrum is not profile") 409 data_dict = { 410 Labels.mz: spec.mz, 411 Labels.abundance: spec.i, 412 } 413 d_params = set_metadata( 414 scan_number, 415 polarity, 416 self.file_location, 417 label=Labels.simulated_profile, 418 ) 419 mass_spectrum_obj = MassSpecProfile( 420 data_dict, d_params, auto_process=auto_process 421 ) 422 elif spectrum_mode == "centroid": 423 # Check if centroided 424 if not spec.get("MS:1000127"): 425 raise ValueError("spectrum is not centroided") 426 data_dict = { 427 Labels.mz: spec.mz, 428 Labels.abundance: spec.i, 429 Labels.rp: [np.nan] * len(spec.mz), 430 Labels.s2n: [np.nan] * len(spec.i), 431 } 432 d_params = set_metadata( 433 scan_number, polarity, self.file_location, label=Labels.corems_centroid 434 ) 435 mass_spectrum_obj = MassSpecCentroid( 436 data_dict, d_params, auto_process=auto_process 437 ) 438 439 mass_spectrum_objects.append(mass_spectrum_obj) 440 441 return mass_spectrum_objects 442 443 def get_mass_spectra_obj(self): 444 """Instatiate a MassSpectraBase object from the mzML file. 445 446 447 Returns 448 ------- 449 MassSpectraBase 450 The MassSpectra object containing the parsed mass spectra. 451 The object is instatiated with the mzML file, analyzer, instrument, sample name, and scan dataframe. 452 """ 453 _, scan_df = self.run(spectra=False) 454 mass_spectra_obj = MassSpectraBase( 455 self.file_location, 456 self.analyzer, 457 self.instrument_label, 458 self.sample_name, 459 self, 460 ) 461 scan_df = scan_df.set_index("scan", drop=False) 462 mass_spectra_obj.scan_df = scan_df 463 464 return mass_spectra_obj 465 466 def get_lcms_obj(self, spectra="all"): 467 """Instatiates a LCMSBase object from the mzML file. 468 469 Parameters 470 ---------- 471 spectra : str, optional 472 Which mass spectra data to include in the output. Default is all. Other options: none, ms1, ms2. 473 474 Returns 475 ------- 476 LCMSBase 477 LCMS object containing mass spectra data. 478 The object is instatiated with the mzML file, analyzer, instrument, sample name, scan dataframe, 479 and mz dataframe(s), as well as lists of scan numbers, retention times, and TICs. 480 """ 481 _, scan_df = self.run(spectra="none") # first run it to just get scan info 482 if spectra != "none": 483 res, scan_df = self.run( 484 scan_df=scan_df, spectra=spectra 485 ) # second run to parse data 486 lcms_obj = LCMSBase( 487 self.file_location, 488 self.analyzer, 489 self.instrument_label, 490 self.sample_name, 491 self, 492 ) 493 if spectra != "none": 494 for key in res: 495 key_int = int(key.replace("ms", "")) 496 res[key] = res[key][res[key].intensity > 0] 497 res[key] = res[key].sort_values(by=["scan", "mz"]).reset_index(drop=True) 498 lcms_obj._ms_unprocessed[key_int] = res[key] 499 lcms_obj.scan_df = scan_df.set_index("scan", drop=False) 500 # Check if polarity is mixed 501 if len(set(scan_df.polarity)) > 1: 502 raise ValueError("Mixed polarities detected in scan data") 503 lcms_obj.polarity = scan_df.polarity[0] 504 lcms_obj._scans_number_list = list(scan_df.scan) 505 lcms_obj._retention_time_list = list(scan_df.scan_time) 506 lcms_obj._tic_list = list(scan_df.tic) 507 508 return lcms_obj 509 510 def get_instrument_info(self): 511 """ 512 Return instrument information. 513 514 Returns 515 ------- 516 dict 517 A dictionary with the keys 'model' and 'serial_number'. 518 """ 519 # Load the pymzml data 520 data = self.load() 521 instrument_info = data.info.get('referenceable_param_group_list_element')[0] 522 cv_params = instrument_info.findall('{http://psi.hupo.org/ms/mzml}cvParam') 523 524 # Extract details from each cvParam 525 params = [] 526 for param in cv_params: 527 accession = param.get('accession') # Get 'accession' attribute 528 name = param.get('name') # Get 'name' attribute 529 value = param.get('value') # Get 'value' attribute 530 params.append({ 531 'accession': accession, 532 'name': name, 533 'value': value 534 }) 535 536 # Loop through params and try to find the relevant information 537 instrument_dict = { 538 'model': 'Unknown', 539 'serial_number': 'Unknown' 540 } 541 542 # Assuming there are only two paramters here - one is for the serial number (agnostic to the model) and the other is for the model 543 # If there are more than two, we raise an error 544 if len(params) < 2: 545 raise ValueError("Not enough parameters found in the instrument info, cannot parse.") 546 if len(params) > 2: 547 raise ValueError("Too many parameters found in the instrument info, cannot parse.") 548 for param in params: 549 if param['accession'] == 'MS:1000529': 550 instrument_dict['serial_number'] = param['value'] 551 else: 552 instrument_dict['model'] = data.OT[param['accession']] 553 554 return instrument_dict 555 556 def get_creation_time(self) -> datetime.datetime: 557 """ 558 Return the creation time of the mzML file. 559 """ 560 data = self.load() 561 write_time = data.info.get('start_time') 562 if write_time: 563 # Convert the write time to a datetime object 564 return datetime.datetime.strptime(write_time, "%Y-%m-%dT%H:%M:%SZ") 565 else: 566 raise ValueError("Creation time is not available in the mzML file. " 567 "Please ensure the file contains the 'start_time' information.")
20class MZMLSpectraParser(SpectraParserInterface): 21 """A class for parsing mzml spectrometry data files into MassSpectraBase or LCMSBase objects 22 23 Parameters 24 ---------- 25 file_location : str or Path 26 The path to the RAW file to be parsed. 27 analyzer : str, optional 28 The type of mass analyzer used in the instrument. Default is "Unknown". 29 instrument_label : str, optional 30 The name of the instrument used to acquire the data. Default is "Unknown". 31 sample_name : str, optional 32 The name of the sample being analyzed. If not provided, the stem of the file_location path will be used. 33 34 Attributes 35 ---------- 36 file_location : Path 37 The path to the RAW file being parsed. 38 analyzer : str 39 The type of mass analyzer used in the instrument. 40 instrument_label : str 41 The name of the instrument used to acquire the data. 42 sample_name : str 43 The name of the sample being analyzed. 44 45 Methods 46 ------- 47 * load(). 48 Load mzML file using pymzml.run.Reader and return the data as a numpy array. 49 * run(spectra=True). 50 Parses the mzml file and returns a dictionary of mass spectra dataframes and a scan metadata dataframe. 51 * get_mass_spectrum_from_scan(scan_number, polarity, auto_process=True) 52 Parses the mzml file and returns a MassSpecBase object from a single scan. 53 * get_mass_spectra_obj(). 54 Parses the mzml file and instantiates a MassSpectraBase object. 55 * get_lcms_obj(). 56 Parses the mzml file and instantiates an LCMSBase object. 57 * get_instrument_info(). 58 Return instrument information from the mzML file. 59 * get_creation_time(). 60 Return the creation time of the mzML file as a datetime object. 61 62 Inherits from ThermoBaseClass and SpectraParserInterface 63 """ 64 65 def __init__( 66 self, 67 file_location, 68 analyzer="Unknown", 69 instrument_label="Unknown", 70 sample_name=None, 71 ): 72 # implementation details 73 if isinstance(file_location, str): 74 # if obj is a string it defaults to create a Path obj, pass the S3Path if needed 75 file_location = Path(file_location) 76 if not file_location.exists(): 77 raise FileExistsError("File does not exist: " + str(file_location)) 78 self.file_location = file_location 79 self.analyzer = analyzer 80 self.instrument_label = instrument_label 81 82 if sample_name: 83 self.sample_name = sample_name 84 else: 85 self.sample_name = file_location.stem 86 87 def load(self): 88 """ 89 Load mzML file using pymzml.run.Reader and return the data as a numpy array. 90 91 Returns 92 ------- 93 numpy.ndarray 94 The mass spectra data as a numpy array. 95 """ 96 data = pymzml.run.Reader(self.file_location) 97 return data 98 99 def get_scan_df(self, data): 100 """ 101 Return scan data as a pandas DataFrame. 102 103 Parameters 104 ---------- 105 data : pymzml.run.Reader 106 The mass spectra data. 107 108 Returns 109 ------- 110 pandas.DataFrame 111 A pandas DataFrame containing metadata for each scan, including scan number, MS level, polarity, and scan time. 112 """ 113 # Scan dict 114 # instatinate scan dict, with empty lists of size of scans 115 n_scans = data.get_spectrum_count() 116 scan_dict = { 117 "scan": np.empty(n_scans, dtype=np.int32), 118 "scan_time": np.empty(n_scans, dtype=np.float32), 119 "ms_level": [None] * n_scans, 120 "polarity": [None] * n_scans, 121 "precursor_mz": [None] * n_scans, 122 "scan_text": [None] * n_scans, 123 "scan_window_lower": np.empty(n_scans, dtype=np.float32), 124 "scan_window_upper": np.empty(n_scans, dtype=np.float32), 125 "scan_precision": [None] * n_scans, 126 "tic": np.empty(n_scans, dtype=np.float32), 127 "ms_format": [None] * n_scans, 128 } 129 130 # First pass: loop through scans to get scan info 131 for i, spec in enumerate(data): 132 scan_dict["scan"][i] = spec.ID 133 scan_dict["ms_level"][i] = spec.ms_level 134 scan_dict["scan_precision"][i] = spec._measured_precision 135 scan_dict["tic"][i] = spec.TIC 136 if spec.selected_precursors: 137 scan_dict["precursor_mz"][i] = spec.selected_precursors[0].get( 138 "mz", None 139 ) 140 if spec["negative scan"] is not None: 141 scan_dict["polarity"][i] = "negative" 142 if spec["positive scan"] is not None: 143 scan_dict["polarity"][i] = "positive" 144 if spec["negative scan"] is not None and spec["positive scan"] is not None: 145 raise ValueError( 146 "Error: scan {0} has both negative and positive polarity".format( 147 spec.ID 148 ) 149 ) 150 151 scan_dict["scan_time"][i] = spec.get("MS:1000016") 152 scan_dict["scan_text"][i] = spec.get("MS:1000512") 153 scan_dict["scan_window_lower"][i] = spec.get("MS:1000501") 154 scan_dict["scan_window_upper"][i] = spec.get("MS:1000500") 155 if spec.get("MS:1000128"): 156 scan_dict["ms_format"][i] = "profile" 157 elif spec.get("MS:1000127"): 158 scan_dict["ms_format"][i] = "centroid" 159 else: 160 scan_dict["ms_format"][i] = None 161 162 scan_df = pd.DataFrame(scan_dict) 163 164 return scan_df 165 166 def get_ms_raw(self, spectra, scan_df, data): 167 """Return a dictionary of mass spectra data as a pandas DataFrame. 168 169 Parameters 170 ---------- 171 spectra : str 172 Which mass spectra data to include in the output. 173 Options: None, "ms1", "ms2", "all". 174 scan_df : pandas.DataFrame 175 Scan dataframe. Output from get_scan_df(). 176 data : pymzml.run.Reader 177 The mass spectra data. 178 179 Returns 180 ------- 181 dict 182 A dictionary containing the mass spectra data as pandas DataFrames, with keys corresponding to the MS level. 183 184 """ 185 if spectra == "all": 186 scan_df_forspec = scan_df 187 elif spectra == "ms1": 188 scan_df_forspec = scan_df[scan_df.ms_level == 1] 189 elif spectra == "ms2": 190 scan_df_forspec = scan_df[scan_df.ms_level == 2] 191 else: 192 raise ValueError("spectra must be 'all', 'ms1', or 'ms2'") 193 194 # Result container 195 res = {} 196 197 # Row count container 198 counter = {} 199 200 # Column name container 201 cols = {} 202 203 # set at float32 204 dtype = np.float32 205 206 # First pass: get nrows 207 N = defaultdict(lambda: 0) 208 for i, spec in enumerate(data): 209 if i in scan_df_forspec.scan: 210 # Get ms level 211 level = "ms{}".format(spec.ms_level) 212 213 # Number of rows 214 N[level] += spec.mz.shape[0] 215 216 # Second pass: parse 217 for i, spec in enumerate(data): 218 if i in scan_df_forspec.scan: 219 # Number of rows 220 n = spec.mz.shape[0] 221 222 # No measurements 223 if n == 0: 224 continue 225 226 # Dimension check 227 if len(spec.mz) != len(spec.i): 228 # raise an error if the mz and intensity arrays are not the same length 229 raise ValueError("m/z and intensity array dimension mismatch") 230 231 # Scan/frame info 232 id_dict = spec.id_dict 233 234 # Get ms level 235 level = "ms{}".format(spec.ms_level) 236 237 # Columns 238 cols[level] = list(id_dict.keys()) + ["mz", "intensity"] 239 m = len(cols[level]) 240 241 # Subarray init 242 arr = np.empty((n, m), dtype=dtype) 243 inx = 0 244 245 # Populate scan/frame info 246 for k, v in id_dict.items(): 247 arr[:, inx] = v 248 inx += 1 249 250 # Populate m/z 251 arr[:, inx] = spec.mz 252 inx += 1 253 254 # Populate intensity 255 arr[:, inx] = spec.i 256 inx += 1 257 258 # Initialize output container 259 if level not in res: 260 res[level] = np.empty((N[level], m), dtype=dtype) 261 counter[level] = 0 262 263 # Insert subarray 264 res[level][counter[level] : counter[level] + n, :] = arr 265 counter[level] += n 266 267 # Construct ms1 and ms2 mz dataframes 268 for level in res.keys(): 269 res[level] = pd.DataFrame(res[level], columns=cols[level]).drop( 270 columns=["controllerType", "controllerNumber"], 271 axis=1, 272 inplace=False, 273 ) 274 275 return res 276 277 def run(self, spectra="all", scan_df=None): 278 """Parse the mzML file and return a dictionary of spectra dataframes and a scan metadata dataframe. 279 280 Parameters 281 ---------- 282 spectra : str, optional 283 Which mass spectra data to include in the output. Default is "all". 284 Other options: None, "ms1", "ms2". 285 scan_df : pandas.DataFrame, optional 286 Scan dataframe. If not provided, the scan dataframe is created from the mzML file. 287 288 Returns 289 ------- 290 tuple 291 A tuple containing two elements: 292 - A dictionary containing the mass spectra data as numpy arrays, with keys corresponding to the MS level. 293 - A pandas DataFrame containing metadata for each scan, including scan number, MS level, polarity, and scan time. 294 """ 295 296 # Open file 297 data = self.load() 298 299 if scan_df is None: 300 scan_df = self.get_scan_df(data) 301 302 if spectra != "none": 303 res = self.get_ms_raw(spectra, scan_df, data) 304 305 else: 306 res = None 307 308 return res, scan_df 309 310 def get_mass_spectrum_from_scan( 311 self, scan_number, spectrum_mode, auto_process=True 312 ): 313 """Instatiate a mass spectrum object from the mzML file. 314 315 Parameters 316 ---------- 317 scan_number : int 318 The scan number to be parsed. 319 spectrum_mode : str 320 The type of spectrum to instantiate. Must be'profile' or 'centroid'. 321 polarity : int 322 The polarity of the scan. Must be -1 or 1. 323 auto_process : bool, optional 324 If True, process the mass spectrum. Default is True. 325 326 Returns 327 ------- 328 MassSpecProfile | MassSpecCentroid 329 The MassSpecProfile or MassSpecCentroid object containing the parsed mass spectrum. 330 """ 331 # Use the batch function and return the first result 332 result_list = self.get_mass_spectra_from_scan_list( 333 [scan_number], spectrum_mode, auto_process 334 ) 335 return result_list[0] if result_list else None 336 337 def get_mass_spectra_from_scan_list( 338 self, scan_list, spectrum_mode, auto_process=True 339 ): 340 """Instatiate mass spectrum objects from the mzML file. 341 342 Parameters 343 ---------- 344 scan_list : list of int 345 The scan numbers to be parsed. 346 spectrum_mode : str 347 The type of spectrum to instantiate. Must be'profile' or 'centroid'. 348 auto_process : bool, optional 349 If True, process the mass spectrum. Default is True. 350 351 Returns 352 ------- 353 list of MassSpecProfile | MassSpecCentroid 354 List of MassSpecProfile or MassSpecCentroid objects containing the parsed mass spectra. 355 """ 356 357 def set_metadata( 358 scan_number: int, 359 polarity: int, 360 file_location: str, 361 label=Labels.thermo_profile, 362 ): 363 """ 364 Set the output parameters for creating a MassSpecProfile or MassSpecCentroid object. 365 366 Parameters 367 ---------- 368 scan_number : int 369 The scan number. 370 polarity : int 371 The polarity of the data. 372 file_location : str 373 The file location. 374 label : str, optional 375 The label for the mass spectrum. Default is Labels.thermo_profile. 376 377 Returns 378 ------- 379 dict 380 The output parameters ready for creating a MassSpecProfile or MassSpecCentroid object. 381 """ 382 d_params = default_parameters(file_location) 383 d_params["label"] = label 384 d_params["polarity"] = polarity 385 d_params["filename_path"] = file_location 386 d_params["scan_number"] = scan_number 387 388 return d_params 389 390 # Open file 391 data = self.load() 392 393 mass_spectrum_objects = [] 394 395 for scan_number in scan_list: 396 # Pluck out individual scan mz and intensity 397 spec = data[scan_number] 398 399 # Get polarity 400 if spec["negative scan"] is not None: 401 polarity = -1 402 elif spec["positive scan"] is not None: 403 polarity = 1 404 405 # Get mass spectrum 406 if spectrum_mode == "profile": 407 # Check if profile 408 if not spec.get("MS:1000128"): 409 raise ValueError("spectrum is not profile") 410 data_dict = { 411 Labels.mz: spec.mz, 412 Labels.abundance: spec.i, 413 } 414 d_params = set_metadata( 415 scan_number, 416 polarity, 417 self.file_location, 418 label=Labels.simulated_profile, 419 ) 420 mass_spectrum_obj = MassSpecProfile( 421 data_dict, d_params, auto_process=auto_process 422 ) 423 elif spectrum_mode == "centroid": 424 # Check if centroided 425 if not spec.get("MS:1000127"): 426 raise ValueError("spectrum is not centroided") 427 data_dict = { 428 Labels.mz: spec.mz, 429 Labels.abundance: spec.i, 430 Labels.rp: [np.nan] * len(spec.mz), 431 Labels.s2n: [np.nan] * len(spec.i), 432 } 433 d_params = set_metadata( 434 scan_number, polarity, self.file_location, label=Labels.corems_centroid 435 ) 436 mass_spectrum_obj = MassSpecCentroid( 437 data_dict, d_params, auto_process=auto_process 438 ) 439 440 mass_spectrum_objects.append(mass_spectrum_obj) 441 442 return mass_spectrum_objects 443 444 def get_mass_spectra_obj(self): 445 """Instatiate a MassSpectraBase object from the mzML file. 446 447 448 Returns 449 ------- 450 MassSpectraBase 451 The MassSpectra object containing the parsed mass spectra. 452 The object is instatiated with the mzML file, analyzer, instrument, sample name, and scan dataframe. 453 """ 454 _, scan_df = self.run(spectra=False) 455 mass_spectra_obj = MassSpectraBase( 456 self.file_location, 457 self.analyzer, 458 self.instrument_label, 459 self.sample_name, 460 self, 461 ) 462 scan_df = scan_df.set_index("scan", drop=False) 463 mass_spectra_obj.scan_df = scan_df 464 465 return mass_spectra_obj 466 467 def get_lcms_obj(self, spectra="all"): 468 """Instatiates a LCMSBase object from the mzML file. 469 470 Parameters 471 ---------- 472 spectra : str, optional 473 Which mass spectra data to include in the output. Default is all. Other options: none, ms1, ms2. 474 475 Returns 476 ------- 477 LCMSBase 478 LCMS object containing mass spectra data. 479 The object is instatiated with the mzML file, analyzer, instrument, sample name, scan dataframe, 480 and mz dataframe(s), as well as lists of scan numbers, retention times, and TICs. 481 """ 482 _, scan_df = self.run(spectra="none") # first run it to just get scan info 483 if spectra != "none": 484 res, scan_df = self.run( 485 scan_df=scan_df, spectra=spectra 486 ) # second run to parse data 487 lcms_obj = LCMSBase( 488 self.file_location, 489 self.analyzer, 490 self.instrument_label, 491 self.sample_name, 492 self, 493 ) 494 if spectra != "none": 495 for key in res: 496 key_int = int(key.replace("ms", "")) 497 res[key] = res[key][res[key].intensity > 0] 498 res[key] = res[key].sort_values(by=["scan", "mz"]).reset_index(drop=True) 499 lcms_obj._ms_unprocessed[key_int] = res[key] 500 lcms_obj.scan_df = scan_df.set_index("scan", drop=False) 501 # Check if polarity is mixed 502 if len(set(scan_df.polarity)) > 1: 503 raise ValueError("Mixed polarities detected in scan data") 504 lcms_obj.polarity = scan_df.polarity[0] 505 lcms_obj._scans_number_list = list(scan_df.scan) 506 lcms_obj._retention_time_list = list(scan_df.scan_time) 507 lcms_obj._tic_list = list(scan_df.tic) 508 509 return lcms_obj 510 511 def get_instrument_info(self): 512 """ 513 Return instrument information. 514 515 Returns 516 ------- 517 dict 518 A dictionary with the keys 'model' and 'serial_number'. 519 """ 520 # Load the pymzml data 521 data = self.load() 522 instrument_info = data.info.get('referenceable_param_group_list_element')[0] 523 cv_params = instrument_info.findall('{http://psi.hupo.org/ms/mzml}cvParam') 524 525 # Extract details from each cvParam 526 params = [] 527 for param in cv_params: 528 accession = param.get('accession') # Get 'accession' attribute 529 name = param.get('name') # Get 'name' attribute 530 value = param.get('value') # Get 'value' attribute 531 params.append({ 532 'accession': accession, 533 'name': name, 534 'value': value 535 }) 536 537 # Loop through params and try to find the relevant information 538 instrument_dict = { 539 'model': 'Unknown', 540 'serial_number': 'Unknown' 541 } 542 543 # Assuming there are only two paramters here - one is for the serial number (agnostic to the model) and the other is for the model 544 # If there are more than two, we raise an error 545 if len(params) < 2: 546 raise ValueError("Not enough parameters found in the instrument info, cannot parse.") 547 if len(params) > 2: 548 raise ValueError("Too many parameters found in the instrument info, cannot parse.") 549 for param in params: 550 if param['accession'] == 'MS:1000529': 551 instrument_dict['serial_number'] = param['value'] 552 else: 553 instrument_dict['model'] = data.OT[param['accession']] 554 555 return instrument_dict 556 557 def get_creation_time(self) -> datetime.datetime: 558 """ 559 Return the creation time of the mzML file. 560 """ 561 data = self.load() 562 write_time = data.info.get('start_time') 563 if write_time: 564 # Convert the write time to a datetime object 565 return datetime.datetime.strptime(write_time, "%Y-%m-%dT%H:%M:%SZ") 566 else: 567 raise ValueError("Creation time is not available in the mzML file. " 568 "Please ensure the file contains the 'start_time' information.")
A class for parsing mzml spectrometry data files into MassSpectraBase or LCMSBase objects
Parameters
- file_location (str or Path): The path to the RAW file to be parsed.
- analyzer (str, optional): The type of mass analyzer used in the instrument. Default is "Unknown".
- instrument_label (str, optional): The name of the instrument used to acquire the data. Default is "Unknown".
- sample_name (str, optional): The name of the sample being analyzed. If not provided, the stem of the file_location path will be used.
Attributes
- file_location (Path): The path to the RAW file being parsed.
- analyzer (str): The type of mass analyzer used in the instrument.
- instrument_label (str): The name of the instrument used to acquire the data.
- sample_name (str): The name of the sample being analyzed.
Methods
- load(). Load mzML file using pymzml.run.Reader and return the data as a numpy array.
- run(spectra=True). Parses the mzml file and returns a dictionary of mass spectra dataframes and a scan metadata dataframe.
- get_mass_spectrum_from_scan(scan_number, polarity, auto_process=True) Parses the mzml file and returns a MassSpecBase object from a single scan.
- get_mass_spectra_obj(). Parses the mzml file and instantiates a MassSpectraBase object.
- get_lcms_obj(). Parses the mzml file and instantiates an LCMSBase object.
- get_instrument_info(). Return instrument information from the mzML file.
- get_creation_time(). Return the creation time of the mzML file as a datetime object.
Inherits from ThermoBaseClass and SpectraParserInterface
MZMLSpectraParser( file_location, analyzer='Unknown', instrument_label='Unknown', sample_name=None)
65 def __init__( 66 self, 67 file_location, 68 analyzer="Unknown", 69 instrument_label="Unknown", 70 sample_name=None, 71 ): 72 # implementation details 73 if isinstance(file_location, str): 74 # if obj is a string it defaults to create a Path obj, pass the S3Path if needed 75 file_location = Path(file_location) 76 if not file_location.exists(): 77 raise FileExistsError("File does not exist: " + str(file_location)) 78 self.file_location = file_location 79 self.analyzer = analyzer 80 self.instrument_label = instrument_label 81 82 if sample_name: 83 self.sample_name = sample_name 84 else: 85 self.sample_name = file_location.stem
def
load(self):
87 def load(self): 88 """ 89 Load mzML file using pymzml.run.Reader and return the data as a numpy array. 90 91 Returns 92 ------- 93 numpy.ndarray 94 The mass spectra data as a numpy array. 95 """ 96 data = pymzml.run.Reader(self.file_location) 97 return data
Load mzML file using pymzml.run.Reader and return the data as a numpy array.
Returns
- numpy.ndarray: The mass spectra data as a numpy array.
def
get_scan_df(self, data):
99 def get_scan_df(self, data): 100 """ 101 Return scan data as a pandas DataFrame. 102 103 Parameters 104 ---------- 105 data : pymzml.run.Reader 106 The mass spectra data. 107 108 Returns 109 ------- 110 pandas.DataFrame 111 A pandas DataFrame containing metadata for each scan, including scan number, MS level, polarity, and scan time. 112 """ 113 # Scan dict 114 # instatinate scan dict, with empty lists of size of scans 115 n_scans = data.get_spectrum_count() 116 scan_dict = { 117 "scan": np.empty(n_scans, dtype=np.int32), 118 "scan_time": np.empty(n_scans, dtype=np.float32), 119 "ms_level": [None] * n_scans, 120 "polarity": [None] * n_scans, 121 "precursor_mz": [None] * n_scans, 122 "scan_text": [None] * n_scans, 123 "scan_window_lower": np.empty(n_scans, dtype=np.float32), 124 "scan_window_upper": np.empty(n_scans, dtype=np.float32), 125 "scan_precision": [None] * n_scans, 126 "tic": np.empty(n_scans, dtype=np.float32), 127 "ms_format": [None] * n_scans, 128 } 129 130 # First pass: loop through scans to get scan info 131 for i, spec in enumerate(data): 132 scan_dict["scan"][i] = spec.ID 133 scan_dict["ms_level"][i] = spec.ms_level 134 scan_dict["scan_precision"][i] = spec._measured_precision 135 scan_dict["tic"][i] = spec.TIC 136 if spec.selected_precursors: 137 scan_dict["precursor_mz"][i] = spec.selected_precursors[0].get( 138 "mz", None 139 ) 140 if spec["negative scan"] is not None: 141 scan_dict["polarity"][i] = "negative" 142 if spec["positive scan"] is not None: 143 scan_dict["polarity"][i] = "positive" 144 if spec["negative scan"] is not None and spec["positive scan"] is not None: 145 raise ValueError( 146 "Error: scan {0} has both negative and positive polarity".format( 147 spec.ID 148 ) 149 ) 150 151 scan_dict["scan_time"][i] = spec.get("MS:1000016") 152 scan_dict["scan_text"][i] = spec.get("MS:1000512") 153 scan_dict["scan_window_lower"][i] = spec.get("MS:1000501") 154 scan_dict["scan_window_upper"][i] = spec.get("MS:1000500") 155 if spec.get("MS:1000128"): 156 scan_dict["ms_format"][i] = "profile" 157 elif spec.get("MS:1000127"): 158 scan_dict["ms_format"][i] = "centroid" 159 else: 160 scan_dict["ms_format"][i] = None 161 162 scan_df = pd.DataFrame(scan_dict) 163 164 return scan_df
Return scan data as a pandas DataFrame.
Parameters
- data (pymzml.run.Reader): The mass spectra data.
Returns
- pandas.DataFrame: A pandas DataFrame containing metadata for each scan, including scan number, MS level, polarity, and scan time.
def
get_ms_raw(self, spectra, scan_df, data):
166 def get_ms_raw(self, spectra, scan_df, data): 167 """Return a dictionary of mass spectra data as a pandas DataFrame. 168 169 Parameters 170 ---------- 171 spectra : str 172 Which mass spectra data to include in the output. 173 Options: None, "ms1", "ms2", "all". 174 scan_df : pandas.DataFrame 175 Scan dataframe. Output from get_scan_df(). 176 data : pymzml.run.Reader 177 The mass spectra data. 178 179 Returns 180 ------- 181 dict 182 A dictionary containing the mass spectra data as pandas DataFrames, with keys corresponding to the MS level. 183 184 """ 185 if spectra == "all": 186 scan_df_forspec = scan_df 187 elif spectra == "ms1": 188 scan_df_forspec = scan_df[scan_df.ms_level == 1] 189 elif spectra == "ms2": 190 scan_df_forspec = scan_df[scan_df.ms_level == 2] 191 else: 192 raise ValueError("spectra must be 'all', 'ms1', or 'ms2'") 193 194 # Result container 195 res = {} 196 197 # Row count container 198 counter = {} 199 200 # Column name container 201 cols = {} 202 203 # set at float32 204 dtype = np.float32 205 206 # First pass: get nrows 207 N = defaultdict(lambda: 0) 208 for i, spec in enumerate(data): 209 if i in scan_df_forspec.scan: 210 # Get ms level 211 level = "ms{}".format(spec.ms_level) 212 213 # Number of rows 214 N[level] += spec.mz.shape[0] 215 216 # Second pass: parse 217 for i, spec in enumerate(data): 218 if i in scan_df_forspec.scan: 219 # Number of rows 220 n = spec.mz.shape[0] 221 222 # No measurements 223 if n == 0: 224 continue 225 226 # Dimension check 227 if len(spec.mz) != len(spec.i): 228 # raise an error if the mz and intensity arrays are not the same length 229 raise ValueError("m/z and intensity array dimension mismatch") 230 231 # Scan/frame info 232 id_dict = spec.id_dict 233 234 # Get ms level 235 level = "ms{}".format(spec.ms_level) 236 237 # Columns 238 cols[level] = list(id_dict.keys()) + ["mz", "intensity"] 239 m = len(cols[level]) 240 241 # Subarray init 242 arr = np.empty((n, m), dtype=dtype) 243 inx = 0 244 245 # Populate scan/frame info 246 for k, v in id_dict.items(): 247 arr[:, inx] = v 248 inx += 1 249 250 # Populate m/z 251 arr[:, inx] = spec.mz 252 inx += 1 253 254 # Populate intensity 255 arr[:, inx] = spec.i 256 inx += 1 257 258 # Initialize output container 259 if level not in res: 260 res[level] = np.empty((N[level], m), dtype=dtype) 261 counter[level] = 0 262 263 # Insert subarray 264 res[level][counter[level] : counter[level] + n, :] = arr 265 counter[level] += n 266 267 # Construct ms1 and ms2 mz dataframes 268 for level in res.keys(): 269 res[level] = pd.DataFrame(res[level], columns=cols[level]).drop( 270 columns=["controllerType", "controllerNumber"], 271 axis=1, 272 inplace=False, 273 ) 274 275 return res
Return a dictionary of mass spectra data as a pandas DataFrame.
Parameters
- spectra (str): Which mass spectra data to include in the output. Options: None, "ms1", "ms2", "all".
- scan_df (pandas.DataFrame): Scan dataframe. Output from get_scan_df().
- data (pymzml.run.Reader): The mass spectra data.
Returns
- dict: A dictionary containing the mass spectra data as pandas DataFrames, with keys corresponding to the MS level.
def
run(self, spectra='all', scan_df=None):
277 def run(self, spectra="all", scan_df=None): 278 """Parse the mzML file and return a dictionary of spectra dataframes and a scan metadata dataframe. 279 280 Parameters 281 ---------- 282 spectra : str, optional 283 Which mass spectra data to include in the output. Default is "all". 284 Other options: None, "ms1", "ms2". 285 scan_df : pandas.DataFrame, optional 286 Scan dataframe. If not provided, the scan dataframe is created from the mzML file. 287 288 Returns 289 ------- 290 tuple 291 A tuple containing two elements: 292 - A dictionary containing the mass spectra data as numpy arrays, with keys corresponding to the MS level. 293 - A pandas DataFrame containing metadata for each scan, including scan number, MS level, polarity, and scan time. 294 """ 295 296 # Open file 297 data = self.load() 298 299 if scan_df is None: 300 scan_df = self.get_scan_df(data) 301 302 if spectra != "none": 303 res = self.get_ms_raw(spectra, scan_df, data) 304 305 else: 306 res = None 307 308 return res, scan_df
Parse the mzML file and return a dictionary of spectra dataframes and a scan metadata dataframe.
Parameters
- spectra (str, optional): Which mass spectra data to include in the output. Default is "all". Other options: None, "ms1", "ms2".
- scan_df (pandas.DataFrame, optional): Scan dataframe. If not provided, the scan dataframe is created from the mzML file.
Returns
- tuple: A tuple containing two elements:
- A dictionary containing the mass spectra data as numpy arrays, with keys corresponding to the MS level.
- A pandas DataFrame containing metadata for each scan, including scan number, MS level, polarity, and scan time.
def
get_mass_spectrum_from_scan(self, scan_number, spectrum_mode, auto_process=True):
310 def get_mass_spectrum_from_scan( 311 self, scan_number, spectrum_mode, auto_process=True 312 ): 313 """Instatiate a mass spectrum object from the mzML file. 314 315 Parameters 316 ---------- 317 scan_number : int 318 The scan number to be parsed. 319 spectrum_mode : str 320 The type of spectrum to instantiate. Must be'profile' or 'centroid'. 321 polarity : int 322 The polarity of the scan. Must be -1 or 1. 323 auto_process : bool, optional 324 If True, process the mass spectrum. Default is True. 325 326 Returns 327 ------- 328 MassSpecProfile | MassSpecCentroid 329 The MassSpecProfile or MassSpecCentroid object containing the parsed mass spectrum. 330 """ 331 # Use the batch function and return the first result 332 result_list = self.get_mass_spectra_from_scan_list( 333 [scan_number], spectrum_mode, auto_process 334 ) 335 return result_list[0] if result_list else None
Instatiate a mass spectrum object from the mzML file.
Parameters
- scan_number (int): The scan number to be parsed.
- spectrum_mode (str): The type of spectrum to instantiate. Must be'profile' or 'centroid'.
- polarity (int): The polarity of the scan. Must be -1 or 1.
- auto_process (bool, optional): If True, process the mass spectrum. Default is True.
Returns
- MassSpecProfile | MassSpecCentroid: The MassSpecProfile or MassSpecCentroid object containing the parsed mass spectrum.
def
get_mass_spectra_from_scan_list(self, scan_list, spectrum_mode, auto_process=True):
337 def get_mass_spectra_from_scan_list( 338 self, scan_list, spectrum_mode, auto_process=True 339 ): 340 """Instatiate mass spectrum objects from the mzML file. 341 342 Parameters 343 ---------- 344 scan_list : list of int 345 The scan numbers to be parsed. 346 spectrum_mode : str 347 The type of spectrum to instantiate. Must be'profile' or 'centroid'. 348 auto_process : bool, optional 349 If True, process the mass spectrum. Default is True. 350 351 Returns 352 ------- 353 list of MassSpecProfile | MassSpecCentroid 354 List of MassSpecProfile or MassSpecCentroid objects containing the parsed mass spectra. 355 """ 356 357 def set_metadata( 358 scan_number: int, 359 polarity: int, 360 file_location: str, 361 label=Labels.thermo_profile, 362 ): 363 """ 364 Set the output parameters for creating a MassSpecProfile or MassSpecCentroid object. 365 366 Parameters 367 ---------- 368 scan_number : int 369 The scan number. 370 polarity : int 371 The polarity of the data. 372 file_location : str 373 The file location. 374 label : str, optional 375 The label for the mass spectrum. Default is Labels.thermo_profile. 376 377 Returns 378 ------- 379 dict 380 The output parameters ready for creating a MassSpecProfile or MassSpecCentroid object. 381 """ 382 d_params = default_parameters(file_location) 383 d_params["label"] = label 384 d_params["polarity"] = polarity 385 d_params["filename_path"] = file_location 386 d_params["scan_number"] = scan_number 387 388 return d_params 389 390 # Open file 391 data = self.load() 392 393 mass_spectrum_objects = [] 394 395 for scan_number in scan_list: 396 # Pluck out individual scan mz and intensity 397 spec = data[scan_number] 398 399 # Get polarity 400 if spec["negative scan"] is not None: 401 polarity = -1 402 elif spec["positive scan"] is not None: 403 polarity = 1 404 405 # Get mass spectrum 406 if spectrum_mode == "profile": 407 # Check if profile 408 if not spec.get("MS:1000128"): 409 raise ValueError("spectrum is not profile") 410 data_dict = { 411 Labels.mz: spec.mz, 412 Labels.abundance: spec.i, 413 } 414 d_params = set_metadata( 415 scan_number, 416 polarity, 417 self.file_location, 418 label=Labels.simulated_profile, 419 ) 420 mass_spectrum_obj = MassSpecProfile( 421 data_dict, d_params, auto_process=auto_process 422 ) 423 elif spectrum_mode == "centroid": 424 # Check if centroided 425 if not spec.get("MS:1000127"): 426 raise ValueError("spectrum is not centroided") 427 data_dict = { 428 Labels.mz: spec.mz, 429 Labels.abundance: spec.i, 430 Labels.rp: [np.nan] * len(spec.mz), 431 Labels.s2n: [np.nan] * len(spec.i), 432 } 433 d_params = set_metadata( 434 scan_number, polarity, self.file_location, label=Labels.corems_centroid 435 ) 436 mass_spectrum_obj = MassSpecCentroid( 437 data_dict, d_params, auto_process=auto_process 438 ) 439 440 mass_spectrum_objects.append(mass_spectrum_obj) 441 442 return mass_spectrum_objects
Instatiate mass spectrum objects from the mzML file.
Parameters
- scan_list (list of int): The scan numbers to be parsed.
- spectrum_mode (str): The type of spectrum to instantiate. Must be'profile' or 'centroid'.
- auto_process (bool, optional): If True, process the mass spectrum. Default is True.
Returns
- list of MassSpecProfile | MassSpecCentroid: List of MassSpecProfile or MassSpecCentroid objects containing the parsed mass spectra.
def
get_mass_spectra_obj(self):
444 def get_mass_spectra_obj(self): 445 """Instatiate a MassSpectraBase object from the mzML file. 446 447 448 Returns 449 ------- 450 MassSpectraBase 451 The MassSpectra object containing the parsed mass spectra. 452 The object is instatiated with the mzML file, analyzer, instrument, sample name, and scan dataframe. 453 """ 454 _, scan_df = self.run(spectra=False) 455 mass_spectra_obj = MassSpectraBase( 456 self.file_location, 457 self.analyzer, 458 self.instrument_label, 459 self.sample_name, 460 self, 461 ) 462 scan_df = scan_df.set_index("scan", drop=False) 463 mass_spectra_obj.scan_df = scan_df 464 465 return mass_spectra_obj
Instatiate a MassSpectraBase object from the mzML file.
Returns
- MassSpectraBase: The MassSpectra object containing the parsed mass spectra. The object is instatiated with the mzML file, analyzer, instrument, sample name, and scan dataframe.
def
get_lcms_obj(self, spectra='all'):
467 def get_lcms_obj(self, spectra="all"): 468 """Instatiates a LCMSBase object from the mzML file. 469 470 Parameters 471 ---------- 472 spectra : str, optional 473 Which mass spectra data to include in the output. Default is all. Other options: none, ms1, ms2. 474 475 Returns 476 ------- 477 LCMSBase 478 LCMS object containing mass spectra data. 479 The object is instatiated with the mzML file, analyzer, instrument, sample name, scan dataframe, 480 and mz dataframe(s), as well as lists of scan numbers, retention times, and TICs. 481 """ 482 _, scan_df = self.run(spectra="none") # first run it to just get scan info 483 if spectra != "none": 484 res, scan_df = self.run( 485 scan_df=scan_df, spectra=spectra 486 ) # second run to parse data 487 lcms_obj = LCMSBase( 488 self.file_location, 489 self.analyzer, 490 self.instrument_label, 491 self.sample_name, 492 self, 493 ) 494 if spectra != "none": 495 for key in res: 496 key_int = int(key.replace("ms", "")) 497 res[key] = res[key][res[key].intensity > 0] 498 res[key] = res[key].sort_values(by=["scan", "mz"]).reset_index(drop=True) 499 lcms_obj._ms_unprocessed[key_int] = res[key] 500 lcms_obj.scan_df = scan_df.set_index("scan", drop=False) 501 # Check if polarity is mixed 502 if len(set(scan_df.polarity)) > 1: 503 raise ValueError("Mixed polarities detected in scan data") 504 lcms_obj.polarity = scan_df.polarity[0] 505 lcms_obj._scans_number_list = list(scan_df.scan) 506 lcms_obj._retention_time_list = list(scan_df.scan_time) 507 lcms_obj._tic_list = list(scan_df.tic) 508 509 return lcms_obj
Instatiates a LCMSBase object from the mzML file.
Parameters
- spectra (str, optional): Which mass spectra data to include in the output. Default is all. Other options: none, ms1, ms2.
Returns
- LCMSBase: LCMS object containing mass spectra data. The object is instatiated with the mzML file, analyzer, instrument, sample name, scan dataframe, and mz dataframe(s), as well as lists of scan numbers, retention times, and TICs.
def
get_instrument_info(self):
511 def get_instrument_info(self): 512 """ 513 Return instrument information. 514 515 Returns 516 ------- 517 dict 518 A dictionary with the keys 'model' and 'serial_number'. 519 """ 520 # Load the pymzml data 521 data = self.load() 522 instrument_info = data.info.get('referenceable_param_group_list_element')[0] 523 cv_params = instrument_info.findall('{http://psi.hupo.org/ms/mzml}cvParam') 524 525 # Extract details from each cvParam 526 params = [] 527 for param in cv_params: 528 accession = param.get('accession') # Get 'accession' attribute 529 name = param.get('name') # Get 'name' attribute 530 value = param.get('value') # Get 'value' attribute 531 params.append({ 532 'accession': accession, 533 'name': name, 534 'value': value 535 }) 536 537 # Loop through params and try to find the relevant information 538 instrument_dict = { 539 'model': 'Unknown', 540 'serial_number': 'Unknown' 541 } 542 543 # Assuming there are only two paramters here - one is for the serial number (agnostic to the model) and the other is for the model 544 # If there are more than two, we raise an error 545 if len(params) < 2: 546 raise ValueError("Not enough parameters found in the instrument info, cannot parse.") 547 if len(params) > 2: 548 raise ValueError("Too many parameters found in the instrument info, cannot parse.") 549 for param in params: 550 if param['accession'] == 'MS:1000529': 551 instrument_dict['serial_number'] = param['value'] 552 else: 553 instrument_dict['model'] = data.OT[param['accession']] 554 555 return instrument_dict
Return instrument information.
Returns
- dict: A dictionary with the keys 'model' and 'serial_number'.
def
get_creation_time(self) -> datetime.datetime:
557 def get_creation_time(self) -> datetime.datetime: 558 """ 559 Return the creation time of the mzML file. 560 """ 561 data = self.load() 562 write_time = data.info.get('start_time') 563 if write_time: 564 # Convert the write time to a datetime object 565 return datetime.datetime.strptime(write_time, "%Y-%m-%dT%H:%M:%SZ") 566 else: 567 raise ValueError("Creation time is not available in the mzML file. " 568 "Please ensure the file contains the 'start_time' information.")
Return the creation time of the mzML file.