corems.mass_spectra.input.mzml
1from collections import defaultdict 2from pathlib import Path 3 4import numpy as np 5import pandas as pd 6import pymzml 7 8from corems.encapsulation.constant import Labels 9from corems.encapsulation.factory.parameters import default_parameters 10from corems.mass_spectra.factory.lc_class import LCMSBase, MassSpectraBase 11from corems.mass_spectra.input.parserbase import SpectraParserInterface 12from corems.mass_spectrum.factory.MassSpectrumClasses import ( 13 MassSpecCentroid, 14 MassSpecProfile, 15) 16 17 18class MZMLSpectraParser(SpectraParserInterface): 19 """A class for parsing mzml spectrometry data files into MassSpectraBase or LCMSBase objects 20 21 Parameters 22 ---------- 23 file_location : str or Path 24 The path to the RAW file to be parsed. 25 analyzer : str, optional 26 The type of mass analyzer used in the instrument. Default is "Unknown". 27 instrument_label : str, optional 28 The name of the instrument used to acquire the data. Default is "Unknown". 29 sample_name : str, optional 30 The name of the sample being analyzed. If not provided, the stem of the file_location path will be used. 31 32 Attributes 33 ---------- 34 file_location : Path 35 The path to the RAW file being parsed. 36 analyzer : str 37 The type of mass analyzer used in the instrument. 38 instrument_label : str 39 The name of the instrument used to acquire the data. 40 sample_name : str 41 The name of the sample being analyzed. 42 43 Methods 44 ------- 45 * load(). 46 Load mzML file using pymzml.run.Reader and return the data as a numpy array. 47 * run(spectra=True). 48 Parses the mzml file and returns a dictionary of mass spectra dataframes and a scan metadata dataframe. 49 * get_mass_spectrum_from_scan(scan_number, polarity, auto_process=True) 50 Parses the mzml file and returns a MassSpecBase object from a single scan. 51 * get_mass_spectra_obj(). 52 Parses the mzml file and instantiates a MassSpectraBase object. 53 * get_lcms_obj(). 54 Parses the mzml file and instantiates an LCMSBase object. 55 56 Inherits from ThermoBaseClass and SpectraParserInterface 57 """ 58 59 def __init__( 60 self, 61 file_location, 62 analyzer="Unknown", 63 instrument_label="Unknown", 64 sample_name=None, 65 ): 66 # implementation details 67 if isinstance(file_location, str): 68 # if obj is a string it defaults to create a Path obj, pass the S3Path if needed 69 file_location = Path(file_location) 70 if not file_location.exists(): 71 raise FileExistsError("File does not exist: " + str(file_location)) 72 self.file_location = file_location 73 self.analyzer = analyzer 74 self.instrument_label = instrument_label 75 76 if sample_name: 77 self.sample_name = sample_name 78 else: 79 self.sample_name = file_location.stem 80 81 def load(self): 82 """ 83 Load mzML file using pymzml.run.Reader and return the data as a numpy array. 84 85 Returns 86 ------- 87 numpy.ndarray 88 The mass spectra data as a numpy array. 89 """ 90 data = pymzml.run.Reader(self.file_location) 91 return data 92 93 def get_scan_df(self, data): 94 """ 95 Return scan data as a pandas DataFrame. 96 97 Parameters 98 ---------- 99 data : pymzml.run.Reader 100 The mass spectra data. 101 102 Returns 103 ------- 104 pandas.DataFrame 105 A pandas DataFrame containing metadata for each scan, including scan number, MS level, polarity, and scan time. 106 """ 107 # Scan dict 108 # instatinate scan dict, with empty lists of size of scans 109 n_scans = data.get_spectrum_count() 110 scan_dict = { 111 "scan": np.empty(n_scans, dtype=np.int32), 112 "scan_time": np.empty(n_scans, dtype=np.float32), 113 "ms_level": [None] * n_scans, 114 "polarity": [None] * n_scans, 115 "precursor_mz": [None] * n_scans, 116 "scan_text": [None] * n_scans, 117 "scan_window_lower": np.empty(n_scans, dtype=np.float32), 118 "scan_window_upper": np.empty(n_scans, dtype=np.float32), 119 "scan_precision": [None] * n_scans, 120 "tic": np.empty(n_scans, dtype=np.float32), 121 "ms_format": [None] * n_scans, 122 } 123 124 # First pass: loop through scans to get scan info 125 for i, spec in enumerate(data): 126 scan_dict["scan"][i] = spec.ID 127 scan_dict["ms_level"][i] = spec.ms_level 128 scan_dict["scan_precision"][i] = spec._measured_precision 129 scan_dict["tic"][i] = spec.TIC 130 if spec.selected_precursors: 131 scan_dict["precursor_mz"][i] = spec.selected_precursors[0].get( 132 "mz", None 133 ) 134 if spec["negative scan"] is not None: 135 scan_dict["polarity"][i] = "negative" 136 if spec["positive scan"] is not None: 137 scan_dict["polarity"][i] = "positive" 138 if spec["negative scan"] is not None and spec["positive scan"] is not None: 139 raise ValueError( 140 "Error: scan {0} has both negative and positive polarity".format( 141 spec.ID 142 ) 143 ) 144 145 scan_dict["scan_time"][i] = spec.get("MS:1000016") 146 scan_dict["scan_text"][i] = spec.get("MS:1000512") 147 scan_dict["scan_window_lower"][i] = spec.get("MS:1000501") 148 scan_dict["scan_window_upper"][i] = spec.get("MS:1000500") 149 if spec.get("MS:1000128"): 150 scan_dict["ms_format"][i] = "profile" 151 elif spec.get("MS:1000127"): 152 scan_dict["ms_format"][i] = "centroid" 153 else: 154 scan_dict["ms_format"][i] = None 155 156 scan_df = pd.DataFrame(scan_dict) 157 158 return scan_df 159 160 def get_ms_raw(self, spectra, scan_df, data): 161 """Return a dictionary of mass spectra data as a pandas DataFrame. 162 163 Parameters 164 ---------- 165 spectra : str 166 Which mass spectra data to include in the output. 167 Options: None, "ms1", "ms2", "all". 168 scan_df : pandas.DataFrame 169 Scan dataframe. Output from get_scan_df(). 170 data : pymzml.run.Reader 171 The mass spectra data. 172 173 Returns 174 ------- 175 dict 176 A dictionary containing the mass spectra data as pandas DataFrames, with keys corresponding to the MS level. 177 178 """ 179 if spectra == "all": 180 scan_df_forspec = scan_df 181 elif spectra == "ms1": 182 scan_df_forspec = scan_df[scan_df.ms_level == 1] 183 elif spectra == "ms2": 184 scan_df_forspec = scan_df[scan_df.ms_level == 2] 185 else: 186 raise ValueError("spectra must be 'all', 'ms1', or 'ms2'") 187 188 # Result container 189 res = {} 190 191 # Row count container 192 counter = {} 193 194 # Column name container 195 cols = {} 196 197 # set at float32 198 dtype = np.float32 199 200 # First pass: get nrows 201 N = defaultdict(lambda: 0) 202 for i, spec in enumerate(data): 203 if i in scan_df_forspec.scan: 204 # Get ms level 205 level = "ms{}".format(spec.ms_level) 206 207 # Number of rows 208 N[level] += spec.mz.shape[0] 209 210 # Second pass: parse 211 for i, spec in enumerate(data): 212 if i in scan_df_forspec.scan: 213 # Number of rows 214 n = spec.mz.shape[0] 215 216 # No measurements 217 if n == 0: 218 continue 219 220 # Dimension check 221 if len(spec.mz) != len(spec.i): 222 # raise an error if the mz and intensity arrays are not the same length 223 raise ValueError("m/z and intensity array dimension mismatch") 224 225 # Scan/frame info 226 id_dict = spec.id_dict 227 228 # Get ms level 229 level = "ms{}".format(spec.ms_level) 230 231 # Columns 232 cols[level] = list(id_dict.keys()) + ["mz", "intensity"] 233 m = len(cols[level]) 234 235 # Subarray init 236 arr = np.empty((n, m), dtype=dtype) 237 inx = 0 238 239 # Populate scan/frame info 240 for k, v in id_dict.items(): 241 arr[:, inx] = v 242 inx += 1 243 244 # Populate m/z 245 arr[:, inx] = spec.mz 246 inx += 1 247 248 # Populate intensity 249 arr[:, inx] = spec.i 250 inx += 1 251 252 # Initialize output container 253 if level not in res: 254 res[level] = np.empty((N[level], m), dtype=dtype) 255 counter[level] = 0 256 257 # Insert subarray 258 res[level][counter[level] : counter[level] + n, :] = arr 259 counter[level] += n 260 261 # Construct ms1 and ms2 mz dataframes 262 for level in res.keys(): 263 res[level] = pd.DataFrame(res[level], columns=cols[level]).drop( 264 columns=["controllerType", "controllerNumber"], 265 axis=1, 266 inplace=False, 267 ) 268 269 return res 270 271 def run(self, spectra="all", scan_df=None): 272 """Parse the mzML file and return a dictionary of spectra dataframes and a scan metadata dataframe. 273 274 Parameters 275 ---------- 276 spectra : str, optional 277 Which mass spectra data to include in the output. Default is "all". 278 Other options: None, "ms1", "ms2". 279 scan_df : pandas.DataFrame, optional 280 Scan dataframe. If not provided, the scan dataframe is created from the mzML file. 281 282 Returns 283 ------- 284 tuple 285 A tuple containing two elements: 286 - A dictionary containing the mass spectra data as numpy arrays, with keys corresponding to the MS level. 287 - A pandas DataFrame containing metadata for each scan, including scan number, MS level, polarity, and scan time. 288 """ 289 290 # Open file 291 data = self.load() 292 293 if scan_df is None: 294 scan_df = self.get_scan_df(data) 295 296 if spectra != "none": 297 res = self.get_ms_raw(spectra, scan_df, data) 298 299 else: 300 res = None 301 302 return res, scan_df 303 304 def get_mass_spectrum_from_scan( 305 self, scan_number, spectrum_mode, auto_process=True 306 ): 307 """Instatiate a mass spectrum object from the mzML file. 308 309 Parameters 310 ---------- 311 scan_number : int 312 The scan number to be parsed. 313 spectrum_mode : str 314 The type of spectrum to instantiate. Must be'profile' or 'centroid'. 315 polarity : int 316 The polarity of the scan. Must be -1 or 1. 317 auto_process : bool, optional 318 If True, process the mass spectrum. Default is True. 319 320 Returns 321 ------- 322 MassSpecProfile | MassSpecCentroid 323 The MassSpecProfile or MassSpecCentroid object containing the parsed mass spectrum. 324 """ 325 326 def set_metadata( 327 scan_number: int, 328 polarity: int, 329 file_location: str, 330 label=Labels.thermo_profile, 331 ): 332 """ 333 Set the output parameters for creating a MassSpecProfile or MassSpecCentroid object. 334 335 Parameters 336 ---------- 337 scan_number : int 338 The scan number. 339 polarity : int 340 The polarity of the data. 341 file_location : str 342 The file location. 343 label : str, optional 344 The label for the mass spectrum. Default is Labels.thermo_profile. 345 346 Returns 347 ------- 348 dict 349 The output parameters ready for creating a MassSpecProfile or MassSpecCentroid object. 350 """ 351 d_params = default_parameters(file_location) 352 d_params["label"] = label 353 d_params["polarity"] = polarity 354 d_params["filename_path"] = file_location 355 d_params["scan_number"] = scan_number 356 357 return d_params 358 359 # Open file 360 data = self.load() 361 362 # Pluck out individual scan mz and intensity 363 spec = data[scan_number] 364 365 # Get polarity 366 if spec["negative scan"] is not None: 367 polarity = -1 368 elif spec["positive scan"] is not None: 369 polarity = 1 370 371 # Get mass spectrum 372 if spectrum_mode == "profile": 373 # Check if profile 374 if not spec.get("MS:1000128"): 375 raise ValueError("spectrum is not profile") 376 data_dict = { 377 Labels.mz: spec.mz, 378 Labels.abundance: spec.i, 379 } 380 d_params = set_metadata( 381 scan_number, 382 polarity, 383 self.file_location, 384 label=Labels.simulated_profile, 385 ) 386 mass_spectrum_obj = mass_spectrum_obj = MassSpecProfile( 387 data_dict, d_params, auto_process=auto_process 388 ) 389 elif spectrum_mode == "centroid": 390 # Check if centroided 391 if not spec.get("MS:1000127"): 392 raise ValueError("spectrum is not centroided") 393 data_dict = { 394 Labels.mz: spec.mz, 395 Labels.abundance: spec.i, 396 Labels.rp: [np.nan] * len(spec.mz), 397 Labels.s2n: [np.nan] * len(spec.i), 398 } 399 d_params = set_metadata( 400 scan_number, polarity, self.file_location, label=Labels.corems_centroid 401 ) 402 mass_spectrum_obj = MassSpecCentroid( 403 data_dict, d_params, auto_process=auto_process 404 ) 405 406 return mass_spectrum_obj 407 408 def get_mass_spectra_obj(self): 409 """Instatiate a MassSpectraBase object from the mzML file. 410 411 412 Returns 413 ------- 414 MassSpectraBase 415 The MassSpectra object containing the parsed mass spectra. 416 The object is instatiated with the mzML file, analyzer, instrument, sample name, and scan dataframe. 417 """ 418 _, scan_df = self.run(spectra=False) 419 mass_spectra_obj = MassSpectraBase( 420 self.file_location, 421 self.analyzer, 422 self.instrument_label, 423 self.sample_name, 424 self, 425 ) 426 scan_df = scan_df.set_index("scan", drop=False) 427 mass_spectra_obj.scan_df = scan_df 428 429 return mass_spectra_obj 430 431 def get_lcms_obj(self, spectra="all"): 432 """Instatiates a LCMSBase object from the mzML file. 433 434 Parameters 435 ---------- 436 spectra : str, optional 437 Which mass spectra data to include in the output. Default is all. Other options: none, ms1, ms2. 438 439 Returns 440 ------- 441 LCMSBase 442 LCMS object containing mass spectra data. 443 The object is instatiated with the mzML file, analyzer, instrument, sample name, scan dataframe, 444 and mz dataframe(s), as well as lists of scan numbers, retention times, and TICs. 445 """ 446 _, scan_df = self.run(spectra="none") # first run it to just get scan info 447 res, scan_df = self.run( 448 scan_df=scan_df, spectra=spectra 449 ) # second run to parse data 450 lcms_obj = LCMSBase( 451 self.file_location, 452 self.analyzer, 453 self.instrument_label, 454 self.sample_name, 455 self, 456 ) 457 for key in res: 458 key_int = int(key.replace("ms", "")) 459 res[key] = res[key][res[key].intensity > 0] 460 res[key] = res[key].sort_values(by=["scan", "mz"]).reset_index(drop=True) 461 lcms_obj._ms_unprocessed[key_int] = res[key] 462 lcms_obj.scan_df = scan_df.set_index("scan", drop=False) 463 # Check if polarity is mixed 464 if len(set(scan_df.polarity)) > 1: 465 raise ValueError("Mixed polarities detected in scan data") 466 lcms_obj.polarity = scan_df.polarity[0] 467 lcms_obj._scans_number_list = list(scan_df.scan) 468 lcms_obj._retention_time_list = list(scan_df.scan_time) 469 lcms_obj._tic_list = list(scan_df.tic) 470 471 return lcms_obj
19class MZMLSpectraParser(SpectraParserInterface): 20 """A class for parsing mzml spectrometry data files into MassSpectraBase or LCMSBase objects 21 22 Parameters 23 ---------- 24 file_location : str or Path 25 The path to the RAW file to be parsed. 26 analyzer : str, optional 27 The type of mass analyzer used in the instrument. Default is "Unknown". 28 instrument_label : str, optional 29 The name of the instrument used to acquire the data. Default is "Unknown". 30 sample_name : str, optional 31 The name of the sample being analyzed. If not provided, the stem of the file_location path will be used. 32 33 Attributes 34 ---------- 35 file_location : Path 36 The path to the RAW file being parsed. 37 analyzer : str 38 The type of mass analyzer used in the instrument. 39 instrument_label : str 40 The name of the instrument used to acquire the data. 41 sample_name : str 42 The name of the sample being analyzed. 43 44 Methods 45 ------- 46 * load(). 47 Load mzML file using pymzml.run.Reader and return the data as a numpy array. 48 * run(spectra=True). 49 Parses the mzml file and returns a dictionary of mass spectra dataframes and a scan metadata dataframe. 50 * get_mass_spectrum_from_scan(scan_number, polarity, auto_process=True) 51 Parses the mzml file and returns a MassSpecBase object from a single scan. 52 * get_mass_spectra_obj(). 53 Parses the mzml file and instantiates a MassSpectraBase object. 54 * get_lcms_obj(). 55 Parses the mzml file and instantiates an LCMSBase object. 56 57 Inherits from ThermoBaseClass and SpectraParserInterface 58 """ 59 60 def __init__( 61 self, 62 file_location, 63 analyzer="Unknown", 64 instrument_label="Unknown", 65 sample_name=None, 66 ): 67 # implementation details 68 if isinstance(file_location, str): 69 # if obj is a string it defaults to create a Path obj, pass the S3Path if needed 70 file_location = Path(file_location) 71 if not file_location.exists(): 72 raise FileExistsError("File does not exist: " + str(file_location)) 73 self.file_location = file_location 74 self.analyzer = analyzer 75 self.instrument_label = instrument_label 76 77 if sample_name: 78 self.sample_name = sample_name 79 else: 80 self.sample_name = file_location.stem 81 82 def load(self): 83 """ 84 Load mzML file using pymzml.run.Reader and return the data as a numpy array. 85 86 Returns 87 ------- 88 numpy.ndarray 89 The mass spectra data as a numpy array. 90 """ 91 data = pymzml.run.Reader(self.file_location) 92 return data 93 94 def get_scan_df(self, data): 95 """ 96 Return scan data as a pandas DataFrame. 97 98 Parameters 99 ---------- 100 data : pymzml.run.Reader 101 The mass spectra data. 102 103 Returns 104 ------- 105 pandas.DataFrame 106 A pandas DataFrame containing metadata for each scan, including scan number, MS level, polarity, and scan time. 107 """ 108 # Scan dict 109 # instatinate scan dict, with empty lists of size of scans 110 n_scans = data.get_spectrum_count() 111 scan_dict = { 112 "scan": np.empty(n_scans, dtype=np.int32), 113 "scan_time": np.empty(n_scans, dtype=np.float32), 114 "ms_level": [None] * n_scans, 115 "polarity": [None] * n_scans, 116 "precursor_mz": [None] * n_scans, 117 "scan_text": [None] * n_scans, 118 "scan_window_lower": np.empty(n_scans, dtype=np.float32), 119 "scan_window_upper": np.empty(n_scans, dtype=np.float32), 120 "scan_precision": [None] * n_scans, 121 "tic": np.empty(n_scans, dtype=np.float32), 122 "ms_format": [None] * n_scans, 123 } 124 125 # First pass: loop through scans to get scan info 126 for i, spec in enumerate(data): 127 scan_dict["scan"][i] = spec.ID 128 scan_dict["ms_level"][i] = spec.ms_level 129 scan_dict["scan_precision"][i] = spec._measured_precision 130 scan_dict["tic"][i] = spec.TIC 131 if spec.selected_precursors: 132 scan_dict["precursor_mz"][i] = spec.selected_precursors[0].get( 133 "mz", None 134 ) 135 if spec["negative scan"] is not None: 136 scan_dict["polarity"][i] = "negative" 137 if spec["positive scan"] is not None: 138 scan_dict["polarity"][i] = "positive" 139 if spec["negative scan"] is not None and spec["positive scan"] is not None: 140 raise ValueError( 141 "Error: scan {0} has both negative and positive polarity".format( 142 spec.ID 143 ) 144 ) 145 146 scan_dict["scan_time"][i] = spec.get("MS:1000016") 147 scan_dict["scan_text"][i] = spec.get("MS:1000512") 148 scan_dict["scan_window_lower"][i] = spec.get("MS:1000501") 149 scan_dict["scan_window_upper"][i] = spec.get("MS:1000500") 150 if spec.get("MS:1000128"): 151 scan_dict["ms_format"][i] = "profile" 152 elif spec.get("MS:1000127"): 153 scan_dict["ms_format"][i] = "centroid" 154 else: 155 scan_dict["ms_format"][i] = None 156 157 scan_df = pd.DataFrame(scan_dict) 158 159 return scan_df 160 161 def get_ms_raw(self, spectra, scan_df, data): 162 """Return a dictionary of mass spectra data as a pandas DataFrame. 163 164 Parameters 165 ---------- 166 spectra : str 167 Which mass spectra data to include in the output. 168 Options: None, "ms1", "ms2", "all". 169 scan_df : pandas.DataFrame 170 Scan dataframe. Output from get_scan_df(). 171 data : pymzml.run.Reader 172 The mass spectra data. 173 174 Returns 175 ------- 176 dict 177 A dictionary containing the mass spectra data as pandas DataFrames, with keys corresponding to the MS level. 178 179 """ 180 if spectra == "all": 181 scan_df_forspec = scan_df 182 elif spectra == "ms1": 183 scan_df_forspec = scan_df[scan_df.ms_level == 1] 184 elif spectra == "ms2": 185 scan_df_forspec = scan_df[scan_df.ms_level == 2] 186 else: 187 raise ValueError("spectra must be 'all', 'ms1', or 'ms2'") 188 189 # Result container 190 res = {} 191 192 # Row count container 193 counter = {} 194 195 # Column name container 196 cols = {} 197 198 # set at float32 199 dtype = np.float32 200 201 # First pass: get nrows 202 N = defaultdict(lambda: 0) 203 for i, spec in enumerate(data): 204 if i in scan_df_forspec.scan: 205 # Get ms level 206 level = "ms{}".format(spec.ms_level) 207 208 # Number of rows 209 N[level] += spec.mz.shape[0] 210 211 # Second pass: parse 212 for i, spec in enumerate(data): 213 if i in scan_df_forspec.scan: 214 # Number of rows 215 n = spec.mz.shape[0] 216 217 # No measurements 218 if n == 0: 219 continue 220 221 # Dimension check 222 if len(spec.mz) != len(spec.i): 223 # raise an error if the mz and intensity arrays are not the same length 224 raise ValueError("m/z and intensity array dimension mismatch") 225 226 # Scan/frame info 227 id_dict = spec.id_dict 228 229 # Get ms level 230 level = "ms{}".format(spec.ms_level) 231 232 # Columns 233 cols[level] = list(id_dict.keys()) + ["mz", "intensity"] 234 m = len(cols[level]) 235 236 # Subarray init 237 arr = np.empty((n, m), dtype=dtype) 238 inx = 0 239 240 # Populate scan/frame info 241 for k, v in id_dict.items(): 242 arr[:, inx] = v 243 inx += 1 244 245 # Populate m/z 246 arr[:, inx] = spec.mz 247 inx += 1 248 249 # Populate intensity 250 arr[:, inx] = spec.i 251 inx += 1 252 253 # Initialize output container 254 if level not in res: 255 res[level] = np.empty((N[level], m), dtype=dtype) 256 counter[level] = 0 257 258 # Insert subarray 259 res[level][counter[level] : counter[level] + n, :] = arr 260 counter[level] += n 261 262 # Construct ms1 and ms2 mz dataframes 263 for level in res.keys(): 264 res[level] = pd.DataFrame(res[level], columns=cols[level]).drop( 265 columns=["controllerType", "controllerNumber"], 266 axis=1, 267 inplace=False, 268 ) 269 270 return res 271 272 def run(self, spectra="all", scan_df=None): 273 """Parse the mzML file and return a dictionary of spectra dataframes and a scan metadata dataframe. 274 275 Parameters 276 ---------- 277 spectra : str, optional 278 Which mass spectra data to include in the output. Default is "all". 279 Other options: None, "ms1", "ms2". 280 scan_df : pandas.DataFrame, optional 281 Scan dataframe. If not provided, the scan dataframe is created from the mzML file. 282 283 Returns 284 ------- 285 tuple 286 A tuple containing two elements: 287 - A dictionary containing the mass spectra data as numpy arrays, with keys corresponding to the MS level. 288 - A pandas DataFrame containing metadata for each scan, including scan number, MS level, polarity, and scan time. 289 """ 290 291 # Open file 292 data = self.load() 293 294 if scan_df is None: 295 scan_df = self.get_scan_df(data) 296 297 if spectra != "none": 298 res = self.get_ms_raw(spectra, scan_df, data) 299 300 else: 301 res = None 302 303 return res, scan_df 304 305 def get_mass_spectrum_from_scan( 306 self, scan_number, spectrum_mode, auto_process=True 307 ): 308 """Instatiate a mass spectrum object from the mzML file. 309 310 Parameters 311 ---------- 312 scan_number : int 313 The scan number to be parsed. 314 spectrum_mode : str 315 The type of spectrum to instantiate. Must be'profile' or 'centroid'. 316 polarity : int 317 The polarity of the scan. Must be -1 or 1. 318 auto_process : bool, optional 319 If True, process the mass spectrum. Default is True. 320 321 Returns 322 ------- 323 MassSpecProfile | MassSpecCentroid 324 The MassSpecProfile or MassSpecCentroid object containing the parsed mass spectrum. 325 """ 326 327 def set_metadata( 328 scan_number: int, 329 polarity: int, 330 file_location: str, 331 label=Labels.thermo_profile, 332 ): 333 """ 334 Set the output parameters for creating a MassSpecProfile or MassSpecCentroid object. 335 336 Parameters 337 ---------- 338 scan_number : int 339 The scan number. 340 polarity : int 341 The polarity of the data. 342 file_location : str 343 The file location. 344 label : str, optional 345 The label for the mass spectrum. Default is Labels.thermo_profile. 346 347 Returns 348 ------- 349 dict 350 The output parameters ready for creating a MassSpecProfile or MassSpecCentroid object. 351 """ 352 d_params = default_parameters(file_location) 353 d_params["label"] = label 354 d_params["polarity"] = polarity 355 d_params["filename_path"] = file_location 356 d_params["scan_number"] = scan_number 357 358 return d_params 359 360 # Open file 361 data = self.load() 362 363 # Pluck out individual scan mz and intensity 364 spec = data[scan_number] 365 366 # Get polarity 367 if spec["negative scan"] is not None: 368 polarity = -1 369 elif spec["positive scan"] is not None: 370 polarity = 1 371 372 # Get mass spectrum 373 if spectrum_mode == "profile": 374 # Check if profile 375 if not spec.get("MS:1000128"): 376 raise ValueError("spectrum is not profile") 377 data_dict = { 378 Labels.mz: spec.mz, 379 Labels.abundance: spec.i, 380 } 381 d_params = set_metadata( 382 scan_number, 383 polarity, 384 self.file_location, 385 label=Labels.simulated_profile, 386 ) 387 mass_spectrum_obj = mass_spectrum_obj = MassSpecProfile( 388 data_dict, d_params, auto_process=auto_process 389 ) 390 elif spectrum_mode == "centroid": 391 # Check if centroided 392 if not spec.get("MS:1000127"): 393 raise ValueError("spectrum is not centroided") 394 data_dict = { 395 Labels.mz: spec.mz, 396 Labels.abundance: spec.i, 397 Labels.rp: [np.nan] * len(spec.mz), 398 Labels.s2n: [np.nan] * len(spec.i), 399 } 400 d_params = set_metadata( 401 scan_number, polarity, self.file_location, label=Labels.corems_centroid 402 ) 403 mass_spectrum_obj = MassSpecCentroid( 404 data_dict, d_params, auto_process=auto_process 405 ) 406 407 return mass_spectrum_obj 408 409 def get_mass_spectra_obj(self): 410 """Instatiate a MassSpectraBase object from the mzML file. 411 412 413 Returns 414 ------- 415 MassSpectraBase 416 The MassSpectra object containing the parsed mass spectra. 417 The object is instatiated with the mzML file, analyzer, instrument, sample name, and scan dataframe. 418 """ 419 _, scan_df = self.run(spectra=False) 420 mass_spectra_obj = MassSpectraBase( 421 self.file_location, 422 self.analyzer, 423 self.instrument_label, 424 self.sample_name, 425 self, 426 ) 427 scan_df = scan_df.set_index("scan", drop=False) 428 mass_spectra_obj.scan_df = scan_df 429 430 return mass_spectra_obj 431 432 def get_lcms_obj(self, spectra="all"): 433 """Instatiates a LCMSBase object from the mzML file. 434 435 Parameters 436 ---------- 437 spectra : str, optional 438 Which mass spectra data to include in the output. Default is all. Other options: none, ms1, ms2. 439 440 Returns 441 ------- 442 LCMSBase 443 LCMS object containing mass spectra data. 444 The object is instatiated with the mzML file, analyzer, instrument, sample name, scan dataframe, 445 and mz dataframe(s), as well as lists of scan numbers, retention times, and TICs. 446 """ 447 _, scan_df = self.run(spectra="none") # first run it to just get scan info 448 res, scan_df = self.run( 449 scan_df=scan_df, spectra=spectra 450 ) # second run to parse data 451 lcms_obj = LCMSBase( 452 self.file_location, 453 self.analyzer, 454 self.instrument_label, 455 self.sample_name, 456 self, 457 ) 458 for key in res: 459 key_int = int(key.replace("ms", "")) 460 res[key] = res[key][res[key].intensity > 0] 461 res[key] = res[key].sort_values(by=["scan", "mz"]).reset_index(drop=True) 462 lcms_obj._ms_unprocessed[key_int] = res[key] 463 lcms_obj.scan_df = scan_df.set_index("scan", drop=False) 464 # Check if polarity is mixed 465 if len(set(scan_df.polarity)) > 1: 466 raise ValueError("Mixed polarities detected in scan data") 467 lcms_obj.polarity = scan_df.polarity[0] 468 lcms_obj._scans_number_list = list(scan_df.scan) 469 lcms_obj._retention_time_list = list(scan_df.scan_time) 470 lcms_obj._tic_list = list(scan_df.tic) 471 472 return lcms_obj
A class for parsing mzml spectrometry data files into MassSpectraBase or LCMSBase objects
Parameters
- file_location (str or Path): The path to the RAW file to be parsed.
- analyzer (str, optional): The type of mass analyzer used in the instrument. Default is "Unknown".
- instrument_label (str, optional): The name of the instrument used to acquire the data. Default is "Unknown".
- sample_name (str, optional): The name of the sample being analyzed. If not provided, the stem of the file_location path will be used.
Attributes
- file_location (Path): The path to the RAW file being parsed.
- analyzer (str): The type of mass analyzer used in the instrument.
- instrument_label (str): The name of the instrument used to acquire the data.
- sample_name (str): The name of the sample being analyzed.
Methods
- load(). Load mzML file using pymzml.run.Reader and return the data as a numpy array.
- run(spectra=True). Parses the mzml file and returns a dictionary of mass spectra dataframes and a scan metadata dataframe.
- get_mass_spectrum_from_scan(scan_number, polarity, auto_process=True) Parses the mzml file and returns a MassSpecBase object from a single scan.
- get_mass_spectra_obj(). Parses the mzml file and instantiates a MassSpectraBase object.
- get_lcms_obj(). Parses the mzml file and instantiates an LCMSBase object.
Inherits from ThermoBaseClass and SpectraParserInterface
MZMLSpectraParser( file_location, analyzer='Unknown', instrument_label='Unknown', sample_name=None)
60 def __init__( 61 self, 62 file_location, 63 analyzer="Unknown", 64 instrument_label="Unknown", 65 sample_name=None, 66 ): 67 # implementation details 68 if isinstance(file_location, str): 69 # if obj is a string it defaults to create a Path obj, pass the S3Path if needed 70 file_location = Path(file_location) 71 if not file_location.exists(): 72 raise FileExistsError("File does not exist: " + str(file_location)) 73 self.file_location = file_location 74 self.analyzer = analyzer 75 self.instrument_label = instrument_label 76 77 if sample_name: 78 self.sample_name = sample_name 79 else: 80 self.sample_name = file_location.stem
def
load(self):
82 def load(self): 83 """ 84 Load mzML file using pymzml.run.Reader and return the data as a numpy array. 85 86 Returns 87 ------- 88 numpy.ndarray 89 The mass spectra data as a numpy array. 90 """ 91 data = pymzml.run.Reader(self.file_location) 92 return data
Load mzML file using pymzml.run.Reader and return the data as a numpy array.
Returns
- numpy.ndarray: The mass spectra data as a numpy array.
def
get_scan_df(self, data):
94 def get_scan_df(self, data): 95 """ 96 Return scan data as a pandas DataFrame. 97 98 Parameters 99 ---------- 100 data : pymzml.run.Reader 101 The mass spectra data. 102 103 Returns 104 ------- 105 pandas.DataFrame 106 A pandas DataFrame containing metadata for each scan, including scan number, MS level, polarity, and scan time. 107 """ 108 # Scan dict 109 # instatinate scan dict, with empty lists of size of scans 110 n_scans = data.get_spectrum_count() 111 scan_dict = { 112 "scan": np.empty(n_scans, dtype=np.int32), 113 "scan_time": np.empty(n_scans, dtype=np.float32), 114 "ms_level": [None] * n_scans, 115 "polarity": [None] * n_scans, 116 "precursor_mz": [None] * n_scans, 117 "scan_text": [None] * n_scans, 118 "scan_window_lower": np.empty(n_scans, dtype=np.float32), 119 "scan_window_upper": np.empty(n_scans, dtype=np.float32), 120 "scan_precision": [None] * n_scans, 121 "tic": np.empty(n_scans, dtype=np.float32), 122 "ms_format": [None] * n_scans, 123 } 124 125 # First pass: loop through scans to get scan info 126 for i, spec in enumerate(data): 127 scan_dict["scan"][i] = spec.ID 128 scan_dict["ms_level"][i] = spec.ms_level 129 scan_dict["scan_precision"][i] = spec._measured_precision 130 scan_dict["tic"][i] = spec.TIC 131 if spec.selected_precursors: 132 scan_dict["precursor_mz"][i] = spec.selected_precursors[0].get( 133 "mz", None 134 ) 135 if spec["negative scan"] is not None: 136 scan_dict["polarity"][i] = "negative" 137 if spec["positive scan"] is not None: 138 scan_dict["polarity"][i] = "positive" 139 if spec["negative scan"] is not None and spec["positive scan"] is not None: 140 raise ValueError( 141 "Error: scan {0} has both negative and positive polarity".format( 142 spec.ID 143 ) 144 ) 145 146 scan_dict["scan_time"][i] = spec.get("MS:1000016") 147 scan_dict["scan_text"][i] = spec.get("MS:1000512") 148 scan_dict["scan_window_lower"][i] = spec.get("MS:1000501") 149 scan_dict["scan_window_upper"][i] = spec.get("MS:1000500") 150 if spec.get("MS:1000128"): 151 scan_dict["ms_format"][i] = "profile" 152 elif spec.get("MS:1000127"): 153 scan_dict["ms_format"][i] = "centroid" 154 else: 155 scan_dict["ms_format"][i] = None 156 157 scan_df = pd.DataFrame(scan_dict) 158 159 return scan_df
Return scan data as a pandas DataFrame.
Parameters
- data (pymzml.run.Reader): The mass spectra data.
Returns
- pandas.DataFrame: A pandas DataFrame containing metadata for each scan, including scan number, MS level, polarity, and scan time.
def
get_ms_raw(self, spectra, scan_df, data):
161 def get_ms_raw(self, spectra, scan_df, data): 162 """Return a dictionary of mass spectra data as a pandas DataFrame. 163 164 Parameters 165 ---------- 166 spectra : str 167 Which mass spectra data to include in the output. 168 Options: None, "ms1", "ms2", "all". 169 scan_df : pandas.DataFrame 170 Scan dataframe. Output from get_scan_df(). 171 data : pymzml.run.Reader 172 The mass spectra data. 173 174 Returns 175 ------- 176 dict 177 A dictionary containing the mass spectra data as pandas DataFrames, with keys corresponding to the MS level. 178 179 """ 180 if spectra == "all": 181 scan_df_forspec = scan_df 182 elif spectra == "ms1": 183 scan_df_forspec = scan_df[scan_df.ms_level == 1] 184 elif spectra == "ms2": 185 scan_df_forspec = scan_df[scan_df.ms_level == 2] 186 else: 187 raise ValueError("spectra must be 'all', 'ms1', or 'ms2'") 188 189 # Result container 190 res = {} 191 192 # Row count container 193 counter = {} 194 195 # Column name container 196 cols = {} 197 198 # set at float32 199 dtype = np.float32 200 201 # First pass: get nrows 202 N = defaultdict(lambda: 0) 203 for i, spec in enumerate(data): 204 if i in scan_df_forspec.scan: 205 # Get ms level 206 level = "ms{}".format(spec.ms_level) 207 208 # Number of rows 209 N[level] += spec.mz.shape[0] 210 211 # Second pass: parse 212 for i, spec in enumerate(data): 213 if i in scan_df_forspec.scan: 214 # Number of rows 215 n = spec.mz.shape[0] 216 217 # No measurements 218 if n == 0: 219 continue 220 221 # Dimension check 222 if len(spec.mz) != len(spec.i): 223 # raise an error if the mz and intensity arrays are not the same length 224 raise ValueError("m/z and intensity array dimension mismatch") 225 226 # Scan/frame info 227 id_dict = spec.id_dict 228 229 # Get ms level 230 level = "ms{}".format(spec.ms_level) 231 232 # Columns 233 cols[level] = list(id_dict.keys()) + ["mz", "intensity"] 234 m = len(cols[level]) 235 236 # Subarray init 237 arr = np.empty((n, m), dtype=dtype) 238 inx = 0 239 240 # Populate scan/frame info 241 for k, v in id_dict.items(): 242 arr[:, inx] = v 243 inx += 1 244 245 # Populate m/z 246 arr[:, inx] = spec.mz 247 inx += 1 248 249 # Populate intensity 250 arr[:, inx] = spec.i 251 inx += 1 252 253 # Initialize output container 254 if level not in res: 255 res[level] = np.empty((N[level], m), dtype=dtype) 256 counter[level] = 0 257 258 # Insert subarray 259 res[level][counter[level] : counter[level] + n, :] = arr 260 counter[level] += n 261 262 # Construct ms1 and ms2 mz dataframes 263 for level in res.keys(): 264 res[level] = pd.DataFrame(res[level], columns=cols[level]).drop( 265 columns=["controllerType", "controllerNumber"], 266 axis=1, 267 inplace=False, 268 ) 269 270 return res
Return a dictionary of mass spectra data as a pandas DataFrame.
Parameters
- spectra (str): Which mass spectra data to include in the output. Options: None, "ms1", "ms2", "all".
- scan_df (pandas.DataFrame): Scan dataframe. Output from get_scan_df().
- data (pymzml.run.Reader): The mass spectra data.
Returns
- dict: A dictionary containing the mass spectra data as pandas DataFrames, with keys corresponding to the MS level.
def
run(self, spectra='all', scan_df=None):
272 def run(self, spectra="all", scan_df=None): 273 """Parse the mzML file and return a dictionary of spectra dataframes and a scan metadata dataframe. 274 275 Parameters 276 ---------- 277 spectra : str, optional 278 Which mass spectra data to include in the output. Default is "all". 279 Other options: None, "ms1", "ms2". 280 scan_df : pandas.DataFrame, optional 281 Scan dataframe. If not provided, the scan dataframe is created from the mzML file. 282 283 Returns 284 ------- 285 tuple 286 A tuple containing two elements: 287 - A dictionary containing the mass spectra data as numpy arrays, with keys corresponding to the MS level. 288 - A pandas DataFrame containing metadata for each scan, including scan number, MS level, polarity, and scan time. 289 """ 290 291 # Open file 292 data = self.load() 293 294 if scan_df is None: 295 scan_df = self.get_scan_df(data) 296 297 if spectra != "none": 298 res = self.get_ms_raw(spectra, scan_df, data) 299 300 else: 301 res = None 302 303 return res, scan_df
Parse the mzML file and return a dictionary of spectra dataframes and a scan metadata dataframe.
Parameters
- spectra (str, optional): Which mass spectra data to include in the output. Default is "all". Other options: None, "ms1", "ms2".
- scan_df (pandas.DataFrame, optional): Scan dataframe. If not provided, the scan dataframe is created from the mzML file.
Returns
- tuple: A tuple containing two elements:
- A dictionary containing the mass spectra data as numpy arrays, with keys corresponding to the MS level.
- A pandas DataFrame containing metadata for each scan, including scan number, MS level, polarity, and scan time.
def
get_mass_spectrum_from_scan(self, scan_number, spectrum_mode, auto_process=True):
305 def get_mass_spectrum_from_scan( 306 self, scan_number, spectrum_mode, auto_process=True 307 ): 308 """Instatiate a mass spectrum object from the mzML file. 309 310 Parameters 311 ---------- 312 scan_number : int 313 The scan number to be parsed. 314 spectrum_mode : str 315 The type of spectrum to instantiate. Must be'profile' or 'centroid'. 316 polarity : int 317 The polarity of the scan. Must be -1 or 1. 318 auto_process : bool, optional 319 If True, process the mass spectrum. Default is True. 320 321 Returns 322 ------- 323 MassSpecProfile | MassSpecCentroid 324 The MassSpecProfile or MassSpecCentroid object containing the parsed mass spectrum. 325 """ 326 327 def set_metadata( 328 scan_number: int, 329 polarity: int, 330 file_location: str, 331 label=Labels.thermo_profile, 332 ): 333 """ 334 Set the output parameters for creating a MassSpecProfile or MassSpecCentroid object. 335 336 Parameters 337 ---------- 338 scan_number : int 339 The scan number. 340 polarity : int 341 The polarity of the data. 342 file_location : str 343 The file location. 344 label : str, optional 345 The label for the mass spectrum. Default is Labels.thermo_profile. 346 347 Returns 348 ------- 349 dict 350 The output parameters ready for creating a MassSpecProfile or MassSpecCentroid object. 351 """ 352 d_params = default_parameters(file_location) 353 d_params["label"] = label 354 d_params["polarity"] = polarity 355 d_params["filename_path"] = file_location 356 d_params["scan_number"] = scan_number 357 358 return d_params 359 360 # Open file 361 data = self.load() 362 363 # Pluck out individual scan mz and intensity 364 spec = data[scan_number] 365 366 # Get polarity 367 if spec["negative scan"] is not None: 368 polarity = -1 369 elif spec["positive scan"] is not None: 370 polarity = 1 371 372 # Get mass spectrum 373 if spectrum_mode == "profile": 374 # Check if profile 375 if not spec.get("MS:1000128"): 376 raise ValueError("spectrum is not profile") 377 data_dict = { 378 Labels.mz: spec.mz, 379 Labels.abundance: spec.i, 380 } 381 d_params = set_metadata( 382 scan_number, 383 polarity, 384 self.file_location, 385 label=Labels.simulated_profile, 386 ) 387 mass_spectrum_obj = mass_spectrum_obj = MassSpecProfile( 388 data_dict, d_params, auto_process=auto_process 389 ) 390 elif spectrum_mode == "centroid": 391 # Check if centroided 392 if not spec.get("MS:1000127"): 393 raise ValueError("spectrum is not centroided") 394 data_dict = { 395 Labels.mz: spec.mz, 396 Labels.abundance: spec.i, 397 Labels.rp: [np.nan] * len(spec.mz), 398 Labels.s2n: [np.nan] * len(spec.i), 399 } 400 d_params = set_metadata( 401 scan_number, polarity, self.file_location, label=Labels.corems_centroid 402 ) 403 mass_spectrum_obj = MassSpecCentroid( 404 data_dict, d_params, auto_process=auto_process 405 ) 406 407 return mass_spectrum_obj
Instatiate a mass spectrum object from the mzML file.
Parameters
- scan_number (int): The scan number to be parsed.
- spectrum_mode (str): The type of spectrum to instantiate. Must be'profile' or 'centroid'.
- polarity (int): The polarity of the scan. Must be -1 or 1.
- auto_process (bool, optional): If True, process the mass spectrum. Default is True.
Returns
- MassSpecProfile | MassSpecCentroid: The MassSpecProfile or MassSpecCentroid object containing the parsed mass spectrum.
def
get_mass_spectra_obj(self):
409 def get_mass_spectra_obj(self): 410 """Instatiate a MassSpectraBase object from the mzML file. 411 412 413 Returns 414 ------- 415 MassSpectraBase 416 The MassSpectra object containing the parsed mass spectra. 417 The object is instatiated with the mzML file, analyzer, instrument, sample name, and scan dataframe. 418 """ 419 _, scan_df = self.run(spectra=False) 420 mass_spectra_obj = MassSpectraBase( 421 self.file_location, 422 self.analyzer, 423 self.instrument_label, 424 self.sample_name, 425 self, 426 ) 427 scan_df = scan_df.set_index("scan", drop=False) 428 mass_spectra_obj.scan_df = scan_df 429 430 return mass_spectra_obj
Instatiate a MassSpectraBase object from the mzML file.
Returns
- MassSpectraBase: The MassSpectra object containing the parsed mass spectra. The object is instatiated with the mzML file, analyzer, instrument, sample name, and scan dataframe.
def
get_lcms_obj(self, spectra='all'):
432 def get_lcms_obj(self, spectra="all"): 433 """Instatiates a LCMSBase object from the mzML file. 434 435 Parameters 436 ---------- 437 spectra : str, optional 438 Which mass spectra data to include in the output. Default is all. Other options: none, ms1, ms2. 439 440 Returns 441 ------- 442 LCMSBase 443 LCMS object containing mass spectra data. 444 The object is instatiated with the mzML file, analyzer, instrument, sample name, scan dataframe, 445 and mz dataframe(s), as well as lists of scan numbers, retention times, and TICs. 446 """ 447 _, scan_df = self.run(spectra="none") # first run it to just get scan info 448 res, scan_df = self.run( 449 scan_df=scan_df, spectra=spectra 450 ) # second run to parse data 451 lcms_obj = LCMSBase( 452 self.file_location, 453 self.analyzer, 454 self.instrument_label, 455 self.sample_name, 456 self, 457 ) 458 for key in res: 459 key_int = int(key.replace("ms", "")) 460 res[key] = res[key][res[key].intensity > 0] 461 res[key] = res[key].sort_values(by=["scan", "mz"]).reset_index(drop=True) 462 lcms_obj._ms_unprocessed[key_int] = res[key] 463 lcms_obj.scan_df = scan_df.set_index("scan", drop=False) 464 # Check if polarity is mixed 465 if len(set(scan_df.polarity)) > 1: 466 raise ValueError("Mixed polarities detected in scan data") 467 lcms_obj.polarity = scan_df.polarity[0] 468 lcms_obj._scans_number_list = list(scan_df.scan) 469 lcms_obj._retention_time_list = list(scan_df.scan_time) 470 lcms_obj._tic_list = list(scan_df.tic) 471 472 return lcms_obj
Instatiates a LCMSBase object from the mzML file.
Parameters
- spectra (str, optional): Which mass spectra data to include in the output. Default is all. Other options: none, ms1, ms2.
Returns
- LCMSBase: LCMS object containing mass spectra data. The object is instatiated with the mzML file, analyzer, instrument, sample name, scan dataframe, and mz dataframe(s), as well as lists of scan numbers, retention times, and TICs.