corems.mass_spectra.input.brukerSolarix_utils
Utility functions for Bruker data processing.
"""Utility functions for Bruker data processing."""

from pathlib import Path
from s3path import S3Path


def get_scan_attributes(scan_attr, imaging_info_attr) -> dict:
    """
    Get the scan attributes from the scan.xml or ImagingInfo.xml file.

    scan.xml takes precedence: ImagingInfo.xml is only consulted when
    scan.xml does not exist.

    Parameters
    ----------
    scan_attr : path-like object (e.g. Path or S3Path)
        Location of the scan.xml file. Only ``exists()`` and ``open()`` are
        called on it.
    imaging_info_attr : path-like object (e.g. Path or S3Path)
        Location of the ImagingInfo.xml file. Only ``exists()`` and ``open()``
        are called on it, and only when scan.xml does not exist.

    Returns
    -------
    dict
        Dictionary keyed by scan number. Values are
        ``(retention time, TIC)`` tuples when read from scan.xml and
        ``(retention time, TIC, maxpeak, spotname)`` tuples when read from
        ImagingInfo.xml.

    Raises
    ------
    FileNotFoundError
        If neither file exists, or if the file that exists cannot be read or
        parsed (the underlying error is chained as the cause).


    TODO: We need to reformat the dictionary to actually include keys and values so it is self-descriptive.
    TODO: This will break the code, so a new version is needed.
    TODO: Will need to make sure there are tests which capture this change.
    TODO: Include maxpeak for scan.xml as well; this changes the tuple shape, so a new version is needed.

    """
    if scan_attr.exists():
        source, file_label, from_scan_xml = scan_attr, "scan.xml", True
    elif imaging_info_attr.exists():
        source, file_label, from_scan_xml = imaging_info_attr, "ImagingInfo.xml", False
    else:
        raise FileNotFoundError(
            "Dataset does not contain a 'scan.xml' or 'ImagingInfo.xml' file."
        )

    # Imported lazily so the parser is only required once there is a file to parse.
    from bs4 import BeautifulSoup

    try:
        # The document is parsed while the handle is open; the context manager
        # guarantees the handle is released afterwards, even on a parse error.
        with source.open() as handle:
            soup = BeautifulSoup(handle, "xml")

        list_rt = [float(rt.text) for rt in soup.find_all("minutes")]
        list_tic = [float(tic.text) for tic in soup.find_all("tic")]

        if from_scan_xml:
            list_scan = [int(scan.text) for scan in soup.find_all("count")]
            return dict(zip(list_scan, zip(list_rt, list_tic)))

        list_maxpeak = [float(maxpeak.text) for maxpeak in soup.find_all("maxpeak")]
        # Look up the <scan> elements once and reuse them for both fields.
        scan_elements = soup.find_all("scan")
        list_scan = [int(scan.find("count").text) for scan in scan_elements]
        list_spotname = [scan.find("spotName").text for scan in scan_elements]
        return dict(
            zip(list_scan, zip(list_rt, list_tic, list_maxpeak, list_spotname))
        )
    except Exception as e:
        # Callers rely on FileNotFoundError here; keep the type but chain the cause.
        raise FileNotFoundError(f"Error reading {file_label}: {e}") from e
def
get_scan_attributes(scan_attr, imaging_info_attr) -> dict:
def get_scan_attributes(scan_attr, imaging_info_attr) -> dict:
    """
    Get the scan attributes from the scan.xml or ImagingInfo.xml file.

    scan.xml takes precedence: ImagingInfo.xml is only consulted when
    scan.xml does not exist.

    Parameters
    ----------
    scan_attr : path-like object (e.g. Path or S3Path)
        Location of the scan.xml file. Only ``exists()`` and ``open()`` are
        called on it.
    imaging_info_attr : path-like object (e.g. Path or S3Path)
        Location of the ImagingInfo.xml file. Only ``exists()`` and ``open()``
        are called on it, and only when scan.xml does not exist.

    Returns
    -------
    dict
        Dictionary keyed by scan number. Values are
        ``(retention time, TIC)`` tuples when read from scan.xml and
        ``(retention time, TIC, maxpeak, spotname)`` tuples when read from
        ImagingInfo.xml.

    Raises
    ------
    FileNotFoundError
        If neither file exists, or if the file that exists cannot be read or
        parsed (the underlying error is chained as the cause).


    TODO: We need to reformat the dictionary to actually include keys and values so it is self-descriptive.
    TODO: This will break the code, so a new version is needed.
    TODO: Will need to make sure there are tests which capture this change.
    TODO: Include maxpeak for scan.xml as well; this changes the tuple shape, so a new version is needed.

    """
    if scan_attr.exists():
        source, file_label, from_scan_xml = scan_attr, "scan.xml", True
    elif imaging_info_attr.exists():
        source, file_label, from_scan_xml = imaging_info_attr, "ImagingInfo.xml", False
    else:
        raise FileNotFoundError(
            "Dataset does not contain a 'scan.xml' or 'ImagingInfo.xml' file."
        )

    # Imported lazily so the parser is only required once there is a file to parse.
    from bs4 import BeautifulSoup

    try:
        # The document is parsed while the handle is open; the context manager
        # guarantees the handle is released afterwards, even on a parse error.
        with source.open() as handle:
            soup = BeautifulSoup(handle, "xml")

        list_rt = [float(rt.text) for rt in soup.find_all("minutes")]
        list_tic = [float(tic.text) for tic in soup.find_all("tic")]

        if from_scan_xml:
            list_scan = [int(scan.text) for scan in soup.find_all("count")]
            return dict(zip(list_scan, zip(list_rt, list_tic)))

        list_maxpeak = [float(maxpeak.text) for maxpeak in soup.find_all("maxpeak")]
        # Look up the <scan> elements once and reuse them for both fields.
        scan_elements = soup.find_all("scan")
        list_scan = [int(scan.find("count").text) for scan in scan_elements]
        list_spotname = [scan.find("spotName").text for scan in scan_elements]
        return dict(
            zip(list_scan, zip(list_rt, list_tic, list_maxpeak, list_spotname))
        )
    except Exception as e:
        # Callers rely on FileNotFoundError here; keep the type but chain the cause.
        raise FileNotFoundError(f"Error reading {file_label}: {e}") from e
Get the scan attributes from the scan.xml or ImagingInfo.xml file.
Parameters
- scan_attr (Path or S3Path): Location of the scan.xml file
- imaging_info_attr (Path or S3Path): Location of the ImagingInfo.xml file, read only when scan.xml does not exist
Returns
- dict: Dictionary containing the scan number as key and a tuple of retention time, TIC, and optionally maxpeak and spotname as values.
TODO (We need to reformat the dictionary to actually include keys and values so it is self-descriptive.):
TODO (This will break the code, so a new version is needed.):
TODO (Will need to make sure there are tests which capture this change.):