corems.mass_spectra.input.brukerSolarix_utils

Utility functions for Bruker data processing.

 1"""Utility functions for Bruker data processing."""
 2
 3from pathlib import Path
 4from s3path import S3Path
 5
 6
 7def get_scan_attributes(scan_attr, imaging_info_attr) -> dict:
 8    """
 9    Get the scan attributes from the scan.xml or ImagingInfo.xml file.
10    
11    Parameters
12    ----------
13    d_directory_location : str, Path, or S3Path
14        Directory containing the XML files
15        
16    Returns
17    -------
18    dict
19        Dictionary containing the scan number as key and a tuple of retention time, TIC, 
20        and optionally maxpeak and spotname as values.
21
22
23    TODO: We need to reformat the dictionary to actually include keys and values so it is self-descriptive.
24    TODO: This will break the code, so a new version is needed. 
25    TODO: Will need to make sure theres tests which capture this change.
26    
27    """
28    from bs4 import BeautifulSoup
29    
30    scan_xml_exists = scan_attr.exists()
31    imaging_info_exists = imaging_info_attr.exists()
32
33    if scan_xml_exists:
34        try:
35            soup = BeautifulSoup(scan_attr.open(), "xml")
36            list_rt = [float(rt.text) for rt in soup.find_all("minutes")]
37            list_tic = [float(tic.text) for tic in soup.find_all("tic")]
38            list_scan = [int(scan.text) for scan in soup.find_all("count")]
39            
40            # Check if maxpeak exists (more comprehensive version)
41            # TODO: Enable this, but it could break code so a new version is needed
42            enable_maxpeak = False
43            if enable_maxpeak:
44                maxpeak_elements = soup.find_all("maxpeak")
45                if maxpeak_elements:
46                    list_maxpeak = [float(maxpeak.text) for maxpeak in maxpeak_elements]
47                    dict_scan_rt_tic = dict(zip(list_scan, zip(list_rt, list_tic, list_maxpeak)))
48                else:
49                    dict_scan_rt_tic = dict(zip(list_scan, zip(list_rt, list_tic)))
50
51            dict_scan_rt_tic = dict(zip(list_scan, zip(list_rt, list_tic)))
52
53            return dict_scan_rt_tic
54        except Exception as e:
55            raise FileNotFoundError(f"Error reading scan.xml: {e}")
56    elif imaging_info_exists:
57        try:
58            soup = BeautifulSoup(imaging_info_attr.open(), "xml")
59            list_rt = [float(rt.text) for rt in soup.find_all("minutes")]
60            list_tic = [float(tic.text) for tic in soup.find_all("tic")]
61            list_maxpeak = [float(maxpeak.text) for maxpeak in soup.find_all("maxpeak")]
62            list_scan = [int(scan.find("count").text) for scan in soup.find_all("scan")]
63            list_spotname = [
64                scan.find("spotName").text for scan in soup.find_all("scan")
65            ]
66            dict_scan_rt_tic = dict(zip(list_scan, zip(list_rt, list_tic, list_maxpeak, list_spotname)))
67            return dict_scan_rt_tic
68        except Exception as e:
69            raise FileNotFoundError(f"Error reading ImagingInfo.xml: {e}")
70    else:
71        raise FileNotFoundError(
72            "Dataset does not contain a 'scan.xml' or 'ImagingInfo.xml' file."
73        )
def get_scan_attributes(scan_attr, imaging_info_attr) -> dict:
 8def get_scan_attributes(scan_attr, imaging_info_attr) -> dict:
 9    """
10    Get the scan attributes from the scan.xml or ImagingInfo.xml file.
11    
12    Parameters
13    ----------
14    d_directory_location : str, Path, or S3Path
15        Directory containing the XML files
16        
17    Returns
18    -------
19    dict
20        Dictionary containing the scan number as key and a tuple of retention time, TIC, 
21        and optionally maxpeak and spotname as values.
22
23
24    TODO: We need to reformat the dictionary to actually include keys and values so it is self-descriptive.
25    TODO: This will break the code, so a new version is needed. 
26    TODO: Will need to make sure theres tests which capture this change.
27    
28    """
29    from bs4 import BeautifulSoup
30    
31    scan_xml_exists = scan_attr.exists()
32    imaging_info_exists = imaging_info_attr.exists()
33
34    if scan_xml_exists:
35        try:
36            soup = BeautifulSoup(scan_attr.open(), "xml")
37            list_rt = [float(rt.text) for rt in soup.find_all("minutes")]
38            list_tic = [float(tic.text) for tic in soup.find_all("tic")]
39            list_scan = [int(scan.text) for scan in soup.find_all("count")]
40            
41            # Check if maxpeak exists (more comprehensive version)
42            # TODO: Enable this, but it could break code so a new version is needed
43            enable_maxpeak = False
44            if enable_maxpeak:
45                maxpeak_elements = soup.find_all("maxpeak")
46                if maxpeak_elements:
47                    list_maxpeak = [float(maxpeak.text) for maxpeak in maxpeak_elements]
48                    dict_scan_rt_tic = dict(zip(list_scan, zip(list_rt, list_tic, list_maxpeak)))
49                else:
50                    dict_scan_rt_tic = dict(zip(list_scan, zip(list_rt, list_tic)))
51
52            dict_scan_rt_tic = dict(zip(list_scan, zip(list_rt, list_tic)))
53
54            return dict_scan_rt_tic
55        except Exception as e:
56            raise FileNotFoundError(f"Error reading scan.xml: {e}")
57    elif imaging_info_exists:
58        try:
59            soup = BeautifulSoup(imaging_info_attr.open(), "xml")
60            list_rt = [float(rt.text) for rt in soup.find_all("minutes")]
61            list_tic = [float(tic.text) for tic in soup.find_all("tic")]
62            list_maxpeak = [float(maxpeak.text) for maxpeak in soup.find_all("maxpeak")]
63            list_scan = [int(scan.find("count").text) for scan in soup.find_all("scan")]
64            list_spotname = [
65                scan.find("spotName").text for scan in soup.find_all("scan")
66            ]
67            dict_scan_rt_tic = dict(zip(list_scan, zip(list_rt, list_tic, list_maxpeak, list_spotname)))
68            return dict_scan_rt_tic
69        except Exception as e:
70            raise FileNotFoundError(f"Error reading ImagingInfo.xml: {e}")
71    else:
72        raise FileNotFoundError(
73            "Dataset does not contain a 'scan.xml' or 'ImagingInfo.xml' file."
74        )

Get the scan attributes from the scan.xml or ImagingInfo.xml file.

Parameters
  • scan_attr (Path or S3Path): Path to the scan.xml file
  • imaging_info_attr (Path or S3Path): Path to the ImagingInfo.xml file; read only when scan.xml does not exist
Returns
  • dict: Dictionary containing the scan number as key and a tuple of retention time, TIC, and optionally maxpeak and spotname as values.
TODO
  • We need to reformat the dictionary to actually include keys and values so it is self-descriptive.
  • This will break the code, so a new version is needed.
  • Will need to make sure there are tests which capture this change.