corems.molecular_formula.input.masslist_ref

  1__author__ = "Yuri E. Corilo"
  2__date__ = "Oct 24, 2019"
  3
  4import json
  5import re
  6import sys
  7from pathlib import Path
  8from typing import Dict, List
  9
 10import pandas as pd
 11
 12from corems.encapsulation.constant import Atoms, Labels
 13from corems.molecular_formula.factory.MolecularFormulaFactory import (
 14    LCMSLibRefMolecularFormula,
 15    MolecularFormula,
 16)
 17
 18
 19class MolecularFormulaLinkProxy:
 20    """Proxy class for MolecularFormulaLink to be used in the molecular formula ref file import
 21
 22    Parameters
 23    ----------
 24    molecular_formula : MolecularFormula | LCMSLibRefMolecularFormula
 25        corems MolecularFormula or LCMSLibRefMolecularFormula object
 26    mz : float
 27        target m/z
 28
 29    Attributes
 30    ----------
 31    C : int
 32        number of carbon atoms
 33    H : int
 34        number of hydrogen atoms
 35    H_C : float
 36        ratio of hydrogen to carbon atoms
 37    class_label : str
 38        molecular formula class label
 39    mz_calc : float
 40        calculated m/z
 41    dbe : int
 42        double bond equivalent
 43    formula_dict : dict
 44        molecular formula dictionary
 45
 46    Methods
 47    -------
 48    * to_dict().
 49        return molecular formula dictionary
 50
 51    """
 52
 53    def __init__(self, molecular_formula, mz):
 54        self.C = molecular_formula.get("C")
 55        self.H = molecular_formula.get("H")
 56        self.H_C = molecular_formula.get("H") / molecular_formula.get("C")
 57        self.class_label = json.dumps(molecular_formula.class_dict)
 58        self.mz_calc = float(mz)
 59        self.dbe = molecular_formula.dbe
 60        self.formula_dict = molecular_formula.to_dict()
 61
 62    def to_dict(self):
 63        return self.formula_dict
 64
 65
 66class ImportMassListRef:  # Thread
 67    """Import Mass List from Reference File
 68
 69    Parameters
 70    ----------
 71    ref_file_location : str
 72        path to the reference file
 73
 74    Attributes
 75    ----------
 76    ref_file_location : str
 77        path to the reference file
 78
 79    Methods
 80    -------
 81    * molecular_formula_ref(mz, molecular_formula).
 82        Return MolecularFormulaLinkProxy object
 83    * from_lcms_lib_file(ion_charge, ion_types).
 84        Return Dict[standard_name, Dict[m/z, List[MolecularFormula]]] from LCMS library reference file
 85    * from_bruker_ref_file().
 86        Return List[MolecularFormula] from Bruker reference file
 87    * from_corems_ref_file(delimiter).
 88        Return List[MolecularFormula] from CoreMS reference file
 89    * split(delimiters, string, maxsplit).
 90        Splits a string using a list of delimiters.
 91    * mformula_s_to_dict(s_mformulatring, iontype).
 92        Converts a molecular formula string to a dict
 93    """
 94
 95    def __init__(self, ref_file_location):
 96        # Thread.__init__(self)
 97
 98        self.ref_file_location = Path(ref_file_location)
 99
100        if not self.ref_file_location.exists():
101            tb = sys.exc_info()[2]
102            raise FileNotFoundError(ref_file_location).with_traceback(tb)
103
104    def molecular_formula_ref(self, mz, molecular_formula):
105        """Instantiate a MolecularFormulaLinkProxy object
106
107        Parameters
108        ----------
109        mz : float
110            target m/z
111        molecular_formula : MolecularFormula | LCMSLibRefMolecularFormula
112            corems MolecularFormula or LCMSLibRefMolecularFormula object
113
114        Returns
115        -------
116        MolecularFormulaLinkProxy
117            MolecularFormulaLinkProxy object
118        """
119        return MolecularFormulaLinkProxy(molecular_formula, mz)
120
121    def from_lcms_lib_file(
122        self, ion_charge: float, ion_types: List[str]
123    ) -> Dict[str, Dict[float, List[LCMSLibRefMolecularFormula]]]:
124        """Create a dictionary of LCMSLibRefMolecularFormula objects from LCMS library reference file
125
126        Parameters
127        ----------
128        ion_charge : float
129            ion charge
130        ion_types : List[str]
131            list of ion types
132
133        Returns
134        -------
135        Dict
136            Dict[standard_name, Dict[m/z, List[MolecularFormula]]] from LCMS library reference file. m/z is the target m/z; standard_name is the name of the molecular standard mix; MolecularFormula is the corems molecular formula class
137        """
138
139        data = {}
140
141        with open(self.ref_file_location) as ref_f:
142            df = pd.read_csv(ref_f, header=0, encoding="unicode_escape")
143
144            for index, row in df.iterrows():
145                formula_s = row["Neutral Formula"]
146                formula_dict = self.mformula_s_to_dict(formula_s, Labels.neutral)
147                name = row["Compound Name"]
148                kegg_id = row["KEGG ID"]
149                standard_name = row["NEW MIX"]
150                cas = row["KEGG ID"]
151                # print(row["Neutral Formula"], formula_dict)
152                molf_formula = LCMSLibRefMolecularFormula(
153                    formula_dict,
154                    ion_charge,
155                    Labels.neutral,
156                    name=name,
157                    kegg_id=kegg_id,
158                    cas=cas,
159                )
160                # if round(molf_formula.mz_calc, 4) != round(row['Mass Adduct -H'],4):
161                #    print(formula_s)
162                #    print(round(molf_formula.mz_calc, 4) , round(row['Mass Adduct -H'],4))
163
164                if standard_name in data.keys():
165                    # TODO change it to target ion types and add ion type in the data structure
166                    mz_calc = molf_formula.protonated_mz
167
168                    if mz_calc in data.get(standard_name).keys():
169                        data.get(standard_name).get(mz_calc).append(molf_formula)
170
171                    else:
172                        data[standard_name][mz_calc] = [molf_formula]
173                else:
174                    data[standard_name] = {molf_formula.mz_calc: [molf_formula]}
175                # print(formula_s, formula_dict)
176                # if molf_formula.ion_type != 'de-protonated':
177                #    print( 'ha', molf_formula.ion_type )
178                # print(formula_dict)
179                # print(row['c1'], row['c2'])
180
181        return data
182
183    def from_bruker_ref_file(self) -> List[MolecularFormula]:
184        """Create a list of MolecularFormula objects from Bruker reference file
185
186        Returns
187        -------
188        List[MolecularFormula]
189            List of MolecularFormula objects from Bruker reference file
190        """
191
192        import csv
193
194        list_mf_obj = []
195
196        with open(self.ref_file_location) as ref_f:
197            labels = ref_f.readline().strip("\n").split(";")
198
199            for line in ref_f.readlines():
200                if line != "\n":
201                    list_ref = line.strip("\n").split(" ")
202
203                    if list_ref[2][-1] == "+":
204                        ion_charge = int(list_ref[2][:-1])
205
206                    else:
207                        ion_charge = -1 * int(list_ref[2][:-1])
208
209                    ion_mol_formula = list_ref[0]
210                    mz = float(list_ref[1])
211                    formula_dict = self.mformula_s_to_dict(ion_mol_formula)
212
213                    list_mf_obj.append(
214                        MolecularFormula(formula_dict, ion_charge, external_mz=mz)
215                    )
216
217        return list_mf_obj
218
219    def from_corems_ref_file(self, delimiter="\t"):  # pragma: no cover
220        """Create a list of MolecularFormula objects from CoreMS reference file
221
222        Not being used
223
224        Parameters
225        ----------
226        delimiter : str
227            delimiter used in the reference file
228
229        Returns
230        -------
231        List[MolecularFormula]
232            List of MolecularFormula objects from CoreMS reference file
233        """
234        # not being used
235        import csv
236
237        list_mf_obj = []
238
239        with open("res/RefMassLists/Crude-Pos-ESI.ref") as ref_f:
240            labels = ref_f.readline().strip("\n").split(delimiter)
241
242            for line in ref_f.readlines():
243                if line != "\n":
244                    list_ref = line.strip("\n").split(delimiter)
245
246                    formula_string = list_ref[0]
247                    ion_charge = int(list_ref[1])
248                    ion_type = list_ref[2]
249
250                    molform = MolecularFormula(
251                        formula_string, ion_charge, ion_type=ion_type
252                    )
253
254                    list_mf_obj.append(self.molecular_formula_ref(molform))
255
256        return list_mf_obj
257
258    def split(self, delimiters, string, maxsplit=0):  # pragma: no cover
259        """Splits a string using a list of delimiters.
260
261        Does not work when formula has atoms with same characters, i.e - C10H21NNa
262
263        Parameters
264        ----------
265        delimiters : list
266            list of delimiters
267        string : str
268            string to be split
269        maxsplit : int, optional
270            maximum number of splits. Default is 0
271
272        Returns
273        -------
274        list
275            list of strings obtained after splitting the string
276        list
277            list of counts obtained after splitting the string
278        """
279        regexPattern = "|".join(map(re.escape, delimiters))  # pragma: no cover
280        isotopes = re.findall(regexPattern, string)  # pragma: no cover
281        counts = re.split(regexPattern, string, maxsplit)  # pragma: no cover
282        return isotopes, counts
283
284    def mformula_s_to_dict(self, s_mformulatring, iontype="unknown"):
285        """Converts a molecular formula string to a dict
286
287        Parameters
288        ----------
289        s_mformulatring : str
290            molecular formula string, i.e. 'C10H21NNa'
291        iontype : str, optional
292            ion type. Default is 'unknown'
293
294        Returns
295        -------
296        dict
297            molecular formula dictionary
298
299        Notes
300        -----
301        Does not work if the atomic mass number is passed i.e. 37Cl, 81Br, convention follow the light isotope labeling 35Cl is Cl, 12C is C, etc.
302        If you need to use heavy isotopes please use another reference file format that separate the formula string by a blank space and parse it using the function corems_ref_file
303
304        Raises
305        ------
306        TypeError
307            Atom does not exist in Atoms.atoms_order list
308        Exception
309            Empty molecular formula
310        """
311        if s_mformulatring:
312            # find the case C122
313            all_atoms = re.findall(r"[A-Z]{1}[0-9]{1,10000}", s_mformulatring)
314
315            # find the case Br2
316            all_atoms2 = re.findall(r"[A-Z]{1}[a-z]{1}[0-9]{1,10000}", s_mformulatring)
317            # find the case N
318            single_digit_atoms_one = re.findall(
319                r"[A-Z]{1}(?![0-9])(?![a-z])", s_mformulatring
320            )
321            # print(single_digit_atoms_one)
322            # find the case Na
323            due_digit_atoms_one = re.findall(
324                r"[A-Z]{1}[a-z]{1}(?![0-9])", s_mformulatring
325            )
326
327            all_atoms = (
328                all_atoms + all_atoms2 + due_digit_atoms_one + single_digit_atoms_one
329            )
330
331            dict_res = {}
332
333            for each_atom_count in all_atoms:
334                count = re.findall(r"[0-9]{1,10000}", each_atom_count)
335                atom = "".join(re.findall(r"[A-z]", each_atom_count))
336
337                if atom in Atoms.atoms_order:
338                    if count:
339                        dict_res[atom] = int(count[0])
340                    else:
341                        dict_res[atom] = 1
342
343                else:
344                    tb = sys.exc_info()[2]
345                    raise TypeError(
346                        "Atom %s does not exist in Atoms.atoms_order list" % atom
347                    ).with_traceback(tb)
348
349            dict_res[Labels.ion_type] = iontype
350
351            return dict_res
352
353        else:
354            tb = sys.exc_info()[2]
355            raise Exception("Empty molecular formula").with_traceback(tb)
class MolecularFormulaLinkProxy:
20class MolecularFormulaLinkProxy:
21    """Proxy class for MolecularFormulaLink to be used in the molecular formula ref file import
22
23    Parameters
24    ----------
25    molecular_formula : MolecularFormula | LCMSLibRefMolecularFormula
26        corems MolecularFormula or LCMSLibRefMolecularFormula object
27    mz : float
28        target m/z
29
30    Attributes
31    ----------
32    C : int
33        number of carbon atoms
34    H : int
35        number of hydrogen atoms
36    H_C : float
37        ratio of hydrogen to carbon atoms
38    class_label : str
39        molecular formula class label
40    mz_calc : float
41        calculated m/z
42    dbe : int
43        double bond equivalent
44    formula_dict : dict
45        molecular formula dictionary
46
47    Methods
48    -------
49    * to_dict().
50        return molecular formula dictionary
51
52    """
53
54    def __init__(self, molecular_formula, mz):
55        self.C = molecular_formula.get("C")
56        self.H = molecular_formula.get("H")
57        self.H_C = molecular_formula.get("H") / molecular_formula.get("C")
58        self.class_label = json.dumps(molecular_formula.class_dict)
59        self.mz_calc = float(mz)
60        self.dbe = molecular_formula.dbe
61        self.formula_dict = molecular_formula.to_dict()
62
63    def to_dict(self):
64        return self.formula_dict

Proxy class for MolecularFormulaLink to be used in the molecular formula ref file import

Parameters
  • molecular_formula (MolecularFormula | LCMSLibRefMolecularFormula): corems MolecularFormula or LCMSLibRefMolecularFormula object
  • mz (float): target m/z
Attributes
  • C (int): number of carbon atoms
  • H (int): number of hydrogen atoms
  • H_C (float): ratio of hydrogen to carbon atoms
  • class_label (str): molecular formula class label
  • mz_calc (float): calculated m/z
  • dbe (int): double bond equivalent
  • formula_dict (dict): molecular formula dictionary
Methods
  • to_dict(). return molecular formula dictionary
MolecularFormulaLinkProxy(molecular_formula, mz)
54    def __init__(self, molecular_formula, mz):
55        self.C = molecular_formula.get("C")
56        self.H = molecular_formula.get("H")
57        self.H_C = molecular_formula.get("H") / molecular_formula.get("C")
58        self.class_label = json.dumps(molecular_formula.class_dict)
59        self.mz_calc = float(mz)
60        self.dbe = molecular_formula.dbe
61        self.formula_dict = molecular_formula.to_dict()
C
H
H_C
class_label
mz_calc
dbe
formula_dict
def to_dict(self):
63    def to_dict(self):
64        return self.formula_dict
class ImportMassListRef:
 67class ImportMassListRef:  # Thread
 68    """Import Mass List from Reference File
 69
 70    Parameters
 71    ----------
 72    ref_file_location : str
 73        path to the reference file
 74
 75    Attributes
 76    ----------
 77    ref_file_location : str
 78        path to the reference file
 79
 80    Methods
 81    -------
 82    * molecular_formula_ref(mz, molecular_formula).
 83        Return MolecularFormulaLinkProxy object
 84    * from_lcms_lib_file(ion_charge, ion_types).
 85        Return Dict[standard_name, Dict[m/z, List[MolecularFormula]]] from LCMS library reference file
 86    * from_bruker_ref_file().
 87        Return List[MolecularFormula] from Bruker reference file
 88    * from_corems_ref_file(delimiter).
 89        Return List[MolecularFormula] from CoreMS reference file
 90    * split(delimiters, string, maxsplit).
 91        Splits a string using a list of delimiters.
 92    * mformula_s_to_dict(s_mformulatring, iontype).
 93        Converts a molecular formula string to a dict
 94    """
 95
 96    def __init__(self, ref_file_location):
 97        # Thread.__init__(self)
 98
 99        self.ref_file_location = Path(ref_file_location)
100
101        if not self.ref_file_location.exists():
102            tb = sys.exc_info()[2]
103            raise FileNotFoundError(ref_file_location).with_traceback(tb)
104
105    def molecular_formula_ref(self, mz, molecular_formula):
106        """Instantiate a MolecularFormulaLinkProxy object
107
108        Parameters
109        ----------
110        mz : float
111            target m/z
112        molecular_formula : MolecularFormula | LCMSLibRefMolecularFormula
113            corems MolecularFormula or LCMSLibRefMolecularFormula object
114
115        Returns
116        -------
117        MolecularFormulaLinkProxy
118            MolecularFormulaLinkProxy object
119        """
120        return MolecularFormulaLinkProxy(molecular_formula, mz)
121
122    def from_lcms_lib_file(
123        self, ion_charge: float, ion_types: List[str]
124    ) -> Dict[str, Dict[float, List[LCMSLibRefMolecularFormula]]]:
125        """Create a dictionary of LCMSLibRefMolecularFormula objects from LCMS library reference file
126
127        Parameters
128        ----------
129        ion_charge : float
130            ion charge
131        ion_types : List[str]
132            list of ion types
133
134        Returns
135        -------
136        Dict
137            Dict[standard_name, Dict[m/z, List[MolecularFormula]]] from LCMS library reference file. m/z is the target m/z; standard_name is the name of the molecular standard mix; MolecularFormula is the corems molecular formula class
138        """
139
140        data = {}
141
142        with open(self.ref_file_location) as ref_f:
143            df = pd.read_csv(ref_f, header=0, encoding="unicode_escape")
144
145            for index, row in df.iterrows():
146                formula_s = row["Neutral Formula"]
147                formula_dict = self.mformula_s_to_dict(formula_s, Labels.neutral)
148                name = row["Compound Name"]
149                kegg_id = row["KEGG ID"]
150                standard_name = row["NEW MIX"]
151                cas = row["KEGG ID"]
152                # print(row["Neutral Formula"], formula_dict)
153                molf_formula = LCMSLibRefMolecularFormula(
154                    formula_dict,
155                    ion_charge,
156                    Labels.neutral,
157                    name=name,
158                    kegg_id=kegg_id,
159                    cas=cas,
160                )
161                # if round(molf_formula.mz_calc, 4) != round(row['Mass Adduct -H'],4):
162                #    print(formula_s)
163                #    print(round(molf_formula.mz_calc, 4) , round(row['Mass Adduct -H'],4))
164
165                if standard_name in data.keys():
166                    # TODO change it to target ion types and add ion type in the data structure
167                    mz_calc = molf_formula.protonated_mz
168
169                    if mz_calc in data.get(standard_name).keys():
170                        data.get(standard_name).get(mz_calc).append(molf_formula)
171
172                    else:
173                        data[standard_name][mz_calc] = [molf_formula]
174                else:
175                    data[standard_name] = {molf_formula.mz_calc: [molf_formula]}
176                # print(formula_s, formula_dict)
177                # if molf_formula.ion_type != 'de-protonated':
178                #    print( 'ha', molf_formula.ion_type )
179                # print(formula_dict)
180                # print(row['c1'], row['c2'])
181
182        return data
183
184    def from_bruker_ref_file(self) -> List[MolecularFormula]:
185        """Create a list of MolecularFormula objects from Bruker reference file
186
187        Returns
188        -------
189        List[MolecularFormula]
190            List of MolecularFormula objects from Bruker reference file
191        """
192
193        import csv
194
195        list_mf_obj = []
196
197        with open(self.ref_file_location) as ref_f:
198            labels = ref_f.readline().strip("\n").split(";")
199
200            for line in ref_f.readlines():
201                if line != "\n":
202                    list_ref = line.strip("\n").split(" ")
203
204                    if list_ref[2][-1] == "+":
205                        ion_charge = int(list_ref[2][:-1])
206
207                    else:
208                        ion_charge = -1 * int(list_ref[2][:-1])
209
210                    ion_mol_formula = list_ref[0]
211                    mz = float(list_ref[1])
212                    formula_dict = self.mformula_s_to_dict(ion_mol_formula)
213
214                    list_mf_obj.append(
215                        MolecularFormula(formula_dict, ion_charge, external_mz=mz)
216                    )
217
218        return list_mf_obj
219
220    def from_corems_ref_file(self, delimiter="\t"):  # pragma: no cover
221        """Create a list of MolecularFormula objects from CoreMS reference file
222
223        Not being used
224
225        Parameters
226        ----------
227        delimiter : str
228            delimiter used in the reference file
229
230        Returns
231        -------
232        List[MolecularFormula]
233            List of MolecularFormula objects from CoreMS reference file
234        """
235        # not being used
236        import csv
237
238        list_mf_obj = []
239
240        with open("res/RefMassLists/Crude-Pos-ESI.ref") as ref_f:
241            labels = ref_f.readline().strip("\n").split(delimiter)
242
243            for line in ref_f.readlines():
244                if line != "\n":
245                    list_ref = line.strip("\n").split(delimiter)
246
247                    formula_string = list_ref[0]
248                    ion_charge = int(list_ref[1])
249                    ion_type = list_ref[2]
250
251                    molform = MolecularFormula(
252                        formula_string, ion_charge, ion_type=ion_type
253                    )
254
255                    list_mf_obj.append(self.molecular_formula_ref(molform))
256
257        return list_mf_obj
258
259    def split(self, delimiters, string, maxsplit=0):  # pragma: no cover
260        """Splits a string using a list of delimiters.
261
262        Does not work when formula has atoms with same characters, i.e - C10H21NNa
263
264        Parameters
265        ----------
266        delimiters : list
267            list of delimiters
268        string : str
269            string to be split
270        maxsplit : int, optional
271            maximum number of splits. Default is 0
272
273        Returns
274        -------
275        list
276            list of strings obtained after splitting the string
277        list
278            list of counts obtained after splitting the string
279        """
280        regexPattern = "|".join(map(re.escape, delimiters))  # pragma: no cover
281        isotopes = re.findall(regexPattern, string)  # pragma: no cover
282        counts = re.split(regexPattern, string, maxsplit)  # pragma: no cover
283        return isotopes, counts
284
285    def mformula_s_to_dict(self, s_mformulatring, iontype="unknown"):
286        """Converts a molecular formula string to a dict
287
288        Parameters
289        ----------
290        s_mformulatring : str
291            molecular formula string, i.e. 'C10H21NNa'
292        iontype : str, optional
293            ion type. Default is 'unknown'
294
295        Returns
296        -------
297        dict
298            molecular formula dictionary
299
300        Notes
301        -----
302        Does not work if the atomic mass number is passed i.e. 37Cl, 81Br, convention follow the light isotope labeling 35Cl is Cl, 12C is C, etc.
303        If you need to use heavy isotopes please use another reference file format that separate the formula string by a blank space and parse it using the function corems_ref_file
304
305        Raises
306        ------
307        TypeError
308            Atom does not exist in Atoms.atoms_order list
309        Exception
310            Empty molecular formula
311        """
312        if s_mformulatring:
313            # find the case C122
314            all_atoms = re.findall(r"[A-Z]{1}[0-9]{1,10000}", s_mformulatring)
315
316            # find the case Br2
317            all_atoms2 = re.findall(r"[A-Z]{1}[a-z]{1}[0-9]{1,10000}", s_mformulatring)
318            # find the case N
319            single_digit_atoms_one = re.findall(
320                r"[A-Z]{1}(?![0-9])(?![a-z])", s_mformulatring
321            )
322            # print(single_digit_atoms_one)
323            # find the case Na
324            due_digit_atoms_one = re.findall(
325                r"[A-Z]{1}[a-z]{1}(?![0-9])", s_mformulatring
326            )
327
328            all_atoms = (
329                all_atoms + all_atoms2 + due_digit_atoms_one + single_digit_atoms_one
330            )
331
332            dict_res = {}
333
334            for each_atom_count in all_atoms:
335                count = re.findall(r"[0-9]{1,10000}", each_atom_count)
336                atom = "".join(re.findall(r"[A-z]", each_atom_count))
337
338                if atom in Atoms.atoms_order:
339                    if count:
340                        dict_res[atom] = int(count[0])
341                    else:
342                        dict_res[atom] = 1
343
344                else:
345                    tb = sys.exc_info()[2]
346                    raise TypeError(
347                        "Atom %s does not exist in Atoms.atoms_order list" % atom
348                    ).with_traceback(tb)
349
350            dict_res[Labels.ion_type] = iontype
351
352            return dict_res
353
354        else:
355            tb = sys.exc_info()[2]
356            raise Exception("Empty molecular formula").with_traceback(tb)

Import Mass List from Reference File

Parameters
  • ref_file_location (str): path to the reference file
Attributes
  • ref_file_location (str): path to the reference file
Methods
  • molecular_formula_ref(mz, molecular_formula). Return MolecularFormulaLinkProxy object
  • from_lcms_lib_file(ion_charge, ion_types). Return Dict[standard_name, Dict[m/z, List[MolecularFormula]]] from LCMS library reference file
  • from_bruker_ref_file(). Return List[MolecularFormula] from Bruker reference file
  • from_corems_ref_file(delimiter). Return List[MolecularFormula] from CoreMS reference file
  • split(delimiters, string, maxsplit). Splits a string using a list of delimiters.
  • mformula_s_to_dict(s_mformulatring, iontype). Converts a molecular formula string to a dict
ImportMassListRef(ref_file_location)
 96    def __init__(self, ref_file_location):
 97        # Thread.__init__(self)
 98
 99        self.ref_file_location = Path(ref_file_location)
100
101        if not self.ref_file_location.exists():
102            tb = sys.exc_info()[2]
103            raise FileNotFoundError(ref_file_location).with_traceback(tb)
ref_file_location
def molecular_formula_ref(self, mz, molecular_formula):
105    def molecular_formula_ref(self, mz, molecular_formula):
106        """Instantiate a MolecularFormulaLinkProxy object
107
108        Parameters
109        ----------
110        mz : float
111            target m/z
112        molecular_formula : MolecularFormula | LCMSLibRefMolecularFormula
113            corems MolecularFormula or LCMSLibRefMolecularFormula object
114
115        Returns
116        -------
117        MolecularFormulaLinkProxy
118            MolecularFormulaLinkProxy object
119        """
120        return MolecularFormulaLinkProxy(molecular_formula, mz)

Instantiate a MolecularFormulaLinkProxy object

Parameters
  • mz (float): target m/z
  • molecular_formula (MolecularFormula | LCMSLibRefMolecularFormula): corems MolecularFormula or LCMSLibRefMolecularFormula object
Returns
  • MolecularFormulaLinkProxy: MolecularFormulaLinkProxy object
def from_lcms_lib_file( self, ion_charge: float, ion_types: List[str]) -> Dict[str, Dict[float, List[corems.molecular_formula.factory.MolecularFormulaFactory.LCMSLibRefMolecularFormula]]]:
122    def from_lcms_lib_file(
123        self, ion_charge: float, ion_types: List[str]
124    ) -> Dict[str, Dict[float, List[LCMSLibRefMolecularFormula]]]:
125        """Create a dictionary of LCMSLibRefMolecularFormula objects from LCMS library reference file
126
127        Parameters
128        ----------
129        ion_charge : float
130            ion charge
131        ion_types : List[str]
132            list of ion types
133
134        Returns
135        -------
136        Dict
137            Dict[standard_name, Dict[m/z, List[MolecularFormula]]] from LCMS library reference file. m/z is the target m/z; standard_name is the name of the molecular standard mix; MolecularFormula is the corems molecular formula class
138        """
139
140        data = {}
141
142        with open(self.ref_file_location) as ref_f:
143            df = pd.read_csv(ref_f, header=0, encoding="unicode_escape")
144
145            for index, row in df.iterrows():
146                formula_s = row["Neutral Formula"]
147                formula_dict = self.mformula_s_to_dict(formula_s, Labels.neutral)
148                name = row["Compound Name"]
149                kegg_id = row["KEGG ID"]
150                standard_name = row["NEW MIX"]
151                cas = row["KEGG ID"]
152                # print(row["Neutral Formula"], formula_dict)
153                molf_formula = LCMSLibRefMolecularFormula(
154                    formula_dict,
155                    ion_charge,
156                    Labels.neutral,
157                    name=name,
158                    kegg_id=kegg_id,
159                    cas=cas,
160                )
161                # if round(molf_formula.mz_calc, 4) != round(row['Mass Adduct -H'],4):
162                #    print(formula_s)
163                #    print(round(molf_formula.mz_calc, 4) , round(row['Mass Adduct -H'],4))
164
165                if standard_name in data.keys():
166                    # TODO change it to target ion types and add ion type in the data structure
167                    mz_calc = molf_formula.protonated_mz
168
169                    if mz_calc in data.get(standard_name).keys():
170                        data.get(standard_name).get(mz_calc).append(molf_formula)
171
172                    else:
173                        data[standard_name][mz_calc] = [molf_formula]
174                else:
175                    data[standard_name] = {molf_formula.mz_calc: [molf_formula]}
176                # print(formula_s, formula_dict)
177                # if molf_formula.ion_type != 'de-protonated':
178                #    print( 'ha', molf_formula.ion_type )
179                # print(formula_dict)
180                # print(row['c1'], row['c2'])
181
182        return data

Create a dictionary of LCMSLibRefMolecularFormula objects from LCMS library reference file

Parameters
  • ion_charge (float): ion charge
  • ion_types (List[str]): list of ion types
Returns
  • Dict: Dict[standard_name, Dict[m/z, List[MolecularFormula]]] from LCMS library reference file. m/z is the target m/z; standard_name is the name of the molecular standard mix; MolecularFormula is the corems molecular formula class
def from_bruker_ref_file( self) -> List[corems.molecular_formula.factory.MolecularFormulaFactory.MolecularFormula]:
184    def from_bruker_ref_file(self) -> List[MolecularFormula]:
185        """Create a list of MolecularFormula objects from Bruker reference file
186
187        Returns
188        -------
189        List[MolecularFormula]
190            List of MolecularFormula objects from Bruker reference file
191        """
192
193        import csv
194
195        list_mf_obj = []
196
197        with open(self.ref_file_location) as ref_f:
198            labels = ref_f.readline().strip("\n").split(";")
199
200            for line in ref_f.readlines():
201                if line != "\n":
202                    list_ref = line.strip("\n").split(" ")
203
204                    if list_ref[2][-1] == "+":
205                        ion_charge = int(list_ref[2][:-1])
206
207                    else:
208                        ion_charge = -1 * int(list_ref[2][:-1])
209
210                    ion_mol_formula = list_ref[0]
211                    mz = float(list_ref[1])
212                    formula_dict = self.mformula_s_to_dict(ion_mol_formula)
213
214                    list_mf_obj.append(
215                        MolecularFormula(formula_dict, ion_charge, external_mz=mz)
216                    )
217
218        return list_mf_obj

Create a list of MolecularFormula objects from Bruker reference file

Returns
  • List[MolecularFormula]: List of MolecularFormula objects from Bruker reference file
def from_corems_ref_file(self, delimiter='\t'):
220    def from_corems_ref_file(self, delimiter="\t"):  # pragma: no cover
221        """Create a list of MolecularFormula objects from CoreMS reference file
222
223        Not being used
224
225        Parameters
226        ----------
227        delimiter : str
228            delimiter used in the reference file
229
230        Returns
231        -------
232        List[MolecularFormula]
233            List of MolecularFormula objects from CoreMS reference file
234        """
235        # not being used
236        import csv
237
238        list_mf_obj = []
239
240        with open("res/RefMassLists/Crude-Pos-ESI.ref") as ref_f:
241            labels = ref_f.readline().strip("\n").split(delimiter)
242
243            for line in ref_f.readlines():
244                if line != "\n":
245                    list_ref = line.strip("\n").split(delimiter)
246
247                    formula_string = list_ref[0]
248                    ion_charge = int(list_ref[1])
249                    ion_type = list_ref[2]
250
251                    molform = MolecularFormula(
252                        formula_string, ion_charge, ion_type=ion_type
253                    )
254
255                    list_mf_obj.append(self.molecular_formula_ref(molform))
256
257        return list_mf_obj

Create a list of MolecularFormula objects from CoreMS reference file

Not being used

Parameters
  • delimiter (str): delimiter used in the reference file
Returns
  • List[MolecularFormula]: List of MolecularFormula objects from CoreMS reference file
def split(self, delimiters, string, maxsplit=0):
259    def split(self, delimiters, string, maxsplit=0):  # pragma: no cover
260        """Splits a string using a list of delimiters.
261
262        Does not work when formula has atoms with same characters, i.e - C10H21NNa
263
264        Parameters
265        ----------
266        delimiters : list
267            list of delimiters
268        string : str
269            string to be split
270        maxsplit : int, optional
271            maximum number of splits. Default is 0
272
273        Returns
274        -------
275        list
276            list of strings obtained after splitting the string
277        list
278            list of counts obtained after splitting the string
279        """
280        regexPattern = "|".join(map(re.escape, delimiters))  # pragma: no cover
281        isotopes = re.findall(regexPattern, string)  # pragma: no cover
282        counts = re.split(regexPattern, string, maxsplit)  # pragma: no cover
283        return isotopes, counts

Splits a string using a list of delimiters.

Does not work when formula has atoms with same characters, i.e - C10H21NNa

Parameters
  • delimiters (list): list of delimiters
  • string (str): string to be split
  • maxsplit (int, optional): maximum number of splits. Default is 0
Returns
  • list: list of strings obtained after splitting the string
  • list: list of counts obtained after splitting the string
def mformula_s_to_dict(self, s_mformulatring, iontype='unknown'):
285    def mformula_s_to_dict(self, s_mformulatring, iontype="unknown"):
286        """Converts a molecular formula string to a dict
287
288        Parameters
289        ----------
290        s_mformulatring : str
291            molecular formula string, i.e. 'C10H21NNa'
292        iontype : str, optional
293            ion type. Default is 'unknown'
294
295        Returns
296        -------
297        dict
298            molecular formula dictionary
299
300        Notes
301        -----
302        Does not work if the atomic mass number is passed i.e. 37Cl, 81Br, convention follow the light isotope labeling 35Cl is Cl, 12C is C, etc.
303        If you need to use heavy isotopes please use another reference file format that separate the formula string by a blank space and parse it using the function corems_ref_file
304
305        Raises
306        ------
307        TypeError
308            Atom does not exist in Atoms.atoms_order list
309        Exception
310            Empty molecular formula
311        """
312        if s_mformulatring:
313            # find the case C122
314            all_atoms = re.findall(r"[A-Z]{1}[0-9]{1,10000}", s_mformulatring)
315
316            # find the case Br2
317            all_atoms2 = re.findall(r"[A-Z]{1}[a-z]{1}[0-9]{1,10000}", s_mformulatring)
318            # find the case N
319            single_digit_atoms_one = re.findall(
320                r"[A-Z]{1}(?![0-9])(?![a-z])", s_mformulatring
321            )
322            # print(single_digit_atoms_one)
323            # find the case Na
324            due_digit_atoms_one = re.findall(
325                r"[A-Z]{1}[a-z]{1}(?![0-9])", s_mformulatring
326            )
327
328            all_atoms = (
329                all_atoms + all_atoms2 + due_digit_atoms_one + single_digit_atoms_one
330            )
331
332            dict_res = {}
333
334            for each_atom_count in all_atoms:
335                count = re.findall(r"[0-9]{1,10000}", each_atom_count)
336                atom = "".join(re.findall(r"[A-z]", each_atom_count))
337
338                if atom in Atoms.atoms_order:
339                    if count:
340                        dict_res[atom] = int(count[0])
341                    else:
342                        dict_res[atom] = 1
343
344                else:
345                    tb = sys.exc_info()[2]
346                    raise TypeError(
347                        "Atom %s does not exist in Atoms.atoms_order list" % atom
348                    ).with_traceback(tb)
349
350            dict_res[Labels.ion_type] = iontype
351
352            return dict_res
353
354        else:
355            tb = sys.exc_info()[2]
356            raise Exception("Empty molecular formula").with_traceback(tb)

Converts a molecular formula string to a dict

Parameters
  • s_mformulatring (str): molecular formula string, i.e. 'C10H21NNa'
  • iontype (str, optional): ion type. Default is 'unknown'
Returns
  • dict: molecular formula dictionary
Notes

Does not work if the atomic mass number is passed, e.g. 37Cl, 81Br; the convention follows the light isotope labeling: 35Cl is Cl, 12C is C, etc. If you need to use heavy isotopes, please use another reference file format that separates the formula string with blank spaces and parse it using the function from_corems_ref_file

Raises
  • TypeError: Atom does not exist in Atoms.atoms_order list
  • Exception: Empty molecular formula