corems.molecular_formula.input.masslist_ref
__author__ = "Yuri E. Corilo"
__date__ = "Oct 24, 2019"

import json
import re
import sys
from pathlib import Path
from typing import Dict, List

import pandas as pd

from corems.encapsulation.constant import Atoms, Labels
from corems.molecular_formula.factory.MolecularFormulaFactory import (
    LCMSLibRefMolecularFormula,
    MolecularFormula,
)


class MolecularFormulaLinkProxy:
    """Proxy class for MolecularFormulaLink to be used in the molecular formula ref file import

    Parameters
    ----------
    molecular_formula : MolecularFormula | LCMSLibRefMolecularFormula
        corems MolecularFormula or LCMSLibRefMolecularFormula object
    mz : float
        target m/z

    Attributes
    ----------
    C : int
        number of carbon atoms
    H : int
        number of hydrogen atoms
    H_C : float or None
        ratio of hydrogen to carbon atoms; None when the ratio is undefined
        (no carbon atoms, or a missing hydrogen count)
    class_label : str
        JSON dump of the molecular formula ``class_dict``
    mz_calc : float
        target m/z given to the constructor, converted to float
    dbe : int
        double bond equivalent
    formula_dict : dict
        molecular formula dictionary

    Methods
    -------
    * to_dict().
        return molecular formula dictionary

    """

    def __init__(self, molecular_formula, mz):
        carbon = molecular_formula.get("C")
        hydrogen = molecular_formula.get("H")

        self.C = carbon
        self.H = hydrogen
        # Guard the ratio: a carbon count of 0 or None used to raise
        # ZeroDivisionError / TypeError for carbon-free formulas.
        self.H_C = hydrogen / carbon if carbon and hydrogen is not None else None
        self.class_label = json.dumps(molecular_formula.class_dict)
        self.mz_calc = float(mz)
        self.dbe = molecular_formula.dbe
        self.formula_dict = molecular_formula.to_dict()

    def to_dict(self):
        """Return the molecular formula dictionary captured at construction time."""
        return self.formula_dict


class ImportMassListRef:
    """Import Mass List from Reference File

    Parameters
    ----------
    ref_file_location : str
        path to the reference file

    Attributes
    ----------
    ref_file_location : Path
        path to the reference file

    Raises
    ------
    FileNotFoundError
        If ref_file_location does not exist

    Methods
    -------
    * molecular_formula_ref(mz, molecular_formula).
        Return MolecularFormulaLinkProxy object
    * from_lcms_lib_file(ion_charge, ion_types).
        Return Dict[standard_name, Dict[m/z, List[MolecularFormula]]] from LCMS library reference file
    * from_bruker_ref_file().
        Return List[MolecularFormula] from Bruker reference file
    * from_corems_ref_file(delimiter).
        Return List[MolecularFormulaLinkProxy] from CoreMS reference file
    * split(delimiters, string, maxsplit).
        Splits a string using a list of delimiters.
    * mformula_s_to_dict(s_mformulatring, iontype).
        Converts a molecular formula string to a dict
    """

    def __init__(self, ref_file_location):
        self.ref_file_location = Path(ref_file_location)

        if not self.ref_file_location.exists():
            raise FileNotFoundError(ref_file_location)

    def molecular_formula_ref(self, mz, molecular_formula):
        """Instantiate a MolecularFormulaLinkProxy object

        Parameters
        ----------
        mz : float
            target m/z
        molecular_formula : MolecularFormula | LCMSLibRefMolecularFormula
            corems MolecularFormula or LCMSLibRefMolecularFormula object

        Returns
        -------
        MolecularFormulaLinkProxy
            MolecularFormulaLinkProxy object
        """
        return MolecularFormulaLinkProxy(molecular_formula, mz)

    def from_lcms_lib_file(
        self, ion_charge: float, ion_types: List[str]
    ) -> Dict[str, Dict[float, List[LCMSLibRefMolecularFormula]]]:
        """Create a dictionary of LCMSLibRefMolecularFormula objects from LCMS library reference file

        The file is read as CSV with a header row and must provide the columns
        "Neutral Formula", "Compound Name", "KEGG ID" and "NEW MIX".

        Parameters
        ----------
        ion_charge : float
            ion charge
        ion_types : List[str]
            list of ion types (currently not used by this method)

        Returns
        -------
        Dict
            Dict[standard_name, Dict[m/z, List[LCMSLibRefMolecularFormula]]] from LCMS library reference file.
            m/z is the ``protonated_mz`` of the formula; standard_name is the name of the
            molecular standard mix (column "NEW MIX")
        """
        data = {}

        # NOTE(review): the handle is already opened in text mode, so `encoding`
        # probably has no effect on decoding here -- confirm before relying on it.
        with open(self.ref_file_location) as ref_f:
            df = pd.read_csv(ref_f, header=0, encoding="unicode_escape")

        for _, row in df.iterrows():
            formula_s = row["Neutral Formula"]
            formula_dict = self.mformula_s_to_dict(formula_s, Labels.neutral)
            name = row["Compound Name"]
            kegg_id = row["KEGG ID"]
            standard_name = row["NEW MIX"]
            # NOTE(review): CAS is filled from the "KEGG ID" column, exactly as before;
            # this looks like a copy/paste slip -- confirm the CAS column name.
            cas = row["KEGG ID"]

            molf_formula = LCMSLibRefMolecularFormula(
                formula_dict,
                ion_charge,
                Labels.neutral,
                name=name,
                kegg_id=kegg_id,
                cas=cas,
            )

            # TODO change it to target ion types and add ion type in the data structure
            # Every entry is keyed by protonated_mz. Previously the first compound of
            # each mix was keyed by mz_calc, so a single mix mixed two kinds of keys.
            mz_calc = molf_formula.protonated_mz
            data.setdefault(standard_name, {}).setdefault(mz_calc, []).append(
                molf_formula
            )

        return data

    def from_bruker_ref_file(self) -> List[MolecularFormula]:
        """Create a list of MolecularFormula objects from Bruker reference file

        The first line is treated as a header and skipped. Every other non-blank
        line is split on whitespace into: formula string, m/z, and a charge field
        whose last character is the sign (e.g. ``1+``).

        Returns
        -------
        List[MolecularFormula]
            List of MolecularFormula objects from Bruker reference file
        """
        list_mf_obj = []

        with open(self.ref_file_location) as ref_f:
            ref_f.readline()  # header line, not used

            for line in ref_f:
                # Skip blank and whitespace-only lines (the latter used to raise IndexError)
                if not line.strip():
                    continue

                list_ref = line.split()

                charge_field = list_ref[2]
                magnitude = int(charge_field[:-1])
                ion_charge = magnitude if charge_field[-1] == "+" else -magnitude

                mz = float(list_ref[1])
                formula_dict = self.mformula_s_to_dict(list_ref[0])

                list_mf_obj.append(
                    MolecularFormula(formula_dict, ion_charge, external_mz=mz)
                )

        return list_mf_obj

    def from_corems_ref_file(self, delimiter="\t"):  # pragma: no cover
        """Create a list of MolecularFormulaLinkProxy objects from CoreMS reference file

        Not being used

        The first line of the file at ``ref_file_location`` is treated as a header
        and skipped. Every other non-blank line is split on `delimiter` into:
        formula string, ion charge, ion type.

        Parameters
        ----------
        delimiter : str
            delimiter used in the reference file

        Returns
        -------
        List[MolecularFormulaLinkProxy]
            One proxy per data line of the CoreMS reference file
        """
        list_mf_obj = []

        # Read the file this importer was created for (a fixed relative path was used before)
        with open(self.ref_file_location) as ref_f:
            ref_f.readline()  # header line, not used

            for line in ref_f:
                if not line.strip():
                    continue

                list_ref = line.strip("\n").split(delimiter)

                formula_string = list_ref[0]
                ion_charge = int(list_ref[1])
                ion_type = list_ref[2]

                molform = MolecularFormula(
                    formula_string, ion_charge, ion_type=ion_type
                )

                # molecular_formula_ref needs (mz, molecular_formula); the file has no
                # m/z column, so the formula's own mz_calc is used.
                # NOTE(review): confirm mz_calc is the intended target m/z.
                list_mf_obj.append(
                    self.molecular_formula_ref(molform.mz_calc, molform)
                )

        return list_mf_obj

    def split(self, delimiters, string, maxsplit=0):  # pragma: no cover
        """Splits a string using a list of delimiters.

        Does not work when formula has atoms with same characters, e.g. C10H21NNa

        Parameters
        ----------
        delimiters : list
            list of delimiters (matched literally)
        string : str
            string to be split
        maxsplit : int, optional
            maximum number of splits. Default is 0 (no limit)

        Returns
        -------
        tuple
            (isotopes, counts): the delimiters found in the string, and the pieces
            of the string left between them
        """
        regex_pattern = "|".join(map(re.escape, delimiters))
        isotopes = re.findall(regex_pattern, string)
        # maxsplit must be a keyword: positional use is deprecated since Python 3.13
        counts = re.split(regex_pattern, string, maxsplit=maxsplit)
        return isotopes, counts

    def mformula_s_to_dict(self, s_mformulatring, iontype="unknown"):
        """Converts a molecular formula string to a dict

        Parameters
        ----------
        s_mformulatring : str
            molecular formula string, e.g. 'C10H21NNa'
        iontype : str, optional
            ion type. Default is 'unknown'

        Returns
        -------
        dict
            molecular formula dictionary: atom symbol -> count, plus the ion type
            stored under ``Labels.ion_type``. Counts of a repeated element are summed.

        Notes
        -----
        Does not work if the atomic mass number is passed, e.g. 37Cl, 81Br; the convention
        follows the light isotope labeling: 35Cl is Cl, 12C is C, etc. Characters that are
        not part of an element symbol or its count are ignored.
        If you need to use heavy isotopes please use another reference file format that
        separates the formula string by a blank space and parse it using the function
        from_corems_ref_file

        Raises
        ------
        TypeError
            Atom does not exist in Atoms.atoms_order list
        Exception
            Empty molecular formula
        """
        if not s_mformulatring:
            raise Exception("Empty molecular formula")

        dict_res = {}

        # A symbol is one uppercase letter plus an optional lowercase letter,
        # followed by an optional count (a missing count means 1).
        for atom, count in re.findall(r"([A-Z][a-z]?)([0-9]*)", s_mformulatring):
            if atom not in Atoms.atoms_order:
                raise TypeError(
                    "Atom %s does not exist in Atoms.atoms_order list" % atom
                )
            # Sum repeated elements (e.g. CH3CH2OH) instead of overwriting them
            dict_res[atom] = dict_res.get(atom, 0) + (int(count) if count else 1)

        dict_res[Labels.ion_type] = iontype

        return dict_res
class MolecularFormulaLinkProxy:
    """Proxy class for MolecularFormulaLink to be used in the molecular formula ref file import

    Parameters
    ----------
    molecular_formula : MolecularFormula | LCMSLibRefMolecularFormula
        corems MolecularFormula or LCMSLibRefMolecularFormula object
    mz : float
        target m/z

    Attributes
    ----------
    C : int
        number of carbon atoms
    H : int
        number of hydrogen atoms
    H_C : float or None
        ratio of hydrogen to carbon atoms; None when the ratio is undefined
        (no carbon atoms, or a missing hydrogen count)
    class_label : str
        JSON dump of the molecular formula ``class_dict``
    mz_calc : float
        target m/z given to the constructor, converted to float
    dbe : int
        double bond equivalent
    formula_dict : dict
        molecular formula dictionary

    Methods
    -------
    * to_dict().
        return molecular formula dictionary

    """

    def __init__(self, molecular_formula, mz):
        carbon = molecular_formula.get("C")
        hydrogen = molecular_formula.get("H")

        self.C = carbon
        self.H = hydrogen
        # Guard the ratio: a carbon count of 0 or None used to raise
        # ZeroDivisionError / TypeError for carbon-free formulas.
        self.H_C = hydrogen / carbon if carbon and hydrogen is not None else None
        self.class_label = json.dumps(molecular_formula.class_dict)
        self.mz_calc = float(mz)
        self.dbe = molecular_formula.dbe
        self.formula_dict = molecular_formula.to_dict()

    def to_dict(self):
        """Return the molecular formula dictionary captured at construction time."""
        return self.formula_dict
Proxy class for MolecularFormulaLink to be used in the molecular formula ref file import
Parameters
- molecular_formula (MolecularFormula | LCMSLibRefMolecularFormula): corems MolecularFormula or LCMSLibRefMolecularFormula object
- mz (float): target m/z
Attributes
- C (int): number of carbon atoms
- H (int): number of hydrogen atoms
- H_C (float): ratio of hydrogen to carbon atoms
- class_label (str): molecular formula class label
- mz_calc (float): target m/z passed to the constructor, converted to float
- dbe (int): double bond equivalent
- formula_dict (dict): molecular formula dictionary
Methods
- to_dict(). return molecular formula dictionary
54 def __init__(self, molecular_formula, mz): 55 self.C = molecular_formula.get("C") 56 self.H = molecular_formula.get("H") 57 self.H_C = molecular_formula.get("H") / molecular_formula.get("C") 58 self.class_label = json.dumps(molecular_formula.class_dict) 59 self.mz_calc = float(mz) 60 self.dbe = molecular_formula.dbe 61 self.formula_dict = molecular_formula.to_dict()
class ImportMassListRef:
    """Import Mass List from Reference File

    Parameters
    ----------
    ref_file_location : str
        path to the reference file

    Attributes
    ----------
    ref_file_location : Path
        path to the reference file

    Raises
    ------
    FileNotFoundError
        If ref_file_location does not exist

    Methods
    -------
    * molecular_formula_ref(mz, molecular_formula).
        Return MolecularFormulaLinkProxy object
    * from_lcms_lib_file(ion_charge, ion_types).
        Return Dict[standard_name, Dict[m/z, List[MolecularFormula]]] from LCMS library reference file
    * from_bruker_ref_file().
        Return List[MolecularFormula] from Bruker reference file
    * from_corems_ref_file(delimiter).
        Return List[MolecularFormulaLinkProxy] from CoreMS reference file
    * split(delimiters, string, maxsplit).
        Splits a string using a list of delimiters.
    * mformula_s_to_dict(s_mformulatring, iontype).
        Converts a molecular formula string to a dict
    """

    def __init__(self, ref_file_location):
        self.ref_file_location = Path(ref_file_location)

        if not self.ref_file_location.exists():
            raise FileNotFoundError(ref_file_location)

    def molecular_formula_ref(self, mz, molecular_formula):
        """Instantiate a MolecularFormulaLinkProxy object

        Parameters
        ----------
        mz : float
            target m/z
        molecular_formula : MolecularFormula | LCMSLibRefMolecularFormula
            corems MolecularFormula or LCMSLibRefMolecularFormula object

        Returns
        -------
        MolecularFormulaLinkProxy
            MolecularFormulaLinkProxy object
        """
        return MolecularFormulaLinkProxy(molecular_formula, mz)

    def from_lcms_lib_file(
        self, ion_charge: float, ion_types: List[str]
    ) -> Dict[str, Dict[float, List[LCMSLibRefMolecularFormula]]]:
        """Create a dictionary of LCMSLibRefMolecularFormula objects from LCMS library reference file

        The file is read as CSV with a header row and must provide the columns
        "Neutral Formula", "Compound Name", "KEGG ID" and "NEW MIX".

        Parameters
        ----------
        ion_charge : float
            ion charge
        ion_types : List[str]
            list of ion types (currently not used by this method)

        Returns
        -------
        Dict
            Dict[standard_name, Dict[m/z, List[LCMSLibRefMolecularFormula]]] from LCMS library reference file.
            m/z is the ``protonated_mz`` of the formula; standard_name is the name of the
            molecular standard mix (column "NEW MIX")
        """
        data = {}

        # NOTE(review): the handle is already opened in text mode, so `encoding`
        # probably has no effect on decoding here -- confirm before relying on it.
        with open(self.ref_file_location) as ref_f:
            df = pd.read_csv(ref_f, header=0, encoding="unicode_escape")

        for _, row in df.iterrows():
            formula_s = row["Neutral Formula"]
            formula_dict = self.mformula_s_to_dict(formula_s, Labels.neutral)
            name = row["Compound Name"]
            kegg_id = row["KEGG ID"]
            standard_name = row["NEW MIX"]
            # NOTE(review): CAS is filled from the "KEGG ID" column, exactly as before;
            # this looks like a copy/paste slip -- confirm the CAS column name.
            cas = row["KEGG ID"]

            molf_formula = LCMSLibRefMolecularFormula(
                formula_dict,
                ion_charge,
                Labels.neutral,
                name=name,
                kegg_id=kegg_id,
                cas=cas,
            )

            # TODO change it to target ion types and add ion type in the data structure
            # Every entry is keyed by protonated_mz. Previously the first compound of
            # each mix was keyed by mz_calc, so a single mix mixed two kinds of keys.
            mz_calc = molf_formula.protonated_mz
            data.setdefault(standard_name, {}).setdefault(mz_calc, []).append(
                molf_formula
            )

        return data

    def from_bruker_ref_file(self) -> List[MolecularFormula]:
        """Create a list of MolecularFormula objects from Bruker reference file

        The first line is treated as a header and skipped. Every other non-blank
        line is split on whitespace into: formula string, m/z, and a charge field
        whose last character is the sign (e.g. ``1+``).

        Returns
        -------
        List[MolecularFormula]
            List of MolecularFormula objects from Bruker reference file
        """
        list_mf_obj = []

        with open(self.ref_file_location) as ref_f:
            ref_f.readline()  # header line, not used

            for line in ref_f:
                # Skip blank and whitespace-only lines (the latter used to raise IndexError)
                if not line.strip():
                    continue

                list_ref = line.split()

                charge_field = list_ref[2]
                magnitude = int(charge_field[:-1])
                ion_charge = magnitude if charge_field[-1] == "+" else -magnitude

                mz = float(list_ref[1])
                formula_dict = self.mformula_s_to_dict(list_ref[0])

                list_mf_obj.append(
                    MolecularFormula(formula_dict, ion_charge, external_mz=mz)
                )

        return list_mf_obj

    def from_corems_ref_file(self, delimiter="\t"):  # pragma: no cover
        """Create a list of MolecularFormulaLinkProxy objects from CoreMS reference file

        Not being used

        The first line of the file at ``ref_file_location`` is treated as a header
        and skipped. Every other non-blank line is split on `delimiter` into:
        formula string, ion charge, ion type.

        Parameters
        ----------
        delimiter : str
            delimiter used in the reference file

        Returns
        -------
        List[MolecularFormulaLinkProxy]
            One proxy per data line of the CoreMS reference file
        """
        list_mf_obj = []

        # Read the file this importer was created for (a fixed relative path was used before)
        with open(self.ref_file_location) as ref_f:
            ref_f.readline()  # header line, not used

            for line in ref_f:
                if not line.strip():
                    continue

                list_ref = line.strip("\n").split(delimiter)

                formula_string = list_ref[0]
                ion_charge = int(list_ref[1])
                ion_type = list_ref[2]

                molform = MolecularFormula(
                    formula_string, ion_charge, ion_type=ion_type
                )

                # molecular_formula_ref needs (mz, molecular_formula); the file has no
                # m/z column, so the formula's own mz_calc is used.
                # NOTE(review): confirm mz_calc is the intended target m/z.
                list_mf_obj.append(
                    self.molecular_formula_ref(molform.mz_calc, molform)
                )

        return list_mf_obj

    def split(self, delimiters, string, maxsplit=0):  # pragma: no cover
        """Splits a string using a list of delimiters.

        Does not work when formula has atoms with same characters, e.g. C10H21NNa

        Parameters
        ----------
        delimiters : list
            list of delimiters (matched literally)
        string : str
            string to be split
        maxsplit : int, optional
            maximum number of splits. Default is 0 (no limit)

        Returns
        -------
        tuple
            (isotopes, counts): the delimiters found in the string, and the pieces
            of the string left between them
        """
        regex_pattern = "|".join(map(re.escape, delimiters))
        isotopes = re.findall(regex_pattern, string)
        # maxsplit must be a keyword: positional use is deprecated since Python 3.13
        counts = re.split(regex_pattern, string, maxsplit=maxsplit)
        return isotopes, counts

    def mformula_s_to_dict(self, s_mformulatring, iontype="unknown"):
        """Converts a molecular formula string to a dict

        Parameters
        ----------
        s_mformulatring : str
            molecular formula string, e.g. 'C10H21NNa'
        iontype : str, optional
            ion type. Default is 'unknown'

        Returns
        -------
        dict
            molecular formula dictionary: atom symbol -> count, plus the ion type
            stored under ``Labels.ion_type``. Counts of a repeated element are summed.

        Notes
        -----
        Does not work if the atomic mass number is passed, e.g. 37Cl, 81Br; the convention
        follows the light isotope labeling: 35Cl is Cl, 12C is C, etc. Characters that are
        not part of an element symbol or its count are ignored.
        If you need to use heavy isotopes please use another reference file format that
        separates the formula string by a blank space and parse it using the function
        from_corems_ref_file

        Raises
        ------
        TypeError
            Atom does not exist in Atoms.atoms_order list
        Exception
            Empty molecular formula
        """
        if not s_mformulatring:
            raise Exception("Empty molecular formula")

        dict_res = {}

        # A symbol is one uppercase letter plus an optional lowercase letter,
        # followed by an optional count (a missing count means 1).
        for atom, count in re.findall(r"([A-Z][a-z]?)([0-9]*)", s_mformulatring):
            if atom not in Atoms.atoms_order:
                raise TypeError(
                    "Atom %s does not exist in Atoms.atoms_order list" % atom
                )
            # Sum repeated elements (e.g. CH3CH2OH) instead of overwriting them
            dict_res[atom] = dict_res.get(atom, 0) + (int(count) if count else 1)

        dict_res[Labels.ion_type] = iontype

        return dict_res
Import Mass List from Reference File
Parameters
- ref_file_location (str): path to the reference file
Attributes
- ref_file_location (str): path to the reference file
Methods
- molecular_formula_ref(mz, molecular_formula). Return MolecularFormulaLinkProxy object
- from_lcms_lib_file(ion_charge, ion_types). Return Dict[standard_name, Dict[m/z, List[MolecularFormula]]] from LCMS library reference file
- from_bruker_ref_file(). Return List[MolecularFormula] from Bruker reference file
- from_corems_ref_file(delimiter). Return List[MolecularFormula] from CoreMS reference file
- split(delimiters, string, maxsplit). Splits a string using a list of delimiters.
- mformula_s_to_dict(s_mformulatring, iontype). Converts a molecular formula string to a dict
def molecular_formula_ref(self, mz, molecular_formula):
    """Wrap a molecular formula and its target m/z in a MolecularFormulaLinkProxy

    Parameters
    ----------
    mz : float
        target m/z
    molecular_formula : MolecularFormula | LCMSLibRefMolecularFormula
        corems MolecularFormula or LCMSLibRefMolecularFormula object

    Returns
    -------
    MolecularFormulaLinkProxy
        proxy built from the formula and the target m/z
    """
    # The proxy constructor takes the formula first and the m/z second
    proxy = MolecularFormulaLinkProxy(molecular_formula, mz)
    return proxy
Instantiate a MolecularFormulaLinkProxy object
Parameters
- mz (float): target m/z
- molecular_formula (MolecularFormula | LCMSLibRefMolecularFormula): corems MolecularFormula or LCMSLibRefMolecularFormula object
Returns
- MolecularFormulaLinkProxy: MolecularFormulaLinkProxy object
def from_lcms_lib_file(
    self, ion_charge: float, ion_types: List[str]
) -> Dict[str, Dict[float, List[LCMSLibRefMolecularFormula]]]:
    """Create a dictionary of LCMSLibRefMolecularFormula objects from LCMS library reference file

    The file is read as CSV with a header row and must provide the columns
    "Neutral Formula", "Compound Name", "KEGG ID" and "NEW MIX".

    Parameters
    ----------
    ion_charge : float
        ion charge
    ion_types : List[str]
        list of ion types (currently not used by this method)

    Returns
    -------
    Dict
        Dict[standard_name, Dict[m/z, List[LCMSLibRefMolecularFormula]]] from LCMS library reference file.
        m/z is the ``protonated_mz`` of the formula; standard_name is the name of the
        molecular standard mix (column "NEW MIX")
    """
    data = {}

    # NOTE(review): the handle is already opened in text mode, so `encoding`
    # probably has no effect on decoding here -- confirm before relying on it.
    with open(self.ref_file_location) as ref_f:
        df = pd.read_csv(ref_f, header=0, encoding="unicode_escape")

    for _, row in df.iterrows():
        formula_s = row["Neutral Formula"]
        formula_dict = self.mformula_s_to_dict(formula_s, Labels.neutral)
        name = row["Compound Name"]
        kegg_id = row["KEGG ID"]
        standard_name = row["NEW MIX"]
        # NOTE(review): CAS is filled from the "KEGG ID" column, exactly as before;
        # this looks like a copy/paste slip -- confirm the CAS column name.
        cas = row["KEGG ID"]

        molf_formula = LCMSLibRefMolecularFormula(
            formula_dict,
            ion_charge,
            Labels.neutral,
            name=name,
            kegg_id=kegg_id,
            cas=cas,
        )

        # TODO change it to target ion types and add ion type in the data structure
        # Every entry is keyed by protonated_mz. Previously the first compound of
        # each mix was keyed by mz_calc, so a single mix mixed two kinds of keys.
        mz_calc = molf_formula.protonated_mz
        data.setdefault(standard_name, {}).setdefault(mz_calc, []).append(
            molf_formula
        )

    return data
Create a dictionary of LCMSLibRefMolecularFormula objects from LCMS library reference file
Parameters
- ion_charge (float): ion charge
- ion_types (List[str]): list of ion types
Returns
- Dict: Dict[standard_name, Dict[m/z, List[MolecularFormula]]] from LCMS library reference file. m/z is the target m/z; standard_name is the name of the molecular standard mix; MolecularFormula is the corems molecular formula class
def from_bruker_ref_file(self) -> List[MolecularFormula]:
    """Create a list of MolecularFormula objects from Bruker reference file

    The first line is treated as a header and skipped. Every other non-blank
    line is split on whitespace into: formula string, m/z, and a charge field
    whose last character is the sign (e.g. ``1+``).

    Returns
    -------
    List[MolecularFormula]
        List of MolecularFormula objects from Bruker reference file
    """
    list_mf_obj = []

    with open(self.ref_file_location) as ref_f:
        ref_f.readline()  # header line, not used

        for line in ref_f:
            # Skip blank and whitespace-only lines (the latter used to raise IndexError)
            if not line.strip():
                continue

            list_ref = line.split()

            charge_field = list_ref[2]
            magnitude = int(charge_field[:-1])
            ion_charge = magnitude if charge_field[-1] == "+" else -magnitude

            mz = float(list_ref[1])
            formula_dict = self.mformula_s_to_dict(list_ref[0])

            list_mf_obj.append(
                MolecularFormula(formula_dict, ion_charge, external_mz=mz)
            )

    return list_mf_obj
Create a list of MolecularFormula objects from Bruker reference file
Returns
- List[MolecularFormula]: List of MolecularFormula objects from Bruker reference file
def from_corems_ref_file(self, delimiter="\t"):  # pragma: no cover
    """Create a list of MolecularFormulaLinkProxy objects from CoreMS reference file

    Not being used

    The first line of the file at ``ref_file_location`` is treated as a header
    and skipped. Every other non-blank line is split on `delimiter` into:
    formula string, ion charge, ion type.

    Parameters
    ----------
    delimiter : str
        delimiter used in the reference file

    Returns
    -------
    List[MolecularFormulaLinkProxy]
        One proxy per data line of the CoreMS reference file
    """
    list_mf_obj = []

    # Read the file this importer was created for (a fixed relative path was used before)
    with open(self.ref_file_location) as ref_f:
        ref_f.readline()  # header line, not used

        for line in ref_f:
            if not line.strip():
                continue

            list_ref = line.strip("\n").split(delimiter)

            formula_string = list_ref[0]
            ion_charge = int(list_ref[1])
            ion_type = list_ref[2]

            molform = MolecularFormula(
                formula_string, ion_charge, ion_type=ion_type
            )

            # molecular_formula_ref needs (mz, molecular_formula); the file has no
            # m/z column, so the formula's own mz_calc is used.
            # NOTE(review): confirm mz_calc is the intended target m/z.
            list_mf_obj.append(
                self.molecular_formula_ref(molform.mz_calc, molform)
            )

    return list_mf_obj
Create a list of MolecularFormula objects from CoreMS reference file
Not being used
Parameters
- delimiter (str): delimiter used in the reference file
Returns
- List[MolecularFormula]: List of MolecularFormula objects from CoreMS reference file
def split(self, delimiters, string, maxsplit=0):  # pragma: no cover
    """Splits a string using a list of delimiters.

    Does not work when formula has atoms with same characters, e.g. C10H21NNa

    Parameters
    ----------
    delimiters : list
        list of delimiters (matched literally)
    string : str
        string to be split
    maxsplit : int, optional
        maximum number of splits. Default is 0 (no limit)

    Returns
    -------
    tuple
        (isotopes, counts): the delimiters found in the string, and the pieces
        of the string left between them
    """
    regex_pattern = "|".join(map(re.escape, delimiters))
    isotopes = re.findall(regex_pattern, string)
    # maxsplit must be a keyword: positional use is deprecated since Python 3.13
    counts = re.split(regex_pattern, string, maxsplit=maxsplit)
    return isotopes, counts
Splits a string using a list of delimiters.
Does not work when the formula has atoms that share the same characters, e.g. C10H21NNa
Parameters
- delimiters (list): list of delimiters
- string (str): string to be split
- maxsplit (int, optional): maximum number of splits. Default is 0
Returns
- list: list of strings obtained after splitting the string
- list: list of counts obtained after splitting the string
def mformula_s_to_dict(self, s_mformulatring, iontype="unknown"):
    """Converts a molecular formula string to a dict

    Parameters
    ----------
    s_mformulatring : str
        molecular formula string, e.g. 'C10H21NNa'
    iontype : str, optional
        ion type. Default is 'unknown'

    Returns
    -------
    dict
        molecular formula dictionary: atom symbol -> count, plus the ion type
        stored under ``Labels.ion_type``. Counts of a repeated element are summed.

    Notes
    -----
    Does not work if the atomic mass number is passed, e.g. 37Cl, 81Br; the convention
    follows the light isotope labeling: 35Cl is Cl, 12C is C, etc. Characters that are
    not part of an element symbol or its count are ignored.
    If you need to use heavy isotopes please use another reference file format that
    separates the formula string by a blank space and parse it using the function
    from_corems_ref_file

    Raises
    ------
    TypeError
        Atom does not exist in Atoms.atoms_order list
    Exception
        Empty molecular formula
    """
    if not s_mformulatring:
        raise Exception("Empty molecular formula")

    dict_res = {}

    # A symbol is one uppercase letter plus an optional lowercase letter,
    # followed by an optional count (a missing count means 1).
    for atom, count in re.findall(r"([A-Z][a-z]?)([0-9]*)", s_mformulatring):
        if atom not in Atoms.atoms_order:
            raise TypeError(
                "Atom %s does not exist in Atoms.atoms_order list" % atom
            )
        # Sum repeated elements (e.g. CH3CH2OH) instead of overwriting them
        dict_res[atom] = dict_res.get(atom, 0) + (int(count) if count else 1)

    dict_res[Labels.ion_type] = iontype

    return dict_res
Converts a molecular formula string to a dict
Parameters
- s_mformulatring (str): molecular formula string, e.g. 'C10H21NNa'
- iontype (str, optional): ion type. Default is 'unknown'
Returns
- dict: molecular formula dictionary
Notes
Does not work if the atomic mass number is passed, e.g. 37Cl or 81Br; the convention follows the light-isotope labeling, so 35Cl is written Cl, 12C is written C, etc. If you need to use heavy isotopes, please use another reference file format that separates the formula string by a blank space, and parse it using the function from_corems_ref_file
Raises
- TypeError: Atom does not exist in Atoms.atoms_order list
- Exception: Empty molecular formula