griptomo.core.pdb2graph
1# August George, 2022, PNNL 2 3import numpy as np 4import pandas as pd 5import networkx as nx 6from Bio.PDB import ( 7 PDBParser, 8 MMCIFParser, 9) # BioPython PDB doc: https://biopython.org/docs/1.75/api/Bio.PDB.html 10from scipy.spatial.distance import pdist, squareform 11import argparse 12import matplotlib.pyplot as plt 13from pathlib import Path 14 15# for convenience 16import warnings 17 18warnings.filterwarnings("ignore") 19 20 21def get_hydrophobicity(name, warn=False): 22 """ 23 Gets hydrophobicity based on amino acid name. 24 25 Parameters 26 ---------- 27 name : str 28 Amino acid name. 29 warn : bool, optional 30 If True, print a warning when hydrophobicity can't be determined. Default is False. 31 32 Returns 33 ------- 34 float 35 Hydrophobicity value. 36 37 Notes 38 ----- 39 Returns NaN if the amino acid name is not recognized. 40 41 Ref: https://www.cgl.ucsf.edu/chimera/docs/UsersGuide/midas/hydrophob.html 42 """ 43 acid_names = [ 44 "Ile", 45 "Val", 46 "Leu", 47 "Phe", 48 "Cys", 49 "Met", 50 "Ala", 51 "Gly", 52 "Thr", 53 "Ser", 54 "Trp", 55 "Tyr", 56 "Pro", 57 "His", 58 "Glu", 59 "Gln", 60 "Asp", 61 "Asn", 62 "Lys", 63 "Arg", 64 ] 65 acid_kdH = [ 66 4.5, 67 4.2, 68 3.8, 69 2.8, 70 2.5, 71 1.9, 72 1.8, 73 -0.4, 74 -0.7, 75 -0.8, 76 -0.9, 77 -1.3, 78 -1.6, 79 -3.2, 80 -3.5, 81 -3.5, 82 -3.5, 83 -3.5, 84 -3.9, 85 -4.5, 86 ] 87 hydro_dict = dict(zip(acid_names, acid_kdH)) 88 try: 89 hydrophobicity = hydro_dict[ 90 f"{name.title()}" 91 ] # make sure string is in correct format 92 except: 93 hydrophobicity = np.nan 94 if warn == True: 95 print( 96 f"warning! could not assign hydrophobicity for {name.title()}. setting hydrophobicity=NaN" 97 ) 98 return hydrophobicity 99 100 101def PDB_to_df(pdb_code, fname, pdbx, offset, CA_only=1): 102 """ 103 Loads a PDB (or PDBx) file and stores the atom coordinates and residue name and number into a dataframe. 104 105 Note: if the PDB file has more than one model, the first model is chosen. 106 107 Parameters 108 ---------- 109 pdb_code : str 110 PDB ID / label for the protein of interest. 111 fname : str 112 Filename for the protein of interest. Can be PDB or PDBx format. 113 pdbx : int 114 Set to 1 if using the newer PDBx file format. 115 offset : int 116 Index offset in case the first residue ID in PDB file is not the first physical residue (e.g. PDB starts at 5th residue). 117 CA_only : int, optional 118 Set to 1 [default] if using only alpha carbons, else all atoms are used. 119 120 Returns 121 ------- 122 pandas.DataFrame 123 Dataframe containing every atom's x, y, z coordinates and serial number. 124 """ 125 # pick a file reader based on the filetype. Newer/bigger structures will use PDBx 126 if pdbx == 1: 127 parser = MMCIFParser() # for PDBx files (.CIF) 128 else: 129 parser = PDBParser() # for PDB files (.PDB) 130 structure = parser.get_structure( 131 pdb_code, fname 132 ) # parse pdb file and store as a PDB structure object 133 134 # makes lists storing the alpha carbon (CA) coordinates and serial number 135 atom_coord_list = [] 136 res_id_list = [] 137 hydro_list = [] 138 139 # loop through PDB structure object to get alpha carbon (or all atom) coordinates and serial number 140 for i, model in enumerate( 141 structure 142 ): # iterate through each model in the structure, exit loop after first model 143 if ( 144 i > 0 145 ): # NMR files have multiple models, but only one model is used for the graph 146 break 147 for chain in model: # iterate through each chain in the model 148 for residue in chain: # iterate through each residue in the chain 149 for atom in residue: # iterate through each atom in residue 150 if CA_only == 1: 151 if ( 152 atom.get_name() == "CA" 153 ): # if atom is an alpha carbon then append its coordinates and serial number 154 atom_coord_list.append( 155 atom.get_vector()[:] 156 ) # convert vector object into list of numbers 157 atom_sn = atom.get_serial_number() 158 159 # biopython PDB module residue.__repr__() returns a string representing the residue: 160 # e.g. '<Residue THR het= resseq=1 icode= >' 161 # 'resseq' is the residue serial number 162 # --> split the string until the number after 'resseq=' is all that remains 163 res_repr = residue.__repr__() 164 res_repr_tmp = res_repr.split("resseq=", 1)[1] 165 res_sn = ( 166 int(res_repr_tmp.split(" ", 1)[0]) + offset 167 ) # get residue serial number (+ offest) 168 res_name = ( 169 residue.get_resname() 170 ) # get first letters of residue name 171 hydro_list.append( 172 get_hydrophobicity(res_name) 173 ) # get hydrophobicity based on residue name 174 175 # store residue ID: <first letter of residue name><residue serial number + offset> 176 res_id_list.append(f"{res_name}{res_sn}-{atom_sn}") 177 else: # get all atoms 178 if residue.get_resname() != "HOH": 179 atom_coord_list.append( 180 atom.get_vector()[:] 181 ) # convert vector object into list of numbers 182 atom_sn = atom.get_serial_number() 183 184 # biopython PDB module residue.__repr__() returns a string representing the residue: 185 # e.g. '<Residue THR het= resseq=1 icode= >' 186 # 'resseq' is the residue serial number 187 # --> split the string until the number after 'resseq=' is all that remains 188 res_repr = residue.__repr__() 189 res_repr_tmp = res_repr.split("resseq=", 1)[1] 190 res_sn = ( 191 int(res_repr_tmp.split(" ", 1)[0]) + offset 192 ) # get residue serial number (+ offest) 193 res_name = ( 194 residue.get_resname() 195 ) # get first letters of residue name 196 hydro_list.append( 197 get_hydrophobicity(res_name) 198 ) # get hydrophobicity based on residue name 199 200 # store residue ID: <first letter of residue name><residue serial number + offset> 201 res_id_list.append(f"{res_name}{res_sn}-{atom_sn}") 202 203 # create dataframes for atom coordinates, serial numbers, and then combine them. 204 # there's probably a better way to do this in a single line 205 atom_df = pd.DataFrame(atom_coord_list, columns=["x", "y", "z"]) 206 res_id_df = pd.DataFrame(res_id_list, columns=["residue id"]) 207 hydro_df = pd.DataFrame(hydro_list, columns=["hydrophobicity"]) 208 df = pd.concat([atom_df, res_id_df, hydro_df], axis=1) 209 return df 210 211 212def PDB_df_to_G(PDB_df, d_cut=8.0): 213 """ 214 Converts a dataframe containing alpha carbon / atom coordinates (in Angstroms) into a graph, G(V,E). 215 216 Each vertex, V, is an alpha carbon / atom. Two alpha carbons / atoms with a distance (in Angstroms) less than a cutoff, d_cut, are connected by an edge, E. 217 218 Parameters 219 ---------- 220 PDB_df : pandas.DataFrame 221 A dataframe containing alpha carbon / atom coordinate columns labeled: 'x', 'y', and 'z'. 222 d_cut : float, optional 223 Threshold for two alpha carbons / atoms to be connected (in Angstroms) by an edge. Defaults to 8.0. 224 225 Returns 226 ------- 227 networkx.Graph 228 Protein structure network graph, G(V,E). 229 """ 230 # create distance matrix where element i,j is the Euclidean distance between vertex i and vertex j 231 d_matrix = squareform(pdist(PDB_df[["x", "y", "z"]], "euclid")) 232 d_matrix_thresh = np.where( 233 d_matrix > d_cut, 0, d_matrix 234 ) # if the distance is > d_cut, replace it with 0 (i.e. remove edge) 235 G = nx.convert_matrix.from_numpy_array( 236 d_matrix_thresh 237 ) # convert distance matrix to networkx graph object 238 239 # create mapping to relabel nodes from index (i.e. 0...n-1) to residue id (i.e. str(<resname><res s/n + offset>) for each residue) 240 label_mapping = {} 241 hydro_mapping = {} 242 for node in G.nodes(): # here the node value is an index (0-n-1) 243 label_mapping[node] = PDB_df["residue id"][node] 244 hydro_mapping[node] = PDB_df["hydrophobicity"][node] 245 246 betweenness = nx.betweenness_centrality(G) 247 nx.set_node_attributes(G, betweenness, "betweenness") 248 nx.set_node_attributes(G, hydro_mapping, "hydrophobicity") 249 H = nx.relabel_nodes( 250 G, label_mapping 251 ) # update the nodes and store as new graph object 252 return H 253 254 255def save_data(df, G, df_name, G_name): 256 """ 257 Convenience function that stores dataframe as .csv and graph as .gexf file. 258 259 Parameters 260 ---------- 261 df : pandas.DataFrame 262 Dataframe to save. 263 G : networkx.Graph 264 Graph to save. 265 df_name : str 266 Output filename for dataframe .csv. 267 G_name : str 268 Output filename for graph .gexf. 269 """ 270 df.to_csv(f"{df_name}.csv") # write dataframe to .csv 271 nx.write_gexf(G, f"{G_name}.gexf") # write graph to .gexf (graph network XML file) 272 273 274def save_data_at_this_folder(data_path, df, G, df_name, G_name): 275 """ 276 Convenience function that stores dataframe as .csv and graph as .gexf file. 277 278 Parameters 279 ---------- 280 data_path : str or Path 281 Output directory path. 282 df : pandas.DataFrame 283 Dataframe to save. 284 G : networkx.Graph 285 Graph to save. 286 df_name : str 287 Output filename for dataframe .csv. 288 G_name : str 289 Output filename for graph .gexf. 290 """ 291 df_name = Path(data_path, df_name) 292 df.to_csv(f"{df_name}.csv") # write dataframe to .csv 293 G_name = Path(data_path, G_name) 294 nx.write_gexf(G, f"{G_name}.gexf") # write graph to .gexf (graph network XML file) 295 296 297def plot_coordinates(xyz_data, figsize=5): 298 """ 299 Creates a 3D scatter plot containing the xyz data. 300 301 Parameters 302 ---------- 303 xyz_data : np.ndarray 304 A[0] = [x0, y0, z0]. 305 figsize : int, optional 306 Size of figure (figsize x figsize). Defaults to 5. 307 308 Returns 309 ------- 310 matplotlib.pyplot.Figure 311 3D scatter plot figure. 312 """ 313 fig = plt.figure(figsize=(figsize, figsize)) 314 plt.title("x y z coordinates") 315 ax = fig.add_subplot(projection="3d") 316 ax.scatter( 317 xyz_data[:, 0], xyz_data[:, 1], xyz_data[:, 2], c="purple", s=2, alpha=0.3 318 ) 319 return fig 320 321 322def plot_FA_and_CA_coordinates(FA_xyz, CA_xyz, figsize=5): 323 """ 324 Creates a 3D scatter plot containing CA and FA atom coordinates. 325 326 Parameters 327 ---------- 328 FA_xyz : np.ndarray 329 A[0] = [x0, y0, z0] for all atom coordinate data. 330 CA_xyz : np.ndarray 331 A[0] = [x0, y0, z0] for alpha carbon only coordinate data. 332 figsize : int, optional 333 Size of figure (figsize x figsize). Defaults to 5. 334 335 Returns 336 ------- 337 matplotlib.pyplot.Figure 338 3D scatter plot figure. 339 """ 340 fig = plt.figure(figsize=(figsize, figsize)) 341 342 plt.title("x y z coordinates - FA and CA") 343 ax = fig.add_subplot(projection="3d") 344 ax.scatter( 345 FA_xyz[:, 0], FA_xyz[:, 1], FA_xyz[:, 2], c="purple", s=5, alpha=0.3, label="FA" 346 ) 347 ax.scatter( 348 CA_xyz[:, 0], CA_xyz[:, 1], CA_xyz[:, 2], c="black", s=15, alpha=0.5, label="CA" 349 ) 350 ax.legend() 351 return fig 352 353 354def main(args): 355 """ 356 Takes a .pdb(x) file, converts it into a graph, and saves the atom coordinates to .csv and graph as .gexf. 357 358 Parameters 359 ---------- 360 args : argparse.Namespace 361 - pdb_code : str 362 PDB id / protein name. 363 - fname : str 364 PDB/PDBx filename. 365 - d_cut : float 366 Alpha Carbon / atom pairwise contact distance cutoff (in Angstroms). 367 - o : int 368 PDB residue index offset integer. Default is 0. 369 - pdbx : int 370 Set=1 to use pdbx file parser. 371 - CA_only : int 372 Set=1 to use only alpha carbons (0 for all atoms 373 """ 374 # get arguments 375 pdb_code = args.pdb_code 376 fname = args.fname 377 d_cut = args.d_cut 378 o = args.o 379 pdbx = args.pdbx 380 CA_only = args.CA_only 381 382 df = PDB_to_df( 383 pdb_code, fname, pdbx, o, CA_only 384 ) # convert pdb file into dataframe of atom/alpha carbon coordinates and s/n 385 G = PDB_df_to_G(df, d_cut) # convert coordinate dataframe into network graph 386 save_data( 387 df, G, pdb_code, pdb_code 388 ) # save dataframe as .csv and graph as .gexf using the same name as the pdb code 389 390 391if __name__ == "__main__": 392 # example: >> python pdb2graph.py PDBid pdb_file.pdb 8 0 0 1 393 # get input arguments from command line 394 parser = argparse.ArgumentParser() 395 parser.add_argument("pdb_code", help="PDB id / protein name", type=str) 396 parser.add_argument("fname", help="PDB/PDBx filename", type=str) 397 parser.add_argument( 398 "d_cut", 399 help="Alpha Carbon / atom pairwise contact distance cutoff (in Angstroms)", 400 type=float, 401 ) 402 parser.add_argument( 403 "o", help="PDB residue index offset integer. Default is 0.", type=int 404 ) 405 parser.add_argument("pdbx", help="set=1 to use pdbx file parser", type=int) 406 parser.add_argument( 407 "CA_only", help="set=1 to use only alpha carbons (0 for all atoms)", type=int 408 ) 409 args = parser.parse_args() 410 main(args)
22def get_hydrophobicity(name, warn=False): 23 """ 24 Gets hydrophobicity based on amino acid name. 25 26 Parameters 27 ---------- 28 name : str 29 Amino acid name. 30 warn : bool, optional 31 If True, print a warning when hydrophobicity can't be determined. Default is False. 32 33 Returns 34 ------- 35 float 36 Hydrophobicity value. 37 38 Notes 39 ----- 40 Returns NaN if the amino acid name is not recognized. 41 42 Ref: https://www.cgl.ucsf.edu/chimera/docs/UsersGuide/midas/hydrophob.html 43 """ 44 acid_names = [ 45 "Ile", 46 "Val", 47 "Leu", 48 "Phe", 49 "Cys", 50 "Met", 51 "Ala", 52 "Gly", 53 "Thr", 54 "Ser", 55 "Trp", 56 "Tyr", 57 "Pro", 58 "His", 59 "Glu", 60 "Gln", 61 "Asp", 62 "Asn", 63 "Lys", 64 "Arg", 65 ] 66 acid_kdH = [ 67 4.5, 68 4.2, 69 3.8, 70 2.8, 71 2.5, 72 1.9, 73 1.8, 74 -0.4, 75 -0.7, 76 -0.8, 77 -0.9, 78 -1.3, 79 -1.6, 80 -3.2, 81 -3.5, 82 -3.5, 83 -3.5, 84 -3.5, 85 -3.9, 86 -4.5, 87 ] 88 hydro_dict = dict(zip(acid_names, acid_kdH)) 89 try: 90 hydrophobicity = hydro_dict[ 91 f"{name.title()}" 92 ] # make sure string is in correct format 93 except: 94 hydrophobicity = np.nan 95 if warn == True: 96 print( 97 f"warning! could not assign hydrophobicity for {name.title()}. setting hydrophobicity=NaN" 98 ) 99 return hydrophobicity
Gets hydrophobicity based on amino acid name.
Parameters
- name (str): Amino acid name.
- warn (bool, optional): If True, print a warning when hydrophobicity can't be determined. Default is False.
Returns
- float: Hydrophobicity value.
Notes
Returns NaN if the amino acid name is not recognized.
Ref: https://www.cgl.ucsf.edu/chimera/docs/UsersGuide/midas/hydrophob.html
102def PDB_to_df(pdb_code, fname, pdbx, offset, CA_only=1): 103 """ 104 Loads a PDB (or PDBx) file and stores the atom coordinates and residue name and number into a dataframe. 105 106 Note: if the PDB file has more than one model, the first model is chosen. 107 108 Parameters 109 ---------- 110 pdb_code : str 111 PDB ID / label for the protein of interest. 112 fname : str 113 Filename for the protein of interest. Can be PDB or PDBx format. 114 pdbx : int 115 Set to 1 if using the newer PDBx file format. 116 offset : int 117 Index offset in case the first residue ID in PDB file is not the first physical residue (e.g. PDB starts at 5th residue). 118 CA_only : int, optional 119 Set to 1 [default] if using only alpha carbons, else all atoms are used. 120 121 Returns 122 ------- 123 pandas.DataFrame 124 Dataframe containing every atom's x, y, z coordinates and serial number. 125 """ 126 # pick a file reader based on the filetype. Newer/bigger structures will use PDBx 127 if pdbx == 1: 128 parser = MMCIFParser() # for PDBx files (.CIF) 129 else: 130 parser = PDBParser() # for PDB files (.PDB) 131 structure = parser.get_structure( 132 pdb_code, fname 133 ) # parse pdb file and store as a PDB structure object 134 135 # makes lists storing the alpha carbon (CA) coordinates and serial number 136 atom_coord_list = [] 137 res_id_list = [] 138 hydro_list = [] 139 140 # loop through PDB structure object to get alpha carbon (or all atom) coordinates and serial number 141 for i, model in enumerate( 142 structure 143 ): # iterate through each model in the structure, exit loop after first model 144 if ( 145 i > 0 146 ): # NMR files have multiple models, but only one model is used for the graph 147 break 148 for chain in model: # iterate through each chain in the model 149 for residue in chain: # iterate through each residue in the chain 150 for atom in residue: # iterate through each atom in residue 151 if CA_only == 1: 152 if ( 153 atom.get_name() == "CA" 154 ): # if atom is an alpha carbon then append its coordinates and serial number 155 atom_coord_list.append( 156 atom.get_vector()[:] 157 ) # convert vector object into list of numbers 158 atom_sn = atom.get_serial_number() 159 160 # biopython PDB module residue.__repr__() returns a string representing the residue: 161 # e.g. '<Residue THR het= resseq=1 icode= >' 162 # 'resseq' is the residue serial number 163 # --> split the string until the number after 'resseq=' is all that remains 164 res_repr = residue.__repr__() 165 res_repr_tmp = res_repr.split("resseq=", 1)[1] 166 res_sn = ( 167 int(res_repr_tmp.split(" ", 1)[0]) + offset 168 ) # get residue serial number (+ offest) 169 res_name = ( 170 residue.get_resname() 171 ) # get first letters of residue name 172 hydro_list.append( 173 get_hydrophobicity(res_name) 174 ) # get hydrophobicity based on residue name 175 176 # store residue ID: <first letter of residue name><residue serial number + offset> 177 res_id_list.append(f"{res_name}{res_sn}-{atom_sn}") 178 else: # get all atoms 179 if residue.get_resname() != "HOH": 180 atom_coord_list.append( 181 atom.get_vector()[:] 182 ) # convert vector object into list of numbers 183 atom_sn = atom.get_serial_number() 184 185 # biopython PDB module residue.__repr__() returns a string representing the residue: 186 # e.g. '<Residue THR het= resseq=1 icode= >' 187 # 'resseq' is the residue serial number 188 # --> split the string until the number after 'resseq=' is all that remains 189 res_repr = residue.__repr__() 190 res_repr_tmp = res_repr.split("resseq=", 1)[1] 191 res_sn = ( 192 int(res_repr_tmp.split(" ", 1)[0]) + offset 193 ) # get residue serial number (+ offest) 194 res_name = ( 195 residue.get_resname() 196 ) # get first letters of residue name 197 hydro_list.append( 198 get_hydrophobicity(res_name) 199 ) # get hydrophobicity based on residue name 200 201 # store residue ID: <first letter of residue name><residue serial number + offset> 202 res_id_list.append(f"{res_name}{res_sn}-{atom_sn}") 203 204 # create dataframes for atom coordinates, serial numbers, and then combine them. 205 # there's probably a better way to do this in a single line 206 atom_df = pd.DataFrame(atom_coord_list, columns=["x", "y", "z"]) 207 res_id_df = pd.DataFrame(res_id_list, columns=["residue id"]) 208 hydro_df = pd.DataFrame(hydro_list, columns=["hydrophobicity"]) 209 df = pd.concat([atom_df, res_id_df, hydro_df], axis=1) 210 return df
Loads a PDB (or PDBx) file and stores the atom coordinates and residue name and number into a dataframe.
Note: if the PDB file has more than one model, the first model is chosen.
Parameters
- pdb_code (str): PDB ID / label for the protein of interest.
- fname (str): Filename for the protein of interest. Can be PDB or PDBx format.
- pdbx (int): Set to 1 if using the newer PDBx file format.
- offset (int): Index offset in case the first residue ID in PDB file is not the first physical residue (e.g. PDB starts at 5th residue).
- CA_only (int, optional): Set to 1 [default] if using only alpha carbons, else all atoms are used.
Returns
- pandas.DataFrame: Dataframe containing every atom's x, y, z coordinates and serial number.
213def PDB_df_to_G(PDB_df, d_cut=8.0): 214 """ 215 Converts a dataframe containing alpha carbon / atom coordinates (in Angstroms) into a graph, G(V,E). 216 217 Each vertex, V, is an alpha carbon / atom. Two alpha carbons / atoms with a distance (in Angstroms) less than a cutoff, d_cut, are connected by an edge, E. 218 219 Parameters 220 ---------- 221 PDB_df : pandas.DataFrame 222 A dataframe containing alpha carbon / atom coordinate columns labeled: 'x', 'y', and 'z'. 223 d_cut : float, optional 224 Threshold for two alpha carbons / atoms to be connected (in Angstroms) by an edge. Defaults to 8.0. 225 226 Returns 227 ------- 228 networkx.Graph 229 Protein structure network graph, G(V,E). 230 """ 231 # create distance matrix where element i,j is the Euclidean distance between vertex i and vertex j 232 d_matrix = squareform(pdist(PDB_df[["x", "y", "z"]], "euclid")) 233 d_matrix_thresh = np.where( 234 d_matrix > d_cut, 0, d_matrix 235 ) # if the distance is > d_cut, replace it with 0 (i.e. remove edge) 236 G = nx.convert_matrix.from_numpy_array( 237 d_matrix_thresh 238 ) # convert distance matrix to networkx graph object 239 240 # create mapping to relabel nodes from index (i.e. 0...n-1) to residue id (i.e. str(<resname><res s/n + offset>) for each residue) 241 label_mapping = {} 242 hydro_mapping = {} 243 for node in G.nodes(): # here the node value is an index (0-n-1) 244 label_mapping[node] = PDB_df["residue id"][node] 245 hydro_mapping[node] = PDB_df["hydrophobicity"][node] 246 247 betweenness = nx.betweenness_centrality(G) 248 nx.set_node_attributes(G, betweenness, "betweenness") 249 nx.set_node_attributes(G, hydro_mapping, "hydrophobicity") 250 H = nx.relabel_nodes( 251 G, label_mapping 252 ) # update the nodes and store as new graph object 253 return H
Converts a dataframe containing alpha carbon / atom coordinates (in Angstroms) into a graph, G(V,E).
Each vertex, V, is an alpha carbon / atom. Two alpha carbons / atoms with a distance (in Angstroms) less than a cutoff, d_cut, are connected by an edge, E.
Parameters
- PDB_df (pandas.DataFrame): A dataframe containing alpha carbon / atom coordinate columns labeled: 'x', 'y', and 'z'.
- d_cut (float, optional): Threshold for two alpha carbons / atoms to be connected (in Angstroms) by an edge. Defaults to 8.0.
Returns
- networkx.Graph: Protein structure network graph, G(V,E).
256def save_data(df, G, df_name, G_name): 257 """ 258 Convenience function that stores dataframe as .csv and graph as .gexf file. 259 260 Parameters 261 ---------- 262 df : pandas.DataFrame 263 Dataframe to save. 264 G : networkx.Graph 265 Graph to save. 266 df_name : str 267 Output filename for dataframe .csv. 268 G_name : str 269 Output filename for graph .gexf. 270 """ 271 df.to_csv(f"{df_name}.csv") # write dataframe to .csv 272 nx.write_gexf(G, f"{G_name}.gexf") # write graph to .gexf (graph network XML file)
Convenience function that stores dataframe as .csv and graph as .gexf file.
Parameters
- df (pandas.DataFrame): Dataframe to save.
- G (networkx.Graph): Graph to save.
- df_name (str): Output filename for dataframe .csv.
- G_name (str): Output filename for graph .gexf.
275def save_data_at_this_folder(data_path, df, G, df_name, G_name): 276 """ 277 Convenience function that stores dataframe as .csv and graph as .gexf file. 278 279 Parameters 280 ---------- 281 data_path : str or Path 282 Output directory path. 283 df : pandas.DataFrame 284 Dataframe to save. 285 G : networkx.Graph 286 Graph to save. 287 df_name : str 288 Output filename for dataframe .csv. 289 G_name : str 290 Output filename for graph .gexf. 291 """ 292 df_name = Path(data_path, df_name) 293 df.to_csv(f"{df_name}.csv") # write dataframe to .csv 294 G_name = Path(data_path, G_name) 295 nx.write_gexf(G, f"{G_name}.gexf") # write graph to .gexf (graph network XML file)
Convenience function that stores dataframe as .csv and graph as .gexf file.
Parameters
- data_path (str or Path): Output directory path.
- df (pandas.DataFrame): Dataframe to save.
- G (networkx.Graph): Graph to save.
- df_name (str): Output filename for dataframe .csv.
- G_name (str): Output filename for graph .gexf.
298def plot_coordinates(xyz_data, figsize=5): 299 """ 300 Creates a 3D scatter plot containing the xyz data. 301 302 Parameters 303 ---------- 304 xyz_data : np.ndarray 305 A[0] = [x0, y0, z0]. 306 figsize : int, optional 307 Size of figure (figsize x figsize). Defaults to 5. 308 309 Returns 310 ------- 311 matplotlib.pyplot.Figure 312 3D scatter plot figure. 313 """ 314 fig = plt.figure(figsize=(figsize, figsize)) 315 plt.title("x y z coordinates") 316 ax = fig.add_subplot(projection="3d") 317 ax.scatter( 318 xyz_data[:, 0], xyz_data[:, 1], xyz_data[:, 2], c="purple", s=2, alpha=0.3 319 ) 320 return fig
Creates a 3D scatter plot containing the xyz data.
Parameters
- xyz_data (np.ndarray): A[0] = [x0, y0, z0].
- figsize (int, optional): Size of figure (figsize x figsize). Defaults to 5.
Returns
- matplotlib.pyplot.Figure: 3D scatter plot figure.
323def plot_FA_and_CA_coordinates(FA_xyz, CA_xyz, figsize=5): 324 """ 325 Creates a 3D scatter plot containing CA and FA atom coordinates. 326 327 Parameters 328 ---------- 329 FA_xyz : np.ndarray 330 A[0] = [x0, y0, z0] for all atom coordinate data. 331 CA_xyz : np.ndarray 332 A[0] = [x0, y0, z0] for alpha carbon only coordinate data. 333 figsize : int, optional 334 Size of figure (figsize x figsize). Defaults to 5. 335 336 Returns 337 ------- 338 matplotlib.pyplot.Figure 339 3D scatter plot figure. 340 """ 341 fig = plt.figure(figsize=(figsize, figsize)) 342 343 plt.title("x y z coordinates - FA and CA") 344 ax = fig.add_subplot(projection="3d") 345 ax.scatter( 346 FA_xyz[:, 0], FA_xyz[:, 1], FA_xyz[:, 2], c="purple", s=5, alpha=0.3, label="FA" 347 ) 348 ax.scatter( 349 CA_xyz[:, 0], CA_xyz[:, 1], CA_xyz[:, 2], c="black", s=15, alpha=0.5, label="CA" 350 ) 351 ax.legend() 352 return fig
Creates a 3D scatter plot containing CA and FA atom coordinates.
Parameters
- FA_xyz (np.ndarray): A[0] = [x0, y0, z0] for all atom coordinate data.
- CA_xyz (np.ndarray): A[0] = [x0, y0, z0] for alpha carbon only coordinate data.
- figsize (int, optional): Size of figure (figsize x figsize). Defaults to 5.
Returns
- matplotlib.pyplot.Figure: 3D scatter plot figure.
355def main(args): 356 """ 357 Takes a .pdb(x) file, converts it into a graph, and saves the atom coordinates to .csv and graph as .gexf. 358 359 Parameters 360 ---------- 361 args : argparse.Namespace 362 - pdb_code : str 363 PDB id / protein name. 364 - fname : str 365 PDB/PDBx filename. 366 - d_cut : float 367 Alpha Carbon / atom pairwise contact distance cutoff (in Angstroms). 368 - o : int 369 PDB residue index offset integer. Default is 0. 370 - pdbx : int 371 Set=1 to use pdbx file parser. 372 - CA_only : int 373 Set=1 to use only alpha carbons (0 for all atoms 374 """ 375 # get arguments 376 pdb_code = args.pdb_code 377 fname = args.fname 378 d_cut = args.d_cut 379 o = args.o 380 pdbx = args.pdbx 381 CA_only = args.CA_only 382 383 df = PDB_to_df( 384 pdb_code, fname, pdbx, o, CA_only 385 ) # convert pdb file into dataframe of atom/alpha carbon coordinates and s/n 386 G = PDB_df_to_G(df, d_cut) # convert coordinate dataframe into network graph 387 save_data( 388 df, G, pdb_code, pdb_code 389 ) # save dataframe as .csv and graph as .gexf using the same name as the pdb code
Takes a .pdb(x) file, converts it into a graph, and saves the atom coordinates to .csv and graph as .gexf.
Parameters
- args (argparse.Namespace):
- pdb_code : str PDB id / protein name.
- fname : str PDB/PDBx filename.
- d_cut : float Alpha Carbon / atom pairwise contact distance cutoff (in Angstroms).
- o : int PDB residue index offset integer. Default is 0.
- pdbx : int Set=1 to use pdbx file parser.
- CA_only : int Set=1 to use only alpha carbons (0 for all atoms