griptomo.core.pdb2graph

  1# August George, 2022, PNNL
  2
  3import numpy as np
  4import pandas as pd
  5import networkx as nx
  6from Bio.PDB import (
  7    PDBParser,
  8    MMCIFParser,
  9)  # BioPython PDB doc: https://biopython.org/docs/1.75/api/Bio.PDB.html
 10from scipy.spatial.distance import pdist, squareform
 11import argparse
 12import matplotlib.pyplot as plt
 13from pathlib import Path
 14
 15# for convenience
 16import warnings
 17
 18warnings.filterwarnings("ignore")
 19
 20
 21def get_hydrophobicity(name, warn=False):
 22    """
 23    Gets hydrophobicity based on amino acid name.
 24
 25    Parameters
 26    ----------
 27    name : str
 28        Amino acid name.
 29    warn : bool, optional
 30        If True, print a warning when hydrophobicity can't be determined. Default is False.
 31
 32    Returns
 33    -------
 34    float
 35        Hydrophobicity value.
 36
 37    Notes
 38    -----
 39    Returns NaN if the amino acid name is not recognized.
 40
 41    Ref: https://www.cgl.ucsf.edu/chimera/docs/UsersGuide/midas/hydrophob.html
 42    """
 43    acid_names = [
 44        "Ile",
 45        "Val",
 46        "Leu",
 47        "Phe",
 48        "Cys",
 49        "Met",
 50        "Ala",
 51        "Gly",
 52        "Thr",
 53        "Ser",
 54        "Trp",
 55        "Tyr",
 56        "Pro",
 57        "His",
 58        "Glu",
 59        "Gln",
 60        "Asp",
 61        "Asn",
 62        "Lys",
 63        "Arg",
 64    ]
 65    acid_kdH = [
 66        4.5,
 67        4.2,
 68        3.8,
 69        2.8,
 70        2.5,
 71        1.9,
 72        1.8,
 73        -0.4,
 74        -0.7,
 75        -0.8,
 76        -0.9,
 77        -1.3,
 78        -1.6,
 79        -3.2,
 80        -3.5,
 81        -3.5,
 82        -3.5,
 83        -3.5,
 84        -3.9,
 85        -4.5,
 86    ]
 87    hydro_dict = dict(zip(acid_names, acid_kdH))
 88    try:
 89        hydrophobicity = hydro_dict[
 90            f"{name.title()}"
 91        ]  # make sure string is in correct format
 92    except:
 93        hydrophobicity = np.nan
 94        if warn == True:
 95            print(
 96                f"warning! could not assign hydrophobicity for {name.title()}. setting hydrophobicity=NaN"
 97            )
 98    return hydrophobicity
 99
100
101def PDB_to_df(pdb_code, fname, pdbx, offset, CA_only=1):
102    """
103    Loads a PDB (or PDBx) file and stores the atom coordinates and residue name and number into a dataframe.
104
105    Note: if the PDB file has more than one model, the first model is chosen.
106
107    Parameters
108    ----------
109    pdb_code : str
110        PDB ID / label for the protein of interest.
111    fname : str
112        Filename for the protein of interest. Can be PDB or PDBx format.
113    pdbx : int
114        Set to 1 if using the newer PDBx file format.
115    offset : int
116        Index offset in case the first residue ID in PDB file is not the first physical residue (e.g. PDB starts at 5th residue).
117    CA_only : int, optional
118        Set to 1 [default] if using only alpha carbons, else all atoms are used.
119
120    Returns
121    -------
122    pandas.DataFrame
123        Dataframe containing every atom's x, y, z coordinates and serial number.
124    """
125    # pick a file reader based on the filetype. Newer/bigger structures will use PDBx
126    if pdbx == 1:
127        parser = MMCIFParser()  # for PDBx files (.CIF)
128    else:
129        parser = PDBParser()  # for PDB files (.PDB)
130    structure = parser.get_structure(
131        pdb_code, fname
132    )  # parse pdb file and store as a PDB structure object
133
134    # makes lists storing the alpha carbon (CA) coordinates and serial number
135    atom_coord_list = []
136    res_id_list = []
137    hydro_list = []
138
139    # loop through PDB structure object to get alpha carbon (or all atom) coordinates and serial number
140    for i, model in enumerate(
141        structure
142    ):  # iterate through each model in the structure, exit loop after first model
143        if (
144            i > 0
145        ):  # NMR files have multiple models, but only one model is used for the graph
146            break
147        for chain in model:  # iterate through each chain in the model
148            for residue in chain:  # iterate through each residue in the chain
149                for atom in residue:  # iterate through each atom in residue
150                    if CA_only == 1:
151                        if (
152                            atom.get_name() == "CA"
153                        ):  # if atom is an alpha carbon then append its coordinates and serial number
154                            atom_coord_list.append(
155                                atom.get_vector()[:]
156                            )  # convert vector object into list of numbers
157                            atom_sn = atom.get_serial_number()
158
159                            # biopython PDB module residue.__repr__() returns a string representing the residue:
160                            # e.g. '<Residue THR het=  resseq=1 icode= >'
161                            # 'resseq' is the residue serial number
162                            # --> split the string until the number after 'resseq=' is all that remains
163                            res_repr = residue.__repr__()
164                            res_repr_tmp = res_repr.split("resseq=", 1)[1]
165                            res_sn = (
166                                int(res_repr_tmp.split(" ", 1)[0]) + offset
167                            )  # get residue serial number (+ offest)
168                            res_name = (
169                                residue.get_resname()
170                            )  # get first letters of residue name
171                            hydro_list.append(
172                                get_hydrophobicity(res_name)
173                            )  # get hydrophobicity based on residue name
174
175                            # store residue ID: <first letter of residue name><residue serial number + offset>
176                            res_id_list.append(f"{res_name}{res_sn}-{atom_sn}")
177                    else:  # get all atoms
178                        if residue.get_resname() != "HOH":
179                            atom_coord_list.append(
180                                atom.get_vector()[:]
181                            )  # convert vector object into list of numbers
182                            atom_sn = atom.get_serial_number()
183
184                            # biopython PDB module residue.__repr__() returns a string representing the residue:
185                            # e.g. '<Residue THR het=  resseq=1 icode= >'
186                            # 'resseq' is the residue serial number
187                            # --> split the string until the number after 'resseq=' is all that remains
188                            res_repr = residue.__repr__()
189                            res_repr_tmp = res_repr.split("resseq=", 1)[1]
190                            res_sn = (
191                                int(res_repr_tmp.split(" ", 1)[0]) + offset
192                            )  # get residue serial number (+ offest)
193                            res_name = (
194                                residue.get_resname()
195                            )  # get first letters of residue name
196                            hydro_list.append(
197                                get_hydrophobicity(res_name)
198                            )  # get hydrophobicity based on residue name
199
200                            # store residue ID: <first letter of residue name><residue serial number + offset>
201                            res_id_list.append(f"{res_name}{res_sn}-{atom_sn}")
202
203    # create dataframes for atom coordinates, serial numbers, and then combine them.
204    # there's probably a better way to do this in a single line
205    atom_df = pd.DataFrame(atom_coord_list, columns=["x", "y", "z"])
206    res_id_df = pd.DataFrame(res_id_list, columns=["residue id"])
207    hydro_df = pd.DataFrame(hydro_list, columns=["hydrophobicity"])
208    df = pd.concat([atom_df, res_id_df, hydro_df], axis=1)
209    return df
210
211
212def PDB_df_to_G(PDB_df, d_cut=8.0):
213    """
214    Converts a dataframe containing alpha carbon / atom coordinates (in Angstroms) into a graph, G(V,E).
215
216    Each vertex, V, is an alpha carbon / atom. Two alpha carbons / atoms with a distance (in Angstroms) less than a cutoff, d_cut, are connected by an edge, E.
217
218    Parameters
219    ----------
220    PDB_df : pandas.DataFrame
221        A dataframe containing alpha carbon / atom coordinate columns labeled: 'x', 'y', and 'z'.
222    d_cut : float, optional
223        Threshold for two alpha carbons / atoms to be connected (in Angstroms) by an edge. Defaults to 8.0.
224
225    Returns
226    -------
227    networkx.Graph
228        Protein structure network graph, G(V,E).
229    """
230    # create distance matrix where element i,j is the Euclidean distance between vertex i and vertex j
231    d_matrix = squareform(pdist(PDB_df[["x", "y", "z"]], "euclid"))
232    d_matrix_thresh = np.where(
233        d_matrix > d_cut, 0, d_matrix
234    )  # if the distance is > d_cut, replace it with 0 (i.e. remove edge)
235    G = nx.convert_matrix.from_numpy_array(
236        d_matrix_thresh
237    )  # convert distance matrix to networkx graph object
238
239    # create mapping to relabel nodes from index (i.e. 0...n-1) to residue id (i.e. str(<resname><res s/n + offset>) for each residue)
240    label_mapping = {}
241    hydro_mapping = {}
242    for node in G.nodes():  # here the node value is an index (0-n-1)
243        label_mapping[node] = PDB_df["residue id"][node]
244        hydro_mapping[node] = PDB_df["hydrophobicity"][node]
245
246    betweenness = nx.betweenness_centrality(G)
247    nx.set_node_attributes(G, betweenness, "betweenness")
248    nx.set_node_attributes(G, hydro_mapping, "hydrophobicity")
249    H = nx.relabel_nodes(
250        G, label_mapping
251    )  # update the nodes and store as new graph object
252    return H
253
254
255def save_data(df, G, df_name, G_name):
256    """
257    Convenience function that stores dataframe as .csv and graph as .gexf file.
258
259    Parameters
260    ----------
261    df : pandas.DataFrame
262        Dataframe to save.
263    G : networkx.Graph
264        Graph to save.
265    df_name : str
266        Output filename for dataframe .csv.
267    G_name : str
268        Output filename for graph .gexf.
269    """
270    df.to_csv(f"{df_name}.csv")  # write dataframe to .csv
271    nx.write_gexf(G, f"{G_name}.gexf")  # write graph to .gexf (graph network XML file)
272
273
274def save_data_at_this_folder(data_path, df, G, df_name, G_name):
275    """
276    Convenience function that stores dataframe as .csv and graph as .gexf file.
277
278    Parameters
279    ----------
280    data_path : str or Path
281        Output directory path.
282    df : pandas.DataFrame
283        Dataframe to save.
284    G : networkx.Graph
285        Graph to save.
286    df_name : str
287        Output filename for dataframe .csv.
288    G_name : str
289        Output filename for graph .gexf.
290    """
291    df_name = Path(data_path, df_name)
292    df.to_csv(f"{df_name}.csv")  # write dataframe to .csv
293    G_name = Path(data_path, G_name)
294    nx.write_gexf(G, f"{G_name}.gexf")  # write graph to .gexf (graph network XML file)
295
296
297def plot_coordinates(xyz_data, figsize=5):
298    """
299    Creates a 3D scatter plot containing the xyz data.
300
301    Parameters
302    ----------
303    xyz_data : np.ndarray
304        A[0] = [x0, y0, z0].
305    figsize : int, optional
306        Size of figure (figsize x figsize). Defaults to 5.
307
308    Returns
309    -------
310    matplotlib.pyplot.Figure
311        3D scatter plot figure.
312    """
313    fig = plt.figure(figsize=(figsize, figsize))
314    plt.title("x y z coordinates")
315    ax = fig.add_subplot(projection="3d")
316    ax.scatter(
317        xyz_data[:, 0], xyz_data[:, 1], xyz_data[:, 2], c="purple", s=2, alpha=0.3
318    )
319    return fig
320
321
322def plot_FA_and_CA_coordinates(FA_xyz, CA_xyz, figsize=5):
323    """
324    Creates a 3D scatter plot containing CA and FA atom coordinates.
325
326    Parameters
327    ----------
328    FA_xyz : np.ndarray
329        A[0] = [x0, y0, z0] for all atom coordinate data.
330    CA_xyz : np.ndarray
331        A[0] = [x0, y0, z0] for alpha carbon only coordinate data.
332    figsize : int, optional
333        Size of figure (figsize x figsize). Defaults to 5.
334
335    Returns
336    -------
337    matplotlib.pyplot.Figure
338        3D scatter plot figure.
339    """
340    fig = plt.figure(figsize=(figsize, figsize))
341
342    plt.title("x y z coordinates - FA and CA")
343    ax = fig.add_subplot(projection="3d")
344    ax.scatter(
345        FA_xyz[:, 0], FA_xyz[:, 1], FA_xyz[:, 2], c="purple", s=5, alpha=0.3, label="FA"
346    )
347    ax.scatter(
348        CA_xyz[:, 0], CA_xyz[:, 1], CA_xyz[:, 2], c="black", s=15, alpha=0.5, label="CA"
349    )
350    ax.legend()
351    return fig
352
353
354def main(args):
355    """
356    Takes a .pdb(x) file, converts it into a graph, and saves the atom coordinates to .csv and graph as .gexf.
357
358    Parameters
359    ----------
360    args : argparse.Namespace
361        - pdb_code : str
362            PDB id / protein name.
363        - fname : str
364            PDB/PDBx filename.
365        - d_cut : float
366            Alpha Carbon / atom pairwise contact distance cutoff (in Angstroms).
367        - o : int
368            PDB residue index offset integer. Default is 0.
369        - pdbx : int
370            Set=1 to use pdbx file parser.
371        - CA_only : int
372            Set=1 to use only alpha carbons (0 for all atoms
373    """
374    # get arguments
375    pdb_code = args.pdb_code
376    fname = args.fname
377    d_cut = args.d_cut
378    o = args.o
379    pdbx = args.pdbx
380    CA_only = args.CA_only
381
382    df = PDB_to_df(
383        pdb_code, fname, pdbx, o, CA_only
384    )  # convert pdb file into dataframe of atom/alpha carbon coordinates and s/n
385    G = PDB_df_to_G(df, d_cut)  # convert coordinate dataframe into network graph
386    save_data(
387        df, G, pdb_code, pdb_code
388    )  # save dataframe as .csv and graph as .gexf using the same name as the pdb code
389
390
391if __name__ == "__main__":
392    # example: >> python pdb2graph.py PDBid pdb_file.pdb 8 0 0 1
393    # get input arguments from command line
394    parser = argparse.ArgumentParser()
395    parser.add_argument("pdb_code", help="PDB id / protein name", type=str)
396    parser.add_argument("fname", help="PDB/PDBx filename", type=str)
397    parser.add_argument(
398        "d_cut",
399        help="Alpha Carbon / atom pairwise contact distance cutoff (in Angstroms)",
400        type=float,
401    )
402    parser.add_argument(
403        "o", help="PDB residue index offset integer. Default is 0.", type=int
404    )
405    parser.add_argument("pdbx", help="set=1 to use pdbx file parser", type=int)
406    parser.add_argument(
407        "CA_only", help="set=1 to use only alpha carbons (0 for all atoms)", type=int
408    )
409    args = parser.parse_args()
410    main(args)
def get_hydrophobicity(name, warn=False):
22def get_hydrophobicity(name, warn=False):
23    """
24    Gets hydrophobicity based on amino acid name.
25
26    Parameters
27    ----------
28    name : str
29        Amino acid name.
30    warn : bool, optional
31        If True, print a warning when hydrophobicity can't be determined. Default is False.
32
33    Returns
34    -------
35    float
36        Hydrophobicity value.
37
38    Notes
39    -----
40    Returns NaN if the amino acid name is not recognized.
41
42    Ref: https://www.cgl.ucsf.edu/chimera/docs/UsersGuide/midas/hydrophob.html
43    """
44    acid_names = [
45        "Ile",
46        "Val",
47        "Leu",
48        "Phe",
49        "Cys",
50        "Met",
51        "Ala",
52        "Gly",
53        "Thr",
54        "Ser",
55        "Trp",
56        "Tyr",
57        "Pro",
58        "His",
59        "Glu",
60        "Gln",
61        "Asp",
62        "Asn",
63        "Lys",
64        "Arg",
65    ]
66    acid_kdH = [
67        4.5,
68        4.2,
69        3.8,
70        2.8,
71        2.5,
72        1.9,
73        1.8,
74        -0.4,
75        -0.7,
76        -0.8,
77        -0.9,
78        -1.3,
79        -1.6,
80        -3.2,
81        -3.5,
82        -3.5,
83        -3.5,
84        -3.5,
85        -3.9,
86        -4.5,
87    ]
88    hydro_dict = dict(zip(acid_names, acid_kdH))
89    try:
90        hydrophobicity = hydro_dict[
91            f"{name.title()}"
92        ]  # make sure string is in correct format
93    except:
94        hydrophobicity = np.nan
95        if warn == True:
96            print(
97                f"warning! could not assign hydrophobicity for {name.title()}. setting hydrophobicity=NaN"
98            )
99    return hydrophobicity

Gets hydrophobicity based on amino acid name.

Parameters
  • name (str): Amino acid name.
  • warn (bool, optional): If True, print a warning when hydrophobicity can't be determined. Default is False.
Returns
  • float: Hydrophobicity value.
Notes

Returns NaN if the amino acid name is not recognized.

Ref: https://www.cgl.ucsf.edu/chimera/docs/UsersGuide/midas/hydrophob.html

def PDB_to_df(pdb_code, fname, pdbx, offset, CA_only=1):
102def PDB_to_df(pdb_code, fname, pdbx, offset, CA_only=1):
103    """
104    Loads a PDB (or PDBx) file and stores the atom coordinates and residue name and number into a dataframe.
105
106    Note: if the PDB file has more than one model, the first model is chosen.
107
108    Parameters
109    ----------
110    pdb_code : str
111        PDB ID / label for the protein of interest.
112    fname : str
113        Filename for the protein of interest. Can be PDB or PDBx format.
114    pdbx : int
115        Set to 1 if using the newer PDBx file format.
116    offset : int
117        Index offset in case the first residue ID in PDB file is not the first physical residue (e.g. PDB starts at 5th residue).
118    CA_only : int, optional
119        Set to 1 [default] if using only alpha carbons, else all atoms are used.
120
121    Returns
122    -------
123    pandas.DataFrame
124        Dataframe containing every atom's x, y, z coordinates and serial number.
125    """
126    # pick a file reader based on the filetype. Newer/bigger structures will use PDBx
127    if pdbx == 1:
128        parser = MMCIFParser()  # for PDBx files (.CIF)
129    else:
130        parser = PDBParser()  # for PDB files (.PDB)
131    structure = parser.get_structure(
132        pdb_code, fname
133    )  # parse pdb file and store as a PDB structure object
134
135    # makes lists storing the alpha carbon (CA) coordinates and serial number
136    atom_coord_list = []
137    res_id_list = []
138    hydro_list = []
139
140    # loop through PDB structure object to get alpha carbon (or all atom) coordinates and serial number
141    for i, model in enumerate(
142        structure
143    ):  # iterate through each model in the structure, exit loop after first model
144        if (
145            i > 0
146        ):  # NMR files have multiple models, but only one model is used for the graph
147            break
148        for chain in model:  # iterate through each chain in the model
149            for residue in chain:  # iterate through each residue in the chain
150                for atom in residue:  # iterate through each atom in residue
151                    if CA_only == 1:
152                        if (
153                            atom.get_name() == "CA"
154                        ):  # if atom is an alpha carbon then append its coordinates and serial number
155                            atom_coord_list.append(
156                                atom.get_vector()[:]
157                            )  # convert vector object into list of numbers
158                            atom_sn = atom.get_serial_number()
159
160                            # biopython PDB module residue.__repr__() returns a string representing the residue:
161                            # e.g. '<Residue THR het=  resseq=1 icode= >'
162                            # 'resseq' is the residue serial number
163                            # --> split the string until the number after 'resseq=' is all that remains
164                            res_repr = residue.__repr__()
165                            res_repr_tmp = res_repr.split("resseq=", 1)[1]
166                            res_sn = (
167                                int(res_repr_tmp.split(" ", 1)[0]) + offset
168                            )  # get residue serial number (+ offest)
169                            res_name = (
170                                residue.get_resname()
171                            )  # get first letters of residue name
172                            hydro_list.append(
173                                get_hydrophobicity(res_name)
174                            )  # get hydrophobicity based on residue name
175
176                            # store residue ID: <first letter of residue name><residue serial number + offset>
177                            res_id_list.append(f"{res_name}{res_sn}-{atom_sn}")
178                    else:  # get all atoms
179                        if residue.get_resname() != "HOH":
180                            atom_coord_list.append(
181                                atom.get_vector()[:]
182                            )  # convert vector object into list of numbers
183                            atom_sn = atom.get_serial_number()
184
185                            # biopython PDB module residue.__repr__() returns a string representing the residue:
186                            # e.g. '<Residue THR het=  resseq=1 icode= >'
187                            # 'resseq' is the residue serial number
188                            # --> split the string until the number after 'resseq=' is all that remains
189                            res_repr = residue.__repr__()
190                            res_repr_tmp = res_repr.split("resseq=", 1)[1]
191                            res_sn = (
192                                int(res_repr_tmp.split(" ", 1)[0]) + offset
193                            )  # get residue serial number (+ offest)
194                            res_name = (
195                                residue.get_resname()
196                            )  # get first letters of residue name
197                            hydro_list.append(
198                                get_hydrophobicity(res_name)
199                            )  # get hydrophobicity based on residue name
200
201                            # store residue ID: <first letter of residue name><residue serial number + offset>
202                            res_id_list.append(f"{res_name}{res_sn}-{atom_sn}")
203
204    # create dataframes for atom coordinates, serial numbers, and then combine them.
205    # there's probably a better way to do this in a single line
206    atom_df = pd.DataFrame(atom_coord_list, columns=["x", "y", "z"])
207    res_id_df = pd.DataFrame(res_id_list, columns=["residue id"])
208    hydro_df = pd.DataFrame(hydro_list, columns=["hydrophobicity"])
209    df = pd.concat([atom_df, res_id_df, hydro_df], axis=1)
210    return df

Loads a PDB (or PDBx) file and stores the atom coordinates and residue name and number into a dataframe.

Note: if the PDB file has more than one model, the first model is chosen.

Parameters
  • pdb_code (str): PDB ID / label for the protein of interest.
  • fname (str): Filename for the protein of interest. Can be PDB or PDBx format.
  • pdbx (int): Set to 1 if using the newer PDBx file format.
  • offset (int): Index offset in case the first residue ID in PDB file is not the first physical residue (e.g. PDB starts at 5th residue).
  • CA_only (int, optional): Set to 1 [default] if using only alpha carbons, else all atoms are used.
Returns
  • pandas.DataFrame: Dataframe containing every atom's x, y, z coordinates and serial number.
def PDB_df_to_G(PDB_df, d_cut=8.0):
213def PDB_df_to_G(PDB_df, d_cut=8.0):
214    """
215    Converts a dataframe containing alpha carbon / atom coordinates (in Angstroms) into a graph, G(V,E).
216
217    Each vertex, V, is an alpha carbon / atom. Two alpha carbons / atoms with a distance (in Angstroms) less than a cutoff, d_cut, are connected by an edge, E.
218
219    Parameters
220    ----------
221    PDB_df : pandas.DataFrame
222        A dataframe containing alpha carbon / atom coordinate columns labeled: 'x', 'y', and 'z'.
223    d_cut : float, optional
224        Threshold for two alpha carbons / atoms to be connected (in Angstroms) by an edge. Defaults to 8.0.
225
226    Returns
227    -------
228    networkx.Graph
229        Protein structure network graph, G(V,E).
230    """
231    # create distance matrix where element i,j is the Euclidean distance between vertex i and vertex j
232    d_matrix = squareform(pdist(PDB_df[["x", "y", "z"]], "euclid"))
233    d_matrix_thresh = np.where(
234        d_matrix > d_cut, 0, d_matrix
235    )  # if the distance is > d_cut, replace it with 0 (i.e. remove edge)
236    G = nx.convert_matrix.from_numpy_array(
237        d_matrix_thresh
238    )  # convert distance matrix to networkx graph object
239
240    # create mapping to relabel nodes from index (i.e. 0...n-1) to residue id (i.e. str(<resname><res s/n + offset>) for each residue)
241    label_mapping = {}
242    hydro_mapping = {}
243    for node in G.nodes():  # here the node value is an index (0-n-1)
244        label_mapping[node] = PDB_df["residue id"][node]
245        hydro_mapping[node] = PDB_df["hydrophobicity"][node]
246
247    betweenness = nx.betweenness_centrality(G)
248    nx.set_node_attributes(G, betweenness, "betweenness")
249    nx.set_node_attributes(G, hydro_mapping, "hydrophobicity")
250    H = nx.relabel_nodes(
251        G, label_mapping
252    )  # update the nodes and store as new graph object
253    return H

Converts a dataframe containing alpha carbon / atom coordinates (in Angstroms) into a graph, G(V,E).

Each vertex, V, is an alpha carbon / atom. Two alpha carbons / atoms with a distance (in Angstroms) less than a cutoff, d_cut, are connected by an edge, E.

Parameters
  • PDB_df (pandas.DataFrame): A dataframe containing alpha carbon / atom coordinate columns labeled: 'x', 'y', and 'z'.
  • d_cut (float, optional): Threshold for two alpha carbons / atoms to be connected (in Angstroms) by an edge. Defaults to 8.0.
Returns
  • networkx.Graph: Protein structure network graph, G(V,E).
def save_data(df, G, df_name, G_name):
256def save_data(df, G, df_name, G_name):
257    """
258    Convenience function that stores dataframe as .csv and graph as .gexf file.
259
260    Parameters
261    ----------
262    df : pandas.DataFrame
263        Dataframe to save.
264    G : networkx.Graph
265        Graph to save.
266    df_name : str
267        Output filename for dataframe .csv.
268    G_name : str
269        Output filename for graph .gexf.
270    """
271    df.to_csv(f"{df_name}.csv")  # write dataframe to .csv
272    nx.write_gexf(G, f"{G_name}.gexf")  # write graph to .gexf (graph network XML file)

Convenience function that stores dataframe as .csv and graph as .gexf file.

Parameters
  • df (pandas.DataFrame): Dataframe to save.
  • G (networkx.Graph): Graph to save.
  • df_name (str): Output filename for dataframe .csv.
  • G_name (str): Output filename for graph .gexf.
def save_data_at_this_folder(data_path, df, G, df_name, G_name):
275def save_data_at_this_folder(data_path, df, G, df_name, G_name):
276    """
277    Convenience function that stores dataframe as .csv and graph as .gexf file.
278
279    Parameters
280    ----------
281    data_path : str or Path
282        Output directory path.
283    df : pandas.DataFrame
284        Dataframe to save.
285    G : networkx.Graph
286        Graph to save.
287    df_name : str
288        Output filename for dataframe .csv.
289    G_name : str
290        Output filename for graph .gexf.
291    """
292    df_name = Path(data_path, df_name)
293    df.to_csv(f"{df_name}.csv")  # write dataframe to .csv
294    G_name = Path(data_path, G_name)
295    nx.write_gexf(G, f"{G_name}.gexf")  # write graph to .gexf (graph network XML file)

Convenience function that stores dataframe as .csv and graph as .gexf file.

Parameters
  • data_path (str or Path): Output directory path.
  • df (pandas.DataFrame): Dataframe to save.
  • G (networkx.Graph): Graph to save.
  • df_name (str): Output filename for dataframe .csv.
  • G_name (str): Output filename for graph .gexf.
def plot_coordinates(xyz_data, figsize=5):
298def plot_coordinates(xyz_data, figsize=5):
299    """
300    Creates a 3D scatter plot containing the xyz data.
301
302    Parameters
303    ----------
304    xyz_data : np.ndarray
305        A[0] = [x0, y0, z0].
306    figsize : int, optional
307        Size of figure (figsize x figsize). Defaults to 5.
308
309    Returns
310    -------
311    matplotlib.pyplot.Figure
312        3D scatter plot figure.
313    """
314    fig = plt.figure(figsize=(figsize, figsize))
315    plt.title("x y z coordinates")
316    ax = fig.add_subplot(projection="3d")
317    ax.scatter(
318        xyz_data[:, 0], xyz_data[:, 1], xyz_data[:, 2], c="purple", s=2, alpha=0.3
319    )
320    return fig

Creates a 3D scatter plot containing the xyz data.

Parameters
  • xyz_data (np.ndarray): A[0] = [x0, y0, z0].
  • figsize (int, optional): Size of figure (figsize x figsize). Defaults to 5.
Returns
  • matplotlib.pyplot.Figure: 3D scatter plot figure.
def plot_FA_and_CA_coordinates(FA_xyz, CA_xyz, figsize=5):
323def plot_FA_and_CA_coordinates(FA_xyz, CA_xyz, figsize=5):
324    """
325    Creates a 3D scatter plot containing CA and FA atom coordinates.
326
327    Parameters
328    ----------
329    FA_xyz : np.ndarray
330        A[0] = [x0, y0, z0] for all atom coordinate data.
331    CA_xyz : np.ndarray
332        A[0] = [x0, y0, z0] for alpha carbon only coordinate data.
333    figsize : int, optional
334        Size of figure (figsize x figsize). Defaults to 5.
335
336    Returns
337    -------
338    matplotlib.pyplot.Figure
339        3D scatter plot figure.
340    """
341    fig = plt.figure(figsize=(figsize, figsize))
342
343    plt.title("x y z coordinates - FA and CA")
344    ax = fig.add_subplot(projection="3d")
345    ax.scatter(
346        FA_xyz[:, 0], FA_xyz[:, 1], FA_xyz[:, 2], c="purple", s=5, alpha=0.3, label="FA"
347    )
348    ax.scatter(
349        CA_xyz[:, 0], CA_xyz[:, 1], CA_xyz[:, 2], c="black", s=15, alpha=0.5, label="CA"
350    )
351    ax.legend()
352    return fig

Creates a 3D scatter plot containing CA and FA atom coordinates.

Parameters
  • FA_xyz (np.ndarray): A[0] = [x0, y0, z0] for all atom coordinate data.
  • CA_xyz (np.ndarray): A[0] = [x0, y0, z0] for alpha carbon only coordinate data.
  • figsize (int, optional): Size of figure (figsize x figsize). Defaults to 5.
Returns
  • matplotlib.pyplot.Figure: 3D scatter plot figure.
def main(args):
355def main(args):
356    """
357    Takes a .pdb(x) file, converts it into a graph, and saves the atom coordinates to .csv and graph as .gexf.
358
359    Parameters
360    ----------
361    args : argparse.Namespace
362        - pdb_code : str
363            PDB id / protein name.
364        - fname : str
365            PDB/PDBx filename.
366        - d_cut : float
367            Alpha Carbon / atom pairwise contact distance cutoff (in Angstroms).
368        - o : int
369            PDB residue index offset integer. Default is 0.
370        - pdbx : int
371            Set=1 to use pdbx file parser.
372        - CA_only : int
373            Set=1 to use only alpha carbons (0 for all atoms
374    """
375    # get arguments
376    pdb_code = args.pdb_code
377    fname = args.fname
378    d_cut = args.d_cut
379    o = args.o
380    pdbx = args.pdbx
381    CA_only = args.CA_only
382
383    df = PDB_to_df(
384        pdb_code, fname, pdbx, o, CA_only
385    )  # convert pdb file into dataframe of atom/alpha carbon coordinates and s/n
386    G = PDB_df_to_G(df, d_cut)  # convert coordinate dataframe into network graph
387    save_data(
388        df, G, pdb_code, pdb_code
389    )  # save dataframe as .csv and graph as .gexf using the same name as the pdb code

Takes a .pdb(x) file, converts it into a graph, and saves the atom coordinates to .csv and graph as .gexf.

Parameters
  • args (argparse.Namespace):
    • pdb_code : str PDB id / protein name.
    • fname : str PDB/PDBx filename.
    • d_cut : float Alpha Carbon / atom pairwise contact distance cutoff (in Angstroms).
    • o : int PDB residue index offset integer. Default is 0.
    • pdbx : int Set=1 to use pdbx file parser.
    • CA_only : int Set=1 to use only alpha carbons (0 for all atoms