griptomo.core.multiple_pdb2graph

View Source

  1import argparse
  2import fnmatch
  3import os, sys, time
  4import multiprocessing
  5import pandas as pd
  6import griptomo.core.pdb2graph as p2g
  7
  8
  9# to check cpu count use multiprocessing.cpu_count()
 10
 11# to check the platform look at the link below
 12# https://docs.python.org/3/library/sys.html#sys.platform
 13
 14"""
 15AIX : 'aix'
 16Linux : 'linux'
 17Windows : 'win32'
 18Windows/Cygwin : 'cygwin'
 19macOS : 'darwin'
 20"""
 21
 22_args = sys.argv[0:]
 23_py_file = _args[0]
 24
 25
 26_code_location = os.path.dirname(os.path.abspath(_py_file))
 27
 28try:
 29    index_of_latest = _code_location.index("griptomo")
 30    scripts_path = os.path.join(_code_location[:index_of_latest], "griptomoml", "core")
 31    sys.path.insert(0, scripts_path)
 32except ValueError:
 33    print(f"Error: 'griptomo' not found in the directory path {_code_location}")
 34
 35
 36_folder_location = _args[1]
 37
 38
 39def show_time(process, time_start, time_end):
 40    """
 41    Calculate and format the time taken for a process.
 42
 43    Parameters
 44    ----------
 45    process : str
 46        Name of the process.
 47    time_start : float
 48        Start time of the process.
 49    time_end : float
 50        End time of the process.
 51
 52    Returns
 53    -------
 54    str
 55        Formatted string showing the time taken for the process.
 56    """
 57    time_took = "\n" + str(process) + " finished in "
 58    if round((time_end - time_start) / 60, 1) < 1:
 59        time_took = time_took + str(round((time_end - time_start), 1)) + " seconds "
 60    elif round((time_end - time_start) / 60 / 60, 1) < 1:
 61        time_took = (
 62            time_took + str(round((time_end - time_start) / 60, 1)) + " minutes "
 63        )
 64    else:
 65        time_took = (
 66            time_took + str(round((time_end - time_start) / 60 / 60, 1)) + " hours "
 67        )
 68    time_took = time_took + "(wall clock)."
 69    return time_took
 70
 71
 72# define the function to call in main
 73def generate_graph(pdb_code, fname, t, o, pdbx, CA_only):
 74    """
 75    Generate a graph from a PDB file.
 76
 77    Parameters
 78    ----------
 79    pdb_code : str
 80        PDB ID / label for the protein of interest.
 81    fname : str
 82        Filename for the protein of interest. Can be PDB or PDBx format.
 83    t : float
 84        Alpha Carbon / atom pairwise contact distance cutoff (in Angstroms).
 85    o : int
 86        Index offset in case the first residue ID in PDB file is not the first physical residue.
 87    pdbx : int
 88        Set to 1 if using the newer PDBx file format.
 89    CA_only : int
 90        Set to 1 if using only alpha carbons, else all atoms are used.
 91
 92    Returns
 93    -------
 94    str
 95        PDB code.
 96    """
 97    df = p2g.PDB_to_df(pdb_code, fname, pdbx, o, CA_only)
 98    G = p2g.PDB_df_to_G(df, t)
 99    p2g.save_data(df, G, pdb_code, pdb_code)
100    return pdb_code
101
102
def pdb2graph_list(t, o, pdbx, CA_only=1):
    """
    Convert multiple PDBs to graphs from a folder of PDBs.

    Every entry of the module-level ``_folder_location`` that matches
    ``*.pdb`` is passed to ``generate_graph`` in a ``multiprocessing.Pool``.
    Each file is labelled ``<file name without extension>_<int(t)>``; the
    cutoff is truncated to an integer in the label only.

    Parameters
    ----------
    t : float
        Alpha Carbon / atom pairwise contact distance cutoff (in Angstroms).
    o : int
        Index offset in case the first residue ID in PDB file is not the first physical residue.
    pdbx : int
        Set to 1 if using the newer PDBx file format.
    CA_only : int, optional
        Set to 1 if using only alpha carbons, else all atoms are used. Default is 1.

    Returns
    -------
    list
        Values returned by ``generate_graph``, one per matching file, in
        sorted file-name order.
    """
    label_suffix = "_" + str(int(t))

    # One argument tuple per file, in the positional order of generate_graph.
    jobs = []
    # os.listdir returns entries in arbitrary order; sort for reproducible output.
    for file_name in sorted(os.listdir(_folder_location)):
        if not fnmatch.fnmatch(file_name, "*.pdb"):
            continue
        # splitext drops only the final extension and does not depend on its
        # case, so names matched case-insensitively (e.g. "X.PDB" on Windows)
        # are not truncated.
        stem = os.path.splitext(file_name)[0]
        jobs.append(
            (
                stem + label_suffix,
                os.path.join(_folder_location, file_name),
                t,
                o,
                pdbx,
                CA_only,
            )
        )

    # The context manager cleans the pool up even when a worker raises.
    with multiprocessing.Pool() as pool:
        results = pool.starmap(generate_graph, jobs)

    print(results)
    return results
139
140
def main(args):
    """
    Main function to process multiple PDB files and convert them to graphs.

    Forwards the parsed options to ``pdb2graph_list``. The folder is not read
    from ``args`` here; ``pdb2graph_list`` uses the module-level
    ``_folder_location``.

    Parameters
    ----------
    args : argparse.Namespace
        Command line arguments. The attributes ``t``, ``o``, ``pdbx`` and
        ``CA_only`` are read.
    """
    pdb2graph_list(args.t, args.o, args.pdbx, args.CA_only)
158
159
if __name__ == "__main__":
    # All five arguments are positional and therefore required, in this order.
    parser = argparse.ArgumentParser()
    parser.add_argument("f", help="Folder name with the pdb files", type=str)
    parser.add_argument(
        "t", help="Alpha Carbon contact distance threshold (in Angstroms)", type=float
    )
    parser.add_argument(
        "o", help="PDB residue index offset integer. Use 0 for no offset.", type=int
    )
    parser.add_argument("pdbx", help="set=1 to use pdbx file parser", type=int)
    parser.add_argument("CA_only", help="set=1 to use only alpha carbons", type=int)
    args = parser.parse_args()

    # Report the wall-clock duration of the whole conversion.
    start_time = time.time()
    main(args)
    print(show_time("pdb to graph", start_time, time.time()))

    # example running: python /griptomo/core/multiple_pdb2graph.py <pdb_folder> 8 0 0 1

    # parameters used for the 1st paper
    # t = 8 # pairwise distance cutoff for assigning edges, in Angstroms
    # o = 0  # residue indexing offset (0 = no offset)
    # pdbx = 0  # using .pdb (0) or .pdbx (1) file format
    # CA_only = 1 # using alpha carbons only

    # ref. With 8 cores at mac,         it took 20 minutes (wall clock) to generate graphs for 64 pdb files
    # ref. With 10 M1 max cores at mac, it took 11 minutes (wall clock) to generate graphs for 100 apoferritin-sized pdb files

def show_time(process, time_start, time_end): View Source

def show_time(process, time_start, time_end):
    """
    Build a human-readable message with the duration of a process.

    The duration is reported in seconds, minutes or hours. The unit is chosen
    from the duration rounded to one decimal: seconds while the rounded
    minutes are below 1, minutes while the rounded hours are below 1, and
    hours otherwise.

    Parameters
    ----------
    process : str
        Name of the process.
    time_start : float
        Start time of the process.
    time_end : float
        End time of the process.

    Returns
    -------
    str
        Message of the form
        "\\n<process> finished in <value> <unit> (wall clock).".
    """
    elapsed = time_end - time_start
    in_minutes = round(elapsed / 60, 1)

    if in_minutes < 1:
        amount, unit = round(elapsed, 1), "seconds"
    elif round(elapsed / 60 / 60, 1) < 1:
        amount, unit = in_minutes, "minutes"
    else:
        amount, unit = round(elapsed / 60 / 60, 1), "hours"

    return f"\n{process!s} finished in {amount!s} {unit} (wall clock)."

Calculate and format the time taken for a process.

Parameters

process (str): Name of the process.
time_start (float): Start time of the process.
time_end (float): End time of the process.

Returns

str: Formatted string showing the time taken for the process.

def generate_graph(pdb_code, fname, t, o, pdbx, CA_only): View Source

 74def generate_graph(pdb_code, fname, t, o, pdbx, CA_only):
 75    """
 76    Generate a graph from a PDB file.
 77
 78    Parameters
 79    ----------
 80    pdb_code : str
 81        PDB ID / label for the protein of interest.
 82    fname : str
 83        Filename for the protein of interest. Can be PDB or PDBx format.
 84    t : float
 85        Alpha Carbon / atom pairwise contact distance cutoff (in Angstroms).
 86    o : int
 87        Index offset in case the first residue ID in PDB file is not the first physical residue.
 88    pdbx : int
 89        Set to 1 if using the newer PDBx file format.
 90    CA_only : int
 91        Set to 1 if using only alpha carbons, else all atoms are used.
 92
 93    Returns
 94    -------
 95    str
 96        PDB code.
 97    """
 98    df = p2g.PDB_to_df(pdb_code, fname, pdbx, o, CA_only)
 99    G = p2g.PDB_df_to_G(df, t)
100    p2g.save_data(df, G, pdb_code, pdb_code)
101    return pdb_code

Generate a graph from a PDB file.

Parameters

pdb_code (str): PDB ID / label for the protein of interest.
fname (str): Filename for the protein of interest. Can be PDB or PDBx format.
t (float): Alpha Carbon / atom pairwise contact distance cutoff (in Angstroms).
o (int): Index offset in case the first residue ID in PDB file is not the first physical residue.
pdbx (int): Set to 1 if using the newer PDBx file format.
CA_only (int): Set to 1 if using only alpha carbons, else all atoms are used.

Returns

str: PDB code.

def pdb2graph_list(t, o, pdbx, CA_only=1): View Source

def pdb2graph_list(t, o, pdbx, CA_only=1):
    """
    Convert multiple PDBs to graphs from a folder of PDBs.

    Every entry of the module-level ``_folder_location`` that matches
    ``*.pdb`` is passed to ``generate_graph`` in a ``multiprocessing.Pool``.
    Each file is labelled ``<file name without extension>_<int(t)>``; the
    cutoff is truncated to an integer in the label only.

    Parameters
    ----------
    t : float
        Alpha Carbon / atom pairwise contact distance cutoff (in Angstroms).
    o : int
        Index offset in case the first residue ID in PDB file is not the first physical residue.
    pdbx : int
        Set to 1 if using the newer PDBx file format.
    CA_only : int, optional
        Set to 1 if using only alpha carbons, else all atoms are used. Default is 1.

    Returns
    -------
    list
        Values returned by ``generate_graph``, one per matching file, in
        sorted file-name order.
    """
    label_suffix = "_" + str(int(t))

    # One argument tuple per file, in the positional order of generate_graph.
    jobs = []
    # os.listdir returns entries in arbitrary order; sort for reproducible output.
    for file_name in sorted(os.listdir(_folder_location)):
        if not fnmatch.fnmatch(file_name, "*.pdb"):
            continue
        # splitext drops only the final extension and does not depend on its
        # case, so names matched case-insensitively (e.g. "X.PDB" on Windows)
        # are not truncated.
        stem = os.path.splitext(file_name)[0]
        jobs.append(
            (
                stem + label_suffix,
                os.path.join(_folder_location, file_name),
                t,
                o,
                pdbx,
                CA_only,
            )
        )

    # The context manager cleans the pool up even when a worker raises.
    with multiprocessing.Pool() as pool:
        results = pool.starmap(generate_graph, jobs)

    print(results)
    return results

Convert multiple PDBs to graphs from a folder of PDBs.

Parameters

t (float): Alpha Carbon / atom pairwise contact distance cutoff (in Angstroms).
o (int): Index offset in case the first residue ID in PDB file is not the first physical residue.
pdbx (int): Set to 1 if using the newer PDBx file format.
CA_only (int, optional): Set to 1 if using only alpha carbons, else all atoms are used. Default is 1.

Returns

list: List of results from the graph generation process.

def main(args): View Source

def main(args):
    """
    Main function to process multiple PDB files and convert them to graphs.

    Forwards the parsed options to ``pdb2graph_list``. The folder is not read
    from ``args`` here; ``pdb2graph_list`` uses the module-level
    ``_folder_location``.

    Parameters
    ----------
    args : argparse.Namespace
        Command line arguments. The attributes ``t``, ``o``, ``pdbx`` and
        ``CA_only`` are read.
    """
    pdb2graph_list(args.t, args.o, args.pdbx, args.CA_only)

Main function to process multiple PDB files and convert them to graphs.

Parameters

args (argparse.Namespace): Command line arguments.