griptomo.core.multiple_pdb2graph
"""Convert every ``*.pdb`` file in a folder to a graph using a process pool.

The folder to scan is read from ``sys.argv[1]`` when this module is loaded, so
the module expects to be started as a script with the folder as its first
command-line argument.
"""
import argparse
import fnmatch
import os
import sys
import time
import multiprocessing
import pandas as pd
import griptomo.core.pdb2graph as p2g


# to check cpu count use multiprocessing.cpu_count()

# to check the platform look at the link below
# https://docs.python.org/3/library/sys.html#sys.platform
#
# AIX            : 'aix'
# Linux          : 'linux'
# Windows        : 'win32'
# Windows/Cygwin : 'cygwin'
# macOS          : 'darwin'

_args = sys.argv[0:]
_py_file = _args[0]


_code_location = os.path.dirname(os.path.abspath(_py_file))

try:
    # str.index returns the position of the first "griptomo" in the path.
    index_of_latest = _code_location.index("griptomo")
    scripts_path = os.path.join(_code_location[:index_of_latest], "griptomoml", "core")
    sys.path.insert(0, scripts_path)
except ValueError:
    print(f"Error: 'griptomo' not found in the directory path {_code_location}")


# NOTE: evaluated at import time; raises IndexError when no command-line
# argument was supplied.
_folder_location = _args[1]


def show_time(process, time_start, time_end):
    """
    Calculate and format the time taken for a process.

    The unit is chosen from the duration rounded to one decimal place, so a
    run of 58 seconds is reported as "1.0 minutes" rather than "58.0 seconds".

    Parameters
    ----------
    process : str
        Name of the process.
    time_start : float
        Start time of the process.
    time_end : float
        End time of the process.

    Returns
    -------
    str
        Formatted string showing the time taken for the process.
    """
    elapsed = time_end - time_start
    minutes = round(elapsed / 60, 1)
    hours = round(elapsed / 60 / 60, 1)

    if minutes < 1:
        duration = f"{round(elapsed, 1)} seconds"
    elif hours < 1:
        duration = f"{minutes} minutes"
    else:
        duration = f"{hours} hours"
    return f"\n{process} finished in {duration} (wall clock)."


# define the function to call in main
def generate_graph(pdb_code, fname, t, o, pdbx, CA_only):
    """
    Generate a graph from a PDB file.

    Parameters
    ----------
    pdb_code : str
        PDB ID / label for the protein of interest.
    fname : str
        Filename for the protein of interest. Can be PDB or PDBx format.
    t : float
        Alpha Carbon / atom pairwise contact distance cutoff (in Angstroms).
    o : int
        Index offset in case the first residue ID in PDB file is not the first physical residue.
    pdbx : int
        Set to 1 if using the newer PDBx file format.
    CA_only : int
        Set to 1 if using only alpha carbons, else all atoms are used.

    Returns
    -------
    str
        PDB code.
    """
    frame = p2g.PDB_to_df(pdb_code, fname, pdbx, o, CA_only)
    graph = p2g.PDB_df_to_G(frame, t)
    # pdb_code is deliberately passed for both trailing arguments of save_data.
    p2g.save_data(frame, graph, pdb_code, pdb_code)
    return pdb_code


def pdb2graph_list(t, o, pdbx, CA_only=1, folder=None):
    """
    Convert multiple PDBs to graphs from a folder of PDBs.

    Only entries whose name matches ``*.pdb`` are processed, whatever the
    value of ``pdbx``. Each file is labelled ``<stem>_<int(t)>`` and handed to
    ``generate_graph`` in a worker process.

    Parameters
    ----------
    t : float
        Alpha Carbon / atom pairwise contact distance cutoff (in Angstroms).
    o : int
        Index offset in case the first residue ID in PDB file is not the first physical residue.
    pdbx : int
        Set to 1 if using the newer PDBx file format.
    CA_only : int, optional
        Set to 1 if using only alpha carbons, else all atoms are used. Default is 1.
    folder : str, optional
        Folder to scan. Defaults to the module-level ``_folder_location``
        (the first command-line argument).

    Returns
    -------
    list
        List of results from the graph generation process.
    """
    if folder is None:
        folder = _folder_location

    extension = ".pdb"
    label_suffix = "_" + str(int(t))

    # One argument tuple per matching file; sorted so the order is reproducible.
    jobs = []
    for file_name in sorted(os.listdir(folder)):
        if not fnmatch.fnmatch(file_name, "*" + extension):
            continue
        # Strip the extension by length: fnmatch may match case-insensitively
        # (e.g. "X.PDB" on Windows), where str.find(".pdb") would return -1.
        stem = file_name[: -len(extension)]
        jobs.append(
            (stem + label_suffix, os.path.join(folder, file_name), t, o, pdbx, CA_only)
        )

    # The context manager releases the worker processes even if a task raises.
    with multiprocessing.Pool() as pool:
        results = pool.starmap(generate_graph, jobs)

    print(results)
    return results


def main(args):
    """
    Main function to process multiple PDB files and convert them to graphs.

    ``args.f`` is not read here: ``pdb2graph_list`` takes the folder from the
    module-level ``_folder_location``.

    Parameters
    ----------
    args : argparse.Namespace
        Command line arguments; the attributes ``t``, ``o``, ``pdbx`` and
        ``CA_only`` are used.
    """
    pdb2graph_list(args.t, args.o, args.pdbx, args.CA_only)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("f", help="Folder name with the pdb files", type=str)
    parser.add_argument(
        "t", help="Alpha Carbon contact distance threshold (in Angstroms)", type=float
    )
    parser.add_argument(
        "o", help="PDB residue index offset integer. Default is 0.", type=int
    )
    parser.add_argument("pdbx", help="set=1 to use pdbx file parser", type=int)
    parser.add_argument("CA_only", help="set=1 to use only alpha carbons", type=int)
    args = parser.parse_args()

    start_time = time.time()
    main(args)
    print(show_time("pdb to graph", start_time, time.time()))

    # example run: python /griptomo/core/multiple_pdb2graph.py <pdb_folder> 8 0 0 1

    # parameters used for the 1st paper
    # t = 8  # pairwise distance cutoff for assigning edges, in Angstroms
    # o = 0  # residue indexing offset (default = 0)
    # pdbx = 0  # using .pdb (0) or .pdbx (1) file format
    # CA_only = 1  # using alpha carbons only

    # ref. With 8 cores on a Mac, it took 20 minutes (wall clock) to generate graphs for 64 pdb files
    # ref. With 10 M1 Max cores on a Mac, it took 11 minutes (wall clock) to generate graphs for 100 apoferritin-sized pdb files
def
show_time(process, time_start, time_end):
def show_time(process, time_start, time_end):
    """
    Calculate and format the time taken for a process.

    The unit is chosen from the duration rounded to one decimal place, so a
    run of 58 seconds is reported as "1.0 minutes" rather than "58.0 seconds".

    Parameters
    ----------
    process : str
        Name of the process.
    time_start : float
        Start time of the process.
    time_end : float
        End time of the process.

    Returns
    -------
    str
        Formatted string showing the time taken for the process.
    """
    elapsed = time_end - time_start
    minutes = round(elapsed / 60, 1)
    hours = round(elapsed / 60 / 60, 1)

    if minutes < 1:
        duration = f"{round(elapsed, 1)} seconds"
    elif hours < 1:
        duration = f"{minutes} minutes"
    else:
        duration = f"{hours} hours"
    return f"\n{process} finished in {duration} (wall clock)."
Calculate and format the time taken for a process.
Parameters
- process (str): Name of the process.
- time_start (float): Start time of the process.
- time_end (float): End time of the process.
Returns
- str: Formatted string showing the time taken for the process.
def
generate_graph(pdb_code, fname, t, o, pdbx, CA_only):
def generate_graph(pdb_code, fname, t, o, pdbx, CA_only):
    """
    Generate a graph from a PDB file.

    Runs the three ``p2g`` steps in order: ``PDB_to_df`` to load the file,
    ``PDB_df_to_G`` to build the graph with cutoff ``t``, and ``save_data``
    to store both results under ``pdb_code``.

    Parameters
    ----------
    pdb_code : str
        PDB ID / label for the protein of interest.
    fname : str
        Filename for the protein of interest. Can be PDB or PDBx format.
    t : float
        Alpha Carbon / atom pairwise contact distance cutoff (in Angstroms).
    o : int
        Index offset in case the first residue ID in PDB file is not the first physical residue.
    pdbx : int
        Set to 1 if using the newer PDBx file format.
    CA_only : int
        Set to 1 if using only alpha carbons, else all atoms are used.

    Returns
    -------
    str
        PDB code.
    """
    frame = p2g.PDB_to_df(pdb_code, fname, pdbx, o, CA_only)
    graph = p2g.PDB_df_to_G(frame, t)
    # pdb_code is deliberately passed for both trailing arguments of save_data.
    p2g.save_data(frame, graph, pdb_code, pdb_code)
    return pdb_code
Generate a graph from a PDB file.
Parameters
- pdb_code (str): PDB ID / label for the protein of interest.
- fname (str): Filename for the protein of interest. Can be PDB or PDBx format.
- t (float): Alpha Carbon / atom pairwise contact distance cutoff (in Angstroms).
- o (int): Index offset in case the first residue ID in PDB file is not the first physical residue.
- pdbx (int): Set to 1 if using the newer PDBx file format.
- CA_only (int): Set to 1 if using only alpha carbons, else all atoms are used.
Returns
- str: PDB code.
def
pdb2graph_list(t, o, pdbx, CA_only=1):
def pdb2graph_list(t, o, pdbx, CA_only=1, folder=None):
    """
    Convert multiple PDBs to graphs from a folder of PDBs.

    Only entries whose name matches ``*.pdb`` are processed, whatever the
    value of ``pdbx``. Each file is labelled ``<stem>_<int(t)>`` and handed to
    ``generate_graph`` in a worker process.

    Parameters
    ----------
    t : float
        Alpha Carbon / atom pairwise contact distance cutoff (in Angstroms).
    o : int
        Index offset in case the first residue ID in PDB file is not the first physical residue.
    pdbx : int
        Set to 1 if using the newer PDBx file format.
    CA_only : int, optional
        Set to 1 if using only alpha carbons, else all atoms are used. Default is 1.
    folder : str, optional
        Folder to scan. Defaults to the module-level ``_folder_location``
        (the first command-line argument).

    Returns
    -------
    list
        List of results from the graph generation process.
    """
    if folder is None:
        folder = _folder_location

    extension = ".pdb"
    label_suffix = "_" + str(int(t))

    # One argument tuple per matching file; sorted so the order is reproducible.
    jobs = []
    for file_name in sorted(os.listdir(folder)):
        if not fnmatch.fnmatch(file_name, "*" + extension):
            continue
        # Strip the extension by length: fnmatch may match case-insensitively
        # (e.g. "X.PDB" on Windows), where str.find(".pdb") would return -1.
        stem = file_name[: -len(extension)]
        jobs.append(
            (stem + label_suffix, os.path.join(folder, file_name), t, o, pdbx, CA_only)
        )

    # The context manager releases the worker processes even if a task raises.
    with multiprocessing.Pool() as pool:
        results = pool.starmap(generate_graph, jobs)

    print(results)
    return results
Convert multiple PDBs to graphs from a folder of PDBs.
Parameters
- t (float): Alpha Carbon / atom pairwise contact distance cutoff (in Angstroms).
- o (int): Index offset in case the first residue ID in PDB file is not the first physical residue.
- pdbx (int): Set to 1 if using the newer PDBx file format.
- CA_only (int, optional): Set to 1 if using only alpha carbons, else all atoms are used. Default is 1.
Returns
- list: List of results from the graph generation process.
def
main(args):
def main(args):
    """
    Main function to process multiple PDB files and convert them to graphs.

    ``args.f`` is not read here: ``pdb2graph_list`` takes the folder from the
    module-level ``_folder_location``.

    Parameters
    ----------
    args : argparse.Namespace
        Command line arguments; the attributes ``t``, ``o``, ``pdbx`` and
        ``CA_only`` are used.
    """
    pdb2graph_list(args.t, args.o, args.pdbx, args.CA_only)
Main function to process multiple PDB files and convert them to graphs.
Parameters
- args (argparse.Namespace): Command line arguments.