kimou605 committed on
Commit
bc6546f
1 Parent(s): 4d057f2

Upload proteins_viz.py

Browse files
Files changed (1) hide show
  1. proteins_viz.py +136 -0
proteins_viz.py ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from biopandas.pdb import PandasPdb
3
+ from prody import parsePDBHeader
4
+
5
+
6
+
7
+
8
def read_pdb_to_dataframe(
    pdb_path,
    model_index: int = 1,
    parse_header: bool = True,
) -> "tuple[pd.DataFrame, dict | None]":
    """
    Read a PDB file and return its atom records together with the parsed header.

    Args:
        pdb_path (str): Path to a local PDB file to read.
        model_index (int, optional): Index of the model to extract from the PDB file, in case
            it contains multiple models. Defaults to 1.
        parse_header (bool, optional): Whether to parse the PDB header and extract metadata.
            Defaults to True.

    Returns:
        tuple: A pair ``(atoms, header)``. ``atoms`` is a DataFrame with the "ATOM" records
            followed by the "HETATM" records of the selected model. ``header`` is whatever
            ``parsePDBHeader`` returned for the file, or None when ``parse_header`` is False.

    Raises:
        ValueError: If the selected model contains no "ATOM" records.
    """
    ppdb = PandasPdb().read_pdb(pdb_path)
    header = parsePDBHeader(pdb_path) if parse_header else None

    # Narrow the parsed structure down to the requested model only.
    model = ppdb.get_model(model_index)
    atom_records = model.df["ATOM"]
    if atom_records.empty:
        raise ValueError(f"No model found for index: {model_index}")

    return pd.concat([atom_records, model.df["HETATM"]]), header
37
+
38
+ from graphein.protein.graphs import label_node_id
39
+
40
def process_dataframe(df: pd.DataFrame, granularity='CA') -> pd.DataFrame:
    """
    Process a DataFrame of protein structure data to reduce ambiguity and simplify analysis.

    This function performs the following steps:
    1. Handles alternate locations for an atom: an empty 'alt_loc' is treated as 'A', and only
       rows whose alternate location is 'A' are kept.
    2. Labels the remaining rows via the helper function label_node_id.
    3. Filters the DataFrame down to rows whose 'atom_name' equals the given granularity
       (defaults to 'CA' for alpha carbon).

    The input DataFrame is never modified; all work is done on a copy.

    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame containing protein structure data to process. It must contain the column
        'atom_name'; the column 'alt_loc' is optional.

    granularity : str, optional
        The atom name to keep, also forwarded to label_node_id. Defaults to 'CA' (alpha carbon).

    Returns
    -------
    pd.DataFrame
        The labelled rows whose 'atom_name' matches ``granularity``.
    """
    if 'alt_loc' in df.columns:
        # An empty alternate-location flag is treated as the first one ('A').
        # Compute the mask on a temporary Series so the caller's frame is untouched.
        alt_loc = df['alt_loc'].replace('', 'A')
        df = df.loc[alt_loc == 'A'].copy()
        # Every surviving row is location 'A' (either originally or by default).
        df['alt_loc'] = 'A'
    else:
        # label_node_id may assign columns on the frame it receives; give it a copy.
        df = df.copy()

    df = label_node_id(df, granularity)
    return df.loc[df['atom_name'] == granularity]
65
+
66
+
67
+ from graphein.protein.graphs import initialise_graph_with_metadata
68
+ from graphein.protein.graphs import add_nodes_to_graph
69
+ from graphein.protein.visualisation import plotly_protein_structure_graph
70
+ from PIL import Image
71
+ import networkx as nx
72
+
73
def _add_backbone_edges(G: "nx.Graph") -> "nx.Graph":
    """Connect residues with adjacent residue numbers within each chain of ``G``.

    Each added edge carries ``kind={'backbone_bond'}``; if the edge already exists,
    'backbone_bond' is added to its existing ``kind`` set instead.
    Reads ``G.graph["chain_ids"]`` and the node attributes ``chain_id`` and
    ``residue_number``. The graph is modified in place and also returned.
    """
    for chain_id in G.graph["chain_ids"]:
        # Residues of this chain, in the graph's node iteration order.
        chain_residues = [
            (node, data)
            for node, data in G.nodes(data=True)
            if data["chain_id"] == chain_id
        ]
        # Walk consecutive pairs; the last residue of a chain has no successor.
        for (node, data), (next_node, next_data) in zip(
            chain_residues, chain_residues[1:]
        ):
            # Only residues with adjacent residue numbers are bonded.
            if abs(data["residue_number"] - next_data["residue_number"]) != 1:
                continue
            # Look the edge up by node identifier (not list position) so an
            # existing edge keeps its other kinds.
            if G.has_edge(node, next_node):
                G.edges[node, next_node]["kind"].add('backbone_bond')
            else:
                G.add_edge(node, next_node, kind={'backbone_bond'})
    return G


def take_care(pdb_path, pdb_code: str = '3nir', image_file: str = "protein_graph.png"):
    """Render the backbone graph of a PDB structure and return it as a PIL image.

    The file is read with read_pdb_to_dataframe, reduced with process_dataframe,
    turned into a graph with one node per remaining row, connected along the
    backbone, plotted with plotly_protein_structure_graph and written to
    ``image_file`` as a PNG.

    Args:
        pdb_path: Path to a local PDB file.
        pdb_code (str, optional): Identifier stored in the graph metadata.
            Defaults to '3nir'.
        image_file (str, optional): Path the PNG is written to before being
            re-opened. Defaults to "protein_graph.png".

    Returns:
        The image opened from ``image_file`` via ``PIL.Image.open``.
    """
    # The parsed header is not needed for plotting.
    df, _header = read_pdb_to_dataframe(pdb_path)
    process_df = process_dataframe(df)

    g = initialise_graph_with_metadata(
        protein_df=process_df,
        raw_pdb_df=df,  # Store this for traceability
        pdb_code=pdb_code,
        granularity='CA',  # Store this so we know what kind of graph we have
    )
    g = add_nodes_to_graph(g)
    g = _add_backbone_edges(g)

    p = plotly_protein_structure_graph(
        g,
        colour_edges_by="kind",
        colour_nodes_by="seq_position",
        label_node_ids=False,
        plot_title="Backbone Protein Graph",
        node_size_multiplier=1,
    )
    p.write_image(image_file, format='png')

    # Load the PNG image into a PIL image
    return Image.open(image_file)