Spaces:

kimou605
/

GenSeq

Runtime error

App Files Files Community

kimou605 committed on May 26

Commit

bc6546f

•

1 Parent(s): 4d057f2

Upload proteins_viz.py

Browse files

Files changed (1) hide show

proteins_viz.py +136 -0

proteins_viz.py ADDED Viewed

	@@ -0,0 +1,136 @@

+import pandas as pd
+from biopandas.pdb import PandasPdb
+from prody import parsePDBHeader
+def read_pdb_to_dataframe(
+    pdb_path,
+    model_index: int = 1,
+    parse_header: bool = True,
+    ) -> pd.DataFrame:
+    """
+    Read a PDB file, and return a Pandas DataFrame containing the atomic coordinates and metadata.
+    Args:
+        pdb_path (str, optional): Path to a local PDB file to read. Defaults to None.
+        model_index (int, optional): Index of the model to extract from the PDB file, in case
+            it contains multiple models. Defaults to 1.
+        parse_header (bool, optional): Whether to parse the PDB header and extract metadata.
+            Defaults to True.
+    Returns:
+        pd.DataFrame: A DataFrame containing the atomic coordinates and metadata, with one row
+            per atom
+    """
+    atomic_df = PandasPdb().read_pdb(pdb_path)
+    if parse_header:
+        header = parsePDBHeader(pdb_path)
+    else:
+        header = None
+    atomic_df = atomic_df.get_model(model_index)
+    if len(atomic_df.df["ATOM"]) == 0:
+        raise ValueError(f"No model found for index: {model_index}")
+    return pd.concat([atomic_df.df["ATOM"], atomic_df.df["HETATM"]]), header
+from graphein.protein.graphs import label_node_id
+def process_dataframe(df: pd.DataFrame, granularity='CA') -> pd.DataFrame:
+    """
+    Process a DataFrame of protein structure data to reduce ambiguity and simplify analysis.
+    This function performs the following steps:
+    1. Handles alternate locations for an atom, defaulting to keep the first one if multiple exist.
+    2. Assigns a unique node_id to each residue in the DataFrame, using a helper function label_node_id.
+    3. Filters the DataFrame based on specified granularity (defaults to 'CA' for alpha carbon).
+    Parameters
+    ----------
+    df : pd.DataFrame
+        The DataFrame containing protein structure data to process. It is expected to contain columns 'alt_loc' and 'atom_name'.
+    granularity : str, optional
+        The level of detail or perspective at which the DataFrame should be analyzed. Defaults to 'CA' (alpha carbon).
+    """
+    # handle the case of alternative locations,
+    # if so default to the 1st one = A
+    if 'alt_loc' in df.columns:
+      df['alt_loc'] = df['alt_loc'].replace('', 'A')
+      df = df.loc[(df['alt_loc']=='A')]
+    df = label_node_id(df, granularity)
+    df = df.loc[(df['atom_name']==granularity)]
+    return df
+from graphein.protein.graphs import initialise_graph_with_metadata
+from graphein.protein.graphs import add_nodes_to_graph
+from graphein.protein.visualisation import plotly_protein_structure_graph
+from PIL import Image
+import networkx as nx
+def take_care(pdb_path):
+    df, header = read_pdb_to_dataframe(pdb_path)
+    process_df = process_dataframe(df)
+    g = initialise_graph_with_metadata(protein_df=process_df, # from above cell
+                                        raw_pdb_df=df, # Store this for traceability
+                                        pdb_code = '3nir', #and again
+                                        granularity = 'CA' # Store this so we know what kind of graph we have
+                                        )
+    g = add_nodes_to_graph(g)
+    def add_backbone_edges(G: nx.Graph) -> nx.Graph:
+        # Iterate over every chain
+        for chain_id in G.graph["chain_ids"]:
+            # Find chain residues
+            chain_residues = [
+                (n, v) for n, v in G.nodes(data=True) if v["chain_id"] == chain_id
+            ]
+            # Iterate over every residue in chain
+            for i, residue in enumerate(chain_residues):
+                try:
+                    # Checks not at chain terminus
+                    if i == len(chain_residues) - 1:
+                        continue
+                    # Asserts residues are on the same chain
+                    cond_1 = ( residue[1]["chain_id"] == chain_residues[i + 1][1]["chain_id"])
+                    # Asserts residue numbers are adjacent
+                    cond_2 = (abs(residue[1]["residue_number"] - chain_residues[i + 1][1]["residue_number"])== 1)
+                    # If this checks out, we add a peptide bond
+                    if (cond_1) and (cond_2):
+                        # Adds "peptide bond" between current residue and the next
+                        if G.has_edge(i, i + 1):
+                            G.edges[i, i + 1]["kind"].add('backbone_bond')
+                        else:
+                            G.add_edge(residue[0],chain_residues[i + 1][0],kind={'backbone_bond'},)
+                except IndexError as e:
+                    print(e)
+        return G
+    g = add_backbone_edges(g)
+    p = plotly_protein_structure_graph(
+        g,
+        colour_edges_by="kind",
+        colour_nodes_by="seq_position",
+        label_node_ids=False,
+        plot_title="Backbone Protein Graph",
+        node_size_multiplier=1,
+    )
+    image_file = "protein_graph.png"
+    p.write_image(image_file, format='png')
+    # Load the PNG image into a PIL image
+    image = Image.open(image_file)
+    return image