kimou605 committed
Commit 4d057f2
1 Parent(s): 001392b

Update app.py

Files changed (1)
  1. app.py +179 -0
app.py CHANGED
@@ -5,6 +5,171 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
  import torch
  from transformers import pipeline
 
+ def extract_nucleotide_sequences(text):
+     """
+     Extracts nucleotide (DNA) sequences from a text.
+ 
+     Args:
+         text (str): The text containing the nucleotide sequences.
+ 
+     Returns:
+         list: A list of nucleotide sequences extracted from the text.
+     """
+     # Use a regular expression to find nucleotide sequences
+     sequences = re.findall(r'\b[ATCG]+\b', text, re.IGNORECASE)
+     return sequences
+ 
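Editor's note: `extract_nucleotide_sequences` depends on the `re` module, and no `import re` is visible in this hunk, so it is assumed to be imported near the top of app.py. Because the pattern `\b[ATCG]+\b` is matched with `re.IGNORECASE`, ordinary words made only of those letters are captured as well. A minimal standalone sketch of the helper (hypothetical, not part of this commit):

```python
import re

def extract_nucleotide_sequences(text):
    # Maximal runs of A/T/C/G delimited by word boundaries, case-insensitive
    return re.findall(r'\b[ATCG]+\b', text, re.IGNORECASE)

print(extract_nucleotide_sequences("Promoter: ATGGCG then a tag"))
# ['ATGGCG', 'a', 'tag']  -- short English words slip through the case-insensitive match
```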
+ genetic_code = {
+     'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M',
+     'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
+     'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K',
+     'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',
+     'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',
+     'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
+     'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q',
+     'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
+     'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',
+     'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
+     'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
+     'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
+     'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
+     'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L',
+     'TAC':'Y', 'TAT':'Y', 'TGC':'C', 'TGT':'C', 'TGG':'W'
+ }
+ 
+ def translate_nucleotide_sequence(nucleotide_seq):
+     """
+     Translates a nucleotide sequence to a protein sequence.
+ 
+     Parameters:
+         nucleotide_seq (str): The nucleotide sequence.
+ 
+     Returns:
+         str: The translated protein sequence.
+     """
+     # Truncate the sequence to the nearest multiple of 3
+     truncated_seq = nucleotide_seq[:len(nucleotide_seq) // 3 * 3]
+ 
+     # Translate the nucleotide sequence to a protein sequence
+     protein_seq = ''
+     for i in range(0, len(truncated_seq), 3):
+         codon = truncated_seq[i:i+3]
+         protein_seq += genetic_code.get(codon, 'X')  # Use 'X' for unknown codons
+ 
+     return protein_seq
+ 
+ # Example usage
+ # nucleotide_seq = "ATGGCGCGGGAGGCTCTGGAAGGAGGCTGCCGGGGCGCCCACGGAGCTGC"
+ # protein_seq = translate_nucleotide_sequence(nucleotide_seq)
+ # print(protein_seq)
+ 
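Editor's note: the commented example can be checked against the codon table by hand. The 50 nt string truncates to 48 nt (16 codons) and should translate to "MAREALEGGCRGAHGA". The table also omits the three stop codons (TAA, TAG, TGA) and has only uppercase keys, so stops and lowercase codons fall back to 'X' via `genetic_code.get(codon, 'X')`. A small sanity check, not part of app.py:

```python
# Hypothetical sanity check for translate_nucleotide_sequence
seq = "ATGGCGCGGGAGGCTCTGGAAGGAGGCTGCCGGGGCGCCCACGGAGCTGC"
assert translate_nucleotide_sequence(seq) == "MAREALEGGCRGAHGA"
assert translate_nucleotide_sequence("ATGTAA") == "MX"   # TAA (stop) is not in the table
assert translate_nucleotide_sequence("atg") == "X"       # lowercase codons are not in the table
```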
+ from transformers import AutoTokenizer, EsmForProteinFolding
+ from transformers.models.esm.openfold_utils.protein import to_pdb, Protein as OFProtein
+ from transformers.models.esm.openfold_utils.feats import atom14_to_atom37
+ from proteins_viz import *
+ import gradio as gr
+ import spaces
+ 
+ def read_mol(molpath):
+     with open(molpath, "r") as fp:
+         lines = fp.readlines()
+     mol = ""
+     for l in lines:
+         mol += l
+     return mol
+ 
+ 
+ def molecule(input_pdb):
+ 
+     mol = read_mol(input_pdb)
+ 
+     x = (
+         """<!DOCTYPE html>
+         <html>
+         <head>
+         <meta http-equiv="content-type" content="text/html; charset=UTF-8" />
+         <style>
+         body{
+             font-family:sans-serif
+         }
+         .mol-container {
+             width: 100%;
+             height: 600px;
+             position: relative;
+         }
+         .mol-container select{
+             background-image:None;
+         }
+         </style>
+         <script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.6.3/jquery.min.js" integrity="sha512-STof4xm1wgkfm7heWqFJVn58Hm3EtS31XFaagaa8VMReCXAkQnJZ+jEy8PCC/iT18dFy95WcExNHFTqLyp72eQ==" crossorigin="anonymous" referrerpolicy="no-referrer"></script>
+         <script src="https://3Dmol.csb.pitt.edu/build/3Dmol-min.js"></script>
+         </head>
+         <body>
+         <div id="container" class="mol-container"></div>
+ 
+         <script>
+             let pdb = `"""
+         + mol
+         + """`
+ 
+             $(document).ready(function () {
+                 let element = $("#container");
+                 let config = { backgroundColor: "white" };
+                 let viewer = $3Dmol.createViewer(element, config);
+                 viewer.addModel(pdb, "pdb");
+                 viewer.getModel(0).setStyle({}, { cartoon: { colorscheme:"whiteCarbon" } });
+                 viewer.zoomTo();
+                 viewer.render();
+                 viewer.zoom(0.8, 2000);
+             })
+         </script>
+         </body></html>"""
+     )
+ 
+     return f"""<iframe style="width: 100%; height: 600px" name="result" allow="midi; geolocation; microphone; camera;
+     display-capture; encrypted-media;" sandbox="allow-modals allow-forms
+     allow-scripts allow-same-origin allow-popups
+     allow-top-navigation-by-user-activation allow-downloads" allowfullscreen=""
+     allowpaymentrequest="" frameborder="0" srcdoc='{x}'></iframe>"""
+ 
+ 
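Editor's note: `molecule` returns an `<iframe srcdoc=...>` snippet embedding a 3Dmol.js viewer, presumably intended to be rendered by Gradio as raw HTML. A hypothetical standalone preview (the component wiring is an assumption, not part of this commit):

```python
import gradio as gr

# Preview the viewer outside the chat flow; "output_structure.pdb" is the file respond() writes below.
with gr.Blocks() as viewer_demo:
    html_out = gr.HTML()
    gr.Button("Show structure").click(lambda: molecule("output_structure.pdb"), outputs=html_out)

# viewer_demo.launch()
```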
+ def convert_outputs_to_pdb(outputs):
+     final_atom_positions = atom14_to_atom37(outputs["positions"][-1], outputs)
+     outputs = {k: v.to("cpu").numpy() for k, v in outputs.items()}
+     final_atom_positions = final_atom_positions.cpu().numpy()
+     final_atom_mask = outputs["atom37_atom_exists"]
+     pdbs = []
+     for i in range(outputs["aatype"].shape[0]):
+         aa = outputs["aatype"][i]
+         pred_pos = final_atom_positions[i]
+         mask = final_atom_mask[i]
+         resid = outputs["residue_index"][i] + 1
+         pred = OFProtein(
+             aatype=aa,
+             atom_positions=pred_pos,
+             atom_mask=mask,
+             residue_index=resid,
+             b_factors=outputs["plddt"][i],
+             chain_index=outputs["chain_index"][i] if "chain_index" in outputs else None,
+         )
+         pdbs.append(to_pdb(pred))
+     return pdbs
+ 
+ tokenizer = AutoTokenizer.from_pretrained("facebook/esmfold_v1")
+ model = EsmForProteinFolding.from_pretrained("facebook/esmfold_v1", low_cpu_mem_usage=True)
+ 
+ model = model.cuda()
+ 
+ model.esm = model.esm.half()
+ 
+ import torch
+ 
+ torch.backends.cuda.matmul.allow_tf32 = True
+ 
+ model.trunk.set_chunk_size(64)
+ 
+ 
+ 
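Editor's note: with the ESMFold tokenizer, model, and `convert_outputs_to_pdb` defined, the folding path added to `respond` below can be exercised on its own. A hypothetical smoke test (the sequence and file name are made up) that mirrors those calls:

```python
protein = "MAREALEGGCRGAHGA"  # e.g. the output of translate_nucleotide_sequence above

inputs = tokenizer([protein], return_tensors="pt", add_special_tokens=False)["input_ids"].cuda()
with torch.no_grad():
    outputs = model(inputs)

pdb_chains = convert_outputs_to_pdb(outputs)   # one PDB string per sequence in the batch
with open("test_structure.pdb", "w") as f:
    f.write("".join(pdb_chains))
```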
  pipe = pipeline("text-generation", model="kimou605/shadow-clown-BioMistral-7B-DARE", torch_dtype=torch.bfloat16, device_map="auto")
 
 
@@ -38,6 +203,20 @@ def respond(
      substring_after_last_space = outputs[0]["generated_text"][last_space_index + 7:]
      yield substring_after_last_space
 
+     sequences = extract_nucleotide_sequences(substring_after_last_space)
+     test_protein = translate_nucleotide_sequence("".join(sequences))  # join the extracted matches into one sequence string
+ 
+     tokenized_input = tokenizer([test_protein], return_tensors="pt", add_special_tokens=False)['input_ids']
+     tokenized_input = tokenized_input.cuda()
+     with torch.no_grad():
+         output = model(tokenized_input)
+     pdb = convert_outputs_to_pdb(output)
+     with open("output_structure.pdb", "w") as f:
+         f.write("".join(pdb))
+     image = take_care("output_structure.pdb")
+     html = molecule("output_structure.pdb")
+     return image, html
+ 
 
  demo = gr.ChatInterface(
      respond,
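Editor's note: `respond` is a generator (it yields the chat text), so the `return image, html` added at the end only sets the generator's `StopIteration` value; `gr.ChatInterface` will not display it. The snippet below just illustrates that Python behavior; surfacing the structure in the UI would need an extra `yield` or a separate output component, which this commit does not add:

```python
# A generator's return value is not yielded; it only rides along on StopIteration.
def g():
    yield "text"
    return "image", "html"

print(list(g()))  # ['text']
```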