Update app.py
Browse files
app.py
CHANGED
@@ -5,6 +5,171 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
|
|
5 |
import torch
|
6 |
from transformers import pipeline
|
7 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
pipe = pipeline("text-generation", model="kimou605/shadow-clown-BioMistral-7B-DARE", torch_dtype=torch.bfloat16, device_map="auto")
|
9 |
|
10 |
|
@@ -38,6 +203,20 @@ def respond(
|
|
38 |
substring_after_last_space = outputs[0]["generated_text"][last_space_index + 7:]
|
39 |
yield substring_after_last_space
|
40 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
41 |
|
42 |
demo = gr.ChatInterface(
|
43 |
respond,
|
|
|
5 |
import torch
|
6 |
from transformers import pipeline
|
7 |
|
8 |
+
def extract_nucleotide_sequences(text):
    """Extract DNA (nucleotide) sequences from free text.

    Finds maximal whole words made exclusively of the letters A, T, C, G
    (case-insensitive) and returns them upper-cased so they can be looked
    up directly in the upper-case ``genetic_code`` codon table.

    Args:
        text (str): Text possibly containing nucleotide sequences.

    Returns:
        list: Upper-cased nucleotide sequences found in the text.
    """
    # Local import: `re` is not imported anywhere in this file's visible
    # top-level imports, so the original code raised NameError when called.
    import re

    # \b...\b keeps only whole words consisting solely of A/T/C/G.
    matches = re.findall(r'\b[ATCG]+\b', text, re.IGNORECASE)
    # Normalize case: IGNORECASE also matches lower-case runs, but the
    # downstream codon table only has upper-case keys.
    return [m.upper() for m in matches]
|
21 |
+
|
22 |
+
# Standard DNA codon table: 61 coding codons -> one-letter amino-acid codes.
# NOTE(review): the three stop codons (TAA, TAG, TGA) are absent on purpose;
# translate_nucleotide_sequence() maps any codon not listed here to 'X'.
genetic_code = {
    'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M',
    'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
    'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K',
    'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',
    'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',
    'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
    'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q',
    'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
    'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',
    'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
    'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
    'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
    'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
    'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L',
    'TAC':'Y', 'TAT':'Y', 'TGC':'C', 'TGT':'C', 'TGG':'W'
}

def translate_nucleotide_sequence(nucleotide_seq):
    """Translate a nucleotide sequence to a protein sequence.

    Parameters:
        nucleotide_seq (str or iterable of str): A DNA sequence, or an
            iterable of DNA sequences (as returned by
            extract_nucleotide_sequences); each fragment is translated in
            its own reading frame and the proteins are concatenated.

    Returns:
        str: The translated protein sequence. Codons not present in
        ``genetic_code`` (including stop codons) become 'X'.
    """
    # The caller in respond() passes the *list* returned by
    # extract_nucleotide_sequences; the original code then looked up list
    # slices in the dict and crashed with TypeError. Handle iterables here.
    if not isinstance(nucleotide_seq, str):
        return ''.join(translate_nucleotide_sequence(seq) for seq in nucleotide_seq)

    # Upper-case so lower-case input matches the upper-case codon table,
    # then truncate to a whole number of codons (nearest multiple of 3).
    seq = nucleotide_seq.upper()
    seq = seq[:len(seq) // 3 * 3]

    # join() avoids the quadratic cost of repeated string concatenation.
    return ''.join(
        genetic_code.get(seq[i:i + 3], 'X')  # 'X' for unknown codons
        for i in range(0, len(seq), 3)
    )
|
60 |
+
|
61 |
+
# Example usage
|
62 |
+
# nucleotide_seq = "ATGGCGCGGGAGGCTCTGGAAGGAGGCTGCCGGGGCGCCCACGGAGCTGC"
|
63 |
+
# protein_seq = translate_nucleotide_sequence(nucleotide_seq)
|
64 |
+
# print(protein_seq)
|
65 |
+
|
66 |
+
from transformers import AutoTokenizer, EsmForProteinFolding
|
67 |
+
from transformers.models.esm.openfold_utils.protein import to_pdb, Protein as OFProtein
|
68 |
+
from transformers.models.esm.openfold_utils.feats import atom14_to_atom37
|
69 |
+
from proteins_viz import *
|
70 |
+
import gradio as gr
|
71 |
+
import spaces
|
72 |
+
|
73 |
+
def read_mol(molpath):
    """Return the entire text content of the molecule (PDB) file at *molpath*.

    Args:
        molpath (str): Path of the file to read.

    Returns:
        str: The file's contents as a single string.
    """
    # A single read() replaces the original line-by-line `mol += l` loop,
    # which was quadratic in the number of lines; the result is identical.
    with open(molpath, "r") as fp:
        return fp.read()
|
80 |
+
|
81 |
+
|
82 |
+
def molecule(input_pdb):
    """Render the PDB file at *input_pdb* as an interactive 3Dmol.js viewer.

    Returns an HTML string containing an <iframe> whose srcdoc is a
    self-contained page: it loads jQuery and 3Dmol.js from CDNs, embeds the
    raw PDB text in a JS template literal, and displays the structure as a
    white-carbon cartoon.
    """

    # Full PDB text; it is spliced directly into the page below.
    mol = read_mol(input_pdb)

    x = (
        """<!DOCTYPE html>
<html>
<head>
<meta http-equiv="content-type" content="text/html; charset=UTF-8" />
<style>
body{
font-family:sans-serif
}
.mol-container {
width: 100%;
height: 600px;
position: relative;
}
.mol-container select{
background-image:None;
}
</style>
<script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.6.3/jquery.min.js" integrity="sha512-STof4xm1wgkfm7heWqFJVn58Hm3EtS31XFaagaa8VMReCXAkQnJZ+jEy8PCC/iT18dFy95WcExNHFTqLyp72eQ==" crossorigin="anonymous" referrerpolicy="no-referrer"></script>
<script src="https://3Dmol.csb.pitt.edu/build/3Dmol-min.js"></script>
</head>
<body>
<div id="container" class="mol-container"></div>

<script>
let pdb = `"""
        # PDB text goes inside a JS backtick template literal, so literal
        # backticks in the file would break the page — PDB format does not
        # normally contain them.
        + mol
        + """`

$(document).ready(function () {
let element = $("#container");
let config = { backgroundColor: "white" };
let viewer = $3Dmol.createViewer(element, config);
viewer.addModel(pdb, "pdb");
viewer.getModel(0).setStyle({}, { cartoon: { colorscheme:"whiteCarbon" } });
viewer.zoomTo();
viewer.render();
viewer.zoom(0.8, 2000);
})
</script>
</body></html>"""
    )

    # NOTE(review): srcdoc is delimited with single quotes, so a single quote
    # anywhere in the generated page would truncate the attribute — assumed
    # not to occur for PDB data; confirm if inputs are untrusted.
    return f"""<iframe style="width: 100%; height: 600px" name="result" allow="midi; geolocation; microphone; camera;
    display-capture; encrypted-media;" sandbox="allow-modals allow-forms
    allow-scripts allow-same-origin allow-popups
    allow-top-navigation-by-user-activation allow-downloads" allowfullscreen=""
    allowpaymentrequest="" frameborder="0" srcdoc='{x}'></iframe>"""
|
134 |
+
|
135 |
+
|
136 |
+
def convert_outputs_to_pdb(outputs):
    """Convert ESMFold model outputs into PDB-format strings, one per batch item."""
    # Expand the model's compact atom14 coordinates to the 37-atom layout
    # used by the OpenFold utilities (done while tensors are still on-device).
    positions37 = atom14_to_atom37(outputs["positions"][-1], outputs)
    outputs = {key: value.to("cpu").numpy() for key, value in outputs.items()}
    positions37 = positions37.cpu().numpy()
    atom_mask = outputs["atom37_atom_exists"]

    pdb_strings = []
    batch_size = outputs["aatype"].shape[0]
    for idx in range(batch_size):
        protein = OFProtein(
            aatype=outputs["aatype"][idx],
            atom_positions=positions37[idx],
            atom_mask=atom_mask[idx],
            # PDB residue numbering is 1-based.
            residue_index=outputs["residue_index"][idx] + 1,
            b_factors=outputs["plddt"][idx],
            chain_index=outputs["chain_index"][idx] if "chain_index" in outputs else None,
        )
        pdb_strings.append(to_pdb(protein))
    return pdb_strings
|
157 |
+
|
158 |
+
# --- ESMFold structure-prediction model setup (module-level side effects:
# defines `tokenizer`, `model`, and `pipe`) ---
tokenizer = AutoTokenizer.from_pretrained("facebook/esmfold_v1")
model = EsmForProteinFolding.from_pretrained("facebook/esmfold_v1", low_cpu_mem_usage=True)

# Guarded GPU move: the original called model.cuda() unconditionally and
# crashed on CPU-only machines. GPU behavior is unchanged. (The redundant
# mid-file `import torch` was dropped — torch is imported at the top of the file.)
if torch.cuda.is_available():
    model = model.cuda()
    # Half precision for the ESM stem only; the folding trunk stays in fp32.
    model.esm = model.esm.half()
    # TF32 speeds up fp32 matmuls on Ampere+ GPUs; only relevant with CUDA.
    torch.backends.cuda.matmul.allow_tf32 = True

# Chunk the trunk computation to bound peak memory on long sequences.
model.trunk.set_chunk_size(64)

pipe = pipeline("text-generation", model="kimou605/shadow-clown-BioMistral-7B-DARE", torch_dtype=torch.bfloat16, device_map="auto")
|
174 |
|
175 |
|
|
|
203 |
substring_after_last_space = outputs[0]["generated_text"][last_space_index + 7:]
|
204 |
yield substring_after_last_space
|
205 |
|
206 |
+
sequences = extract_nucleotide_sequences(substring_after_last_space)
|
207 |
+
test_protein = translate_nucleotide_sequence(sequences)
|
208 |
+
|
209 |
+
tokenized_input = tokenizer([test_protein], return_tensors="pt", add_special_tokens=False)['input_ids']
|
210 |
+
tokenized_input = tokenized_input.cuda()
|
211 |
+
with torch.no_grad():
|
212 |
+
output = model(tokenized_input)
|
213 |
+
pdb = convert_outputs_to_pdb(output)
|
214 |
+
with open("output_structure.pdb", "w") as f:
|
215 |
+
f.write("".join(pdb))
|
216 |
+
image = take_care("output_structure.pdb")
|
217 |
+
html = molecule("output_structure.pdb")
|
218 |
+
return image, html
|
219 |
+
|
220 |
|
221 |
demo = gr.ChatInterface(
|
222 |
respond,
|