kimou605 committed
Commit 4d057f2
1 Parent(s): 001392b

Update app.py

Files changed (1)
  1. app.py +179 -0
app.py CHANGED
@@ -5,6 +5,171 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
  import torch
  from transformers import pipeline
 
+ def extract_nucleotide_sequences(text):
+     """
+     Extracts nucleotide (DNA) sequences from a text.
+ 
+     Args:
+         text (str): The text containing the nucleotide sequences.
+ 
+     Returns:
+         list: A list of nucleotide sequences extracted from the text.
+     """
+     # Use a regular expression to find nucleotide sequences
+     sequences = re.findall(r'\b[ATCG]+\b', text, re.IGNORECASE)
+     return sequences
+ 
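Editor's note: `extract_nucleotide_sequences` depends on the `re` module, and no `import re` is visible in this hunk, so it is assumed to be imported near the top of app.py. Because the pattern `\b[ATCG]+\b` is matched with `re.IGNORECASE`, ordinary words made only of those letters are captured as well. A minimal standalone sketch of the helper (hypothetical, not part of this commit):

```python
import re

def extract_nucleotide_sequences(text):
    # Maximal runs of A/T/C/G delimited by word boundaries, case-insensitive
    return re.findall(r'\b[ATCG]+\b', text, re.IGNORECASE)

print(extract_nucleotide_sequences("Promoter: ATGGCG then a tag"))
# ['ATGGCG', 'a', 'tag']  -- short English words slip through the case-insensitive match
```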
+ genetic_code = {
+     'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M',
+     'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
+     'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K',
+     'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',
+     'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',
+     'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
+     'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q',
+     'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
+     'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',
+     'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
+     'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
+     'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
+     'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
+     'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L',
+     'TAC':'Y', 'TAT':'Y', 'TGC':'C', 'TGT':'C', 'TGG':'W'
+ }
+ 
+ def translate_nucleotide_sequence(nucleotide_seq):
+     """
+     Translates a nucleotide sequence to a protein sequence.
+ 
+     Parameters:
+         nucleotide_seq (str): The nucleotide sequence.
+ 
+     Returns:
+         str: The translated protein sequence.
+     """
+     # Truncate the sequence to the nearest multiple of 3
+     truncated_seq = nucleotide_seq[:len(nucleotide_seq) // 3 * 3]
+ 
+     # Translate the nucleotide sequence to a protein sequence
+     protein_seq = ''
+     for i in range(0, len(truncated_seq), 3):
+         codon = truncated_seq[i:i+3]
+         protein_seq += genetic_code.get(codon, 'X')  # Use 'X' for unknown codons
+ 
+     return protein_seq
+ 
+ # Example usage
+ # nucleotide_seq = "ATGGCGCGGGAGGCTCTGGAAGGAGGCTGCCGGGGCGCCCACGGAGCTGC"
+ # protein_seq = translate_nucleotide_sequence(nucleotide_seq)
+ # print(protein_seq)
+ 
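Editor's note: the commented example can be checked against the codon table by hand. The 50 nt string truncates to 48 nt (16 codons) and should translate to "MAREALEGGCRGAHGA". The table also omits the three stop codons (TAA, TAG, TGA) and has only uppercase keys, so stops and lowercase codons fall back to 'X' via `genetic_code.get(codon, 'X')`. A small sanity check, not part of app.py:

```python
# Hypothetical sanity check for translate_nucleotide_sequence
seq = "ATGGCGCGGGAGGCTCTGGAAGGAGGCTGCCGGGGCGCCCACGGAGCTGC"
assert translate_nucleotide_sequence(seq) == "MAREALEGGCRGAHGA"
assert translate_nucleotide_sequence("ATGTAA") == "MX"   # TAA (stop) is not in the table
assert translate_nucleotide_sequence("atg") == "X"       # lowercase codons are not in the table
```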
+ from transformers import AutoTokenizer, EsmForProteinFolding
+ from transformers.models.esm.openfold_utils.protein import to_pdb, Protein as OFProtein
+ from transformers.models.esm.openfold_utils.feats import atom14_to_atom37
+ from proteins_viz import *
+ import gradio as gr
+ import spaces
+ 
+ def read_mol(molpath):
+     with open(molpath, "r") as fp:
+         lines = fp.readlines()
+     mol = ""
+     for l in lines:
+         mol += l
+     return mol
+ 
+ 
+ def molecule(input_pdb):
+ 
+     mol = read_mol(input_pdb)
+ 
+     x = (
+         """<!DOCTYPE html>
+         <html>
+         <head>
+         <meta http-equiv="content-type" content="text/html; charset=UTF-8" />
+         <style>
+         body{
+             font-family:sans-serif
+         }
+         .mol-container {
+             width: 100%;
+             height: 600px;
+             position: relative;
+         }
+         .mol-container select{
+             background-image:None;
+         }
+         </style>
+         <script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.6.3/jquery.min.js" integrity="sha512-STof4xm1wgkfm7heWqFJVn58Hm3EtS31XFaagaa8VMReCXAkQnJZ+jEy8PCC/iT18dFy95WcExNHFTqLyp72eQ==" crossorigin="anonymous" referrerpolicy="no-referrer"></script>
+         <script src="https://3Dmol.csb.pitt.edu/build/3Dmol-min.js"></script>
+         </head>
+         <body>
+         <div id="container" class="mol-container"></div>
+ 
+         <script>
+             let pdb = `"""
+         + mol
+         + """`
+ 
+             $(document).ready(function () {
+                 let element = $("#container");
+                 let config = { backgroundColor: "white" };
+                 let viewer = $3Dmol.createViewer(element, config);
+                 viewer.addModel(pdb, "pdb");
+                 viewer.getModel(0).setStyle({}, { cartoon: { colorscheme:"whiteCarbon" } });
+                 viewer.zoomTo();
+                 viewer.render();
+                 viewer.zoom(0.8, 2000);
+             })
+         </script>
+         </body></html>"""
+     )
+ 
+     return f"""<iframe style="width: 100%; height: 600px" name="result" allow="midi; geolocation; microphone; camera;
+     display-capture; encrypted-media;" sandbox="allow-modals allow-forms
+     allow-scripts allow-same-origin allow-popups
+     allow-top-navigation-by-user-activation allow-downloads" allowfullscreen=""
+     allowpaymentrequest="" frameborder="0" srcdoc='{x}'></iframe>"""
+ 
+ 
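Editor's note: `molecule` returns an `<iframe srcdoc=...>` snippet embedding a 3Dmol.js viewer, presumably intended to be rendered by Gradio as raw HTML. A hypothetical standalone preview (the component wiring is an assumption, not part of this commit):

```python
import gradio as gr

# Preview the viewer outside the chat flow; "output_structure.pdb" is the file respond() writes below.
with gr.Blocks() as viewer_demo:
    html_out = gr.HTML()
    gr.Button("Show structure").click(lambda: molecule("output_structure.pdb"), outputs=html_out)

# viewer_demo.launch()
```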
+ def convert_outputs_to_pdb(outputs):
+     final_atom_positions = atom14_to_atom37(outputs["positions"][-1], outputs)
+     outputs = {k: v.to("cpu").numpy() for k, v in outputs.items()}
+     final_atom_positions = final_atom_positions.cpu().numpy()
+     final_atom_mask = outputs["atom37_atom_exists"]
+     pdbs = []
+     for i in range(outputs["aatype"].shape[0]):
+         aa = outputs["aatype"][i]
+         pred_pos = final_atom_positions[i]
+         mask = final_atom_mask[i]
+         resid = outputs["residue_index"][i] + 1
+         pred = OFProtein(
+             aatype=aa,
+             atom_positions=pred_pos,
+             atom_mask=mask,
+             residue_index=resid,
+             b_factors=outputs["plddt"][i],
+             chain_index=outputs["chain_index"][i] if "chain_index" in outputs else None,
+         )
+         pdbs.append(to_pdb(pred))
+     return pdbs
+ 
+ tokenizer = AutoTokenizer.from_pretrained("facebook/esmfold_v1")
+ model = EsmForProteinFolding.from_pretrained("facebook/esmfold_v1", low_cpu_mem_usage=True)
+ 
+ model = model.cuda()
+ 
+ model.esm = model.esm.half()
+ 
+ import torch
+ 
+ torch.backends.cuda.matmul.allow_tf32 = True
+ 
+ model.trunk.set_chunk_size(64)
+ 
+ 
+ 
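Editor's note: with the ESMFold tokenizer, model, and `convert_outputs_to_pdb` defined, the folding path added to `respond` below can be exercised on its own. A hypothetical smoke test (the sequence and file name are made up) that mirrors those calls:

```python
protein = "MAREALEGGCRGAHGA"  # e.g. the output of translate_nucleotide_sequence above

inputs = tokenizer([protein], return_tensors="pt", add_special_tokens=False)["input_ids"].cuda()
with torch.no_grad():
    outputs = model(inputs)

pdb_chains = convert_outputs_to_pdb(outputs)   # one PDB string per sequence in the batch
with open("test_structure.pdb", "w") as f:
    f.write("".join(pdb_chains))
```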
  pipe = pipeline("text-generation", model="kimou605/shadow-clown-BioMistral-7B-DARE", torch_dtype=torch.bfloat16, device_map="auto")
 
 
@@ -38,6 +203,20 @@ def respond(
      substring_after_last_space = outputs[0]["generated_text"][last_space_index + 7:]
      yield substring_after_last_space
 
+     sequences = extract_nucleotide_sequences(substring_after_last_space)
+     test_protein = translate_nucleotide_sequence("".join(sequences))  # join the extracted matches into one sequence string
+ 
+     tokenized_input = tokenizer([test_protein], return_tensors="pt", add_special_tokens=False)['input_ids']
+     tokenized_input = tokenized_input.cuda()
+     with torch.no_grad():
+         output = model(tokenized_input)
+     pdb = convert_outputs_to_pdb(output)
+     with open("output_structure.pdb", "w") as f:
+         f.write("".join(pdb))
+     image = take_care("output_structure.pdb")
+     html = molecule("output_structure.pdb")
+     return image, html
+ 
 
  demo = gr.ChatInterface(
      respond,
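Editor's note: `respond` is a generator (it yields the chat text), so the `return image, html` added at the end only sets the generator's `StopIteration` value; `gr.ChatInterface` will not display it. The snippet below just illustrates that Python behavior; surfacing the structure in the UI would need an extra `yield` or a separate output component, which this commit does not add:

```python
# A generator's return value is not yielded; it only rides along on StopIteration.
def g():
    yield "text"
    return "image", "html"

print(list(g()))  # ['text']
```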