File size: 2,884 Bytes
529b593
d3248a6
 
 
 
 
 
 
 
 
 
 
b200805
d3248a6
 
 
 
529b593
d3248a6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
from api_prediction import AptaBLE_Pipeline
import gradio as gr
import pandas as pd
import torch
import tempfile
from tabulate import tabulate
import itertools
import os
import random
# Visualization

# Page heading and blurb for the web UI.
title = 'DNAptaBLE Model Inference'
desc = (
    'AptaBLE (cross-attention network), trained to predict the likelihood '
    'a DNA aptamer will form a complex with a target protein!\n\n'
    'Pass in a FASTA-formatted file of all aptamers and input your protein '
    'target amino acid sequence. Your output scores are available for '
    'download via an Excel file.'
)

# Host name Gradio reads at launch; '0.0.0.0' is the wildcard address
# (presumably so the app is reachable from outside its container).
os.environ.update(GRADIO_SERVER_NAME='0.0.0.0')

# Module-level model wrapper, built once at import time and shared by
# get_scores(). A `global` statement is not needed here: names assigned at
# module scope are already global.
pipeline = AptaBLE_Pipeline(
    # The None-valued settings below look training/logging related and are
    # presumably unused for inference -- confirm against AptaBLE_Pipeline.
    lr=1e-6,
    weight_decay=None,
    epochs=None,
    model_type=None,
    model_version=None,
    model_save_path=None,
    accelerate_save_path=None,
    tensorboard_logdir=None,
    # Network hyper-parameters.
    d_model=128,
    d_ff=512,
    n_layers=6,
    n_heads=8,
    dropout=0.1,
    # Per the original author's note: loads the pretrained model using the
    # datasets included in the repo, so no separate training step is needed.
    load_best_pt=True,
    device='cuda',
    seed=1004)

def comparison(protein, aptamer_file, analysis=None):
    """Score every aptamer in a FASTA file against a single protein sequence.

    Args:
        protein: Protein sequence string; it is paired with every aptamer.
        aptamer_file: Path to the FASTA file holding the aptamers.
        analysis: Only echoed to stdout. Optional, so the function can be
            invoked with just the two inputs the Gradio interface provides
            (a required third positional argument raised ``TypeError``).

    Returns:
        A ``(text, path)`` tuple: ``text`` is an empty string (nothing is
        rendered in the textbox output) and ``path`` is the location of a
        temporary ``.xlsx`` file containing one row per aptamer.
    """
    print('analysis: ', analysis)

    r_names, aptamers = read_fasta(aptamer_file)
    proteins = [protein] * len(aptamers)
    scores = get_scores(aptamers, proteins)

    df = pd.DataFrame({
        'Protein': ['protein_prov.'] * len(aptamers),
        'Protein Seq': proteins,
        'Aptamer': r_names,
        'Aptamer Seq': aptamers,
    })
    # Assigned separately so the scores go through the same column-assignment
    # path as before, whatever sequence type get_scores returns.
    df['Score'] = scores

    # Reserve a unique path and close the handle *before* writing to it by
    # name; writing while the handle is still open fails on some platforms.
    # delete=False keeps the file around so it can be served for download.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as temp_file:
        temp_file_path = temp_file.name
    df.to_excel(temp_file_path, index=False, engine='openpyxl')

    # Second copy written next to the uploaded file (this one keeps the
    # index column); retained from the original behaviour.
    print('Saving to excel!')
    df.to_excel(f'{aptamer_file}.xlsx')

    torch.cuda.empty_cache()

    return '', temp_file_path

def read_fasta(file_path):
    """Parse a FASTA file into parallel lists of headers and sequences.

    Unlike a fixed two-lines-per-record reader, this tolerates blank lines,
    sequences wrapped over several lines, and a header at the very end of
    the file (which previously raised ``IndexError``).

    Args:
        file_path: Path to the FASTA file.

    Returns:
        ``(headers, sequences)``: two lists of equal length. Each header is
        the stripped header line including its leading ``'>'``; each sequence
        is the concatenation of the stripped lines that follow the header.
        Lines before the first header are ignored, and headers that have no
        sequence lines are skipped so the two lists stay aligned.
    """
    records = []  # (header, [sequence lines]) in file order
    with open(file_path, 'r') as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line:
                continue
            if line.startswith('>'):
                records.append((line, []))
            elif records:
                # Continuation of the most recent record's sequence.
                records[-1][1].append(line)

    headers = []
    sequences = []
    for header, parts in records:
        if parts:
            headers.append(header)
            sequences.append(''.join(parts))
    return headers, sequences

def get_scores(aptamers, proteins):
    """Run the shared pipeline on aptamer/protein pairs and return its scores.

    The model is moved to the GPU for the duration of the call and moved
    back to the CPU afterwards, including when inference raises, so a
    failed request does not leave the model on the GPU.

    Args:
        aptamers: List of aptamer sequences.
        proteins: List of protein sequences, passed alongside ``aptamers``.

    Returns:
        Whatever ``pipeline.inference`` returns for the given inputs.
    """
    pipeline.model.to('cuda')
    try:
        # The third argument is a list of zeros, one per aptamer
        # (presumably placeholder labels -- confirm against the pipeline).
        return pipeline.inference(aptamers, proteins, [0] * len(aptamers))
    finally:
        pipeline.model.to('cpu')


# Web UI: a protein-sequence textbox plus a FASTA upload in, a textbox plus a
# downloadable Excel file out. `title` was defined at module level but never
# handed to the interface, so the page had no heading; pass it through.
iface = gr.Interface(
    fn=comparison,
    inputs=[
        gr.Textbox(lines=2, placeholder="Protein"),
        gr.File(type="filepath"),
    ],
    outputs=[
        gr.Textbox(placeholder="Scores"),
        gr.File(label="Download Excel"),
    ],
    title=title,
    description=desc,
)

iface.launch()