# File size: 3,366 Bytes
# d3248a6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
from api_prediction import AptaTransPipeline_Dist
import gradio as gr
import pandas as pd
import torch
import tempfile
from tabulate import tabulate
from PIL import Image
import itertools
import os
import RNA
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import random
from scipy.cluster.hierarchy import dendrogram, linkage
# Visualization
from Bio.Phylo.PhyloXML import Phylogeny
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import AlignIO
from Bio.Align.Applications import MafftCommandline
from Bio import Phylo
from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceTreeConstructor
import io

# Ask Gradio to bind to all network interfaces (rather than localhost only)
# so the app is reachable from outside the host/container.
os.environ['GRADIO_SERVER_NAME'] = '0.0.0.0'

# Title string for the demo. NOTE(review): not referenced by the visible
# Interface construction; it also names a different model than `desc` does.
title = 'DNAptaESM2 Model Inference'

# Description text passed to the Gradio interface.
desc = 'AptaBLE (cross-attention network), trained to predict the likelihood a DNA aptamer will form a complex with a target protein!\n\nPass in a FASTA-formatted file of all aptamers and input your protein target amino acid sequence. Your output scores are available for download via an Excel file.'

# Module-level inference pipeline, built once at import time and used by
# `get_scores`. No `global` statement is needed: a name assigned at module
# scope is already global.
#
# The training-related settings (weight decay, epochs, save paths, logging
# directory, ...) are passed as None. NOTE(review): presumably they are unused
# when the pipeline only runs inference -- confirm against
# api_prediction.AptaTransPipeline_Dist.
pipeline = AptaTransPipeline_Dist(
    lr=1e-6,
    weight_decay=None,
    epochs=None,
    model_type=None,
    model_version=None,
    model_save_path=None,
    accelerate_save_path=None,
    tensorboard_logdir=None,
    d_model=128,
    d_ff=512,
    n_layers=6,
    n_heads=8,
    dropout=0.1,
    # Per the original author's note, this loads the pretrained model using
    # the datasets included in the repo, so no separate training step is run.
    load_best_pt=True,
    device='cuda',
    seed=1004,
)

def comparison(protein, aptamer_file, analysis=None):
    """Score every aptamer in a FASTA file against a single protein sequence.

    Args:
        protein: Target protein sequence; it is paired with every aptamer.
        aptamer_file: Path to the FASTA file containing the aptamers.
        analysis: Only logged. It is optional because the Gradio interface in
            this file wires two inputs (protein and file), so a required third
            positional parameter could never be supplied.

    Returns:
        A ``(text, excel_path)`` tuple. ``text`` is always an empty string
        (NOTE(review): nothing is ever collected for display -- confirm
        whether scores should also be shown as text). ``excel_path`` is the
        path of a temporary ``.xlsx`` file with one row per aptamer and the
        columns Protein, Protein Seq, Aptamer, Aptamer Seq and Score.
    """
    print('analysis: ', analysis)

    r_names, aptamers = read_fasta(aptamer_file)
    proteins = [protein] * len(aptamers)

    try:
        # Skip the model entirely when the file held no records, instead of
        # sending an empty batch through inference.
        scores = get_scores(aptamers, proteins) if aptamers else []
    finally:
        # Release cached GPU memory even when scoring fails.
        torch.cuda.empty_cache()

    df = pd.DataFrame(columns=['Protein', 'Protein Seq', 'Aptamer', 'Aptamer Seq', 'Score'])
    # The protein has no name in this UI, so a fixed placeholder label is used.
    df['Protein'] = ['protein_prov.'] * len(aptamers)
    df['Aptamer'] = r_names
    df['Protein Seq'] = proteins
    df['Aptamer Seq'] = aptamers
    df['Score'] = scores

    # Reserve a unique path and close the handle *before* writing: re-opening
    # a still-open NamedTemporaryFile by name is not portable (fails on
    # Windows). delete=False keeps the file around so it can be downloaded.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as temp_file:
        temp_file_path = temp_file.name

    print('Saving to excel!')
    # Single write of the workbook; the previous extra copy written next to
    # the uploaded file was never returned to the caller.
    df.to_excel(temp_file_path, index=False, engine='openpyxl')

    return '', temp_file_path

def read_fasta(file_path):
    """Parse a FASTA file into parallel lists of headers and sequences.

    Unlike a strict "header line, sequence line" reader, this tolerates blank
    lines, sequences wrapped over several lines, and a trailing header with no
    sequence after it.

    Args:
        file_path: Path to the FASTA file.

    Returns:
        A ``(headers, sequences)`` tuple of equally long lists. Each header
        keeps its leading ``>``. Each sequence is the concatenation of the
        stripped lines that follow its header; a record without any sequence
        line yields an empty string so both lists stay aligned.
    """
    headers = []
    fragments = []  # one list of sequence-line pieces per header
    with open(file_path, 'r') as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line:
                continue
            if line.startswith('>'):
                headers.append(line)
                fragments.append([])
            elif fragments:
                fragments[-1].append(line)
            # Text appearing before the first header belongs to no record
            # and is ignored.
    sequences = [''.join(parts) for parts in fragments]
    return headers, sequences

def get_scores(aptamers, proteins):
    """Run the shared pipeline's inference on paired aptamers and proteins.

    The model is moved to the GPU for the duration of the call and moved back
    to the CPU afterwards, including when inference raises, so a failed
    request does not leave the model on the GPU.

    Args:
        aptamers: List of aptamer sequences.
        proteins: List of protein sequences, one per aptamer.

    Returns:
        Whatever ``pipeline.inference`` returns for the batch.
    """
    pipeline.model.to('cuda')
    try:
        # The third argument is a list of zeros, one per aptamer.
        # NOTE(review): presumably placeholder labels that inference ignores
        # -- confirm against api_prediction.
        return pipeline.inference(aptamers, proteins, [0] * len(aptamers))
    finally:
        pipeline.model.to('cpu')


# Input components: a two-line text box for the protein sequence and a file
# upload that is delivered to the handler as a filesystem path.
# NOTE(review): `comparison` also declares an `analysis` parameter that no
# input component here feeds.
demo_inputs = [
    gr.Textbox(lines=2, placeholder="Protein"),
    gr.File(type="filepath"),
]

# Output components: a text box plus the downloadable Excel workbook.
demo_outputs = [
    gr.Textbox(placeholder="Scores"),
    gr.File(label="Download Excel"),
]

iface = gr.Interface(
    fn=comparison,
    inputs=demo_inputs,
    outputs=demo_outputs,
    description=desc,
)

# Start the web server as soon as the script is executed.
iface.launch()