"""Gradio web front end that scores DNA aptamers against a protein target."""

import os

# Bind the Gradio server to all network interfaces. This is set before any
# other import because some gradio versions read GRADIO_SERVER_NAME when the
# package is first imported, in which case a later assignment is ignored.
# NOTE(review): confirm against the pinned gradio version.
os.environ['GRADIO_SERVER_NAME'] = '0.0.0.0'

from api_prediction import AptaTransPipeline_Dist
import gradio as gr
import pandas as pd
import torch
import tempfile
from tabulate import tabulate
from PIL import Image
import itertools
import RNA
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import random
from scipy.cluster.hierarchy import dendrogram, linkage

# Visualization
from Bio.Phylo.PhyloXML import Phylogeny
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import AlignIO
from Bio.Align.Applications import MafftCommandline
from Bio import Phylo
from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceTreeConstructor
import io

# Page title for the app (spelling of "Inference" corrected).
title = 'DNAptaESM2 Model Inference'

# Description text shown to the user by the Gradio interface.
desc = (
    'AptaBLE (cross-attention network), trained to predict the likelihood a '
    'DNA aptamer will form a complex with a target protein!\n\n'
    'Pass in a FASTA-formatted file of all aptamers and input your protein '
    'target amino acid sequence. Your output scores are available for '
    'download via an Excel file.'
)
# Single shared inference pipeline, built once at import time.
pipeline = AptaTransPipeline_Dist(
    lr=1e-6,
    weight_decay=None,
    epochs=None,
    model_type=None,
    model_version=None,
    model_save_path=None,
    accelerate_save_path=None,
    tensorboard_logdir=None,
    d_model=128,
    d_ff=512,
    n_layers=6,
    n_heads=8,
    dropout=0.1,
    # already loads the pretrained model using the datasets included in repo
    load_best_pt=True,
    device='cuda',
    seed=1004,
)


def comparison(protein, aptamer_file, analysis=None):
    """Score every aptamer in a FASTA file against one protein sequence.

    Args:
        protein: Protein sequence text entered by the user.
        aptamer_file: Path to the uploaded FASTA file of aptamers.
        analysis: Only echoed to stdout. It defaults to ``None`` because the
            Gradio interface below supplies just two inputs; without the
            default every request failed with a missing-argument TypeError.

    Returns:
        A ``(text, excel_path)`` tuple. ``text`` is currently always the
        empty string (nothing is appended to the display list) and
        ``excel_path`` is the path of a temporary ``.xlsx`` file holding
        the score table.
    """
    print('analysis: ', analysis)
    display = []

    r_names, aptamers = read_fasta(aptamer_file)
    # The same protein is paired with every aptamer.
    proteins = [protein] * len(aptamers)

    scores = get_scores(aptamers, proteins)

    df = pd.DataFrame(columns=['Protein', 'Protein Seq', 'Aptamer', 'Aptamer Seq', 'Score'])
    df['Protein'] = ['protein_prov.'] * len(aptamers)
    df['Aptamer'] = r_names
    df['Protein Seq'] = proteins
    df['Aptamer Seq'] = aptamers
    df['Score'] = scores

    # Reserve a unique path and close the handle *before* pandas writes to
    # it: reopening a still-open NamedTemporaryFile by name fails on Windows.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as temp_file:
        temp_file_path = temp_file.name
    df.to_excel(temp_file_path, index=False, engine='openpyxl')

    # A second copy (with the index column) is written next to the upload.
    print('Saving to excel!')
    df.to_excel(f'{aptamer_file}.xlsx')

    torch.cuda.empty_cache()

    return '\n'.join(display), temp_file_path


def read_fasta(file_path):
    """Read FASTA records from ``file_path``.

    Blank lines are skipped, sequences wrapped over several lines are
    concatenated, and a header with no sequence yields an empty string
    instead of raising IndexError. Text before the first header is ignored.

    Args:
        file_path: Path to a FASTA-formatted text file.

    Returns:
        ``(headers, sequences)``: two parallel lists. Headers keep their
        leading ``>``.
    """
    headers = []
    chunks = []  # chunks[i] collects the sequence lines belonging to headers[i]

    with open(file_path, 'r') as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line:
                continue
            if line.startswith('>'):
                headers.append(line)
                chunks.append([])
            elif chunks:
                chunks[-1].append(line)

    sequences = [''.join(parts) for parts in chunks]
    return headers, sequences


def get_scores(aptamers, proteins):
    """Run the shared pipeline on paired aptamer/protein sequences.

    The model is moved to the GPU for the call and always moved back to the
    CPU afterwards, even when inference raises.

    Args:
        aptamers: List of aptamer sequences.
        proteins: List of protein sequences, one per aptamer.

    Returns:
        Whatever ``pipeline.inference`` returns for the given pairs.
    """
    pipeline.model.to('cuda')
    try:
        # The third argument is a list of zeros, one per aptamer --
        # presumably placeholder labels; confirm against
        # AptaTransPipeline_Dist.inference.
        return pipeline.inference(aptamers, proteins, [0] * len(aptamers))
    finally:
        pipeline.model.to('cpu')


iface = gr.Interface(
    fn=comparison,
    inputs=[
        gr.Textbox(lines=2, placeholder="Protein"),
        gr.File(type="filepath"),
    ],
    outputs=[
        gr.Textbox(placeholder="Scores"),
        gr.File(label="Download Excel"),
    ],
    description=desc,
)

# Pass the bind address explicitly so it does not depend on whether the
# GRADIO_SERVER_NAME environment variable was set before gradio was imported.
iface.launch(server_name=os.environ.get('GRADIO_SERVER_NAME', '0.0.0.0'))