Spaces:

AtomBio
/

AptaBLE

Sleeping

App Files Files Community

AptaBLE / gui.py

AtomBio

Create gui.py

d3248a6 verified 4 days ago

raw

history blame

3.37 kB

	from api_prediction import AptaTransPipeline_Dist
	import gradio as gr
	import pandas as pd
	import torch
	import tempfile
	from tabulate import tabulate
	from PIL import Image
	import itertools
	import os
	import RNA
	import matplotlib.pyplot as plt
	import matplotlib.image as mpimg
	import random
	from scipy.cluster.hierarchy import dendrogram, linkage
	# Visualization
	from Bio.Phylo.PhyloXML import Phylogeny
	from Bio import SeqIO
	from Bio.Seq import Seq
	from Bio.SeqRecord import SeqRecord
	from Bio import AlignIO
	from Bio.Align.Applications import MafftCommandline
	from Bio import Phylo
	from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceTreeConstructor
	import io

	# Ask Gradio to bind its server to all network interfaces
	# (GRADIO_SERVER_NAME is read by Gradio when the app is launched).
	os.environ['GRADIO_SERVER_NAME'] = '0.0.0.0'
	title = 'DNAptaESM2 Model Inference'
	desc = 'AptaBLE (cross-attention network), trained to predict the likelihood a DNA aptamer will form a complex with a target protein!\n\nPass in a FASTA-formatted file of all aptamers and input your protein target amino acid sequence. Your output scores are available for download via an Excel file.'

	# Module-level inference pipeline shared by the request handlers below.
	# (A `global` statement is not needed at module scope, so none is used.)
	# Training-only options are passed as None; NOTE(review): presumably they
	# are ignored for inference -- confirm against AptaTransPipeline_Dist.
	pipeline = AptaTransPipeline_Dist(
	    lr=1e-6,
	    weight_decay=None,
	    epochs=None,
	    model_type=None,
	    model_version=None,
	    model_save_path=None,
	    accelerate_save_path=None,
	    tensorboard_logdir=None,
	    d_model=128,
	    d_ff=512,
	    n_layers=6,
	    n_heads=8,
	    dropout=0.1,
	    # Per the original author's note, this loads the pretrained model
	    # using the datasets included in the repo, so no training step is
	    # required before inference.
	    load_best_pt=True,
	    device='cuda',
	    seed=1004,
	)

	def comparison(protein, aptamer_file, analysis=None):
	    """Score every aptamer in a FASTA file against one protein sequence.

	    Args:
	        protein: Amino-acid sequence of the target protein (text input).
	        aptamer_file: Path to the uploaded FASTA file of aptamers.
	        analysis: Optional analysis selector. It is only printed here.
	            It defaults to None because the Gradio interface in this file
	            supplies just two inputs; without a default every request
	            would fail with a missing-argument TypeError.

	    Returns:
	        A ``(text, excel_path)`` tuple: the text shown in the output
	        textbox and the path of a temporary ``.xlsx`` file holding one
	        row per aptamer with its score.
	    """
	    print('analysis: ', analysis)
	    # Nothing in this function appends to `display`, so the text output
	    # is currently always an empty string.
	    display = []
	    r_names, aptamers = read_fasta(aptamer_file)
	    # The same protein sequence is paired with every aptamer.
	    proteins = [protein] * len(aptamers)
	    df = pd.DataFrame(columns=['Protein', 'Protein Seq', 'Aptamer', 'Aptamer Seq', 'Score'])
	    scores = get_scores(aptamers, proteins)
	    df['Protein'] = ['protein_prov.'] * len(aptamers)
	    df['Aptamer'] = r_names
	    df['Protein Seq'] = proteins
	    df['Aptamer Seq'] = aptamers
	    df['Score'] = scores

	    # Reserve a unique path, then close the handle *before* writing:
	    # re-opening a NamedTemporaryFile by name while it is still open is
	    # not portable (it fails on Windows). delete=False keeps the file so
	    # Gradio can serve it for download after this function returns.
	    with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as temp_file:
	        temp_file_path = temp_file.name
	    df.to_excel(temp_file_path, index=False, engine='openpyxl')

	    print('Saving to excel!')
	    # Second copy written next to the uploaded file (this one keeps the
	    # index column). NOTE(review): looks redundant with the temp file
	    # above -- confirm nothing relies on it before removing.
	    df.to_excel(f'{aptamer_file}.xlsx')

	    torch.cuda.empty_cache()

	    return '\n'.join(display), temp_file_path

	def read_fasta(file_path):
	    """Parse a FASTA file into parallel lists of headers and sequences.

	    Blank lines are ignored and a sequence wrapped over several lines is
	    joined into one string, so the file does not have to follow a strict
	    "one header line, one sequence line" layout. Lines that appear before
	    the first header are ignored.

	    Args:
	        file_path: Path of the FASTA file to read.

	    Returns:
	        A ``(headers, sequences)`` tuple of equal-length lists. Each
	        header keeps its leading ``>``; each sequence has surrounding
	        whitespace and line breaks removed.

	    Raises:
	        ValueError: If a header is not followed by any sequence data.
	    """
	    headers = []
	    # One list of line fragments per record, joined once at the end.
	    fragments = []
	    with open(file_path, 'r') as file:
	        for raw_line in file:
	            line = raw_line.strip()
	            if not line:
	                continue
	            if line.startswith('>'):
	                headers.append(line)
	                fragments.append([])
	            elif fragments:
	                fragments[-1].append(line)

	    sequences = [''.join(parts) for parts in fragments]
	    for header, sequence in zip(headers, sequences):
	        if not sequence:
	            raise ValueError(f'FASTA record {header!r} has no sequence')
	    return headers, sequences

	def get_scores(aptamers, proteins):
	    """Run the shared pipeline's inference on aptamer/protein pairs.

	    The model is moved to the GPU for the call and moved back to the CPU
	    afterwards, even when inference raises, so a failed request does not
	    leave the model on the GPU.

	    Args:
	        aptamers: List of aptamer sequences.
	        proteins: List of protein sequences, passed alongside `aptamers`.

	    Returns:
	        Whatever ``pipeline.inference`` returns for the given pairs.
	    """
	    pipeline.model.to('cuda')
	    try:
	        # Third argument is a list of zeros, one per aptamer.
	        # NOTE(review): presumably placeholder labels -- confirm against
	        # AptaTransPipeline_Dist.inference.
	        return pipeline.inference(aptamers, proteins, [0] * len(aptamers))
	    finally:
	        pipeline.model.to('cpu')


	# Input widgets, in the order their values are passed to `comparison`:
	# the protein sequence text, then the uploaded aptamer file (as a path).
	_input_widgets = [
	    gr.Textbox(lines=2, placeholder="Protein"),
	    gr.File(type="filepath"),
	]

	# Output widgets, matching the two values `comparison` returns.
	_output_widgets = [
	    gr.Textbox(placeholder="Scores"),
	    gr.File(label="Download Excel"),
	]

	# Build the web interface around `comparison` and start serving it.
	iface = gr.Interface(
	    fn=comparison,
	    inputs=_input_widgets,
	    outputs=_output_widgets,
	    description=desc,
	)

	iface.launch()