import gradio as gr
from huggingface_hub import InferenceClient
import os

token = os.getenv("TOKEN")
endpoint = os.getenv("ENDPOINT")

# initialize the InferenceClient, preferring the dedicated TGI endpoint when
# one is configured and falling back to the hosted Inference API otherwise
client = InferenceClient(
    model=endpoint or "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3.1-8B-Instruct",
    token=token,
)

# query the client in streaming mode, yielding the growing partial response
# so the chat UI updates as each new token arrives
def inference(message, history):
    partial_message = ""
    for new_token in client.text_generation(message, max_new_tokens=100, stream=True):
        partial_message += new_token
        yield partial_message

gr.ChatInterface(
    inference,
    chatbot=gr.Chatbot(height=300),
    textbox=gr.Textbox(placeholder="Chat with me!", container=False, scale=7),
    title="Gradio 🤝 TGI",
    description="This is the demo for a Gradio UI consuming a TGI endpoint with the Llama 3.1 8B Instruct model.",
    theme="abidlabs/Lime",
    examples=["Are tomatoes vegetables?"],
    cache_examples=True,
    retry_btn="Retry",
    undo_btn="Undo",
    clear_btn="Clear",
).queue().launch()
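
# A quick sanity check (a sketch, not part of the demo itself): before
# launching the app, you can confirm that the endpoint responds by making a
# single non-streaming call with the same client configuration. This assumes
# TOKEN and, optionally, ENDPOINT are set in the environment as above.
#
#   from huggingface_hub import InferenceClient
#   import os
#
#   client = InferenceClient(
#       model=os.getenv("ENDPOINT")
#       or "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3.1-8B-Instruct",
#       token=os.getenv("TOKEN"),
#   )
#   print(client.text_generation("Are tomatoes vegetables?", max_new_tokens=20))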