# Source: Hugging Face Spaces
# Space: gokaygokay/Gemma-2-llamacpp
# Status: Running on Zero
# File size: 5,709 Bytes

import codecs
import os

import gradio as gr
from huggingface_hub.file_download import http_get
from llama_cpp import Llama

# System prompt (in Spanish) describing the assistant persona "ABI". It is the
# content of the system turn built by `obtener_tokens_sistema` and the value
# shown in the non-interactive "Prompt del sistema" textbox of the UI.
SYSTEM_PROMPT = "Tú eres ABI, un asistente automático de habla española. Hablas con las personas y las ayudas."

def obtener_tokens_mensaje(modelo, rol, contenido):
    """Tokenize a single chat turn with the given model.

    The turn is rendered as three lines -- the role, the message text and
    the literal ``</s>`` marker -- then encoded as UTF-8 and handed to
    ``modelo.tokenize`` with ``special=True``.

    Args:
        modelo: Object exposing ``tokenize(data, special=...)``.
        rol: Role label written on the first line of the turn.
        contenido: Message text written on the second line.

    Returns:
        Whatever ``modelo.tokenize`` returns for the rendered turn.
    """
    turno = "\n".join((f"{rol}", f"{contenido}", "</s>"))
    return modelo.tokenize(turno.encode("utf-8"), special=True)

def obtener_tokens_sistema(modelo):
    """Tokenize the system turn built from ``SYSTEM_PROMPT``.

    Args:
        modelo: Model object forwarded to ``obtener_tokens_mensaje``.

    Returns:
        The tokens ``obtener_tokens_mensaje`` produces for a turn whose role
        is ``"system"`` and whose content is ``SYSTEM_PROMPT``.
    """
    # `obtener_tokens_mensaje` names its parameters `rol` / `contenido`.
    # Unpacking a dict keyed "role" / "content" into it raised TypeError
    # (unexpected keyword argument) on every call, so pass them explicitly.
    return obtener_tokens_mensaje(modelo, rol="system", contenido=SYSTEM_PROMPT)

def cargar_modelo(
    directorio: str = ".",
    nombre_modelo: str = "ecastera/eva-mistral-7b-spanish-GGUF",
    url_modelo: str = "https://huggingface.co/ecastera/eva-mistral-7b-spanish-GGUF/resolve/main/Turdus-trained-20-int4.gguf"
):
    """Download the model file if it is missing and load it with ``Llama``.

    Args:
        directorio: Directory under which the model file is stored.
        nombre_modelo: Path of the model file relative to ``directorio``.
            It may contain sub-directories (the default does); they are
            created on demand.
        url_modelo: URL the model file is downloaded from when it is not
            already present on disk.

    Returns:
        The ``Llama`` instance created from the local file with
        ``n_ctx=2048``.

    Raises:
        Errors from creating directories, writing the file, ``http_get`` or
        ``Llama`` are not caught here and propagate to the caller.
    """
    ruta_modelo_final = os.path.join(directorio, nombre_modelo)

    print("Descargando todos los archivos...")
    if not os.path.exists(ruta_modelo_final):
        # The file name may include sub-directories; without creating them
        # the open() below fails with FileNotFoundError.
        os.makedirs(os.path.dirname(ruta_modelo_final) or ".", exist_ok=True)

        # Download to a side file and rename only after success, so an
        # interrupted download never leaves a truncated file at the final
        # path (which the exists() check above would accept on the next run).
        ruta_temporal = ruta_modelo_final + ".part"
        with open(ruta_temporal, "wb") as f:
            http_get(url_modelo, f)
        os.replace(ruta_temporal, ruta_modelo_final)
    # Permission bits kept exactly as the original code set them.
    os.chmod(ruta_modelo_final, 0o777)
    print("¡Archivos descargados!")

    modelo = Llama(
        model_path=ruta_modelo_final,
        n_ctx=2048
    )

    print("¡Modelo cargado!")
    return modelo

# Load the model once, at import time. `cargar_modelo` downloads the model file
# first when it is not on disk. `bot` reads this module-level instance.
MODELO = cargar_modelo()

def usuario(mensaje, historial):
    """Append the user's message to the chat history as an unanswered turn.

    Args:
        mensaje: Text the user submitted.
        historial: List of ``[user_message, bot_message]`` pairs; it is not
            modified.

    Returns:
        A ``("", new_history)`` tuple: the empty string is the new value for
        the message textbox, and ``new_history`` is a new list equal to
        ``historial`` plus ``[mensaje, None]``.
    """
    turno_pendiente = [mensaje, None]
    return "", historial + [turno_pendiente]

def bot(
    historial,
    prompt_sistema,
    top_p,
    top_k,
    temp
):
    """Stream the model's reply to the last user message in ``historial``.

    The prompt is the system turn, every earlier ``usuario``/``bot`` turn,
    the last user message, and an opening ``bot`` role line for the model to
    continue.

    Args:
        historial: List of ``[user_message, bot_message]`` pairs. The last
            pair holds the message to answer; its second element is
            overwritten in place as text is generated.
        prompt_sistema: Accepted because the UI passes it, but not read; the
            system turn comes from ``obtener_tokens_sistema``.
        top_p: Forwarded to ``modelo.generate`` as ``top_p``.
        top_k: Forwarded to ``modelo.generate`` as ``top_k``.
        temp: Forwarded to ``modelo.generate`` as ``temp``.

    Yields:
        The same ``historial`` list after each generated token, with the
        last pair's second element set to the text decoded so far.
    """
    modelo = MODELO
    tokens = list(obtener_tokens_sistema(modelo))

    # Earlier turns: each user message, plus the bot reply when there is one.
    for mensaje_usuario, mensaje_bot in historial[:-1]:
        tokens.extend(
            obtener_tokens_mensaje(modelo=modelo, rol="usuario", contenido=mensaje_usuario)
        )
        if mensaje_bot:
            tokens.extend(
                obtener_tokens_mensaje(modelo=modelo, rol="bot", contenido=mensaje_bot)
            )

    # The message being answered now.
    ultimo_mensaje_usuario = historial[-1][0]
    tokens.extend(
        obtener_tokens_mensaje(modelo=modelo, rol="usuario", contenido=ultimo_mensaje_usuario)
    )

    # Open the bot turn so generation continues it.
    tokens.extend(modelo.tokenize("bot\n".encode("utf-8"), special=True))

    generador = modelo.generate(
        tokens,
        top_k=top_k,
        top_p=top_p,
        temp=temp
    )

    # Decoding each token's bytes on its own with errors="ignore" drops any
    # character whose UTF-8 bytes are split across several tokens. The
    # incremental decoder buffers an incomplete tail until the rest arrives.
    decodificador = codecs.getincrementaldecoder("utf-8")(errors="ignore")
    token_fin = modelo.token_eos()

    texto_parcial = ""
    for token in generador:
        if token == token_fin:
            break
        texto_parcial += decodificador.decode(modelo.detokenize([token]))
        historial[-1][1] = texto_parcial
        yield historial

with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # --- Header: title (preceded by an <img> tag with empty src) and description.
    favicon = '<img src="" width="48px" style="display: inline">'
    gr.Markdown(
        f"""<h1><center>{favicon}Saiga Mistral 7B GGUF Q4_K</center></h1>
        Esta es una demo de un modelo basado en Mistral que habla español
        """
    )

    # --- Main row: system prompt + chat on the left, sampling sliders on the right.
    with gr.Row():
        with gr.Column(scale=5):
            prompt_sistema = gr.Textbox(
                value=SYSTEM_PROMPT,
                label="Prompt del sistema",
                placeholder="",
                interactive=False,
            )
            chatbot = gr.Chatbot(height=400, label="Diálogo")
        with gr.Column(scale=1, min_width=80):
            with gr.Tab(label="Parámetros de generación"):
                top_p = gr.Slider(
                    label="Top-p",
                    minimum=0.0,
                    maximum=1.0,
                    step=0.05,
                    value=0.9,
                    interactive=True,
                )
                top_k = gr.Slider(
                    label="Top-k",
                    minimum=10,
                    maximum=100,
                    step=5,
                    value=30,
                    interactive=True,
                )
                temp = gr.Slider(
                    label="Temperatura",
                    minimum=0.0,
                    maximum=2.0,
                    step=0.01,
                    value=0.01,
                    interactive=True,
                )

    # --- Input row: message box and the three action buttons.
    with gr.Row():
        with gr.Column():
            msg = gr.Textbox(
                placeholder="Enviar mensaje",
                label="Enviar mensaje",
                show_label=False,
            )
        with gr.Column():
            with gr.Row():
                submit = gr.Button("Enviar")
                stop = gr.Button("Detener")
                clear = gr.Button("Limpiar")

    # --- Disclaimer.
    with gr.Row():
        gr.Markdown(
            "ADVERTENCIA: El modelo puede generar textos que sean incorrectos fácticamente o inapropiados éticamente. No nos hacemos responsables de esto."
        )

    def _encadenar_respuesta(disparador):
        """Wire a trigger to `usuario` and, when that succeeds, to `bot`.

        `disparador` is an event-registration method such as `msg.submit`.
        The return value is the event of the chained `bot` step.
        """
        paso_usuario = disparador(
            fn=usuario,
            inputs=[msg, chatbot],
            outputs=[msg, chatbot],
            queue=False,
        )
        return paso_usuario.success(
            fn=bot,
            inputs=[chatbot, prompt_sistema, top_p, top_k, temp],
            outputs=chatbot,
            queue=True,
        )

    # Pressing Enter in the message box, then clicking the send button.
    evento_enviar = _encadenar_respuesta(msg.submit)
    evento_click_enviar = _encadenar_respuesta(submit.click)

    # Stop generation: cancels both pending reply events.
    stop.click(
        fn=None,
        inputs=None,
        outputs=None,
        cancels=[evento_enviar, evento_click_enviar],
        queue=False,
    )

    # Clear the history by setting the chatbot value to None.
    clear.click(fn=lambda: None, inputs=None, outputs=chatbot, queue=False)

# Enable the request queue (at most 128 queued requests), then start the app.
demo.queue(max_size=128).launch()