ONNX Demo code?

#5
by james-passio - opened

I'm trying to use the ONNX models, but I'm getting bad output. Is there any working demo code available?

This is the code I'm using:

import onnxruntime as ort
from PIL import Image
import requests
import numpy as np
from transformers import AutoTokenizer, AutoProcessor

# Load the tokenizer and processor
tokenizer = AutoTokenizer.from_pretrained("llava-hf/llava-interleave-qwen-0.5b-hf")
processor = AutoProcessor.from_pretrained("llava-hf/llava-interleave-qwen-0.5b-hf")

# Download and load the image
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg"
image = Image.open(requests.get(url, stream=True).raw)

# Preprocess the image
image = image.resize((384, 384))  # Adjust size as needed
image_array = np.array(image).astype(np.float32) / 255.0
image_array = np.transpose(image_array, (2, 0, 1))  # CHW format
image_array = np.expand_dims(image_array, axis=0)  # Add batch dimension

# Load ONNX model
vision_encoder_session = ort.InferenceSession("/Users/jameskelly/Downloads/vision_encoder.onnx")
decoder_session = ort.InferenceSession("/Users/jameskelly/Downloads/decoder_model_merged.onnx")
embed_tokens_session = ort.InferenceSession("/Users/jameskelly/Downloads/embed_tokens.onnx")

# Run vision encoder
vision_input_name = vision_encoder_session.get_inputs()[0].name
vision_output_name = vision_encoder_session.get_outputs()[0].name
vision_features = vision_encoder_session.run([vision_output_name], {vision_input_name: image_array})[0]

# Prepare text input
conversation = [
    {
        "role": "system",
        "content": "You are a helpful assistant that answers questions about images."
    },
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud"},
            {"type": "image"},
        ],
    },
]

# Apply chat template
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

# Tokenize the prompt
inputs = tokenizer(prompt, return_tensors="np", padding=True, truncation=True)
input_ids = inputs.input_ids
attention_mask = inputs.attention_mask

# Prepare inputs
sequence_length = input_ids.shape[1]
batch_size = 1
num_layers = 24
head_dim = 64
num_heads = 16
past_sequence_length = 0  # Set to 0 for the initial pass

# Attention mask
attention_mask = np.ones((batch_size, past_sequence_length + sequence_length), dtype=np.int64)

# Position IDs
position_ids = np.arange(sequence_length, dtype=np.int64).reshape(1, -1)

# Past Key Values
past_key_values = {
    f"past_key_values.{i}.key": np.zeros((batch_size, num_heads, past_sequence_length, head_dim), dtype=np.float32)
    for i in range(num_layers)
}
past_key_values.update({
    f"past_key_values.{i}.value": np.zeros((batch_size, num_heads, past_sequence_length, head_dim), dtype=np.float32)
    for i in range(num_layers)
})

# Run embed tokens
embed_input_name = embed_tokens_session.get_inputs()[0].name
embed_output_name = embed_tokens_session.get_outputs()[0].name
token_embeddings = embed_tokens_session.run([embed_output_name], {embed_input_name: input_ids})[0]

# Combine token embeddings and vision features
combined_embeddings = np.concatenate([token_embeddings, vision_features], axis=1)

# Update attention_mask and position_ids
combined_length = combined_embeddings.shape[1]
attention_mask = np.ones((batch_size, combined_length), dtype=np.int64)
position_ids = np.arange(combined_length, dtype=np.int64).reshape(1, -1)

# Combine all inputs
decoder_inputs = {
    "attention_mask": attention_mask,
    "position_ids": position_ids,
    "inputs_embeds": combined_embeddings,
    **past_key_values
}

# Print input shapes
for name, value in decoder_inputs.items():
    print(f"{name} shape: {value.shape}")

# Run the decoder
decoder_input_names = [input.name for input in decoder_session.get_inputs()]
decoder_output_name = decoder_session.get_outputs()[0].name
outputs = decoder_session.run([decoder_output_name], {name: decoder_inputs[name] for name in decoder_input_names if name in decoder_inputs})[0]

print(f"Outputs shape: {outputs.shape}")
print(f"Outputs type: {outputs.dtype}")

# Print input token IDs
print(f"Input token IDs: {input_ids[0].tolist()}")

# Process outputs (decode tokens to text)
generated_tokens = []
eos_token_id = tokenizer.eos_token_id
max_new_tokens = 50

for i in range(max_new_tokens):
    logits = outputs[0, i]
    token_id = np.argmax(logits)
    
    if token_id == eos_token_id:
        break
    
    generated_tokens.append(token_id)
    
    # Print top 5 probable tokens for each step
    top_tokens = np.argsort(logits)[-5:][::-1]
    print(f"Step {i+1}: Top 5 tokens: {[(t, tokenizer.decode([t]), logits[t]) for t in top_tokens]}")

    # Prepare input for next token generation
    new_input_embeds = embed_tokens_session.run([embed_output_name], {embed_input_name: np.array([[token_id]])})[0]
    combined_embeddings = np.concatenate([combined_embeddings, new_input_embeds], axis=1)
    
    attention_mask = np.ones((1, combined_embeddings.shape[1]), dtype=np.int64)
    position_ids = np.arange(combined_embeddings.shape[1], dtype=np.int64).reshape(1, -1)
    
    decoder_inputs = {
        "attention_mask": attention_mask,
        "position_ids": position_ids,
        "inputs_embeds": combined_embeddings,
        **past_key_values
    }
    
    outputs = decoder_session.run([decoder_output_name], {name: decoder_inputs[name] for name in decoder_input_names if name in decoder_inputs})[0]

# Convert to list of integers
token_ids = [int(token) for token in generated_tokens]

print(f"Generated token IDs: {token_ids}")

# Decode tokens one by one
decoded_tokens = [tokenizer.decode([token]) for token in token_ids]
print(f"Decoded tokens: {decoded_tokens}")

# Full decoded output
decoded_output = tokenizer.decode(token_ids, skip_special_tokens=True)
print(f"Full decoded output: {decoded_output}")

The final decoded output contains Chinese characters: "可以通过 plea".

I assume I'm doing something wrong; any help would be greatly appreciated.

Llava Hugging Face org

cc @Xenova who added ONNX converted weights

How do you use the ONNX models? Could you please explain or provide some code?

Llava Hugging Face org

@james-passio looks like the past key values aren't being updated - maybe something to look into.

Thanks @Xenova, do you have a working example of how to use these ONNX models? That would be really useful.

Llava Hugging Face org
edited Aug 16

All usage I've worked on is via Transformers.js (i.e., in JavaScript). Here is a demo space I created for it: https://huggingface.co/spaces/llava-hf/llava-webgpu

and the code snippet will be very similar to the one at https://huggingface.co/onnx-community/nanoLLaVA-1.5 (see its README).

Feel free to look at the Transformers.js source code and adapt it into a Python version: http://github.com/xenova/transformers.js :) The above looks very close; you just need to:
(1) update the PKVs after each generation step, and
(2) update the attention mask and position IDs,
as in the sketch below.
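
Roughly something like this for the generation loop (an untested Python sketch, reusing decoder_session, embed_tokens_session, tokenizer, combined_embeddings, embed_input_name and embed_output_name from your snippet above; it assumes the merged decoder follows the usual Optimum-style naming, with a "logits" output plus "present.{i}.key" / "present.{i}.value" outputs that get fed back as "past_key_values.{i}.key" / "past_key_values.{i}.value"; check decoder_session.get_inputs() / get_outputs() if your export uses different names):

import numpy as np

output_names = [o.name for o in decoder_session.get_outputs()]

num_layers, num_heads, head_dim = 24, 16, 64  # same cache shape as in the snippet above

# Prefill pass: full prompt + image embeddings, empty cache
past_key_values = {
    f"past_key_values.{i}.{kv}": np.zeros((1, num_heads, 0, head_dim), dtype=np.float32)
    for i in range(num_layers) for kv in ("key", "value")
}
inputs_embeds = combined_embeddings
seq_len = combined_embeddings.shape[1]
attention_mask = np.ones((1, seq_len), dtype=np.int64)
position_ids = np.arange(seq_len, dtype=np.int64).reshape(1, -1)

generated_tokens = []
for _ in range(50):  # max_new_tokens
    results = dict(zip(output_names, decoder_session.run(output_names, {
        "inputs_embeds": inputs_embeds,
        "attention_mask": attention_mask,
        "position_ids": position_ids,
        **past_key_values,
    })))
    logits = results["logits"]

    # (1) Update the PKVs: feed this step's "present.*" tensors back in next time
    past_key_values = {
        name.replace("present", "past_key_values"): value
        for name, value in results.items() if name.startswith("present")
    }

    # Greedily pick the next token from the LAST position only
    token_id = int(np.argmax(logits[0, -1]))
    if token_id == tokenizer.eos_token_id:
        break
    generated_tokens.append(token_id)

    # (2) Update the attention mask and position IDs, and embed only the new token
    inputs_embeds = embed_tokens_session.run(
        [embed_output_name], {embed_input_name: np.array([[token_id]], dtype=np.int64)}
    )[0]
    seq_len += 1
    attention_mask = np.ones((1, seq_len), dtype=np.int64)
    position_ids = np.array([[seq_len - 1]], dtype=np.int64)

print(tokenizer.decode(generated_tokens, skip_special_tokens=True))

The key difference from your loop is that the cache carries the history, so each step feeds only the single new token's embedding, and the next-token logits are read from the last position rather than position i.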
