ONNX Demo code?

#5
by james-passio - opened

I'm trying to use the ONNX models, but I'm getting bad output. Is there any working demo code available?

This is the code I'm using:

import onnxruntime as ort
from PIL import Image
import requests
import numpy as np
from transformers import AutoTokenizer, AutoProcessor

# Load the tokenizer and processor
tokenizer = AutoTokenizer.from_pretrained("llava-hf/llava-interleave-qwen-0.5b-hf")
processor = AutoProcessor.from_pretrained("llava-hf/llava-interleave-qwen-0.5b-hf")

# Download and load the image
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg"
image = Image.open(requests.get(url, stream=True).raw)

# Preprocess the image
image = image.resize((384, 384))  # Adjust size as needed
image_array = np.array(image).astype(np.float32) / 255.0
image_array = np.transpose(image_array, (2, 0, 1))  # CHW format
image_array = np.expand_dims(image_array, axis=0)  # Add batch dimension

# Load ONNX model
vision_encoder_session = ort.InferenceSession("/Users/jameskelly/Downloads/vision_encoder.onnx")
decoder_session = ort.InferenceSession("/Users/jameskelly/Downloads/decoder_model_merged.onnx")
embed_tokens_session = ort.InferenceSession("/Users/jameskelly/Downloads/embed_tokens.onnx")

# Run vision encoder
vision_input_name = vision_encoder_session.get_inputs()[0].name
vision_output_name = vision_encoder_session.get_outputs()[0].name
vision_features = vision_encoder_session.run([vision_output_name], {vision_input_name: image_array})[0]

# Prepare text input
conversation = [
    {
        "role": "system",
        "content": "You are a helpful assistant that answers questions about images."
    },
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud"},
            {"type": "image"},
        ],
    },
]

# Apply chat template
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

# Tokenize the prompt
inputs = tokenizer(prompt, return_tensors="np", padding=True, truncation=True)
input_ids = inputs.input_ids
attention_mask = inputs.attention_mask

# Prepare inputs
sequence_length = input_ids.shape[1]
batch_size = 1
num_layers = 24
head_dim = 64
num_heads = 16
past_sequence_length = 0  # Set to 0 for the initial pass

# Attention mask
attention_mask = np.ones((batch_size, past_sequence_length + sequence_length), dtype=np.int64)

# Position IDs
position_ids = np.arange(sequence_length, dtype=np.int64).reshape(1, -1)

# Past Key Values
past_key_values = {
    f"past_key_values.{i}.key": np.zeros((batch_size, num_heads, past_sequence_length, head_dim), dtype=np.float32)
    for i in range(num_layers)
}
past_key_values.update({
    f"past_key_values.{i}.value": np.zeros((batch_size, num_heads, past_sequence_length, head_dim), dtype=np.float32)
    for i in range(num_layers)
})

# Run embed tokens
embed_input_name = embed_tokens_session.get_inputs()[0].name
embed_output_name = embed_tokens_session.get_outputs()[0].name
token_embeddings = embed_tokens_session.run([embed_output_name], {embed_input_name: input_ids})[0]

# Combine token embeddings and vision features
combined_embeddings = np.concatenate([token_embeddings, vision_features], axis=1)

# Update attention_mask and position_ids
combined_length = combined_embeddings.shape[1]
attention_mask = np.ones((batch_size, combined_length), dtype=np.int64)
position_ids = np.arange(combined_length, dtype=np.int64).reshape(1, -1)

# Combine all inputs
decoder_inputs = {
    "attention_mask": attention_mask,
    "position_ids": position_ids,
    "inputs_embeds": combined_embeddings,
    **past_key_values
}

# Print input shapes
for name, value in decoder_inputs.items():
    print(f"{name} shape: {value.shape}")

# Run the decoder
decoder_input_names = [input.name for input in decoder_session.get_inputs()]
decoder_output_name = decoder_session.get_outputs()[0].name
outputs = decoder_session.run([decoder_output_name], {name: decoder_inputs[name] for name in decoder_input_names if name in decoder_inputs})[0]

print(f"Outputs shape: {outputs.shape}")
print(f"Outputs type: {outputs.dtype}")

# Print input token IDs
print(f"Input token IDs: {input_ids[0].tolist()}")

# Process outputs (decode tokens to text)
generated_tokens = []
eos_token_id = tokenizer.eos_token_id
max_new_tokens = 50

for i in range(max_new_tokens):
    logits = outputs[0, i]
    token_id = np.argmax(logits)
    
    if token_id == eos_token_id:
        break
    
    generated_tokens.append(token_id)
    
    # Print top 5 probable tokens for each step
    top_tokens = np.argsort(logits)[-5:][::-1]
    print(f"Step {i+1}: Top 5 tokens: {[(t, tokenizer.decode([t]), logits[t]) for t in top_tokens]}")

    # Prepare input for next token generation
    new_input_embeds = embed_tokens_session.run([embed_output_name], {embed_input_name: np.array([[token_id]])})[0]
    combined_embeddings = np.concatenate([combined_embeddings, new_input_embeds], axis=1)
    
    attention_mask = np.ones((1, combined_embeddings.shape[1]), dtype=np.int64)
    position_ids = np.arange(combined_embeddings.shape[1], dtype=np.int64).reshape(1, -1)
    
    decoder_inputs = {
        "attention_mask": attention_mask,
        "position_ids": position_ids,
        "inputs_embeds": combined_embeddings,
        **past_key_values
    }
    
    outputs = decoder_session.run([decoder_output_name], {name: decoder_inputs[name] for name in decoder_input_names if name in decoder_inputs})[0]

# Convert to list of integers
token_ids = [int(token) for token in generated_tokens]

print(f"Generated token IDs: {token_ids}")

# Decode tokens one by one
decoded_tokens = [tokenizer.decode([token]) for token in token_ids]
print(f"Decoded tokens: {decoded_tokens}")

# Full decoded output
decoded_output = tokenizer.decode(token_ids, skip_special_tokens=True)
print(f"Full decoded output: {decoded_output}")

The final decoded output contains Chinese characters: "可以通过 plea".

I assume I'm doing something wrong; any help would be greatly appreciated.

Llava Hugging Face org

cc @Xenova who added ONNX converted weights

How do you use the ONNX models? Could you please explain or provide some code?

Llava Hugging Face org

@james-passio looks like the past key values aren't being updated - maybe something to look into.

Thanks @Xenova, do you have a working example of how to use these ONNX models? That would be really useful.

Llava Hugging Face org
edited Aug 16

All usage I've worked on is via Transformers.js (i.e., in JavaScript). Here is a demo space I created for it: https://huggingface.co/spaces/llava-hf/llava-webgpu

and the code snippet will be very similar to the one at https://huggingface.co/onnx-community/nanoLLaVA-1.5 (see its README).

Feel free to look at the Transformers.js source code and adapt it into a Python version: http://github.com/xenova/transformers.js :) The above looks very close; you just need to:
(1) update the PKVs after each generation step, and
(2) update the attention mask and position IDs,
as in the sketch below.
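
Roughly something like this for the generation loop (an untested Python sketch, reusing decoder_session, embed_tokens_session, tokenizer, combined_embeddings, embed_input_name and embed_output_name from your snippet above; it assumes the merged decoder follows the usual Optimum-style naming, with a "logits" output plus "present.{i}.key" / "present.{i}.value" outputs that get fed back as "past_key_values.{i}.key" / "past_key_values.{i}.value"; check decoder_session.get_inputs() / get_outputs() if your export uses different names):

import numpy as np

output_names = [o.name for o in decoder_session.get_outputs()]

num_layers, num_heads, head_dim = 24, 16, 64  # same cache shape as in the snippet above

# Prefill pass: full prompt + image embeddings, empty cache
past_key_values = {
    f"past_key_values.{i}.{kv}": np.zeros((1, num_heads, 0, head_dim), dtype=np.float32)
    for i in range(num_layers) for kv in ("key", "value")
}
inputs_embeds = combined_embeddings
seq_len = combined_embeddings.shape[1]
attention_mask = np.ones((1, seq_len), dtype=np.int64)
position_ids = np.arange(seq_len, dtype=np.int64).reshape(1, -1)

generated_tokens = []
for _ in range(50):  # max_new_tokens
    results = dict(zip(output_names, decoder_session.run(output_names, {
        "inputs_embeds": inputs_embeds,
        "attention_mask": attention_mask,
        "position_ids": position_ids,
        **past_key_values,
    })))
    logits = results["logits"]

    # (1) Update the PKVs: feed this step's "present.*" tensors back in next time
    past_key_values = {
        name.replace("present", "past_key_values"): value
        for name, value in results.items() if name.startswith("present")
    }

    # Greedily pick the next token from the LAST position only
    token_id = int(np.argmax(logits[0, -1]))
    if token_id == tokenizer.eos_token_id:
        break
    generated_tokens.append(token_id)

    # (2) Update the attention mask and position IDs, and embed only the new token
    inputs_embeds = embed_tokens_session.run(
        [embed_output_name], {embed_input_name: np.array([[token_id]], dtype=np.int64)}
    )[0]
    seq_len += 1
    attention_mask = np.ones((1, seq_len), dtype=np.int64)
    position_ids = np.array([[seq_len - 1]], dtype=np.int64)

print(tokenizer.decode(generated_tokens, skip_special_tokens=True))

The key difference from your loop is that the cache carries the history, so each step feeds only the single new token's embedding, and the next-token logits are read from the last position rather than position i.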
