"""Example script: answer a question about a long context with the Dolphin model."""

import os
import sys
import time

import torch
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

# The Dolphin config/model modules are expected next to this script; make them
# importable regardless of the current working directory.
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from configuration_dolphin import DolphinConfig  # noqa: E402
from modeling_dolphin import DolphinForCausalLM  # noqa: E402

# Number of "[memory_i]" tokens appended to the context, and of -1 placeholder
# ids inserted into the prompt.
MEMORY_SIZE = 32

# Generation stops as soon as this token id is produced.
# NOTE(review): presumably the tokenizer's end-of-text id -- confirm.
STOP_TOKEN_ID = 151643


def inference_instruct(mycontext, question, device="cuda:0"):
    """Greedily decode an answer to ``question`` given ``mycontext``.

    Relies on the module-level ``tokenizer`` and ``model`` globals, which are
    created in the ``__main__`` section of this script.

    Args:
        mycontext: Context text; it is tokenized together with ``MEMORY_SIZE``
            trailing ``[memory_i]`` tokens and passed to the model as
            ``context_input_ids`` / ``context_attention_mask``.
        question: Question text placed after the memory placeholders.
        device: Torch device string on which the input tensors are created.

    Returns:
        The decoded text of the generated tokens (the stop token excluded).

    Side effects:
        Prints the elapsed wall-clock time.
    """
    start_time = time.perf_counter()

    # The prompt is: a leading " ", then MEMORY_SIZE placeholder ids (-1), then
    # the question. The previous code built these two chunks with
    # ``prompt.split("")``, which always raises ``ValueError: empty separator``;
    # tokenizing the two pieces directly yields the intended pair of chunks.
    # NOTE(review): placeholder position inferred from the prompt layout --
    # confirm against the model's expected input format.
    prefix_ids = tokenizer(" ").input_ids
    question_ids = tokenizer(question).input_ids
    input_ids = (
        torch.tensor(
            prefix_ids + [-1] * MEMORY_SIZE + question_ids, dtype=torch.long
        )
        .unsqueeze(0)
        .to(device)
    )

    # Tokenize the context followed by the memory tokens.
    memory_suffix = "".join(f"[memory_{i}]" for i in range(MEMORY_SIZE))
    context_tokenized = tokenizer(mycontext + memory_suffix, return_tensors="pt")
    context_tokenized = {k: v.to(device) for k, v in context_tokenized.items()}
    context_token_count = context_tokenized["input_ids"].shape[1] - MEMORY_SIZE

    generated_token_ids = []
    # Inference only: skip autograd bookkeeping for every forward pass.
    with torch.no_grad():
        # The number of decoding steps is capped by the context length.
        for _ in range(context_token_count):
            logits = model(
                input_ids,
                context_input_ids=context_tokenized["input_ids"],
                context_attention_mask=context_tokenized["attention_mask"],
            ).logits
            next_token = logits[:, -1].argmax(-1)
            # Read the id once: each .item() forces a device-to-host sync.
            token_id = next_token.item()
            if token_id == STOP_TOKEN_ID:
                break
            generated_token_ids.append(token_id)
            input_ids = torch.cat([input_ids, next_token.unsqueeze(1)], dim=-1)

    result = tokenizer.decode(generated_token_ids)
    print(f"Time taken: {time.perf_counter() - start_time}")
    return result


if __name__ == "__main__":
    # Register the custom configuration and model with the Auto classes.
    AutoConfig.register("dolphin", DolphinConfig)
    AutoModelForCausalLM.register(DolphinConfig, DolphinForCausalLM)

    device_name = "cuda:0" if torch.cuda.is_available() else "cpu"

    # ``tokenizer`` and ``model`` are deliberately module-level names:
    # inference_instruct() reads them as globals.
    tokenizer = AutoTokenizer.from_pretrained(
        "NexaAIDev/Dolphin", trust_remote_code=True
    )
    model = AutoModelForCausalLM.from_pretrained(
        "NexaAIDev/Dolphin",
        trust_remote_code=True,
        torch_dtype=torch.bfloat16,
        device_map=device_name,
    )

    # Run an inference example.
    mycontext = (
        "Nexa AI is a Cupertino-based company founded in May 2023 that "
        "researches and develops models and tools for on-device AI "
        "applications. The company is founded by Alex and Zack. The company "
        "is known for its Octopus-series models, which rival large-scale "
        "language models in capabilities such as function-calling, "
        "multimodality, and action-planning, while remaining efficient and "
        "compact for edge device deployment. Nexa AI's mission is to advance "
        "on-device AI in collaboration with the global developer community. "
        "To this end, the company has created an on-device model hub for "
        "users to find, share, and collaborate on open-source AI models "
        "optimized for edge devices, as well as an SDK for developers to run "
        "and deploy AI models locally"
    )
    question = "Who founded Nexa AI?"

    # Pass the context and the device the model was loaded on.
    result = inference_instruct(mycontext, question, device=device_name)
    print("Result:", result)