nisten committed
Commit 8ade5d7
1 Parent(s): c720fed

Update app.py

Files changed (1): app.py (+13, -9)
app.py CHANGED
@@ -6,6 +6,7 @@ import sys
 
 # Install required packages
 subprocess.check_call([sys.executable, "-m", "pip", "install", "-U", "--force-reinstall", "--no-deps", "einops", "accelerate", "torch", "git+https://github.com/Muennighoff/transformers.git@olmoe"])
+#subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
 
 from transformers import OlmoeForCausalLM, AutoTokenizer, TextIteratorStreamer
 from threading import Thread
@@ -18,11 +19,12 @@ try:
     model = OlmoeForCausalLM.from_pretrained(
         model_name,
         trust_remote_code=True,
-        torch_dtype=torch.bfloat16,
+        torch_dtype=torch.bfloat16,  # Using bfloat16 for lower precision
         low_cpu_mem_usage=True,
         device_map="auto",
+        #_attn_implementation="flash_attention_2"  # Enable Flash Attention 2
     ).to(DEVICE)
-    model.gradient_checkpointing_enable()
+    model.gradient_checkpointing_enable()  # Enable gradient checkpointing
     tokenizer = AutoTokenizer.from_pretrained(model_name)
 except Exception as e:
     print(f"Error loading model: {e}")
@@ -41,8 +43,10 @@ def generate_response(message, history, temperature, max_new_tokens):
         return
 
     messages = [{"role": "system", "content": system_prompt}]
-    for msg in history:
-        messages.append({"role": "user" if msg["role"] == "human" else "assistant", "content": msg["content"]})
+    for user_msg, assistant_msg in history:
+        messages.append({"role": "user", "content": user_msg})
+        if assistant_msg:
+            messages.append({"role": "assistant", "content": assistant_msg})
     messages.append({"role": "user", "content": message})
 
     inputs = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to(DEVICE)
@@ -84,7 +88,7 @@ css = """
 """
 
 with gr.Blocks(css=css) as demo:
-    gr.Markdown("# Nisten's Karpathy Chatbot with OLMoE (CPU only instance feel free to clone!)")
+    gr.Markdown("# Nisten's Karpathy Chatbot with OSS OLMoE (CPU 2core only, feel free to clone)")
     chatbot = gr.Chatbot(elem_id="output")
     msg = gr.Textbox(label="Meow")
     with gr.Row():
@@ -93,14 +97,14 @@ with gr.Blocks(css=css) as demo:
         clear = gr.Button("Clear")
 
     def user(user_message, history):
-        return "", history + [{"role": "human", "content": user_message}]
+        return "", history + [[user_message, None]]
 
     def bot(history, temp, max_tokens):
-        user_message = history[-1]["content"]
+        user_message = history[-1][0]
         bot_message = ""
         for token in generate_response(user_message, history[:-1], temp, max_tokens):
             bot_message = token
-            history.append({"role": "assistant", "content": bot_message})
+            history[-1][1] = bot_message
             yield history
 
     msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
@@ -110,4 +114,4 @@ with gr.Blocks(css=css) as demo:
 
 if __name__ == "__main__":
     demo.queue(api_open=True, max_size=10)  # Limiting queue size
-    demo.launch(debug=True, show_api=True, share=False)
+    demo.launch(debug=True, show_api=True, share=False)  # Disabled sharing for security
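
Note on the commented-out Flash Attention lines: this Space runs on a CPU instance, so both the flash-attn install and the `_attn_implementation` kwarg stay disabled in the commit. On a GPU box, recent transformers releases expose the same switch through the public `attn_implementation` argument; a minimal sketch, assuming a CUDA-capable instance, a working flash-attn install, and a placeholder checkpoint name (app.py defines `model_name` elsewhere):

```python
# Hypothetical GPU variant of the model load; not what this Space runs.
import torch
from transformers import OlmoeForCausalLM

model = OlmoeForCausalLM.from_pretrained(
    "allenai/OLMoE-1B-7B-0924-Instruct",      # assumed value of model_name
    torch_dtype=torch.bfloat16,
    device_map="auto",
    attn_implementation="flash_attention_2",  # public form of the underscore kwarg
)
```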
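The rest of the commit moves the Chatbot history from dict-style messages to Gradio's pair format: each turn is a `[user_message, assistant_message]` list, and streaming rewrites the last pair in place rather than appending a new entry per token, which is what the old `bot` did. A self-contained sketch of that pattern, with a hypothetical `echo_stream` standing in for `generate_response`:

```python
# Minimal sketch of the pair-format history used in this commit.
# `echo_stream` is a stand-in for the real generate_response generator.

def echo_stream(message):
    partial = ""
    for word in message.split():
        partial += word + " "
        yield partial.strip()

def user(user_message, history):
    # Append a new [user, assistant] pair; the assistant slot starts empty.
    return "", history + [[user_message, None]]

def bot(history):
    user_message = history[-1][0]
    for partial in echo_stream(user_message):
        # Stream by rewriting the last pair's assistant slot in place.
        history[-1][1] = partial
        yield history

history = []
_, history = user("hello streaming world", history)
for snapshot in bot(history):
    print(snapshot[-1][1])
# hello
# hello streaming
# hello streaming world
```

Yielding the whole list after each in-place update is what lets gr.Chatbot re-render a single growing assistant message instead of stacking duplicates.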