HODACHI committed
Commit 2b88a75
1 Parent(s): 5597a6f

Update app.py

Files changed (1)
  1. app.py +31 -54
app.py CHANGED
@@ -1,86 +1,63 @@
 import gradio as gr
-from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
+from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer, pipeline
 import torch
 from threading import Thread
-import transformers
 
 MODEL_ID = "HODACHI/Llama-3.1-8B-EZO-1.1-it"
 DTYPE = torch.bfloat16
 
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-
 model = AutoModelForCausalLM.from_pretrained(
     MODEL_ID,
-    torch_dtype=torch.bfloat16,  # compute in bfloat16 to balance precision and speed
-    device_map="auto",  # place the model automatically across available devices
-    low_cpu_mem_usage=True,  # keep CPU memory usage low
+    torch_dtype=DTYPE,
+    device_map="auto",
+    low_cpu_mem_usage=True,
 )
 
-pipeline = transformers.pipeline(
-    "text-generation",  # task to run (text generation here)
-    model=model,  # model to use
-    tokenizer=tokenizer,  # tokenizer to use
-    device_map="auto",  # automatic device placement
+pipe = pipeline(
+    "text-generation",
+    model=model,
+    tokenizer=tokenizer,
+    device_map="auto",
 )
 
-def respond(
-    message,
-    history: list[tuple[str, str]],
-    max_tokens,
-    temperature,
-    top_p,
-):
+def generate_text(prompt, max_new_tokens, temperature, top_p):
+    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+    generation_kwargs = dict(
+        max_new_tokens=max_new_tokens,
+        temperature=temperature,
+        top_p=top_p,
+        do_sample=True,
+        streamer=streamer,
+    )
+
+    thread = Thread(target=pipe, kwargs=dict(text_inputs=prompt, **generation_kwargs))
+    thread.start()
+
+    return streamer
+
+def respond(message, history, max_tokens, temperature, top_p):
     chat = []
     chat.append({"role": "system", "content": "あなたは誠実で優秀な日本人のアシスタントです。特に指示が無い場合は、原則日本語で回答してください。"})
     for user, assistant in history:
         chat.append({"role": "user", "content": user})
         chat.append({"role": "assistant", "content": assistant})
     chat.append({"role": "user", "content": message})
-
     prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
-    #inputs = tokenizer.encode(prompt, add_special_tokens=False, return_tensors="pt").to(model.device)
-
-    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
 
-    #generation_kwargs = dict(
-    #    input_ids=inputs,
-    #    max_new_tokens=max_tokens,
-    #    temperature=temperature,
-    #    top_p=top_p,
-    #    do_sample=True,
-    #    streamer=streamer,
-    #)
+    streamer = generate_text(prompt, max_tokens, temperature, top_p)
 
-    thread = Thread(target=model.generate, kwargs=generation_kwargs)
-    thread.start()
-
-    #response = ""
-    #for new_text in streamer:
-    #    response += new_text
-    #    yield response
-    outputs = pipeline(
-        prompt,
-        max_new_tokens=40,  # maximum number of tokens to generate
-        do_sample=True,  # enable sampling for more varied output
-        temperature=0.7,  # generation diversity (higher = more diverse, lower = more deterministic)
-        top_p=0.95,  # cumulative-probability threshold for nucleus sampling
-    )
-
-    response = outputs[0]["generated_text"]
-    return response
 
+    response = ""
+    for new_text in streamer:
+        response += new_text
+        yield response
 
 demo = gr.ChatInterface(
     respond,
     additional_inputs=[
         gr.Slider(minimum=1, maximum=2048, value=150, step=1, label="Max new tokens"),
         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.95,
-            step=0.05,
-            label="Top-p (nucleus sampling)",
-        ),
+        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
     ],
 )
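
For reference, below is a minimal sketch of the streaming pattern the new generate_text/respond pair relies on: generation runs in a background thread while the foreground iterates a TextIteratorStreamer and yields the accumulated text. The sketch drives model.generate directly rather than the pipeline wrapper used in the commit; the helper name stream_reply and the default sampling values are illustrative, and it assumes the model and tokenizer loaded in app.py above.

# Sketch only: assumes `model` and `tokenizer` are the objects loaded in app.py above;
# `stream_reply` is a hypothetical helper, not part of this commit.
from threading import Thread

from transformers import TextIteratorStreamer

def stream_reply(chat, max_new_tokens=150, temperature=0.7, top_p=0.95):
    # Render the chat history into a prompt with the model's chat template.
    prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # The streamer collects tokens as they are generated and exposes them as an iterator.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # Run generation in a background thread so this function can consume the stream.
    generation_kwargs = dict(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
        streamer=streamer,
    )
    Thread(target=model.generate, kwargs=generation_kwargs).start()

    # Yield the growing response chunk by chunk, mirroring what respond() hands to Gradio.
    text = ""
    for chunk in streamer:
        text += chunk
        yield text

gr.ChatInterface treats a generator function like respond as a streaming handler, so each partial string it yields replaces the previous one in the chat window until generation finishes.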