import tkinter as tk
from tkinter import scrolledtext
from gpt.OpenAI.openAI import get_openai_response
from gpt.ChatGLM.chatglm_client import get_chatGLM2_6b_response
def send_message(event=None):
    """Send the text in the entry box to the model and display the reply.

    Serves both as the Send button command (called without arguments) and
    as the ``<Return>`` key binding (called with the Tk event object), which
    is why ``event`` is optional; its value is not used.

    Empty or whitespace-only input is ignored. If the model call raises, the
    error is shown in the chat history and then re-raised so Tk still reports
    the traceback.
    """
    user_input = input_text.get()
    # Do not query the model for an empty submission.
    if not user_input.strip():
        return

    chat_history.insert(tk.END, f"You: {user_input}\n")
    chat_history.see(tk.END)
    input_text.set("")
    # The model call below runs on the Tk thread and blocks the event loop;
    # flush pending redraws first so the user's message is visible while the
    # reply is being generated.
    chat_history.update_idletasks()

    try:
        response = get_chatGLM2_6b_response(user_input)
    except Exception as exc:
        # UI boundary: surface the failure to the user, then let it propagate.
        chat_history.insert(tk.END, f"Bot: [error] {exc}\n")
        chat_history.see(tk.END)
        raise

    chat_history.insert(tk.END, f"Bot: {response}\n")
    # Keep the newest message in view.
    chat_history.see(tk.END)
# Create the main window.
# NOTE(review): the title says "OpenAI" but send_message calls
# get_chatGLM2_6b_response — confirm which backend is intended.
root = tk.Tk()
root.title("Chat with OpenAI")
# Create the scrollable chat-history text box; it is packed first and is
# allowed to fill and expand in both directions.
chat_history = scrolledtext.ScrolledText(root, wrap=tk.WORD)
chat_history.pack(padx=10, pady=10, fill=tk.BOTH, expand=True)
# Create the input entry and the Send button.
# input_text is the StringVar backing the entry; send_message reads and clears it.
input_text = tk.StringVar()
entry_box = tk.Entry(root, textvariable=input_text, width=50)
entry_box.pack(padx=10, pady=5, side=tk.LEFT, expand=True)
# Pressing Return in the entry triggers the same handler as the Send button.
entry_box.bind('<Return>', send_message)
send_button = tk.Button(root, text="Send", command=send_message)
send_button.pack(padx=10, pady=5, side=tk.RIGHT)
# Run the Tk main loop.
root.mainloop()
import torch
from transformers import AutoTokenizer, AutoModel
# Directory the tokenizer and model weights are loaded from.
# NOTE(review): the folder is spelled "modles" — confirm it matches the
# directory name on disk before renaming anything.
MODEL_PATH = "./modles/chatglm-6b"

# Use the MPS backend when torch reports it as available, otherwise the CPU.
device = "mps" if torch.backends.mps.is_available() else "cpu"

# Load the tokenizer and model once, at import time, so every later call to
# the response function reuses them.
print("Device:", device)
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
print("Loading model...")
model = AutoModel.from_pretrained(MODEL_PATH, trust_remote_code=True)
# Half precision is only appropriate on the accelerator; on the CPU fallback
# the model is kept in float32, as the ChatGLM-6B deployment notes recommend.
model = model.float() if device == "cpu" else model.half()
model = model.to(device)
print("Model loaded.")
def get_chatGLM2_6b_response(prompt, max_new_tokens=50):
    """Generate a reply to ``prompt`` with the module-level model.

    Args:
        prompt: The user's text.
        max_new_tokens: Upper bound on the number of tokens generated in
            addition to the prompt. The limit applies to new tokens only, so
            a long prompt no longer uses up the generation budget.

    Returns:
        The decoded reply text, without the prompt tokens that ``generate``
        places at the start of its output sequence.
    """
    print(prompt)
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    # Remember how many tokens belong to the prompt so they can be dropped
    # from the generated sequence before decoding.
    prompt_length = inputs.input_ids.shape[1]
    print("Generating response...")
    outputs = model.generate(
        inputs.input_ids,
        # Already on `device` via the .to(device) call above; None if absent.
        attention_mask=inputs.get("attention_mask"),
        max_new_tokens=max_new_tokens,
    )
    print("Decoding response...")
    # Decode only the newly generated tokens so the reply does not echo the prompt.
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    print(response)
    return response
python main.py
Device: mps
Loading tokenizer...
Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.
Loading model...
Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.
Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.
Loading checkpoint shards: 0%| | 0/8 [00:00<?, ?it/s]/Users/mutong/Documents/project/AI_Try/AI_first_try/conda/lib/python3.11/site-packages/transformers/modeling_utils.py:415: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
return torch.load(checkpoint_file, map_location="cpu")
Loading checkpoint shards: 100%|████████████████████████████████████████| 8/8 [00:10<00:00, 1.26s/it]
Model loaded.
Hello
Generating response...
The dtype of attention mask (torch.int64) is not bool
所以想请教一下:程序停在这里不再继续,这种情况是不是系统内存不够之类的原因导致的?
1
t41372 130 天前 via Android
不跑个量化版本的吗
我不知道是不是配置不够,不过一般来说,跑非量化版本的 LLM 对配置要求都挺高的。用 Mac 的一般会用 llama.cpp、ollama 或别的工具来跑量化过的 LLM,它们也有针对 Mac 的优化。还有,现在都什么年代了你还在跑 chatGLM2... glm4 都出了... |
2
sunmacarenas 130 天前 via Android
直接用 ollama 跑量化的吧,省得折腾
|
4
WMutong OP @sunmacarenas 之前没接触语言模型这些,所以想从基础的入手尝试熟悉下
|