Skip to content

Commit 0b9eb9d

Browse files
author
LittleMouse
committed
[update] upload tokenizer_deepseek-r1-1.5B-ax630c.py
1 parent 1a77033 commit 0b9eb9d

1 file changed

Lines changed: 131 additions & 0 deletions

File tree

Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
from transformers import AutoTokenizer, PreTrainedTokenizerFast
2+
from http.server import HTTPServer, BaseHTTPRequestHandler
3+
import json
4+
import argparse
5+
6+
class Tokenizer_Http:
    """Thin wrapper around a Hugging Face tokenizer used by the HTTP service.

    The wrapped tokenizer is stored on ``self.tokenizer``; every method and
    property below simply delegates to it.
    """

    def __init__(self, model_id):
        """Load the tokenizer identified by ``model_id`` via ``AutoTokenizer``."""
        self.tokenizer = AutoTokenizer.from_pretrained(model_id)

    def encode(self, prompt, content):
        """Render a system + user conversation and return its token ids.

        Args:
            prompt: Text placed in the ``user`` message.
            content: Text placed in the ``system`` message.

        Returns:
            Whatever ``self.tokenizer.encode`` returns for the rendered
            chat-template text.
        """
        conversation = [
            {"role": role, "content": body}
            for role, body in (("system", content), ("user", prompt))
        ]
        rendered = self.tokenizer.apply_chat_template(
            conversation,
            tokenize=False,
            add_generation_prompt=True,
        )
        # The rendered prompt is echoed to stdout on every call.
        print(rendered)
        # NOTE(review): the rendered text is encoded with the tokenizer's
        # default settings; if the chat template already emits special tokens
        # (e.g. BOS), confirm they are not being added a second time here.
        return self.tokenizer.encode(rendered)

    def decode(self, token_ids):
        """Return the text the wrapped tokenizer produces for ``token_ids``."""
        return self.tokenizer.decode(token_ids)

    @property
    def bos_id(self):
        """The wrapped tokenizer's ``bos_token_id``."""
        return self.tokenizer.bos_token_id

    @property
    def eos_id(self):
        """The wrapped tokenizer's ``eos_token_id``."""
        return self.tokenizer.eos_token_id

    @property
    def bos_token(self):
        """The wrapped tokenizer's ``bos_token``."""
        return self.tokenizer.bos_token

    @property
    def eos_token(self):
        """The wrapped tokenizer's ``eos_token``."""
        return self.tokenizer.eos_token
43+
44+
class Request(BaseHTTPRequestHandler):
    """HTTP handler that exposes the module-level ``tokenizer`` as a JSON API.

    Routes:
        GET  /bos_id  -> ``{"bos_id": <id>}``  (-1 when the tokenizer has none)
        GET  /eos_id  -> ``{"eos_id": <id>}``  (-1 when the tokenizer has none)
        POST /encode  -> request ``{"text": ...}``, reply ``{"token_ids": ...}``
        POST /decode  -> request ``{"token_ids": ...}``, reply ``{"text": ...}``

    Unknown paths are answered with 404 and malformed POST bodies with 400;
    in both cases the body is the plain string ``error``. The status line is
    only written once the reply is known, so a failing request can no longer
    be reported as ``200 OK`` with an empty body.

    The handler reads two module globals that the script entry point sets up:
    ``tokenizer`` (a ``Tokenizer_Http``) and ``args`` (``args.content`` is the
    system prompt passed to ``tokenizer.encode``).
    """

    # Class attributes consumed by the standard-library base classes.
    timeout = 5
    server_version = 'Apache'

    def _reply(self, status, kind, msg):
        """Log ``msg`` and send it as the complete response.

        Args:
            status: HTTP status code to send.
            kind: Value of the custom ``type`` response header
                (``"get"`` or ``"post"``).
            msg: Response body; converted with ``str()`` and encoded to bytes.
        """
        print(msg)
        payload = str(msg).encode()
        self.send_response(status)
        self.send_header("type", kind)
        self.send_header(
            "Content-Type",
            "application/json" if status == 200 else "text/plain",
        )
        self.send_header("Content-Length", str(len(payload)))
        self.end_headers()
        self.wfile.write(payload)

    def do_GET(self):
        """Serve ``/bos_id`` and ``/eos_id``; any other path gets 404."""
        print(self.path)
        if self.path not in ('/bos_id', '/eos_id'):
            self._reply(404, "get", 'error')
            return

        # The route name without the slash is both the tokenizer attribute
        # and the JSON key of the reply.
        key = self.path[1:]
        value = getattr(tokenizer, key)
        self._reply(200, "get", json.dumps({key: -1 if value is None else value}))

    def do_POST(self):
        """Serve ``/encode`` and ``/decode`` from a JSON request body."""
        # A missing Content-Length is treated as an empty body; a value that
        # is not a non-negative integer is rejected instead of crashing.
        try:
            length = int(self.headers.get('content-length') or 0)
        except ValueError:
            length = -1
        if length < 0:
            self._reply(400, "post", 'error')
            return
        data = self.rfile.read(length)

        if self.path not in ('/encode', '/decode'):
            self._reply(404, "post", 'error')
            return

        field = 'text' if self.path == '/encode' else 'token_ids'
        try:
            # ValueError covers invalid UTF-8 and invalid JSON; KeyError and
            # TypeError cover a missing field or a non-object JSON document.
            value = json.loads(data.decode())[field]
        except (ValueError, KeyError, TypeError):
            self._reply(400, "post", 'error')
            return

        if self.path == '/encode':
            token_ids = tokenizer.encode(value, args.content)
            msg = json.dumps({'token_ids': -1 if token_ids is None else token_ids})
        else:
            text = tokenizer.decode(value)
            msg = json.dumps({'text': "" if text is None else text})
        self._reply(200, "post", msg)
112+
113+
114+
if __name__ == "__main__":
    # Command-line options: bind address, tokenizer location, and the system
    # prompt that the /encode route passes to the tokenizer.
    parser = argparse.ArgumentParser()
    for flag, kind, fallback in (
        ('--host', str, 'localhost'),
        ('--port', int, 8080),
        ('--model_id', str, 'deepseek_tokenizer'),
        ('--content', str, 'You are a helpful assistant.'),
    ):
        parser.add_argument(flag, type=kind, default=fallback)

    # ``args`` and ``tokenizer`` are module globals read by the Request handler.
    args = parser.parse_args()
    tokenizer = Tokenizer_Http(args.model_id)

    # Debug helpers:
    # print(tokenizer.bos_id, tokenizer.bos_token, tokenizer.eos_id, tokenizer.eos_token)
    # print(tokenizer.encode("hello world", args.content))

    # (host, port) pair the server binds to.
    address = (args.host, args.port)
    print('http://%s:%s' % address)

    # Create the server with the handler class and serve until interrupted.
    server = HTTPServer(address, Request)
    server.serve_forever()

0 commit comments

Comments
 (0)