|
| 1 | +from transformers import AutoTokenizer, PreTrainedTokenizerFast |
| 2 | +from http.server import HTTPServer, BaseHTTPRequestHandler |
| 3 | +import json |
| 4 | +import argparse |
| 5 | + |
| 6 | + |
class Tokenizer_Http:
    """Thin wrapper around a HuggingFace tokenizer that builds InternVL2
    chat-template prompts for the HTTP tokenizer service below."""

    # Number of <IMG_CONTEXT> placeholders reserved for image features in the
    # vision prompt. The original template hard-coded 64 literal repetitions.
    DEFAULT_IMG_TOKEN_NUM = 64

    def __init__(self, model_id):
        """Load the slow tokenizer for *model_id*.

        trust_remote_code is required because the tokenizer ships custom code.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_id, trust_remote_code=True, use_fast=False
        )

    def encode(self, prompt, content):
        """Return token ids for a text-only chat turn.

        prompt  -- the user message
        content -- the system prompt
        """
        prompt = f"<|im_start|>system\n{content}<|im_end|><|im_start|>user\n{prompt}<|im_end|><|im_start|>assistant\n"
        input_ids = self.tokenizer.encode(prompt)
        return input_ids

    def encode_vpm(self, prompt, content="Please describe the image shortly.",
                   img_token_num=DEFAULT_IMG_TOKEN_NUM):
        """Return token ids for a vision chat turn.

        The user message embeds an <img> block containing *img_token_num*
        <IMG_CONTEXT> placeholder tokens, which the inference runtime replaces
        with image features. The default of 64 matches the original
        hard-coded template.
        """
        img_block = "<img>" + "<IMG_CONTEXT>" * img_token_num + "</img>"
        prompt = (
            f"<|im_start|>system\n{content}<|im_end|>"
            f"<|im_start|>user\n{img_block}\n{prompt}<|im_end|>"
            f"<|im_start|>assistant\n"
        )
        input_ids = self.tokenizer.encode(prompt)
        return input_ids

    def decode(self, token_ids):
        """Decode *token_ids* back to text, keeping tokenization spaces."""
        return self.tokenizer.decode(token_ids, clean_up_tokenization_spaces=False)

    @property
    def bos_id(self):
        # May be None when the tokenizer defines no BOS token.
        return self.tokenizer.bos_token_id

    @property
    def eos_id(self):
        # May be None when the tokenizer defines no EOS token.
        return self.tokenizer.eos_token_id

    @property
    def bos_token(self):
        return self.tokenizer.bos_token

    @property
    def eos_token(self):
        return self.tokenizer.eos_token
| 42 | + |
class Request(BaseHTTPRequestHandler):
    """HTTP handler exposing the tokenizer over simple JSON endpoints.

    GET  /bos_id, /eos_id   -> {"bos_id"/"eos_id": id or -1}
    POST /encode, /decode   -> {"token_ids": [...]} / {"text": "..."}

    NOTE(review): reads the module-level globals `tokenizer` and `args`
    assigned in the __main__ block — the server must be started from there.
    Every response is sent with status 200; unknown paths return the literal
    body "error" and clients are expected to inspect the body.
    """
    timeout = 5
    server_version = "Apache"

    def do_GET(self):
        """Serve /bos_id and /eos_id as JSON; any other path returns "error"."""
        print(self.path)
        # HTTP framing must go out before the body: status line, headers, blank line.
        self.send_response(200)
        self.send_header("type", "get")  # optional marker header
        self.end_headers()

        if self.path == "/bos_id":
            bos_id = tokenizer.bos_id
            # print(bos_id)
            # -1 signals "this tokenizer has no BOS token" to the client,
            # since JSON cannot carry Python's None transparently here.
            if bos_id is None:
                msg = json.dumps({"bos_id": -1})
            else:
                msg = json.dumps({"bos_id": bos_id})
        elif self.path == "/eos_id":
            eos_id = tokenizer.eos_id
            if eos_id is None:
                msg = json.dumps({"eos_id": -1})
            else:
                msg = json.dumps({"eos_id": eos_id})
        else:
            msg = "error"

        print(msg)
        msg = str(msg).encode()  # response body must be bytes

        self.wfile.write(msg)  # send the body back to the client

    def do_POST(self):
        """Serve /encode and /decode.

        /encode expects {"text": str, "img_prompt": bool (optional)};
        img_prompt=True selects the vision prompt template.
        /decode expects {"token_ids": [int, ...]}.
        """
        data = self.rfile.read(
            int(self.headers["content-length"])
        )  # raw request body; assumes the client always sends Content-Length
        data = data.decode()  # bytes -> str before JSON parsing

        self.send_response(200)
        self.send_header("type", "post")  # optional marker header
        self.end_headers()

        if self.path == "/encode":
            req = json.loads(data)
            print(req)
            prompt = req["text"]
            b_img_prompt = False
            if "img_prompt" in req:
                b_img_prompt = req["img_prompt"]
            if b_img_prompt:
                # Vision request: wrap the text in the image prompt template.
                token_ids = tokenizer.encode_vpm(prompt)
            else:
                # Text-only request: use the system prompt from the CLI args.
                token_ids = tokenizer.encode(prompt, args.content)
            if token_ids is None:
                msg = json.dumps({"token_ids": -1})
            else:
                msg = json.dumps({"token_ids": token_ids})

        elif self.path == "/decode":
            req = json.loads(data)
            token_ids = req["token_ids"]
            text = tokenizer.decode(token_ids)
            if text is None:
                msg = json.dumps({"text": ""})
            else:
                msg = json.dumps({"text": text})
        else:
            msg = "error"
        print(msg)
        msg = str(msg).encode()  # response body must be bytes

        self.wfile.write(msg)  # send the body back to the client
| 118 | + |
| 119 | + |
if __name__ == "__main__":

    # Keep the parser in its own name; `args` must remain the parsed-namespace
    # global because the Request handler reads `args.content` at request time.
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", type=str, default="localhost")
    parser.add_argument("--port", type=int, default=8080)
    parser.add_argument('--model_id', type=str, default='internvl2_tokenizer')
    parser.add_argument('--content', type=str, default='你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。')
    args = parser.parse_args()

    # Module-level global consumed by the Request handler.
    tokenizer = Tokenizer_Http(args.model_id)

    # print(tokenizer.bos_id, tokenizer.bos_token, tokenizer.eos_id, tokenizer.eos_token)
    # print(tokenizer.encode("hello world", args.content))

    # 'localhost' is equivalent to '127.0.0.1'.
    address = (args.host, args.port)
    print("http://%s:%s" % address)
    server = HTTPServer(address, Request)  # bind the handler class to the address
    server.serve_forever()  # blocks, serving requests until interrupted
0 commit comments