tokenizer / Adding Special Tokens
===
###### tags: `LLM / tokenizer`
###### tags: `ML`, `NLP`, `NLU`, `LLM`, `tokenizer`, `Hugging Face`, `AutoTokenizer`, `tokenizer_config.json`, `single_word`, `lstrip`, `rstrip`, `normalized`, `special`, `special tokens`, `special_tokens`
<br>
[TOC]
<br>
## Step 1: Add special tokens
```python=
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('.')
print(tokenizer.SPECIAL_TOKENS_ATTRIBUTES)
# ['bos_token', 'eos_token', 'unk_token',
# 'sep_token', 'pad_token', 'cls_token',
# 'mask_token', 'additional_special_tokens']
print(tokenizer.additional_special_tokens)
# []
print(tokenizer.all_special_tokens)
# ['<s>', '</s>', '<unk>']
print(tokenizer.all_special_tokens_extended)
# ['<s>', '</s>', '<unk>']
# (for some models this differs from all_special_tokens)
special_tokens_dict = {
    'additional_special_tokens': [
        '<|tool_calls|>', '<|eot_id|>'
    ]
}
tokenizer.add_special_tokens(special_tokens_dict)  # returns the number of tokens actually added (2 here)
# help(tokenizer.add_special_tokens)
# - Special tokens can be skipped when decoding using `skip_special_tokens = True`
print(tokenizer.additional_special_tokens)
# ['<|tool_calls|>', '<|eot_id|>']
print(tokenizer.all_special_tokens)
# ['<s>', '</s>', '<unk>', '<|tool_calls|>', '<|eot_id|>']
print(tokenizer.all_special_tokens_extended)
# [
# '<s>', '</s>', '<unk>',
# AddedToken(
# "<|tool_calls|>", rstrip=False, lstrip=False,
# single_word=False, normalized=False, special=True),
# AddedToken(
# "<|eot_id|>", rstrip=False, lstrip=False,
# single_word=False, normalized=False, special=True)
# ]
```
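As a quick sanity check (a sketch; the exact ids and vocabulary size depend on the base model, 36130/36131 here), each new token should map to a single id appended at the end of the vocabulary:
```python
# Sanity check: each new token maps to exactly one id at the end of the vocab
# (the numbers below depend on the base model's vocabulary).
print(len(tokenizer))
# 36132
print(tokenizer.convert_tokens_to_ids(['<|tool_calls|>', '<|eot_id|>']))
# [36130, 36131]
```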
<br>
## Step 2: Save the tokenizer configuration and inspect the output files
```python=46
tokenizer.save_pretrained('output')
# (
# 'output/tokenizer_config.json',
# 'output/special_tokens_map.json',
# 'output/tokenizer.model',
# 'output/added_tokens.json',
# 'output/tokenizer.json'
# )
```
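A minimal round-trip check (a sketch, assuming the `output` directory written above) confirms that the added special tokens survive saving and reloading:
```python
# Reload from the saved directory and verify the special tokens persisted.
reloaded = AutoTokenizer.from_pretrained('output')
print(reloaded.additional_special_tokens)
# ['<|tool_calls|>', '<|eot_id|>']
print(reloaded.convert_tokens_to_ids('<|eot_id|>'))
# 36131
```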
---
### `tokenizer.json`
```json=
{
  "version": "1.0",
  "truncation": null,
  "padding": null,
  "added_tokens": [
    {
      "id": 0,
      "content": "<unk>",
      "single_word": false,
      "lstrip": false,
      "rstrip": false,
      "normalized": false,
      "special": true
    },
    {
      "id": 1,
      "content": "<s>",
      "single_word": false,
      "lstrip": false,
      "rstrip": false,
      "normalized": false,
      "special": true
    },
    {
      "id": 2,
      "content": "</s>",
      "single_word": false,
      "lstrip": false,
      "rstrip": false,
      "normalized": false,
      "special": true
    },
    {
      "id": 36128,
      "content": "<|func_end|>",
      "single_word": true,
      "lstrip": true,
      "rstrip": true,
      "normalized": false,
      "special": false          <-- not a special token
    },
    {
      "id": 36129,
      "content": "<|func_start|>",
      "single_word": true,
      "lstrip": true,
      "rstrip": true,
      "normalized": false,
      "special": false          <-- not a special token
    },
    {
      "id": 36130,
      "content": "<|tool_calls|>",  <-- new
      "single_word": false,
      "lstrip": false,
      "rstrip": false,
      "normalized": false,
      "special": true           <-- a special token
    },
    {
      "id": 36131,
      "content": "<|eot_id|>",      <-- new
      "single_word": false,
      "lstrip": false,
      "rstrip": false,
      "normalized": false,
      "special": true           <-- a special token
    }
  ],
  ...
}
```
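The `single_word` / `lstrip` / `rstrip` / `normalized` flags above come from the `AddedToken` wrapper. How `<|func_start|>` / `<|func_end|>` were originally registered is not shown in this note; the following is only a sketch of how such flags can be set when adding non-special tokens:
```python
from transformers import AddedToken

# Illustration only: add non-special tokens with explicit matching flags.
# single_word=True   -> only matched at word boundaries
# lstrip/rstrip=True -> whitespace next to the token is absorbed into the match
tokenizer.add_tokens(
    [
        AddedToken('<|func_start|>', single_word=True, lstrip=True, rstrip=True, normalized=False),
        AddedToken('<|func_end|>', single_word=True, lstrip=True, rstrip=True, normalized=False),
    ],
    special_tokens=False,
)
```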
---
### `added_tokens.json`
```json=
{
  "<|eot_id|>": 36131,      <-- new
  "<|func_end|>": 36128,
  "<|func_start|>": 36129,
  "<|tool_calls|>": 36130   <-- new
}
```
---
### `special_tokens_map.json`
```json=
{
  "additional_special_tokens": [   <-- new
    {
      "content": "<|tool_calls|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false
    },
    {
      "content": "<|eot_id|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false
    }
  ],
  "bos_token": "<s>",
  "eos_token": "</s>",
  "unk_token": "<unk>"
}
```
---
### `tokenizer_config.json`
```json=
{
  "add_bos_token": true,
  "add_eos_token": false,
  "added_tokens_decoder": {
    "0": {
      "content": "<unk>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "1": {
      "content": "<s>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "2": {
      "content": "</s>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "36128": {
      "content": "<|func_end|>",
      "lstrip": true,
      "normalized": false,
      "rstrip": true,
      "single_word": true,
      "special": false
    },
    "36129": {
      "content": "<|func_start|>",
      "lstrip": true,
      "normalized": false,
      "rstrip": true,
      "single_word": true,
      "special": false
    },
    "36130": {
      "content": "<|tool_calls|>",  <-- new
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "36131": {
      "content": "<|eot_id|>",      <-- new
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "additional_special_tokens": [   <-- new
    "<|tool_calls|>",
    "<|eot_id|>"
  ],
  "bos_token": "<s>",
  "clean_up_tokenization_spaces": false,
  "eos_token": "</s>",
  "model_max_length": 1000000000000000019884624838656,
  "tokenizer_class": "LlamaTokenizer",
  "unk_token": "<unk>",
  "use_default_system_prompt": false
}
```
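The same fields can also be checked programmatically instead of opening the files by hand (a small sketch over the `output` directory saved in Step 2):
```python
import json

# Inspect the saved configuration files directly.
with open('output/tokenizer_config.json') as f:
    config = json.load(f)
print(config['additional_special_tokens'])
# ['<|tool_calls|>', '<|eot_id|>']

with open('output/added_tokens.json') as f:
    print(json.load(f))
# {'<|eot_id|>': 36131, '<|func_end|>': 36128,
#  '<|func_start|>': 36129, '<|tool_calls|>': 36130}
```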
<br>
## Step 3: Observe tokenization behavior
```python=
text = '麥當勞'
encoded_input = tokenizer.encode_plus(text + '<|eot_id|>')
print(encoded_input)
# {
# 'input_ids': [1, 29871, 32857, 32015, 32523, 36131],
# 'attention_mask': [1, 1, 1, 1, 1, 1]
# }
tokenizer.convert_ids_to_tokens(encoded_input['input_ids'])
tokenizer.convert_ids_to_tokens(encoded_input['input_ids'], skip_special_tokens=False)
# ['<s>', '▁', '麥', '當', '勞', '<|eot_id|>']
tokenizer.convert_ids_to_tokens(encoded_input['input_ids'], skip_special_tokens=True)
# ['▁', '麥', '當', '勞']
tokenizer.decode(encoded_input['input_ids'])
tokenizer.decode(encoded_input['input_ids'], skip_special_tokens=False)
# '<s> 麥當勞<|eot_id|>'
tokenizer.decode(encoded_input['input_ids'], skip_special_tokens=True)
# '麥當勞'
```
---
```python=
text = '麥當勞'
encoded_input = tokenizer.encode_plus(
    text
    + '<|func_start|>lambda x: x+1<|func_end|>'
    + '<|eot_id|>')
print(encoded_input)
# {
# 'input_ids': [
# 1, 29871, 32857, 32015, 32523, 29966, 29989, 9891,
# 29918, 2962, 29989, 29958, 2892, 921, 29901, 921,
# 29974, 29896, 29966, 29989, 9891, 29918, 355,
# 29989, 29958, 36131],
# 'attention_mask': [
# 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
# 1, 1, 1, 1, 1, 1, 1, 1, 1]}
tokenizer.convert_ids_to_tokens(encoded_input['input_ids'])
tokenizer.convert_ids_to_tokens(encoded_input['input_ids'], skip_special_tokens=False)
# [
# '<s>', '▁', '麥', '當', '勞', '<', '|', 'func', '_',
# 'start', '|', '>', 'lambda', '▁x', ':', '▁x', '+', '1',
# '<', '|', 'func', '_', 'end', '|', '>', '<|eot_id|>']
tokenizer.convert_ids_to_tokens(encoded_input['input_ids'], skip_special_tokens=True)
# [
# '▁', '麥', '當', '勞', '<', '|', 'func', '_', 'start',
# '|', '>', 'lambda', '▁x', ':', '▁x', '+', '1', '<',
# '|', 'func', '_', 'end', '|', '>']
tokenizer.decode(encoded_input['input_ids'])
tokenizer.decode(encoded_input['input_ids'], skip_special_tokens=False)
# '<s> 麥當勞<|func_start|>lambda x: x+1<|func_end|><|eot_id|>'
tokenizer.decode(encoded_input['input_ids'], skip_special_tokens=True)
# '麥當勞<|func_start|>lambda x: x+1<|func_end|>'
```
- `<|func_start|>` is not a special token (`"special": false`), so `skip_special_tokens=True` does not remove it
- `<|func_end|>` is not a special token either
- note that in this run both markers were split into sub-word pieces instead of being matched as single added tokens, presumably because `single_word: true` only lets them match at word boundaries and here they are glued directly to the surrounding text (see the sketch below)
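A quick way to probe the word-boundary behaviour (a sketch; the exact pieces depend on the tokenizer and the `AddedToken` flags) is to compare the marker glued to text against the marker surrounded by whitespace:
```python
# If single_word=True holds for '<|func_start|>', the glued form should fall
# back to sub-word pieces, while the spaced form can match as a single token.
glued = tokenizer.encode_plus('x<|func_start|>y', add_special_tokens=False)
spaced = tokenizer.encode_plus('x <|func_start|> y', add_special_tokens=False)
print(tokenizer.convert_ids_to_tokens(glued['input_ids']))
print(tokenizer.convert_ids_to_tokens(spaced['input_ids']))
```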
## tmp
```python
from transformers import AutoTokenizer

def test():
    tokenizer = AutoTokenizer.from_pretrained('.')
    text = '麥當勞'
    encoded_input = tokenizer.encode_plus(
        text
        + '<|eot_id|>'
        + '<|func_start|>lambda x: x+1<|func_end|> <|eot_id|> ')
    print(tokenizer.convert_ids_to_tokens(encoded_input['input_ids']))

test()
```
<br>
## References
- [How to add some new special tokens to a pretrained tokenizer? #247](https://github.com/huggingface/tokenizers/issues/247)
- [How to add new tokens to an existing Huggingface tokenizer?](https://stackoverflow.com/questions/76198051/)
- [transformers库的使用【三】数据的预处理 (Using the transformers library, part 3: data preprocessing)](https://blog.csdn.net/qq_28790663/article/details/117073917)