tokenizer / Adding Special Tokens
===
###### tags: `LLM / tokenizer`
###### tags: `ML`, `NLP`, `NLU`, `LLM`, `tokenizer`, `Hugging Face`, `AutoTokenizer`, `tokenizer_config.json`, `single_word`, `lstrip`, `rstrip`, `normalized`, `special`, `special tokens`, `special_tokens`
<br>
[TOC]
<br>
## Step 1: Add special tokens
```python=
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('.')
print(tokenizer.SPECIAL_TOKENS_ATTRIBUTES)
# ['bos_token', 'eos_token', 'unk_token',
# 'sep_token', 'pad_token', 'cls_token',
# 'mask_token', 'additional_special_tokens']
print(tokenizer.additional_special_tokens)
# []
print(tokenizer.all_special_tokens)
# ['<s>', '</s>', '<unk>']
print(tokenizer.all_special_tokens_extended)
# ['<s>', '</s>', '<unk>']
# (for some models this differs from all_special_tokens)
special_tokens_dict = {
    'additional_special_tokens': [
        '<|tool_calls|>', '<|eot_id|>'
    ]
}
tokenizer.add_special_tokens(special_tokens_dict)  # returns the number of tokens actually added (2 here)
# help(tokenizer.add_special_tokens)
# - Special tokens can be skipped when decoding using `skip_special_tokens = True`
print(tokenizer.additional_special_tokens)
# ['<|tool_calls|>', '<|eot_id|>']
print(tokenizer.all_special_tokens)
# ['<s>', '</s>', '<unk>', '<|tool_calls|>', '<|eot_id|>']
print(tokenizer.all_special_tokens_extended)
# [
# '<s>', '</s>', '<unk>',
# AddedToken(
# "<|tool_calls|>", rstrip=False, lstrip=False,
# single_word=False, normalized=False, special=True),
# AddedToken(
# "<|eot_id|>", rstrip=False, lstrip=False,
# single_word=False, normalized=False, special=True)
# ]
```
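As a quick sanity check (a sketch; the exact ids and vocabulary size depend on the base model, 36130/36131 here), each new token should map to a single id appended at the end of the vocabulary:
```python
# Sanity check: each new token maps to exactly one id at the end of the vocab
# (the numbers below depend on the base model's vocabulary).
print(len(tokenizer))
# 36132
print(tokenizer.convert_tokens_to_ids(['<|tool_calls|>', '<|eot_id|>']))
# [36130, 36131]
```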
<br>
## Step 2: Save the tokenizer configuration and inspect the output files
```python=46
tokenizer.save_pretrained('output')
# (
# 'output/tokenizer_config.json',
# 'output/special_tokens_map.json',
# 'output/tokenizer.model',
# 'output/added_tokens.json',
# 'output/tokenizer.json'
# )
```
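A minimal round-trip check (a sketch, assuming the `output` directory written above) confirms that the added special tokens survive saving and reloading:
```python
# Reload from the saved directory and verify the special tokens persisted.
reloaded = AutoTokenizer.from_pretrained('output')
print(reloaded.additional_special_tokens)
# ['<|tool_calls|>', '<|eot_id|>']
print(reloaded.convert_tokens_to_ids('<|eot_id|>'))
# 36131
```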
---
### `tokenizer.json`
```json=
{
  "version": "1.0",
  "truncation": null,
  "padding": null,
  "added_tokens": [
    {
      "id": 0,
      "content": "<unk>",
      "single_word": false,
      "lstrip": false,
      "rstrip": false,
      "normalized": false,
      "special": true
    },
    {
      "id": 1,
      "content": "<s>",
      "single_word": false,
      "lstrip": false,
      "rstrip": false,
      "normalized": false,
      "special": true
    },
    {
      "id": 2,
      "content": "</s>",
      "single_word": false,
      "lstrip": false,
      "rstrip": false,
      "normalized": false,
      "special": true
    },
    {
      "id": 36128,
      "content": "<|func_end|>",
      "single_word": true,
      "lstrip": true,
      "rstrip": true,
      "normalized": false,
      "special": false          <-- not a special token
    },
    {
      "id": 36129,
      "content": "<|func_start|>",
      "single_word": true,
      "lstrip": true,
      "rstrip": true,
      "normalized": false,
      "special": false          <-- not a special token
    },
    {
      "id": 36130,
      "content": "<|tool_calls|>",  <-- new
      "single_word": false,
      "lstrip": false,
      "rstrip": false,
      "normalized": false,
      "special": true           <-- a special token
    },
    {
      "id": 36131,
      "content": "<|eot_id|>",      <-- new
      "single_word": false,
      "lstrip": false,
      "rstrip": false,
      "normalized": false,
      "special": true           <-- a special token
    }
  ],
  ...
}
```
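The `single_word` / `lstrip` / `rstrip` / `normalized` flags above come from the `AddedToken` wrapper. How `<|func_start|>` / `<|func_end|>` were originally registered is not shown in this note; the following is only a sketch of how such flags can be set when adding non-special tokens:
```python
from transformers import AddedToken

# Illustration only: add non-special tokens with explicit matching flags.
# single_word=True   -> only matched at word boundaries
# lstrip/rstrip=True -> whitespace next to the token is absorbed into the match
tokenizer.add_tokens(
    [
        AddedToken('<|func_start|>', single_word=True, lstrip=True, rstrip=True, normalized=False),
        AddedToken('<|func_end|>', single_word=True, lstrip=True, rstrip=True, normalized=False),
    ],
    special_tokens=False,
)
```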
---
### `added_tokens.json`
```json=
{
  "<|eot_id|>": 36131,      <-- new
  "<|func_end|>": 36128,
  "<|func_start|>": 36129,
  "<|tool_calls|>": 36130   <-- new
}
```
---
### `special_tokens_map.json`
```json=
{
  "additional_special_tokens": [   <-- new
    {
      "content": "<|tool_calls|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false
    },
    {
      "content": "<|eot_id|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false
    }
  ],
  "bos_token": "<s>",
  "eos_token": "</s>",
  "unk_token": "<unk>"
}
```
---
### `tokenizer_config.json`
```json=
{
  "add_bos_token": true,
  "add_eos_token": false,
  "added_tokens_decoder": {
    "0": {
      "content": "<unk>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "1": {
      "content": "<s>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "2": {
      "content": "</s>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "36128": {
      "content": "<|func_end|>",
      "lstrip": true,
      "normalized": false,
      "rstrip": true,
      "single_word": true,
      "special": false
    },
    "36129": {
      "content": "<|func_start|>",
      "lstrip": true,
      "normalized": false,
      "rstrip": true,
      "single_word": true,
      "special": false
    },
    "36130": {
      "content": "<|tool_calls|>",  <-- new
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "36131": {
      "content": "<|eot_id|>",      <-- new
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "additional_special_tokens": [   <-- new
    "<|tool_calls|>",
    "<|eot_id|>"
  ],
  "bos_token": "<s>",
  "clean_up_tokenization_spaces": false,
  "eos_token": "</s>",
  "model_max_length": 1000000000000000019884624838656,
  "tokenizer_class": "LlamaTokenizer",
  "unk_token": "<unk>",
  "use_default_system_prompt": false
}
```
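The same fields can also be checked programmatically instead of opening the files by hand (a small sketch over the `output` directory saved in Step 2):
```python
import json

# Inspect the saved configuration files directly.
with open('output/tokenizer_config.json') as f:
    config = json.load(f)
print(config['additional_special_tokens'])
# ['<|tool_calls|>', '<|eot_id|>']

with open('output/added_tokens.json') as f:
    print(json.load(f))
# {'<|eot_id|>': 36131, '<|func_end|>': 36128,
#  '<|func_start|>': 36129, '<|tool_calls|>': 36130}
```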
<br>
## Step 3: Observe tokenization behavior
```python=
text = '麥當勞'
encoded_input = tokenizer.encode_plus(text + '<|eot_id|>')
print(encoded_input)
# {
# 'input_ids': [1, 29871, 32857, 32015, 32523, 36131],
# 'attention_mask': [1, 1, 1, 1, 1, 1]
# }
tokenizer.convert_ids_to_tokens(encoded_input['input_ids'])
tokenizer.convert_ids_to_tokens(encoded_input['input_ids'], skip_special_tokens=False)
# ['<s>', '▁', '麥', '當', '勞', '<|eot_id|>']
tokenizer.convert_ids_to_tokens(encoded_input['input_ids'], skip_special_tokens=True)
# ['▁', '麥', '當', '勞']
tokenizer.decode(encoded_input['input_ids'])
tokenizer.decode(encoded_input['input_ids'], skip_special_tokens=False)
# '<s> 麥當勞<|eot_id|>'
tokenizer.decode(encoded_input['input_ids'], skip_special_tokens=True)
# '麥當勞'
```
---
```python=
text = '麥當勞'
encoded_input = tokenizer.encode_plus(
    text
    + '<|func_start|>lambda x: x+1<|func_end|>'
    + '<|eot_id|>')
print(encoded_input)
# {
# 'input_ids': [
# 1, 29871, 32857, 32015, 32523, 29966, 29989, 9891,
# 29918, 2962, 29989, 29958, 2892, 921, 29901, 921,
# 29974, 29896, 29966, 29989, 9891, 29918, 355,
# 29989, 29958, 36131],
# 'attention_mask': [
# 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
# 1, 1, 1, 1, 1, 1, 1, 1, 1]}
tokenizer.convert_ids_to_tokens(encoded_input['input_ids'])
tokenizer.convert_ids_to_tokens(encoded_input['input_ids'], skip_special_tokens=False)
# [
# '<s>', '▁', '麥', '當', '勞', '<', '|', 'func', '_',
# 'start', '|', '>', 'lambda', '▁x', ':', '▁x', '+', '1',
# '<', '|', 'func', '_', 'end', '|', '>', '<|eot_id|>']
tokenizer.convert_ids_to_tokens(encoded_input['input_ids'], skip_special_tokens=True)
# [
# '▁', '麥', '當', '勞', '<', '|', 'func', '_', 'start',
# '|', '>', 'lambda', '▁x', ':', '▁x', '+', '1', '<',
# '|', 'func', '_', 'end', '|', '>']
tokenizer.decode(encoded_input['input_ids'])
tokenizer.decode(encoded_input['input_ids'], skip_special_tokens=False)
# '<s> 麥當勞<|func_start|>lambda x: x+1<|func_end|><|eot_id|>'
tokenizer.decode(encoded_input['input_ids'], skip_special_tokens=True)
# '麥當勞<|func_start|>lambda x: x+1<|func_end|>'
```
- `<|func_start|>` is not a special token (`"special": false`), so `skip_special_tokens=True` does not remove it
- `<|func_end|>` is not a special token either
- note that in this run both markers were split into sub-word pieces instead of being matched as single added tokens, presumably because `single_word: true` only lets them match at word boundaries and here they are glued directly to the surrounding text (see the sketch below)
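A quick way to probe the word-boundary behaviour (a sketch; the exact pieces depend on the tokenizer and the `AddedToken` flags) is to compare the marker glued to text against the marker surrounded by whitespace:
```python
# If single_word=True holds for '<|func_start|>', the glued form should fall
# back to sub-word pieces, while the spaced form can match as a single token.
glued = tokenizer.encode_plus('x<|func_start|>y', add_special_tokens=False)
spaced = tokenizer.encode_plus('x <|func_start|> y', add_special_tokens=False)
print(tokenizer.convert_ids_to_tokens(glued['input_ids']))
print(tokenizer.convert_ids_to_tokens(spaced['input_ids']))
```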
## tmp
```python
from transformers import AutoTokenizer

def test():
    tokenizer = AutoTokenizer.from_pretrained('.')
    text = '麥當勞'
    encoded_input = tokenizer.encode_plus(
        text
        + '<|eot_id|>'
        + '<|func_start|>lambda x: x+1<|func_end|> <|eot_id|> ')
    print(tokenizer.convert_ids_to_tokens(encoded_input['input_ids']))

test()
```
<br>
## References
- [How to add some new special tokens to a pretrained tokenizer? #247](https://github.com/huggingface/tokenizers/issues/247)
- [How to add new tokens to an existing Huggingface tokenizer?](https://stackoverflow.com/questions/76198051/)
- [transformers库的使用【三】数据的预处理 (Using the transformers library, part 3: data preprocessing)](https://blog.csdn.net/qq_28790663/article/details/117073917)