{"id":20782,"date":"2025-12-27T21:03:00","date_gmt":"2025-12-27T13:03:00","guid":{"rendered":"https:\/\/www.orczhou.com\/?p=20782"},"modified":"2025-12-30T09:00:04","modified_gmt":"2025-12-30T01:00:04","slug":"understanding-llm-tokenize","status":"publish","type":"post","link":"https:\/\/www.orczhou.com\/index.php\/2025\/12\/understanding-llm-tokenize\/","title":{"rendered":"\u5927\u8bed\u8a00\u6a21\u578b\u7684\u8f93\u5165\uff1atokenize"},"content":{"rendered":"\n\n\n\n<p style=\"margin-top:2px\">\u5bf9\u4e8e\u975e NLP \u4e13\u4e1a\u7684\u4eba\u6765\u8bf4\uff0c\u8981\u60f3\u7406\u89e3\u5927\u8bed\u8a00\u6a21\u578b\u7684\u57fa\u7840\u5176\u5b9e\u662f\u975e\u5e38\u4e0d\u5bb9\u6613\u7684\u3002\u5728\u6709\u4e86\u4e00\u5b9a\u7684\u795e\u7ecf\u7f51\u7edc\u57fa\u7840\u3001\u6570\u5b66\u57fa\u7840\u4e4b\u540e\uff0c\u7b97\u662f\u53ef\u4ee5\u66f4\u8fdb\u4e00\u6b65\u4e86\uff0c\u5728\u4e86\u89e3LLM\u7684\u7cfb\u5217\u4e2d\uff0c\u5927\u6982\u53ef\u4ee5\u5206\u6210\u51e0\u4e2a\u90e8\u5206\uff1a\u8f93\u5165\uff08\u5373\u672c\u6587\u7684tokenize\uff09\u3001\u8ba1\u7b97\uff08Attention\uff09\u3001\u8f93\u51fa\uff08beam search\/top-k\uff09\u3002\u5728\u672c\u7bc7\u4e2d\uff1a(a) \u901a\u8fc7\u4ee3\u7801\u5b9e\u8df5\u89c2\u5bdf\u5927\u6a21\u578b\uff08GPT2\uff09\u5982\u4f55\u8fdb\u884c tokenize\uff1b(b) \u5982\u4f55\u67e5\u770b token id \u5217\u8868\uff1b(c) \u89c2\u5bdf\u6a21\u578b\u4e2d\u6240\u6709token\u7684 Embedding Matrix\u3002<\/p>\n\n\n\n<p>\u8fd9\u662f\u6211\u7684\u5927\u8bed\u8a00\u6a21\u578b\uff08LLM\uff09\u5b66\u4e60\u7cfb\u5217\u4e2d\u7684\u4e00\u7bc7\uff0c\u5b8c\u6574\u7684\u5b66\u4e60\u8def\u5f84\u53c2\u8003\uff1a<a href=\"https:\/\/www.orczhou.com\/?p=18922&amp;preview=true\">\u6211\u7684\u5927\u6a21\u578b\u5b66\u4e60\u8def\u7ebf<\/a>\u3002<\/p>\n\n\n\n<h3 class=\"wp-block-heading\">1. 
\u7406\u89e3 tokenize<\/h3>\n\n\n\n<h4 class=\"wp-block-heading\">1.1 Token ID<\/h4>\n\n\n\n<p>\u8fd9\u91cc\u6211\u4eec\u4f7f\u7528\u5982\u4e0b\u7684\u63d0\u793a\u8bcd\uff0c\u6765\u770b\u770b\u5927\u6a21\u578b\u662f\u5982\u4f55\u5904\u7406\u7684\uff1a<span style=\"text-decoration: underline;\">It\u2019s very hot in summer. Swimming is<\/span> \u3002<\/p>\n\n\n\n<p>\u5927\u6a21\u578b\u4f1a\u4f7f\u7528\u9884\u5148\u8bbe\u8ba1\u597d\u7684\u201ctokenize\u201d\u5b9e\u73b0\uff0c\u5c06\u4e0a\u8ff0\u7684\u53e5\u5b50\u5206\u89e3\u6210\u72ec\u7acb\u7684\u201cToken\u201d\uff0c\u5e76\u8f6c\u6362\u4e3a\u5bf9\u5e94\u7684\u201cToken ID\u201d\uff0c\u800c\u6bcf\u4e2aToken\uff0c\u90fd\u6709\u81ea\u5df1\u7684\u7f16\u7801\uff0c\u4e5f\u5c31\u662f Embedding \uff0c\u8fd9\u4e9b Embedding \u5c31\u6700\u7ec8\u4f5c\u4e3a\u5927\u8bed\u8a00\u6a21\u578b\u7684\u8f93\u5165\u3002<\/p>\n\n\n\n<p>\u5bf9\u4e8e\u4e0a\u8ff0\u7684\u53e5\u5b50\uff0c\u201copenai-community\/gpt2\u201d \u5728\u8fdb\u884c tokenize \u4e4b\u540e\u5bf9\u5e94\u7684 Token \u548c Token ID \u5982\u4e0b\uff1a<\/p>\n\n\n\n<div class=\"wp-block-columns is-layout-flex wp-container-core-columns-is-layout-28f84493 wp-block-columns-is-layout-flex\">\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\" style=\"flex-basis:2%\"><\/div>\n\n\n\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\">\n<figure class=\"wp-block-table is-style-regular\"><table><tbody><tr><td class=\"has-text-align-center\" data-align=\"center\"><\/td><td class=\"has-text-align-center\" data-align=\"center\">it<\/td><td class=\"has-text-align-center\" data-align=\"center\" colspan=\"2\">\u2019<\/td><td class=\"has-text-align-center\" data-align=\"center\">s<\/td><td class=\"has-text-align-center\" data-align=\"center\">very<\/td><td class=\"has-text-align-center\" data-align=\"center\">hot<\/td><td class=\"has-text-align-center\" data-align=\"center\">in<\/td><td class=\"has-text-align-center\" 
data-align=\"center\">summer<\/td><td class=\"has-text-align-center\" data-align=\"center\"> . <\/td><td class=\"has-text-align-center\" data-align=\"center\" colspan=\"2\">Swimming<\/td><td class=\"has-text-align-center\" data-align=\"center\">is<\/td><\/tr><tr><td class=\"has-text-align-center\" data-align=\"center\">Token ID<\/td><td class=\"has-text-align-center\" data-align=\"center\">1026<\/td><td class=\"has-text-align-center\" data-align=\"center\">447<\/td><td class=\"has-text-align-center\" data-align=\"center\">247<\/td><td class=\"has-text-align-center\" data-align=\"center\">  82  <\/td><td class=\"has-text-align-center\" data-align=\"center\">845<\/td><td class=\"has-text-align-center\" data-align=\"center\">3024<\/td><td class=\"has-text-align-center\" data-align=\"center\">287<\/td><td class=\"has-text-align-center\" data-align=\"center\">3931<\/td><td class=\"has-text-align-center\" data-align=\"center\">  13  <\/td><td class=\"has-text-align-center\" data-align=\"center\">2451<\/td><td class=\"has-text-align-center\" data-align=\"center\">27428<\/td><td class=\"has-text-align-center\" data-align=\"center\">318<\/td><\/tr><tr><td class=\"has-text-align-center\" data-align=\"center\">Token<\/td><td class=\"has-text-align-center\" data-align=\"center\">It<\/td><td class=\"has-text-align-center\" data-align=\"center\">\u00e2\u0122<\/td><td class=\"has-text-align-center\" data-align=\"center\">\u013b<\/td><td class=\"has-text-align-center\" data-align=\"center\">s<\/td><td class=\"has-text-align-center\" data-align=\"center\">\u0120very<\/td><td class=\"has-text-align-center\" data-align=\"center\">\u0120hot<\/td><td class=\"has-text-align-center\" data-align=\"center\">\u0120in<\/td><td class=\"has-text-align-center\" data-align=\"center\">\u0120summer<\/td><td class=\"has-text-align-center\" data-align=\"center\">.<\/td><td class=\"has-text-align-center\" data-align=\"center\">\u0120Sw<\/td><td class=\"has-text-align-center\" 
data-align=\"center\">imming<\/td><td class=\"has-text-align-center\" data-align=\"center\">\u0120is<\/td><\/tr><\/tbody><\/table><\/figure>\n<\/div>\n\n\n\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\" style=\"flex-basis:2%\"><\/div>\n<\/div>\n\n\n\n<div style=\"height:10px\" aria-hidden=\"true\" class=\"wp-block-spacer\"><\/div>\n\n\n\n<div style=\"height:10px\" aria-hidden=\"true\" class=\"wp-block-spacer\"><\/div>\n\n\n\n<p>\u201copenai-community\/gpt2\u201d \u4f7f\u7528\u4e86\u8f83\u4e3a\u5e38\u89c1\u7684BPE\uff08Byte Pair Encoding\uff09\u5bf9\u53e5\u5b50\u8fdb\u884c\u5904\u7406\uff0c\u628a\u6bcf\u4e2a\u8bcd\u8bed\u6309\u7167\u201csubword\u201d\u8fdb\u884c\u5904\u7406\uff0c\u4f8b\u5982\uff1a<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li>\u201cSwimming\u201d\u62c6\u5206\u4e3a\u201cSw\u201d\u4e0e\u201cimming\u201d<\/li>\n\n\n\n<li>\u8fd9\u91cc\u4e24\u4e2a Token 447\u3001247 \u7ec4\u6210\u7279\u6b8a\u5b57\u7b26 &#8216; (\u6487\u53f7) <\/li>\n\n\n\n<li>\u0120\uff08U+0120\uff09\u7684\u4f5c\u7528\uff1a\u8868\u793a\u8fd9\u662f\u4e00\u4e2a\u65b0\u7684\u8bcd\u8bed\uff08\u800c\u4e0d\u662f\u4e00\u4e2a\u62c6\u5206\u540e\u5b50\u8bcd\uff09\uff0c\u53ef\u4ee5\u770b\u5230 \u201cSwimming\u201d\u5728\u62c6\u5206\u540e\u7684\u201cimming\u201d\u524d\u9762\u5e76\u6ca1\u6709\u0120\uff0c\u8868\u793a\u8fd9\u662f\u4e00\u4e2a\u62c6\u5206\u540e\u7684\u5b50\u8bcd<\/li>\n<\/ul>\n\n\n\n<h4 class=\"wp-block-heading\">1.2 \u8bcd\u8868\u5927\u5c0f<\/h4>\n\n\n\n<p>\u201copenai-community\/gpt2\u201d \u6a21\u578b\u7684\u8bcd\u8868\u5927\u5c0f\u4e3a\uff1a50257\u3002\u8bcd\u8868\u4e2d\u7684\u524d\u4e09\u4e2a Token \u4e3a &#8217;emb&#8217;\u3001 &#8216;\u0120Draft&#8217;\u3001 &#8216;\u0120reinvent&#8217;\uff0c\u5bf9\u5e94\u7684 Token ID \u4e3a 24419\u3001 13650\u3001 36608\u3002<\/p>\n\n\n\n<h4 class=\"wp-block-heading\">1.3 \u6839\u636e Token ID \u6253\u5370\u5b57\u7b26<\/h4>\n\n\n\n<p>\u8fd9\u91cc\u6253\u5370\u4e86 Token ID \u4e3a 0\u3001 1\u3001 2 \u548c 
50254\u3001 50255\u3001  50256 \u7684\u51e0\u4e2a\u5b57\u7b26\u5982\u4e0b\uff1a<\/p>\n\n\n\n<div class=\"wp-block-columns is-layout-flex wp-container-core-columns-is-layout-28f84493 wp-block-columns-is-layout-flex\">\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\" style=\"flex-basis:33.33%\">\n<figure class=\"wp-block-table is-style-regular has-small-font-size\"><table class=\"has-fixed-layout\"><tbody><tr><td class=\"has-text-align-center\" data-align=\"center\">Token ID<\/td><td class=\"has-text-align-center\" data-align=\"center\">Char<\/td><\/tr><tr><td class=\"has-text-align-center\" data-align=\"center\">0<\/td><td class=\"has-text-align-center\" data-align=\"center\">!<\/td><\/tr><tr><td class=\"has-text-align-center\" data-align=\"center\">1<\/td><td class=\"has-text-align-center\" data-align=\"center\">&#8220;<\/td><\/tr><tr><td class=\"has-text-align-center\" data-align=\"center\">2<\/td><td class=\"has-text-align-center\" data-align=\"center\">#<\/td><\/tr><tr><td class=\"has-text-align-center\" data-align=\"center\">50254<\/td><td class=\"has-text-align-center\" data-align=\"center\">\u0120informants<\/td><\/tr><tr><td class=\"has-text-align-center\" data-align=\"center\">50255<\/td><td class=\"has-text-align-center\" data-align=\"center\">\u0120gazed<\/td><\/tr><tr><td class=\"has-text-align-center\" data-align=\"center\">50256<\/td><td class=\"has-text-align-center\" data-align=\"center\">&lt;|endoftext|&gt;<\/td><\/tr><\/tbody><\/table><\/figure>\n<\/div>\n\n\n\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\" style=\"flex-basis:66.66%\">\n<pre class=\"wp-block-preformatted\">--- Token ID \u8f6c\u6362\u4e3a Token \u5b57\u7b26 ---<br>Token ID: 0     | \u5bf9\u5e94 Token \u5b57\u7b26: '!'<br>Token ID: 1     | \u5bf9\u5e94 Token \u5b57\u7b26: '\"'<br>Token ID: 2     | \u5bf9\u5e94 Token \u5b57\u7b26: '#'<br>Token ID: 50254 | \u5bf9\u5e94 Token \u5b57\u7b26: '\u0120informants'<br>Token ID: 50255 | 
\u5bf9\u5e94 Token \u5b57\u7b26: '\u0120gazed'<br>Token ID: 50256 | \u5bf9\u5e94 Token \u5b57\u7b26: '&lt;|endoftext|&gt;'<\/pre>\n<\/div>\n<\/div>\n\n\n\n<h3 class=\"wp-block-heading\">2. Token Embedding<\/h3>\n\n\n\n<p>\u5927\u6a21\u578b\u7684\u5904\u7406\u901a\u5e38\u662f\u4ece Embedding \u5f00\u59cb\u7684\uff0c\u5373\u5bf9\u4e8e\u6240\u6709\u5b57\u7b26\u7684\u5904\u7406\uff0c\u90fd\u662f\u4f9d\u8d56\u5b57\u7b26\u5bf9\u5e94\u7684\u201c\u5411\u91cf\u201d\u3002\u6240\u4ee5\uff0c\u5927\u81f4\u7684\u5904\u7406\u903b\u8f91\u662f\u8fd9\u6837\uff1a\u4e00\u4e2a\u53e5\u5b50\uff0c\u5148\u5207\u5206\u4e3a Token\uff0c\u7136\u540e\u6839\u636e Token ID \u5728\u201cEmbedding Matrix\u201d\u4e2d\u627e\u5230\u5bf9\u5e94\u7684\u201c\u5411\u91cf\u201d\uff0c\u628a\u8fd9\u7ec4\u201c\u5411\u91cf\u201d\u4f5c\u4e3a\u8f93\u5165\u3002<\/p>\n\n\n\n<p>\u8fd9\u91cc\uff0c\u6211\u4eec\u89c2\u5bdf\u4e0a\u8ff0\u793a\u4f8b\u53e5\u5b50\uff0c\u6839\u636e\u5bf9\u5e94\u7684 Token ID \u53ef\u4ee5\u67e5\u8be2\u5230\u5bf9\u5e94\u7684\u5411\u91cf\u3002\u56e0\u4e3a\u8fd9\u91cc\u7684\u5411\u91cf\u662f768\u7ef4\u7684\uff0c\u8fd9\u91cc\u4ec5\u663e\u793a\u524d\u4e09\u4e2a\u7ef4\u5ea6\u7684\u5206\u91cf\uff0c\u5982\u4e0b\uff1a<\/p>\n\n\n\n<pre class=\"wp-block-preformatted\">Token      | wte[:3]                         |<br>----------------------------------------------<br>It         | [0.039,   -0.0869, 0.0662 ,...] |<br>\u00e2\u0122         | [-0.075,  0.0948,  -0.0034,...] |<br>\u013b          | [-0.0223, 0.0182,  0.2631 ,...] |<br>s          | [-0.064,  -0.0469, 0.2061 ,...] |<br>\u0120very      | [-0.0553, -0.0348, 0.0606 ,...] |<br>\u0120hot       | [0.0399,  -0.0053, 0.0742 ,...] |<br>\u0120in        | [-0.0337, 0.0108,  0.0293 ,...] |<br>\u0120summer    | [0.0422,  0.0138,  -0.0213,...] |<br>.          | [0.0466,  -0.0113, 0.0283 ,...] |<br>\u0120Sw        | [0.0617,  0.0373,  0.1018 ,...] |<br>imming     | [-0.1385, -0.1774, -0.0181,...] |<br>\u0120is        | [-0.0097, 0.0101,  0.0556 ,...] 
|<\/pre>\n\n\n\n<p>\u66f4\u4e3a\u5b8c\u6574\u7684\uff0c\u4e0a\u8ff0\u7684\u6bcf\u4e2a\u8bcd\u8bed\u5bf9\u5e94\u7684\u5411\u91cf\u662f\u4e00\u4e2a 1&#215;768 \u7684\u5411\u91cf\uff1b\u6574\u4e2a\u53e5\u5b50 12 \u4e2a\u5411\u91cf\uff0c\u53ef\u4ee5\u7406\u89e3\u4e3a\u4e00\u4e2a 12&#215;768 \u7684\u8f93\u5165\u77e9\u9635\u3002\u5bf9\u4e8e\u5404 LLM \u6765\u8bf4\uff0c\u8fd9\u901a\u5e38\u5c31\u662f\u5176\u5c06\u8981\u5904\u7406\u7684\u8f93\u5165\u3002<\/p>\n\n\n\n<h3 class=\"wp-block-heading\">3. Positional encoding<\/h3>\n\n\n\n<p>\u5728\u201cGPT-2\u201d\u4e2d\u4f7f\u7528\u4e86\u201cLearned Positional Embeddings\u201d\uff08\u6ce8\uff1a\u4e0e Transformer \u8bba\u6587\u4e2d\u56fa\u5b9a\u7684Sinusoidal \u5b9e\u73b0\u4e0d\u540c\uff09\u3002\u8fd9\u662f\u4e00\u4e2a \\(L \\times d \\) \u7684\u77e9\u9635\uff0c\u5176\u4e2d\uff0c\\(L \\) \u662f\u53ef\u63a5\u6536\u7684\u6700\u5927 Token \u6570\uff0c\\(d \\) \u662f Token Embedding \u7684\u7ef4\u5ea6\u3002\u8be5\u77e9\u9635\u901a\u5e38\u968f\u673a\u521d\u59cb\u5316\uff0c\u6700\u7ec8\u901a\u8fc7\u8bad\u7ec3\u786e\u5b9a\u3002\u5728\u201cGPT-2\u201d\u4e2d\uff0c\u6700\u7ec8\u8bad\u7ec3\u540e\u7684\u77e9\u9635\u6709\u5982\u4e0b\u5f62\u5f0f\uff1a<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code lang=\"python\" class=\"language-python\">pos_emb_layer = model.transformer.wpe\n\n# .weight.data : Embedding Matrix(PyTorch Tensor)\npos_emb_matrix = pos_emb_layer.weight.data\n\nprint(pos_emb_matrix[:12,:3])<\/code><\/pre>\n\n\n\n<p>\u8f93\u51fa\u5982\u4e0b\uff1a<\/p>\n\n\n\n<pre class=\"wp-block-code\" style=\"font-size:14px\"><code class=\"\">tensor([[-0.0188, -0.1974,  0.0040],\n        [ 0.0240, -0.0538, -0.0949],\n        [ 0.0042, -0.0848,  0.0545],\n        [-0.0003, -0.0738,  0.1055],\n        [ 0.0076, -0.0251,  0.1270],\n        [ 0.0096, -0.0339,  0.1312],\n        [ 0.0027, -0.0205,  0.1196],\n        [ 0.0025, -0.0032,  0.1174],\n        [-0.0012, -0.0018,  0.1110],\n        [ 0.0049,  0.0021,  0.1178],\n        [ 0.0016,  0.0062,  0.1004],\n       
 [-0.0036,  0.0175,  0.1068]])<\/code><\/pre>\n\n\n\n<h3 class=\"wp-block-heading\">\u793a\u4f8b\u4e0e\u4ee3\u7801<\/h3>\n\n\n\n<h4 class=\"wp-block-heading\">\u73af\u5883\u51c6\u5907<\/h4>\n\n\n\n<ul class=\"wp-block-list\">\n<li>\u8fd9\u91cc\u4f9d\u65e7\u4f7f\u7528 &#x1f917; HuggingFace \u7684 <a href=\"https:\/\/huggingface.co\/docs\/transformers\/index\">Transformers<\/a> \u5e93\u4ee5\u53ca <a href=\"https:\/\/colab.google\/\">Google \u7684 Colab<\/a> \u4f5c\u4e3a\u73af\u5883\uff0c\u786c\u4ef6\u4e3a T4 GPU\u3002\u8fd9\u91cc\u65b0\u5efa\u7684 Colab \u6587\u4ef6\u94fe\u63a5\uff1a <a href=\"https:\/\/colab.research.google.com\/drive\/18c8TQqfK0uA8yTUWwz6kOJ2aO5NPulZM#scrollTo=2UuLV1h2_x0K\">AutoTokenizer.ipynb<\/a> <\/li>\n\n\n\n<li>\u8fd9\u91cc\u4f7f\u7528\u7684\u6d4b\u8bd5\u6a21\u578b\u662f\u201c<a href=\"https:\/\/huggingface.co\/openai-community\/gpt2\">openai-community\/gpt2<\/a>\u201d\uff0c\u8be5\u6a21\u578b\u662f\u7531 <a href=\"https:\/\/openai.com\/index\/better-language-models\/\">OpenAI \u5728 2019<\/a> \u5bf9\u5916\u53d1\u5e03\u7684\u3002<\/li>\n<\/ul>\n\n\n\n<pre class=\"wp-block-code\"><code lang=\"python\" class=\"language-python\">!pip install transformers torch\n\nfrom transformers import AutoTokenizer, AutoModelForCausalLM\n\nMODEL_NAME = \"openai-community\/gpt2\"\ntokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)\nmodel = AutoModelForCausalLM.from_pretrained(MODEL_NAME)<\/code><\/pre>\n\n\n\n<h4 class=\"wp-block-heading\">\u5bf9\u53e5\u5b50\u8fdb\u884c tokenize<\/h4>\n\n\n\n<p>\u89c2\u5bdf\u53e5\u5b50\u7684tokenize\uff1a<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code lang=\"python\" class=\"language-python\">text = \"It\u2019s very hot in summer. Swimming is\"\n\n# 1. \u5bf9\u53e5\u5b50\u8fdb\u884c Tokenization\n# return_tensors='pt' \u8868\u793a\u8fd4\u56de PyTorch Tensor \u683c\u5f0f (\u867d\u7136\u8fd9\u91cc\u6211\u4eec\u4e3b\u8981\u770b IDs)\ninputs = tokenizer(text, return_tensors='pt')\n\n# 2. 
\u6253\u5370 Tokenization \u7ed3\u679c\nprint(f\"--- \u539f\u59cb\u53e5\u5b50\uff1a{text} ---\")\n\n# a. \u6253\u5370 Token ID Tensor\nprint(\"Token IDs (Tensor):\")\nprint(inputs['input_ids'])\n\n# b. \u5c06 Token ID \u8f6c\u6362\u56de\u53ef\u8bfb\u7684 Token (Word Pieces)\n# .squeeze() \u662f\u4e3a\u4e86\u53bb\u9664 batch \u7ef4\u5ea6\ntokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'].squeeze().tolist())\nprint(\"\\nToken List (\u53ef\u8bfb\u6587\u672c Tokens):\")\nprint(tokens)\n\n# c. \u6253\u5370 Attention Mask (1 \u8868\u793a\u662f\u6709\u6548 Token\uff0c0 \u8868\u793a\u662f Padding Token)\nprint(\"\\nAttention Mask:\")\nprint(inputs['attention_mask'])\n\n--------------\n--- output ---\n--------------\n\n--- \u539f\u59cb\u53e5\u5b50\uff1aIt\u2019s very hot in summer. Swimming is ---\nToken IDs (Tensor):\ntensor([[ 1026,   447,   247,    82,   845,  3024,   287,  3931,    13,  2451,\n         27428,   318]])\n\nToken List (\u53ef\u8bfb\u6587\u672c Tokens):\n['It', '\u00e2\u0122', '\u013b', 's', '\u0120very', '\u0120hot', '\u0120in', '\u0120summer', '.', '\u0120Sw', 'imming', '\u0120is']\n\nAttention Mask:\ntensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])<\/code><\/pre>\n\n\n\n<h4 class=\"wp-block-heading\">\u8bcd\u8868\u5927\u5c0f\u67e5\u770b<\/h4>\n\n\n\n<p>\u67e5\u770b\u8bcd\u8868\u5927\u5c0f\uff0c\u5e76\u6253\u5370\u8bcd\u8868\u4e2d\u7684\u524d20\u4e2a\u8bcd\uff1a<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code lang=\"python\" class=\"language-python\"># \u83b7\u53d6\u8bcd\u6c47\u8868\u5927\u5c0f\nvocab_size = tokenizer.vocab_size\nprint(f\"--- GPT-2 \u6a21\u578b\u652f\u6301\u7684 Tokenize \u603b\u6570 (\u8bcd\u6c47\u8868\u5927\u5c0f): {vocab_size} ---\")\n\n# \u6253\u5370\u6240\u6709 Tokenize\n# tokenizer.get_vocab() \u8fd4\u56de\u7684\u662f\u4e00\u4e2a\u5b57\u5178 {token: id}\nprint(\"\\n--- \u6253\u5370\u524d 20 \u4e2a Token (\u7528\u4e8e\u793a\u4f8b): ---\")\nvocab = tokenizer.get_vocab()\ncount = 0\nfor token, id in vocab.items():\n    if count 
&lt; 20:\n        # \u4f7f\u7528 repr() \u786e\u4fdd\u7279\u6b8a\u5b57\u7b26\uff08\u5982\u7a7a\u683c '\u0120'\uff09\u80fd\u88ab\u6e05\u6670\u5c55\u793a\n        print(f\"ID: {id:&lt;5} | Token: {repr(token)}\")\n        count += 1\n    else:\n        break\n\n--------------\n--- output ---\n--------------\n\n--- GPT-2 \u6a21\u578b\u652f\u6301\u7684 Tokenize \u603b\u6570 (\u8bcd\u6c47\u8868\u5927\u5c0f): 50257 ---\n\n--- \u6253\u5370\u524d 20 \u4e2a Token (\u7528\u4e8e\u793a\u4f8b): ---\nID: 24419 | Token: 'emb'\nID: 13650 | Token: '\u0120Draft'\nID: 36608 | Token: '\u0120reinvent'\nID: 36171 | Token: 'Recommended'\nID: 20706 | Token: 'aunting'\nID: 39558 | Token: '\u0120protagonists'\nID: 49309 | Token: 'raised'\nID: 20589 | Token: '\u0120wicked'\nID: 43074 | Token: '\u0120\u00e2\u013f'\nID: 22792 | Token: '\u0120Tut'\nID: 21620 | Token: 'erate'\n...<\/code><\/pre>\n\n\n\n<h4 class=\"wp-block-heading\">\u6839\u636e Token ID \u6253\u5370\u5b57\u7b26<\/h4>\n\n\n\n<pre class=\"wp-block-code\"><code lang=\"python\" class=\"language-python\"># \u5f85\u67e5\u8be2\u7684 Token ID \u5217\u8868\ntarget_ids = [0, 1, 2, 50254 ,50255, 50256]\n\ntokens = tokenizer.convert_ids_to_tokens(target_ids)\n\nprint(f\"--- Token ID \u8f6c\u6362\u4e3a Token \u5b57\u7b26 ---\")\nfor id, token in zip(target_ids, tokens):\n    # \u4f7f\u7528 repr() \u786e\u4fdd\u4efb\u4f55\u7279\u6b8a\u7684\u4e0d\u53ef\u89c1\u5b57\u7b26\uff08\u5982\u7a7a\u683c\u6216\u63a7\u5236\u5b57\u7b26\uff09\u80fd\u88ab\u6e05\u6670\u5730\u5c55\u793a\n    print(f\"Token ID: {id:&lt;5} | \u5bf9\u5e94 Token \u5b57\u7b26: {repr(token)}\")\n\n# \u989d\u5916\u6253\u5370\u8fd9\u51e0\u4e2a Token \u5728\u8bcd\u6c47\u8868\u4e2d\u7684 ID\uff0c\u4ee5\u786e\u8ba4\u5176\u5bf9\u5e94\u5173\u7cfb\n# \u8fd9\u91cc\u7684 token.id \u5e76\u4e0d\u662f\u76f4\u63a5\u4ece 1, 2, 3 \u6765\u7684\uff0c\u800c\u662f\u4ece tokenizer.get_vocab() \u4e2d\u67e5\u5230\u7684\n# \u8fd9\u662f\u4e00\u4e2a\u8f85\u52a9\u9a8c\u8bc1\u6b65\u9aa4\uff0c\u786e\u4fdd 
Tokenizer \u7684\u884c\u4e3a\u7b26\u5408\u9884\u671f\u3002\nprint(\"\\n--- \u8f85\u52a9\u9a8c\u8bc1 ---\")\nfor token in tokens:\n    token_id_check = tokenizer.convert_tokens_to_ids(token)\n    print(f\"Token \u5b57\u7b26: {repr(token):&lt;10} | \u67e5\u9a8c ID: {token_id_check}\")\n\n--------------\n--- output ---\n--------------\n\n--- Token ID \u8f6c\u6362\u4e3a Token \u5b57\u7b26 ---\nToken ID: 0     | \u5bf9\u5e94 Token \u5b57\u7b26: '!'\nToken ID: 1     | \u5bf9\u5e94 Token \u5b57\u7b26: '\"'\nToken ID: 2     | \u5bf9\u5e94 Token \u5b57\u7b26: '#'\nToken ID: 50254 | \u5bf9\u5e94 Token \u5b57\u7b26: '\u0120informants'\nToken ID: 50255 | \u5bf9\u5e94 Token \u5b57\u7b26: '\u0120gazed'\nToken ID: 50256 | \u5bf9\u5e94 Token \u5b57\u7b26: '&lt;|endoftext|&gt;'<\/code><\/pre>\n\n\n\n<h4 class=\"wp-block-heading\">\u6839\u636e Token ID \u67e5\u8be2\u5bf9\u5e94\u5411\u91cf<\/h4>\n\n\n\n<p>\u5728\u6a21\u578b\u4e2d\uff0c\u201c\u8bcd\u5411\u91cf\u201d\uff08\u51c6\u786e\u7684\u5e94\u8be5\u662fToken\u5411\u91cf\uff09\u5b58\u50a8\u5728\u4e00\u4e2aEmbedding Matrix \u4e2d\uff0c\u53ef\u4ee5\u4f7f\u7528\u5982\u4e0b\u7684\u4ee3\u7801\u83b7\u53d6\u6bcf\u4e2a token \u5bf9\u5e94 Embedding \u540e\u7684\u5411\u91cf\uff1a<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code lang=\"python\" class=\"language-python\">target_token = \"\u0120Sw\"  # \u6ce8\u610f\u524d\u9762\u7684\u7279\u6b8a\u7b26\u53f7\uff0c\u786e\u4fdd\u5b83\u662f\u6a21\u578b\u8bcd\u6c47\u8868\u4e2d\u7684 Token\ntarget_id = tokenizer.convert_tokens_to_ids(target_token)\nprint(target_id)\n\n# embedding_matrix[target_id]\ntarget_embedding = embedding_matrix[target_id]\nprint(target_embedding[:5].numpy())\n\n--------------\n--- output ---\n--------------\n\n2451\n[ 0.06167513  0.03733223  0.10182938  0.04881619 -0.09597597]\nEmbedding \u5c42\u7684 dtype: torch.float32\n\u6574\u4e2a\u6a21\u578b\u7684\u9ed8\u8ba4 dtype: torch.float32<\/code><\/pre>\n\n\n\n<p><\/p>\n\n\n\n<h4 class=\"wp-block-heading\">\u53e5\u5b50\u5230 Embedding 
\u5411\u91cf<\/h4>\n\n\n\n<pre class=\"wp-block-code\"><code lang=\"python\" class=\"language-python\">text = \"It\u2019s very hot in summer. Swimming is\"\ninputs = tokenizer(text, return_tensors='pt')\nprint(\"Token IDs (Tensor):\")\ninput_ids_tensor = inputs['input_ids']\ninput_ids_list = input_ids_tensor.squeeze().tolist()\n\nfor index, token_int in enumerate(input_ids_list):\n    token_char = tokenizer.convert_ids_to_tokens(token_int)\n    print(f\"Token ID {token_int:&lt;5} | Token: {repr(token_char):&lt;9} | {embedding_matrix[token_int][:3]}\")\n\n--------------\n--- output ---\n--------------\n\nToken ID 1026  | Token: 'It'      | tensor([ 0.0390, -0.0869,  0.0662])\nToken ID 447   | Token: '\u00e2\u0122'      | tensor([-0.0750,  0.0948, -0.0034])\nToken ID 247   | Token: '\u013b'       | tensor([-0.0223,  0.0182,  0.2631])\nToken ID 82    | Token: 's'       | tensor([-0.0640, -0.0469,  0.2061])\nToken ID 845   | Token: '\u0120very'   | tensor([-0.0553, -0.0348,  0.0606])\nToken ID 3024  | Token: '\u0120hot'    | tensor([ 0.0399, -0.0053,  0.0742])\nToken ID 287   | Token: '\u0120in'     | tensor([-0.0337,  0.0108,  0.0293])\nToken ID 3931  | Token: '\u0120summer' | tensor([ 0.0422,  0.0138, -0.0213])\nToken ID 13    | Token: '.'       
| tensor([ 0.0466, -0.0113,  0.0283])\nToken ID 2451  | Token: '\u0120Sw'     | tensor([0.0617, 0.0373, 0.1018])\nToken ID 27428 | Token: 'imming'  | tensor([-0.1385, -0.1774, -0.0181])\nToken ID 318   | Token: '\u0120is'     | tensor([-0.0097,  0.0101,  0.0556])\n<\/code><\/pre>\n\n\n\n<p><\/p>\n\n\n\n<h3 class=\"wp-block-heading\">Embedding Matrix<\/h3>\n\n\n\n<h4 class=\"wp-block-heading\">\u67e5\u770b Embedding Matrix<\/h4>\n\n\n\n<pre class=\"wp-block-code\"><code lang=\"python\" class=\"language-python\"># model.transformer.wte (Word Token Embeddings)\nembedding_layer = model.transformer.wte\n\n# .weight.data : Embedding Matrix(PyTorch Tensor)\nembedding_matrix = embedding_layer.weight.data\n\nprint(embedding_matrix.shape)\n\n--------------\n--- output ---\n--------------\n\ntorch.Size([50257, 768])<\/code><\/pre>\n\n\n\n<p><\/p>\n\n\n\n<h3 class=\"wp-block-heading\">\u6700\u540e<\/h3>\n\n\n\n<p>\u5927\u6a21\u578b\u8bad\u7ec3\u7684\u7b2c\u4e00\u6b65\u5c31\u662f\u5bf9\u4e8e\u8bed\u6599\u5e93\uff08corpus\uff09\u7684\u5904\u7406\uff0c\u5373\u5c06\u6240\u6709\u7684\u8bed\u6599\u8f6c\u6362\u4e3a\u5927\u6a21\u578b\u8bad\u7ec3\u80fd\u591f\u63a5\u53d7\u7684\u8f93\u5165\uff0c\u5373\uff1atokenize\u3002\u8be5\u8fc7\u7a0b\u4f1a\u5c06\u8bed\u6599\u5e93\u5207\u5206\u4e3a\u72ec\u7acb\u7684\u53e5\u5b50\uff0c\u591a\u4e2a\u53e5\u5b50\u53ef\u4ee5\u4f5c\u4e3a\u4e00\u4e2a\u6279\u6b21\uff08batch\uff09\u4f5c\u4e3a\u8f93\u5165\u8fdb\u884c\u8bad\u7ec3\u3002<\/p>\n\n\n\n<p><\/p>\n\n\n\n<p><\/p>\n\n\n\n<p><\/p>\n\n\n\n<p><\/p>\n\n\n\n<p><\/p>\n\n\n\n<p><\/p>\n\n\n\n<p><\/p>\n\n\n\n<p><\/p>\n","protected":false},"excerpt":{"rendered":"<p>\u5bf9\u4e8e\u975e NLP 
\u4e13\u4e1a\u7684\u4eba\u6765\u8bf4\uff0c\u8981\u60f3\u7406\u89e3\u5927\u8bed\u8a00\u6a21\u578b\u7684\u57fa\u7840\u5176\u5b9e\u662f\u975e\u5e38\u4e0d\u5bb9\u6613\u7684\u3002\u5728\u6709\u4e86\u4e00\u5b9a\u7684\u795e\u7ecf\u7f51\u7edc\u57fa\u7840\u3001\u6570\u5b66\u57fa\u7840\u4e4b\u540e\uff0c\u7b97\u662f\u53ef\u4ee5\u66f4\u8fdb\u4e00\u6b65\u4e86\uff0c\u5728\u4e86\u89e3LLM\u7684\u7cfb\u5217\u4e2d&#8230;<\/p>\n","protected":false},"author":1,"featured_media":21818,"comment_status":"open","ping_status":"closed","sticky":false,"template":"wp-custom-template-a-1440-px-width-template","format":"standard","meta":{"_eb_attr":"","inline_featured_image":false,"_tocer_settings":[],"footnotes":""},"categories":[137],"tags":[],"class_list":["post-20782","post","type-post","status-publish","format-standard","has-post-thumbnail","hentry","category-learning-more"],"_links":{"self":[{"href":"https:\/\/www.orczhou.com\/index.php\/wp-json\/wp\/v2\/posts\/20782","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/www.orczhou.com\/index.php\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/www.orczhou.com\/index.php\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/www.orczhou.com\/index.php\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/www.orczhou.com\/index.php\/wp-json\/wp\/v2\/comments?post=20782"}],"version-history":[{"count":66,"href":"https:\/\/www.orczhou.com\/index.php\/wp-json\/wp\/v2\/posts\/20782\/revisions"}],"predecessor-version":[{"id":22032,"href":"https:\/\/www.orczhou.com\/index.php\/wp-json\/wp\/v2\/posts\/20782\/revisions\/22032"}],"wp:featuredmedia":[{"embeddable":true,"href":"https:\/\/www.orczhou.com\/index.php\/wp-json\/wp\/v2\/media\/21818"}],"wp:attachment":[{"href":"https:\/\/www.orczhou.com\/index.php\/wp-json\/wp\/v2\/media?parent=20782"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/www.orczhou.com\/index.php\/wp-json\/wp\/v2\/categories?post=20782"},
{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/www.orczhou.com\/index.php\/wp-json\/wp\/v2\/tags?post=20782"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}