{"id":20839,"date":"2025-12-31T15:10:45","date_gmt":"2025-12-31T07:10:45","guid":{"rendered":"https:\/\/www.orczhou.com\/?p=20839"},"modified":"2025-12-30T16:59:56","modified_gmt":"2025-12-30T08:59:56","slug":"understanding-llm-attention","status":"publish","type":"post","link":"https:\/\/www.orczhou.com\/index.php\/2025\/12\/understanding-llm-attention\/","title":{"rendered":"\u7406\u89e3\u5927\u8bed\u8a00\u6a21\u578b\u7684\u6838\u5fc3\uff1aAttention"},"content":{"rendered":"\n\n\n\n<p style=\"margin-top:4px\">\u5728\u6574\u4e2a\u5927\u8bed\u8a00\u6a21\u578b\u5b66\u4e60\u4e4b\u8def\u4e2d\uff0c\u5bf9 Attention \u673a\u5236\u7684\u7406\u89e3\u5927\u6982\u662f\u6700\u4e3a\u8ba9\u6211\u56f0\u60d1\u7684\u90e8\u5206\uff0c\u7ecf\u8fc7\u5c42\u5c42\u89e3\u6784\u3001\u52a0\u4e0a\u91cd\u65b0\u628a\u201c\u7ebf\u6027\u4ee3\u6570\u201d\u6e29\u4e60\u4e86\u4e00\u904d\u4e4b\u540e\uff0c\u6700\u7ec8\uff0c\u603b\u7b97\u67d0\u79cd\u7a0b\u5ea6\u7684\u7406\u89e3\u4e86 Attention \u673a\u5236\u7684\u8bbe\u8ba1\u3002\u76f8\u4fe1\u5bf9\u4e8e\u6240\u6709\u975e NLP \u4e13\u4e1a\u7684\u4eba\uff0c\u8fd9\u90e8\u5206\u90fd\u662f\u4e0d\u592a\u5bb9\u6613\u7406\u89e3\u7684\u3002<\/p>\n\n\n\n<h3 class=\"wp-block-heading\">1. 
\u6982\u8ff0<\/h3>\n\n\n\n\n\n\n<p>\u8981\u60f3\u8bb2\u6e05\u695a\uff0c\u5927\u6982\u4e5f\u662f\u975e\u5e38\u4e0d\u5bb9\u6613\u7684\uff0c\u8fd9\u91cc\u5c31\u505a\u4e00\u4e2a\u5c1d\u8bd5\u5427\u3002\u8fd9\u91cc\u7684\u91cd\u70b9\u662f\u8bb2\u6e05\u695a Attention Score \uff08\u7b80\u79f0Attention\uff09\u7684\u8ba1\u7b97\u3002\u4ecb\u7ecd\u7684\u987a\u5e8f\u662f\u201c\u4e24\u4e2a\u8bcd\u8bed\u7684\u76f8\u4f3c\u5ea6\u201d\u3001\u201cSimilarity Score Matrix\u201d\u3001\u201cAttention Score Matrix\u201d\u3002<\/p>\n\n\n\n<h4 class=\"wp-block-heading\">1.1 \u8981\u6784\u5efa\u7684\u662f\u76f4\u89c9\uff0c\u800c\u4e0d\u662f\u201c\u63a8\u7406\u201d<\/h4>\n\n\n\n<p>\u4e3a\u4ec0\u4e48 Attention \u7406\u89e3\u8d77\u6765\u5f88\u96be\u5462\uff1f\u6211\u60f3\u5176\u4e2d\u6709\u4e00\u4e2a\u539f\u56e0\u5927\u6982\u662f\u8fd9\u4e2a\u201c\u673a\u5236\u201d\u672c\u8eab\u5e76\u4e0d\u662f\u67d0\u79cd\u201c\u516c\u5f0f\u63a8\u5bfc\u201d\u51fa\u6765\u7684\uff0c\u800c\u662f\u901a\u8fc7\u4e00\u7bc7\u7bc7\u8bba\u6587\u4e0e\u5b9e\u8df5\uff0c\u88ab\u8bc1\u660e\u975e\u5e38\u6709\u6548\u7684\u4e00\u4e2a\u673a\u5236\uff0c\u6240\u4ee5\uff0c\u8fd9\u4e2a\u673a\u5236\u672c\u8eab\u7684\u6240\u5177\u5907\u7684\u201c\u53ef\u89e3\u91ca\u6027\u201d\u5176\u5b9e\u4e5f\u662f\u6709\u9650\u7684\u3002\u8fd9\u5927\u6982\u4e5f\u662f\uff0c\u65e0\u8bba\u4f60\u5728\u4e92\u8054\u7f51\u4e0a\u5982\u4f55\u641c\u7d22\uff0c\u4e5f\u6ca1\u6709\u8c01\u53ef\u4ee5\u6bd4\u8f83\u7b80\u5355\u7684\u628a\u8fd9\u4e2a\u673a\u5236\u8bf4\u6e05\u695a\u7684\u539f\u56e0\u3002\u4f46\uff0c\u7406\u89e3\u8fd9\u4e2a\u673a\u5236\u6784\u5efa\u7684\u76f4\u89c9\uff0c\u5bf9\u4e8e\u7406\u89e3\u6574\u4e2a Transformer \uff0c\u4ee5\u53ca\u6574\u4e2a\u5f53\u4ee3\u5927\u8bed\u8a00\u6a21\u578b\u6280\u672f\u57fa\u7840\u90fd\u662f\u81f3\u5173\u91cd\u8981\u7684\u3002<\/p>\n\n\n\n<h3 class=\"wp-block-heading\">2. 
\u9884\u5904\u7406<\/h3>\n\n\n\n<p>\u5728\u201c<a href=\"https:\/\/www.orczhou.com\/index.php\/2025\/12\/understanding-llm-tokenize\/\">\u5927\u8bed\u8a00\u6a21\u578b\u7684\u8f93\u5165\uff1atokenize<\/a>\u201d\u4e2d\u8be6\u7ec6\u4ecb\u7ecd\u4e86\u201c\u63d0\u793a\u8bcd\u201d\u8fdb\u5165\u5927\u6a21\u578b\u5904\u7406\u4e4b\u524d\uff0c\u5982\u4f55\u5c06\u63d0\u793a\u8bcd\u8f6c\u6362\u6210\u5927\u6a21\u578b\u53ef\u4ee5\u5904\u7406\u7684\u201c\u8bcd\u5411\u91cf\u201d\u6216\u8005\u8bf4\u201ctoken embedding\u201d\u3002<\/p>\n\n\n\n<p>\u5927\u8bed\u8a00\u6a21\u578b\u5728\u5f00\u59cb\u201cAttention\u201d\u8ba1\u7b97\u4e4b\u524d\uff0c\u8fd8\u4f1a\u5bf9\u201ctoken embedding\u201d\u8fdb\u884c\u4e00\u4e9b\u9884\u5904\u7406\uff0c\u8fd9\u4e9b\u9884\u5904\u7406\u5305\u62ec\u4e86\u201c\u878d\u5165\u201d\u4f4d\u7f6e\u5411\u91cf\u3001\u5bf9\u5411\u91cf\u8fdb\u884c\u201c\u5f52\u4e00\u5316\u201d\u5904\u7406\uff08\u5c06\u5404\u4e2a\u5411\u91cf\u90fd\u8f6c\u5316\u4e3a\u5747\u503c\u4e3a0\u3001\u65b9\u5dee\u4e3a1\u7684\u5411\u91cf\uff0c\u957f\u5ea6\u8981\u7edf\u4e00\u53d8\u62101\u5417\uff1f\uff09\u3002<\/p>\n\n\n\n<p>\u4f8b\u5982\uff0c\u5728\u8fd9\u91cc\u7684\u4f8b\u5b50\u4e2d\uff0c\u63d0\u793a\u8bcd <span style=\"text-decoration: underline;\">\u201cIt\u2019s very hot in summer. 
Swimming is<\/span>\u201d\uff0c\u5148\u8f6c\u6362\u4e3aembedding\uff0c\u7136\u540e\u52a0\u4e0a\u4f4d\u7f6e\u7f16\u7801\uff08positional encoder)\u3001\u518d\u8fdb\u884c\u6b63\u89c4\u5316\uff0c\u6700\u540e\u53d8\u6362\u4e3a\u5982\u4e0b\u7684\u5411\u91cf \u201c X \u201d \uff1a<\/p>\n\n\n\n<pre class=\"wp-block-preformatted has-base-color has-contrast-background-color has-text-color has-background has-link-color wp-elements-61540d9c4838db9d14e460a5595385cd\" style=\"font-size:10px\">|-------------------------------------------------------------------------------------------------------------------------------------------<br>|  <mark style=\"background-color:rgba(0, 0, 0, 0);color:#13e4f3\" class=\"has-inline-color\">Token<\/mark>    | <mark style=\"background-color:rgba(0, 0, 0, 0);color:#13e4f3\" class=\"has-inline-color\">wte[:3] (word token embedding)<\/mark>  | <mark style=\"background-color:rgba(0, 0, 0, 0);color:#13e4f3\" class=\"has-inline-color\">wpe[:3](word positional e..)<\/mark>|   <mark style=\"background-color:rgba(0, 0, 0, 0);color:#13e4f3\" class=\"has-inline-color\">wte [:3]  + wpe [:3]<\/mark>     | <mark style=\"background-color:rgba(0, 0, 0, 0);color:#13e4f3\" class=\"has-inline-color\">X[:3] (Norm)<\/mark>                    |<br>|-------------------------------------------------------------------------------------------------------------------------------------------<br>|It         | [0.039,   -0.0869, 0.0662 ,...] | [-0.0188, -0.1974, 0.004  ] | [0.0202, -0.2844,0.0702 ]  | [ 0.0129, -0.1104, -0.0317,...] |<br>|\u00e2\u0122         | [-0.075,  0.0948,  -0.0034,...] | [0.024,   -0.0538, -0.0949] | [-0.051, 0.041,  -0.0982]  | [-0.0530,  0.0588, -0.1290,...] |<br>|\u013b          | [-0.0223, 0.0182,  0.2631 ,...] | [0.0042,  -0.0848, 0.0545 ] | [-0.0181,-0.0666,0.3176 ]  | [-0.0170, -0.0242,  0.1639,...] |<br>|s          | [-0.064,  -0.0469, 0.2061 ,...] | [-0.0003, -0.0738, 0.1055 ] | [-0.0643,-0.1207,0.3116 ]  | [-0.0754, -0.0842,  0.1842,...] 
|<br>|\u0120very      | [-0.0553, -0.0348, 0.0606 ,...] | [0.0076,  -0.0251, 0.127  ] | [-0.0477,-0.0599,0.1876 ]  | [-0.0566, -0.0280,  0.0953,...] |<br>|\u0120hot       | [0.0399,  -0.0053, 0.0742 ,...] | [0.0096,  -0.0339, 0.1312 ] | [0.0495, -0.0392,0.2054 ]  | [ 0.0587, -0.0086,  0.1073,...] |<br>|\u0120in        | [-0.0337, 0.0108,  0.0293 ,...] | [0.0027,  -0.0205, 0.1196 ] | [-0.031, -0.0098,0.149  ]  | [-0.0391,  0.0209,  0.0731,...] |<br>|\u0120summer    | [0.0422,  0.0138,  -0.0213,...] | [0.0025,  -0.0032, 0.1174 ] | [0.0448, 0.0106, 0.0961 ]  | [ 0.0532,  0.0397,  0.0181,...] |<br>|.          | [0.0466,  -0.0113, 0.0283 ,...] | [-0.0012, -0.0018, 0.111  ] | [0.0454, -0.0131,0.1394 ]  | [ 0.0553,  0.0152,  0.0579,...] |<br>|\u0120Sw        | [0.0617,  0.0373,  0.1018 ,...] | [0.0049,  0.0021,  0.1178 ] | [0.0666, 0.0395, 0.2196 ]  | [ 0.0807,  0.0691,  0.1216,...] |<br>|imming     | [-0.1385, -0.1774, -0.0181,...] | [0.0016,  0.0062,  0.1004 ] | [-0.1369,-0.1711,0.0823 ]  | [-0.1528, -0.1249, -0.0017,...] |<br>|\u0120is        | [-0.0097, 0.0101,  0.0556 ,...] | [-0.0036, 0.0175,  0.1068 ] | [-0.0133,0.0275, 0.1623 ]  | [-0.0175,  0.0605,  0.0880,...] |<br>|-------------------------------------------------------------------------------------------------------------------------------------------<\/pre>\n\n\n\n<p>\u8fd9\u91cc\u7684 \u201cX\u201d \u662f\u4e00\u4e2a\u753112\u4e2a \u201ctoken embedding\u201d\u7ec4\u6210\u7684\u77e9\u9635\uff0c\u201c\u5f62\u72b6\u201d\u662f 12 x 768 \u3002\u5728\u6570\u5b66\u7b26\u53f7\u4e0a\uff0c\u6709\uff1a<\/p>\n\n\n\n<pre class=\"wp-block-preformatted has-base-color has-contrast-background-color has-text-color has-background has-link-color wp-elements-eb87c4c38522c9e2ac3627e8fda2fd92\" style=\"font-size:12px\">   ---------------------------------------    ------------------------------------------   -------------<br>   |     [ 0.0129, -0.1104, -0.0317,...] 
|    | <mark style=\"background-color:rgba(0, 0, 0, 0);color:#13e4f3\" class=\"has-inline-color\">x_1<\/mark>  = [ 0.0129, -0.1104, -0.0317,...] |   |It         |<br>   |     [-0.0530,  0.0588, -0.1290,...] |    | <mark style=\"background-color:rgba(0, 0, 0, 0);color:#13e4f3\" class=\"has-inline-color\">x_2<\/mark>  = [-0.0530,  0.0588, -0.1290,...] |   |\u00e2\u0122         |<br>   |     [-0.0170, -0.0242,  0.1639,...] |    | <mark style=\"background-color:rgba(0, 0, 0, 0);color:#13e4f3\" class=\"has-inline-color\">x_3<\/mark>  = [-0.0170, -0.0242,  0.1639,...] |   |\u013b          |<br>   |     [-0.0754, -0.0842,  0.1842,...] |    | <mark style=\"background-color:rgba(0, 0, 0, 0);color:#13e4f3\" class=\"has-inline-color\">x_4<\/mark>  = [-0.0754, -0.0842,  0.1842,...] |   |s          |<br>   |     [-0.0566, -0.0280,  0.0953,...] |    | <mark style=\"background-color:rgba(0, 0, 0, 0);color:#13e4f3\" class=\"has-inline-color\">x_5<\/mark>  = [-0.0566, -0.0280,  0.0953,...] |   |\u0120very      |<br>   | <mark style=\"background-color:rgba(0, 0, 0, 0);color:#13e4f3\" class=\"has-inline-color\">X<\/mark> = [ 0.0587, -0.0086,  0.1073,...] |    | <mark style=\"background-color:rgba(0, 0, 0, 0);color:#13e4f3\" class=\"has-inline-color\">x_6<\/mark>  = [ 0.0587, -0.0086,  0.1073,...] |   |\u0120hot       |<br>   |     [-0.0391,  0.0209,  0.0731,...] |    | <mark style=\"background-color:rgba(0, 0, 0, 0);color:#13e4f3\" class=\"has-inline-color\">x_7<\/mark>  = [-0.0391,  0.0209,  0.0731,...] |   |\u0120in        |<br>   |     [ 0.0532,  0.0397,  0.0181,...] |    | <mark style=\"background-color:rgba(0, 0, 0, 0);color:#13e4f3\" class=\"has-inline-color\">x_8<\/mark>  = [ 0.0532,  0.0397,  0.0181,...] |   |\u0120summer    |<br>   |     [ 0.0553,  0.0152,  0.0579,...] |    | <mark style=\"background-color:rgba(0, 0, 0, 0);color:#13e4f3\" class=\"has-inline-color\">x_9<\/mark>  = [ 0.0553,  0.0152,  0.0579,...] |   |.          |<br>   |     [ 0.0807,  0.0691,  0.1216,...] 
|    | <mark style=\"background-color:rgba(0, 0, 0, 0);color:#13e4f3\" class=\"has-inline-color\">x_10<\/mark> = [ 0.0807,  0.0691,  0.1216,...] |   |\u0120Sw        |<br>   |     [-0.1528, -0.1249, -0.0017,...] |    | <mark style=\"background-color:rgba(0, 0, 0, 0);color:#13e4f3\" class=\"has-inline-color\">x_11<\/mark> = [-0.1528, -0.1249, -0.0017,...] |   |imming     |<br>   |     [-0.0175,  0.0605,  0.0880,...] |    | <mark style=\"background-color:rgba(0, 0, 0, 0);color:#13e4f3\" class=\"has-inline-color\">x_12<\/mark> = [-0.0175,  0.0605,  0.0880,...] |   |\u0120is        |<br>   ---------------------------------------    ------------------------------------------   -------------<\/pre>\n\n\n\n<h3 class=\"wp-block-heading\">3. Similarity Score Matrix<\/h3>\n\n\n\n<p>\u5728\u6b63\u5f0f\u4ecb\u7ecd Attention \u4e4b\u524d\uff0c\u4e3a\u4e86\u80fd\u591f\u6bd4\u8f83\u597d\u7684\u7406\u89e3\u201c\u4e3a\u4ec0\u4e48\u201d\u662f\u8fd9\u6837\uff0c\u8fd9\u91cc\u5148\u5f15\u5165\u4e86\u201cSimilarity\u201d\u7684\u6982\u5ff5\uff0c\u6700\u7ec8\u5728\u8be5\u6982\u5ff5\u4e0a\uff0c\u65b0\u589e\u6743\u91cd\u77e9\u9635\uff0c\u5c31\u662f\u6700\u7ec8\u7684 Attention \uff1a<\/p>\n\n\n\n<div class=\"wp-block-columns is-layout-flex wp-container-core-columns-is-layout-28f84493 wp-block-columns-is-layout-flex\">\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\">\n<p>$$ \\text{Similarity} = \\text{softmax}(\\frac{XX^{T}}{\\sqrt{d}})X $$<\/p>\n\n\n\n<p>\u8fd9\u91cc\u5220\u9664\u4e86\u53c2\u6570\u77e9\u9635\uff1a\\(W^Q \\quad  W^K \\quad  W^V \\)<\/p>\n<\/div>\n\n\n\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\" style=\"border-left-color:var(--wp--preset--color--contrast);border-left-style:dotted;flex-basis:3%\"><\/div>\n\n\n\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\">\n<p>$$ \\text{Attention} = \\text{softmax}(\\frac{QK^{T}}{\\sqrt{d}})V $$<\/p>\n\n\n\n<p>\u5176\u4e2d\uff0c \\(Q = XW^Q \\quad  K 
= XW^K \\quad  V = XW^V \\)<\/p>\n<\/div>\n<\/div>\n\n\n\n<p><\/p>\n\n\n\n<h4 class=\"wp-block-heading\">3.1 \u4e24\u4e2a\u201c\u8bcd\u8bed\u201d\u7684\u76f8\u4f3c\u5ea6<\/h4>\n\n\n\n<p>\u5728\u5411\u91cf\u4e3a\u5355\u4f4d\u957f\u5ea6\u7684\u65f6\u5019\uff0c\u901a\u5e38\u53ef\u4ee5\u76f4\u63a5\u4f7f\u7528\u201c\u5185\u79ef\u201d\u4f5c\u4e3a\u4e24\u4e2a\u5411\u91cf\u7684\u76f8\u4f3c\u5ea6\u5ea6\u91cf\u3002\u4f8b\u5982\uff0c\u8003\u8651\u8bcd\u8bed \u201chot\u201d \u4e0e \u201csummer\u201d \u7684\u76f8\u4f3c\u5ea6\uff0c\u5219\u53ef\u4ee5\u201c\u7b80\u5316\u201d\u7684\u5904\u7406\u8fd9\u4e24\u4e2a\u8bcd\uff08Token\uff09\u7684\u5411\u91cf\u7684\u201c\u5185\u79ef\u201d\u3002<\/p>\n\n\n\n<p>\u5728\u524d\u9762\u7684\u6587\u7ae0\u201c<a href=\"https:\/\/www.orczhou.com\/index.php\/2025\/11\/understanding-llm-tokenize\/\">\u5927\u8bed\u8a00\u6a21\u578b\u7684\u8f93\u5165\uff1atokenize<\/a>\u201d\u4e2d\uff0c\u8f83\u4e3a\u8be6\u7ec6\u7684\u4ecb\u7ecd\u4e86\u5927\u8bed\u8a00\u6a21\u578b\u5982\u4f55\u628a\u4e00\u4e2a\u53e5\u5b50\u8f6c\u6362\u6210\u4e00\u4e2a\u4e2a\u7684 Token\uff0c\u7136\u540e\u518d\u8f6c\u6362\u4e3a\u4e00\u4e2a\u4e2a\u201c\u5411\u91cf\u201d\u3002\u90a3\u4e48\uff0c\u6211\u4eec\u901a\u5e38\u4f1a\u901a\u8fc7\u4e24\u4e2a\u5411\u91cf\u7684\u4f59\u5f26\u76f8\u4f3c\u5ea6\u6765\u63cf\u8ff0\u5176\u76f8\u4f3c\u5ea6\uff0c\u5982\u679c\u5411\u91cf\u7684\u201c\u957f\u5ea6\u201d\uff08\\(L_2 \\) \u8303\u6570\uff09\u662f\u5355\u4f4d\u957f\u5ea6\uff0c\u90a3\u4e48\u4e5f\u901a\u5e38\u4f1a\u76f4\u63a5\u4f7f\u7528\u201c\u5185\u79ef\u201d\u63cf\u8ff0\u4e24\u4e2a\u5411\u91cf\u7684\u76f8\u4f3c\u5ea6\uff1a<\/p>\n\n\n\n<div class=\"wp-block-columns is-layout-flex wp-container-core-columns-is-layout-28f84493 wp-block-columns-is-layout-flex\">\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\"><p>$$<br \/>\n\\cos \\theta = \\frac{\\alpha \\cdot \\beta}{\\|\\alpha\\| \\|\\beta\\| }<br \/>\n$$<\/p>\n\n\n\n<p>\\(f(x) = \\cos(x) \\) 
\u7684\u56fe\u50cf\u5982\u53f3\u56fe\uff0c\u6545\uff1a<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li>\u5939\u89d2\u4e3a 0 \u65f6\uff0c\u6700\u4e3a\u76f8\u4f3c\uff0c\u8fd9\u65f6\u5019 \\(\\cos(x) = 1 \\)<\/li>\n\n\n\n<li>\u5939\u89d2\u4e3a \\(\\pi \\) \u65f6\uff0c\u6700\u201c\u4e0d\u201d\u76f8\u4f3c\uff0c\u8fd9\u65f6\u5019 \\(\\cos(x) = -1 \\)<\/li>\n<\/ul>\n<\/div>\n\n\n\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\">\n<figure class=\"wp-block-image size-large\"><img loading=\"lazy\" decoding=\"async\" width=\"1024\" height=\"613\" src=\"https:\/\/www.orczhou.com\/wp-content\/uploads\/2025\/11\/image-8-1024x613.png\" alt=\"\" class=\"wp-image-20849\" srcset=\"https:\/\/www.orczhou.com\/wp-content\/uploads\/2025\/11\/image-8-1024x613.png 1024w, https:\/\/www.orczhou.com\/wp-content\/uploads\/2025\/11\/image-8-300x180.png 300w, https:\/\/www.orczhou.com\/wp-content\/uploads\/2025\/11\/image-8-768x460.png 768w, https:\/\/www.orczhou.com\/wp-content\/uploads\/2025\/11\/image-8.png 1390w\" sizes=\"auto, (max-width: 1024px) 100vw, 1024px\" \/><\/figure>\n<\/div>\n<\/div>\n\n\n\n<p>\u4f8b\u5982\uff0c<\/p>\n\n\n\n<h4 class=\"wp-block-heading\">3.2 \u201cSimilarity Score Matrix\u201d<\/h4>\n\n\n\n<p>\u56e0\u4e3a\u4e24\u4e2a\u5411\u91cf\u7684\u201c\u5185\u79ef\u201d\u67d0\u79cd\u7a0b\u5ea6\u53ef\u4ee5\u8868\u793a\u4e3a\u76f8\u4f3c\u5ea6\u3002\u90a3\u4e48\uff0c\u5bf9\u4e8e\u53e5\u5b50\u4e2d\u7684\u67d0\u4e2a token \u6765\u8bf4\uff0c\u4e0e\u5176\u4ed6\u6240\u6709\u5411\u91cf\u5404\u81ea\u8ba1\u7b97\u201c\u5185\u79ef\u201d\uff0c\u5c31\u53ef\u4ee5\u83b7\u5f97\u4e00\u4e2a\u4e0e\u5176\u4ed6\u6240\u6709\u5411\u91cf\u201c\u76f8\u4f3c\u7a0b\u5ea6\u201d\u7684\u6570\u7ec4\uff0c\u518d\u5bf9\u8fd9\u4e2a\u6570\u7ec4\u8fdb\u884c softmax \u8ba1\u7b97\u5c31\u53ef\u4ee5\u83b7\u5f97\u4e00\u4e2a\u8be5 token 
\u4e0e\u5176\u4ed6\u6240\u6709\u5411\u91cf\u201c\u76f8\u4f3c\u7a0b\u5ea6\u201d\u7684\u5f52\u4e00\u5316\u6570\u7ec4\u3002\u8fd9\u4e2a\u5f52\u4e00\u5316\u7684\u6570\u636e\uff0c\u5c31\u53ef\u4ee5\u7406\u89e3\u4e3a\u8fd9\u91cc\u7684\u201cSimilarity Score Matrix\u201d\u3002<\/p>\n\n\n\n<figure class=\"wp-block-image alignright size-large is-resized\"><img loading=\"lazy\" decoding=\"async\" width=\"1024\" height=\"830\" src=\"https:\/\/www.orczhou.com\/wp-content\/uploads\/2025\/11\/image-9-1024x830.png\" alt=\"\" class=\"wp-image-20859\" style=\"aspect-ratio:1.2337409455565076;width:410px;height:auto\" srcset=\"https:\/\/www.orczhou.com\/wp-content\/uploads\/2025\/11\/image-9-1024x830.png 1024w, https:\/\/www.orczhou.com\/wp-content\/uploads\/2025\/11\/image-9-300x243.png 300w, https:\/\/www.orczhou.com\/wp-content\/uploads\/2025\/11\/image-9-768x622.png 768w, https:\/\/www.orczhou.com\/wp-content\/uploads\/2025\/11\/image-9.png 1150w\" sizes=\"auto, (max-width: 1024px) 100vw, 1024px\" \/><\/figure>\n\n\n\n<p>\u8fd9\u91cc\u4f9d\u65e7\u4ee5\u201c<a href=\"https:\/\/www.orczhou.com\/index.php\/2025\/11\/understanding-llm-tokenize\/\">\u5927\u8bed\u8a00\u6a21\u578b\u7684\u8f93\u5165\uff1atokenize<\/a>\u201d\u793a\u4f8b\u4e2d\u7684\u53e5\u5b50\u4e3a\u6f14\u793a\u793a\u4f8b\u3002<\/p>\n\n\n\n<p>\u66f4\u4e3a\u5177\u4f53\u7684\uff0c\u53ef\u4ee5\u53c2\u8003\u53f3\u56fe\u3002\u8fd9\u91cc\u8003\u8651 Token \u201cIt\u201d \u4e0e\u5176\u4ed6\u6240\u6709\u8bcd\u8bed\u7684\u76f8\u4f3c\u5ea6\u3002\u5373\u8ba1\u7b97 Token \u201cIt\u201d \u7684 Embedding \u5411\u91cf\uff0c\u4e0e\u5176\u4ed6\u6240\u6709\u5411\u91cf\u7684\u201c\u5185\u79ef\u201d\u3002<\/p>\n\n\n\n<p>\u66f4\u8fdb\u4e00\u6b65\uff0c\u5982\u679c\u8ba1\u7b97\u4e24\u4e24\u8bcd\u8bed\u4e4b\u95f4\u7684\u76f8\u4f3c\u5ea6\uff0c\u5e76\u8fdb\u884c\u5f52\u4e00\u5316\uff08softmax \uff09\uff0c\u5219\u6709\u5982\u4e0b\u7684Similarity Matrix\uff1a<\/p>\n\n\n\n<div style=\"height:4px\" aria-hidden=\"true\" 
class=\"wp-block-spacer\"><\/div>\n\n\n\n<pre class=\"wp-block-preformatted has-base-color has-contrast-background-color has-text-color has-background has-link-color wp-elements-6891debeed86b615ab0b3804ecc84b29\" style=\"font-size:clamp(14px, 0.875rem + ((1vw - 3.2px) * 0.156), 16px);\">[0.6975, 0.0347, 0.0236, 0.0298, 0.0386, 0.0282, 0.0272, 0.0270, 0.0216, 0.0244, 0.0219, 0.0254]<br>[0.0000, 0.9994, 0.0002, 0.0000, 0.0000, 0.0001, 0.0000, 0.0000, 0.0000, 0.0001, 0.0001, 0.0000]<br>[0.0000, 0.0004, 0.9987, 0.0001, 0.0001, 0.0001, 0.0000, 0.0002, 0.0000, 0.0001, 0.0002, 0.0000]<br>[0.0002, 0.0003, 0.0004, 0.9945, 0.0008, 0.0004, 0.0011, 0.0003, 0.0007, 0.0005, 0.0002, 0.0007]<br>[0.0013, 0.0012, 0.0013, 0.0042, 0.9724, 0.0044, 0.0047, 0.0021, 0.0030, 0.0013, 0.0012, 0.0028]<br>[0.0002, 0.0003, 0.0003, 0.0004, 0.0009, 0.9932, 0.0005, 0.0020, 0.0004, 0.0009, 0.0006, 0.0003]<br>[0.0049, 0.0048, 0.0038, 0.0299, 0.0258, 0.0125, 0.7728, 0.0086, 0.0779, 0.0095, 0.0025, 0.0471]<br>[0.0001, 0.0001, 0.0005, 0.0002, 0.0003, 0.0015, 0.0002, 0.9959, 0.0002, 0.0003, 0.0003, 0.0002]<br>[0.0049, 0.0048, 0.0045, 0.0255, 0.0203, 0.0126, 0.0974, 0.0082, 0.7698, 0.0094, 0.0024, 0.0401]<br>[0.0001, 0.0001, 0.0001, 0.0002, 0.0001, 0.0003, 0.0001, 0.0002, 0.0001, 0.9983, 0.0002, 0.0002]<br>[0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.9998, 0.0000]<br>[0.0033, 0.0027, 0.0028, 0.0129, 0.0110, 0.0063, 0.0334, 0.0055, 0.0228, 0.0075, 0.0025, 0.8893]<\/pre>\n\n\n\n<p>\u5728\u8fd9\u4e2a\u793a\u4f8b\u4e2d\uff0c\u5219\u4f1a\u6709\u4e0a\u8ff0 12&#215;12 
\u7684\u77e9\u9635\u3002\u8be5\u77e9\u9635\u53cd\u6620\u4e86\u201c\u8bcd\u201d\u4e0e\u201c\u8bcd\u201d\u4e4b\u95f4\u7684\u76f8\u4f3c\u5ea6\u3002\u5982\u679c\uff0c\u6211\u4eec\u628a\u6bcf\u4e00\u884c\u518d\u8fdb\u884c\u4e00\u4e2a\u201c\u5f52\u4e00\u5316\u201d\uff08\u6ce8\uff1a\u53f3\u56fe\u5df2\u7ecf\u7ecf\u8fc7\u4e86\u5f52\u4e00\u5316\uff09\uff0c\u90a3\u4e48\u6bcf\u4e00\u884c\uff0c\u5c31\u53cd\u6620\u4e86\u4e00\u4e2a\u8bcd\u8bed\u4e0e\u5176\u4ed6\u6240\u6709\u8bcd\u8bed\u76f8\u4f3c\u7a0b\u5ea6\u7684\u4e00\u4e2a\u5ea6\u91cf\u3002<\/p>\n\n\n\n<div class=\"wp-block-columns is-layout-flex wp-container-core-columns-is-layout-28f84493 wp-block-columns-is-layout-flex\">\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\">\n<p>\u4f8b\u5982\uff0c\u53f3\u56fe\u4e2d it \u53ef\u80fd\u4e0e very \u6700\u4e3a\u76f8\u4f3c\uff08\u9664\u4e86\u81ea\u8eab\uff09\u3002<\/p>\n<\/div>\n<\/div>\n\n\n\n<h3 class=\"wp-block-heading\">4. Self-Attention<\/h3>\n\n\n\n<h4 class=\"wp-block-heading\">4.1 \u5bf9\u6bd4<\/h4>\n\n\n\n<p>\u6ce8\u610f\u5230\u6700\u7ec8\u7684 \u201cAttention\u201d \u8ba1\u7b97\u516c\u5f0f\u548c\u4e0a\u8ff0\u7684\u201cSimilarity Score Matrix\u201d\u7684\u5dee\u522b\u5c31\u662f\u53c2\u6570\u77e9\u9635W\uff1a<\/p>\n\n\n\n<div class=\"wp-block-columns is-layout-flex wp-container-core-columns-is-layout-28f84493 wp-block-columns-is-layout-flex\">\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\">\n<p>$$ \\text{Similarity Score} = \\text{softmax}(\\frac{XX^{T}}{\\sqrt{d}})X $$<\/p>\n\n\n\n<p>\u8fd9\u91cc\u6ca1\u6709\u53c2\u6570\u77e9\u9635\uff1a\\(W^Q \\quad W^K \\quad W^V \\)<\/p>\n<\/div>\n\n\n\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\" style=\"border-left-color:var(--wp--preset--color--contrast);border-left-style:dotted;flex-basis:3%\"><\/div>\n\n\n\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\">\n<p>$$ \\text{Attention} = 
\\text{softmax}(\\frac{QK^{T}}{\\sqrt{d}})V $$<\/p>\n\n\n\n<p>\u5176\u4e2d\uff0c \\(Q = XW^Q \\quad  K = XW^K \\quad  V = XW^V \\)<\/p>\n<\/div>\n<\/div>\n\n\n\n<h4 class=\"wp-block-heading\">4.2 \u4e3a\u4ec0\u4e48\u9700\u8981\u53c2\u6570\u77e9\u9635 W<\/h4>\n\n\n\n<p>\u90a3\u4e48\uff0c\u4e3a\u4ec0\u4e48\u9700\u8981 \\(W^Q \\,, W^K \\,, W^V \\) \u5462\uff1f\u8fd9\u4e09\u4e2a\u53c2\u6570\u77e9\u9635\u4e58\u6cd5\uff0c\u610f\u5473\u7740\u4ec0\u4e48\u5462\uff1f\u8981\u8bf4\u6e05\u695a\u3001\u8981\u7406\u89e3\u8fd9\u4e2a\u70b9\u5e76\u4e0d\u5bb9\u6613\uff0c\u4e5f\u6ca1\u6709\u4ec0\u4e48\u7b80\u5355\u7684\u63cf\u8ff0\u53ef\u4ee5\u8bf4\u6e05\u695a\u7684\uff0c\u8fd9\u4e5f\u5927\u6982\u662f\u4e3a\u4ec0\u4e48\uff0c\u5bf9\u4e8e\u975e NLP \u4e13\u4e1a\u7684\u4eba\uff0c\u8981\u60f3\u771f\u6b63\u7406\u89e3 Transformer \u6216 Attention \u662f\u6bd4\u8f83\u56f0\u96be\u7684\u3002<\/p>\n\n\n\n<p>\u4f60\u53ef\u80fd\u4f1a\u770b\u5230\u8fc7\u4e00\u79cd\u6bd4\u8f83\u666e\u904d\u7684\u3001\u7b80\u5316\u7248\u672c\uff0c\u5927\u6982\u662f\u8bf4  \\(W^Q \\) \u662f\u4e00\u4e2a Query \u77e9\u9635\uff0c\u8868\u793a\u8981\u67e5\u8be2\u4ec0\u4e48\uff1b\\(W^K \\) \u662f\u4e00\u4e2a Key \u77e9\u9635\uff0c\u8868\u793a\u4e00\u4e2a\u8bcd\u6709\u4ec0\u4e48\u3002\u8fd9\u4e2a\u8bf4\u6cd5\u4f3c\u4e4e\u5e76\u4e0d\u80fd\u589e\u52a0\u5bf9\u4e0a\u8ff0\u516c\u5f0f\u7684\u7406\u89e3\u3002<\/p>\n\n\n\n<p>\u90a3\u4e48\uff0c\u4e00\u4e2a\u5411\u91cf\u4e58\u4ee5\u4e00\u4e2a\u77e9\u9635\u65f6\uff0c\u8fd9\u4e2a\u201c\u77e9\u9635\u201d\u610f\u5473\u7740\u4ec0\u4e48\uff1f\u662f\u7684\uff0c\u5c31\u662f\u201c\u7ebf\u6027\u53d8\u6362\u201d\u3002<\/p>\n\n\n\n<h4 class=\"wp-block-heading\">4.3 \u7ebf\u6027\u53d8\u6362 Linear Transformations<\/h4>\n\n\n\n<p>\u4e00\u822c\u6765\u8bf4\uff0c\\(W^Q \\,, W^K \\,, W^V \\) \u90fd\u662f \\(d \\times d \\) \u7684\u77e9\u9635<sup>[1]<\/sup>\u3002\u5bf9\u4e8e Token Embedding \uff08\u4e0a\u8ff0\u7684\u77e9\u9635 X 
\uff09\u6240\u5728\u7684\u5411\u91cf\u7a7a\u95f4\uff0c\u90a3\u4e48 \\(W^Q \\,, W^K \\,, W^V \\) \u5c31\u662f\u8be5\u5411\u91cf\u7a7a\u95f4\u7684\u4e09\u4e2a\u201c\u7ebf\u6027\u53d8\u6362\u201d\u3002<\/p>\n\n\n\n<p>\u90a3\u4e48\u7ebf\u6027\u53d8\u6362\u5bf9\u4e8e\u5411\u91cf\u7a7a\u95f4\u7684\u4f5c\u7528\u662f\u4ec0\u4e48\u5462\uff1f\u8fd9\u91cc\u6211\u4eec\u4ee5\u201c\u5947\u5f02\u503c\u5206\u89e3\u201d\u7684\u89d2\u5ea6\u6765\u7406\u89e3\u8fd9\u4e2a\u95ee\u9898<sup>[2]<\/sup>\uff0c\u5373\u5bf9\u5411\u91cf\u8fdb\u884c\u62c9\u4f38\/\u538b\u7f29\u3001\u65cb\u8f6c\u3001\u955c\u50cf\u53d8\u6362\u3002\\(W^Q \\,, W^K \\,, W^V \\) \u5219\u4f1a\u5206\u522b\u5bf9\u5411\u91cf\u7a7a\u95f4\u7684\u5411\u91cf\uff08\u5373Token Embedding\uff09\u505a\u7c7b\u4f3c\u7684\u53d8\u6362\u3002\u53d8\u6362\u7684\u7ed3\u679c\u5373\u4e3a\uff1a<\/p>\n\n\n<p>$$ Q = XW^Q \\quad \\quad K = XW^K \\quad \\quad V = XW^V $$<\/p>\n\n\n\n<p>\u90a3\u4e48\uff0c\u5982\u679c\u53c2\u6570\u77e9\u9635\u201c\u8bbe\u8ba1\u201d\u5408\u7406\uff0c\u201cToken\u201d\u4e0e\u201cToken\u201d\u4e4b\u95f4\u5c31\u53ef\u4ee5\u5efa\u7acb\u201c\u671f\u671b\u201d\u7684 Attention \u5173\u7cfb\uff0c\u4f8b\u5982\uff1a\u201c\u4ee3\u8bcd\u201d\uff08it\uff09\uff0c\u603b\u662f\u66f4\u591a\u7684\u5173\u6ce8\u4e8e\u201c\u540d\u8bcd\u201d\uff1b\u201c\u540d\u8bcd\u201d\u66f4\u591a\u7684\u5173\u6ce8\u4e8e\u9644\u8fd1\u7684\u201c\u5f62\u5bb9\u8bcd\u201d\uff1b\u518d\u6bd4\u5982\uff0c\u201c\u52a8\u8bcd\u201d\u66f4\u591a\u5173\u6ce8\u524d\u540e\u7684\u201c\u540d\u8bcd\u201d\u7b49\u3002\u9664\u4e86\u8bcd\u6027\uff0c\u7ebf\u6027\u53d8\u6362\u5173\u6ce8\u7684\u201c\u7ef4\u5ea6\u201d\u53ef\u80fd\u6709\u5f88\u591a\uff0c\u4f8b\u5982\u201c\u4f4d\u7f6e\u201d\u3001\u201c\u60c5\u611f\u201d\u3001\u201c\u52a8\u7269\u201d\u3001\u201c\u690d\u7269\u201d\u3001\u201c\u79ef\u6781\/\u6d88\u6781\u201d\u7b49\u3002\u5173\u4e8e\u5982\u4f55\u7406\u89e3 token embedding \u7684\u5404\u4e2a\u201c\u7ef4\u5ea6\u201d\u542b\u4e49\u53ef\u4ee5\u53c2\u8003\uff1a<a 
href=\"https:\/\/www.orczhou.com\/index.php\/2025\/08\/understanding-word-embedding-some-way\/\">Word Embedding \u7684\u53ef\u89e3\u91ca\u6027\u63a2\u7d22<\/a>\u3002<\/p>\n\n\n\n<p>\u5f53\u7136\uff0c\u8fd9\u4e09\u4e2a\u53c2\u6570\u77e9\u9635\u90fd\u4e0d\u662f\u201c\u8bbe\u8ba1\u201d\u51fa\u6765\u7684\uff0c\u800c\u662f\u201c\u8bad\u7ec3\u201d\u51fa\u6765\u7684\u3002\u6240\u4ee5\uff0c\u8981\u60f3\u5bfb\u627e\u4e0a\u8ff0\u5982\u6b64\u6e05\u6670\u7684\u201c\u53ef\u89e3\u91ca\u6027\u201d\u5e76\u4e0d\u5bb9\u6613\u30022019\u5e74\u7684\u8bba\u6587\u300a<a href=\"https:\/\/arxiv.org\/abs\/1906.04341\">What Does BERT Look At? An Analysis of BERT&#8217;s Attention<\/a>\u300b\u8f83\u4e3a\u7cfb\u7edf\u7684\u8ba8\u8bba\u4e86\u8fd9\u4e2a\u95ee\u9898\uff0c\u611f\u5174\u8da3\u7684\u53ef\u4ee5\u53bb\u770b\u770b\u3002<\/p>\n\n\n\n<p>\u5173\u4e8e\u7ebf\u6027\u53d8\u6362\u5982\u4f55\u4f5c\u7528\u5728\u5411\u91cf\u7a7a\u95f4\u4e0a\uff0c\u53ef\u4ee5\u53c2\u8003\uff1a\u7ebf\u6027\u4ee3\u6570\u3001<a href=\"https:\/\/www.orczhou.com\/index.php\/2025\/09\/svd-01\/\">\u5947\u5f02\u503c\u5206\u89e3\u2013\u6df1\u5ea6\u5b66\u4e60\u7684\u6570\u5b66\u57fa\u7840<\/a>\u3002<\/p>\n\n\n\n<p>\u6240\u4ee5\uff0c\\( \\frac{QK^T}{\\sqrt{d}} =  \\frac{XW^Q (XW^K)^T}{\\sqrt{d}} \\) \u5219\u53ef\u4ee5\u7cfb\u7edf\u7684\u8868\u793a\uff0c\u6bcf\u4e2a\u201cToken\u201d\u5bf9\u4e8e\u5176\u4ed6\u201cToken\u201d\u7684\u5173\u6ce8\u7a0b\u5ea6\uff08\u5373pay attention\u7684\u7a0b\u5ea6\uff09\u3002\u53ef\u4ee5\u6ce8\u610f\u5230\uff1a<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li>\u589e\u52a0\u4e86\u53c2\u6570\u77e9\u9635\\(W^Q \\,, W^K \\,, W^V \\)\u540e\uff0c\u524d\u9762\u7684\u201c\u76f8\u4f3c\u6027\u201d\u77e9\u9635\uff0c\u5c31\u53d8\u4e3a\u201c\u6ce8\u610f\u529b\u201d\u77e9\u9635<\/li>\n\n\n\n<li>\u201cToken\u201d \u4e4b\u95f4\u7684\u5173\u6ce8\u7a0b\u5ea6\u4e0d\u662f\u5bf9\u79f0\u7684\u3002\u4f8b\u5982 Token A \u53ef\u80fd\u5f88\u5173\u6ce8 B\uff1b\u4f46\u662f B \u53ef\u80fd\u5e76\u4e0d\u5173\u6ce8 
A<\/li>\n\n\n\n<li>\u8fd9\u91cc\u9664\u4ee5 \\(\\sqrt{d} \\) \uff0c\u6839\u636e\u8bba\u6587\u63cf\u8ff0\uff0c\u662f\u4e3a\u4e86\u907f\u514d\u7ef4\u5ea6 d \u8f83\u5927\u65f6\u5185\u79ef\u7684\u6570\u503c\u8fc7\u5927\u3001\u4f7f softmax \u4e4b\u540e\u7684\u68af\u5ea6\u53d8\u5f97\u6781\u5c0f\uff08\u800c\u4e0d\u662f\u4e3a\u4e86\u63d0\u5347\u8ba1\u7b97\u6027\u80fd\uff09\uff1b<\/li>\n<\/ul>\n\n\n\n<p>\u5982\u679c\uff0c\u4f60\u6070\u597d\u7406\u89e3\u4e86\u4e0a\u9762\u6240\u6709\u7684\u63cf\u8ff0\uff0c\u5927\u6982\u4f1a\u6709\u70b9\u5931\u671b\u7684\u3002\u5c31\u53ea\u80fd\u5230\u8fd9\u513f\u5417\uff1f\u4f3c\u4e4e\u5c31\u53ea\u80fd\u5230\u8fd9\u91cc\u4e86\u3002\u5982\u679c\uff0c\u4f60\u6709\u66f4\u6df1\u523b\u7684\u7406\u89e3\uff0c\u6b22\u8fce\u7559\u8a00\u8ba8\u8bba\u3002<\/p>\n\n\n\n<p>\u63a5\u4e0b\u6765\uff0c\u6211\u4eec\u6765\u770b\u770b \u201cAttention Score Matrix\u201d \u7684\u8ba1\u7b97\u3002<\/p>\n\n\n\n<h4 class=\"wp-block-heading\">4.4 Attention Score Matrix<\/h4>\n\n\n\n<p>\u4f7f\u7528\u4e0a\u8ff0\u7684 \u201cSimilarity Score Matrix\u201d \u7684\u8ba1\u7b97\u65b9\u5f0f\uff0c\u53ef\u4ee5\u8ba1\u7b97\u7c7b\u4f3c\u7684 \u201cAttention Score Matrix\u201d\uff0c\u4e4b\u540e\u518d\u5bf9\u8be5\u77e9\u9635\u8fdb\u884c softmax \u8ba1\u7b97\u5c31\u53ef\u4ee5\u83b7\u5f97\u6bcf\u4e2a\u8bcd\u8bed\u5bf9\u4e8e\u5176\u4ed6\u6240\u6709\u8bcd\u8bed\u7684 Attention Score\uff0c\u6216\u8005\u53eb\u201c\u5173\u6ce8\u7a0b\u5ea6\u201d\u3002\u6709\u4e86\u8fd9\u4e2a\u5173\u6ce8\u7a0b\u5ea6\uff0c\u518d\u4e58\u4ee5 V \u77e9\u9635\uff0c\u539f\u6765\u7684 Token Embedding \u5c31\u53d8\u6362\u4e3a\u4e00\u4e2a\u65b0\u7684\u5e26\u6709\u4e0a\u4e0b\u6587\u7684\u542b\u4e49\u7684 Token Embedding \u4e86\uff0c\u5373 Context Embedding<sup>[3]<\/sup>\u3002<\/p>\n\n\n\n<div class=\"wp-block-columns is-layout-flex wp-container-core-columns-is-layout-28f84493 wp-block-columns-is-layout-flex\">\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\" style=\"flex-basis:30%\">\n<p>\u7c7b\u4f3c\u7684\uff0c\u6211\u4eec\u6709\u53f3\u56fe\u7684 Attention Score Matrix \u8ba1\u7b97\u3002<\/p>\n\n\n\n<p>\u8be5\u77e9\u9635\u53cd\u6620\u4e86\u4e24\u4e2a Token \u4e4b\u95f4\u7684 Attention 
\u5173\u7cfb\u3002\u8be5\u5173\u7cfb\uff0c\u901a\u8fc7\u5bf9\u7ecf\u8fc7\u7ebf\u6027\u53d8\u6362\u7684 Token Embedding \uff0c\u518d\u8fdb\u884c\u5185\u79ef\u8ba1\u7b97\u83b7\u5f97\u3002<\/p>\n<\/div>\n\n\n\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\" style=\"font-size:12px;flex-basis:70%\">\n<pre class=\"wp-block-preformatted has-base-color has-contrast-background-color has-text-color has-background has-link-color wp-elements-64a14c2b85c3daf300385ad87c9f5c55\" style=\"font-size:10px\">           Attention Score Matrix (12 x 12)<br>           It     \u00e2\u0122     \u013b      s    \u0120very  \u0120hot   \u0120in   \u0120summer   .     \u0120Sw   imming  \u0120is<br>         -----------------------------------------------------------------------------------<br> It      [ 0.14, -1.53, -1.45, -1.71, -1.69, -1.74, -2.36, -2.27, -2.37, -1.33, -0.58, -2.40]  |<br> \u00e2\u0122      [ 0.70, -0.93, -1.72, -1.02, -1.52, -2.24, -1.90, -2.19, -1.63, -2.13, -1.66, -2.14]  |<br> \u013b       [-0.60, -1.81, -1.99, -1.96, -2.57, -1.84, -1.62, -2.04, -0.98, -1.18, -2.23, -2.25]  |<br> s       [-0.46, -1.33, -1.60, -2.65, -2.24, -1.99, -2.89, -1.44, -2.05, -2.77, -2.09, -2.74]  |<br> \u0120very   [ 0.29, -1.42, -1.77, -1.15, -0.94, -1.14, -1.81, -1.04, -1.77, -2.13, -0.60, -0.82]  |<br> \u0120hot    [ 0.03, -0.68, -0.59, -0.95, -1.78, -0.10, -0.95, -0.14, -1.32, -0.57,  0.06, -1.07]  12<br> \u0120in     [-0.71, -1.72, -1.53, -2.18, -1.67, -1.93, -3.41, -1.69, -2.74, -1.89, -1.17, -2.02]  rows<br> \u0120summer [-0.34, -1.49, -1.35, -1.31, -1.12, -0.89, -1.49, -1.11, -1.51, -1.15, -1.45, -1.20]  |<br> .       
[-0.89, -1.73, -2.67, -2.80, -2.45, -2.37, -4.39, -2.33, -4.42, -2.73, -1.82, -3.21]  |<br> \u0120Sw     [-0.05, -1.15, -1.76, -1.15, -1.68, -0.74, -1.15, -1.35, -1.36, -1.29, -0.43, -1.51]  |<br> imming  [-0.02, -1.65, -0.87, -0.35, -1.18, -0.65, -0.33, -1.25, -0.38, -1.68, -2.15, -1.08]  |<br> \u0120is     [-0.97, -2.03, -2.56, -2.94, -1.96, -2.71, -4.07, -2.46, -3.51, -2.68, -1.88, -2.99]  |<br>         -----------------------------------------------------------------------------------<br>         |&lt;----------------------------- columns: 12 --------------------------------------&gt;|<\/pre>\n<\/div>\n<\/div>\n\n\n\n<h4 class=\"wp-block-heading\">4.5 Masked Attention Score Matrix<\/h4>\n\n\n\n<p>\u4e0a\u8ff0\u8ba1\u7b97\uff0c\u662f\u4e00\u4e2a\u5178\u578b\u7684 Self Attention \u8ba1\u7b97\u8fc7\u7a0b\uff0cBERT \u6a21\u578b\u5c31\u4f7f\u7528\u7c7b\u4f3c\u7684\u8ba1\u7b97\uff0c\u4f46 GPT \u6a21\u578b\uff08\u6216\u8005\u53eb Decoder \u6a21\u578b\uff09\u8fd8\u6709\u4e00\u4e9b\u4e0d\u540c\u3002GPT \u6a21\u578b\u4e2d\u4e3a\u4e86\u8bad\u7ec3\u51fa\u66f4\u597d\u7684\u4ece\u73b0\u6709 Token \u4e2d\u751f\u6210\u65b0 Token \u7684\u6a21\u578b\uff0c\u5c06\u4e0a\u8ff0\u7684 Self Attention \u66f4\u6539\u6210\u4e86 Masked Self Attention \uff0c\u5373\u5c06 Attention Score Matrix \u7684\u53f3\u4e0a\u89d2\u90e8\u5206\u5168\u90e8\u7f6e\u4e3a -inf \uff08\u5373\u8d1f\u65e0\u7a77\uff09\uff0c\u540e\u7eed\u7ecf\u8fc7 softmax \u4e4b\u540e\u8fd9\u4e9b\u503c\u90fd\u4f1a\u53d8\u6210\u96f6\uff0c\u5373\uff0c\u5728\u8be5\u7c7b\u6a21\u578b\u4e0b\uff0c\u4e00\u4e2a\u8bcd\u8bed\u5bf9\u4e8e\u5176\u540e\u9762\u7684\u8bcd\u7684\u5173\u6ce8\u5ea6\u4e3a 0 \u3002<\/p>\n\n\n\n<div class=\"wp-block-columns is-layout-flex wp-container-core-columns-is-layout-28f84493 wp-block-columns-is-layout-flex\">\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\" style=\"flex-basis:30%\">\n<p>\u5728 Decoder 
\u6a21\u578b\u8bbe\u8ba1\u4e2d\uff0c\u4e3a\u4e86\u751f\u6210\u66f4\u51c6\u786e\u7684\u4e0b\u4e00\u4e2a Token \u6240\u4ee5\u5728\u8bad\u7ec3\u548c\u63a8\u7406\u4e2d\uff0c\u4ec5\u4f1a\u8ba1\u7b97Token \u5bf9\u4e4b\u524d\u7684 Token \u7684 Attention \uff0c\u6240\u4ee5\u4e0a\u8ff0\u7684\u77e9\u9635\u7684\u53f3\u4e0a\u89d2\u90e8\u5206\u5c31\u4f1a\u88ab\u906e\u76d6\uff0c\u5373\u5c31\u662f\u53f3\u4fa7\u7684 \u201cMasked Attention Score Matrix\u201d\u3002<\/p>\n<\/div>\n\n\n\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\" style=\"font-size:12px;flex-basis:70%\">\n<pre class=\"wp-block-preformatted has-base-color has-contrast-background-color has-text-color has-background has-link-color wp-elements-85ff0273cfdaf01144db33a73d971f3b\" style=\"font-size:10px\">           Masked Attention Score Matrix (12 x 12)<br>           It     \u00e2\u0122     \u013b      s    \u0120very  \u0120hot   \u0120in   \u0120summer   .     \u0120Sw   imming  \u0120is<br>         ---------------------------------------------------------------------------------------<br> It      [ 0.14,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf]  |<br> \u00e2\u0122      [ 0.70, -0.93,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf]  |<br> \u013b       [-0.60, -1.81, -1.99,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf]  |<br> s       [-0.46, -1.33, -1.60, -2.65,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf]  |<br> \u0120very   [ 0.29, -1.42, -1.77, -1.15, -0.94,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf]  |<br> \u0120hot    [ 0.03, -0.68, -0.59, -0.95, -1.78, -0.10,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf]  12<br> \u0120in     [-0.71, -1.72, -1.53, -2.18, -1.67, -1.93, -3.41,  -inf,  -inf,  -inf,  -inf,  -inf]  rows<br> \u0120summer [-0.34, -1.49, -1.35, -1.31, -1.12, -0.89, -1.49, -1.11,  -inf,  -inf,  -inf,  -inf]  |<br> .       
[-0.89, -1.73, -2.67, -2.80, -2.45, -2.37, -4.39, -2.33, -4.42,  -inf,  -inf,  -inf]  |<br> \u0120Sw     [-0.05, -1.15, -1.76, -1.15, -1.68, -0.74, -1.15, -1.35, -1.36, -1.29,  -inf,  -inf]  |<br> imming  [-0.02, -1.65, -0.87, -0.35, -1.18, -0.65, -0.33, -1.25, -0.38, -1.68, -2.15,  -inf]  |<br> \u0120is     [-0.97, -2.03, -2.56, -2.94, -1.96, -2.71, -4.07, -2.46, -3.51, -2.68, -1.88, -2.99]  |<br>         ---------------------------------------------------------------------------------------<br>         |&lt;----------------------------- columns: 12 --------------------------------------&gt;|<\/pre>\n<\/div>\n<\/div>\n\n\n\n<p>\u901a\u5e38\u8fd8\u4f1a\u5c06\u4e0a\u8ff0\u7684\u8ba1\u7b97\u503c\u9664\u4ee5 \\(\\sqrt{d} \\) \uff0c\u4ee5\u907f\u514d\u5185\u79ef\u6570\u503c\u8fc7\u5927\u5bfc\u81f4 softmax \u9971\u548c\u3001\u68af\u5ea6\u8fc7\u5c0f\uff0c\u4ece\u800c\u4f7f\u8bad\u7ec3\u66f4\u7a33\u5b9a\u3002<\/p>\n\n\n\n<h4 class=\"wp-block-heading\">4.6 \u5f52\u4e00\u5316(softmax) Attention Score<\/h4>\n\n\n\n<p>\u5bf9\u4e8e\u4e0a\u8ff0\u77e9\u9635\u7684\u6bcf\u4e00\u884c\u90fd\u8fdb\u884c\u4e00\u4e2a softmax \u8ba1\u7b97\uff0c\u5c31\u53ef\u4ee5\u83b7\u5f97\u4e00\u4e2a\u5f52\u4e00\u5316\u7684\u6309\u7167\u767e\u5206\u6bd4\u5206\u914d\u7684Attention Score\u3002<\/p>\n\n\n\n<div class=\"wp-block-columns is-layout-flex wp-container-core-columns-is-layout-28f84493 wp-block-columns-is-layout-flex\">\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\" style=\"flex-basis:30%\">\n<p>\u7ecf\u8fc7\u5f52\u4e00\u5316\u4e4b\u540e\uff0c\u6bcf\u4e2a\u8bcd\u8bed\u5bf9\u4e8e\u5176\u4ed6\u8bcd\u8bed\u7684 Attention \u7a0b\u5ea6\u90fd\u53ef\u4ee5\u4f7f\u7528\u767e\u5206\u6bd4\u8868\u8fbe\u51fa\u6765\u3002\u4f8b\u5982\uff0c\u201csummer\u201d\u5bf9\u4e8e\u201cIt\u201d\u7684\u5173\u6ce8\u7a0b\u5ea6\u6700\u9ad8\uff0c\u4e3a26%\uff1b\u5176\u6b21\u662f\u5173\u6ce8\u201chot\u201d\uff0c\u4e3a15%\u3002\u53ef\u4ee5\u770b\u5230\u8fd9\u4e00\u7ec4\u7ebf\u6027\u53d8\u6362\uff08\\(W^Q\\,W^K 
\\)\uff09\u5bf9\u4e8e\u7b2c\u4e00\u4e2a\u4f4d\u7f6e\u8868\u8fbe\u7279\u522b\u7684\u5173\u6ce8\u3002<\/p>\n<\/div>\n\n\n\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\" style=\"font-size:12px;flex-basis:70%\">\n<pre class=\"wp-block-preformatted has-base-color has-contrast-background-color has-text-color has-background has-link-color wp-elements-3e2c729f5544b47f258cad7fdbacffdd\" style=\"font-size:10px\">           The Attention Score Matrix (12 x 12)<br>           It   \u00e2\u0122    \u013b     s     \u0120very  \u0120hot  \u0120in  \u0120summer  .   \u0120Sw   imming \u0120is<br>         -----------------------------------------------------------------------------<br> It      [1.00  0.00  0.00  0.00   0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00]  |<br> \u00e2\u0122      [0.84  0.16  0.00  0.00   0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00]  |<br> \u013b       [0.65  0.19  0.16  0.00   0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00]  |<br> s       [0.54  0.23  0.17  0.06   0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00]  |<br> \u0120very   [0.54  0.10  0.07  0.13   0.16  0.00  0.00  0.00  0.00  0.00  0.00  0.00]  |<br> \u0120hot    [0.29  0.14  0.16  0.11   0.05  0.25  0.00  0.00  0.00  0.00  0.00  0.00]  12<br> \u0120in     [0.36  0.13  0.16  0.08   0.14  0.11  0.02  0.00  0.00  0.00  0.00  0.00]  rows<br> \u0120summer [0.26  0.08  0.09  0.10   0.12  0.15  0.08  0.12  0.00  0.00  0.00  0.00]  |<br> .       
[0.40  0.17  0.07  0.06   0.08  0.09  0.01  0.10  0.01  0.00  0.00  0.00]  |<br> \u0120Sw     [0.27  0.09  0.05  0.09   0.05  0.14  0.09  0.07  0.07  0.08  0.00  0.00]  |<br> imming  [0.19  0.04  0.08  0.14   0.06  0.10  0.14  0.06  0.13  0.04  0.02  0.00]  |<br> \u0120is     [0.30  0.10  0.06  0.04   0.11  0.05  0.01  0.07  0.02  0.05  0.12  0.04]  |<br>         -----------------------------------------------------------------------------<br>         |&lt;----------------------------- columns: 12 ---------------------------&gt;|<\/pre>\n<\/div>\n<\/div>\n\n\n\n<h4 class=\"wp-block-heading\">4.7 Contextual Embeddings<\/h4>\n\n\n\n<p>\u6700\u540e\uff0c\u518d\u6309\u7167\u4e0a\u8ff0\u7684 Attention Matrix \u7684\u6bd4\u4f8b\uff0c\u5c06\u5404\u4e2a Token Embedding \u8fdb\u884c\u4e00\u4e2a\u201c\u52a0\u6743\u5e73\u5747\u8ba1\u7b97\u201d\u3002<\/p>\n\n\n\n<p>\u4f8b\u5982\uff0c\u4e0a\u8ff0\u7684\u52a0\u6743\u8ba1\u7b97\u65f6\uff0c\u201csummer\u201d \u5219\u4f1a\u878d\u5165 26% \u7684\u201cIt\u201d\uff0c15%\u7684\u201chot\u201d&#8230; \uff0c\u6700\u540e\u751f\u6210\u65b0\u7684 \u201csummer\u201d \u7684\u8868\u8fbe\uff0c\u8fd9\u4e2a\u8868\u8fbe\u4e5f\u53ef\u4ee5\u67d0\u79cd\u7a0b\u5ea6\u7406\u89e3\u4e3a \u201cContextual Embeddings\u201d\u3002\u9700\u8981\u6ce8\u610f\u7684\u662f\uff0c\u8fd9\u91cc\u5728\u8ba1\u7b97\u52a0\u6743\u5e73\u5747\uff0c\u4e5f\u4e0d\u662f\u76f4\u63a5\u4f7f\u7528\u539f\u59cb\u7684 Token Embedding \uff0c\u4e5f\u662f\u4e00\u4e2a\u7ecf\u8fc7\u4e86\u7ebf\u6027\u53d8\u6362\u7684Embedding\uff0c\u8be5\u7ebf\u6027\u53d8\u6362\u77e9\u9635\u4e5f\u662f\u7ecf\u8fc7\u8bad\u7ec3\u800c\u6765\u7684\uff0c\u5373\u77e9\u9635 \\(W^V \\)\u3002<\/p>\n\n\n\n<div class=\"wp-block-columns is-layout-flex wp-container-core-columns-is-layout-28f84493 wp-block-columns-is-layout-flex\">\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\">\n<p>\u4f8b\u5982\uff0c\u4e0a\u8ff0\u7684\u52a0\u6743\u8ba1\u7b97\u65f6\uff0c\u201csummer\u201d \u5219\u4f1a\u878d\u5165 
26% \u7684\u201cIt\u201d\uff0c15%\u7684\u201chot\u201d&#8230; \uff0c\u6700\u540e\u751f\u6210\u65b0\u7684 \u201csummer\u201d \u7684\u8868\u8fbe\uff0c\u8fd9\u4e2a\u8868\u8fbe\u4e5f\u53ef\u4ee5\u67d0\u79cd\u7a0b\u5ea6\u7406\u89e3\u4e3a \u201cContextual Embeddings\u201d\u3002<\/p>\n\n\n\n<p>\u9700\u8981\u6ce8\u610f\u7684\u662f\uff0c\u8fd9\u91cc\u5728\u8ba1\u7b97\u52a0\u6743\u5e73\u5747\uff0c\u4e5f\u4e0d\u662f\u76f4\u63a5\u4f7f\u7528\u539f\u59cb\u7684 Token Embedding \uff0c\u4e5f\u662f\u4e00\u4e2a\u7ecf\u8fc7\u4e86\u7ebf\u6027\u53d8\u6362\u7684Embedding\uff0c\u8be5\u7ebf\u6027\u53d8\u6362\u77e9\u9635\u4e5f\u662f\u7ecf\u8fc7\u8bad\u7ec3\u800c\u6765\u7684\uff0c\u5373\u77e9\u9635 \\(W^V \\)\u3002<\/p>\n<\/div>\n\n\n\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\" style=\"font-size:14px\">\n<pre class=\"wp-block-preformatted has-base-color has-contrast-background-color has-text-color has-background has-link-color wp-elements-29068a14ea60b9a764f8dfe9f40f423c\" style=\"font-size:14px\">Token      | Contextual Embeddings(12 x 768)<br>--------------------------------------------<br>It         | [ 0.0452,  0.0628,  0.1463,...]<br>\u00e2\u0122         | [ 0.0153,  0.0752,  0.1247,...]<br>\u013b          | [ 0.0034,  0.0464,  0.0923,...]<br>s          | [-0.0082,  0.0464,  0.0801,...]<br>\u0120very      | [ 0.0218,  0.1029,  0.0621,...]<br>\u0120hot       | [ 0.0327,  0.0892,  0.0409,...]<br>\u0120in        | [ 0.0249,  0.0964,  0.0329,...]<br>\u0120summer    | [ 0.0583,  0.1195,  0.0068,...]<br>.          
| [ 0.0334,  0.1100,  0.0366,...]<br>\u0120Sw        | [ 0.0086,  0.0846,  0.0074,...]<br>imming     | [-0.0049,  0.0841, -0.0339,...]<br>\u0120is        | [ 0.0410,  0.0706,  0.0077,...]<\/pre>\n<\/div>\n<\/div>\n\n\n\n<h3 class=\"wp-block-heading\">5. \u8ba1\u7b97\u793a\u610f\u56fe<\/h3>\n\n\n\n<p>\u5982\u4e0b\u7684\u793a\u610f\u56fe\uff0c\u5728\u4e00\u5b9a\u7a0b\u5ea6\u4e0a\u53ef\u89c6\u5316\u5730\u8868\u8fbe\u4e86\uff1a\u4e00\u4e2a Token \u5982\u4f55\u7ecf\u8fc7\u4e0a\u8ff0\u7684\u77e9\u9635\u8fd0\u7b97\uff0c\u878d\u5408\u4e86\u5176\u4ed6 Token \u7684\u5185\u5bb9\u3002<\/p>\n\n\n\n<figure class=\"wp-block-image size-large\"><img loading=\"lazy\" decoding=\"async\" width=\"1024\" height=\"389\" src=\"https:\/\/www.orczhou.com\/wp-content\/uploads\/2025\/12\/image-1024x389.png\" alt=\"\" class=\"wp-image-20979\" srcset=\"https:\/\/www.orczhou.com\/wp-content\/uploads\/2025\/12\/image-1024x389.png 1024w, https:\/\/www.orczhou.com\/wp-content\/uploads\/2025\/12\/image-300x114.png 300w, https:\/\/www.orczhou.com\/wp-content\/uploads\/2025\/12\/image-768x292.png 768w, https:\/\/www.orczhou.com\/wp-content\/uploads\/2025\/12\/image-1536x584.png 1536w, https:\/\/www.orczhou.com\/wp-content\/uploads\/2025\/12\/image.png 1668w\" sizes=\"auto, (max-width: 1024px) 100vw, 1024px\" \/><\/figure>\n\n\n\n<h3 class=\"wp-block-heading\">6. 
\u6ce8\u610f\u529b\u77e9\u9635\u7684\u89c2\u5bdf<\/h3>\n\n\n\n<p>\u90a3\u4e48\uff0c\u6211\u4eec\u7ed9\u5b9a\u4e8e\u5982\u4e0b\u7684\u63d0\u793a\u8bcd\u8f93\u5165\uff1a\u201cMartin\u2019s one of my sons, and the other is Chanler.\u201d\u3002\u770b\u770b\u5728 GPT \u6a21\u578b\u4e2d\uff0c\u5404\u4e2aToken\u4e4b\u95f4\u7684 Attention \u60c5\u51b5\uff1a<\/p>\n\n\n\n<div class=\"wp-block-columns is-layout-flex wp-container-core-columns-is-layout-28f84493 wp-block-columns-is-layout-flex\">\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\">\n<figure class=\"wp-block-image size-full\"><img loading=\"lazy\" decoding=\"async\" width=\"1018\" height=\"930\" src=\"https:\/\/www.orczhou.com\/wp-content\/uploads\/2025\/11\/image-11.png\" alt=\"\" class=\"wp-image-20926\" srcset=\"https:\/\/www.orczhou.com\/wp-content\/uploads\/2025\/11\/image-11.png 1018w, https:\/\/www.orczhou.com\/wp-content\/uploads\/2025\/11\/image-11-300x274.png 300w, https:\/\/www.orczhou.com\/wp-content\/uploads\/2025\/11\/image-11-768x702.png 768w\" sizes=\"auto, (max-width: 1018px) 100vw, 1018px\" \/><\/figure>\n<\/div>\n\n\n\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\">\n<ul class=\"wp-block-list\">\n<li>\u8fd9\u53e5\u8bdd\u603b\u8ba1\u670921\u4e2atoken\uff0c\u6240\u4ee5\u8fd9\u662f\u4e00\u4e2a21&#215;21\u7684\u77e9\u9635<\/li>\n\n\n\n<li>\u8fd9\u91cc\u662f\u201cmasked self-attention\u201d\uff0c\u6240\u4ee5\u77e9\u9635\u7684\u53f3\u4e0a\u534a\u533a\u90fd\u662f \u201c0\u201d \u3002<\/li>\n\n\n\n<li>\u5728GPT2\u4e2d\uff0c\u4e00\u517112\u5c42\uff0c\u6bcf\u5c4212\u4e2a\u201c\u5934\u201d\uff0c\u6240\u4ee5\u4e00\u5171\u6709\u201c144\u201d\u4e2a\u7c7b\u4f3c\u7684\u77e9\u9635<\/li>\n\n\n\n<li>\\(W^Q \\,, W^K \\,, W^V \\) \u7684\u7ef4\u5ea6\u90fd\u662f768&#215;64\uff0c\u6240\u4ee5\u7c97\u7565\u7684\u4f30\u8ba1\u8fd9\u90e8\u5206\u7684\u53c2\u6570\u91cf\u5c31\u8d85\u8fc72000\u4e07\uff0c\u5177\u4f53\u7684\uff1a<\/li>\n<\/ul>\n\n\n\n<p 
class=\"has-text-align-center\">768*64*3*144 = 21,233,664<\/p>\n<\/div>\n<\/div>\n\n\n\n<h3 class=\"wp-block-heading\">7. Multi-Head Attention<\/h3>\n\n\n\n<h4 class=\"wp-block-heading\">7.1 Scaled Dot-Product Attention<\/h4>\n\n\n\n<p>\u4ee5\u524d\u9762\u5c0f\u8282\u201c\u9884\u5904\u7406\u201d\u4e2d\u7684 X \u4e3a\u4f8b\uff0cAttention Score Matrix \u5c31\u6709\u5982\u4e0b\u7684\u8ba1\u7b97\u516c\u5f0f\uff1a<\/p>\n\n\n<p>$$<br \/>\n\\begin{aligned}<br \/>\n\\text{Attention Score Matrix} &#038; = \\text{softmax}(\\frac{QK^T}{\\sqrt{d}})  \\\\<br \/>\n&#038; = \\text{softmax}(\\frac{XW^Q(XW^K)^T}{\\sqrt{d}})<br \/>\n\\end{aligned}<br \/>\n$$<\/p>\n\n\n\n<p>\u6700\u7ec8\u7684 Attention \u8ba1\u7b97\u5982\u4e0b\uff1a<\/p>\n\n\n<p>$$<br \/>\n\\begin{aligned}<br \/>\n\\text{Attention}(Q,K,V) &#038; = \\text{softmax}(\\frac{QK^T}{\\sqrt{d}})V  \\\\<br \/>\n&#038; = \\text{softmax}(\\frac{XW^Q(XW^K)^T}{\\sqrt{d}})XW^V<br \/>\n\\end{aligned}<br \/>\n$$<\/p>\n\n\n\n<h4 class=\"wp-block-heading\">7.2 Multi-Head Attention<\/h4>\n\n\n\n<p>\u4e0a\u8ff0\u7684\u201cAttention\u201d\u5728\u8bba\u6587\u201cAttention Is All You Need\u201d\u4e2d\u79f0\u4e3a\u201cScaled Dot-Product Attention\u201d\u3002\u66f4\u8fdb\u4e00\u6b65\u7684\u5728\u8bba\u6587\u4e2d\u63d0\u51fa\u4e86\u201cMulti-Head Attention\u201d\uff08\u7ecf\u5e38\u88ab\u7f29\u5199\u4e3a\u201cMHA\u201d\uff09\u3002\u5bf9\u5e94\u7684\u516c\u5f0f\u5982\u4e0b\uff08\u6765\u81ea\u539f\u59cb\u8bba\u6587\uff09\uff1a<\/p>\n\n\n<p>$$<br \/>\n\\begin{aligned}<br \/>\n\\text{MultiHead}(Q, K, V) &#038; = \\text{Concat}(\\text{head}_1, \\dots, \\text{head}_h)W^O \\\\<br \/>\n\\text{where} \\quad \\text{head}_i &#038; = \\text{Attention}(QW_i^Q, KW_i^K, VW_i^V)<br 
\/>\n$$<\/p>\n\n\n\n<p>\u66f4\u5b8c\u6574\u7684\u89e3\u91ca\uff0c\u53ef\u4ee5\u53c2\u8003\u539f\u59cb\u8bba\u6587\u3002\u8fd9\u91cc\u4f9d\u65e7\u4ee5\u524d\u6587\u7684\u793a\u4f8b\u6765\u8bf4\u660e\u4ec0\u4e48\u662fMHA\u3002<\/p>\n\n\n\n<p>\u5728\u672c\u6587\u7b2c4\u7ae0\u201cSelf-Attention\u201d\u4e2d\uff0c\u8f83\u4e3a\u8be6\u7ec6\u7684\u4ecb\u7ecd\u4e86\u76f8\u5173\u7684\u8ba1\u7b97\uff08\u5373\u6a21\u578b\u7684\u63a8\u7406\u8fc7\u7a0b\uff09\u3002\u5728\u793a\u4f8b\u4e2d\uff0c\u4e00\u5171\u670912\u4e2a\u201cToken\u201d\uff0c\u5728\u8fdb\u5165 Attention \u8ba1\u7b97\u65f6\u7ecf\u8fc7\u4f4d\u7f6e\u7f16\u7801\u3001\u5f52\u4e00\u5316\u540e\uff0c12\u4e2a\u201cToken\u201d\u5411\u91cf\u7ec4\u6210\u77e9\u9635\u201cX\u201d\uff0c\u8fd9\u91cc\u7684\u201cX\u201d\u7684 shape \u4e3a 12 x 768\uff0c\u901a\u5e38\u4f7f\u7528\u7b26\u53f7 \\(l \\times d \\) \u6216\u8005 \\(l \\times d_{model} \\) \u8868\u793a\u3002\u6700\u7ec8\u8f93\u51fa\u7684 Contextual Embedding \u4e5f\u662f \\(l \\times d_{model} \\) \u7684\u4e00\u7ec4\u8868\u793a12\u4e2a Token \u7684\u5411\u91cf\uff0c\u8fd9\u65f6\u6bcf\u4e2a\u5411\u91cf\u76f8\u6bd4\u6700\u521d\u7684\u8f93\u5165\u5411\u91cf\uff0c\u5219\u878d\u5408\u4e86\u4e0a\u4e0b\u6587\u4e2d\u5176\u4ed6\u8bcd\u8bed\u7684\u542b\u4e49\u3002\u5728\u4e00\u4e2a\u591a\u5c42\u7684\u6a21\u578b\u4e2d\uff0c\u8fd9\u7ec4\u5411\u91cf\u5219\u53ef\u4ee5\u4f5c\u4e3a\u4e0b\u4e00\u5c42\u7684\u8f93\u5165\u3002<\/p>\n\n\n\n<p>\u5728\u201cMulti-Head Attention\u201d\u4e2d\uff0c\u5176\u8f93\u5165\u3001\u8f93\u51fa\u4e0e\u201cSelf-Attention\u201d\u4e00\u6837\uff0c\u90fd\u662f \\(l \\times d_{model} \\) \u3002\u4f46\u662f\uff0c\u5bf9\u4e8e\u6700\u7ec8\u8f93\u51fa\u7684  \\(l \\times d_{model} \\) \u7684\u5411\u91cf\/\u77e9\u9635\uff0c\u5728 MHA \u4e2d\u5219\u5206\u4e3a\u591a\u4e2a HEAD \u5404\u81ea\u8ba1\u7b97\u5176\u4e2d\u7684\u4e00\u90e8\u5206\uff0c\u4f8b\u5982\uff0c\u4e00\u5171\u6709 \\(d_{model} \\) \u5217\uff0c\u90a3\u4e48\u5219\u5206\u522b\u6709 \\(h \\) \u4e2aHEAD\uff0c\u6bcf\u4e2a HEAD 
\u8f93\u51fa\u5176\u4e2d\u7684 \\(\\frac{d_{model}}{h} \\) \u5217\u3002\u5728\u4e0a\u8ff0\u793a\u4f8b\u4e2d\uff0c\u5171\u670912\u4e2aHEAD\uff0c\u5373 \\(h=12 \\)\uff0c\u6a21\u578b\u7ef4\u5ea6\u4e3a768\uff0c\u5373\\(d_{model} = 768 \\)\uff0c\u6240\u4ee5\u6bcf\u4e2aHEAD\uff0c\u6700\u7ec8\u8f93\u51fa \\(64 = \\frac{d_{model}}{h} = \\frac{768}{12} \\) \u5217\u3002\u5373\uff1a<\/p>\n\n\n<p>$$<br \/>\n\\begin{aligned}<br \/>\n\\text{where} \\quad \\text{head}_i &#038; = \\text{Attention}(QW_i^Q, KW_i^K, VW_i^V)<br \/>\n\\end{aligned}<br \/>\n$$<\/p>\n\n\n\n<p>\u7136\u540e\u753112\u4e2a Head \u5171\u540c\u7ec4\u6210\uff08concat\uff09\u8981\u8f93\u51fa\u7684 Contextual Embedding\uff0c\u5e76\u5bf9\u6b64\u8f93\u51fa\u505a\u4e86\u4e00\u4e2a\u7ebf\u6027\u53d8\u6362\\(W^O \\)\u3002\u5373\uff1a<\/p>\n\n\n<p>$$<br \/>\n\\begin{aligned}<br \/>\n\\text{MultiHead}(Q, K, V) &#038; = \\text{Concat}(\\text{head}_1, \\dots, \\text{head}_h)W^O<br \/>\n\\end{aligned}<br \/>\n$$<\/p>\n\n\n\n<h4 class=\"wp-block-heading\">7.3 Self Attention vs MHA<\/h4>\n\n\n\n<div class=\"wp-block-columns is-layout-flex wp-container-core-columns-is-layout-28f84493 wp-block-columns-is-layout-flex\">\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\" style=\"flex-basis:47%\">\n<p class=\"has-text-align-center\">Self Attention<\/p>\n\n\n\n<hr class=\"wp-block-separator aligncenter has-alpha-channel-opacity is-style-wide\" style=\"margin-top:4px;margin-bottom:24px\"\/>\n\n\n\n<figure data-wp-context=\"{&quot;imageId&quot;:&quot;6a138d963386d&quot;}\" data-wp-interactive=\"core\/image\" data-wp-key=\"6a138d963386d\" class=\"wp-block-image size-large wp-lightbox-container\"><img loading=\"lazy\" decoding=\"async\" width=\"6563\" height=\"5422\" data-wp-class--hide=\"state.isContentHidden\" data-wp-class--show=\"state.isContentVisible\" data-wp-init=\"callbacks.setButtonStyles\" data-wp-on--click=\"actions.showLightbox\" data-wp-on--load=\"callbacks.setButtonStyles\" 
data-wp-on-window--resize=\"callbacks.setButtonStyles\" src=\"https:\/\/www.orczhou.com\/wp-content\/uploads\/2025\/12\/self-attention-1.svg\" alt=\"\" class=\"wp-image-21463\"\/><button\n\t\t\tclass=\"lightbox-trigger\"\n\t\t\ttype=\"button\"\n\t\t\taria-haspopup=\"dialog\"\n\t\t\taria-label=\"Enlarge\"\n\t\t\tdata-wp-init=\"callbacks.initTriggerButton\"\n\t\t\tdata-wp-on--click=\"actions.showLightbox\"\n\t\t\tdata-wp-style--right=\"state.imageButtonRight\"\n\t\t\tdata-wp-style--top=\"state.imageButtonTop\"\n\t\t>\n\t\t\t<svg xmlns=\"http:\/\/www.w3.org\/2000\/svg\" width=\"12\" height=\"12\" fill=\"none\" viewBox=\"0 0 12 12\">\n\t\t\t\t<path fill=\"#fff\" d=\"M2 0a2 2 0 0 0-2 2v2h1.5V2a.5.5 0 0 1 .5-.5h2V0H2Zm2 10.5H2a.5.5 0 0 1-.5-.5V8H0v2a2 2 0 0 0 2 2h2v-1.5ZM8 12v-1.5h2a.5.5 0 0 0 .5-.5V8H12v2a2 2 0 0 1-2 2H8Zm2-12a2 2 0 0 1 2 2v2h-1.5V2a.5.5 0 0 0-.5-.5H8V0h2Z\" \/>\n\t\t\t<\/svg>\n\t\t<\/button><\/figure>\n<\/div>\n\n\n\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\" style=\"flex-basis:47%\">\n<p class=\"has-text-align-center\">Multi Head Attention<\/p>\n\n\n\n<hr class=\"wp-block-separator has-alpha-channel-opacity has-background is-style-wide\" style=\"margin-top:4px;margin-bottom:24px\"\/>\n\n\n\n<figure data-wp-context=\"{&quot;imageId&quot;:&quot;6a138d963420f&quot;}\" data-wp-interactive=\"core\/image\" data-wp-key=\"6a138d963420f\" class=\"wp-block-image size-large wp-lightbox-container\"><img loading=\"lazy\" decoding=\"async\" width=\"6765\" height=\"5422\" data-wp-class--hide=\"state.isContentHidden\" data-wp-class--show=\"state.isContentVisible\" data-wp-init=\"callbacks.setButtonStyles\" data-wp-on--click=\"actions.showLightbox\" data-wp-on--load=\"callbacks.setButtonStyles\" data-wp-on-window--resize=\"callbacks.setButtonStyles\" src=\"https:\/\/www.orczhou.com\/wp-content\/uploads\/2025\/12\/multi-head-attention-1.svg\" alt=\"\" 
class=\"wp-image-21462\"\/><button\n\t\t\tclass=\"lightbox-trigger\"\n\t\t\ttype=\"button\"\n\t\t\taria-haspopup=\"dialog\"\n\t\t\taria-label=\"Enlarge\"\n\t\t\tdata-wp-init=\"callbacks.initTriggerButton\"\n\t\t\tdata-wp-on--click=\"actions.showLightbox\"\n\t\t\tdata-wp-style--right=\"state.imageButtonRight\"\n\t\t\tdata-wp-style--top=\"state.imageButtonTop\"\n\t\t>\n\t\t\t<svg xmlns=\"http:\/\/www.w3.org\/2000\/svg\" width=\"12\" height=\"12\" fill=\"none\" viewBox=\"0 0 12 12\">\n\t\t\t\t<path fill=\"#fff\" d=\"M2 0a2 2 0 0 0-2 2v2h1.5V2a.5.5 0 0 1 .5-.5h2V0H2Zm2 10.5H2a.5.5 0 0 1-.5-.5V8H0v2a2 2 0 0 0 2 2h2v-1.5ZM8 12v-1.5h2a.5.5 0 0 0 .5-.5V8H12v2a2 2 0 0 1-2 2H8Zm2-12a2 2 0 0 1 2 2v2h-1.5V2a.5.5 0 0 0-.5-.5H8V0h2Z\" \/>\n\t\t\t<\/svg>\n\t\t<\/button><\/figure>\n<\/div>\n<\/div>\n\n\n\n<h4 class=\"wp-block-heading\">7.4 Multi-Head Attention \u5c0f\u7ed3<\/h4>\n\n\n\n<ul class=\"wp-block-list\">\n<li>\u201cMulti-Head Attention\u201d \u4e0e \u7ecf\u5178\u201cAttention\u201d \u6709\u7740\u7c7b\u4f3c\u7684\u6548\u679c\uff0c\u4f46\u662f\u6709\u7740\u66f4\u597d\u7684\u8868\u73b0\u6027\u80fd<\/li>\n\n\n\n<li>\u201cMulti-Head Attention\u201d \u4e0e \u7ecf\u5178\u201cAttention\u201d \u6709\u76f8\u540c\u7684\u8f93\u5165\uff0c\u76f8\u540c\u7684\u8f93\u51fa<\/li>\n<\/ul>\n\n\n\n<h3 class=\"wp-block-heading\">8. Attention \u6570\u5b66\u8ba1\u7b97\u793a\u610f\u56fe<\/h3>\n\n\n\n<p>\u5982\u4e0b\u7684\u56fe\u7247\uff0c\u534a\u53ef\u89c6\u5316\u7684\u5c55\u793a\u4e86\u5728GPT2\u4e2d\uff0c\u67d0\u4e00\u4e2aHEAD\u4e2dAttention\u7684\u8ba1\u7b97\u3002<\/p>\n\n\n\n<figure class=\"wp-block-image size-large\"><img loading=\"lazy\" decoding=\"async\" width=\"7696\" height=\"11825\" src=\"https:\/\/www.orczhou.com\/wp-content\/uploads\/2025\/12\/multi-head-attention-math.svg\" alt=\"\" class=\"wp-image-21469\"\/><\/figure>\n\n\n\n<h3 class=\"wp-block-heading\">9. 
\u5168\u6d41\u7a0b\u6570\u5b66\u8ba1\u7b97<\/h3>\n\n\n\n<p>\u5b8c\u6574\u7684\u8ba1\u7b97\uff0c\u5c31\u662f\u4e00\u4e2a\u201cforward propagation\u201d\u6216\u8005\u53eb\u201cinference\u201d\u7684\u8fc7\u7a0b\uff0c\u8fd9\u91cc\u4f9d\u65e7\u4ee5\u4e0a\u8ff0\u7684\u63d0\u793a\u8bcd\u201cIt\u2019s very hot in summer. Swimming is\u201d\u4e3a\u4f8b\uff0c\u5e76\u89c2\u5bdf\u8be5\u63d0\u793a\u8bcd\u5728 GPT2 \u6a21\u578b\u4e2d\u7684\u7b2c\u4e00\u4e2aLayer\u3001\u7b2c\u4e00\u4e2aHead\u4e2d\u7684\u8ba1\u7b97\u3002\u5b8c\u6574\u7684\u4ee3\u7801\u53ef\u4ee5\u53c2\u8003\uff1a<a href=\"https:\/\/colab.research.google.com\/drive\/1ZVD3eJvFefE1x48u5XO9BNyGkrim4kk7#scrollTo=6dG1kpz3j7Wd\">Attention-Please.ipynb<\/a><\/p>\n\n\n\n<h4 class=\"wp-block-heading\">9.1 Token Embedding \u548c Positional Embedding<\/h4>\n\n\n\n<div class=\"wp-block-columns is-layout-flex wp-container-core-columns-is-layout-28f84493 wp-block-columns-is-layout-flex\">\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\" style=\"flex-basis:30%\">\n<p class=\"has-text-align-center\">Token Embedding <\/p>\n\n\n\n<p class=\"has-text-align-center\">+<\/p>\n\n\n\n<p class=\"has-text-align-center\">Positional Embedding<\/p>\n<\/div>\n\n\n\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\" style=\"font-size:12px;flex-basis:70%\">\n<pre class=\"wp-block-preformatted has-base-color has-contrast-background-color has-text-color has-background has-link-color wp-elements-968781aa492d22c5a6b252f92a547102\" style=\"font-size:8px\">|------------  -----------------------------------     -------------------------------     ------------------------------<br>|  Token    |  | wte[:3] (word token embedding)  |     | wpe[:3](word positional e..)|     |   wte [:3]  + wpe [:3]     |<br>|------------  -----------------------------------     -------------------------------     ------------------------------<br>|It         |  | [0.039,   -0.0869, 0.0662 ,...] 
|     | [-0.0188, -0.1974, 0.004  ] |     | [0.0202, -0.2844,0.0702 ]  |<br>|\u00e2\u0122         |  | [-0.075,  0.0948,  -0.0034,...] |     | [0.024,   -0.0538, -0.0949] |     | [-0.051, 0.041,  -0.0982]  |<br>|\u013b          |  | [-0.0223, 0.0182,  0.2631 ,...] |     | [0.0042,  -0.0848, 0.0545 ] |     | [-0.0181,-0.0666,0.3176 ]  |<br>|s          |  | [-0.064,  -0.0469, 0.2061 ,...] |     | [-0.0003, -0.0738, 0.1055 ] |     | [-0.0643,-0.1207,0.3116 ]  |<br>|\u0120very      |  | [-0.0553, -0.0348, 0.0606 ,...] |  +  | [0.0076,  -0.0251, 0.127  ] |  =  | [-0.0477,-0.0599,0.1876 ]  |<br>|\u0120hot       |  | [0.0399,  -0.0053, 0.0742 ,...] |     | [0.0096,  -0.0339, 0.1312 ] |     | [0.0495, -0.0392,0.2054 ]  |<br>|\u0120in        |  | [-0.0337, 0.0108,  0.0293 ,...] |     | [0.0027,  -0.0205, 0.1196 ] |     | [-0.031, -0.0098,0.149  ]  |<br>|\u0120summer    |  | [0.0422,  0.0138,  -0.0213,...] |     | [0.0025,  -0.0032, 0.1174 ] |     | [0.0448, 0.0106, 0.0961 ]  |<br>|.          |  | [0.0466,  -0.0113, 0.0283 ,...] |     | [-0.0012, -0.0018, 0.111  ] |     | [0.0454, -0.0131,0.1394 ]  |<br>|\u0120Sw        |  | [0.0617,  0.0373,  0.1018 ,...] |     | [0.0049,  0.0021,  0.1178 ] |     | [0.0666, 0.0395, 0.2196 ]  |<br>|imming     |  | [-0.1385, -0.1774, -0.0181,...] |     | [0.0016,  0.0062,  0.1004 ] |     | [-0.1369,-0.1711,0.0823 ]  |<br>|\u0120is        |  | [-0.0097, 0.0101,  0.0556 ,...] 
|     | [-0.0036, 0.0175,  0.1068 ] |     | [-0.0133,0.0275, 0.1623 ]  |<br>|------------  -----------------------------------     -------------------------------     ------------------------------<\/pre>\n<\/div>\n<\/div>\n\n\n\n<h4 class=\"wp-block-heading\">9.2 Normalize<\/h4>\n\n\n\n<div class=\"wp-block-columns is-layout-flex wp-container-core-columns-is-layout-28f84493 wp-block-columns-is-layout-flex\">\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\" style=\"flex-basis:30%\">\n<p class=\"has-text-align-left\">\u5373\uff0c\u5c06\u6bcf\u4e00\u4e2atoken\u7684embedding \u8fdb\u884c\u6b63\u89c4\u5316\uff0c\u5c06\u5176\u5747\u503c\u53d8\u4e3a0\uff0c\u65b9\u5dee\u53d8\u4e3a1<\/p>\n<\/div>\n\n\n\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\" style=\"font-size:12px;flex-basis:70%\">\n<pre class=\"wp-block-preformatted has-base-color has-contrast-background-color has-text-color has-background has-link-color wp-elements-8241e925d889e191ef8b3851a6bd6133\" style=\"font-size:12px\">Token      |   X norm[:3]  (12 x 768)<br>--------------------------------------------<br>It         | [ 0.0129, -0.1104, -0.0317,...]<br>\u00e2\u0122         | [-0.0530,  0.0588, -0.1290,...]<br>\u013b          | [-0.0170, -0.0242,  0.1639,...]<br>s          | [-0.0754, -0.0842,  0.1842,...]<br>\u0120very      | [-0.0566, -0.0280,  0.0953,...]<br>\u0120hot       | [ 0.0587, -0.0086,  0.1073,...]<br>\u0120in        | [-0.0391,  0.0209,  0.0731,...]<br>\u0120summer    | [ 0.0532,  0.0397,  0.0181,...]<br>.          
| [ 0.0553,  0.0152,  0.0579,...]<br>\u0120Sw        | [ 0.0807,  0.0691,  0.1216,...]<br>imming     | [-0.1528, -0.1249, -0.0017,...]<br>\u0120is        | [-0.0175,  0.0605,  0.0880,...]<\/pre>\n<\/div>\n<\/div>\n\n\n\n<h4 class=\"wp-block-heading\">9.3 Attention \u5c42\u7684\u53c2\u6570\u77e9\u9635<\/h4>\n\n\n\n<div class=\"wp-block-columns is-layout-flex wp-container-core-columns-is-layout-28f84493 wp-block-columns-is-layout-flex\">\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\" style=\"flex-basis:25%\">\n<p class=\"has-text-align-center\"><\/p>\n\n\n\n<p class=\"has-text-align-center\">\\(W^Q\\,,W^K\\,,W^V \\)<\/p>\n<\/div>\n\n\n\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\" style=\"font-size:12px;flex-basis:75%\">\n<div class=\"wp-block-columns is-layout-flex wp-container-core-columns-is-layout-28f84493 wp-block-columns-is-layout-flex\" style=\"font-size:8px\">\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\" style=\"font-size:8px;flex-basis:100%\">\n<pre class=\"wp-block-preformatted has-base-color has-contrast-background-color has-text-color has-background has-link-color wp-elements-58ce20c1cc44ecaeb62a2e3f0e71f935\" style=\"font-size:8px\">   W^Q [:3]  shape (768 x 64)               W^K [:3]  shape (768 x 64)                                  W^V [:3]  shape (768 x 64)<br>-------------------------------------    --------------------------------                            --------------------------------<br>[-0.4738, -0.2614, -0.0978, ...]   |     [ 0.3660,  0.0771,  0.2226, ...]                            [ 0.1421,  0.0329, -0.0667, ...]<br>[ 0.0874,  0.1473,  0.2387, ...]   |     [-0.4380, -0.1446, -0.4717, ...]                            [ 0.0162, -0.0633, -0.0636, ...]<br>[ 0.0039,  0.0695,  0.3668, ...]   |     [ 0.1237,  0.0174,  0.1181, ...]                            [ 0.0229, -0.0828,  0.0437, ...]<br>[ 0.2215, -0.1884, -0.0141, ...]  
768    [-0.2247,  0.0148, -0.1859, ...]                            [-0.0106,  0.0070,  0.0565, ...]<br>[-0.0947,  0.1678, -0.0143, ...]  rows   [-0.2001, -0.1052, -0.1743, ...]                            [ 0.0416,  0.0938, -0.1792, ...]<br>   ...                             |        ...                                                          ...<br>[-0.4100, -0.1924, -0.2400, ...]   |     [ 0.1567,  0.2664,  0.1851, ...]                            [-0.0341,  0.0034,  0.0203, ...]<br>-------------------------------------    --------------------------------                            --------------------------------<br>|&lt;------- columns: 64 --------&gt;|         |&lt;------- columns: 64 --------&gt;|                            |&lt;------- columns: 64 --------&gt;|<\/pre>\n<\/div>\n<\/div>\n<\/div>\n<\/div>\n\n\n\n<h4 class=\"wp-block-heading\">9.4 \u77e9\u9635 Q K V\u7684\u8ba1\u7b97<\/h4>\n\n\n\n<div class=\"wp-block-columns is-layout-flex wp-container-core-columns-is-layout-28f84493 wp-block-columns-is-layout-flex\">\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\" style=\"flex-basis:25%\">\n<p class=\"has-text-align-center\">\\(Q = XW^Q \\)<\/p>\n\n\n\n<p class=\"has-text-align-center\">\\(K = XW^K \\)<\/p>\n\n\n\n<p class=\"has-text-align-center\">\\(V = XW^V \\)<\/p>\n<\/div>\n\n\n\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\" style=\"font-size:12px;flex-basis:75%\">\n<div class=\"wp-block-columns is-layout-flex wp-container-core-columns-is-layout-28f84493 wp-block-columns-is-layout-flex\" style=\"font-size:8px\">\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\" style=\"font-size:8px;flex-basis:100%\">\n<pre class=\"wp-block-preformatted has-base-color has-contrast-background-color has-text-color has-background has-link-color wp-elements-81b31a22c049843a1158f36a3d46cd70\" style=\"font-size:8px\">    Q [:3]  shape (12 x 64)                  K [:3]  shape (12 x 64)       
                               V [:3]  shape (12 x 64)<br>-------------------------------------    ---------------------------------                           --------------------------------<br>[ 0.4207, -0.9178,  0.1760, ...]  |      [ -1.4202,  1.6791,  0.9837, ...]                           [ 0.0452,  0.0628,  0.1463, ...]<br>[ 0.7757,  0.2485,  0.7349, ...]  |      [ -2.5320,  2.2932,  1.5592, ...]                           [-0.1361,  0.1379,  0.0150, ...]<br>[ 0.4481,  0.0206, -0.0825, ...]  |      [ -2.2571,  2.7764,  1.8401, ...]                           [ 0.0039, -0.1295, -0.0311, ...]<br>[ 0.9500,  0.1481,  0.3469, ...] 12      [ -2.4322,  3.1454,  2.0600, ...]                           [-0.0391,  0.0581,  0.0511, ...]<br>[ 0.4989, -0.4376,  0.1678, ...] rows    [ -3.5428,  2.1485,  2.0414, ...]                           [ 0.0963,  0.3563, -0.1477, ...]<br>  ...                             |        ...                                                         ...<br>[ 0.4429, -1.1997,  0.5611, ...]  |      [ -2.2559,  2.0384,  2.2542, ...]                           
[ 0.2759, -0.2783,  0.3240, ...]<br>-------------------------------------    ---------------------------------                           --------------------------------<br>|&lt;------- columns: 64 -------&gt;|          |&lt;------- columns: 64 -------&gt;|                             |&lt;------- columns: 64 -------&gt;|<\/pre>\n<\/div>\n<\/div>\n<\/div>\n<\/div>\n\n\n\n<h4 class=\"wp-block-heading\">9.5 \u8ba1\u7b97 Attention Score<\/h4>\n\n\n\n<div class=\"wp-block-columns is-layout-flex wp-container-core-columns-is-layout-28f84493 wp-block-columns-is-layout-flex\">\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\" style=\"flex-basis:25%\">\n<p class=\"has-text-align-center\">\\(\\text{Attention Score} \\)<\/p>\n\n\n\n<p class=\"has-text-align-center\">\\(= \\frac{QK^T}{\\sqrt{d}} \\)<\/p>\n\n\n\n<p><\/p>\n<\/div>\n\n\n\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\" style=\"font-size:12px;flex-basis:75%\">\n<div class=\"wp-block-columns is-layout-flex wp-container-core-columns-is-layout-28f84493 wp-block-columns-is-layout-flex\" style=\"font-size:8px\">\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\" style=\"font-size:8px;flex-basis:100%\">\n<pre class=\"wp-block-preformatted has-base-color has-contrast-background-color has-text-color has-background has-link-color wp-elements-521b75d90ebc4d824c01ec13f2b1d987\" style=\"font-size:8px\">|---------------------------------------------------------------------------------------------|<br>|       |           Attention Score Matrix shape (12 x 12)                                    |<br>| Token |-------------------------------------------------------------------------------------|<br>|       |   It     \u00e2\u0122     \u013b      s     \u0120very  \u0120hot    \u0120in  \u0120summer  .     
\u0120Sw    imming  \u0120is  |<br>|-------|-------------------------------------------------------------------------------------|----<br>|It     | [ 0.14, -1.53, -1.45, -1.71, -1.69, -1.74, -2.36, -2.27, -2.37, -1.33, -0.58, -2.40]|  |<br>|\u00e2\u0122     | [ 0.70, -0.93, -1.72, -1.02, -1.52, -2.24, -1.90, -2.19, -1.63, -2.13, -1.66, -2.14]|  |<br>|\u013b      | [-0.60, -1.81, -1.99, -1.96, -2.57, -1.84, -1.62, -2.04, -0.98, -1.18, -2.23, -2.25]|  |<br>|s      | [-0.46, -1.33, -1.60, -2.65, -2.24, -1.99, -2.89, -1.44, -2.05, -2.77, -2.09, -2.74]|  |<br>|\u0120very  | [ 0.29, -1.42, -1.77, -1.15, -0.94, -1.14, -1.81, -1.04, -1.77, -2.13, -0.60, -0.82]|  |<br>|\u0120hot   | [ 0.03, -0.68, -0.59, -0.95, -1.78, -0.10, -0.95, -0.14, -1.32, -0.57,  0.06, -1.07]|  12<br>|\u0120in    | [-0.71, -1.72, -1.53, -2.18, -1.67, -1.93, -3.41, -1.69, -2.74, -1.89, -1.17, -2.02]|  rows<br>|\u0120summer| [-0.34, -1.49, -1.35, -1.31, -1.12, -0.89, -1.49, -1.11, -1.51, -1.15, -1.45, -1.20]|  |<br>|.      | [-0.89, -1.73, -2.67, -2.80, -2.45, -2.37, -4.39, -2.33, -4.42, -2.73, -1.82, -3.21]|  |<br>|\u0120Sw    | [-0.05, -1.15, -1.76, -1.15, -1.68, -0.74, -1.15, -1.35, -1.36, -1.29, -0.43, -1.51]|  |<br>|imming | [-0.02, -1.65, -0.87, -0.35, -1.18, -0.65, -0.33, -1.25, -0.38, -1.68, -2.15, -1.08]|  |<br>|\u0120is    | [-0.97, -2.03, -2.56, -2.94, -1.96, -2.71, -4.07, -2.46, -3.51, -2.68, -1.88, -2.99]|  |<br>|-------|-------------------------------------------------------------------------------------|----<br>        |&lt;-------------------------------- columns: 12 --------------------------------------&gt;|<\/pre>\n<\/div>\n<\/div>\n<\/div>\n<\/div>\n\n\n\n<h4 class=\"wp-block-heading\">9.6 \u8ba1\u7b97 Masked Attention Score<\/h4>\n\n\n\n<div class=\"wp-block-columns is-layout-flex wp-container-core-columns-is-layout-28f84493 wp-block-columns-is-layout-flex\">\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\" style=\"flex-basis:25%\">\n<p 
class=\"has-text-align-center\">\\(\\text{Masked Attention Score} \\)<\/p>\n\n\n\n<p class=\"has-text-align-center\">\\(= \\frac{QK^T}{\\sqrt{d}} + \\text{mask} \\)<\/p>\n\n\n\n<p><\/p>\n<\/div>\n\n\n\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\" style=\"font-size:12px;flex-basis:75%\">\n<div class=\"wp-block-columns is-layout-flex wp-container-core-columns-is-layout-28f84493 wp-block-columns-is-layout-flex\" style=\"font-size:8px\">\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\" style=\"font-size:8px;flex-basis:100%\">\n<pre class=\"wp-block-preformatted has-base-color has-contrast-background-color has-text-color has-background has-link-color wp-elements-223ccdbe316d4b7780a928e6e09f867c\" style=\"font-size:8px\">|---------------------------------------------------------------------------------------------|<br>|       |   Masked  Attention Score Matrix shape (12 x 12)                                    |<br>| Token |-------------------------------------------------------------------------------------|<br>|       |   It     \u00e2\u0122     \u013b      s     \u0120very  \u0120hot    \u0120in  \u0120summer  .     
\u0120Sw    imming  \u0120is  |<br>|-------|-------------------------------------------------------------------------------------|----<br>|It     | [ 0.14,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf]|  |<br>|\u00e2\u0122     | [ 0.70, -0.93,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf]|  |<br>|\u013b      | [-0.60, -1.81, -1.99,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf]|  |<br>|s      | [-0.46, -1.33, -1.60, -2.65,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf]|  |<br>|\u0120very  | [ 0.29, -1.42, -1.77, -1.15, -0.94,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf]|  |<br>|\u0120hot   | [ 0.03, -0.68, -0.59, -0.95, -1.78, -0.10,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf]|  12<br>|\u0120in    | [-0.71, -1.72, -1.53, -2.18, -1.67, -1.93, -3.41,  -inf,  -inf,  -inf,  -inf,  -inf]|  rows<br>|\u0120summer| [-0.34, -1.49, -1.35, -1.31, -1.12, -0.89, -1.49, -1.11,  -inf,  -inf,  -inf,  -inf]|  |<br>|.      | [-0.89, -1.73, -2.67, -2.80, -2.45, -2.37, -4.39, -2.33, -4.42,  -inf,  -inf,  -inf]|  |<br>|\u0120Sw    | [-0.05, -1.15, -1.76, -1.15, -1.68, -0.74, -1.15, -1.35, -1.36, -1.29,  -inf,  -inf]|  |<br>|imming | [-0.02, -1.65, -0.87, -0.35, -1.18, -0.65, -0.33, -1.25, -0.38, -1.68, -2.15,  -inf]|  |<br>|\u0120is    | [-0.97, -2.03, -2.56, -2.94, -1.96, -2.71, -4.07, -2.46, -3.51, -2.68, -1.88, -2.99]|  |<br>|-------|-------------------------------------------------------------------------------------|----<br>        |&lt;-------------------------------- columns: 12 --------------------------------------&gt;|<\/pre>\n<\/div>\n<\/div>\n<\/div>\n<\/div>\n\n\n\n<h4 class=\"wp-block-heading\">9.7 \u8ba1\u7b97 Softmax Masked Attention Score<\/h4>\n\n\n\n<div class=\"wp-block-columns is-layout-flex wp-container-core-columns-is-layout-28f84493 wp-block-columns-is-layout-flex\">\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\" style=\"flex-basis:25%\">\n<p 
class=\"has-text-align-center\">\\(\\text{Softmax Masked Attention Score} \\)<\/p>\n\n\n\n<p class=\"has-text-align-center\">\\(= \\text{softmax}(\\frac{QK^T}{\\sqrt{d}} + \\text{mask}) \\)<\/p>\n\n\n\n<p><\/p>\n<\/div>\n\n\n\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\" style=\"font-size:12px;flex-basis:75%\">\n<div class=\"wp-block-columns is-layout-flex wp-container-core-columns-is-layout-28f84493 wp-block-columns-is-layout-flex\" style=\"font-size:8px\">\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\" style=\"font-size:8px;flex-basis:100%\">\n<pre class=\"wp-block-preformatted has-base-color has-contrast-background-color has-text-color has-background has-link-color wp-elements-ed92b61fa8917e386c315ae81aa5e462\" style=\"font-size:8px\">|-------------------------------------------------------------------------------------|<br>|       |  Softmax Masked  Attention Score Matrix shape (12 x 12)                     |<br>| Token |-----------------------------------------------------------------------------|<br>|       |  It    \u00e2\u0122    \u013b     s     \u0120very  \u0120hot  \u0120in  \u0120summer  .   
\u0120Sw   imming  \u0120is   |<br>|-------|-----------------------------------------------------------------------------|<br>|It     | [1.00  0.00  0.00  0.00   0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00]|  |<br>|\u00e2\u0122     | [0.84  0.16  0.00  0.00   0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00]|  |                  V [:3]  shape (12 x 64)<br>|\u013b      | [0.65  0.19  0.16  0.00   0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00]|  |              --------------------------------<br>|s      | [0.54  0.23  0.17  0.06   0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00]|  |              [ 0.0452,  0.0628,  0.1463, ...]<br>|\u0120very  | [0.54  0.10  0.07  0.13   0.16  0.00  0.00  0.00  0.00  0.00  0.00  0.00]|  |              [-0.1361,  0.1379,  0.0150, ...]<br>|\u0120hot   | [0.29  0.14  0.16  0.11   0.05  0.25  0.00  0.00  0.00  0.00  0.00  0.00]|  12             [ 0.0039, -0.1295, -0.0311, ...]<br>|\u0120in    | [0.36  0.13  0.16  0.08   0.14  0.11  0.02  0.00  0.00  0.00  0.00  0.00]|  rows           [-0.0391,  0.0581,  0.0511, ...]<br>|\u0120summer| [0.26  0.08  0.09  0.10   0.12  0.15  0.08  0.12  0.00  0.00  0.00  0.00]|  |              [ 0.0963,  0.3563, -0.1477, ...]<br>|.      
| [0.40  0.17  0.07  0.06   0.08  0.09  0.01  0.10  0.01  0.00  0.00  0.00]|  |                ...<br>|\u0120Sw    | [0.27  0.09  0.05  0.09   0.05  0.14  0.09  0.07  0.07  0.08  0.00  0.00]|  |              [ 0.2759, -0.2783,  0.3240, ...]<br>|imming | [0.19  0.04  0.08  0.14   0.06  0.10  0.14  0.06  0.13  0.04  0.02  0.00]|  |              --------------------------------<br>|\u0120is    | [0.30  0.10  0.06  0.04   0.11  0.05  0.01  0.07  0.02  0.05  0.12  0.04]|  |              |&lt;------- columns: 64 -------&gt;|<br>|-------|-----------------------------------------------------------------------------|<br>        |&lt;-------------------------------- columns: 12 ------------------------------&gt;|<\/pre>\n<\/div>\n<\/div>\n<\/div>\n<\/div>\n\n\n\n<h4 class=\"wp-block-heading\">9.8 \u8ba1\u7b97 Contextual Embeddings<\/h4>\n\n\n\n<div class=\"wp-block-columns is-layout-flex wp-container-core-columns-is-layout-28f84493 wp-block-columns-is-layout-flex\">\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\" style=\"flex-basis:25%\">\n<p class=\"has-text-align-center\">\\(\\text{Contextual Embeddings} \\)<\/p>\n\n\n\n<p class=\"has-text-align-center\">\\(= \\text{softmax}(\\frac{QK^T}{\\sqrt{d}} + \\text{mask})V \\)<\/p>\n<\/div>\n\n\n\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\" style=\"font-size:12px;flex-basis:75%\">\n<div class=\"wp-block-columns is-layout-flex wp-container-core-columns-is-layout-28f84493 wp-block-columns-is-layout-flex\" style=\"font-size:8px\">\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\" style=\"font-size:8px;flex-basis:100%\">\n<pre class=\"wp-block-preformatted has-base-color has-contrast-background-color has-text-color has-background has-link-color wp-elements-83e2d415e4975989ba2445da1079479e\" style=\"font-size:8px\">Token      | Contextual Embedding (12 x 64)<br>--------------------------------------------<br>It         | [ 0.0452,  0.0628,  
0.1463,...]<br>\u00e2\u0122         | [ 0.0153,  0.0752,  0.1247,...]<br>\u013b          | [ 0.0034,  0.0464,  0.0923,...]<br>s          | [-0.0082,  0.0464,  0.0801,...]<br>\u0120very      | [ 0.0218,  0.1029,  0.0621,...]<br>\u0120hot       | [ 0.0327,  0.0892,  0.0409,...]<br>\u0120in        | [ 0.0249,  0.0964,  0.0329,...]<br>\u0120summer    | [ 0.0583,  0.1195,  0.0068,...]<br>.          | [ 0.0334,  0.1100,  0.0366,...]<br>\u0120Sw        | [ 0.0086,  0.0846,  0.0074,...]<br>imming     | [-0.0049,  0.0841, -0.0339,...]<br>\u0120is        | [ 0.0410,  0.0706,  0.0077,...]<\/pre>\n<\/div>\n<\/div>\n<\/div>\n<\/div>\n\n\n\n<p><\/p>\n\n\n\n<h3 class=\"wp-block-heading\">10. \u5176\u4ed6<\/h3>\n\n\n\n<p>\u4e0a\u8ff0\u6d41\u7a0b\u8be6\u8ff0\u4e86 LLM \u6a21\u578b\u4e2d Attention \u8ba1\u7b97\u7684\u6838\u5fc3\u90e8\u5206\u3002\u4e5f\u6709\u4e00\u4e9b\u7ec6\u8282\u662f\u7701\u7565\u4e86\u7684\uff0c\u4f8b\u5982\uff0c<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li>\u5728GPT2\u4e2d\uff0c\u201c\u7ebf\u6027\u53d8\u6362\u201d\u662f\u6709\u4e00\u4e2a\u201c\u622a\u8ddd\u201d(Bias)\u7684\uff0c\u6240\u4ee5\u4e5f\u53ef\u4ee5\u79f0\u4e3a\u4e00\u4e2a\u201c\u4eff\u5c04\u53d8\u6362\u201d\uff0c\u5373\u5728\u4e00\u4e2a\u7ebf\u6027\u53d8\u6362\u57fa\u7840\u4e0a\uff0c\u518d\u8fdb\u884c\u4e00\u6b21\u5e73\u79fb\uff1b<\/li>\n\n\n\n<li>\u5728GPT2\u4e2d\uff0cAttention\u8ba1\u7b97\u90fd\u662f\u591a\u5c42\u3001\u591a\u5934\u7684\uff0c\u672c\u6587\u4e3b\u8981\u4ee5Layer 0 \/ Head 0 \u4e3a\u4f8b\u8fdb\u884c\u4ecb\u7ecd\uff1b<\/li>\n\n\n\n<li>\u5728\u751f\u6210\u6700\u7ec8\u7684\u201cContextual 
Embeddings\u201d\u4e4b\u524d\uff0c\u901a\u5e38\u8fd8\u9700\u8981\u4e00\u4e2aMLP\u5c42\uff08\u5168\u8fde\u63a5\u7684\u524d\u9988\u795e\u7ecf\u7f51\u7edc\uff09\u7b49\uff0c\u672c\u6587\u4e3a\u4e86\u8fde\u8d2f\u6027\uff0c\u5ffd\u7565\u4e86\u8be5\u90e8\u5206\u3002<\/li>\n<\/ul>\n\n\n\n<p>\u603b\u7ed3\u4e00\u4e0b\uff0c\u672c\u6587\u9700\u8981\u7684\u524d\u7f6e\u77e5\u8bc6\u5305\u62ec\uff1a\u77e9\u9635\u57fa\u672c\u8fd0\u7b97\u3001\u77e9\u9635\u4e0e\u7ebf\u6027\u53d8\u6362\u3001SVD \u5206\u89e3\/\u7279\u5f81\u503c\u7279\u5f81\u5411\u91cf\u3001\u795e\u7ecf\u7f51\u7edc\u57fa\u7840\u3001\u6df1\u5ea6\u5b66\u4e60\u57fa\u7840\u7b49\u3002<\/p>\n\n\n\n<h3 class=\"wp-block-heading\">\u6ce8<\/h3>\n\n\n\n<ul class=\"wp-block-list is-style-no-disc\">\n<li>[1] \u591a\u5934\u6ce8\u610f\u60c5\u51b5\u53ef\u80fd\u662f \\(d \\times \\frac{d}{h} \\) <\/li>\n\n\n\n<li>[2] \u5bf9\u4e8e\u6ee1\u79e9\u65b9\u9635\u4e5f\u53ef\u4ee5\u4f7f\u7528\u201c\u7279\u5f81\u503c\/\u7279\u5f81\u5411\u91cf\u201d\u7684\u65b9\u5f0f\u53bb\u7406\u89e3<\/li>\n\n\n\n<li>[3] \u5728 GPT 2 \u4e2d\uff0c\u4e00\u4e2a Attention \u5c42\u7684\u8ba1\u7b97\uff0c\u4f1a\u5206\u4e3a\u201c\u591a\u5934\u201d\u53bb\u8ba1\u7b97\uff1b\u5e76\u5728\u8ba1\u7b97\u540e\uff0c\u8fd8\u4f1a\u518d\u7ecf\u8fc7\u4e00\u4e2a MLP \u5c42<\/li>\n<\/ul>\n","protected":false},"excerpt":{"rendered":"<p>\u5728\u6574\u4e2a\u5927\u8bed\u8a00\u6a21\u578b\u5b66\u4e60\u4e4b\u8def\u4e2d\uff0c\u5bf9 Attention \u673a\u5236\u7684\u7406\u89e3\u5927\u6982\u662f\u6700\u4e3a\u8ba9\u6211\u56f0\u60d1\u7684\u90e8\u5206\uff0c\u6700\u7ec8\u7ecf\u8fc7\u5c42\u5c42\u89e3\u6784\u3001\u52a0\u4e0a\u91cd\u65b0\u628a\u201c\u7ebf\u6027\u4ee3\u6570\u201d\u6e29\u4e60\u4e86\u4e00\u904d\u4e4b\u540e\uff0c\u6700\u7ec8\uff0c\u603b\u7b97\u67d0\u79cd\u7a0b\u5ea6\u7684\u7406\u89e3\u4e86 Attention \u673a\u5236\u7684\u8bbe\u8ba1\u3002\u76f8\u4fe1\u5bf9\u4e8e\u6240\u6709NLP\u4e13\u4e1a\u7684\u4eba\uff0c\u8fd9\u90e8\u5206\u90fd\u662f\u4e0d\u592a\u5bb9\u6613\u7406\u89e3\u7684\u3002 1. 
\u6982\u8ff0 \u8981\u60f3\u8bb2\u6e05\u695a\uff0c\u5927\u6982\u4e5f\u662f\u975e\u5e38\u4e0d\u5bb9\u6613\u7684\uff0c\u8fd9\u91cc\u5c31\u505a\u4e00\u4e2a\u5c1d\u8bd5\u5427\u3002\u8fd9\u91cc\u7684\u91cd\u70b9\u662f\u8bb2\u6e05\u695a Attention Score \uff08\u7b80\u79f0Attention\uff09\u7684\u8ba1\u7b97\u3002\u4ecb\u7ecd\u7684\u987a\u5e8f\u662f\u201c\u4e24\u4e2a\u8bcd\u8bed\u7684\u76f8\u4f3c\u5ea6\u201d\u3001\u201cSimilarity Score Matrix\u201d\u3001\u201cAttention Score Matrix\u201d\u3002 1.1 \u8981\u6784\u5efa\u7684\u662f\u76f4\u89c9\uff0c\u800c\u4e0d\u662f\u201c\u63a8\u7406\u201d \u4e3a\u4ec0\u4e48 Attention \u7406\u89e3\u8d77\u6765\u5f88\u96be\u5462\uff1f\u6211\u60f3\u5176\u4e2d\u6709\u4e00\u4e2a\u539f\u56e0\u5927\u6982\u662f\u8fd9\u4e2a\u201c\u673a\u5236\u201d\u672c\u8eab\u5e76\u4e0d\u662f\u67d0\u79cd\u201c\u516c\u5f0f\u63a8\u5bfc\u201d\u51fa\u6765\u7684\uff0c\u800c\u662f\u901a\u8fc7\u4e00\u7bc7\u7bc7\u8bba\u6587\u4e0e\u5b9e\u8df5\uff0c\u88ab\u8bc1\u660e\u975e\u5e38\u6709\u6548\u7684\u4e00\u4e2a\u673a\u5236\uff0c\u6240\u4ee5\uff0c\u8fd9\u4e2a\u673a\u5236\u672c\u8eab\u7684\u6240\u5177\u5907\u7684\u201c\u53ef\u89e3\u91ca\u6027\u201d\u5176\u5b9e\u4e5f\u662f\u6709\u9650\u7684\u3002\u8fd9\u5927\u6982\u4e5f\u662f\uff0c\u65e0\u8bba\u4f60\u5728\u4e92\u8054\u7f51\u4e0a\u5982\u4f55\u641c\u7d22\uff0c\u4e5f\u6ca1\u6709\u8c01\u53ef\u4ee5\u6bd4\u8f83\u7b80\u5355\u7684\u628a\u8fd9\u4e2a\u673a\u5236\u8bf4\u6e05\u695a\u7684\u539f\u56e0\u3002\u4f46\uff0c\u7406\u89e3\u8fd9\u4e2a\u673a\u5236\u6784\u5efa\u7684\u76f4\u89c9\uff0c\u5bf9\u4e8e\u7406\u89e3\u6574\u4e2a Transformer \uff0c\u4ee5\u53ca\u6574\u4e2a\u5f53\u4ee3\u5927\u8bed\u8a00\u6a21\u578b\u6280\u672f\u57fa\u7840\u90fd\u662f\u81f3\u5173\u91cd\u8981\u7684\u3002 2. 
\u9884\u5904\u7406 \u5728\u201c\u5927\u8bed\u8a00\u6a21\u578b\u7684\u8f93\u5165\uff1atokenize\u201d\u4e2d\u8be6\u7ec6\u4ecb\u7ecd\u4e86\u201c\u63d0\u793a\u8bcd\u201d\u8fdb\u5165\u5927\u6a21\u578b\u5904\u7406\u4e4b\u524d\uff0c\u5982\u4f55\u5c06\u63d0\u793a\u8bcd\u8f6c\u6362\u6210\u5927\u6a21\u578b\u53ef\u4ee5\u5904\u7406\u7684\u201c\u8bcd\u5411\u91cf\u201d\u6216\u8005\u8bf4\u201ctoken embedding\u201d\u3002 \u5927\u8bed\u8a00\u6a21\u578b\u5728\u5f00\u59cb\u201cAttention\u201d\u8ba1\u7b97\u4e4b\u524d\uff0c\u8fd8\u4f1a\u5bf9\u201ctoken embedding\u201d\u8fdb\u884c\u4e00\u4e9b\u9884\u5904\u7406\uff0c\u8fd9\u4e9b\u9884\u5904\u7406\u5305\u62ec\u4e86\u201c\u878d\u5165\u201d\u4f4d\u7f6e\u5411\u91cf\u3001\u5bf9\u5411\u91cf\u8fdb\u884c\u201c\u5f52\u4e00\u5316\u201d\u5904\u7406\uff08\u5c06\u5404\u4e2a\u5411\u91cf\u90fd\u8f6c\u5316\u4e3a\u5747\u503c\u4e3a0\u3001\u65b9\u5dee\u4e3a1\u7684\u5411\u91cf\uff0c\u957f\u5ea6\u8981\u7edf\u4e00\u53d8\u62101\u5417\uff1f\uff09\u3002 \u4f8b\u5982\uff0c\u5728\u8fd9\u91cc\u7684\u4f8b\u5b50\u4e2d\uff0c\u63d0\u793a\u8bcd \u201cIt\u2019s very hot in summer. 
Swimming is\u201d\uff0c\u5148\u8f6c\u6362\u4e3aembedding\uff0c\u7136\u540e\u52a0\u4e0a\u4f4d\u7f6e\u7f16\u7801\uff08positional encoder)\u3001\u518d\u8fdb\u884c\u6b63\u89c4\u5316\uff0c\u6700\u540e\u53d8\u6362\u4e3a\u5982\u4e0b\u7684\u5411\u91cf \u201c X \u201d \uff1a |&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;-| Token | wte[:3] (word token embedding) | wpe[:3](word positional e..)| wte [:3] + [&hellip;]<\/p>\n","protected":false},"author":1,"featured_media":22037,"comment_status":"open","ping_status":"closed","sticky":false,"template":"wp-custom-template-a-1440-px-width-template","format":"standard","meta":{"_eb_attr":"","inline_featured_image":false,"_tocer_settings":[],"footnotes":""},"categories":[139,137],"tags":[],"class_list":["post-20839","post","type-post","status-publish","format-standard","has-post-thumbnail","hentry","category-ai-llm","category-learning-more"],"_links":{"self":[{"href":"https:\/\/www.orczhou.com\/index.php\/wp-json\/wp\/v2\/posts\/20839","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/www.orczhou.com\/index.php\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/www.orczhou.com\/index.php\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/www.orczhou.com\/index.php\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/www.orczhou.com\/index.php\/wp-json\/wp\/v2\/comments?post=20839"}],"version-history":[{"count":229,"href":"https:\/\/www.orczhou.com\/index.php\/wp-json\/wp\/v2\/posts\/20839\/revisions"}],"predecessor-version":[{"id":22048,"href":"https:\/\/www.orczhou.com\/index.php\/wp-json\/wp\/v2\/posts\/20839\/revisions\/22048"}],"wp:featuredmedia":[{"embeddable":true,"href":"https:\/\/w
ww.orczhou.com\/index.php\/wp-json\/wp\/v2\/media\/22037"}],"wp:attachment":[{"href":"https:\/\/www.orczhou.com\/index.php\/wp-json\/wp\/v2\/media?parent=20839"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/www.orczhou.com\/index.php\/wp-json\/wp\/v2\/categories?post=20839"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/www.orczhou.com\/index.php\/wp-json\/wp\/v2\/tags?post=20839"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}