{"id":21291,"date":"2026-01-17T18:47:00","date_gmt":"2026-01-17T10:47:00","guid":{"rendered":"https:\/\/www.orczhou.com\/?p=21291"},"modified":"2026-01-17T20:20:12","modified_gmt":"2026-01-17T12:20:12","slug":"understanding-llm-inference-with-kvcache","status":"publish","type":"post","link":"https:\/\/www.orczhou.com\/index.php\/2026\/01\/understanding-llm-inference-with-kvcache\/","title":{"rendered":"\u7406\u89e3\u5927\u8bed\u8a00\u6a21\u578b\u63a8\u7406\u7684 KVCache"},"content":{"rendered":"\n\n\n\n<p style=\"margin-top:4px\">\u5927\u8bed\u8a00\u6a21\u578b\u7684\u4e00\u4e2a\u91cd\u8981\u65b9\u5411\u662f\u201c\u63a8\u7406\u201d\u4f18\u5316\uff0c\u5373\u5982\u4f55\u5728\u6709\u9650\u7684\u786c\u4ef6\u73af\u5883\u4e2d\u63d0\u5347\u63a8\u7406\u7684\u6548\u7387\u3002\u5bf9\u4e8e\u6240\u6709\u7684 MaaS \u670d\u52a1\u63d0\u4f9b\u65b9\uff0c\u8fd9\u90fd\u662f\u81f3\u5173\u91cd\u8981\u7684\u3002\u4e00\u65b9\u9762\u5173\u4e4e\u7528\u6237\u7684\u4f7f\u7528\u4f53\u9a8c\uff08\u8bf8\u5982TTFT\uff0ctime to first token\uff09\u3001\u53e6\u4e00\u65b9\u9762\u5173\u4e8e\u670d\u52a1\u63d0\u4f9b\u7684\u6210\u672c\uff08\u6709\u9650\u7684GPU\u5982\u4f55\u63d0\u4f9b\u66f4\u9ad8\u7684\u541e\u5410\u91cf\uff09\u3002<\/p>\n\n\n\n<h3 class=\"wp-block-heading\">1. \u6982\u8ff0<\/h3>\n\n\n\n<p>\u4ece Transformer \u67b6\u6784\u7684 Decoder \u9636\u6bb5\u539f\u7406\u6765\u770b\uff0c\u4e00\u4e2a\u5e38\u89c1\u7684\u3001\u81ea\u7136\u7684\u4f18\u5316\u5c31\u662f\u4f7f\u7528\u201cKV Cache\u201d\u5927\u5927\u51cf\u5c11\u63a8\u7406\uff08\u81ea\u56de\u5f52\u9636\u6bb5\uff09\u8fc7\u7a0b\u9700\u8981\u8ba1\u7b97\u91cf\uff0c\u5b9e\u73b0\u4ee5\u663e\u5b58\u6362\u6548\u7387\uff0c\u4ece\u800c\u52a0\u901f\u63a8\u7406\u8fc7\u7a0b\u3002<\/p>\n\n\n\n<h3 class=\"wp-block-heading\">2. 
Decoder \u6a21\u578b\u7684\u81ea\u56de\u5f52\u8ba1\u7b97<\/h3>\n\n\n\n<p>\u5728\u4e86\u89e3\u4e86\u201cAttention\u201d\u3001\u201cmask attention\u201d\u3001\u201cautoregression\u201d\u8ba1\u7b97\u4e4b\u540e\uff0c\u6bd4\u8f83\u81ea\u7136\u53ef\u4ee5\u6ce8\u610f\u5230 Q\u3001K\u3001V \u77e9\u9635\u5728\u201cautoregression\u201d\u7684\u8fc7\u7a0b\u4e2d\uff0c\u6709\u5f88\u591a\u7684\u90e8\u5206\u662f\u65e0\u9700\u989d\u5916\u8ba1\u7b97\u7684\u3002<\/p>\n\n\n\n<p>\u8fd9\u91cc\u4f9d\u65e7\u7ee7\u7eed\u4f7f\u7528\u300a<a href=\"https:\/\/www.orczhou.com\/index.php\/2025\/12\/understanding-llm-attention\/\">\u7406\u89e3\u5927\u8bed\u8a00\u6a21\u578b\u7684\u6838\u5fc3\uff1aAttention<\/a>\u300b\u4e2d\u7684\u793a\u4f8b\uff0c\u8fd9\u91cc\u8003\u8651\u5728\u6587\u7ae0\u4e2d\u7684\u63d0\u793a\u8bcd\u201cIt\u2019s very hot in summer. Swimming is\u201d\uff0c\u751f\u6210\u65b0\u7684Token\u4e3a \u201c a\u201d\uff0c\u90a3\u4e48\u6211\u4eec\u770b\u770b\u8fd9\u4e2a\u81ea\u56de\u5f52\u8fc7\u7a0b\u67d0\u4e2aHead\u4e2d\u7684\u8ba1\u7b97\u3002\u5b8c\u6574\u7684\u4ee3\u7801\u53ef\u4ee5\u53c2\u8003\uff1a<a href=\"https:\/\/colab.research.google.com\/drive\/1VGi8kA-KIcMefyTElXTttbr4K1fWGlMN#scrollTo=xojpXSc_W4sx\">autoregression-of-attention.ipynb<\/a>\u3002<\/p>\n\n\n\n<p>\u76f8\u6bd4\u4e8e prefill \u9636\u6bb5\uff0c\u9700\u8981\u989d\u5916\u8ba1\u7b97\u7684\u90e8\u5206\uff0c\u5728\u540e\u7eed\u4f7f\u7528\u9ec4\u8272\u6807\u8bc6\u51fa\u6765\u3002<\/p>\n\n\n\n<h4 class=\"wp-block-heading\">2. 
1 Token Embedding \u548c Positional Embedding<\/h4>\n\n\n\n<div class=\"wp-block-columns is-layout-flex wp-container-core-columns-is-layout-28f84493 wp-block-columns-is-layout-flex\">\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\" style=\"flex-basis:25%\">\n<p class=\"has-text-align-center\">Token Embedding <\/p>\n\n\n\n<p class=\"has-text-align-center\">+<\/p>\n\n\n\n<p class=\"has-text-align-center\">Positional Embedding<\/p>\n<\/div>\n\n\n\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\" style=\"font-size:12px;flex-basis:75%\">\n<pre class=\"wp-block-preformatted has-base-color has-contrast-background-color has-text-color has-background has-link-color wp-elements-d7ef37e186c8df82e6d9e4aabaab6b6e\" style=\"font-size:8px\">----------------------------------------------------------------------------------------------------------------------------------<br>| Token   | Token ID | Token Embeddings(first 3 of 768 ) |  Positional Embeddings            |  Token Embedding +  Positional    |<br>----------------------------------------------------------------------------------------------------------------------------------<br>| It      | 1026     | [  0.0390, -0.0869,  0.0662, ...] | [ -0.0188, -0.1974,  0.0040, ...] | [  0.0202, -0.2844,  0.0702, ...] |<br>| \u00e2\u0122      | 447      | [ -0.0750,  0.0948, -0.0034, ...] | [  0.0240, -0.0538, -0.0949, ...] | [ -0.0510,  0.0410, -0.0982, ...] |<br>| \u013b       | 247      | [ -0.0223,  0.0182,  0.2631, ...] | [  0.0042, -0.0848,  0.0545, ...] | [ -0.0181, -0.0666,  0.3176, ...] |<br>| s       | 82       | [ -0.0640, -0.0469,  0.2061, ...] | [ -0.0003, -0.0738,  0.1055, ...] | [ -0.0643, -0.1207,  0.3116, ...] |<br>| \u0120very   | 845      | [ -0.0553, -0.0348,  0.0606, ...] | [  0.0076, -0.0251,  0.1270, ...] | [ -0.0477, -0.0599,  0.1876, ...] |<br>| \u0120hot    | 3024     | [  0.0399, -0.0053,  0.0742, ...] | [  0.0096, -0.0339,  0.1312, ...] 
| [  0.0495, -0.0392,  0.2054, ...] |<br>| \u0120in     | 287      | [ -0.0337,  0.0108,  0.0293, ...] | [  0.0027, -0.0205,  0.1196, ...] | [ -0.0310, -0.0098,  0.1490, ...] |<br>| \u0120summer | 3931     | [  0.0422,  0.0138, -0.0213, ...] | [  0.0025, -0.0032,  0.1174, ...] | [  0.0448,  0.0106,  0.0961, ...] |<br>| .       | 13       | [  0.0466, -0.0113,  0.0283, ...] | [ -0.0012, -0.0018,  0.1110, ...] | [  0.0454, -0.0131,  0.1394, ...] |<br>| \u0120Sw     | 2451     | [  0.0617,  0.0373,  0.1018, ...] | [  0.0049,  0.0021,  0.1178, ...] | [  0.0666,  0.0395,  0.2196, ...] |<br>| imming  | 27428    | [ -0.1385, -0.1774, -0.0181, ...] | [  0.0016,  0.0062,  0.1004, ...] | [ -0.1369, -0.1711,  0.0823, ...] |<br>| \u0120is     | 318      | [ -0.0097,  0.0101,  0.0556, ...] | [ -0.0036,  0.0175,  0.1068, ...] | [ -0.0133,  0.0275,  0.1623, ...] |<br><mark style=\"background-color:rgba(0, 0, 0, 0);color:#ffff00\" class=\"has-inline-color\">| \u0120a      | 257      | [ -0.0506,  0.0056,  0.0471, ...] | [  0.0001,  0.0172,  0.0969, ...] | [ -0.0506,  0.0228,  0.1440, ...] |<\/mark><br>----------------------------------------------------------------------------------------------------------------------------------<\/pre>\n<\/div>\n<\/div>\n\n\n\n<p><mark style=\"background-color:rgba(0, 0, 0, 0)\" class=\"has-inline-color has-primary-color\">\u8fd9\u91cc\uff0c\u53ea\u9700\u8981\u8ba1\u7b97\u6700\u65b0\u7684Token\uff08\u5373\u8fd9\u91cc\u7684\u201c a\u201d\uff09\u7684Embedding\u5373\u53ef\u3002\u4e8b\u5b9e\u4e0a\uff0c\u4e0a\u9762\u77e9\u9635\u767d\u8272\u90e8\u5206\u5728\u81ea\u56de\u5f52\u9636\u6bb5\u5b8c\u5168\u4e0d\u518d\u9700\u8981\u4f7f\u7528\u4e86\u3002\u6240\u4ee5\uff0c\u4e0a\u8ff0\u5185\u5bb9\u8ba1\u7b97\u5b8c\u6210\u540e\uff0c\u5185\u5b58\u5373\u53ef\u91ca\u653e\uff0c\u65e0\u9700\u7f13\u5b58\u3002<\/mark><\/p>\n\n\n\n<h4 class=\"wp-block-heading\">2. 
2 Normalize<\/h4>\n\n\n\n<div class=\"wp-block-columns is-layout-flex wp-container-core-columns-is-layout-28f84493 wp-block-columns-is-layout-flex\">\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\" style=\"flex-basis:30%\">\n<p class=\"has-text-align-left\">\u5373\uff0c\u5c06\u6bcf\u4e00\u4e2atoken\u7684embedding \u8fdb\u884c\u6b63\u89c4\u5316\uff0c\u5c06\u5176\u5747\u503c\u53d8\u4e3a0\uff0c\u65b9\u5dee\u53d8\u4e3a1<\/p>\n<\/div>\n\n\n\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\" style=\"font-size:12px;flex-basis:70%\">\n<pre class=\"wp-block-preformatted has-base-color has-contrast-background-color has-text-color has-background has-link-color wp-elements-3e48b27ce8433521aaab86d68db7d3f5\" style=\"font-size:12px\">------------------------------------------------------<br>| Token   | Token ID | Normalized(first 3 of 768  )  |<br>------------------------------------------------------<br>| It      | 1026     | [ 0.0129 , -0.1104 , -0.0317] |<br>| \u00e2\u0122      | 447      | [-0.0530 ,  0.0588 , -0.1290] |<br>| \u013b       | 247      | [-0.0170 , -0.0242 ,  0.1639] |<br>| s       | 82       | [-0.0754 , -0.0842 ,  0.1842] |<br>| \u0120very   | 845      | [-0.0566 , -0.0280 ,  0.0953] |<br>| \u0120hot    | 3024     | [ 0.0587 , -0.0086 ,  0.1073] |<br>| \u0120in     | 287      | [-0.0391 ,  0.0209 ,  0.0731] |<br>| \u0120summer | 3931     | [ 0.0532 ,  0.0397 ,  0.0181] |<br>| .       
| 13       | [ 0.0553 ,  0.0152 ,  0.0579] |<br>| \u0120Sw     | 2451     | [ 0.0807 ,  0.0691 ,  0.1216] |<br>| imming  | 27428    | [-0.1528 , -0.1249 , -0.0017] |<br>| \u0120is     | 318      | [-0.0175 ,  0.0605 ,  0.0880] |<br><mark style=\"background-color:rgba(0, 0, 0, 0);color:#ffff00\" class=\"has-inline-color\">| \u0120a      | 257      | [-0.0688 ,  0.0540 ,  0.0697] |<\/mark><br>------------------------------------------------------<\/pre>\n<\/div>\n<\/div>\n\n\n\n<p><mark style=\"background-color:rgba(0, 0, 0, 0)\" class=\"has-inline-color has-primary-color\">\u4e0e\u524d\u9762\u7c7b\u4f3c\uff0c\u8fd9\u91cc\u8ba1\u7b97\u5b8c\u6210\u5e76\u63a8\u8fdb\u5230\u4e0b\u4e00\u6b65\u540e\uff0c\u5185\u5b58\u5373\u53ef\u91ca\u653e\uff0c\u65e0\u9700\u7f13\u5b58\u3002<\/mark><\/p>\n\n\n\n<h4 class=\"wp-block-heading\">2. 3 Attention \u5c42\u7684\u53c2\u6570\u77e9\u9635<\/h4>\n\n\n\n<div class=\"wp-block-columns is-layout-flex wp-container-core-columns-is-layout-28f84493 wp-block-columns-is-layout-flex\">\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\" style=\"flex-basis:25%\">\n<p class=\"has-text-align-center\"><\/p>\n\n\n\n<p class=\"has-text-align-center\">\\(W^Q\\,,W^K\\,,W^V \\)<\/p>\n<\/div>\n\n\n\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\" style=\"font-size:12px;flex-basis:75%\">\n<div class=\"wp-block-columns is-layout-flex wp-container-core-columns-is-layout-28f84493 wp-block-columns-is-layout-flex\" style=\"font-size:8px\">\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\" style=\"font-size:8px;flex-basis:100%\">\n<pre class=\"wp-block-preformatted has-base-color has-contrast-background-color has-text-color has-background has-link-color wp-elements-58ce20c1cc44ecaeb62a2e3f0e71f935\" style=\"font-size:8px\">   W^Q [:3]  shape (768 x 64)               W^K [:3]  shape (768 x 64)                                  W^V [:3]  shape (768 x 
64)<br>-------------------------------------    --------------------------------                            --------------------------------<br>[-0.4738, -0.2614, -0.0978, ...]   |     [ 0.3660,  0.0771,  0.2226, ...]                            [ 0.1421,  0.0329, -0.0667, ...]<br>[ 0.0874,  0.1473,  0.2387, ...]   |     [-0.4380, -0.1446, -0.4717, ...]                            [ 0.0162, -0.0633, -0.0636, ...]<br>[ 0.0039,  0.0695,  0.3668, ...]   |     [ 0.1237,  0.0174,  0.1181, ...]                            [ 0.0229, -0.0828,  0.0437, ...]<br>[ 0.2215, -0.1884, -0.0141, ...]  768    [-0.2247,  0.0148, -0.1859, ...]                            [-0.0106,  0.0070,  0.0565, ...]<br>[-0.0947,  0.1678, -0.0143, ...]  rows   [-0.2001, -0.1052, -0.1743, ...]                            [ 0.0416,  0.0938, -0.1792, ...]<br>   ...                             |        ...                                                          ...<br>[-0.4100, -0.1924, -0.2400, ...]   |     [ 0.1567,  0.2664,  0.1851, ...]                            [-0.0341,  0.0034,  0.0203, ...]<br>-------------------------------------    --------------------------------                            --------------------------------<br>|&lt;------- columns: 64 --------&gt;|         |&lt;------- columns: 64 --------&gt;|                            |&lt;------- columns: 64 --------&gt;|<\/pre>\n<\/div>\n<\/div>\n<\/div>\n<\/div>\n\n\n\n<p class=\"has-primary-color has-text-color has-link-color wp-elements-19096e0cda15946a80168761464d44d2\">\u8fd9\u662f\u4e09\u4e2a\u6743\u91cd\u77e9\u9635\uff0c\u603b\u662f\u9700\u8981\u5e38\u9a7b\u5185\u5b58\u7684\uff0c\u5e76\u4e14\u53ef\u4ee5\u88ab\u591a\u4e2a\u201c\u63a8\u7406\u201d\u5171\u4eab\u4f7f\u7528\u3002<\/p>\n\n\n\n<h4 class=\"wp-block-heading\">2. 
4 \u77e9\u9635 Q K V\u7684\u8ba1\u7b97<\/h4>\n\n\n\n<div class=\"wp-block-columns is-layout-flex wp-container-core-columns-is-layout-28f84493 wp-block-columns-is-layout-flex\">\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\" style=\"flex-basis:25%\">\n<p class=\"has-text-align-center\">\\(Q = XW^Q \\)<\/p>\n\n\n\n<p class=\"has-text-align-center\">\\(K = XW^K \\)<\/p>\n\n\n\n<p class=\"has-text-align-center\">\\(V = XW^V \\)<\/p>\n<\/div>\n\n\n\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\" style=\"font-size:12px;flex-basis:75%\">\n<div class=\"wp-block-columns is-layout-flex wp-container-core-columns-is-layout-28f84493 wp-block-columns-is-layout-flex\" style=\"font-size:8px\">\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\" style=\"font-size:8px;flex-basis:100%\">\n<pre class=\"wp-block-preformatted has-base-color has-contrast-background-color has-text-color has-background has-link-color wp-elements-8b9e8af32ca1fbae3c1cdf25510c8857\" style=\"font-size:8px\">    Q [:3]  shape (12 x 64)                  K [:3]  shape (12 x 64)                                      V [:3]  shape (12 x 64)<br>-------------------------------------    ---------------------------------                           --------------------------------<br>[ 0.4207, -0.9178,  0.1760, ...]  |      [ -1.4202,  1.6791,  0.9837, ...]                           [ 0.0452,  0.0628,  0.1463, ...]<br>[ 0.7757,  0.2485,  0.7349, ...]  |      [ -2.5320,  2.2932,  1.5592, ...]                           [-0.1361,  0.1379,  0.0150, ...]<br>[ 0.4481,  0.0206, -0.0825, ...]  |      [ -2.2571,  2.7764,  1.8401, ...]                           [ 0.0039, -0.1295, -0.0311, ...]<br>[ 0.9500,  0.1481,  0.3469, ...] 12      [ -2.4322,  3.1454,  2.0600, ...]                           [-0.0391,  0.0581,  0.0511, ...]<br>[ 0.4989, -0.4376,  0.1678, ...] rows    [ -3.5428,  2.1485,  2.0414, ...]                           
[ 0.0963,  0.3563, -0.1477, ...]<br>  ...                             |        ...                                                         ...<br>[ 0.4429, -1.1997,  0.5611, ...]  |      [ -2.2559,  2.0384,  2.2542, ...]                           [ 0.2759, -0.2783,  0.3240, ...]<br><mark style=\"background-color:rgba(0, 0, 0, 0);color:#ffff00\" class=\"has-inline-color\">[ 0.4989, -0.4376,  0.1678, ...]  |      [ -2.6703,  2.3629,  1.7493, ...]                           [ -0.0633, 0.0431, -0.0422, ...]<\/mark><br>-------------------------------------    ---------------------------------                           --------------------------------<br>|&lt;------- columns: 64 -------&gt;|          |&lt;------- columns: 64 -------&gt;|                             |&lt;------- columns: 64 -------&gt;|<\/pre>\n<\/div>\n<\/div>\n<\/div>\n<\/div>\n\n\n\n<p class=\"has-primary-color has-text-color has-link-color wp-elements-9d88ced3a5834b79062bd8b6afd9eccb\">\u8ba1\u7b97 Q\u3001K\u3001V \u77e9\u9635\uff0c\u8fd9\u91cc\u53ea\u6709\u6700\u540e\u4e00\u884c\uff08\u5373\u5bf9\u5e94\u6700\u540e\u4e00\u4e2aToken \u201c a\u201d\uff09\u9700\u8981\u8ba1\u7b97\u3002\u8fd9\u91cc\u7684\u77e9\u9635 K \u3001V \u9700\u8981\u8fdb\u884c\u7f13\u5b58\uff0c\u5728\u540e\u7eed\u6bcf\u4e00\u6b21\u81ea\u56de\u5f52\u7684\u8fc7\u7a0b\u90fd\u9700\u8981\u5b8c\u6574\u5730\u4f7f\u7528 K V \u77e9\u9635\u4e2d\u6240\u6709\u503c\uff0c\u4e0b\u4e00\u6b65\u4f1a\u8bf4\u660e\u539f\u56e0\u3002Q \u77e9\u9635\u5728\u5b8c\u6210\u540e\u7eed\u77e9\u9635\u8ba1\u7b97\u540e\uff0c\u5c31\u53ef\u4ee5\u91ca\u653e\u3002<\/p>\n\n\n\n<h4 class=\"wp-block-heading\">2. 
5 \u8ba1\u7b97 Attention Score<\/h4>\n\n\n\n<div class=\"wp-block-columns is-layout-flex wp-container-core-columns-is-layout-28f84493 wp-block-columns-is-layout-flex\">\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\" style=\"flex-basis:25%\">\n<p class=\"has-text-align-center\">\\(\\text{Attention Score} \\)<\/p>\n\n\n\n<p class=\"has-text-align-center\">\\(= \\frac{QK^T}{\\sqrt{d}} \\)<\/p>\n\n\n\n<p><\/p>\n<\/div>\n\n\n\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\" style=\"font-size:12px;flex-basis:75%\">\n<div class=\"wp-block-columns is-layout-flex wp-container-core-columns-is-layout-28f84493 wp-block-columns-is-layout-flex\" style=\"font-size:8px\">\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\" style=\"font-size:8px;flex-basis:100%\">\n<pre class=\"wp-block-preformatted has-base-color has-contrast-background-color has-text-color has-background has-link-color wp-elements-bfc9b857849d77a514343b47da1b1a02\" style=\"font-size:8px\">|-----------------------------------------------------------------------------------------------------|<br>|       |           Attention Score Matrix shape (13 x 13)                                            |<br>| Token |---------------------------------------------------------------------------------------------|<br>|       |   It     \u00e2\u0122     \u013b      s     \u0120very  \u0120hot    \u0120in  \u0120summer  .     
\u0120Sw    imming  \u0120is    \u0120a    |<br>|-------|---------------------------------------------------------------------------------------------|----<br>|It     | [ 0.14, -1.53, -1.45, -1.71, -1.69, -1.74, -2.36, -2.27, -2.37, -1.33, -0.58, -2.40,   \/   ]|  |<br>|\u00e2\u0122     | [ 0.70, -0.93, -1.72, -1.02, -1.52, -2.24, -1.90, -2.19, -1.63, -2.13, -1.66, -2.14,   \/   ]|  |<br>|\u013b      | [-0.60, -1.81, -1.99, -1.96, -2.57, -1.84, -1.62, -2.04, -0.98, -1.18, -2.23, -2.25,   \/   ]|  |<br>|s      | [-0.46, -1.33, -1.60, -2.65, -2.24, -1.99, -2.89, -1.44, -2.05, -2.77, -2.09, -2.74,   \/   ]|  |<br>|\u0120very  | [ 0.29, -1.42, -1.77, -1.15, -0.94, -1.14, -1.81, -1.04, -1.77, -2.13, -0.60, -0.82,   \/   ]|  |<br>|\u0120hot   | [ 0.03, -0.68, -0.59, -0.95, -1.78, -0.10, -0.95, -0.14, -1.32, -0.57,  0.06, -1.07,   \/   ]|  13<br>|\u0120in    | [-0.71, -1.72, -1.53, -2.18, -1.67, -1.93, -3.41, -1.69, -2.74, -1.89, -1.17, -2.02,   \/   ]|  rows<br>|\u0120summer| [-0.34, -1.49, -1.35, -1.31, -1.12, -0.89, -1.49, -1.11, -1.51, -1.15, -1.45, -1.20,   \/   ]|  |<br>|.      
| [-0.89, -1.73, -2.67, -2.80, -2.45, -2.37, -4.39, -2.33, -4.42, -2.73, -1.82, -3.21,   \/   ]|  |<br>|\u0120Sw    | [-0.05, -1.15, -1.76, -1.15, -1.68, -0.74, -1.15, -1.35, -1.36, -1.29, -0.43, -1.51,   \/   ]|  |<br>|imming | [-0.02, -1.65, -0.87, -0.35, -1.18, -0.65, -0.33, -1.25, -0.38, -1.68, -2.15, -1.08,   \/   ]|  |<br>|\u0120is    | [-0.97, -2.03, -2.56, -2.94, -1.96, -2.71, -4.07, -2.46, -3.51, -2.68, -1.88, -2.99,   \/   ]|  |<br>|<mark style=\"background-color:rgba(0, 0, 0, 0);color:#ffff00\" class=\"has-inline-color\">\u0120a     | [-1.10, -1.95, -2.12, -3.12, -2.72, -2.17, -3.88, -2.06, -3.57, -2.49, -1.86, -2.83, -3.40 ]<\/mark>|  |<br>|-------|---------------------------------------------------------------------------------------------|----<br>        |&lt;------------------------------------ columns: 13 ------------------------------------------&gt;|<\/pre>\n<\/div>\n<\/div>\n<\/div>\n<\/div>\n\n\n\n<p>\u7279\u522b\u9700\u8981\u6ce8\u610f\u7684\uff0c\u8fd9\u4e00\u6b65\u4e2d\uff0c<strong><mark style=\"background-color:rgba(0, 0, 0, 0)\" class=\"has-inline-color has-primary-color\">\u201cAttention Score Matrix\u201d\u6700\u540e\u4e00\u884c\u7684\u8ba1\u7b97\uff0c\u9700\u8981\u524d\u9762\u7684Q\u7684\u6700\u540e\u4e00\u884c\uff0c\u6b64\u5916\u8fd8\u9700\u8981\u6574\u4e2a K \u77e9\u9635\u3002\u8fd9\u5c31\u662f\u4e3a\u4ec0\u4e48 K \u77e9\u9635\u662f\u9700\u8981\u7f13\u5b58\u7684\u3002<\/mark><\/strong><\/p>\n\n\n\n<h4 class=\"wp-block-heading\">2. 
6 \u8ba1\u7b97 Masked Attention Score<\/h4>\n\n\n\n<div class=\"wp-block-columns is-layout-flex wp-container-core-columns-is-layout-28f84493 wp-block-columns-is-layout-flex\">\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\" style=\"flex-basis:25%\">\n<p class=\"has-text-align-center\">\\(\\text{Masked Attention Score} \\)<\/p>\n\n\n\n<p class=\"has-text-align-center\">\\(= \\frac{QK^T}{\\sqrt{d}} + \\text{mask} \\)<\/p>\n\n\n\n<p><\/p>\n<\/div>\n\n\n\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\" style=\"font-size:12px;flex-basis:75%\">\n<div class=\"wp-block-columns is-layout-flex wp-container-core-columns-is-layout-28f84493 wp-block-columns-is-layout-flex\" style=\"font-size:8px\">\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\" style=\"font-size:8px;flex-basis:100%\">\n<pre class=\"wp-block-preformatted has-base-color has-contrast-background-color has-text-color has-background has-link-color wp-elements-69a4683401f28e504071f08636567ac3\" style=\"font-size:8px\">|-----------------------------------------------------------------------------------------------------|<br>|       |           Attention Score Matrix shape (13 x 13)                                            |<br>| Token |---------------------------------------------------------------------------------------------|<br>|       |   It     \u00e2\u0122     \u013b      s     \u0120very  \u0120hot    \u0120in  \u0120summer  .     
\u0120Sw    imming  \u0120is    \u0120a    |<br>|-------|---------------------------------------------------------------------------------------------|----<br>|It     | [ 0.14,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf ]|  |<br>|\u00e2\u0122     | [ 0.70, -0.93,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf ]|  |<br>|\u013b      | [-0.60, -1.81, -1.99,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf ]|  |<br>|s      | [-0.46, -1.33, -1.60, -2.65,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf ]|  |<br>|\u0120very  | [ 0.29, -1.42, -1.77, -1.15, -0.94,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf ]|  |<br>|\u0120hot   | [ 0.03, -0.68, -0.59, -0.95, -1.78, -0.10,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf ]|  13<br>|\u0120in    | [-0.71, -1.72, -1.53, -2.18, -1.67, -1.93, -3.41,  -inf,  -inf,  -inf,  -inf,  -inf,  -inf ]|  rows<br>|\u0120summer| [-0.34, -1.49, -1.35, -1.31, -1.12, -0.89, -1.49, -1.11,  -inf,  -inf,  -inf,  -inf,  -inf ]|  |<br>|.      
| [-0.89, -1.73, -2.67, -2.80, -2.45, -2.37, -4.39, -2.33, -4.42,  -inf,  -inf,  -inf,  -inf ]|  |<br>|\u0120Sw    | [-0.05, -1.15, -1.76, -1.15, -1.68, -0.74, -1.15, -1.35, -1.36, -1.29,  -inf,  -inf,  -inf ]|  |<br>|imming | [-0.02, -1.65, -0.87, -0.35, -1.18, -0.65, -0.33, -1.25, -0.38, -1.68, -2.15,  -inf,  -inf ]|  |<br>|\u0120is    | [-0.97, -2.03, -2.56, -2.94, -1.96, -2.71, -4.07, -2.46, -3.51, -2.68, -1.88, -2.99,  -inf ]|  |<br>|<mark style=\"background-color:rgba(0, 0, 0, 0);color:#ffff00\" class=\"has-inline-color\">\u0120a     | [-1.10, -1.95, -2.12, -3.12, -2.72, -2.17, -3.88, -2.06, -3.57, -2.49, -1.86, -2.83, -3.40<\/mark> ]|  |<br>|-------|---------------------------------------------------------------------------------------------|----<br>        |&lt;------------------------------------ columns: 13 ------------------------------------------&gt;|<\/pre>\n<\/div>\n<\/div>\n<\/div>\n<\/div>\n\n\n\n<h4 class=\"wp-block-heading\">2.7 \u8ba1\u7b97 Softmax Masked Attention Score<\/h4>\n\n\n\n<div class=\"wp-block-columns is-layout-flex wp-container-core-columns-is-layout-28f84493 wp-block-columns-is-layout-flex\">\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\" style=\"flex-basis:25%\">\n<p class=\"has-text-align-center\">\\(\\text{Softmax Masked Attention Score} \\)<\/p>\n\n\n\n<p class=\"has-text-align-center\">\\(= \\text{softmax}(\\frac{QK^T}{\\sqrt{d}} + \\text{mask}) \\)<\/p>\n\n\n\n<p><\/p>\n<\/div>\n\n\n\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\" style=\"font-size:12px;flex-basis:75%\">\n<div class=\"wp-block-columns is-layout-flex wp-container-core-columns-is-layout-28f84493 wp-block-columns-is-layout-flex\" style=\"font-size:8px\">\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\" style=\"font-size:8px;flex-basis:100%\">\n<pre class=\"wp-block-preformatted has-base-color has-contrast-background-color has-text-color has-background has-link-color 
wp-elements-9e68f03baa3e34f7a085da5b04d081a7\" style=\"font-size:8px\">|---------------------------------------------------------------------------------------|<br>|       |  Softmax Masked  Attention Score Matrix shape (13 x 13)                       |<br>| Token |-------------------------------------------------------------------------------|<br>|       |  It    \u00e2\u0122    \u013b     s     \u0120very  \u0120hot  \u0120in  \u0120summer  .   \u0120Sw   imming  \u0120is     |<br>|-------|-------------------------------------------------------------------------------|----<br>|It     | [1.00  0.00  0.00  0.00   0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00 0.00]|  |<br>|\u00e2\u0122     | [0.84  0.16  0.00  0.00   0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00 0.00]|  |                  V [:3]  shape (12 x 64)<br>|\u013b      | [0.65  0.19  0.16  0.00   0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00 0.00]|  |              --------------------------------<br>|s      | [0.54  0.23  0.17  0.06   0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00 0.00]|  |              [ 0.0452,  0.0628,  0.1463, ...]<br>|\u0120very  | [0.54  0.10  0.07  0.13   0.16  0.00  0.00  0.00  0.00  0.00  0.00  0.00 0.00]|  |              [-0.1361,  0.1379,  0.0150, ...]<br>|\u0120hot   | [0.29  0.14  0.16  0.11   0.05  0.25  0.00  0.00  0.00  0.00  0.00  0.00 0.00]|  13             [ 0.0039, -0.1295, -0.0311, ...]<br>|\u0120in    | [0.36  0.13  0.16  0.08   0.14  0.11  0.02  0.00  0.00  0.00  0.00  0.00 0.00]|  rows           [-0.0391,  0.0581,  0.0511, ...]<br>|\u0120summer| [0.26  0.08  0.09  0.10   0.12  0.15  0.08  0.12  0.00  0.00  0.00  0.00 0.00]|  |              [ 0.0963,  0.3563, -0.1477, ...]<br>|.      
| [0.40  0.17  0.07  0.06   0.08  0.09  0.01  0.10  0.01  0.00  0.00  0.00 0.00]|  |                ...<br>|\u0120Sw    | [0.27  0.09  0.05  0.09   0.05  0.14  0.09  0.07  0.07  0.08  0.00  0.00 0.00]|  |              [ 0.2759, -0.2783,  0.3240, ...]<br>|imming | [0.19  0.04  0.08  0.14   0.06  0.10  0.14  0.06  0.13  0.04  0.02  0.00 0.00]|  |              [<mark style=\"background-color:rgba(0, 0, 0, 0);color:#ffff00\" class=\"has-inline-color\">-0.0633,  0.0431, -0.0422, ...]<\/mark><br>|\u0120is    | [0.30  0.10  0.06  0.04   0.11  0.05  0.01  0.07  0.02  0.05  0.12  0.04 0.00]|  |              --------------------------------<br>|\u0120a     | [<mark style=\"background-color:rgba(0, 0, 0, 0);color:#ffff00\" class=\"has-inline-color\">0.25  0.11  0.09  0.03   0.05  0.09  0.02  0.10  0.02  0.06  0.12  0.04 0.03<\/mark>]|  |              |&lt;------- columns: 64 -------&gt;|<br>|-------|-------------------------------------------------------------------------------|----<br>        |&lt;---------------------------------- columns: 13 ------------------------------&gt;|<\/pre>\n<\/div>\n<\/div>\n<\/div>\n<\/div>\n\n\n\n<h4 class=\"wp-block-heading\">2. 
8 \u8ba1\u7b97 Contextual Embeddings<\/h4>\n\n\n\n<div class=\"wp-block-columns is-layout-flex wp-container-core-columns-is-layout-28f84493 wp-block-columns-is-layout-flex\">\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\" style=\"flex-basis:25%\">\n<p class=\"has-text-align-center\">\\(\\text{Contextual Embeddings} \\)<\/p>\n\n\n\n<p class=\"has-text-align-center\">\\(= \\text{softmax}(\\frac{QK^T}{\\sqrt{d}} + \\text{mask})V \\)<\/p>\n<\/div>\n\n\n\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\" style=\"font-size:12px;flex-basis:75%\">\n<div class=\"wp-block-columns is-layout-flex wp-container-core-columns-is-layout-28f84493 wp-block-columns-is-layout-flex\" style=\"font-size:8px\">\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\" style=\"font-size:8px;flex-basis:100%\">\n<pre class=\"wp-block-preformatted has-base-color has-contrast-background-color has-text-color has-background has-link-color wp-elements-2dfd343ac9278805af79aad10ba49219\" style=\"font-size:8px\">Token      | Contextual Embedding (12 x 768)<br>--------------------------------------------<br>It         | [ 0.0452,  0.0628,  0.1463,...]<br>\u00e2\u0122         | [ 0.0153,  0.0752,  0.1247,...]<br>\u013b          | [ 0.0034,  0.0464,  0.0923,...]<br>s          | [-0.0082,  0.0464,  0.0801,...]<br>\u0120very      | [ 0.0218,  0.1029,  0.0621,...]<br>\u0120hot       | [ 0.0327,  0.0892,  0.0409,...]<br>\u0120in        | [ 0.0249,  0.0964,  0.0329,...]<br>\u0120summer    | [ 0.0583,  0.1195,  0.0068,...]<br>.          
| [ 0.0334,  0.1100,  0.0366,...]<br>\u0120Sw        | [ 0.0086,  0.0846,  0.0074,...]<br>imming     | [-0.0049,  0.0841, -0.0339,...]<br>\u0120is        | [ 0.0410,  0.0706,  0.0077,...]<br><mark style=\"background-color:rgba(0, 0, 0, 0);color:#ffff00\" class=\"has-inline-color\">\u0120a         | [ 0.0427 , 0.0503 , 0.0080,...]<\/mark><\/pre>\n<\/div>\n<\/div>\n<\/div>\n<\/div>\n\n\n\n<p>\u6240\u4ee5\uff0c\u8fd9\u4e00\u6b65\u4e2d\uff0c<strong><mark style=\"background-color:rgba(0, 0, 0, 0)\" class=\"has-inline-color has-primary-color\">\u201cContextual Embeddings\u201d\u6700\u540e\u4e00\u884c\u7684\u8ba1\u7b97\uff0c\u9700\u8981\u524d\u9762 Softmax Masked Attention Score Matrix \u7684\u6700\u540e\u4e00\u884c\uff0c\u6b64\u5916\u8fd8\u9700\u8981\u6574\u4e2a V \u77e9\u9635\u3002\u8fd9\u5c31\u662f\u4e3a\u4ec0\u4e48 V \u77e9\u9635\u662f\u9700\u8981\u7f13\u5b58\u7684\u3002<\/mark><\/strong><\/p>\n\n\n\n<p>\u6b64\u5916\u53ef\u4ee5\u770b\u5230\uff0c\u5728\u8fd9\u4e2a\u81ea\u56de\u5f52\u7684\u8ba1\u7b97\u4e2d\uff0cQ \u77e9\u9635\u524d\u9762\u7684\u6240\u6709\u884c\uff08\u5373\u4e0a\u4e00\u8f6e\u8ba1\u7b97\u7684Q\u77e9\u9635\uff09\u90fd\u7528\u4e0d\u4e0a\uff0c\u8fd9\u4e5f\u662f\u4e3a\u4ec0\u4e48 Q \u77e9\u9635\u4e0d\u9700\u8981\u7f13\u5b58\uff0c\u5373\u6211\u4eec\u9700\u8981\u7684\u201cKV Cache\u201d\uff0c\u800c\u4e0d\u662f\u201cQKV Cache\u201d\u7684\u539f\u56e0\u3002<\/p>\n\n\n\n<h3 class=\"wp-block-heading\">3. 
\u8ba1\u7b97\u56fe\u793a<\/h3>\n\n\n\n<p>\u8fd9\u91cc\u4f9d\u65e7\u4f7f\u7528\u4e86\u56fe\u793a\u7684\u65b9\u5f0f\u5c55\u793a\u4e86\u5728\u201c\u81ea\u56de\u5f52\u201d\u8fc7\u7a0b\u4e2d\u7684\u6570\u5b66\u8ba1\u7b97\u3002\u5728\u4e0b\u56fe\u4e2d\uff0c\u7b2c\u4e00\u4e2a\u751f\u6210\u7684 Token \u4e3a\u201c a\u201d\uff0c\u8be5 Token \u5728\u8fdb\u5165 Decoder \u6a21\u578b\u518d\u6b21\u8fdb\u884c\u8ba1\u7b97\u65f6\uff08\u5373\u201c\u81ea\u56de\u5f52\u201d\uff09\uff0c\u4e0b\u56fe\u4e2d\uff1a<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li>\u7c89\u7ea2\u8272\u80cc\u666f\u90e8\u5206\u4e3a\u65b0\u7684\u3001\u9700\u8981\u8ba1\u7b97\u7684\u90e8\u5206\uff1b<\/li>\n\n\n\n<li>\u7070\u8272\u80cc\u666f\u90e8\u5206\u4e3a\u867d\u7136\u4e0d\u9700\u8981\u8ba1\u7b97\uff0c\u4f46\u5728\u8ba1\u7b97\u65b0\u7684\u5185\u5bb9\u65f6\uff0c\u9700\u8981\u4f7f\u7528\u7684\u90e8\u5206\u3002<\/li>\n<\/ul>\n\n\n\n<p>\u7070\u8272\u90e8\u5206\u5373\u4e3a\u201cKV Cache\u201d\u9700\u8981\u7f13\u5b58\u7684\u90e8\u5206\u3002\u5373\uff0c\u6bcf\u4e00\u4e2a Token \u5bf9\u5e94\u7684 \u201cK\u201d\u3001\u201cV\u201d \u77e9\u9635\u90fd\u9700\u8981\u5728\u540e\u7eed\u7684\u8ba1\u7b97\u4e2d\u4f7f\u7528\u3002\u4ea6\u5373\uff0c\u6bcf\u4e00\u4e2a Token \u7684 Key \u5411\u91cf\u90fd\u9700\u8981\u4fdd\u5b58\uff0c\u7528\u4e8e\u4e0e\u65b0\u7684 Token \u7684 Query \u5411\u91cf\u8fdb\u884c\u70b9\u79ef\u8ba1\u7b97\u201c\u5173\u6ce8\u5ea6\u201d\u503c\uff1b\u6bcf\u4e00\u4e2a Token \u7684 V \u5411\u91cf\u4e5f\u9700\u8981\u4fdd\u5b58\uff0c\u7528\u4e8e\u6309\u201c\u5173\u6ce8\u5ea6\u201d\u52a0\u6743\u6c42\u548c\uff0c\u5f97\u5230\u65b0 Token \u7684 Contextual Embedding\u3002<\/p>\n\n\n\n<figure data-wp-context=\"{&quot;imageId&quot;:&quot;69eaa9645421c&quot;}\" data-wp-interactive=\"core\/image\" data-wp-key=\"69eaa9645421c\" class=\"wp-block-image size-large wp-lightbox-container\"><img loading=\"lazy\" decoding=\"async\" width=\"7256\" height=\"12293\" data-wp-class--hide=\"state.isContentHidden\" data-wp-class--show=\"state.isContentVisible\" data-wp-init=\"callbacks.setButtonStyles\" data-wp-on--click=\"actions.showLightbox\" 
data-wp-on--load=\"callbacks.setButtonStyles\" data-wp-on-window--resize=\"callbacks.setButtonStyles\" src=\"https:\/\/www.orczhou.com\/wp-content\/uploads\/2025\/12\/autoregression-multi-head-attention.svg\" alt=\"\" class=\"wp-image-21468\"\/><button\n\t\t\tclass=\"lightbox-trigger\"\n\t\t\ttype=\"button\"\n\t\t\taria-haspopup=\"dialog\"\n\t\t\taria-label=\"Enlarge\"\n\t\t\tdata-wp-init=\"callbacks.initTriggerButton\"\n\t\t\tdata-wp-on--click=\"actions.showLightbox\"\n\t\t\tdata-wp-style--right=\"state.imageButtonRight\"\n\t\t\tdata-wp-style--top=\"state.imageButtonTop\"\n\t\t>\n\t\t\t<svg xmlns=\"http:\/\/www.w3.org\/2000\/svg\" width=\"12\" height=\"12\" fill=\"none\" viewBox=\"0 0 12 12\">\n\t\t\t\t<path fill=\"#fff\" d=\"M2 0a2 2 0 0 0-2 2v2h1.5V2a.5.5 0 0 1 .5-.5h2V0H2Zm2 10.5H2a.5.5 0 0 1-.5-.5V8H0v2a2 2 0 0 0 2 2h2v-1.5ZM8 12v-1.5h2a.5.5 0 0 0 .5-.5V8H12v2a2 2 0 0 1-2 2H8Zm2-12a2 2 0 0 1 2 2v2h-1.5V2a.5.5 0 0 0-.5-.5H8V0h2Z\" \/>\n\t\t\t<\/svg>\n\t\t<\/button><\/figure>\n\n\n\n<p>\u5728\u4e0a\u8ff0\u7684\u8ba1\u7b97\u4e2d\uff0c\u6ce8\u610f\u5230\uff0c\u5728\u4e00\u6b21\u7684\u65b0\u7684\u201c\u81ea\u56de\u5f52\u201d\u4e2d\uff0c\u6700\u7ec8\u9700\u8981\u989d\u5916\u8ba1\u7b97\u7684\u5c31\u662f\u65b0Token\uff08\u8fd9\u91cc\u662f\u201c a\u201d\uff09\u5bf9\u5e94\u7684 Contextual Embedding\uff0c\u8be5\u5185\u5bb9\u8ba1\u7b97\uff0c\u9700\u8981\u4f7f\u7528\u524d\u8ff0\u6240\u6709 Token \u5bf9\u5e94\u7684 K\u3001V \u503c\uff0c\u5373\u8fd9\u91cc\u7684 K \u548c V \u77e9\u9635\u3002<\/p>\n\n\n\n<p>\u6240\u4ee5\uff0c\u5728\u4e00\u6b21\u81ea\u56de\u5f52\u63a8\u7406\u4e2d\uff0c\u6700\u597d\u4e0a\u4e00\u6b21\u8ba1\u7b97\u7684\u6240\u6709 Token \u7684 K\u3001V \u5411\u91cf\u90fd\u7f13\u5b58\u8d77\u6765\uff0c\u907f\u514d\u91cd\u590d\u8ba1\u7b97\u3002\u672c\u6b21\u81ea\u56de\u5f52\u4e2d\u8ba1\u7b97\u65b0Token\u7684\u5bf9\u5e94\u7684 K\u3001V \u5411\u91cf\u4e5f\u9700\u8981\u7f13\u5b58\uff0c\u4ee5\u4f9b\u540e\u7eed\u4f7f\u7528\u3002<\/p>\n\n\n\n<h3 
class=\"wp-block-heading\">4. KV Cache \u7684\u5185\u5b58\u6d88\u8017<\/h3>\n\n\n\n<p>\u5728\u63a8\u7406\u4f18\u5316\u4e2d\uff0c\u4e00\u4e2a\u91cd\u8981\u786c\u9650\u5236\u4fbf\u662fGPU\u5361\u7684\u663e\u5b58\uff08memory\uff09\u5927\u5c0f\u3002\u5f53\u524d\uff0c\u4e3b\u6d41\u7684\u4f01\u4e1a\u7ea7\u663e\u5361<a href=\"https:\/\/www.nvidia.com\/en-us\/data-center\/h100\/\">H100\u663e\u5b58\u4e3a80GB<\/a>\uff0c\u9ad8\u7aef\u663e\u5361 <a href=\"https:\/\/www.nvidia.com\/en-us\/data-center\/h200\/\">H200 \u663e\u5b58\u4e3a141 GB<\/a>\u3002\u73b0\u5728\u7684 LLM \u53c2\u6570\u91cf\u901a\u5e38\u5de8\u5927\uff0c\u53c2\u6570\u52a0\u8f7d\u5c31\u9700\u8981\u8017\u8d39\u5de8\u5927\u7684\u663e\u5b58\uff0c\u4ee5\u6700\u65b0\u7684 llama 4 17B\u4e3a\u4f8b\uff0c\u6309 FP16 \uff08\u534a\u7cbe\u5ea6\uff09\u8003\u8651\uff0c\u5219\u9700\u8981\u6d88\u8017\u7ea6 30+ GB \u3002\u5361\u7247\u4e0a\u5269\u4f59\u7684\u5185\u5b58\uff0c\u624d\u662f\u7528\u4e8e\u5b9e\u9645\u7684\u63a8\u7406\u4f7f\u7528\u3002\u800c\u6bcf\u6b21\u63a8\u7406\uff0c\u4f8b\u5982\u63d0\u793a\u8bcd\u662f1000\u4e2aToken\uff0c\u8f93\u51fa\u4e5f\u662f1000\u4e2aToken\uff0c\u90a3\u4e48\uff0c\u5728\u751f\u6210\u6700\u540e\u4e00\u4e2aToken\u7684\u65f6\u5019\uff0c\u9700\u8981\u7684\u5185\u5b58\uff08\u63095%\u7684\u7ecf\u9a8c\u503c\u8ba1\u7b97\uff09\u7ea6\u4e3a1.5GB\u3002\u8fd9\u65f6\u5019\uff0c\u5355\u4e2aH100\u7684\u663e\u5361\u4e5f\u53ea\u80fd\u652f\u6301\u7ea633\u4e2a\u5e76\u53d1\uff0c\u5b9e\u9645\u7684\u60c5\u51b5\u5219\u8981\u8003\u8651\u7cfb\u7edf\u5185\u5b58\u7b49\uff0c\u5b9e\u9645\u53ef\u652f\u6301\u7684\u5e76\u53d1\u4f1a\u6bd4\u8fd9\u4e2a\u9884\u4f30\u5c11\u5f88\u591a\u3002<\/p>\n\n\n\n<p>\u5728\u8fd9\u7bc7\u6587\u7ae0\uff1a<a href=\"https:\/\/developer.nvidia.com\/blog\/mastering-llm-techniques-inference-optimization\/\">Mastering LLM Techniques: Inference Optimization@developer.nvidia.com<\/a> \u4e2d\u4e5f\u6709\u7c7b\u4f3c\u7684\u4f30\u7b97\uff1a<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li>7B \u7684\u6a21\u578b\uff08\u5982Llama 2 
7B\uff09\uff0c\u53c2\u6570\u662f16\u4f4d\uff08FP16 or BF16\uff09\u5219\u53c2\u6570\u9700\u8981\u6d88\u8017\u7ea6 14 GB \u663e\u5b58<\/li>\n\n\n\n<li>Token \u6570\u4e3a4096\u7684\u63a8\u7406\uff08decoder\uff09\uff0c\u5219\u9700\u8981\u7ea6 2 GB KV Cache <\/li>\n<\/ul>\n\n\n\n<p>\u4ece\u4e0a\u8ff0\u7c97\u7565\u7684\u9884\u4f30\u53ef\u4ee5\u770b\u5f97\u51fa\u6765\uff0c\u9ad8\u6548\u4f7f\u7528\u663e\u5b58\u8d44\u6e90\u5bf9\u4e8e LLM \u63a8\u7406\u6765\u8bf4\u81f3\u5173\u91cd\u8981\u3002\u6240\u4ee5\uff0c\u5404\u63a8\u7406\u6846\u67b6\u5219\u4f1a\u901a\u8fc7\u5404\u79cd\u65b9\u6cd5\u5c1d\u8bd5\u53bb\u4f18\u5316\u201cKV Cache\u201d\u4ee5\u964d\u4f4e\u663e\u5b58\u4f7f\u7528\u3002\u8fd9\u4e9b\u65b9\u6cd5\u5305\u62ec\u201c\u91cf\u5316\u201d\uff08Quantization\uff09\u3001MQA\/GQA \u7b49\u3002<\/p>\n\n\n\n<h3 class=\"wp-block-heading\">5. Multi-Query Attention\/Group-Query Attention<\/h3>\n\n\n\n<p>\u53ef\u4ee5\u770b\u5230\uff0c\u65e0\u8bba\u662f\u5728\u6a21\u578b\u53c2\u6570\u52a0\u8f7d\u7684\u65f6\u5019\uff0c\u8fd8\u662f\u63a8\u7406 KV Cache \u9636\u6bb5\uff0c\u90fd\u9700\u8981\u5927\u91cf\u7684\u663e\u5b58\u3002\u5173\u4e8e MQA \u548c GQA \u7684\u7ecf\u5178\u8bba\u6587\u662f\uff1a<a href=\"https:\/\/arxiv.org\/pdf\/2305.13245\">GQA: Training Generalized Multi-Query Transformer Models from Multi-Head Checkpoints<\/a>\u3002<\/p>\n\n\n\n<h4 class=\"wp-block-heading\">5.1 \u5173\u4e8eMQA\u4e0eGQA<\/h4>\n\n\n\n<p>Multi-Query Attention \u5219\u5c1d\u8bd5\u901a\u8fc7\u51cf\u5c11 \\(W^K \\, W^V \\) \u53c2\u6570\u7684\u6570\u91cf\u6765\u51cf\u5c11\u4e0a\u8ff0\u663e\u5b58\uff0c\u4ece\u800c\u589e\u52a0\u63a8\u7406\u901f\u5ea6\u4e0e\u5e76\u53d1\u80fd\u529b\u3002\u53c2\u8003\u4e0b\u56fe\uff0c\u53ef\u4ee5\u770b\u5230\u5728\u6bcf\u4e00\u4e2a Layer \u4e2d\uff0c\u6240\u6709\u7684 Head \u5171\u4eab\u4e00\u7ec4 \\(W^K \\, W^V \\) \u53c2\u6570\uff0c\u90a3\u4e48\u8fd9\u4e24\u4e2a\u76f8\u5173\u53c2\u6570\u5c31\u51cf\u5c11\u5230\u4e86\u539f\u6765\u7684 \\(\\frac{1}{h} 
\\)\u3002<\/p>\n\n\n\n<p>\u66f4\u8fdb\u4e00\u6b65\u7684\uff0c\u4e3a\u4e86\u51cf\u5c11\u4e0a\u8ff0\u65b9\u6cd5\uff08MQA\uff09\u5bf9\u4e8e\u6a21\u578b\u6548\u679c\u7684\u5f71\u54cd\uff0c\u53e6\u4e00\u4e2a\u4f18\u5316\u662f Group-Query Attention\u3002\u5373\u5982\u4e0b\u56fe\uff0c\u4e00\u7ec4 Heads \u5171\u4eab\u4e00\u7ec4 \\(W^K \\, W^V \\) \u3002\u53ef\u4ee5\u4f9d\u7167\u5206\u7ec4\u7684\u5927\u5c0f\uff0c\u4ee5\u5e73\u8861\u6a21\u578b\u6548\u679c\u4e0e\u8d44\u6e90\u4f7f\u7528\u3002\u5982\u679c\u4e00\u4e2a Head \u4e00\u7ec4 \\(W^K \\, W^V \\) \u5219\u9000\u5316\u5230\u666e\u901a\u7684 Multi-Head Attention\uff1b\u5982\u679c\u6240\u6709 Heads \u5206\u5230\u4e00\u7ec4\uff0c\u5219\u9000\u5316\u5230\u666e\u901a\u7684 Multi-Query Attention\u3002<\/p>\n\n\n\n<figure class=\"wp-block-image size-large\"><img loading=\"lazy\" decoding=\"async\" width=\"7645\" height=\"3273\" src=\"https:\/\/www.orczhou.com\/wp-content\/uploads\/2025\/12\/mha-mqa-gqa.svg\" alt=\"\" class=\"wp-image-21486\"\/><\/figure>\n\n\n\n<h4 class=\"wp-block-heading\">5.2 \u6a21\u578b\u8bad\u7ec3 Uptraining<\/h4>\n\n\n\n<p>\u6b64\u5916\uff0c\u6bd4\u8f83\u5173\u952e\u7684\uff0c\u8bba\u6587\u63d0\u51fa\u4e86\u4e00\u4e9b\u5173\u4e8e GQA \u67b6\u6784\u7684\u8bad\u7ec3\u4f18\u5316\u3002<\/p>\n\n\n\n<p>\u4f8b\u5982\uff0c\u4ece\u4e00\u4e2a MHA \u67b6\u6784\u5f00\u59cb\u8bad\u7ec3\uff0c\u7136\u540e\u4ece\u67d0\u4e2a checkpoint \u5f00\u59cb\uff0c\u5c06MHA\u6a21\u578b\u6539\u6210GQA\u6a21\u578b\uff0c\u5728\u521d\u59cb\u5316\u5206\u7ec4\u53c2\u6570\u65f6\uff0c\u5219\u4f7f\u7528\u539f MHA \u6a21\u578b\u4e2d\u53c2\u6570\u53bb\u6c42\u4e00\u4e2a\u5747\u503c\u7684\u65b9\u5f0f\u521d\u59cb\u5316GQA\u4e2d\u5bf9\u5e94\u7684 \\(W^K \\, W^V \\) 
\u3002\u7136\u540e\u7ee7\u7eed\u4f7f\u7528\u8bed\u6599\u5e93\u5bf9\u4e8e\u8be5\u65b0\u6a21\u578b\u8bad\u7ec3\u3002<\/p>\n\n\n\n<p>\u8bba\u6587\u6307\u51fa\uff0c\u8fd9\u65f6\u5019\u53ea\u9700\u8981\u4f7f\u7528\u975e\u5e38\u5c11\u7684\u8ba1\u7b97\u8d44\u6e90\u5c31\u53ef\u4ee5\u8bad\u7ec3\u51fa\u6548\u679c\u8fd8\u4e0d\u9519\u7684GQA\u65b0\u6a21\u578b\u3002\u65b0\u7684GQA\u6a21\u578b\uff0c\u5219\u53ef\u4ee5\u4f7f\u7528\u66f4\u5c11\u7684\u663e\u5b58\u8d44\u6e90\uff0c\u6709\u66f4\u597d\u7684\u5e76\u53d1\u541e\u5410\u80fd\u529b\uff0c\u540c\u65f6\u4e5f\u8fbe\u5230\u8fd8\u6bd4\u8f83\u597d\u7684\u6548\u679c\u3002<\/p>\n\n\n\n<h3 class=\"wp-block-heading\">\u53c2\u8003<\/h3>\n\n\n\n<ul class=\"wp-block-list\">\n<li><a href=\"https:\/\/developer.nvidia.com\/blog\/mastering-llm-techniques-inference-optimization\/\">Mastering LLM Techniques: Inference Optimization@2023-11<\/a> <\/li>\n<\/ul>\n","protected":false},"excerpt":{"rendered":"<p>\u5927\u8bed\u8a00\u6a21\u578b\u7684\u4e00\u4e2a\u91cd\u8981\u65b9\u5411\u662f\u201c\u63a8\u7406\u201d\u4f18\u5316\uff0c\u5373\u5982\u4f55\u5728\u6709\u9650\u7684\u786c\u4ef6\u73af\u5883\u4e2d\u63d0\u5347\u63a8\u7406\u7684\u6548\u7387\u3002\u5bf9\u4e8e\u6240\u6709\u7684 MaaS \u670d\u52a1\u63d0\u4f9b\u65b9\uff0c\u8fd9\u90fd\u662f\u81f3\u5173\u91cd\u8981\u7684\u3002\u4e00\u65b9\u9762\u5173\u4e4e\u7528\u6237\u7684\u4f7f\u7528\u4f53\u9a8c\uff08\u8bf8\u5982TTFT\uff0ctime to first token\uff09\u3001\u53e6\u4e00\u65b9\u9762\u5173\u4e8e\u670d\u52a1\u63d0\u4f9b\u7684\u6210\u672c\uff08\u6709\u9650\u7684GPU\u5982\u4f55\u63d0\u4f9b\u66f4\u9ad8\u7684\u541e\u5410\u91cf\uff09\u3002 1. 
\u6982\u8ff0 \u4ece Transformer \u67b6\u6784\u7684 Decoder \u9636\u6bb5\u539f\u7406\u6765\u770b\uff0c\u4e00\u4e2a\u5e38\u89c1\u7684\u3001\u81ea\u7136\u7684\u4f18\u5316\u5c31\u662f\u4f7f\u7528\u201cKV Cache\u201d\u5927\u5927\u51cf\u5c11\u63a8\u7406\uff08\u81ea\u56de\u5f52\u9636\u6bb5\uff09\u8fc7\u7a0b\u9700\u8981\u8ba1\u7b97\u91cf\uff0c\u5b9e\u73b0\u4ee5\u663e\u5b58\u6362\u6548\u7387\uff0c\u4ece\u800c\u52a0\u901f\u63a8\u7406\u8fc7\u7a0b\u3002 2. Decoder \u6a21\u578b\u7684\u81ea\u56de\u5f52\u8ba1\u7b97 \u5728\u4e86\u89e3\u4e86\u201cAttention\u201d\u3001\u201cmask attention\u201d\u3001\u201cautoregression\u201d\u8ba1\u7b97\u4e4b\u540e\uff0c\u6bd4\u8f83\u81ea\u7136\u53ef\u4ee5\u6ce8\u610f\u5230\u5728 Q\u3001K\u3001V \u77e9\u9635\u5728\u201cautoregression\u201d\u7684\u8fc7\u7a0b\u4e2d\uff0c\u6709\u5f88\u591a\u7684\u90e8\u5206\u662f\u65e0\u9700\u989d\u5916\u8ba1\u7b97\u7684\u3002 \u8fd9\u91cc\u4f9d\u65e7\u7ee7\u7eed\u4f7f\u7528\u300a\u7406\u89e3\u5927\u8bed\u8a00\u6a21\u578b\u7684\u6838\u5fc3\uff1aAttention\u300b\u4e2d\u7684\u793a\u4f8b\uff0c\u8fd9\u91cc\u8003\u8651\u5728\u6587\u7ae0\u4e2d\u7684\u63d0\u793a\u8bcd\u201cIt\u2019s very hot in summer. Swimming is\u201d\uff0c\u751f\u6210\u65b0\u7684Token\u4e3a \u201c a\u201d\uff0c\u90a3\u4e48\u6211\u4eec\u770b\u770b\u8fd9\u4e2a\u81ea\u56de\u5f52\u8fc7\u7a0b\u67d0\u4e2aHead\u4e2d\u7684\u8ba1\u7b97\u3002\u5b8c\u6210\u7684\u4ee3\u7801\u53ef\u4ee5\u53c2\u8003\uff1aautoregression-of-attention.ipynb\u3002 \u76f8\u6bd4\u4e0e\u5728 prefill \u9636\u6bb5\uff0c\u9700\u8981\u989d\u5916\u8ba1\u7b97\u7684\uff0c\u5728\u540e\u7eed\u4f7f\u7528\u9ec4\u8272\u6807\u8bc6\u51fa\u6765\u3002 2. 
1 Token Embedding \u548c Positional Embedding Token Embedding + Positional Embedding &#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;-| Token | Token ID | Token Embeddings(first 3 of [&hellip;]<\/p>\n","protected":false},"author":1,"featured_media":22444,"comment_status":"open","ping_status":"closed","sticky":false,"template":"wp-custom-template-a-1440-px-width-template","format":"standard","meta":{"_eb_attr":"","inline_featured_image":false,"_tocer_settings":[],"footnotes":""},"categories":[139],"tags":[],"class_list":["post-21291","post","type-post","status-publish","format-standard","has-post-thumbnail","hentry","category-ai-llm"],"_links":{"self":[{"href":"https:\/\/www.orczhou.com\/index.php\/wp-json\/wp\/v2\/posts\/21291","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/www.orczhou.com\/index.php\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/www.orczhou.com\/index.php\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/www.orczhou.com\/index.php\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/www.orczhou.com\/index.php\/wp-json\/wp\/v2\/comments?post=21291"}],"version-history":[{"count":83,"href":"https:\/\/www.orczhou.com\/index.php\/wp-json\/wp\/v2\/posts\/21291\/revisions"}],"predecessor-version":[{"id":22450,"href":"https:\/\/www.orczhou.com\/index.php\/wp-json\/wp\/v2\/posts\/21291\/revisions\/22450"}],"wp:featuredmedia":[{"embeddable":true,"href":"https:\/\/www.orczhou.com\/index.php\/wp-json\/wp\/v2\/media\/22444"}],"wp:attachment":[{"href":"https:\/\/www.orczhou.com\/index.php\/wp-json\/wp\/v2\/media?parent=21291"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/www.orczhou.com\/index.php\/wp-json\
/wp\/v2\/categories?post=21291"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/www.orczhou.com\/index.php\/wp-json\/wp\/v2\/tags?post=21291"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}