{"id":940,"date":"2026-06-01T14:53:11","date_gmt":"2026-06-01T05:53:11","guid":{"rendered":"http:\/\/kyunam.com\/?p=940"},"modified":"2026-06-01T14:53:11","modified_gmt":"2026-06-01T05:53:11","slug":"vllm%ec%9d%80-%ec%9d%b4%eb%af%b8-%eb%b9%a0%eb%a5%b4%eb%8b%a4-%eb%8b%b9%ec%8b%a0%ec%9d%b4-%ec%84%a4%ec%a0%95%ec%9d%84-%ec%95%88-%ec%bc%b0%ec%9d%84-%eb%bf%90","status":"publish","type":"post","link":"http:\/\/kyunam.com\/?p=940","title":{"rendered":"vLLM\uc740 \uc774\ubbf8 \ube60\ub974\ub2e4. \ub2f9\uc2e0\uc774 \uc124\uc815\uc744 \uc548 \ucf30\uc744 \ubfd0"},"content":{"rendered":"\n\n<style>\n.vllm-post{max-width:860px;margin:0 auto;color:#161410;font-size:17px;line-height:1.78;word-break:keep-all;}\n.vllm-post .kicker{font-size:14px;color:#B8651F;letter-spacing:.04em;margin-bottom:18px;font-style:italic;}\n.vllm-post h1{font-size:40px;line-height:1.25;letter-spacing:-.03em;margin:0 0 24px;font-weight:800;}\n.vllm-post h1 em{color:#1F3D34;font-style:italic;}\n.vllm-post .subtitle{font-size:21px;line-height:1.55;color:#3A352D;margin:0 0 34px;}\n.vllm-post .byline{display:flex;flex-wrap:wrap;gap:12px;font-size:13px;color:#7A746B;border-top:1px solid #D9D0BE;border-bottom:1px solid #D9D0BE;padding:14px 0;margin-bottom:44px;}\n.vllm-post h2{font-size:28px;line-height:1.35;margin:64px 0 8px;font-weight:800;letter-spacing:-.02em;}\n.vllm-post h2 .num{color:#B8651F;margin-right:10px;font-style:italic;font-weight:400;}\n.vllm-post .section-lede{font-size:18px;color:#7A746B;margin:0 0 28px;font-style:italic;}\n.vllm-post h3{font-size:20px;margin:36px 0 12px;font-weight:700;}\n.vllm-post p{margin:0 0 20px;}\n.vllm-post .lead{font-size:19px;line-height:1.75;color:#3A352D;}\n.vllm-post code{font-family:ui-monospace,SFMono-Regular,Menlo,Consolas,monospace;font-size:.88em;background:#F2EDE2;padding:2px 6px;border-radius:4px;color:#1F3D34;}\n.vllm-post pre{background:#1A1814;color:#E8DFC9;padding:22px;border-radius:8px;overflow-x:auto;font-size:13px;line-height:1.7;}\n.vllm-post pre 
code{background:none;color:inherit;padding:0;}\n.vllm-post figure{margin:36px 0;text-align:center;}\n.vllm-post figure img{width:100%;height:auto;border-radius:10px;background:#FAF6EE;}\n.vllm-post figcaption{font-size:14px;color:#7A746B;font-style:italic;line-height:1.55;margin-top:12px;}\n.vllm-post .insight{margin:44px 0;padding:26px 30px;border-top:2px solid #161410;border-bottom:2px solid #161410;font-size:22px;line-height:1.5;font-style:italic;}\n.vllm-post .insight .marker{display:block;font-size:11px;letter-spacing:.15em;color:#B8651F;font-style:normal;font-weight:700;margin-bottom:8px;text-transform:uppercase;}\n.vllm-post .closer{margin-top:64px;padding-top:32px;border-top:1px solid #D9D0BE;color:#7A746B;font-style:italic;font-size:15px;}\n.vllm-post ul,.vllm-post ol{padding-left:22px;margin:0 0 24px;}\n.vllm-post li{margin-bottom:8px;}\n@media(max-width:720px){.vllm-post{font-size:16px}.vllm-post h1{font-size:31px}.vllm-post .subtitle{font-size:18px}.vllm-post h2{font-size:24px}.vllm-post .insight{font-size:18px;padding:22px}}\n<\/style>\n<div class=\"vllm-post\">\n<header>\n<div class=\"kicker\">vLLM Performance \u00b7 Engineering Notes<\/div>\n<p class=\"subtitle\">\ucf54\ub4dc \ud55c \uc904 \uace0\uce58\uc9c0 \uc54a\uace0, vLLM\uc5d0 \uc774\ubbf8 \ub4e4\uc5b4 \uc788\ub294 \ub0b4\uc7a5 \uc124\uc815\ub9cc\uc73c\ub85c Llama-3.3-70B \ucd94\ub860\uc744 6\uac1c \uc6cc\ud06c\ub85c\ub4dc \uc804\ubd80 \uac00\uc18d\ud55c 156\uc140 \uce21\uc815 \ub9ac\ud3ec\ud2b8.<\/p>\n\n<\/header> \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500 <p class=\"lead\">vLLM \uc18c\uc2a4\uc5d0\ub294 \uc190\ub300\uc9c0 \uc54a\uc558\ub2e4. 
<code>speculative_config<\/code>, <code>cudagraph_mode<\/code>, <code>gpu_memory_utilization<\/code> \u2014 vLLM\uc774 \uc774\ubbf8 \uc81c\uacf5\ud558\ub294 \ub0b4\uc7a5 \uc124\uc815 \uba87 \uac1c\ub97c \ucf1c\uace0 \uc870\ud569\ud588\uc744 \ubfd0\uc778\ub370, Llama-3.3-70B (TP=8, H100\u00d78) \ud658\uacbd\uc5d0\uc11c 6\uac1c \uc6cc\ud06c\ub85c\ub4dc \ubaa8\ub450 throughput\uc774 net-positive\ub85c \uc62c\ub790\ub2e4. sonnet \uae30\uc900 +52%, chat\uc740 +69%. \ub2e4\ub9cc \uc774 \uc124\uc815\ub4e4\uc740 \uc544\ubb34 \ub54c\ub098 \ucf1c\uba74 \uc624\ud788\ub824 \ub290\ub824\uc9c4\ub2e4. \uc774 \uae00\uc740 <strong>\uc5b8\uc81c\u00b7\ubb34\uc5c7\uc744 \ucf1c\uc57c \ud558\ub294\uc9c0<\/strong>\ub97c 156\uc140\uc758 \uce21\uc815\uc73c\ub85c \ucc44\uc6b4 \uae30\ub85d\uc774\ub2e4.<\/p><h2><span class=\"num\">01<\/span>\ud55c \uc7a5\uc758 \uadf8\ub9bc\uc73c\ub85c<\/h2><p class=\"section-lede\">\uba3c\uc800 \uacb0\uacfc\ubd80\ud130. 6\uac1c \uc6cc\ud06c\ub85c\ub4dc, \ub450 \uac00\uc9c0 \uc124\uc815.<\/p><p>\uac19\uc740 GPU, \uac19\uc740 \ubaa8\ub378, \uac19\uc740 \uce21\uc815 \uc870\uac74\uc774\ub2e4. spec decoding\uc744 \ub048 <em>vanilla<\/em>\uc640, vLLM \ub0b4\uc7a5 \uc124\uc815\ub9cc \uc870\ud569\ud55c <em>\ucd5c\uc801 \uad6c\uc131<\/em> \u2014 <code>cudagraph_mode=\"PIECEWISE\"<\/code> + <code>method=\"suffix\"<\/code> + <code>num_speculative_tokens=32<\/code> + <code>gpu_memory_utilization=0.80<\/code> \u2014 \uc758 \ube44\uad50\ub2e4. \uc774 \uae00\uc5d0\uc11c\ub294 \uc774 \uad6c\uc131\uc744 \ud3b8\uc758\uc0c1 <strong>Trident<\/strong>\ub77c \ubd80\ub974\uaca0\ub2e4.<\/p><figure>\n<img decoding=\"async\" alt=\"\ubaa8\ub4e0 \uc6cc\ud06c\ub85c\ub4dc\uc5d0\uc11c net-positive. mix-ch\ucc98\ub7fc code \ube44\uc911\uc774 70%\uc778 \uae4c\ub2e4\ub85c\uc6b4 \ubd84\ud3ec\uc5d0\uc11c\ub3c4 +45.6%. 
SUB_093\" loading=\"lazy\" src=\"http:\/\/kyunam.com\/wp-content\/uploads\/2026\/06\/vllm-figure-01.png\">\n<figcaption>\ubaa8\ub4e0 \uc6cc\ud06c\ub85c\ub4dc\uc5d0\uc11c net-positive. mix-ch\ucc98\ub7fc code \ube44\uc911\uc774 70%\uc778 \uae4c\ub2e4\ub85c\uc6b4 \ubd84\ud3ec\uc5d0\uc11c\ub3c4 +45.6%. <span class=\"src\">SUB_093<\/span><\/figcaption>\n<\/figure><p>\uc5ec\uae30\uae4c\uc9c0\uac00 \uacb0\uacfc \uc694\uc57d\uc774\ub2e4. 6\uac1c \uc6cc\ud06c\ub85c\ub4dc \ubaa8\ub450 +18.8%\ubd80\ud130 +68.9%\uae4c\uc9c0 \ud5a5\uc0c1, wall time 31% \ub2e8\ucd95. \uadf8\ub9ac\uace0 \uc774\uac78 \ub9cc\ub4e0 \uac74 <strong>\ub531 \ub124 \uac1c\uc758 vLLM \ub0b4\uc7a5 \uc124\uc815<\/strong>\uc774\ub2e4.<\/p><h2><span class=\"num\">02<\/span>\ub531 \ub124 \uac1c\uc758 \uc124\uc815<\/h2><p class=\"section-lede\">\uc804\ubd80 vLLM\uc5d0 \uc774\ubbf8 \ub4e4\uc5b4 \uc788\ub2e4. \ucf1c\uae30\ub9cc \ud558\uba74 \ub41c\ub2e4.<\/p><p>\uc704 \uadf8\ub798\ud504\uc758 \uac00\uc18d\uc740 \ub2e4\uc74c \ub124 \uac1c\uc758 lever\ub97c \uc870\ud569\ud55c \uacb0\uacfc\ub2e4. \uc0c8 \ub77c\uc774\ube0c\ub7ec\ub9ac\ub3c4, \uc18c\uc2a4 \ud328\uce58\ub3c4 \uc5c6\ub2e4. <code>LLM()<\/code> \uc0dd\uc131\uc790\uc5d0 \uc778\uc790 \uba87 \uac1c\ub97c \ub118\uae30\ub294 \uac8c \uc804\ubd80\ub2e4.<\/p><ol>\n<li><strong><code>speculative_config<\/code><\/strong> \u2014 speculative decoding\uc744 \ucf1c\ub294 \uc2a4\uc704\uce58. drafter\uac00 \ub2e4\uc74c \ud1a0\ud070 \uc5ec\ub7ec \uac1c\ub97c \ubbf8\ub9ac \ucd94\uce21\ud558\uace0 \ubcf8 \ubaa8\ub378\uc774 \ud55c \ubc88\uc5d0 \uac80\uc99d\ud55c\ub2e4. \ubc1b\uc544\ub4e4\uc5ec\uc9c0\ub294 \ud1a0\ud070\uc774 \ub9ce\uc744\uc218\ub85d forward step \uc218\uac00 \uc904\uc5b4 \ube68\ub77c\uc9c4\ub2e4.<\/li>\n<li><strong><code>method<\/code> (ngram vs suffix)<\/strong> \u2014 drafter\uac00 \ud6c4\ubcf4\ub97c \uc5b4\ub514\uc11c \ucc3e\ub294\uc9c0. 
ngram\uc740 prompt \uc548\uc5d0\uc11c\ub9cc, suffix\ub294 prompt\uc640 \uc774\ubbf8 \uc0dd\uc131\ud55c \ud1a0\ud070 \uc591\ucabd\uc5d0\uc11c \ucc3e\ub294\ub2e4. \uc774 \ucc28\uc774\uac00 \uc6cc\ud06c\ub85c\ub4dc\ubcc4 \uc131\ud328\ub97c \uac00\ub978\ub2e4.<\/li>\n<li><strong><code>num_speculative_tokens<\/code><\/strong> \u2014 \ud55c \ubc88\uc5d0 \uba87 \uac1c\ub97c \ucd94\uce21\ud560\uc9c0. ngram\uc740 7, suffix\ub294 32\uac00 \ubcf8 \ud658\uacbd\uc758 sweet spot\uc774\ub2e4.<\/li>\n<li><strong><code>cudagraph_mode=\"PIECEWISE\"<\/code> + <code>gpu_memory_utilization=0.80<\/code><\/strong> \u2014 suffix\uc758 \uac00\ubcc0 draft \uae38\uc774\ub97c CUDA graph\uac00 \ub9e4\ubc88 \ub2e4\uc2dc \ucea1\ucc98\ud558\uc9c0 \uc54a\ub3c4\ub85d PIECEWISE\ub85c \uc790\ub974\uace0, \uadf8\ub9cc\ud07c \ud544\uc694\ud55c \uba54\ubaa8\ub9ac \uc5ec\uc720\ub97c gmu\ub97c \ub0ae\ucdb0 \ud655\ubcf4\ud55c\ub2e4.<\/li>\n<\/ol><p>\uc911\uc694\ud55c \uac74 &#8220;\uc774\uac78 \ucf1c\uba74 \ubb34\uc870\uac74 \ube68\ub77c\uc9c4\ub2e4&#8221;\uac00 \uc544\ub2c8\ub77c\ub294 \uc810\uc774\ub2e4. \uac19\uc740 \uc124\uc815\uc774 \uc5b4\ub5a4 \uc6cc\ud06c\ub85c\ub4dc\uc5d0\uc11c\ub294 +52%, \uc5b4\ub5a4 \uc6cc\ud06c\ub85c\ub4dc\uc5d0\uc11c\ub294 \u221220% \ud68c\uadc0\ub97c \ub9cc\ub4e0\ub2e4. 7B \uc774\ud558 \ubaa8\ub378\uc5d0\uc11c\ub294 \ubb34\uc5c7\uc744 \ucf1c\ub3c4 \ub290\ub824\uc9c4\ub2e4. \uadf8\ub798\uc11c \ud575\uc2ec \uc9c8\ubb38\uc740 &#8220;\uc5b4\ub5bb\uac8c \ucf1c\ub290\ub0d0&#8221;\uac00 \uc544\ub2c8\ub77c <strong>&#8220;\uc5b8\uc81c \ucf1c\ub290\ub0d0&#8221;<\/strong>\ub2e4. 
\uadf8 \ub2f5\uc744 \ub2e4\uc74c \uc808\uc758 \ub2e8\uc77c \ubd80\ub4f1\uc2dd\uc774 \uc124\uba85\ud55c\ub2e4.<\/p><h2><span class=\"num\">03<\/span>\ubaa8\ub4e0 \uacb0\uacfc\ub97c \uad00\ud1b5\ud558\ub294 \ud558\ub098\uc758 \ubd80\ub4f1\uc2dd<\/h2><p class=\"section-lede\">\uc65c \uc5b4\ub5a4 \uc6cc\ud06c\ub85c\ub4dc\ub294 \uac00\uc18d\uc774\uace0 \uc5b4\ub5a4 \uc6cc\ud06c\ub85c\ub4dc\ub294 \ud68c\uadc0\uc778\uac00.<\/p><p>\uc774 \ubaa8\ub4e0 \uce21\uc815 \uacb0\uacfc\ub97c \ud55c \uac1c\uc758 \ubd80\ub4f1\uc2dd\uc73c\ub85c \uc124\uba85\ud560 \uc218 \uc788\ub2e4. speculative decoding\uc758 \uac00\uc18d\u00b7\ud68c\uadc0 \uc5ec\ubd80\ub294 <em>K\uc640 R\uc758 \ub300\uc18c\uad00\uacc4<\/em>\uac00 \uacb0\uc815\ud55c\ub2e4.<\/p><figure>\n<img decoding=\"async\" alt=\"\uac19\uc740 code \uc6cc\ud06c\ub85c\ub4dc\uc5d0\uc11c ngram\uc740 K=1.09\ub85c R=1.30\uc744 \ubabb \ub118\uc5b4 \u221220.7% \ud68c\uadc0, suffix\ub294 K=4.08\ub85c R\uc744 \ub118\uaca8 +18.9% \uac00\uc18d. \ub9c9\ub300 \ub192\uc774\uac00 R\u00b7K\uc758 \ub300\uc18c\ub97c, \ubd80\ub4f1\uc2dd\uc774 \ubd80\ud638\ub97c \uacb0\uc815\ud55c\ub2e4.\" loading=\"lazy\" src=\"http:\/\/kyunam.com\/wp-content\/uploads\/2026\/06\/vllm-figure-02.png\">\n<figcaption>\uac19\uc740 code \uc6cc\ud06c\ub85c\ub4dc\uc5d0\uc11c ngram\uc740 K=1.09\ub85c R=1.30\uc744 \ubabb \ub118\uc5b4 \u221220.7% \ud68c\uadc0, suffix\ub294 K=4.08\ub85c R\uc744 \ub118\uaca8 +18.9% \uac00\uc18d. \ub9c9\ub300 \ub192\uc774\uac00 R\u00b7K\uc758 \ub300\uc18c\ub97c, \ubd80\ub4f1\uc2dd\uc774 \ubd80\ud638\ub97c \uacb0\uc815\ud55c\ub2e4.<\/figcaption>\n<\/figure><p>R\uc740 spec step \ud55c \ubc88\uc758 overhead(vanilla step time \ub300\ube44 \ubc30\uc218)\uc774\uace0, K\ub294 \ud55c draft \uc0ac\uc774\ud074\uc5d0\uc11c \ud3c9\uade0\uc801\uc73c\ub85c \ubc1b\uc544\ub4e4\uc5ec\uc9c0\ub294 \ud1a0\ud070\uc758 \uc218\ub2e4. 
\ubcf8 \ud658\uacbd\uc5d0\uc11c R\uc740 \ub300\ub7b5 1.25~1.40 \uc0ac\uc774\ub85c \uc6cc\ud06c\ub85c\ub4dc\uc640 \uac70\uc758 \ubb34\uad00\ud558\uac8c \uc77c\uc815\ud558\ub2e4. \ub530\ub77c\uc11c \ubaa8\ub4e0 \ubcc0\ub3d9\uc740 \uc0ac\uc2e4\uc0c1 K\uc5d0\uc11c \uc628\ub2e4.<\/p><p>Leviathan et al. (ICML 2023)\uc758 closed-form\uc740 <code>K = (1\u2212\u03b1^(\u03b3+1))\/(1\u2212\u03b1)<\/code>\uc9c0\ub9cc, \ubcf8 \ud658\uacbd ngram drafter\uc758 \uc2e4\uce21\uc740 linear \uadfc\uc0ac <code>K \u2248 1 + \u03b1 \u00d7 \u03b3<\/code>\uac00 \ub354 \uc798 \ub9de\ub294\ub2e4. \uc774\uc720\ub294 ngram drafter\uc758 per-position acceptance\uac00 i.i.d.\uac00 \uc544\ub2c8\ub77c strong positive correlation\uc744 \uac16\uae30 \ub54c\ubb38\uc774\ub2e4 \u2014 \ud55c position\uc774 accept\ub418\uba74 \ub2e4\uc74c\ub3c4 accept\ub420 \ud655\ub960\uc774 \ub192\ub2e4.<\/p><h2><span class=\"num\">04<\/span>\uc124\uc815\uc744 \ud558\ub098\uc529 \ucf1c \ubcf4\uba74<\/h2><p class=\"section-lede\">vanilla\uc5d0\uc11c \uc2dc\uc791\ud574 \uc124\uc815 \uc138 \uac1c\ub97c \ucc28\ub840\ub85c \ucf20\ub2e4.<\/p><p>\ucd5c\uc801 \uad6c\uc131\uc740 \ud55c \ubc88\uc5d0 \ub098\uc628 \uac8c \uc544\ub2c8\ub2e4. vanilla\uc5d0\uc11c \ucd9c\ubc1c\ud574 vLLM \ub0b4\uc7a5 \uc124\uc815\uc744 \ud558\ub098\uc529 \ucf1c \ubcf4\uba74 \uac01 \uc124\uc815\uc774 \uc5bc\ub9c8\ub098 \uae30\uc5ec\ud558\ub294\uc9c0 \ubd84\ub9ac\ud574\uc11c \ubcfc \uc218 \uc788\ub2e4. sonnet \uc6cc\ud06c\ub85c\ub4dc \uae30\uc900 throughput \ubcc0\ud654\ub294 \ub2e4\uc74c\uacfc \uac19\ub2e4.<\/p><figure>\n<img decoding=\"async\" alt=\"vanilla \u2192 ngram spec \u2192 suffix \uad50\uccb4 \u2192 cudagraph PIECEWISE. 
\ud5a5\uc0c1\uc758 \ub300\ubd80\ubd84\uc740 \uccab \ub2e8\uacc4(speculative decoding \ucf1c\uae30)\uc5d0\uc11c \ub098\uc624\uace0, \uc774\ud6c4 \ub450 \ub2e8\uacc4\uac00 \ud68c\uadc0 \uc6cc\ud06c\ub85c\ub4dc\ub97c \ub9c8\uc800 \ub04c\uc5b4\uc62c\ub9b0\ub2e4.\" loading=\"lazy\" src=\"http:\/\/kyunam.com\/wp-content\/uploads\/2026\/06\/vllm-figure-03.png\">\n<figcaption>vanilla \u2192 ngram spec \u2192 suffix \uad50\uccb4 \u2192 cudagraph PIECEWISE. \ud5a5\uc0c1\uc758 \ub300\ubd80\ubd84\uc740 \uccab \ub2e8\uacc4(speculative decoding \ucf1c\uae30)\uc5d0\uc11c \ub098\uc624\uace0, \uc774\ud6c4 \ub450 \ub2e8\uacc4\uac00 \ud68c\uadc0 \uc6cc\ud06c\ub85c\ub4dc\ub97c \ub9c8\uc800 \ub04c\uc5b4\uc62c\ub9b0\ub2e4.<\/figcaption>\n<\/figure><h3>\u2460 <code>speculative_config<\/code> \ucf1c\uae30<\/h3><p>vanilla LLM \uc0dd\uc131\uc790\uc5d0 <code>speculative_config={\"method\":\"ngram\", \"num_speculative_tokens\":7, \"prompt_lookup_max\":5, \"prompt_lookup_min\":2}<\/code>\ub97c \ucd94\uac00\ud558\ub294 \uac83\uc774 \uc804\ubd80\ub2e4. sonnet 7,710 \u2192 10,778 tps (+39.8%). \ud5a5\uc0c1\uc758 \ub300\ubd80\ubd84\uc774 \uc774 \ud55c \ub2e8\uacc4\uc5d0\uc11c \ub098\uc628\ub2e4. \uadf8\ub9cc\ud07c &#8220;\uadf8\ub0e5 \uc548 \ucf1c\uace0 \uc788\uc5c8\uc744 \ubfd0&#8221;\uc778 \uacbd\uc6b0\uac00 \ub9ce\ub2e4\ub294 \ub73b\uc774\uae30\ub3c4 \ud558\ub2e4.<\/p><h3>\u2461 suffix decoder\ub85c \uad50\uccb4<\/h3><p><code>method<\/code>\ub97c <code>\"ngram\"<\/code>\uc5d0\uc11c <code>\"suffix\"<\/code>\ub85c \ubc14\uafb8\uace0 <code>num_speculative_tokens<\/code>\uc744 7\uc5d0\uc11c 32\ub85c \uc62c\ub9b0\ub2e4. \ub3d9\uc2dc\uc5d0 <code>gpu_memory_utilization<\/code>\uc744 0.80\uc73c\ub85c \ub0b4\ub824 cudagraph\uac00 \uc4f8 memory headroom\uc744 \ud655\ubcf4\ud55c\ub2e4. 
suffix decoder\ub294 prompt\uc640 \uc774\uc804 generation \uc591\ucabd\uc744 suffix tree\ub85c \uc778\ub371\uc2f1\ud558\uae30 \ub54c\ubb38\uc5d0 K(\ud3c9\uade0 \ucc44\ud0dd \uae38\uc774)\uac00 ngram \ub300\ube44 1.7~3.7\ubc30 \ucee4\uc9c0\uba70, \ud2b9\ud788 code \uc6cc\ud06c\ub85c\ub4dc\uc5d0\uc11c\ub294 K\uac00 1.09\uc5d0\uc11c 4.08\ub85c 3.74\ubc30 \ud5a5\uc0c1\ub418\uc5b4 \ud68c\uadc0 \uc601\uc5ed\uc744 net-positive\ub85c \uc804\ud658\uc2dc\ud0a8\ub2e4.<\/p><h3>\u2462 cudagraph PIECEWISE<\/h3><p><code>compilation_config={\"cudagraph_mode\":\"PIECEWISE\"}<\/code> \ud55c \uc904 \ucd94\uac00. vLLM v0.11.0 \uc774\ud6c4 default\uac00 <code>FULL_AND_PIECEWISE<\/code>\ub85c \ubc14\ub00c\uc5c8\ub294\ub370, FULL \ubaa8\ub4dc\ub294 \uc804\uccb4 forward\ub97c \ud558\ub098\uc758 \uadf8\ub798\ud504\ub85c \ucea1\ucc98\ud558\ubbc0\ub85c suffix\uc758 \uac00\ubcc0 draft \uae38\uc774\ub97c \ucc98\ub9ac\ud560 \ub54c\ub9c8\ub2e4 recapture cost\uac00 \ubc1c\uc0dd\ud55c\ub2e4. PIECEWISE\ub294 attention\/MLP \ub2e8\uc704\ub85c \uadf8\ub798\ud504\ub97c \uc798\ub77c dynamic shape\ub97c \ud5c8\uc6a9\ud558\uba74\uc11c launch overhead\ub97c \ud68c\uc218\ud55c\ub2e4. \ud2b9\ud788 chat \uc6cc\ud06c\ub85c\ub4dc\uc5d0\uc11c FULL \ub300\ube44 eager penalty\ub97c \u221223%\uc5d0\uc11c +18.9%\ub85c \ub4a4\uc9d1\ub294\ub2e4.<\/p><h2><span class=\"num\">05<\/span>\uc65c \uc6cc\ud06c\ub85c\ub4dc\ub9c8\ub2e4 \ub2e4\ub974\uac8c \ubc18\uc751\ud558\ub294\uac00<\/h2><p class=\"section-lede\">prompt \uad6c\uc870\uac00 \uacb0\uc815\ud55c\ub2e4.<\/p><p>\uac19\uc740 spec configuration\uc774 sonnet\uc5d0\uc11c\ub294 +52%, chat\uc5d0\uc11c\ub294 +69%, code\uc5d0\uc11c\ub294 +19%\ub85c \ub098\ud0c0\ub098\ub294 \uc774\uc720\ub97c \ub2e8\uc21c\ud788 &#8220;\uc6cc\ud06c\ub85c\ub4dc\uac00 \ub2e4\ub974\ub2c8\uae4c&#8221;\ub85c \ub05d\ub0b4\uc9c0 \uc54a\uace0 mechanism\uc73c\ub85c \uc124\uba85\ud560 \uc218 \uc788\ub2e4. 
\ud575\uc2ec\uc740 K (mean accepted length)\uc640 \u03b1 (per-position acceptance rate)\uac00 \uc6cc\ud06c\ub85c\ub4dc\ubcc4 prompt \uad6c\uc870\uc5d0 \uc5b4\ub5bb\uac8c \uc758\uc874\ud558\ub294\uc9c0\ub2e4.<\/p><figure>\n<img decoding=\"async\" alt=\"\uc138 \uc6cc\ud06c\ub85c\ub4dc, \ub450 method\uc758 K \u00b7 \u03b1 \ube44\uad50. \ube68\uac04 \uc810\uc120\uc740 overhead \uae30\uc900\uc120 R\u22481.30 \u2014 code \u00b7 ngram \ub9c9\ub300\ub9cc \uadf8 \uc544\ub798\uc5d0 \uc788\uc5b4 \ud68c\uadc0\ud55c\ub2e4. SUB_075 \/ SUB_085\" loading=\"lazy\" src=\"http:\/\/kyunam.com\/wp-content\/uploads\/2026\/06\/vllm-figure-04.png\">\n<figcaption>\uc138 \uc6cc\ud06c\ub85c\ub4dc, \ub450 method\uc758 K \u00b7 \u03b1 \ube44\uad50. \ube68\uac04 \uc810\uc120\uc740 overhead \uae30\uc900\uc120 R\u22481.30 \u2014 code \u00b7 ngram \ub9c9\ub300\ub9cc \uadf8 \uc544\ub798\uc5d0 \uc788\uc5b4 \ud68c\uadc0\ud55c\ub2e4. <span class=\"src\">SUB_075 \/ SUB_085<\/span><\/figcaption>\n<\/figure><p><strong>sonnet<\/strong>\uc758 prompt\ub294 \uc170\uc775\uc2a4\ud53c\uc5b4 sonnet\uc5d0\uc11c random sampling\ud55c 164\uc904(\uc804\uccb4 517\uc904 \uc911)\ub85c \uad6c\uc131\ub41c\ub2e4. with replacement sampling\uc774\ubbc0\ub85c birthday problem\uc5d0 \uc758\ud574 \ud3c9\uade0 24\uac1c\uc758 line\uc774 \uc911\ubcf5 \ub4f1\uc7a5\ud558\uace0, sonnet \uc5b4\ud718 \uc790\uccb4\uac00 unique word 5,000 \ubbf8\ub9cc\uc73c\ub85c \uc81c\ud55c\uc801\uc774\uba70 <code>thou<\/code>, <code>thy<\/code>, <code>love<\/code>, <code>time<\/code> \uac19\uc740 \ub2e8\uc5b4\uac00 \ub9e4\uc6b0 dense\ud55c bi-gram pool\uc744 \ud615\uc131\ud55c\ub2e4. 
\uacb0\uc815\uc801\uc73c\ub85c generated response\uac00 \uac19\uc740 sonnet style\uc758 continuation\uc774\uae30 \ub54c\ubb38\uc5d0 prompt\uc5d0 \uc774\ubbf8 \ub4f1\uc7a5\ud55c token sequence\uac00 \uadf8\ub300\ub85c \uc778\uc6a9\ub418\ub294 \ube44\uc728\uc774 \ub192\ub2e4.<\/p><p><strong>chat<\/strong>\uc758 prompt\ub294 \ud765\ubbf8\ub86d\uac8c\ub3c4 80%\uac00 sonnet excerpt\uc774\uace0 \ub05d\uc5d0\ub9cc &#8220;What is the author&#8217;s intent?&#8221; \uac19\uc740 \uc9c8\ubb38\uc774 \ubd99\ub294 \ud615\ud0dc\ub2e4. ngram pool \uc790\uccb4\ub294 sonnet\uacfc \uac70\uc758 \ub3d9\uc77c\ud558\uac8c dense\ud558\ubbc0\ub85c per-draft acceptance\uac00 \uc77c\uc5b4\ub0a0 \ub54c\uc758 K\uac00 sonnet\ubcf4\ub2e4 \ub354 \ub192\uac8c \ub098\uc628\ub2e4 \u2014 \uc774\uac83\uc774 SUB_075\uc758 surprise\uc600\ub2e4. \uc0ac\uc804 \uc608\uce21 sonnet \u226b chat\uc744 \ub4a4\uc9d1\uc740 \uacb0\uacfc\ub2e4. \uadf8\ub7ec\ub098 \uc751\ub2f5\uc774 \uc9e7\uae30 \ub54c\ubb38\uc5d0 (\ud3c9\uade0 660 token, EOS \uc870\uae30 \ub3c4\ub2ec) coverage\uac00 \ub0ae\uc544 throughput \ud5a5\uc0c1\ud3ed\uc740 sonnet\uacfc \ube44\uc2b7\ud55c \uc218\uc900\uc5d0 \uadf8\uce5c\ub2e4.<\/p><p><strong>code<\/strong>\ub294 builder\uac00 \uc758\ub3c4\uc801\uc73c\ub85c \uc758\ubbf8 \uc5c6\ub294 word salad \uc8fc\uc11d\uc744 \ucc44\uc6cc \ub123\uc740 \ud568\uc218 stub\uc774\ub2e4. prompt \ubcf8\ubb38\uc774 <code># comment line N: &lt;8 random words&gt;<\/code> \ud615\ud0dc\uc774\uace0 \uc5b4\ud718 pool\uc774 \ub2e8 8 \ub2e8\uc5b4(<code>python, code, function, list, dict, loop, return, test<\/code>)\uc5d0 \ubd88\uacfc\ud574 bi-gram\uc758 unique \uac00\ub2a5 \uc870\ud569\uc774 64\uac1c\ubfd0\uc774\ub2e4. \uadf8\ub7f0\ub370 model\uc774 \uc2e4\uc81c\ub85c \uc0dd\uc131\ud558\ub294 \uac83\uc740 \uc9c4\uc9dc Python \ucf54\ub4dc\uc774\ubbc0\ub85c prompt\uc758 word salad\uc640 generated code \uc0ac\uc774\uc5d0 \uacf5\ud1b5 \ud1a0\ud070\uc774 \uac70\uc758 \uc5c6\ub2e4. 
ngram\uc758 K=1.09, \u03b1=1.4%\uac00 \ub098\uc624\ub294 \uc774\uc720\ub2e4.<\/p><p>\uc5ec\uae30\uc11c suffix decoder\uac00 code\ub97c net-positive\ub85c \ub04c\uc5b4\uc62c\ub9ac\ub294 mechanism\uc774 \ud765\ubbf8\ub86d\ub2e4. ngram\uc774 prompt \uc548\uc5d0\uc11c\ub9cc KMP match\ub97c \uc2dc\ub3c4\ud558\ub294 \ubc18\uba74 suffix\ub294 prompt\uc640 <em>\uc774\uc804 generated tokens \uc591\ucabd<\/em>\uc744 suffix tree\ub85c \uc778\ub371\uc2f1\ud558\uae30 \ub54c\ubb38\uc5d0, model\uc774 \ud55c \ubc88 \uc0dd\uc131\ud55c Python keyword(<code>for<\/code>, <code>if<\/code>, <code>return<\/code>, <code>range<\/code>, <code>len<\/code>)\uac00 \uc774\ud6c4 step\uc5d0\uc11c self-lookup\uc73c\ub85c hit\ub41c\ub2e4. \uacb0\uacfc\uc801\uc73c\ub85c code \uc6cc\ud06c\ub85c\ub4dc\uc5d0\uc11c K\uac00 3.74\ubc30, \u03b1\uac00 33\ubc30 \uc99d\uac00\ud558\uc5ec K &gt; R \uc870\uac74\uc744 \ub9cc\uc871\ud558\uac8c \ub41c\ub2e4.<\/p><h2><span class=\"num\">06<\/span>\ubaa8\ub378 \ud06c\uae30\ub294 \uc5b4\ub514\uae4c\uc9c0 spec decoding\uc744 \ud5c8\uc6a9\ud558\ub294\uac00<\/h2><p class=\"section-lede\">7B\uc640 14B \uc0ac\uc774 \uc5b4\ub518\uac00\uc758 \uacbd\uacc4.<\/p><p>spec decoding\uc740 \ubb34\ub8cc\uac00 \uc544\ub2c8\ub2e4. R\uc774 vanilla step time\uc5d0 \ubc18\ube44\ub840\ud558\uc5ec \ub298\uc5b4\ub098\uae30 \ub54c\ubb38\uc5d0, vanilla step time\uc774 \uc9e7\uc740 small model\uc5d0\uc11c\ub294 R\uc774 \ud3ed\uc99d\ud574\uc11c \uc5b4\ub5a4 K\ub85c\ub3c4 \ub530\ub77c\uc7a1\uc744 \uc218 \uc5c6\uac8c \ub41c\ub2e4. 5\uac1c \ubaa8\ub378\u00b72\uac1c \ud558\ub4dc\uc6e8\uc5b4 cross-validation\uc73c\ub85c \uc774\uac8c universal regression\uc784\uc774 \uc785\uc99d\ub418\uc5c8\ub2e4.<\/p><figure>\n<img decoding=\"async\" alt=\"opt-125m\ubd80\ud130 Llama 70B\uae4c\uc9c0. 7B\uc640 14B \uc0ac\uc774\uac00 boundary. 14B \uc774\uc0c1\uc740 \uc548\uc804, 7B \uc774\ud558\ub294 vanilla \uad8c\uc7a5. 
SUB_078 \/ SUB_090 \/ SUB_091 \/ SUB_093 \/ SUB_096\" loading=\"lazy\" src=\"http:\/\/kyunam.com\/wp-content\/uploads\/2026\/06\/vllm-figure-05.png\">\n<figcaption>opt-125m\ubd80\ud130 Llama 70B\uae4c\uc9c0. 7B\uc640 14B \uc0ac\uc774\uac00 boundary. <strong>14B \uc774\uc0c1\uc740 \uc548\uc804, 7B \uc774\ud558\ub294 vanilla \uad8c\uc7a5.<\/strong> <span class=\"src\">SUB_078 \/ SUB_090 \/ SUB_091 \/ SUB_093 \/ SUB_096<\/span><\/figcaption>\n<\/figure><p>opt-125m\uc744 ngram\uacfc \ud568\uaed8 \ub3cc\ub9ac\uba74 \uc678\ubd80 vLLM Issue #16258 (2\u00d7L4 \ud658\uacbd)\uc5d0\uc11c 2.12\ubc30 \ud68c\uadc0\uac00 \ubcf4\uace0\ub418\uc5c8\ub2e4. \uac19\uc740 \ubaa8\ub378\uc744 H100\u00d71 \ud658\uacbd\uc5d0\uc11c \uce21\uc815\ud55c \uacb0\uacfc <strong>2.13\ubc30 \ud68c\uadc0<\/strong>\ub77c\ub294 \uc815\ud655\ud55c \uc77c\uce58\ub97c \uc5bb\uc5c8\ub2e4. starcoder2-3b 2.30\ubc30, Qwen 0.5B 2.46\ubc30, Qwen 1.5B 2.63\ubc30 \ud68c\uadc0, suffix\ub97c \uc368\ub3c4 \ud68c\uadc0 \u2014 5\uac1c \ubaa8\ub378, 2\uac1c \ud558\ub4dc\uc6e8\uc5b4, 2\uac1c spec method \ubaa8\ub450\uc5d0\uc11c small model\uc758 universal regression\uc774 hardware-independent\uc784\uc774 \ud655\uc778\ub418\uc5c8\ub2e4. \uc989 \uc791\uc740 \ubaa8\ub378\uc5d0\uc11c\ub294 \uc124\uc815\uc744 \ucf1c\ub294 \uac83 \uc790\uccb4\uac00 \uc190\ud574\ub2e4.<\/p><p>boundary\uac00 \uc5b4\ub514\uc778\uc9c0\ub294 SUB_090\uacfc SUB_096\uc758 \ucd94\uac00 \uce21\uc815\uc73c\ub85c \uc881\ud600\uc84c\ub2e4. Qwen 7B\uc758 ngram\uc774 \u221217.3%\ub85c boundary\uc5d0 \uadfc\uc811\ud588\uace0 (\uc5ec\uc804\ud788 \ud68c\uadc0\uc774\uc9c0\ub9cc \ud3ed\uc774 \uc791\uc544\uc9d0), Phi-3-medium 14B\ub294 6\/6 \uc6cc\ud06c\ub85c\ub4dc \ubaa8\ub450 net-positive (sonnet +90%, mix-balanced +117%). \ub530\ub77c\uc11c <strong>boundary\ub294 7B\uc640 14B \uc0ac\uc774<\/strong>\uc5d0 \uc788\ub2e4.<\/p><p>\ud765\ubbf8\ub85c\uc6b4 \ubd80\uc218 \uad00\ucc30\uc740 same-size cross-vendor \ucc28\uc774\ub2e4. 
Llama 3.3-70B\ub294 6\/6 \ubaa8\ub450 net-positive (+19%~+69%)\uc778\ub370 \ube44\ud574 Qwen 2.5-72B\ub294 code \uc6cc\ud06c\ub85c\ub4dc\uc5d0\uc11c\ub9cc \u22125%\ub85c \ud68c\uadc0\ud558\ub294 5\/6 \uacb0\uacfc\uc600\ub2e4. \ubaa8\ub378 architecture\uac00 spec acceptance\uc5d0 \uc815\ub7c9\uc801\uc73c\ub85c \uc601\ud5a5\uc744 \ubbf8\uce5c\ub2e4\ub294 \uccab \uad00\uce21\uc774\uba70, &#8220;model size\uac00 \uac19\ub2e4\uace0 \uac19\uc740 spec \ud6a8\uacfc\ub97c \uae30\ub300\ud558\uba74 \uc548 \ub41c\ub2e4&#8221;\ub294 \uacbd\uacc4 \uc2e0\ud638\ub2e4.<\/p><h2><span class=\"num\">07<\/span>\uc2e4\uc804 \uc801\uc6a9 \u2014 production decision tree<\/h2><p class=\"section-lede\">\ubaa8\ub378 \ud06c\uae30\uc640 vendor\ub97c \ubcf4\uace0 \uacb0\uc815\ud55c\ub2e4.<\/p><ul>\n<li><strong>\u2264 7B (small model)<\/strong> \u2014 \uc5b4\ub5a4 spec method\ub3c4 net-negative. <em>vanilla\ub9cc \uc0ac\uc6a9<\/em>. 5\uac1c \ubaa8\ub378 cross-validation \uacb0\uacfc\uc774\uba70, hardware-independent.<\/li>\n<li><strong>14B class<\/strong> (Phi-3 14B \ud655\uc778) \u2014 Trident \uc124\uc815 \uc989\uc2dc \uc801\uc6a9. 6\/6 net-positive.<\/li>\n<li><strong>32B class<\/strong> (Qwen 32B \ud655\uc778) \u2014 Trident \uc124\uc815 \uad8c\uc7a5. sonnet\/chat\/code \ubaa8\ub450 net-positive.<\/li>\n<li><strong>70~72B class<\/strong> \u2014 vendor\uc5d0 \ub530\ub978 \ubbf8\uc138 \uc870\uc815. Llama 3.3-70B\ub294 6\/6 net-positive\uc774\ubbc0\ub85c Trident \uc124\uc815 \uadf8\ub300\ub85c, Qwen 2.5-72B\ub294 code \uc6cc\ud06c\ub85c\ub4dc \ube44\uc911\uc774 \ub192\uc73c\uba74 vanilla\ub85c \ub5a8\uc5b4\ub728\ub9ac\ub294 \uc6b4\uc601 \uc815\ucc45\uc774 \uc548\uc804.<\/li>\n<\/ul><p>\ud65c\uc131\ud654 \ucf54\ub4dc\ub294 \ub2e8\uc77c LLM \uc0dd\uc131\uc790 \ud638\ucd9c\ub85c \uc815\ub9ac\ub41c\ub2e4. 
\ubcc4\ub3c4 patch \uc5c6\uc774 vLLM built-in flag\ub9cc \uc0ac\uc6a9\ud55c\ub2e4.<\/p><pre><code><span class=\"kw\">from<\/span> vllm <span class=\"kw\">import<\/span> LLM, SamplingParams\n\nllm = LLM(\n    model=<span class=\"str\">\"meta-llama\/Llama-3.3-70B-Instruct\"<\/span>,\n    tensor_parallel_size=<span class=\"num\">8<\/span>,\n    max_model_len=<span class=\"num\">16384<\/span>,\n    max_num_seqs=<span class=\"num\">256<\/span>,\n    gpu_memory_utilization=<span class=\"num\">0.80<\/span>,        <span class=\"cmt\"># cudagraph headroom<\/span>\n    enforce_eager=<span class=\"kw\">False<\/span>,\n    kv_cache_dtype=<span class=\"str\">\"fp8\"<\/span>,\n    max_num_batched_tokens=<span class=\"num\">8192<\/span>,\n    disable_log_stats=<span class=\"kw\">True<\/span>,\n    seed=<span class=\"num\">0<\/span>,\n    compilation_config={<span class=\"str\">\"cudagraph_mode\"<\/span>: <span class=\"str\">\"PIECEWISE\"<\/span>},\n    speculative_config={\n        <span class=\"str\">\"method\"<\/span>: <span class=\"str\">\"suffix\"<\/span>,\n        <span class=\"str\">\"num_speculative_tokens\"<\/span>: <span class=\"num\">32<\/span>,\n    },\n)<\/code><\/pre><p>\ud658\uacbd \uc124\uc815\uc5d0\uc11c \ube60\ub728\ub9ac\uba74 \uc548 \ub418\ub294 \ub450 \uac00\uc9c0 env\uac00 \uc788\ub2e4. 
\ud558\ub098\ub294 <code>ARCTIC_INFERENCE_ENABLED=0<\/code>\uacfc <code>VLLM_PLUGINS=\"\"<\/code>\uc73c\ub85c arctic_inference plugin auto-load\ub97c \ucc28\ub2e8\ud558\ub294 \uac83\uc774\uace0 (plugin\uc740 vLLM 1.6\uacfc binary incompat\uc774\uc9c0\ub9cc <code>SuffixDecodingCache<\/code> class\ub294 lazy import\ub85c \uc0ac\uc6a9 \uac00\ub2a5), \ub2e4\ub978 \ud558\ub098\ub294 <code>PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True<\/code>\ub85c cudagraph PIECEWISE\uc640 \ud568\uaed8 \ubc1c\uc0dd\ud560 \uc218 \uc788\ub294 memory fragmentation\uc744 \uc904\uc774\ub294 \uac83\uc774\ub2e4.<\/p><h2><span class=\"num\">08<\/span>\uc5ed\uc124\ucc98\ub7fc \ubcf4\uc774\ub294 GPU \ud65c\uc6a9\ub960 \ud558\ub77d<\/h2><p class=\"section-lede\">GPU \uc0ac\uc6a9\ub7c9\uc774 \uc904\uc5c8\ub294\ub370 throughput\uc774 \ub298\uc5c8\ub2e4.<\/p><figure>\n<img decoding=\"async\" alt=\"GPU \ud65c\uc6a9\ub960\uc774 93.8%\uc5d0\uc11c 73.3%\ub85c \uc904\uc5c8\ub294\ub370 throughput\uc740 52% \ub298\uc5c8\ub2e4. \uac19\uc740 \ucd9c\ub825\uc5d0 \ud544\uc694\ud55c forward step \uc218 \uc790\uccb4\uac00 \uc904\uc5b4\ub4e0 \uacb0\uacfc \u2014 wall time\uc740 31% \ub2e8\ucd95.\" loading=\"lazy\" src=\"http:\/\/kyunam.com\/wp-content\/uploads\/2026\/06\/vllm-figure-06.png\">\n<figcaption>GPU \ud65c\uc6a9\ub960\uc774 93.8%\uc5d0\uc11c 73.3%\ub85c \uc904\uc5c8\ub294\ub370 throughput\uc740 52% \ub298\uc5c8\ub2e4. \uac19\uc740 \ucd9c\ub825\uc5d0 \ud544\uc694\ud55c forward step \uc218 \uc790\uccb4\uac00 \uc904\uc5b4\ub4e0 \uacb0\uacfc \u2014 wall time\uc740 31% \ub2e8\ucd95.<\/figcaption>\n<\/figure><p>vanilla\ub294 \ub9e4 forward step\uc5d0\uc11c 1 token\ub9cc \uc9c4\ucc99\ud558\ub294 \ub370 \ube44\ud574, spec decode\ub294 K(\u22483~6) token\uc744 \ud55c \ubc88\uc5d0 \uc9c4\ucc99\ud55c\ub2e4. 
\uac19\uc740 \ucd9c\ub825 \uae38\uc774\uc5d0 \ub3c4\ub2ec\ud558\uae30\uae4c\uc9c0 \ud544\uc694\ud55c GPU forward \ud638\ucd9c \ud69f\uc218 \uc790\uccb4\uac00 \uc904\uae30 \ub54c\ubb38\uc5d0 wall time\uc774 31% \ub2e8\ucd95\ub41c\ub2e4. GPU \ud65c\uc6a9\ub960\uc740 \ubd80\uc218 \uc9c0\ud45c\uc774\uc9c0 \ubaa9\ud45c \uc9c0\ud45c\uac00 \uc544\ub2c8\ub77c\ub294 \uc810\uc774 \uc774 \uce21\uc815\uc5d0\uc11c \ub4dc\ub7ec\ub098\ub294 \uc9c1\uad00\uc5d0 \ubc18\ud558\ub294 \uc0ac\uc2e4\uc774\ub2e4 \u2014 production engineer\uac00 GPU%\ub97c KPI\ub85c \uc0bc\uace0 \uc788\ub2e4\uba74 \uc774 \ud328\ud134\uc5d0\uc11c \uc798\ubabb\ub41c \uacb0\ub860\uc5d0 \ub3c4\ub2ec\ud560 \uc218 \uc788\ub2e4.<\/p><h2><span class=\"num\">09<\/span>\ud68c\ud53c\ud574\uc57c \ud560 \ud568\uc815\ub4e4<\/h2><p class=\"section-lede\">\uce21\uc815 \uacfc\uc815\uc5d0\uc11c \ubc1c\uacac\ud55c \ud68c\uadc0 \ud328\ud134.<\/p><ol>\n<li><strong>7B \uc774\ud558 model\uc5d0\uc11c spec decoding\uc744 \ucf1c\ub454 \ucc44\ub85c \uc6b4\uc601\ud558\uae30.<\/strong> \uc5b4\ub5a4 K\ub85c\ub3c4 R\uc744 \ubabb \ub530\ub77c\uac00\uc11c throughput\uc774 1.2~2.6\ubc30 \ub5a8\uc5b4\uc9c4\ub2e4.<\/li>\n<li><strong>chat \uc6cc\ud06c\ub85c\ub4dc\ub97c <code>cudagraph_mode=\"FULL\"<\/code>\ub85c \ub3cc\ub9ac\uae30.<\/strong> eager penalty\uac00 \u221223%\uae4c\uc9c0 \ubc1c\uc0dd\ud558\uc9c0\ub9cc <code>PIECEWISE<\/code>\ub85c \ubc14\uafb8\uba74 \uc644\uc804\ud788 \uc0ac\ub77c\uc9c0\uace0 +18.9%\ub85c \ud68c\ubcf5.<\/li>\n<li><strong>ngram\uc5d0\uc11c <code>num_speculative_tokens<\/code>\uc744 \ub108\ubb34 \ud06c\uac8c(&gt;10) \uc7a1\uae30.<\/strong> accept rate\uac00 \uae09\ub77d\ud558\uace0 KV memory \uc555\ubc15\uc774 \ub298\uc5b4 OOM \uac00\ub2a5. 
ngram\uc740 sonnet\uc5d0\uc11c 7, suffix\ub294 32\uac00 \ubcf8 \ud658\uacbd sweet spot.<\/li>\n<li><strong><code>prompt_lookup_min<\/code>\uc744 \ub108\ubb34 \uc791\uac8c(=1) \uc7a1\uae30.<\/strong> \ud754\ud55c unigram\uae4c\uc9c0 \ud6c4\ubcf4\uac00 \ub418\uc5b4 acceptance \ub5a8\uc5b4\uc9c4\ub2e4. \uad8c\uc7a5\uac12 2.<\/li>\n<li><strong>\uc6cc\ud06c\ub85c\ub4dc \ubd84\ub9ac \uce21\uc815 \uc5c6\uc774 spec config \ud310\ub2e8.<\/strong> \uac19\uc740 config\uac00 sonnet\uc5d0\uc11c\ub294 \uac00\uc18d, code\uc5d0\uc11c\ub294 \ud68c\uadc0 \u2014 R\/K mechanism\uc774 \uc6cc\ud06c\ub85c\ub4dc\ubcc4\ub85c \ub2e4\ub974\uae30 \ub54c\ubb38\uc5d0 \ubd84\ub9ac \uce21\uc815 \uc5c6\uc774\ub294 \uc798\ubabb\ub41c \uacb0\ub860\uc5d0 \ub3c4\ub2ec.<\/li>\n<li><strong>\uce21\uc815 \uc870\uac74\uc774 \ub2e4\ub978 \uac12\ub07c\ub9ac \uc9c1\uc811 \ube44\uad50\ud558\uae30.<\/strong> wrapper, <code>gpu_memory_utilization<\/code>, cudagraph mode\ub9cc \ubc14\uafd4\ub3c4 \uac19\uc740 vanilla\uac00 \uc218\uc2ed % \ucc28\uc774 \ub09c\ub2e4. \uc124\uc815\uc758 \uc9c4\uc9dc \ud6a8\uacfc\ub97c \ubcf4\ub824\uba74 \ud55c \uac00\uc9c0\ub9cc \ubc14\uafb8\uace0 \ub098\uba38\uc9c0\ub294 \uace0\uc815\ud55c \ucc44 \ube44\uad50\ud574\uc57c \ud55c\ub2e4.<\/li>\n<\/ol><h2><span class=\"num\">10<\/span>\uc815\ub9ac\ud558\uba74<\/h2><p class=\"section-lede\">\uc138 \uc904\ub85c.<\/p><p>\uccab\uc9f8, <strong>\ucf54\ub4dc \ud55c \uc904 \uace0\uce58\uc9c0 \uc54a\uace0 vLLM \ub0b4\uc7a5 \uc124\uc815\ub9cc \uc815\ud655\ud788 \uc870\ud569\ud558\uba74 \ud070 \uac00\uc18d\uc774 \ub098\uc628\ub2e4<\/strong>. \ucd5c\uc801 \uad6c\uc131\uc740 <code>gpu_memory_utilization=0.80<\/code>, <code>compilation_config={\"cudagraph_mode\":\"PIECEWISE\"}<\/code>, <code>speculative_config={\"method\":\"suffix\",\"num_speculative_tokens\":32}<\/code> \uc774\uac8c \uc804\ubd80\ub2e4. vanilla \ub300\ube44 6\uac1c \uc6cc\ud06c\ub85c\ub4dc \ubaa8\ub450 +18.8%~+68.9% net-positive. 
\ud5a5\uc0c1\uc758 \ub300\ubd80\ubd84\uc740 \uc0ac\uc2e4 speculative decoding\uc744 &#8220;\ucf1c\ub294 \uac83&#8221; \uadf8 \uc790\uccb4\uc5d0\uc11c \ub098\uc628\ub2e4.<\/p><p>\ub458\uc9f8, <strong>\uc774 \uc124\uc815\uc740 \ubb34\ub8cc\uac00 \uc544\ub2c8\ub2e4 \u2014 \uc544\ubb34 \ub54c\ub098 \ucf1c\uba74 \ub290\ub824\uc9c4\ub2e4<\/strong>. 7B \uc774\ud558 \ubaa8\ub378\uc740 \ubb34\uc5c7\uc744 \ucf1c\ub3c4 universal regression, code \uc6cc\ud06c\ub85c\ub4dc\uc5d0 ngram\uc744 \uc4f0\uba74 \u221220% \ud68c\uadc0. \uadf8\ub798\uc11c &#8220;\ucf1c\ub290\ub0d0 \ub9c8\ub290\ub0d0&#8221;\uac00 \uc544\ub2c8\ub77c &#8220;\uc5b8\uc81c \ubb34\uc5c7\uc744 \ucf1c\ub290\ub0d0&#8221;\uac00 \ud575\uc2ec\uc774\uace0, \uadf8 \ub2f5\uc740 <code>R\/K<\/code> \ubd80\ub4f1\uc2dd \ud558\ub098\ub85c mechanism\uae4c\uc9c0 \uc124\uba85\ub41c\ub2e4.<\/p><p>\uc14b\uc9f8, <strong>\ubaa8\ub378 \ud06c\uae30\u00b7\uc6cc\ud06c\ub85c\ub4dc \uc885\ub958\u00b7\ubaa8\ub378 vendor\uac00 \ubaa8\ub450 \uc131\ud328\ub97c \uac00\ub978\ub2e4<\/strong>. 14B \uc774\uc0c1\uc740 \uc548\uc804, Llama 70B\ub294 6\/6 net-positive\uc9c0\ub9cc Qwen 72B\ub294 code\uc5d0\uc11c \ud68c\uadc0. \u00a77\uc758 decision tree\uac00 \uadf8 \uacbd\uacc4\ub97c \uc815\ub9ac\ud55c\ub2e4.<\/p><div class=\"insight\">\n<span class=\"marker\">closing<\/span>\n    vLLM\uc740 \uc774\ubbf8 \ube60\ub974\ub2e4. \ub300\ubd80\ubd84\uc758 \uacbd\uc6b0 \ubd80\uc871\ud55c \uac74 \uc0c8 \ucf54\ub4dc\uac00 \uc544\ub2c8\ub77c, \uc774\ubbf8 \ub4e4\uc5b4 \uc788\ub294 \uc124\uc815\uc744 \uc6cc\ud06c\ub85c\ub4dc\uc640 \ubaa8\ub378 \ud06c\uae30\uc5d0 \ub9de\uac8c <em>\ucf1c\ub294 \ud310\ub2e8<\/em>\uc774\ub2e4. 156\uc140\uc758 \uce21\uc815\uc774 \uadf8 \ud310\ub2e8 \uae30\uc900\uc744 \uaca9\uc790\ub85c \ucc44\uc6cc \ub123\uc5c8\ub2e4.\n  <\/div><p class=\"closer\">\n    \uce21\uc815 \ud658\uacbd: Llama-3.3-70B-Instruct \u00b7 TP=8 \u00b7 H100\u00d78 \u00b7 vLLM speculative decoding (ngram \/ suffix). 
9\uac1c \ubaa8\ub378(opt-125m ~ Qwen 72B) \u00d7 6\uac1c \uc6cc\ud06c\ub85c\ub4dc \u00d7 \uc124\uc815 \uc870\ud569 156\uc140 \uce21\uc815 \uae30\ubc18.\n  <\/p>\n<\/div>\n\n\n\n\n<p class=\"wp-block-paragraph\"><\/p>\n","protected":false},"excerpt":{"rendered":"<p>vLLM Performance \u00b7 Engineering Notes \ucf54\ub4dc \ud55c \uc904 \uace0\uce58\uc9c0 \uc54a\uace0, vLLM\uc5d0 \uc774\ubbf8 \ub4e4\uc5b4 \uc788\ub294 \ub0b4\uc7a5 \uc124\uc815\ub9cc\uc73c\ub85c Llama-3.3-70B \ucd94\ub860\uc744 6\uac1c \uc6cc\ud06c\ub85c\ub4dc \uc804\ubd80 \uac00\uc18d\ud55c 156\uc140 \uce21\uc815 \ub9ac\ud3ec\ud2b8. \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500 vLLM \uc18c\uc2a4\uc5d0\ub294 \uc190\ub300\uc9c0 \uc54a\uc558\ub2e4.<span class=\"more-button\"><a href=\"http:\/\/kyunam.com\/?p=940\" class=\"more-link\">\uc77d\uc5b4 \ubcfc\uae4c?<span class=\"screen-reader-text\">vLLM\uc740 \uc774\ubbf8 \ube60\ub974\ub2e4. 
\ub2f9\uc2e0\uc774 \uc124\uc815\uc744 \uc548 \ucf30\uc744 \ubfd0<\/span><\/a><\/span><\/p>\n","protected":false},"author":1,"featured_media":954,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[309,44,310,1],"tags":[317,319,314,315,318,316,306],"class_list":["post-940","post","type-post","status-publish","format-standard","has-post-thumbnail","hentry","category-ai","category-tech","category-vllm","category-uncategorized","tag-cuda-graph","tag-gpu-optimization","tag-h100","tag-llama-3-3-70b","tag-llm-inference","tag-speculative-decoding","tag-vllm"],"_links":{"self":[{"href":"http:\/\/kyunam.com\/index.php?rest_route=\/wp\/v2\/posts\/940","targetHints":{"allow":["GET"]}}],"collection":[{"href":"http:\/\/kyunam.com\/index.php?rest_route=\/wp\/v2\/posts"}],"about":[{"href":"http:\/\/kyunam.com\/index.php?rest_route=\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"http:\/\/kyunam.com\/index.php?rest_route=\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"http:\/\/kyunam.com\/index.php?rest_route=%2Fwp%2Fv2%2Fcomments&post=940"}],"version-history":[{"count":3,"href":"http:\/\/kyunam.com\/index.php?rest_route=\/wp\/v2\/posts\/940\/revisions"}],"predecessor-version":[{"id":949,"href":"http:\/\/kyunam.com\/index.php?rest_route=\/wp\/v2\/posts\/940\/revisions\/949"}],"wp:featuredmedia":[{"embeddable":true,"href":"http:\/\/kyunam.com\/index.php?rest_route=\/wp\/v2\/media\/954"}],"wp:attachment":[{"href":"http:\/\/kyunam.com\/index.php?rest_route=%2Fwp%2Fv2%2Fmedia&parent=940"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"http:\/\/kyunam.com\/index.php?rest_route=%2Fwp%2Fv2%2Fcategories&post=940"},{"taxonomy":"post_tag","embeddable":true,"href":"http:\/\/kyunam.com\/index.php?rest_route=%2Fwp%2Fv2%2Ftags&post=940"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}