<?xml version="1.0" encoding="utf-8"?>
<rss version="2.0">
    <channel>
        <title>ai-muninn</title>
        <link>https://ai-muninn.com/en/blog</link>
        <description>Notes on AI inference infrastructure: DGX Spark, vLLM, local AI agents.</description>
        <lastBuildDate>Thu, 09 Apr 2026 00:00:00 GMT</lastBuildDate>
        <docs>https://validator.w3.org/feed/docs/rss2.html</docs>
        <generator>https://github.com/jpmonette/feed</generator>
        <language>en</language>
        <copyright>2026 coolthor</copyright>
        <item>
            <title><![CDATA[[Ask AI Right] You Opened AI — Now What Do You Say?]]></title>
            <link>https://ai-muninn.com/en/blog/ai-ask-right-first-message</link>
            <guid isPermaLink="false">https://ai-muninn.com/en/blog/ai-ask-right-first-message</guid>
            <pubDate>Thu, 09 Apr 2026 00:00:00 GMT</pubDate>
            <description><![CDATA[AI isn't Google — you're not searching, you're having a conversation. This article teaches you what to say when you first open ChatGPT, five things you can try right now, and how to adjust when the answer isn't quite right.]]></description>
            <category>AI</category>
            <category>ChatGPT</category>
            <category>Beginner</category>
            <category>Conversation</category>
        </item>
        <item>
            <title><![CDATA[[Ask AI Right] Which AI Should You Use in 2026?]]></title>
            <link>https://ai-muninn.com/en/blog/ai-ask-right-which-ai-to-use-2026</link>
            <guid isPermaLink="false">https://ai-muninn.com/en/blog/ai-ask-right-which-ai-to-use-2026</guid>
            <pubDate>Thu, 09 Apr 2026 00:00:00 GMT</pubDate>
            <description><![CDATA[ChatGPT, Claude, and Gemini — the three AI assistants you can start using right now. A no-jargon guide to what each one does best, how much they cost, and how to get started.]]></description>
            <category>AI</category>
            <category>ChatGPT</category>
            <category>Claude</category>
            <category>Gemini</category>
            <category>Beginner</category>
        </item>
        <item>
            <title><![CDATA[[Benchmark] Rescuing Gemma 4 31B on a 32GB MacBook Pro: From 1.5 to 12.8 tok/s]]></title>
            <link>https://ai-muninn.com/en/blog/dgx-spark-gemma4-31b-rescue-mbp-32gb</link>
            <guid isPermaLink="false">https://ai-muninn.com/en/blog/dgx-spark-gemma4-31b-rescue-mbp-32gb</guid>
            <pubDate>Wed, 08 Apr 2026 00:00:00 GMT</pubDate>
            <description><![CDATA[Gemma 4 31B runs at 1.5 tok/s on MBP M1 Max with Ollama due to swap. The fix: reduce context window (9 tok/s) or switch to oMLX (12.8 tok/s). The real culprit is KV cache allocation, not model size.]]></description>
            <category>Gemma 4</category>
            <category>31B</category>
            <category>M1 Max</category>
            <category>Ollama</category>
            <category>oMLX</category>
            <category>swap</category>
            <category>KV cache</category>
            <category>TurboQuant</category>
            <category>Apple Silicon</category>
            <category>memory management</category>
        </item>
        <item>
            <title><![CDATA[[Benchmark] 4 Machines, 4 Models, 1 Answer: Memory Decides Everything]]></title>
            <link>https://ai-muninn.com/en/blog/dgx-spark-gemma4-4-machines-4-models-bandwidth</link>
            <guid isPermaLink="false">https://ai-muninn.com/en/blog/dgx-spark-gemma4-4-machines-4-models-bandwidth</guid>
            <pubDate>Wed, 08 Apr 2026 00:00:00 GMT</pubDate>
            <description><![CDATA[Gemma 4 E2B through 31B benchmarked on RTX 5090, M1 Max, DGX Spark, and M4 with Ollama. E2B hits 310 tok/s on 5090. 31B hits 1.5 tok/s on MBP — swap kills faster hardware. Memory capacity > bandwidth.]]></description>
            <category>Gemma 4</category>
            <category>RTX 5090</category>
            <category>DGX Spark</category>
            <category>GB10</category>
            <category>M1 Max</category>
            <category>M4</category>
            <category>Ollama</category>
            <category>benchmark</category>
            <category>memory bandwidth</category>
            <category>swap</category>
            <category>E2B</category>
            <category>E4B</category>
            <category>26B</category>
            <category>31B</category>
        </item>
        <item>
            <title><![CDATA[[LLM 101] Dense, MoE, PLE, SSM — Four AI Model Architectures Explained Simply]]></title>
            <link>https://ai-muninn.com/en/blog/llm-101-dense-moe-ple-ssm-architectures</link>
            <guid isPermaLink="false">https://ai-muninn.com/en/blog/llm-101-dense-moe-ple-ssm-architectures</guid>
            <pubDate>Wed, 08 Apr 2026 00:00:00 GMT</pubDate>
            <description><![CDATA[Dense is everyone working. MoE is expert rotation. PLE is a dictionary on every floor. SSM is a speed reader. A zero-jargon guide to the four main AI model architectures and how to pick between them.]]></description>
            <category>Dense</category>
            <category>MoE</category>
            <category>PLE</category>
            <category>SSM</category>
            <category>Mamba</category>
            <category>LLM</category>
            <category>model architecture</category>
            <category>beginner</category>
            <category>explainer</category>
        </item>
        <item>
            <title><![CDATA[[Benchmark] Gemma 4 E2B vs E4B: 81 tok/s vs 52 on Three Machines — Bandwidth Is Everything]]></title>
            <link>https://ai-muninn.com/en/blog/dgx-spark-gemma4-e2b-vs-e4b-ollama-3-machines</link>
            <guid isPermaLink="false">https://ai-muninn.com/en/blog/dgx-spark-gemma4-e2b-vs-e4b-ollama-3-machines</guid>
            <pubDate>Tue, 07 Apr 2026 00:00:00 GMT</pubDate>
            <description><![CDATA[Gemma 4 E2B is 44-82% faster than E4B across M1 Max, GB10, and M4. We benchmarked both on Ollama with 3 runs per scenario, unique prompts, and proper warm-up. Memory bandwidth predicts generation speed better than anything else.]]></description>
            <category>Gemma 4</category>
            <category>E2B</category>
            <category>E4B</category>
            <category>Ollama</category>
            <category>benchmark</category>
            <category>DGX Spark</category>
            <category>GB10</category>
            <category>M1 Max</category>
            <category>M4</category>
            <category>Apple Silicon</category>
            <category>memory bandwidth</category>
        </item>
        <item>
            <title><![CDATA[[Benchmark] From 19 to 50 tok/s: We Quantized Gemma 4 E4B to NVFP4 Before Anyone Else]]></title>
            <link>https://ai-muninn.com/en/blog/dgx-spark-gemma4-e4b-nvfp4-50-toks</link>
            <guid isPermaLink="false">https://ai-muninn.com/en/blog/dgx-spark-gemma4-e4b-nvfp4-50-toks</guid>
            <pubDate>Tue, 07 Apr 2026 00:00:00 GMT</pubDate>
            <description><![CDATA[Gemma 4 E4B NVFP4A16 hits 49.9 tok/s on DGX Spark — 2.6x faster than BF16. First NVFP4 checkpoint on HuggingFace. PLE architecture, FP8 vs NVFP4, and the llm-compressor version hell that almost stopped us.]]></description>
            <category>Gemma 4</category>
            <category>E4B</category>
            <category>NVFP4</category>
            <category>FP8</category>
            <category>vLLM</category>
            <category>DGX Spark</category>
            <category>GB10</category>
            <category>quantization</category>
            <category>llm-compressor</category>
            <category>PLE</category>
            <category>HuggingFace</category>
        </item>
        <item>
            <title><![CDATA[[LLM 101] Ollama vs vLLM: Two Ways to Run AI on Your Own Computer]]></title>
            <link>https://ai-muninn.com/en/blog/llm-101-ollama-vs-vllm</link>
            <guid isPermaLink="false">https://ai-muninn.com/en/blog/llm-101-ollama-vs-vllm</guid>
            <pubDate>Tue, 07 Apr 2026 00:00:00 GMT</pubDate>
            <description><![CDATA[Ollama is a microwave — one command and you're chatting with AI. vLLM is a professional oven — 30% faster, handles multiple users, but takes real setup. A zero-jargon guide to choosing between them.]]></description>
            <category>Ollama</category>
            <category>vLLM</category>
            <category>LLM</category>
            <category>local AI</category>
            <category>beginner</category>
            <category>explainer</category>
        </item>
        <item>
            <title><![CDATA[[vLLM] Gemma 4 26B-A4B NVFP4 on DGX Spark: 52 tok/s with 16 GB of Weights]]></title>
            <link>https://ai-muninn.com/en/blog/dgx-spark-gemma4-26b-nvfp4-52-toks</link>
            <guid isPermaLink="false">https://ai-muninn.com/en/blog/dgx-spark-gemma4-26b-nvfp4-52-toks</guid>
            <pubDate>Sun, 05 Apr 2026 00:00:00 GMT</pubDate>
            <description><![CDATA[Deploying Gemma 4 26B-A4B MoE NVFP4 on GB10 via vLLM 0.19 — 52 tok/s decode, 16.5 GB model, 82 GB free KV cache. Includes the Phase 0 decision that killed the 31B variant.]]></description>
            <category>Gemma 4</category>
            <category>NVFP4</category>
            <category>vLLM</category>
            <category>DGX Spark</category>
            <category>GB10</category>
            <category>SM121</category>
            <category>MoE</category>
            <category>benchmark</category>
        </item>
        <item>
            <title><![CDATA[[Benchmark] Gemma 4 31B Dense on DGX Spark: 7 tok/s and the Bandwidth Wall]]></title>
            <link>https://ai-muninn.com/en/blog/dgx-spark-gemma4-31b-dense-7-toks</link>
            <guid isPermaLink="false">https://ai-muninn.com/en/blog/dgx-spark-gemma4-31b-dense-7-toks</guid>
            <pubDate>Sun, 05 Apr 2026 00:00:00 GMT</pubDate>
            <description><![CDATA[Gemma 4 31B-IT NVFP4 on GB10 maxes out at 7.0 tok/s — bandwidth-bound at 273 GB/s. The math predicted 4.4 tok/s theoretical; NVFP4 compression buys 60% but can't escape the wall. Choose MoE.]]></description>
            <category>Gemma 4</category>
            <category>NVFP4</category>
            <category>vLLM</category>
            <category>DGX Spark</category>
            <category>GB10</category>
            <category>SM121</category>
            <category>dense</category>
            <category>benchmark</category>
            <category>bandwidth</category>
        </item>
        <item>
            <title><![CDATA[[Benchmark] vLLM vs Ollama on the Same Model: Why 30% Faster on GB10]]></title>
            <link>https://ai-muninn.com/en/blog/dgx-spark-vllm-vs-ollama-same-model</link>
            <guid isPermaLink="false">https://ai-muninn.com/en/blog/dgx-spark-vllm-vs-ollama-same-model</guid>
            <pubDate>Sun, 05 Apr 2026 00:00:00 GMT</pubDate>
            <description><![CDATA[Same Gemma 4 26B-A4B, same GPU, 30% speed gap. vLLM NVFP4 hits 52 tok/s while Ollama Q4_K_M tops at 40. Root cause: Marlin kernels, CUDA graphs, and an Ollama CPU/GPU split trap.]]></description>
            <category>vLLM</category>
            <category>Ollama</category>
            <category>benchmark</category>
            <category>DGX Spark</category>
            <category>GB10</category>
            <category>Gemma 4</category>
            <category>NVFP4</category>
            <category>inference</category>
        </item>
        <item>
            <title><![CDATA[[vLLM] GX10 Power Delivery: The 30W Safety Mode No Firmware Can Fix]]></title>
            <link>https://ai-muninn.com/en/blog/dgx-spark-30w-power-safety-mode</link>
            <guid isPermaLink="false">https://ai-muninn.com/en/blog/dgx-spark-30w-power-safety-mode</guid>
            <pubDate>Thu, 02 Apr 2026 00:00:00 GMT</pubDate>
            <description><![CDATA[Some GX10 units ship with a PD controller defect that caps the whole system at 30W permanently. How to diagnose it in 30 seconds, and when to stop troubleshooting and just RMA.]]></description>
            <category>GX10</category>
            <category>GB10</category>
            <category>DGX Spark</category>
            <category>power delivery</category>
            <category>vLLM</category>
            <category>hardware</category>
        </item>
        <item>
            <title><![CDATA[[Benchmark] TurboQuant on GX10: Is 3-bit KV Cache Compression Actually Lossless?]]></title>
            <link>https://ai-muninn.com/en/blog/turboquant-kv-cache-benchmark-gx10</link>
            <guid isPermaLink="false">https://ai-muninn.com/en/blog/turboquant-kv-cache-benchmark-gx10</guid>
            <pubDate>Mon, 30 Mar 2026 00:00:00 GMT</pubDate>
            <description><![CDATA[Real benchmark numbers for Google's TurboQuant on a GB10/SM121 (DGX Spark) — actual compression ratios, Qwen2.5-3B accuracy validation, and why Qwen3.5-35B's hybrid attention architecture makes things complicated.]]></description>
            <category>TurboQuant</category>
            <category>KV cache</category>
            <category>quantization</category>
            <category>vLLM</category>
            <category>benchmark</category>
            <category>Qwen3.5</category>
            <category>GX10</category>
            <category>SM121</category>
        </item>
        <item>
            <title><![CDATA[[AI Agent] NemoClaw Without the Cloud: Swapping Nemotron for a Local Ollama Model]]></title>
            <link>https://ai-muninn.com/en/blog/nemoclaw-local-inference-ollama</link>
            <guid isPermaLink="false">https://ai-muninn.com/en/blog/nemoclaw-local-inference-ollama</guid>
            <pubDate>Tue, 24 Mar 2026 00:00:00 GMT</pubDate>
            <description><![CDATA[How to point NemoClaw's inference backend to a local Ollama or vLLM endpoint. Config location, model swap, and what OpenShell still enforces when the cloud is gone.]]></description>
            <category>NemoClaw</category>
            <category>OpenClaw</category>
            <category>OpenShell</category>
            <category>Ollama</category>
            <category>vLLM</category>
            <category>AI Agent</category>
            <category>NVIDIA</category>
            <category>GX10</category>
            <category>Local Inference</category>
        </item>
        <item>
            <title><![CDATA[[AI Agent] openclaw + ChatGPT OAuth: GPT-5.4 Without Buying API Credits]]></title>
            <link>https://ai-muninn.com/en/blog/openclaw-chatgpt-oauth-gpt54-no-api-key</link>
            <guid isPermaLink="false">https://ai-muninn.com/en/blog/openclaw-chatgpt-oauth-gpt54-no-api-key</guid>
            <pubDate>Tue, 24 Mar 2026 00:00:00 GMT</pubDate>
            <description><![CDATA[openclaw 2026.3.13 added OpenAI OAuth login. One command gives your agent GPT-5.4's 1M context window using a ChatGPT Plus subscription instead of API credits.]]></description>
            <category>openclaw</category>
            <category>GPT-5.4</category>
            <category>ChatGPT</category>
            <category>OAuth</category>
            <category>AI Agent</category>
        </item>
        <item>
            <title><![CDATA[NemoClaw on DGX Spark: 4 Fixes the Official Docs Don't Tell You]]></title>
            <link>https://ai-muninn.com/en/blog/nemoclaw-install-gx10-from-scratch</link>
            <guid isPermaLink="false">https://ai-muninn.com/en/blog/nemoclaw-install-gx10-from-scratch</guid>
            <pubDate>Mon, 23 Mar 2026 00:00:00 GMT</pubDate>
            <description><![CDATA[The NemoClaw installer fails on DGX Spark out of the box. Here are the 4 undocumented fixes — Node upgrade, npm link, OpenShell tar.gz, cgroupns — to get your first AI agent running in 30 min.]]></description>
            <category>NemoClaw</category>
            <category>OpenClaw</category>
            <category>OpenShell</category>
            <category>AI Agent</category>
            <category>NVIDIA</category>
            <category>DGX Spark</category>
            <category>GX10</category>
            <category>GB10</category>
        </item>
        <item>
            <title><![CDATA[NemoClaw Explained: NVIDIA's All-in-One AI Agent Framework for DGX Spark]]></title>
            <link>https://ai-muninn.com/en/blog/nemoclaw-what-it-is-why-it-exists</link>
            <guid isPermaLink="false">https://ai-muninn.com/en/blog/nemoclaw-what-it-is-why-it-exists</guid>
            <pubDate>Mon, 23 Mar 2026 00:00:00 GMT</pubDate>
            <description><![CDATA[NemoClaw = OpenClaw + OpenShell + NVIDIA Agent Toolkit in one package. What problem it solves, how the architecture works, and whether it's worth installing on your DGX Spark.]]></description>
            <category>NemoClaw</category>
            <category>OpenClaw</category>
            <category>OpenShell</category>
            <category>AI Agent</category>
            <category>NVIDIA</category>
            <category>DGX Spark</category>
            <category>GX10</category>
        </item>
        <item>
            <title><![CDATA[[Claude Code] claude-agent-sdk vs subprocess: Why Intermediate Turns Disappear]]></title>
            <link>https://ai-muninn.com/en/blog/claude-code-agent-sdk-orchestrator</link>
            <guid isPermaLink="false">https://ai-muninn.com/en/blog/claude-code-agent-sdk-orchestrator</guid>
            <pubDate>Sat, 21 Mar 2026 00:00:00 GMT</pubDate>
            <description><![CDATA[Building a multi-agent orchestrator with `claude -p` subprocess reveals a silent data loss problem. The SDK fix, session resume, parallel execution, and why setting_sources matters.]]></description>
            <category>Claude Code</category>
            <category>claude-agent-sdk</category>
            <category>multi-agent</category>
            <category>orchestrator</category>
            <category>Python</category>
            <category>asyncio</category>
        </item>
        <item>
            <title><![CDATA[[vLLM] FP8 KV Cache on GB10: Why Outputs Collapse into Repetition Loops]]></title>
            <link>https://ai-muninn.com/en/blog/dgx-spark-fp8-kvcache-repetition</link>
            <guid isPermaLink="false">https://ai-muninn.com/en/blog/dgx-spark-fp8-kvcache-repetition</guid>
            <pubDate>Sat, 21 Mar 2026 00:00:00 GMT</pubDate>
            <description><![CDATA[Adding --kv-cache-dtype fp8 to a vLLM serve script on GB10 causes outputs to degrade into repetition after ~500 tokens. Root cause: missing calibration data, q_scale defaults to 1.0.]]></description>
            <category>vLLM</category>
            <category>FP8</category>
            <category>KV cache</category>
            <category>GB10</category>
            <category>DGX Spark</category>
            <category>quantization</category>
            <category>SM121</category>
        </item>
        <item>
            <title><![CDATA[[AI Agent] openclaw + 131K Context: When max_tokens Goes Negative]]></title>
            <link>https://ai-muninn.com/en/blog/openclaw-context-budget-negative-maxtokens</link>
            <guid isPermaLink="false">https://ai-muninn.com/en/blog/openclaw-context-budget-negative-maxtokens</guid>
            <pubDate>Sat, 21 Mar 2026 00:00:00 GMT</pubDate>
            <description><![CDATA[Connecting openclaw to a 131K context model and hitting 400 max_tokens must be at least 1, got -1292. The context budget math, the config key trap, and the fix.]]></description>
            <category>openclaw</category>
            <category>context window</category>
            <category>vLLM</category>
            <category>gpt-oss</category>
            <category>configuration</category>
        </item>
        <item>
            <title><![CDATA[[AI Agent] openclaw Real-Time Streaming via Telegram Bot API 9.5 sendMessageDraft]]></title>
            <link>https://ai-muninn.com/en/blog/openclaw-telegram-sendmessagedraft-streaming</link>
            <guid isPermaLink="false">https://ai-muninn.com/en/blog/openclaw-telegram-sendmessagedraft-streaming</guid>
            <pubDate>Sat, 21 Mar 2026 00:00:00 GMT</pubDate>
            <description><![CDATA[Replacing choppy editMessageText polling with Telegram's sendMessageDraft for live animated output. The patch, the think-block filter, and the optional chaining trap in DM chats.]]></description>
            <category>openclaw</category>
            <category>Telegram</category>
            <category>streaming</category>
            <category>Bot API</category>
            <category>undici</category>
            <category>GLM</category>
        </item>
        <item>
            <title><![CDATA[[AI Agent] openclaw: Why the Bot Went Silent — Tailscale, IPv6, and a Node.js Happy Eyeballs Trap]]></title>
            <link>https://ai-muninn.com/en/blog/openclaw-telegram-ipv6-tailscale-silent-bot</link>
            <guid isPermaLink="false">https://ai-muninn.com/en/blog/openclaw-telegram-ipv6-tailscale-silent-bot</guid>
            <pubDate>Thu, 19 Mar 2026 00:00:00 GMT</pubDate>
            <description><![CDATA[The bot process is running. The token is valid. Messages are being consumed. Nobody is home. A systematic takedown of every wrong hypothesis — and the hidden causal chain that connects Tailscale routing tables to silent sendMessage failures in Node.js.]]></description>
            <category>Node.js</category>
            <category>Tailscale</category>
            <category>IPv6</category>
            <category>undici</category>
            <category>Happy Eyeballs</category>
            <category>Telegram</category>
            <category>Debugging</category>
            <category>Networking</category>
        </item>
        <item>
            <title><![CDATA[[vLLM] Running a 120B Model on DGX Spark at 60 tok/s — Zero API Cost, Six Bugs]]></title>
            <link>https://ai-muninn.com/en/blog/part2-gpt-oss-120b-serve-script</link>
            <guid isPermaLink="false">https://ai-muninn.com/en/blog/part2-gpt-oss-120b-serve-script</guid>
            <pubDate>Thu, 19 Mar 2026 00:00:00 GMT</pubDate>
            <description><![CDATA[How to get gpt-oss-120B running on a DGX Spark (GB10, SM121) with vLLM. The goal: a 120B model serving a local AI agent at zero API cost. The path: six bugs, one silent env var, and a startup log that tells you everything.]]></description>
            <category>DGX Spark</category>
            <category>SM121</category>
            <category>vLLM</category>
            <category>gpt-oss</category>
            <category>MXFP4</category>
            <category>Blackwell</category>
            <category>LLM Serving</category>
        </item>
        <item>
            <title><![CDATA[[vLLM] Qwen3.5-122B Runs. But at 14 tok/s.]]></title>
            <link>https://ai-muninn.com/en/blog/part2-qwen-122b-14-toks-gdn-kernel-gap</link>
            <guid isPermaLink="false">https://ai-muninn.com/en/blog/part2-qwen-122b-14-toks-gdn-kernel-gap</guid>
            <pubDate>Thu, 19 Mar 2026 00:00:00 GMT</pubDate>
            <description><![CDATA[After fixing the four SM121 NVFP4 bugs, Qwen3.5-122B boots cleanly and generates correct output. Then you check the speed. 14 tok/s. No flags to fix it. Here's why — and what to wait for.]]></description>
            <category>DGX Spark</category>
            <category>SM121</category>
            <category>Qwen3.5-122B</category>
            <category>vLLM</category>
            <category>NVFP4</category>
            <category>Marlin</category>
            <category>GDN</category>
            <category>LLM Serving</category>
        </item>
        <item>
            <title><![CDATA[[AI Agent] openclaw: When the Agent Calls for Help]]></title>
            <link>https://ai-muninn.com/en/blog/openclaw-callhelp-spawning-cli-from-agent-loop</link>
            <guid isPermaLink="false">https://ai-muninn.com/en/blog/openclaw-callhelp-spawning-cli-from-agent-loop</guid>
            <pubDate>Wed, 18 Mar 2026 00:00:00 GMT</pubDate>
            <description><![CDATA[How to wire a callhelp tool into a local agent loop so it can spawn Codex CLI mid-reasoning. One permission flag you must set, and why Claude's quota stays mine.]]></description>
            <category>AI Agent</category>
            <category>openclaw</category>
            <category>Codex</category>
            <category>LLM</category>
            <category>Agent Tools</category>
            <category>Local AI</category>
        </item>
        <item>
            <title><![CDATA[[vLLM] Why Your DGX Spark Only Says "!!!!!": Debugging NVFP4 on SM121]]></title>
            <link>https://ai-muninn.com/en/blog/part1-why-your-dgx-spark-says-exclamation-marks</link>
            <guid isPermaLink="false">https://ai-muninn.com/en/blog/part1-why-your-dgx-spark-says-exclamation-marks</guid>
            <pubDate>Tue, 17 Mar 2026 00:00:00 GMT</pubDate>
            <description><![CDATA[CUTLASS FP4 kernels target SM120 (GB200). On SM121 (GB10, DGX Spark) they run silently and produce garbage. Here's the full diagnostic story — 4 bugs, the row-identical failure signature, and the working fix.]]></description>
            <category>DGX Spark</category>
            <category>SM121</category>
            <category>vLLM</category>
            <category>NVFP4</category>
            <category>MXFP4</category>
            <category>Blackwell</category>
            <category>CUDA</category>
            <category>LLM Serving</category>
        </item>
        <item>
            <title><![CDATA[[AI Agent] The Codex-Executor Pattern: Keeping Agent Sessions Small]]></title>
            <link>https://ai-muninn.com/en/blog/openclaw-codex-executor-agent-architecture</link>
            <guid isPermaLink="false">https://ai-muninn.com/en/blog/openclaw-codex-executor-agent-architecture</guid>
            <pubDate>Mon, 16 Mar 2026 00:00:00 GMT</pubDate>
            <description><![CDATA[Why we stopped having the OpenClaw agent orchestrate multi-step tasks directly, and started spawning Codex subprocesses instead. The pattern that keeps agent context minimal and tasks reliable.]]></description>
            <category>AI Agent</category>
            <category>Claude Code</category>
            <category>Codex</category>
            <category>Agent Architecture</category>
            <category>OpenClaw</category>
        </item>
        <item>
            <title><![CDATA[[vLLM] Nemotron-3-Super-120B on a Single GB10: Full Day Debug Log]]></title>
            <link>https://ai-muninn.com/en/blog/dgx-spark-nemotron-120b-vllm</link>
            <guid isPermaLink="false">https://ai-muninn.com/en/blog/dgx-spark-nemotron-120b-vllm</guid>
            <pubDate>Fri, 13 Mar 2026 00:00:00 GMT</pubDate>
            <description><![CDATA[Getting NVIDIA's Nemotron-3-Super-120B-NVFP4 running on an ASUS GX10 (SM121, 128GB). Four SM121-specific pitfalls, the env-var-that-does-nothing, and a working docker command.]]></description>
            <category>DGX Spark</category>
            <category>GB10</category>
            <category>SM121</category>
            <category>Nemotron</category>
            <category>vLLM</category>
            <category>NVFP4</category>
            <category>Blackwell</category>
            <category>LLM Serving</category>
        </item>
        <item>
            <title><![CDATA[[vLLM] Ollama's KEEP_ALIVE Is Silently Eating Your vLLM Headroom]]></title>
            <link>https://ai-muninn.com/en/blog/openclaw-ollama-vllm-gpu-conflict</link>
            <guid isPermaLink="false">https://ai-muninn.com/en/blog/openclaw-ollama-vllm-gpu-conflict</guid>
            <pubDate>Sat, 07 Mar 2026 00:00:00 GMT</pubDate>
            <description><![CDATA[vLLM OOMed on restart despite 128GB unified memory. Cause: Ollama's KEEP_ALIVE=2h was holding 19-51GB in GPU. Diagnosis command, manual unload fix, and why to set KEEP_ALIVE=0 once vLLM is your primary stack.]]></description>
            <category>vLLM</category>
            <category>Ollama</category>
            <category>GPU Memory</category>
            <category>DGX Spark</category>
            <category>GB10</category>
            <category>LLM Serving</category>
        </item>
        <item>
            <title><![CDATA[[vLLM] Don't Add --enable-chunked-prefill to SSM Models]]></title>
            <link>https://ai-muninn.com/en/blog/openclaw-chunked-prefill-ssm-trap</link>
            <guid isPermaLink="false">https://ai-muninn.com/en/blog/openclaw-chunked-prefill-ssm-trap</guid>
            <pubDate>Fri, 06 Mar 2026 00:00:00 GMT</pubDate>
            <description><![CDATA[Adding --enable-chunked-prefill to a Qwen3.5-35B (SSM+MoE hybrid) dropped throughput from 47 tok/s to 5.7 tok/s. Why SSM recurrence and chunked prefill are fundamentally incompatible.]]></description>
            <category>vLLM</category>
            <category>SSM</category>
            <category>Qwen</category>
            <category>DGX Spark</category>
            <category>LLM Serving</category>
            <category>Performance</category>
        </item>
        <item>
            <title><![CDATA[[vLLM] Qwen3.5-35B at 47 tok/s on a Desktop: Migrating from Ollama to vLLM]]></title>
            <link>https://ai-muninn.com/en/blog/dgx-spark-vllm-qwen35-setup</link>
            <guid isPermaLink="false">https://ai-muninn.com/en/blog/dgx-spark-vllm-qwen35-setup</guid>
            <pubDate>Thu, 05 Mar 2026 00:00:00 GMT</pubDate>
            <description><![CDATA[TTFT 3s → 0.12s after switching from Ollama to vLLM on DGX Spark GB10. Six real gotchas: SSM + chunked prefill trap, memory conflicts, docker restart order.]]></description>
            <category>DGX Spark</category>
            <category>GB10</category>
            <category>vLLM</category>
            <category>Ollama</category>
            <category>Qwen3.5</category>
            <category>Docker</category>
            <category>Blackwell</category>
            <category>AI Agent</category>
        </item>
        <item>
            <title><![CDATA[[AI Agent] Zero API Cost: Running OpenClaw on DGX Spark + Mac Mini]]></title>
            <link>https://ai-muninn.com/en/blog/openclaw-dgx-spark-local-ai-agent</link>
            <guid isPermaLink="false">https://ai-muninn.com/en/blog/openclaw-dgx-spark-local-ai-agent</guid>
            <pubDate>Thu, 05 Mar 2026 00:00:00 GMT</pubDate>
            <description><![CDATA[Full stack local AI agent: Mac Mini M4 as the always-on gateway, GX10 for inference, Telegram as the UI. No subscriptions, no cloud APIs. Six deployment lessons from the trenches.]]></description>
            <category>OpenClaw</category>
            <category>AI Agent</category>
            <category>DGX Spark</category>
            <category>Mac Mini</category>
            <category>Self-Hosted</category>
            <category>Ollama</category>
            <category>SearXNG</category>
        </item>
        <item>
            <title><![CDATA[[Benchmark] Pure MoE vs SSM Hybrid: Context Decay and Why It Matters for Agents]]></title>
            <link>https://ai-muninn.com/en/blog/openclaw-moe-ssm-context-decay</link>
            <guid isPermaLink="false">https://ai-muninn.com/en/blog/openclaw-moe-ssm-context-decay</guid>
            <pubDate>Sun, 01 Mar 2026 00:00:00 GMT</pubDate>
            <description><![CDATA[GLM-4.7-Flash hits 57.8 tok/s on short context but drops to 42 tok/s at 8K. Qwen3.5-35B SSM hybrid: 56 tok/s at short, 56 tok/s at 8K. Why agents with long system prompts should care about this difference.]]></description>
            <category>Benchmark</category>
            <category>SSM</category>
            <category>MoE</category>
            <category>DGX Spark</category>
            <category>GB10</category>
            <category>LLM Serving</category>
            <category>AI Agent</category>
        </item>
        <item>
            <title><![CDATA[[Dev Workflow] I Made Two AIs Argue. The Disagreements Are the Point.]]></title>
            <link>https://ai-muninn.com/en/blog/claude-code-debate-system</link>
            <guid isPermaLink="false">https://ai-muninn.com/en/blog/claude-code-debate-system</guid>
            <pubDate>Thu, 26 Feb 2026 00:00:00 GMT</pubDate>
            <description><![CDATA[A custom /debate command that pits Codex CLI against Gemini CLI on architecture, code, and decisions. Different training data, different blind spots — and the disagreements between them are usually the most useful output.]]></description>
            <category>Dev Workflow</category>
            <category>Claude Code</category>
            <category>Gemini</category>
            <category>Codex</category>
            <category>Multi-AI</category>
            <category>Code Review</category>
        </item>
        <item>
            <title><![CDATA[[Claude Code] Testing iOS Apps with Claude Code: 81% Context Reduction]]></title>
            <link>https://ai-muninn.com/en/blog/claude-code-ios-testing-bpstracker</link>
            <guid isPermaLink="false">https://ai-muninn.com/en/blog/claude-code-ios-testing-bpstracker</guid>
            <pubDate>Thu, 26 Feb 2026 00:00:00 GMT</pubDate>
            <description><![CDATA[How I replaced screenshot-heavy iOS test runs with ui_describe_all-first testing in Claude Code, cutting context usage by 81% for BPS Tracker. Plus Fastlane integration for App Store automation.]]></description>
            <category>Claude Code</category>
            <category>iOS</category>
            <category>Swift</category>
            <category>Testing</category>
            <category>Fastlane</category>
            <category>BPS Tracker</category>
        </item>
        <item>
            <title><![CDATA[[AI Agent] OpenClaw Config Hot-Reload: No Restart Needed]]></title>
            <link>https://ai-muninn.com/en/blog/openclaw-config-hot-reload</link>
            <guid isPermaLink="false">https://ai-muninn.com/en/blog/openclaw-config-hot-reload</guid>
            <pubDate>Wed, 25 Feb 2026 00:00:00 GMT</pubDate>
            <description><![CDATA[Spent weeks restarting the OpenClaw gateway for every config change. Then discovered the file watcher. What hot-reloads instantly, what still needs a restart, and how to tell auth failures from transient network errors.]]></description>
            <category>AI Agent</category>
            <category>OpenClaw</category>
            <category>Configuration</category>
            <category>Developer Workflow</category>
        </item>
        <item>
            <title><![CDATA[[Claude Code] I Wrote MANDATORY. The AI Ignored It.]]></title>
            <link>https://ai-muninn.com/en/blog/claude-code-mandatory-instructions</link>
            <guid isPermaLink="false">https://ai-muninn.com/en/blog/claude-code-mandatory-instructions</guid>
            <pubDate>Thu, 19 Feb 2026 00:00:00 GMT</pubDate>
            <description><![CDATA[A Claude Code config rule marked MANDATORY was skipped twice in one session. Here's the root cause — three architectural reasons why emphasis doesn't work — and three system-level solutions that do.]]></description>
            <category>Claude Code</category>
            <category>AI Agents</category>
            <category>Prompt Engineering</category>
            <category>Systems Design</category>
            <category>Developer Workflow</category>
        </item>
        <item>
            <title><![CDATA[[Benchmark] 8 Models on DGX Spark: Finding the Best Stack for AI Agents]]></title>
            <link>https://ai-muninn.com/en/blog/dgx-spark-ollama-benchmark-8-models</link>
            <guid isPermaLink="false">https://ai-muninn.com/en/blog/dgx-spark-ollama-benchmark-8-models</guid>
            <pubDate>Thu, 19 Feb 2026 00:00:00 GMT</pubDate>
            <description><![CDATA[Benchmarking 8 local LLMs on NVIDIA GB10 (128GB unified memory) across 7 task categories. Quantization surprises, a 120B model that fails at JSON, and thinking models that spend their entire budget thinking.]]></description>
            <category>DGX Spark</category>
            <category>GB10</category>
            <category>Ollama</category>
            <category>Benchmark</category>
            <category>LLM</category>
            <category>AI Agent</category>
            <category>Blackwell</category>
        </item>
    </channel>
</rss>