diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 33112e2..e030c15 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -77,6 +77,14 @@ jobs:
         echo "HUGO_VERSION=$VERSION" >> $GITHUB_ENV
         echo "Using Hugo version: $VERSION"
 
+    - name: Checkout twinkle docs
+      uses: actions/checkout@v4
+      with:
+        repository: modelscope/twinkle
+        path: twinkle
+        sparse-checkout: docs/
+        sparse-checkout-cone-mode: true
+
     - name: Install dependencies
       run: |
         # Install Tailwind CLI if package.json exists
@@ -112,6 +120,14 @@ jobs:
         restore-keys: |
           ${{ runner.os }}-hugo-resources-
 
+    - name: Sync docs from twinkle repo
+      env:
+        TWINKLE_DOCS_DIR: ${{ github.workspace }}/twinkle/docs
+      run: python3 scripts/sync-docs.py
+
+    - name: Check internal links
+      run: python3 scripts/check-links.py
+
     - name: Build with Hugo
       env:
         HUGO_ENVIRONMENT: production
diff --git a/.gitignore b/.gitignore
index 88e173e..531fc72 100644
--- a/.gitignore
+++ b/.gitignore
@@ -45,6 +45,10 @@ node_modules/
 pagefind/
 static/pagefind/
 
+# Auto-generated docs (by scripts/sync-docs.py from docs/source_en & docs/source_zh)
+content/docs/components/
+content/docs/usage-guide/
+
 # ============================================================================
 # Operating System
 # ============================================================================
diff --git a/assets/css/custom.css b/assets/css/custom.css
index 4be7ea5..9b70052 100644
--- a/assets/css/custom.css
+++ b/assets/css/custom.css
@@ -463,6 +463,40 @@ header,
   color: var(--twinkle-primary) !important;
 }
 
+/* Force horizontal navbar at md breakpoint (768px) instead of lg (1024px) */
+@media (min-width: 768px) {
+  /* Hide hamburger button */
+  label[for="nav-toggle"] {
+    display: none !important;
+  }
+  /* Show nav menu horizontally */
+  #nav-menu {
+    display: flex !important;
+    width: auto !important;
+    padding-bottom: 0 !important;
+    gap: 0.25rem;
+  }
+  .navbar-nav {
+    text-align: left;
+  }
+  .nav-link {
+    padding: 0.5rem 0.6rem;
+    font-size: 0.9rem;
+  }
+}
+
+/* Show docs sidebar from md (768px) up to just under lg (1024px); .98 covers fractional widths */
+@media (min-width: 768px) and (max-width: 1023.98px) {
+  .hb-sidebar-container {
+    position: sticky !important;
+    top: 4rem !important;
+    transform: none !important;
+    width: 16rem !important;
+    flex-shrink: 0 !important;
+    align-self: flex-start !important;
+  }
+}
+
 /* ─────────────────────────────────────────────────────────────────────────────
    Footer - Light
    ───────────────────────────────────────────────────────────────────────────── */
@@ -551,3 +585,134 @@ footer {
   background: #0f172a !important;
   border-color: #334155;
 }
+
+/* ─────────────────────────────────────────────────────────────────────────────
+   Cookbook (Showcase) - Two-Column Card Grid
+   ───────────────────────────────────────────────────────────────────────────── */
+.page-body .container.grid {
+  max-width: 72rem !important;
+  grid-template-columns: 1fr !important;
+}
+
+@media (min-width: 768px) {
+  .page-body .container.grid {
+    grid-template-columns: repeat(2, 1fr) !important;
+  }
+}
+
+/* Compact card image area */
+.page-body .container.grid .aspect-\[16\/9\] {
+  aspect-ratio: 16/10 !important;
+}
+
+/* Reduce card padding for compactness */
+.page-body .container.grid .p-8 {
+  padding: 1.25rem !important;
+}
+
+/* space-y-* spaces children via margins (gap is ignored outside flex/grid), so override the margins */
+.page-body .container.grid .space-y-4 > :not(:last-child) { margin-block-end: 0.5rem !important; }
+.page-body .container.grid .space-y-4 > * + * { margin-block-start: 0 !important; }
+
+/* ─────────────────────────────────────────────────────────────────────────────
+   Stats Block: force 2×2 grid instead of 4-column to prevent text truncation
+   ───────────────────────────────────────────────────────────────────────────── */
+.blox-stats .grid {
+  grid-template-columns: repeat(2, minmax(0, 1fr)) !important;
+}
+
+.blox-stats .grid h3 {
+  font-size: clamp(1.5rem, 4vw, 2.5rem) !important;
+  white-space: nowrap;
+}
+
+/* ─────────────────────────────────────────────────────────────────────────────
+   Language Switcher
+   ───────────────────────────────────────────────────────────────────────────── */
+.lang-switcher {
+  display: inline-flex;
+  align-items: center;
+  background: var(--twinkle-surface-alt);
+  border-radius: 6px;
+  padding: 2px;
+  gap: 2px;
+  font-size: 0.8rem;
+  font-weight: 500;
+}
+
+.lang-switcher-active {
+  padding: 0.25rem 0.6rem;
+  border-radius: 4px;
+  background: var(--twinkle-primary);
+  color: white;
+}
+
+.lang-switcher-link {
+  padding: 0.25rem 0.6rem;
+  border-radius: 4px;
+  text-decoration: none;
+  color: var(--twinkle-text-secondary);
+  transition: background 0.15s;
+}
+
+.lang-switcher-link:hover {
+  background: var(--twinkle-border);
+}
+
+.dark .lang-switcher {
+  background: #334155;
+}
+
+.dark .lang-switcher-link {
+  color: #cbd5e1;
+}
+
+.dark .lang-switcher-link:hover {
+  background: #475569;
+}
+
+/* ─────────────────────────────────────────────────────────────────────────────
+   List Children Shortcode
+   ───────────────────────────────────────────────────────────────────────────── */
+.list-children-grid {
+  display: grid;
+  grid-template-columns: repeat(auto-fill, minmax(250px, 1fr));
+  gap: 1rem;
+  margin-top: 1.5rem;
+}
+
+.list-children-card {
+  display: flex;
+  flex-direction: column;
+  padding: 1rem 1.25rem;
+  border: 1px solid var(--twinkle-border);
+  border-radius: 0.5rem;
+  text-decoration: none;
+  color: inherit;
+  transition: box-shadow 0.2s, border-color 0.2s;
+}
+
+.list-children-card:hover {
+  border-color: var(--twinkle-primary);
+  box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1);
+}
+
+.list-children-card .card-title {
+  font-weight: 600;
+  font-size: 0.95rem;
+}
+
+.list-children-card .card-desc {
+  margin-top: 0.25rem;
+  font-size: 0.85rem;
+  color: var(--twinkle-text-secondary);
+}
+
+.dark .list-children-card {
+  border-color: #334155;
+}
+
+.dark .list-children-card:hover {
+  border-color: var(--twinkle-primary-light);
+}
+
diff --git a/assets/jsconfig.json b/assets/jsconfig.json
index c9bcdda..d7a3e33 100644
--- a/assets/jsconfig.json
+++ b/assets/jsconfig.json
@@ -3,9 +3,9 @@
   "baseUrl": ".",
   "paths": {
    "*": [
-    "../../../../Library/Caches/hugo_cache/modules/filecache/modules/pkg/mod/github.com/!hugo!blox/kit/modules/blox@v0.0.0-20260124040029-77f7678f08ac/assets/*",
-    "../../../../Library/Caches/hugo_cache/modules/filecache/modules/pkg/mod/github.com/!hugo!blox/kit/modules/blox@v0.0.0-20260124040029-77f7678f08ac/blox/*",
-    "../../../../Library/Caches/hugo_cache/modules/filecache/modules/pkg/mod/github.com/!hugo!blox/kit/modules/blox@v0.0.0-20260124040029-77f7678f08ac/blox/shared/js/*"
+    "../../../../../../../root/.cache/hugo_cache/modules/filecache/modules/pkg/mod/github.com/!hugo!blox/kit/modules/blox@v0.0.0-20260124040029-77f7678f08ac/assets/*",
+    "../../../../../../../root/.cache/hugo_cache/modules/filecache/modules/pkg/mod/github.com/!hugo!blox/kit/modules/blox@v0.0.0-20260124040029-77f7678f08ac/blox/*",
+    "../../../../../../../root/.cache/hugo_cache/modules/filecache/modules/pkg/mod/github.com/!hugo!blox/kit/modules/blox@v0.0.0-20260124040029-77f7678f08ac/blox/shared/js/*"
    ]
   }
  }
diff --git a/config/_default/hugo.yaml b/config/_default/hugo.yaml
index c31788d..95af963 100644
--- a/config/_default/hugo.yaml
+++ b/config/_default/hugo.yaml
@@ -97,7 +97,7 @@ timeout: 600000
 taxonomies:
   tag: tags
   category: categories
-  author: authors
+
 markup:
   _merge: deep
 related:
diff --git a/config/_default/menus.yaml b/config/_default/menus.yaml
index aba3fc7..88e48d3 100644
--- a/config/_default/menus.yaml
+++ b/config/_default/menus.yaml
@@ -5,18 +5,26 @@ main:
   - name: Home
     url: /
     weight: 10
+  - name: Quick Start
+    url: /docs/getting-started/
+    weight: 15
   - name: Documentation
-    url: docs/
+    url: /docs/
     weight: 20
   - name: Blog
-    url: blog/
+    url: /blog/
     weight: 30
-  - name: Showcase
-    url: showcase/
+  - name: Cookbook
+    url: /showcase/
     weight: 40
   - name: Community
-    url: community/
+    url: /community/
     weight: 50
+  - name: GitHub
+    url: https://github.com/modelscope/twinkle
+    weight: 60
+    params:
+      icon: github
 
 sidebar:
   - identifier: more
@@ -27,7 +35,7 @@ sidebar:
   - identifier: community
     name: "Community"
     # url matches main nav; avoids pageRef edge cases with multilingual + base path
-    url: community/
+    url: /community/
     weight: 2
   - identifier: github
     name: "GitHub ↗"
diff --git a/config/_default/menus.zh.yaml b/config/_default/menus.zh.yaml
index 4c73283..b5c4950 100644
--- a/config/_default/menus.zh.yaml
+++ b/config/_default/menus.zh.yaml
@@ -4,18 +4,26 @@ main:
   - name: 首页
     url: /
     weight: 10
+  - name: 快速上手
+    url: /docs/getting-started/
+    weight: 15
   - name: 文档
-    url: docs/
+    url: /docs/
     weight: 20
   - name: 博客
-    url: blog/
+    url: /blog/
     weight: 30
-  - name: 案例展示
-    url: showcase/
+  - name: Cookbook
+    url: /showcase/
     weight: 40
   - name: 社区
-    url: community/
+    url: /community/
     weight: 50
+  - name: GitHub
+    url: https://github.com/modelscope/twinkle
+    weight: 60
+    params:
+      icon: github
 
 sidebar:
   - identifier: more
@@ -26,7 +34,7 @@ sidebar:
   - identifier: community
     name: "社区"
     # Use url (not pageRef): pageRef breaks on zh pages → /zh/twinkle-web/zh/community/
-    url: community/
+    url: /community/
     weight: 2
   - identifier: github
     name: "GitHub ↗"
diff --git a/config/_default/params.yaml b/config/_default/params.yaml
index 9429872..ca5c840 100644
--- a/config/_default/params.yaml
+++ b/config/_default/params.yaml
@@ -95,9 +95,9 @@ hugoblox:
     language_switcher: true
     # Call-to-action button
     cta:
-      enable: true
-      text: "GitHub"
-      url: "https://github.com/modelscope/twinkle"
+      enable: false
+      text: ""
+      url: ""
 
   # ────────────────────────────────────────────────────────────────────────────
   # FOOTER
@@ -106,7 +106,7 @@ hugoblox:
   footer:
     # Footer style variant
     style: minimal                      # minimal | columns | centered
-    language_switcher: true
+    language_switcher: false
     # Custom footer text (HTML supported)
     text: ""
 
@@ -129,7 +129,7 @@ hugoblox:
   # ────────────────────────────────────────────────────────────────────────────
   seo:
     # Browser tab title override (defaults to identity.name if not set)
-    title: ""
+    title: "Twinkle – LLM Training Framework by ModelScope"
     # For local_business identity type only
     location:
       address:
@@ -180,51 +180,20 @@ hugoblox:
     quick_actions: []
 
   # ────────────────────────────────────────────────────────────────────────────
-  # COMMENTS
-  # User comments system
+  # COMMENTS (disabled)
   # ────────────────────────────────────────────────────────────────────────────
   comments:
     enable: false
-    provider: ""                        # giscus | disqus
-    giscus:
-      repo: ""
-      repo_id: ""
-      category: ""
-      category_id: ""
-    disqus:
-      shortname: ""
 
   # ────────────────────────────────────────────────────────────────────────────
-  # ANALYTICS
-  # Traffic and behavior analytics
-  # ────────────────────────────────────────────────────────────────────────────
-  analytics:
-    google:
-      measurement_id: ""                # Google Analytics 4: G-XXXXXXXXXX
-    google_tag_manager:
-      container_id: ""                  # GTM-XXXXXXX
-    plausible:
-      domain: ""
-    fathom:
-      site_id: ""
-    pirsch:
-      site_id: ""
-    clarity:
-      project_id: ""
-    baidu:
-      site_id: ""
+  # ANALYTICS (none configured)
+  # ────────────────────────────────────────────────────────────────────────────
+  analytics: {}
 
   # ────────────────────────────────────────────────────────────────────────────
-  # VERIFICATION
-  # Search engine site verification codes
+  # VERIFICATION (none configured)
   # ────────────────────────────────────────────────────────────────────────────
-  verification:
-    google: ""
-    bing: ""
-    baidu: ""
-    yandex: ""
-    pinterest: ""
-    naver: ""
+  verification: {}
 
   # ────────────────────────────────────────────────────────────────────────────
   # REPOSITORY
@@ -246,46 +215,25 @@ hugoblox:
     address_format: en-us
 
   # ────────────────────────────────────────────────────────────────────────────
-  # SECURITY
-  # Security headers and policies (requires Netlify integration)
+  # SECURITY (defaults)
   # ────────────────────────────────────────────────────────────────────────────
   security:
-    csp:
-      policy: ""
-      report_only: false
-    frame_options: allow                 # deny | sameorigin | allow - allow enables iframe embedding
-    permissions_policy: ""
+    frame_options: allow
 
   # ────────────────────────────────────────────────────────────────────────────
-  # PRIVACY
-  # Privacy and compliance features
+  # PRIVACY (disabled)
   # ────────────────────────────────────────────────────────────────────────────
   privacy:
     enable: false
-    anonymize_analytics: true
 
   # ────────────────────────────────────────────────────────────────────────────
-  # DEBUG
-  # Development and debugging (hidden in production)
+  # DEBUG (disabled in production)
   # ────────────────────────────────────────────────────────────────────────────
   debug:
     enable: false
-    hud:
-      position: bottom-left             # top-left | top-right | bottom-left | bottom-right
-      opacity: 1.0
-    export_logs: true
 
   # ────────────────────────────────────────────────────────────────────────────
   # PREMIUM
-  # Premium features and creator program (affiliates)
   # ────────────────────────────────────────────────────────────────────────────
   pro:
-    # Hide "Powered by HugoBlox" attribution
-    # Requires: HugoBlox Premium to support HugoBlox via alternative means
-    # Note: Even with Premium, you can keep this false to support open source!
-    # Get Premium: https://hugoblox.com/premium
     hide_attribution: false
-    
-    # Affiliate referral code for rewards program
-    # Join affiliate program: https://hugoblox.com/affiliates
-    affiliate_code: ""
diff --git a/content/_index.md b/content/_index.md
index 41c17d1..948a628 100644
--- a/content/_index.md
+++ b/content/_index.md
@@ -4,11 +4,11 @@ date: 2026-02-10
 type: landing
 
 design:
-  spacing: "3rem"
+  spacing: "2rem"
 
 sections:
   # ═══════════════════════════════════════════════════════════════════════════
-  # HERO - Dramatic entrance
+  # HERO
   # ═══════════════════════════════════════════════════════════════════════════
   - block: hero
     content:
@@ -16,66 +16,64 @@ sections:
       text: |
         <p style="font-size: 1.5rem; font-weight: 500; margin-bottom: 0.5rem;">Training workbench to make your model glow ✨</p>
         <p style="font-size: 1.1rem; color: #64748b;">One framework. Any scale. From your laptop to thousand-GPU clusters.</p>
+        <p style="margin-top: 1rem;"><a href="https://www.modelscope.cn/organization/twinkle-kit" style="color: #624aff; text-decoration: none; font-weight: 500;">ModelScope Organization →</a></p>
       primary_action:
         text: Get Started
         url: docs/getting-started/
         icon: rocket-launch
       secondary_action:
-        text: View Source
-        url: https://github.com/modelscope/twinkle
+        text: Documentation
+        url: docs/
       announcement:
-        text: "🚀 v0.2.0 — DPO, GKD & On-policy Distillation, Qwen3.5 Multimodal Training"
+        text: "🚀 v0.4.0 — DeepSeek V4, Gemma 4, Qwen3.5 MoE GatedDeltaNet, EP LoRA & NPU Acceleration"
         link:
           text: "See what's new →"
-          url: "https://github.com/modelscope/twinkle/releases/tag/v0.2.0"
+          url: "https://github.com/modelscope/twinkle/releases/tag/v0.4.0"
     design:
       spacing:
-        padding: ["5rem", 0, "3rem", 0]
+        padding: ["3rem", 0, "2rem", 0]
 
   # ═══════════════════════════════════════════════════════════════════════════
-  # STATS - Key numbers at a glance
+  # STATS
   # ═══════════════════════════════════════════════════════════════════════════
   - block: stats
     content:
       items:
-        - statistic: "All"
+        - statistic: "All Modalities"
           description: |
             Mainstream Models
             LLM · VLM · MoE
-        - statistic: "3"
+        - statistic: "3 Modes"
           description: |
-            Runtime Modes
-            Local · Ray · HTTP
-        - statistic: "∞"
+            Local · Ray · Client
+        - statistic: "TaaS"
           description: |
-            Multi-Tenancy
-            Parallel LoRA Training
+            Training as a Service
+            Cloud-native · Multi-tenant
         - statistic: "<5min"
           description: |
             Setup Time
             pip install & go
     design:
       spacing:
-        padding: ["2rem", 0, "2rem", 0]
+        padding: ["1rem", 0, "1rem", 0]
 
   # ═══════════════════════════════════════════════════════════════════════════
-  # WHAT IS TWINKLE - Core value prop
+  # WHAT IS TWINKLE
   # ═══════════════════════════════════════════════════════════════════════════
   - block: markdown
     content:
       title: ""
       text: |
         <div style="max-width: 800px; margin: 0 auto; text-align: center; padding: 2rem 0;">
-        
+
         ## What is Twinkle?
-        
-        Twinkle is a **client-server LLM training framework** that separates *what you train* from *how you train*. 
-        
-        Write your training logic once with clean Python APIs. Then deploy it anywhere — locally with `torchrun`, 
+
+        Twinkle is a **client-server LLM training framework** that separates *what you train* from *how you train*.
+
+        Write your training logic once with clean Python APIs. Then deploy it anywhere — locally with `torchrun`,
         across Ray clusters, or as serverless Training-as-a-Service.
-        
-        Built by the [ms-swift](https://github.com/modelscope/ms-swift) team at **ModelScope**.
-        
+
         </div>
     design:
       columns: '1'
@@ -83,47 +81,52 @@ sections:
         padding: ["1rem", 0, "2rem", 0]
 
   # ═══════════════════════════════════════════════════════════════════════════
-  # CODE EXAMPLE - Show don't tell
+  # CODE EXAMPLE
   # ═══════════════════════════════════════════════════════════════════════════
   - block: markdown
     content:
       title: ""
       text: |
         <div style="max-width: 800px; margin: 0 auto;">
-        
+
         ## Train in 20 Lines
-        
+
+        ```bash
+        pip install 'twinkle-kit[ray]'
+        ```
+
         ```python
         import twinkle
-        from peft import LoraConfig
         from twinkle import DeviceGroup
         from twinkle.dataloader import DataLoader
         from twinkle.dataset import Dataset, DatasetMeta
         from twinkle.model import TransformersModel
-        
+
         # Choose your runtime: 'local' (torchrun), 'ray', or 'http'
         twinkle.initialize(mode='ray', groups=[DeviceGroup(name='default', ranks=8)])
-        
+
         # Prepare data — works with ModelScope and Hugging Face
         dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition'))
         dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B')
         dataset.encode()
-        
-        # Create model with LoRA
+
+        # Create model — full-parameter training by default
         model = TransformersModel(model_id='ms://Qwen/Qwen3.5-4B', remote_group='default')
-        model.add_adapter_to_model('default', LoraConfig(r=8, lora_alpha=32))
+        # Optional: add LoRA for parameter-efficient training
+        # from peft import LoraConfig
+        # model.add_adapter_to_model('default', LoraConfig(r=8, lora_alpha=32))
         model.set_optimizer(optimizer_cls='AdamW', lr=1e-4)
-        
+
         # Train — you control the loop
         for batch in DataLoader(dataset=dataset, batch_size=8):
             model.forward_backward(inputs=batch)
             model.clip_grad_and_step()
-        
+
         model.save('my-finetuned-model')
         ```
-        
+
         ### Or train via ModelScope TaaS — no GPU required
-        
+
         ```python
         import os
         from twinkle import init_tinker_client
@@ -131,37 +134,37 @@ sections:
         from twinkle.dataset import Dataset, DatasetMeta
         from twinkle.preprocessor import SelfCognitionProcessor
         from twinkle.server.common import input_feature_to_datum
-        
+
         # Use ModelScope's official TaaS endpoint — free, no local GPU needed
         base_url = 'https://www.modelscope.cn/twinkle'
         api_key = os.environ.get('MODELSCOPE_TOKEN')
         base_model = 'Qwen/Qwen3.6-27B'
-        
+
         # Prepare data locally
         dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition'))
         dataset.set_template('Qwen3_5Template', model_id=f'ms://{base_model}', max_length=256)
         dataset.map(SelfCognitionProcessor('My Model', 'My Team'))
         dataset.encode(batched=True)
-        
+
         # Connect to ModelScope TaaS
         init_tinker_client()
         from tinker import ServiceClient, types
-        
+
         service_client = ServiceClient(base_url=base_url, api_key=api_key)
         training_client = service_client.create_lora_training_client(
             base_model=base_model, rank=16
         )
-        
+
         # Train — same loop, running on ModelScope's cluster
         for batch in DataLoader(dataset=dataset, batch_size=8):
             training_client.forward_backward(
                 [input_feature_to_datum(f) for f in batch], 'cross_entropy'
             )
             training_client.optim_step(types.AdamParams(learning_rate=1e-4))
-        
+
         training_client.save_state('my-lora').result()
         ```
-        
+
         </div>
     design:
       columns: '1'
@@ -170,7 +173,7 @@ sections:
         padding: ["3rem", 0, "3rem", 0]
 
   # ═══════════════════════════════════════════════════════════════════════════
-  # ARCHITECTURE - Visual showcase
+  # ARCHITECTURE
   # ═══════════════════════════════════════════════════════════════════════════
   - block: markdown
     content:
@@ -179,15 +182,15 @@ sections:
         <div style="text-align: center; padding: 2rem 0;">
           <img src="framework.jpg" alt="Twinkle Architecture" style="max-width: 720px; width: 100%;" />
         </div>
-        
+
         <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(250px, 1fr)); gap: 2rem; max-width: 900px; margin: 2rem auto;">
           <div style="text-align: center;">
-            <h4 style="color: #6366f1; margin-bottom: 0.5rem;">🔌 Dual API</h4>
-            <p style="font-size: 0.9rem; opacity: 0.8;">Native Twinkle API for full features, or Tinker-compatible API for easy migration</p>
+            <h4 style="color: #6366f1; margin-bottom: 0.5rem;">🔌 Triple API</h4>
+            <p style="font-size: 0.9rem; opacity: 0.8;">OpenAI-compatible /chat/completions, native Twinkle API, or Tinker-compatible API</p>
           </div>
           <div style="text-align: center;">
             <h4 style="color: #6366f1; margin-bottom: 0.5rem;">🧩 Modular</h4>
-            <p style="font-size: 0.9rem; opacity: 0.8;">15+ components: Dataset, Template, Model, Sampler, Loss, Reward, Metric...</p>
+            <p style="font-size: 0.9rem; opacity: 0.8;">25+ components: Dataset, Template, Model, Sampler, Loss, Reward, Metric...</p>
           </div>
           <div style="text-align: center;">
             <h4 style="color: #6366f1; margin-bottom: 0.5rem;">🔀 Backend Agnostic</h4>
@@ -196,12 +199,11 @@ sections:
         </div>
     design:
       columns: '1'
-      css_class: "bg-gray-50"
       spacing:
-        padding: ["3rem", 0, "3rem", 0]
+        padding: ["2rem", 0, "2rem", 0]
 
   # ═══════════════════════════════════════════════════════════════════════════
-  # FEATURES - Why Twinkle
+  # FEATURES
   # ═══════════════════════════════════════════════════════════════════════════
   - block: features
     id: features
@@ -212,7 +214,7 @@ sections:
         - name: Scale Without Rewriting
           icon: arrow-trending-up
           description: |
-            Same code runs on your laptop and on thousand-GPU clusters. Switch from `torchrun` to Ray to HTTP deployment without changing your training logic.
+            Same interface runs on your laptop and on thousand-GPU clusters. Switch from `torchrun` to Ray to HTTP deployment without changing your training logic.
         - name: Multi-Tenancy Built-In
           icon: users
           description: |
@@ -220,7 +222,7 @@ sections:
         - name: You Own the Loop
           icon: code-bracket
           description: |
-            No hidden magic. See and control every forward, backward, and optimizer step. Debug freely, customize completely.
+            No hidden magic. See and control every forward, backward, and optimizer step. Compose algorithms freely, customize completely.
         - name: Training as a Service
           icon: cloud-arrow-up
           description: |
@@ -228,72 +230,81 @@ sections:
         - name: All Training Methods
           icon: academic-cap
           description: |
-            SFT, pre-training, GRPO, GKD, and more. Dense models and MoE architectures. Full FSDP, tensor parallelism, pipeline parallelism support.
+            SFT, pre-training, GRPO, DPO, GKD, and more. Dense models and MoE architectures. Full FSDP, tensor parallelism, pipeline parallelism support.
         - name: Broad Model Support
           icon: cpu-chip
           description: |
-            Qwen 3.6/3.5/3/2.5, DeepSeek R1/V2, GLM-4, InternLM2, and more. Both Hugging Face and ModelScope model hubs.
+            Qwen 3.6/3.5/3/2.5, DeepSeek R1/V4, Gemma 4, GLM-4, InternLM2, and more. Both Hugging Face and ModelScope model hubs.
     design:
       spacing:
-        padding: ["3rem", 0, "3rem", 0]
+        padding: ["2rem", 0, "2rem", 0]
 
   # ═══════════════════════════════════════════════════════════════════════════
-  # MULTI-TENANCY - Killer feature
+  # MULTI-TENANCY
   # ═══════════════════════════════════════════════════════════════════════════
   - block: markdown
     content:
       title: ""
       text: |
         <div style="max-width: 900px; margin: 0 auto;">
-        
+
         ## Multi-Tenancy: N Jobs, 1 Base Model
-        
+
         <div style="text-align: center; margin: 2rem 0;">
           <img src="multi_lora.png" alt="Multi-Tenancy" style="max-width: 500px; width: 100%; display: block; margin: 0 auto;" />
         </div>
-        
+
         Run completely different training jobs on a shared deployment:
-        
+
         | Tenant | Setup | Task |
         |:------:|-------|------|
-        | **A** | LoRA r=8, private data | SFT fine-tuning |
+        | **A** | Full-parameter, private data | SFT fine-tuning |
         | **B** | LoRA r=32, Hub dataset | Continued pre-training |
         | **C** | GRPO loss + Sampler | Reinforcement learning |
         | **D** | Inference mode | Log-prob computation |
-        
-        Each tenant is **fully isolated** — different optimizers, data pipelines, loss functions. 
+
+        Each tenant is **fully isolated** — different optimizers, data pipelines, loss functions.
         They only share the base model's compute. Checkpoints auto-sync to ModelScope or Hugging Face.
-        
+
         </div>
     design:
       columns: '1'
       spacing:
-        padding: ["3rem", 0, "3rem", 0]
+        padding: ["2rem", 0, "2rem", 0]
 
   # ═══════════════════════════════════════════════════════════════════════════
-  # SUPPORTED MODELS - Social proof
+  # SUPPORTED MODELS
   # ═══════════════════════════════════════════════════════════════════════════
   - block: markdown
     content:
       title: ""
       text: |
         <div style="text-align: center; padding: 2rem 0;">
-        
+
         ## Supported Models
-        
+
         <div style="margin: 1.5rem 0;">
           <span class="model-tag" style="background: linear-gradient(135deg, #ec4899 0%, #db2777 100%);">Qwen 3.6</span>
           <span class="model-tag" style="background: linear-gradient(135deg, #6366f1 0%, #4f46e5 100%);">Qwen 3.5</span>
-          <span class="model-tag" style="background: linear-gradient(135deg, #8b5cf6 0%, #7c3aed 100%);">Qwen MoE</span>
-          <span class="model-tag" style="background: linear-gradient(135deg, #3b82f6 0%, #2563eb 100%);">DeepSeek R1</span>
+          <span class="model-tag" style="background: linear-gradient(135deg, #8b5cf6 0%, #7c3aed 100%);">Qwen 2.5</span>
+          <span class="model-tag" style="background: linear-gradient(135deg, #3b82f6 0%, #2563eb 100%);">DeepSeek R1 / V4</span>
+          <span class="model-tag" style="background: linear-gradient(135deg, #f59e0b 0%, #d97706 100%);">LLaMA 3</span>
           <span class="model-tag" style="background: linear-gradient(135deg, #10b981 0%, #059669 100%);">GLM-4</span>
-          <span class="model-tag" style="background: linear-gradient(135deg, #14b8a6 0%, #0d9488 100%);">InternLM2</span>
+          <span class="model-tag" style="background: linear-gradient(135deg, #14b8a6 0%, #0d9488 100%);">InternLM 2.5</span>
+          <span class="model-tag" style="background: linear-gradient(135deg, #6b7280 0%, #4b5563 100%);">Mistral</span>
+          <span class="model-tag" style="background: linear-gradient(135deg, #06b6d4 0%, #0891b2 100%);">Yi</span>
         </div>
-        
+        <div style="margin: 1rem 0;">
+          <span class="model-tag" style="background: linear-gradient(135deg, #a855f7 0%, #9333ea 100%);">Qwen VL</span>
+          <span class="model-tag" style="background: linear-gradient(135deg, #64748b 0%, #475569 100%);">InternVL</span>
+          <span class="model-tag" style="background: linear-gradient(135deg, #78716c 0%, #57534e 100%);">Qwen Embedding</span>
+          <span class="model-tag" style="background: linear-gradient(135deg, #ef4444 0%, #dc2626 100%);">Gemma 4</span>
+        </div>
+
         <p style="opacity: 0.7; font-size: 0.9rem;">
-          Works with mainstream LLMs · NVIDIA · Ascend NPU · SFT / PT / GRPO / GKD
+          Works with mainstream LLMs & VLMs · NVIDIA · Ascend NPU · SFT / PT / GRPO / DPO / GKD / Embedding
         </p>
-        
+
         </div>
     design:
       columns: '1'
@@ -302,7 +313,40 @@ sections:
         padding: ["2rem", 0, "2rem", 0]
 
   # ═══════════════════════════════════════════════════════════════════════════
-  # CTA - Final push
+  # USER JOURNEY
+  # ═══════════════════════════════════════════════════════════════════════════
+  - block: markdown
+    content:
+      title: ""
+      text: |
+        <div style="max-width: 900px; margin: 0 auto; text-align: center; padding: 1rem 0;">
+
+        ## Get Started in 3 Steps
+
+        <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(250px, 1fr)); gap: 2rem; margin-top: 2rem; text-align: left;">
+          <div style="border: 1px solid #e2e8f0; border-radius: 12px; padding: 1.5rem;">
+            <h4 style="margin: 0 0 0.5rem 0;">1. Install</h4>
+            <code style="background: #f1f5f9; padding: 0.25rem 0.5rem; border-radius: 4px; font-size: 0.85rem;">pip install 'twinkle-kit[ray]'</code>
+            <p style="font-size: 0.85rem; opacity: 0.7; margin-top: 0.5rem;">30 seconds. Python 3.11+, PyTorch 2.7+.</p>
+          </div>
+          <div style="border: 1px solid #e2e8f0; border-radius: 12px; padding: 1.5rem;">
+            <h4 style="margin: 0 0 0.5rem 0;">2. Choose a Recipe</h4>
+            <p style="font-size: 0.85rem; opacity: 0.7;">Browse the <a href="/showcase/">Cookbook</a> — SFT, GRPO, DPO, GKD, Embedding, Multimodal, and more.</p>
+          </div>
+          <div style="border: 1px solid #e2e8f0; border-radius: 12px; padding: 1.5rem;">
+            <h4 style="margin: 0 0 0.5rem 0;">3. Train & Deploy</h4>
+            <p style="font-size: 0.85rem; opacity: 0.7;">Run locally with <code>torchrun</code>, scale with Ray, or use <a href="/blog/modelscope-taas/">TaaS</a> for zero-infra training.</p>
+          </div>
+        </div>
+
+        </div>
+    design:
+      columns: '1'
+      spacing:
+        padding: ["2rem", 0, "2rem", 0]
+
+  # ═══════════════════════════════════════════════════════════════════════════
+  # CTA
   # ═══════════════════════════════════════════════════════════════════════════
   - block: cta-card
     content:
diff --git a/content/_index.zh.md b/content/_index.zh.md
index cd21dd6..aa92f4b 100644
--- a/content/_index.zh.md
+++ b/content/_index.zh.md
@@ -4,7 +4,7 @@ date: 2026-02-10
 type: landing
 
 design:
-  spacing: "3rem"
+  spacing: "2rem"
 
 sections:
   # ═══════════════════════════════════════════════════════════════════════════
@@ -14,23 +14,24 @@ sections:
     content:
       title: '<span class="hero-title-with-logo"><img src="../slogan.png" alt="Twinkle" class="hero-logo" /></span>'
       text: |
-        <p style="font-size: 1.5rem; font-weight: 500; margin-bottom: 0.5rem;">让你的模型闪闪发光 ✨</p>
-        <p style="font-size: 1.1rem; color: #64748b;">一个框架，任意规模。从笔记本到千卡集群。</p>
+        <p style="font-size: 1.5rem; font-weight: 500; margin-bottom: 0.5rem;">让你的模型闪闪发光的训练工作台 ✨</p>
+        <p style="font-size: 1.1rem; color: #64748b;">一套框架，任意规模。从笔记本到千卡集群。</p>
+        <p style="margin-top: 1rem;"><a href="https://www.modelscope.cn/organization/twinkle-kit" style="color: #624aff; text-decoration: none; font-weight: 500;">ModelScope 组织主页 →</a></p>
       primary_action:
         text: 快速开始
         url: docs/getting-started/
         icon: rocket-launch
       secondary_action:
-        text: 查看源码
-        url: https://github.com/modelscope/twinkle
+        text: 查看文档
+        url: docs/
       announcement:
-        text: "🚀 v0.2.0 — DPO、GKD 与 On-policy 蒸馏、Qwen3.5 多模态训练"
+        text: "🚀 v0.4.0 — DeepSeek V4、Gemma 4、Qwen3.5 MoE GatedDeltaNet、EP LoRA 与 NPU 加速"
         link:
           text: "查看更新 →"
-          url: "https://github.com/modelscope/twinkle/releases/tag/v0.2.0"
+          url: "https://github.com/modelscope/twinkle/releases/tag/v0.4.0"
     design:
       spacing:
-        padding: ["5rem", 0, "3rem", 0]
+        padding: ["3rem", 0, "2rem", 0]
 
   # ═══════════════════════════════════════════════════════════════════════════
   # STATS
@@ -38,25 +39,24 @@ sections:
   - block: stats
     content:
       items:
-        - statistic: "全"
+        - statistic: "全模态"
           description: |
             主流模型
             LLM · VLM · MoE
-        - statistic: "3"
+        - statistic: "3 运行模式"
           description: |
-            运行模式
-            本地 · Ray · HTTP
-        - statistic: "∞"
+            本地 · Ray · Client
+        - statistic: "服务化"
           description: |
-            多租户
-            并行 LoRA 训练
+            训练即服务
+            云原生 · 多租户
         - statistic: "<5分钟"
           description: |
             上手时间
             pip install 即用
     design:
       spacing:
-        padding: ["2rem", 0, "2rem", 0]
+        padding: ["1rem", 0, "1rem", 0]
 
   # ═══════════════════════════════════════════════════════════════════════════
   # WHAT IS TWINKLE
@@ -66,16 +66,14 @@ sections:
       title: ""
       text: |
         <div style="max-width: 800px; margin: 0 auto; text-align: center; padding: 2rem 0;">
-        
+
         ## 什么是 Twinkle？
-        
+
         Twinkle 是一个 **客户端-服务端 LLM 训练框架**，将*训练什么*与*如何训练*分离。
-        
+
         使用简洁的 Python API 编写训练逻辑，然后部署到任何地方 —— 本地 `torchrun`、
         Ray 集群，或无服务器 Training-as-a-Service。
-        
-        由 **ModelScope** 的 [ms-swift](https://github.com/modelscope/ms-swift) 团队构建。
-        
+
         </div>
     design:
       columns: '1'
@@ -90,40 +88,45 @@ sections:
       title: ""
       text: |
         <div style="max-width: 800px; margin: 0 auto;">
-        
+
         ## 20 行代码开始训练
-        
+
+        ```bash
+        pip install 'twinkle-kit[ray]'
+        ```
+
         ```python
         import twinkle
-        from peft import LoraConfig
         from twinkle import DeviceGroup
         from twinkle.dataloader import DataLoader
         from twinkle.dataset import Dataset, DatasetMeta
         from twinkle.model import TransformersModel
-        
+
         # 选择运行模式: 'local' (torchrun), 'ray', 或 'http'
         twinkle.initialize(mode='ray', groups=[DeviceGroup(name='default', ranks=8)])
-        
+
         # 准备数据 — 支持魔搭和 Hugging Face
         dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition'))
         dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B')
         dataset.encode()
-        
-        # 创建带 LoRA 的模型
+
+        # 创建模型 — 默认全参数训练
         model = TransformersModel(model_id='ms://Qwen/Qwen3.5-4B', remote_group='default')
-        model.add_adapter_to_model('default', LoraConfig(r=8, lora_alpha=32))
+        # 可选：添加 LoRA 进行参数高效训练
+        # from peft import LoraConfig
+        # model.add_adapter_to_model('default', LoraConfig(r=8, lora_alpha=32))
         model.set_optimizer(optimizer_cls='AdamW', lr=1e-4)
-        
+
         # 训练 — 你掌控循环
         for batch in DataLoader(dataset=dataset, batch_size=8):
             model.forward_backward(inputs=batch)
             model.clip_grad_and_step()
-        
+
         model.save('my-finetuned-model')
         ```
-        
+
         ### 或通过魔搭 TaaS 训练 — 无需本地 GPU
-        
+
         ```python
         import os
         from twinkle import init_tinker_client
@@ -131,37 +134,37 @@ sections:
         from twinkle.dataset import Dataset, DatasetMeta
         from twinkle.preprocessor import SelfCognitionProcessor
         from twinkle.server.common import input_feature_to_datum
-        
+
         # 使用魔搭社区官方 TaaS 端点 — 免费，无需本地 GPU
         base_url = 'https://www.modelscope.cn/twinkle'
         api_key = os.environ.get('MODELSCOPE_TOKEN')
         base_model = 'Qwen/Qwen3.6-27B'
-        
+
         # 本地准备数据
         dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition'))
         dataset.set_template('Qwen3_5Template', model_id=f'ms://{base_model}', max_length=256)
         dataset.map(SelfCognitionProcessor('My Model', 'My Team'))
         dataset.encode(batched=True)
-        
+
         # 连接魔搭 TaaS 服务
         init_tinker_client()
         from tinker import ServiceClient, types
-        
+
         service_client = ServiceClient(base_url=base_url, api_key=api_key)
         training_client = service_client.create_lora_training_client(
             base_model=base_model, rank=16
         )
-        
+
         # 训练 — 相同的循环，运行在魔搭集群上
         for batch in DataLoader(dataset=dataset, batch_size=8):
             training_client.forward_backward(
                 [input_feature_to_datum(f) for f in batch], 'cross_entropy'
             )
             training_client.optim_step(types.AdamParams(learning_rate=1e-4))
-        
+
         training_client.save_state('my-lora').result()
         ```
-        
+
         </div>
     design:
       columns: '1'
@@ -179,15 +182,15 @@ sections:
         <div style="text-align: center; padding: 2rem 0;">
           <img src="../framework.jpg" alt="Twinkle 架构" style="max-width: 720px; width: 100%;" />
         </div>
-        
+
         <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(250px, 1fr)); gap: 2rem; max-width: 900px; margin: 2rem auto;">
           <div style="text-align: center;">
-            <h4 style="color: #6366f1; margin-bottom: 0.5rem;">🔌 双 API</h4>
-            <p style="font-size: 0.9rem; opacity: 0.8;">原生 Twinkle API 功能完整，Tinker 兼容 API 便于迁移</p>
+            <h4 style="color: #6366f1; margin-bottom: 0.5rem;">🔌 三套 API</h4>
+            <p style="font-size: 0.9rem; opacity: 0.8;">OpenAI 兼容 /chat/completions、原生 Twinkle API、Tinker 兼容 API</p>
           </div>
           <div style="text-align: center;">
             <h4 style="color: #6366f1; margin-bottom: 0.5rem;">🧩 模块化</h4>
-            <p style="font-size: 0.9rem; opacity: 0.8;">15+ 组件：Dataset、Template、Model、Sampler、Loss、Reward、Metric...</p>
+            <p style="font-size: 0.9rem; opacity: 0.8;">25+ 组件：Dataset、Template、Model、Sampler、Loss、Reward、Metric...</p>
           </div>
           <div style="text-align: center;">
             <h4 style="color: #6366f1; margin-bottom: 0.5rem;">🔀 后端无关</h4>
@@ -196,9 +199,8 @@ sections:
         </div>
     design:
       columns: '1'
-      css_class: "bg-gray-50"
       spacing:
-        padding: ["3rem", 0, "3rem", 0]
+        padding: ["2rem", 0, "2rem", 0]
 
   # ═══════════════════════════════════════════════════════════════════════════
   # FEATURES
@@ -212,7 +214,7 @@ sections:
         - name: 无需重写即可扩展
           icon: arrow-trending-up
           description: |
-            相同代码运行在笔记本和千卡集群。从 `torchrun` 切换到 Ray 或 HTTP 部署，无需修改训练逻辑。
+            同一份接口运行在笔记本和千卡集群。从 `torchrun` 切换到 Ray 或 HTTP 部署，无需修改训练逻辑。
         - name: 内置多租户
           icon: users
           description: |
@@ -220,7 +222,7 @@ sections:
         - name: 你掌控训练循环
           icon: code-bracket
           description: |
-            没有隐藏的魔法。查看和控制每一个 forward、backward 和优化器步骤。自由调试，完全定制。
+            没有隐藏的魔法。查看和控制每一个 forward、backward 和优化器步骤。自由组合算法，完全定制。
         - name: 训练即服务
           icon: cloud-arrow-up
           description: |
@@ -228,14 +230,14 @@ sections:
         - name: 全训练方法
           icon: academic-cap
           description: |
-            SFT、预训练、GRPO、GKD 等。稠密模型和 MoE 架构。完整的 FSDP、张量并行、流水线并行支持。
+            SFT、预训练、GRPO、DPO、GKD 等。稠密模型和 MoE 架构。完整的 FSDP、张量并行、流水线并行支持。
         - name: 广泛的模型支持
           icon: cpu-chip
           description: |
-            Qwen 3.6/3.5/3/2.5、DeepSeek R1/V2、GLM-4、InternLM2 等。同时支持 Hugging Face 和魔搭模型库。
+            Qwen 3.6/3.5/3/2.5、DeepSeek R1/V4、Gemma 4、GLM-4、InternLM2 等。同时支持 Hugging Face 和魔搭模型库。
     design:
       spacing:
-        padding: ["3rem", 0, "3rem", 0]
+        padding: ["2rem", 0, "2rem", 0]
 
   # ═══════════════════════════════════════════════════════════════════════════
   # MULTI-TENANCY
@@ -245,30 +247,30 @@ sections:
       title: ""
       text: |
         <div style="max-width: 900px; margin: 0 auto;">
-        
+
         ## 多租户：N 个任务，1 个基座模型
-        
+
         <div style="text-align: center; margin: 2rem 0;">
           <img src="../multi_lora.png" alt="多租户" style="max-width: 500px; width: 100%; display: block; margin: 0 auto;" />
         </div>
-        
+
         在共享部署上运行完全不同的训练任务：
-        
+
         | 租户 | 配置 | 任务 |
         |:---:|------|-----|
-        | **A** | LoRA r=8, 私有数据 | SFT 微调 |
+        | **A** | 全参数, 私有数据 | SFT 微调 |
         | **B** | LoRA r=32, Hub 数据集 | 增量预训练 |
         | **C** | GRPO 损失 + Sampler | 强化学习 |
         | **D** | 推理模式 | 对数概率计算 |
-        
+
         每个租户**完全隔离** —— 不同的优化器、数据流水线、损失函数。
         只共享基座模型的算力。检查点自动同步到魔搭或 Hugging Face。
-        
+
         </div>
     design:
       columns: '1'
       spacing:
-        padding: ["3rem", 0, "3rem", 0]
+        padding: ["2rem", 0, "2rem", 0]
 
   # ═══════════════════════════════════════════════════════════════════════════
   # SUPPORTED MODELS
@@ -278,22 +280,31 @@ sections:
       title: ""
       text: |
         <div style="text-align: center; padding: 2rem 0;">
-        
+
         ## 支持的模型
-        
+
         <div style="margin: 1.5rem 0;">
           <span class="model-tag" style="background: linear-gradient(135deg, #ec4899 0%, #db2777 100%);">Qwen 3.6</span>
           <span class="model-tag" style="background: linear-gradient(135deg, #6366f1 0%, #4f46e5 100%);">Qwen 3.5</span>
-          <span class="model-tag" style="background: linear-gradient(135deg, #8b5cf6 0%, #7c3aed 100%);">Qwen MoE</span>
-          <span class="model-tag" style="background: linear-gradient(135deg, #3b82f6 0%, #2563eb 100%);">DeepSeek R1</span>
+          <span class="model-tag" style="background: linear-gradient(135deg, #8b5cf6 0%, #7c3aed 100%);">Qwen 2.5</span>
+          <span class="model-tag" style="background: linear-gradient(135deg, #3b82f6 0%, #2563eb 100%);">DeepSeek R1 / V4</span>
+          <span class="model-tag" style="background: linear-gradient(135deg, #f59e0b 0%, #d97706 100%);">LLaMA 3</span>
           <span class="model-tag" style="background: linear-gradient(135deg, #10b981 0%, #059669 100%);">GLM-4</span>
-          <span class="model-tag" style="background: linear-gradient(135deg, #14b8a6 0%, #0d9488 100%);">InternLM2</span>
+          <span class="model-tag" style="background: linear-gradient(135deg, #14b8a6 0%, #0d9488 100%);">InternLM 2.5</span>
+          <span class="model-tag" style="background: linear-gradient(135deg, #6b7280 0%, #4b5563 100%);">Mistral</span>
+          <span class="model-tag" style="background: linear-gradient(135deg, #06b6d4 0%, #0891b2 100%);">Yi</span>
         </div>
-        
+        <div style="margin: 1rem 0;">
+          <span class="model-tag" style="background: linear-gradient(135deg, #a855f7 0%, #9333ea 100%);">Qwen VL</span>
+          <span class="model-tag" style="background: linear-gradient(135deg, #64748b 0%, #475569 100%);">InternVL</span>
+          <span class="model-tag" style="background: linear-gradient(135deg, #78716c 0%, #57534e 100%);">Qwen Embedding</span>
+          <span class="model-tag" style="background: linear-gradient(135deg, #ef4444 0%, #dc2626 100%);">Gemma 4</span>
+        </div>
+
         <p style="opacity: 0.7; font-size: 0.9rem;">
-          支持主流 LLM · NVIDIA · 昇腾 NPU · SFT / PT / GRPO / GKD
+          支持主流 LLM 与 VLM · NVIDIA · 昇腾 NPU · SFT / PT / GRPO / DPO / GKD / Embedding
         </p>
-        
+
         </div>
     design:
       columns: '1'
@@ -301,6 +312,39 @@ sections:
       spacing:
         padding: ["2rem", 0, "2rem", 0]
 
+  # ═══════════════════════════════════════════════════════════════════════════
+  # USER JOURNEY
+  # ═══════════════════════════════════════════════════════════════════════════
+  - block: markdown
+    content:
+      title: ""
+      text: |
+        <div style="max-width: 900px; margin: 0 auto; text-align: center; padding: 1rem 0;">
+
+        ## 三步上手
+
+        <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(250px, 1fr)); gap: 2rem; margin-top: 2rem; text-align: left;">
+          <div style="border: 1px solid #e2e8f0; border-radius: 12px; padding: 1.5rem;">
+            <h4 style="margin: 0 0 0.5rem 0;">1. 安装</h4>
+            <code style="background: #f1f5f9; padding: 0.25rem 0.5rem; border-radius: 4px; font-size: 0.85rem;">pip install 'twinkle-kit[ray]'</code>
+            <p style="font-size: 0.85rem; opacity: 0.7; margin-top: 0.5rem;">30 秒完成。Python 3.11+，PyTorch 2.7+。</p>
+          </div>
+          <div style="border: 1px solid #e2e8f0; border-radius: 12px; padding: 1.5rem;">
+            <h4 style="margin: 0 0 0.5rem 0;">2. 选择配方</h4>
+            <p style="font-size: 0.85rem; opacity: 0.7;">浏览 <a href="/zh/showcase/">Cookbook</a> — SFT、GRPO、DPO、GKD、Embedding、多模态等。</p>
+          </div>
+          <div style="border: 1px solid #e2e8f0; border-radius: 12px; padding: 1.5rem;">
+            <h4 style="margin: 0 0 0.5rem 0;">3. 训练与部署</h4>
+            <p style="font-size: 0.85rem; opacity: 0.7;">本地 <code>torchrun</code> 运行，Ray 集群扩展，或使用 <a href="/zh/blog/modelscope-taas/">TaaS</a> 零基础设施训练。</p>
+          </div>
+        </div>
+
+        </div>
+    design:
+      columns: '1'
+      spacing:
+        padding: ["2rem", 0, "2rem", 0]
+
   # ═══════════════════════════════════════════════════════════════════════════
   # CTA
   # ═══════════════════════════════════════════════════════════════════════════
diff --git a/content/authors/_index.md b/content/authors/_index.md
deleted file mode 100644
index 853f3ed..0000000
--- a/content/authors/_index.md
+++ /dev/null
@@ -1,9 +0,0 @@
----
-# To publish author profile pages, remove all the `build` and `cascade` settings below.
-build:
-  render: never
-cascade:
-  build:
-    render: never
-    list: always
----
diff --git a/content/blog/modelscope-taas/index.md b/content/blog/modelscope-taas/index.md
index 9372c08..eadf5f3 100644
--- a/content/blog/modelscope-taas/index.md
+++ b/content/blog/modelscope-taas/index.md
@@ -185,3 +185,5 @@ After training, you can:
 3. Start training!
 
 Questions? Open an issue on [GitHub](https://github.com/modelscope/twinkle/issues) or join our WeChat group.
+
+> **Clarification**: TaaS currently offers LoRA-based training as a managed service. The Twinkle framework itself supports both **full-parameter** and **LoRA** training when running locally or on your own cluster. See the [Cookbook](/showcase/) for full-parameter examples.
diff --git a/content/blog/modelscope-taas/index.zh.md b/content/blog/modelscope-taas/index.zh.md
index 695612d..b02eb72 100644
--- a/content/blog/modelscope-taas/index.zh.md
+++ b/content/blog/modelscope-taas/index.zh.md
@@ -185,3 +185,5 @@ for i, seq in enumerate(result.sequences):
 3. 开始训练！
 
 有问题？在 [GitHub](https://github.com/modelscope/twinkle/issues) 上提 issue 或加入我们的微信群。
+
+> **说明**：TaaS 目前提供基于 LoRA 的托管训练服务。Twinkle 框架本身在本地或自建集群上同时支持**全参数训练**和 **LoRA 训练**。全参数训练示例请参见 [Cookbook](/zh/showcase/)。
diff --git a/content/blog/multi-lora/index.md b/content/blog/multi-lora/index.md
new file mode 100644
index 0000000..9817c03
--- /dev/null
+++ b/content/blog/multi-lora/index.md
@@ -0,0 +1,116 @@
+---
+title: "Multi-LoRA: Concurrent Multi-Tenant Training on Shared GPUs"
+date: 2026-06-01
+tags:
+  - Multi-LoRA
+  - Multi-Tenant
+  - LoRA
+  - FSDP
+  - Megatron
+categories:
+  - Technical Deep Dive
+---
+
+Twinkle's Multi-LoRA architecture enables multiple tenants to train independent LoRA adapters on a **single shared model** simultaneously. This post explains the technical design, covering both the Transformers and Megatron backends.
+
+<!--more-->
+
+## Why Multi-LoRA?
+
+Traditional LoRA training loads a full base model per user. For a 70B model this means ~140 GB of GPU memory per tenant — an enormous waste when the frozen base weights are identical across all users. Multi-LoRA solves this by:
+
+- **Sharing the base model**: All tenants share one copy of frozen base weights.
+- **Pre-allocating adapter slots**: A fixed pool of LoRA adapter slots (`max_loras × max_r`) is allocated at initialization, avoiding runtime memory fragmentation.
+- **Dynamic tenant switching**: Tenants acquire/release adapters on-the-fly with near-zero context-switch overhead.
+
+## Architecture Overview
+
+```
+┌──────────────────────────────────────────┐
+│           Shared Base Model              │
+│  (Frozen weights, loaded once)           │
+├──────────────────────────────────────────┤
+│         MultiLora Manager                │
+│  ┌────────┐ ┌────────┐ ┌────────┐       │
+│  │ Slot 0 │ │ Slot 1 │ │ Slot 2 │ ...   │
+│  │Tenant A│ │Tenant B│ │  Free  │       │
+│  └────────┘ └────────┘ └────────┘       │
+├──────────────────────────────────────────┤
+│  Per-Tenant: Optimizer, LR Scheduler,    │
+│  Template, Gradient Accumulation         │
+└──────────────────────────────────────────┘
+```
+
+The `MultiLora` class manages the lifecycle:
+
+1. **`patch(model)`** — Patches every `LoraLayer` forward method to iterate over active adapters, applying LoRA weights with proper scaling.
+2. **`acquire_lora(tenant, config)`** — Assigns a pre-allocated slot to a tenant with the given `LoraConfig`.
+3. **`adapter(name)`** — Context manager that activates a specific adapter for forward/backward passes.
+4. **`release_lora(tenant)`** — Restores initial weights and returns the slot to the free pool.
+
+## Transformers Backend
+
+`MultiLoraTransformersModel` wraps the standard `TransformersModel` with per-adapter isolation:
+
+```python
+model = MultiLoraTransformersModel(model_id='Qwen/Qwen3.5-72B', max_loras=5)
+
+# Tenant A registers their adapter
+model.add_adapter_to_model('tenant_a', LoraConfig(r=16, target_modules='all-linear'))
+model.set_optimizer(optimizer_cls=Adam, lr=1e-4, adapter_name='tenant_a')
+
+# Tenant B registers independently
+model.add_adapter_to_model('tenant_b', LoraConfig(r=8, target_modules='all-linear'))
+model.set_optimizer(optimizer_cls=Adam, lr=2e-4, adapter_name='tenant_b')
+
+# Each tenant trains independently — gradients are isolated
+model.forward_backward(inputs=batch_a, adapter_name='tenant_a')
+model.clip_grad_and_step(adapter_name='tenant_a')
+```
+
+Key design choices:
+
+- **Optimizer Groups**: Each adapter has its own optimizer, LR scheduler, and gradient accumulation settings stored in an `OptimizerGroup`.
+- **Context-switched forward**: Every `forward_backward`, `step`, and `zero_grad` call is wrapped with `self.multi_adapter.adapter(name)` to ensure gradient isolation.
+- **Independent checkpointing**: `save()` extracts only the active adapter's state dict, so tenants never see each other's weights.
+
+## Megatron Backend
+
+`MultiLoraMegatronModel` extends Megatron's tensor/pipeline parallel training with multi-tenant support. The key challenge is that Megatron uses a **distributed optimizer** that sees all parameters — but we need per-adapter gradient isolation.
+
+The solution is an **`optimizer_context`** context manager that temporarily replaces `named_parameters()` on each pipeline-parallel module, filtering it to yield only the parameters that match the active adapter's regex pattern:
+
+```python
+@contextmanager
+def optimizer_context(self, adapter_name: str):
+    pattern = re.compile(rf'\.lora_\w+\.{re.escape(adapter_name)}\.')
+    for module in self.model:
+        orig = module.named_parameters
+        module.named_parameters = make_filtered(orig, pattern)
+    yield
+    # restore original named_parameters
+```
+
+This ensures the optimizer only updates the target adapter's LoRA weights, even in a distributed setting with TP/PP sharding.
+
+Additional Megatron-specific features:
+
+- **Per-rank optimizer checkpointing**: Each rank saves its own optimizer state, enabling efficient multi-GPU resume.
+- **HF + Megatron format export**: Save adapters in either HuggingFace PEFT format or native Megatron format.
+- **RNG state isolation**: Global RNG is intentionally *not* restored when loading a tenant checkpoint to avoid silently affecting other active tenants' dropout behavior.
+
+## Performance
+
+By sharing base model weights across tenants, Multi-LoRA reduces GPU memory usage proportionally:
+
+| Tenants | Traditional (N × full model) | Multi-LoRA (1 model + N adapters) |
+|---------|------------------------------|-----------------------------------|
+| 1       | 140 GB                       | 140 GB + 0.1 GB                   |
+| 5       | 700 GB                       | 140 GB + 0.5 GB                   |
+| 10      | 1400 GB                      | 140 GB + 1.0 GB                   |
+
+*Estimates for a 70B model with LoRA r=16.*
+
+## Getting Started
+
+See the [Multi-LoRA DPO Cookbook](https://github.com/modelscope/twinkle/blob/main/cookbook/rl/dpo/dpo_multi_lora.py) for a complete example.
diff --git a/content/blog/multi-lora/index.zh.md b/content/blog/multi-lora/index.zh.md
new file mode 100644
index 0000000..33e5f2d
--- /dev/null
+++ b/content/blog/multi-lora/index.zh.md
@@ -0,0 +1,116 @@
+---
+title: "Multi-LoRA：共享 GPU 上的多租户并行训练"
+date: 2026-06-01
+tags:
+  - Multi-LoRA
+  - 多租户
+  - LoRA
+  - FSDP
+  - Megatron
+categories:
+  - 技术深度解析
+---
+
+Twinkle 的 Multi-LoRA 架构支持多个租户在**同一份共享模型**上同时训练各自独立的 LoRA 适配器。本文介绍其技术方案，涵盖 Transformers 和 Megatron 两种后端。
+
+<!--more-->
+
+## 为什么需要 Multi-LoRA？
+
+传统 LoRA 训练中，每个用户都要加载一份完整的基座模型。对于 70B 模型，这意味着每个租户占用 ~140 GB 显存——当所有用户的冻结基座权重完全相同时，这是巨大的浪费。Multi-LoRA 的解决思路：
+
+- **共享基座模型**：所有租户共享一份冻结的基座权重。
+- **预分配适配器槽位**：初始化时分配固定的 LoRA 适配器槽位池（`max_loras × max_r`），避免运行时内存碎片化。
+- **动态租户切换**：租户可以即时获取/释放适配器，上下文切换开销接近零。
+
+## 架构概览
+
+```
+┌──────────────────────────────────────────┐
+│           共享基座模型                     │
+│  (冻结权重，仅加载一次)                    │
+├──────────────────────────────────────────┤
+│         MultiLora 管理器                  │
+│  ┌────────┐ ┌────────┐ ┌────────┐       │
+│  │ 槽位 0 │ │ 槽位 1 │ │ 槽位 2 │ ...   │
+│  │ 租户 A │ │ 租户 B │ │  空闲  │       │
+│  └────────┘ └────────┘ └────────┘       │
+├──────────────────────────────────────────┤
+│  每租户独立：优化器、学习率调度器、          │
+│  模板、梯度累积                           │
+└──────────────────────────────────────────┘
+```
+
+`MultiLora` 类管理完整生命周期：
+
+1. **`patch(model)`** — 为每个 `LoraLayer` 的 forward 方法打补丁，使其遍历所有活跃适配器并施加 LoRA 权重。
+2. **`acquire_lora(tenant, config)`** — 从预分配池中为租户分配一个槽位。
+3. **`adapter(name)`** — 上下文管理器，在 forward/backward 期间激活指定适配器。
+4. **`release_lora(tenant)`** — 恢复初始权重，将槽位归还空闲池。
+
+## Transformers 后端
+
+`MultiLoraTransformersModel` 在标准 `TransformersModel` 之上实现了逐适配器隔离：
+
+```python
+model = MultiLoraTransformersModel(model_id='Qwen/Qwen3.5-72B', max_loras=5)
+
+# 租户 A 注册适配器
+model.add_adapter_to_model('tenant_a', LoraConfig(r=16, target_modules='all-linear'))
+model.set_optimizer(optimizer_cls=Adam, lr=1e-4, adapter_name='tenant_a')
+
+# 租户 B 独立注册
+model.add_adapter_to_model('tenant_b', LoraConfig(r=8, target_modules='all-linear'))
+model.set_optimizer(optimizer_cls=Adam, lr=2e-4, adapter_name='tenant_b')
+
+# 各租户独立训练——梯度隔离
+model.forward_backward(inputs=batch_a, adapter_name='tenant_a')
+model.clip_grad_and_step(adapter_name='tenant_a')
+```
+
+核心设计：
+
+- **优化器组**：每个适配器拥有独立的优化器、学习率调度器和梯度累积配置，存储在 `OptimizerGroup` 中。
+- **上下文切换 forward**：所有 `forward_backward`、`step`、`zero_grad` 调用都被 `self.multi_adapter.adapter(name)` 包裹，确保梯度隔离。
+- **独立 checkpoint**：`save()` 仅提取当前活跃适配器的状态字典，租户之间互不可见。
+
+## Megatron 后端
+
+`MultiLoraMegatronModel` 在 Megatron 张量/流水线并行训练的基础上支持多租户。核心挑战在于 Megatron 使用**分布式优化器**，它能看到所有参数——但我们需要按适配器隔离梯度。
+
+解决方案：**`optimizer_context` 上下文管理器**，临时替换每个流水线并行模块上的 `named_parameters()`，使其仅返回匹配当前活跃适配器正则模式的参数：
+
+```python
+@contextmanager
+def optimizer_context(self, adapter_name: str):
+    pattern = re.compile(rf'\.lora_\w+\.{re.escape(adapter_name)}\.')
+    for module in self.model:
+        orig = module.named_parameters
+        module.named_parameters = make_filtered(orig, pattern)
+    yield
+    # 恢复原始 named_parameters
+```
+
+这确保了即使在 TP/PP 分片的分布式环境中，优化器也只更新目标适配器的 LoRA 权重。
+
+Megatron 特有功能：
+
+- **按 rank 保存优化器状态**：每个 rank 独立保存优化器状态，高效支持多 GPU 恢复。
+- **HF + Megatron 双格式导出**：支持以 HuggingFace PEFT 格式或原生 Megatron 格式保存适配器。
+- **RNG 状态隔离**：加载租户 checkpoint 时，全局 RNG 故意*不*恢复，以避免影响其他活跃租户的 dropout 行为。
+
+## 性能对比
+
+通过跨租户共享基座模型权重，Multi-LoRA 按比例降低显存使用：
+
+| 租户数 | 传统方式（N × 完整模型） | Multi-LoRA（1 模型 + N 适配器） |
+|--------|--------------------------|--------------------------------|
+| 1      | 140 GB                   | 140 GB + 0.1 GB                |
+| 5      | 700 GB                   | 140 GB + 0.5 GB                |
+| 10     | 1400 GB                  | 140 GB + 1.0 GB                |
+
+*基于 70B 模型、LoRA r=16 的估算。*
+
+## 快速开始
+
+完整示例请参考 [Multi-LoRA DPO Cookbook](https://github.com/modelscope/twinkle/blob/main/cookbook/rl/dpo/dpo_multi_lora.py)。
diff --git a/content/blog/npu-support/index.md b/content/blog/npu-support/index.md
new file mode 100644
index 0000000..d7dde9b
--- /dev/null
+++ b/content/blog/npu-support/index.md
@@ -0,0 +1,110 @@
+---
+title: "Ascend NPU Support: Fused Operators and Flash Linear Attention"
+date: 2026-06-05
+tags:
+  - NPU
+  - Ascend
+  - Domestic Hardware
+  - Kernel Optimization
+  - MoE
+categories:
+  - Technical Deep Dive
+---
+
+Twinkle provides first-class support for **Huawei Ascend NPU** through a comprehensive monkey-patching system that replaces standard CUDA operators with NPU-optimized fused kernels. This post covers the kernel architecture and the optimizations enabled.
+
+<!--more-->
+
+## Kernel Architecture
+
+Twinkle's kernel module (`twinkle.kernel`) provides a unified entry point `kernelize_model()` that automatically detects the device and applies appropriate optimizations:
+
+```python
+from twinkle.kernel import kernelize_model
+model = kernelize_model(model, device='npu')  # or auto-detected
+```
+
+On NPU devices, the following optimizations are applied **by default**, except MoE GMM, which is opt-in via `TWINKLE_NPU_GMM_PATCH` (see below):
+
+| Operator | NPU Implementation | Benefit |
+|----------|-------------------|---------|
+| RMSNorm | `torch_npu.npu_rms_norm` | Fused normalization, ~2x faster |
+| RoPE | `torch_npu.npu_rotary_mul` | Fused rotary embedding with partial RoPE support |
+| SwiGLU | `torch_npu.npu_swiglu` | Fused gate+up projection activation |
+| SDPA | NPU-compatible `scaled_dot_product_attention` | Correct mask handling for NPU |
+| MoE GMM | `torch_npu.npu_grouped_matmul` | EP-aware grouped matrix multiply |
+| FLA | MindSpeed Triton backend | Flash Linear Attention for Qwen3.5 |
+
+## Fused Operators in Detail
+
+### RMSNorm with Residual Parameterization
+
+Twinkle's `NpuRMSNorm` detects the **residual parameterization** pattern used by Qwen3.5 (where `scale = 1.0 + weight`) at initialization time, avoiding CPU-synchronizing `Tensor.item()` calls in the hot path:
+
+```python
+class NpuRMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        # Detect once at init
+        self._residual_param = abs(self.weight.data.mean().item()) < 0.3
+
+    def forward(self, hidden_states):
+        scale = (1.0 + self.weight) if self._residual_param else self.weight
+        return torch_npu.npu_rms_norm(hidden_states, scale, epsilon=self.eps)[0]
+```
+
+### EP-Aware MoE Optimization
+
+The MoE grouped matmul patch is **EP-aware** — it only activates when Expert Parallelism is enabled (each rank holds a subset of experts, weights are small and contiguous). Without EP, each rank holds **all** experts, and the transpose+contiguous copy creates ~8x overhead:
+
+```
+TWINKLE_NPU_GMM_PATCH not set → skip (default safe)
+TWINKLE_NPU_GMM_PATCH=1 + EP enabled  → apply (efficient)
+TWINKLE_NPU_GMM_PATCH=1 + EP disabled → skip (avoid 8x overhead)
+```
+
+The `GmmFunction` autograd function wraps `torch_npu.npu_grouped_matmul` with full backward support, and weights are cached with automatic invalidation when updated (full-param training bumps `_version`, LoRA keeps it stable).
+
+### Flash Linear Attention for Qwen3.5
+
+Qwen3.5 introduces a hybrid architecture mixing standard attention with linear attention layers. Twinkle enables the **FLA fast path** on NPU via MindSpeed's Triton implementation of `chunk_gated_delta_rule`:
+
+1. Force `is_flash_linear_attention_available = True` in transformers
+2. Replace `chunk_gated_delta_rule` with MindSpeed NPU-compatible implementation
+3. Traverse instantiated model to patch per-layer instances
+4. Disable CUDA-only `FusedRMSNormGated` that would fail on NPU
+
+The MindSpeed implementation provides chunked forward/backward with WY representation, supporting variable-length sequences via `cu_seqlens`.
+
+## Environment Variable Control
+
+Every optimization is independently controllable:
+
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `TWINKLE_NPU_PATCH` | `1` | Master switch for all NPU patches |
+| `TWINKLE_NPU_FUSED_OPS` | `1` | Fused operators (RMSNorm/RoPE/SwiGLU/SDPA) |
+| `TWINKLE_NPU_GMM_PATCH` | unset | MoE grouped matmul (EP-aware) |
+| `TWINKLE_NPU_FLA` | `1` | Flash Linear Attention |
+| `TWINKLE_NPU_GATED_RMSNorm_FP32` | `0` | Force FP32 for Gated RMSNorm |
+
+## Supported Model Families
+
+The patching system automatically discovers and patches compatible model families:
+
+- **Qwen3** / **Qwen3-MoE** — Full operator fusion
+- **Qwen3.5** / **Qwen3.5-MoE** — Full fusion + FLA + Gated RMSNorm
+- **Qwen2.5-VL** — Full fusion + multimodal RoPE
+- **Dynamic discovery** — Unknown models are scanned for compatible RMSNorm/RoPE/SwiGLU patterns
+
+## Getting Started
+
+```bash
+# Install NPU dependencies
+pip install torch-npu mindspeed
+
+# Training automatically uses NPU optimizations
+ASCEND_RT_VISIBLE_DEVICES=0,1,2,3 torchrun --nproc_per_node=4 train.py
+```
+
+See the [NPU Support Guide](/docs/guide/npu-support/) for detailed setup instructions.
diff --git a/content/blog/npu-support/index.zh.md b/content/blog/npu-support/index.zh.md
new file mode 100644
index 0000000..ba6bbd4
--- /dev/null
+++ b/content/blog/npu-support/index.zh.md
@@ -0,0 +1,110 @@
+---
+title: "昇腾 NPU 支持：融合算子与 Flash Linear Attention"
+date: 2026-06-05
+tags:
+  - NPU
+  - 昇腾
+  - 国产硬件
+  - Kernel 优化
+  - MoE
+categories:
+  - 技术深度解析
+---
+
+Twinkle 通过全面的 monkey-patching 系统为**华为昇腾 NPU** 提供一等公民级别的支持，自动将标准 CUDA 算子替换为 NPU 优化的融合算子。本文介绍 kernel 架构与各项优化细节。
+
+<!--more-->
+
+## Kernel 架构
+
+Twinkle 的 kernel 模块（`twinkle.kernel`）提供统一入口 `kernelize_model()`，自动检测设备类型并应用对应优化：
+
+```python
+from twinkle.kernel import kernelize_model
+model = kernelize_model(model, device='npu')  # 或自动检测
+```
+
+在 NPU 设备上，以下融合算子会被**无条件应用**：
+
+| 算子 | NPU 实现 | 收益 |
+|------|---------|------|
+| RMSNorm | `torch_npu.npu_rms_norm` | 融合归一化，~2x 加速 |
+| RoPE | `torch_npu.npu_rotary_mul` | 融合旋转嵌入，支持部分 RoPE |
+| SwiGLU | `torch_npu.npu_swiglu` | 融合 gate+up 激活 |
+| SDPA | NPU 兼容的 `scaled_dot_product_attention` | NPU 正确的 mask 处理 |
+| MoE GMM | `torch_npu.npu_grouped_matmul` | EP 感知的分组矩阵乘 |
+| FLA | MindSpeed Triton 后端 | Qwen3.5 Flash Linear Attention |
+
+## 融合算子详解
+
+### 带残差参数化的 RMSNorm
+
+Twinkle 的 `NpuRMSNorm` 在初始化时即检测 Qwen3.5 使用的**残差参数化**模式（`scale = 1.0 + weight`），避免在热路径中执行 CPU 同步的 `Tensor.item()` 调用：
+
+```python
+class NpuRMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        # 初始化时一次性检测
+        self._residual_param = abs(self.weight.data.mean().item()) < 0.3
+
+    def forward(self, hidden_states):
+        scale = (1.0 + self.weight) if self._residual_param else self.weight
+        return torch_npu.npu_rms_norm(hidden_states, scale, epsilon=self.eps)[0]
+```
+
+### EP 感知的 MoE 优化
+
+MoE 分组矩阵乘 patch 是 **EP 感知**的——仅在开启 Expert Parallelism 时激活（每个 rank 持有部分专家，权重小且连续）。未开启 EP 时，每个 rank 持有**所有**专家，转置+连续化拷贝会产生约 8x 的开销：
+
+```
+TWINKLE_NPU_GMM_PATCH 未设置 → 跳过（默认安全）
+TWINKLE_NPU_GMM_PATCH=1 + EP 开启  → 应用（高效）
+TWINKLE_NPU_GMM_PATCH=1 + EP 未开启 → 跳过（避免 8x 开销）
+```
+
+`GmmFunction` 自定义 autograd function 封装了 `torch_npu.npu_grouped_matmul`，支持完整的反向传播。权重缓存通过 `_version` 自动检测是否失效（全参训练时 `_version` 递增，LoRA 模式下保持不变）。
+
+### Qwen3.5 Flash Linear Attention
+
+Qwen3.5 引入了标准注意力与线性注意力层的混合架构。Twinkle 通过 MindSpeed 的 Triton 实现在 NPU 上启用 **FLA 快速路径**（`chunk_gated_delta_rule`）：
+
+1. 强制设置 `is_flash_linear_attention_available = True`
+2. 将 `chunk_gated_delta_rule` 替换为 MindSpeed NPU 兼容实现
+3. 遍历已实例化模型，逐层 patch
+4. 禁用在 NPU 上会失败的 CUDA-only `FusedRMSNormGated`
+
+MindSpeed 实现提供分块 forward/backward（WY 表示），支持通过 `cu_seqlens` 处理变长序列。
+
+## 环境变量控制
+
+每项优化均可独立控制：
+
+| 变量 | 默认值 | 说明 |
+|------|--------|------|
+| `TWINKLE_NPU_PATCH` | `1` | 所有 NPU patch 的主开关 |
+| `TWINKLE_NPU_FUSED_OPS` | `1` | 融合算子（RMSNorm/RoPE/SwiGLU/SDPA） |
+| `TWINKLE_NPU_GMM_PATCH` | 未设置 | MoE 分组矩阵乘（EP 感知） |
+| `TWINKLE_NPU_FLA` | `1` | Flash Linear Attention |
+| `TWINKLE_NPU_GATED_RMSNorm_FP32` | `0` | 强制 Gated RMSNorm 使用 FP32 |
+
+## 支持的模型系列
+
+Patching 系统会自动发现并 patch 兼容的模型系列：
+
+- **Qwen3** / **Qwen3-MoE** — 完整算子融合
+- **Qwen3.5** / **Qwen3.5-MoE** — 完整融合 + FLA + Gated RMSNorm
+- **Qwen2.5-VL** — 完整融合 + 多模态 RoPE
+- **动态发现** — 未知模型会被扫描检测兼容的 RMSNorm/RoPE/SwiGLU 模式
+
+## 快速开始
+
+```bash
+# 安装 NPU 依赖
+pip install torch-npu mindspeed
+
+# 训练时自动启用 NPU 优化
+ASCEND_RT_VISIBLE_DEVICES=0,1,2,3 torchrun --nproc_per_node=4 train.py
+```
+
+更多详情请参阅 [NPU 支持指南](/docs/guide/npu-support/)。
diff --git a/content/blog/open-source-enterprise/index.md b/content/blog/open-source-enterprise/index.md
deleted file mode 100644
index 06b5d4b..0000000
--- a/content/blog/open-source-enterprise/index.md
+++ /dev/null
@@ -1,111 +0,0 @@
----
-title: "Why Open Source Matters: Twinkle vs Closed-Source Training Platforms"
-date: 2026-03-20
-tags:
-  - Open Source
-  - Enterprise
-  - Training
-categories:
-  - Announcements
----
-
-The LLM training infrastructure space has seen rapid growth, with various platforms emerging to help teams fine-tune and train large language models. However, a critical divide exists: **open source vs. closed source**. In this post, we explain why Twinkle chose the open-source path and what it means for enterprise adoption.
-
-<!--more-->
-
-## The Problem with Closed-Source Training Platforms
-
-Closed-source training platforms like Tinker have pioneered important concepts in LLM training infrastructure. However, they come with significant limitations for enterprise users:
-
-### 1. Vendor Lock-in
-
-When your training infrastructure is a black box, you're completely dependent on the vendor's roadmap, pricing decisions, and continued operation. If the vendor pivots, raises prices, or discontinues the service, your entire training pipeline is at risk.
-
-### 2. Limited Customization
-
-Every organization has unique requirements. Closed-source platforms offer configuration options, but when you need to modify core behaviors—like custom loss functions, specialized data pipelines, or integration with internal systems—you hit a wall.
-
-### 3. Security & Compliance Concerns
-
-For enterprises handling sensitive data, running training workloads through third-party closed systems raises serious questions:
-- Where does my data flow?
-- Can I audit the code processing my data?
-- How do I ensure compliance with internal security policies?
-
-### 4. No Community Innovation
-
-Closed platforms evolve based on vendor priorities. The broader community can't contribute improvements, bug fixes, or new features.
-
----
-
-## Twinkle: Open Source, Enterprise-Ready
-
-Twinkle was built from the ground up as an **open-source enterprise training platform**. Here's what that means:
-
-### Full API Compatibility
-
-Twinkle provides a **superset of Tinker APIs**, ensuring backward compatibility. If you've built on Tinker, you can migrate to Twinkle with minimal code changes—while gaining access to more features.
-
-```python
-# Existing Tinker client code works with Twinkle
-from tinker import ServiceClient
-service_client = ServiceClient(
-    base_url="https://your-twinkle-endpoint",  # Just change the endpoint
-    api_key=api_key
-)
-```
-
-### Deploy Anywhere
-
-With Twinkle, you control where your training runs:
-- **On-premise**: Deploy on your own GPU clusters
-- **Private cloud**: Run on your AWS, GCP, or Azure infrastructure
-- **Hybrid**: Mix local and cloud resources
-
-### Transparent & Auditable
-
-Every line of Twinkle's code is open for inspection. Your security team can:
-- Audit data handling paths
-- Verify there are no hidden telemetry
-- Understand exactly how your models are trained
-
-### Enterprise Features, Open Source
-
-Twinkle doesn't gate enterprise features behind paid tiers:
-
-| Feature | Closed Platforms | Twinkle |
-|---------|------------------|---------|
-| Multi-tenancy | Enterprise tier | ✅ Open Source |
-| Custom loss functions | Limited | ✅ Full access |
-| Megatron backend | Varies | ✅ Open Source |
-| On-premise deployment | Extra cost | ✅ Open Source |
-| API compatibility | N/A | ✅ Tinker-compatible |
-
-### Community-Driven Evolution
-
-Twinkle is part of the [ModelScope](https://github.com/modelscope) ecosystem. Contributions from the community drive new features:
-- Bug fixes land faster
-- New model support added by users who need it
-- Best practices shared openly
-
----
-
-## Built by the ms-swift Team
-
-Twinkle isn't a hobby project. It's built by the team behind [ms-swift](https://github.com/modelscope/ms-swift), one of the most popular LLM fine-tuning frameworks. We bring years of production experience to Twinkle's architecture.
-
----
-
-## Get Started
-
-Ready to try an open-source enterprise training platform?
-
-```bash
-pip install twinkle-kit
-```
-
-- [Documentation](https://twinkle-kit.readthedocs.io/)
-- [GitHub Repository](https://github.com/modelscope/twinkle)
-- [Quick Start Guide](../../docs/getting-started/)
-
-The future of LLM training infrastructure is open. Join us.
diff --git a/content/blog/open-source-enterprise/index.zh.md b/content/blog/open-source-enterprise/index.zh.md
deleted file mode 100644
index 267d689..0000000
--- a/content/blog/open-source-enterprise/index.zh.md
+++ /dev/null
@@ -1,111 +0,0 @@
----
-title: "为什么开源很重要：Twinkle vs 闭源训练平台"
-date: 2026-03-20
-tags:
-  - 开源
-  - 企业级
-  - 训练
-categories:
-  - 公告
----
-
-LLM 训练基础设施领域发展迅速，各种平台涌现帮助团队微调和训练大语言模型。然而，一个关键的分歧存在：**开源 vs 闭源**。本文将解释 Twinkle 为何选择开源路线，以及这对企业采用意味着什么。
-
-<!--more-->
-
-## 闭源训练平台的问题
-
-像 Tinker 这样的闭源训练平台在 LLM 训练基础设施方面开创了重要概念。但对于企业用户来说，它们存在显著的局限性：
-
-### 1. 供应商锁定
-
-当你的训练基础设施是一个黑盒时，你完全依赖于供应商的路线图、定价决策和持续运营。如果供应商转型、涨价或停止服务，你的整个训练流程都会面临风险。
-
-### 2. 定制能力有限
-
-每个组织都有独特的需求。闭源平台提供配置选项，但当你需要修改核心行为——如自定义损失函数、专门的数据流水线或与内部系统集成时——你会遇到瓶颈。
-
-### 3. 安全与合规问题
-
-对于处理敏感数据的企业，通过第三方闭源系统运行训练工作负载会引发严重问题：
-- 我的数据流向哪里？
-- 我能审计处理我数据的代码吗？
-- 如何确保符合内部安全策略？
-
-### 4. 没有社区创新
-
-闭源平台基于供应商优先级演进。更广泛的社区无法贡献改进、bug 修复或新功能。
-
----
-
-## Twinkle：开源、企业就绪
-
-Twinkle 从一开始就被构建为**开源企业训练平台**。这意味着：
-
-### 完全的 API 兼容性
-
-Twinkle 提供 **Tinker API 的超集**，确保向后兼容。如果你已经基于 Tinker 构建，可以以最小的代码更改迁移到 Twinkle——同时获得更多功能。
-
-```python
-# 现有的 Tinker 客户端代码可以与 Twinkle 一起使用
-from tinker import ServiceClient
-service_client = ServiceClient(
-    base_url="https://your-twinkle-endpoint",  # 只需更改端点
-    api_key=api_key
-)
-```
-
-### 随处部署
-
-使用 Twinkle，你可以控制训练运行的位置：
-- **本地部署**：在自己的 GPU 集群上部署
-- **私有云**：在 AWS、GCP 或 Azure 基础设施上运行
-- **混合**：混合使用本地和云资源
-
-### 透明且可审计
-
-Twinkle 的每一行代码都可供检查。你的安全团队可以：
-- 审计数据处理路径
-- 验证没有隐藏的遥测
-- 准确了解模型是如何训练的
-
-### 企业功能，开源免费
-
-Twinkle 不会将企业功能锁在付费层级后面：
-
-| 功能 | 闭源平台 | Twinkle |
-|-----|---------|---------|
-| 多租户 | 企业版 | ✅ 开源 |
-| 自定义损失函数 | 有限 | ✅ 完全访问 |
-| Megatron 后端 | 视情况而定 | ✅ 开源 |
-| 本地部署 | 额外收费 | ✅ 开源 |
-| API 兼容性 | N/A | ✅ Tinker 兼容 |
-
-### 社区驱动的演进
-
-Twinkle 是 [ModelScope](https://github.com/modelscope) 生态系统的一部分。来自社区的贡献推动新功能：
-- Bug 修复更快落地
-- 需要的用户添加新模型支持
-- 最佳实践公开分享
-
----
-
-## 由 ms-swift 团队构建
-
-Twinkle 不是业余项目。它由 [ms-swift](https://github.com/modelscope/ms-swift) 背后的团队构建，ms-swift 是最受欢迎的 LLM 微调框架之一。我们将多年的生产经验带入 Twinkle 的架构中。
-
----
-
-## 开始使用
-
-准备好尝试开源企业训练平台了吗？
-
-```bash
-pip install twinkle-kit
-```
-
-- [文档](https://twinkle-kit.readthedocs.io/zh-cn/latest/)
-- [GitHub 仓库](https://github.com/modelscope/twinkle)
-- [快速入门指南](../../docs/getting-started/)
-
-LLM 训练基础设施的未来是开源的。加入我们。
diff --git a/content/blog/openenv/index.md b/content/blog/openenv/index.md
new file mode 100644
index 0000000..e2d356d
--- /dev/null
+++ b/content/blog/openenv/index.md
@@ -0,0 +1,200 @@
+---
+title: "OpenEnv Integration: Connecting External Environments to RL Training"
+date: 2026-05-30
+tags:
+  - OpenEnv
+  - RL Training
+  - Environment
+  - WebSocket
+  - Multi-Turn Rollout
+categories:
+  - Technical Deep Dive
+---
+
+Twinkle's `envs` module bridges the gap between **asynchronous external environments** (code sandboxes, web browsers, game engines) and **synchronous RL training loops**. This post explains the Env abstraction, the EnvTool adapter, and the OpenEnv WebSocket client.
+
+<!--more-->
+
+## The Problem
+
+RL training with tool-calling LLMs requires interactive environments: the model generates a tool call, the environment executes it and returns an observation, and the model generates the next action. But:
+
+- External environments communicate over **WebSocket** (async)
+- Training loops run **synchronously** inside torch distributed
+- Environments may define **different tool schemas** that the LLM needs to understand
+- Rewards may be **sparse** (only at episode end) or **per-step**
+
+Twinkle's `envs` module solves all of these with three layers of abstraction.
+
+## Layer 1: The Env Base Class
+
+```python
+from twinkle_agentic.envs.base import Env, StepResult
+
+@dataclass
+class StepResult:
+    observation: str = ''
+    reward: float = 0.0
+    done: bool = False
+    info: Dict[str, Any] = field(default_factory=dict)
+```
+
+`Env` defines the standard interface with two usage modes:
+
+**Interactive mode** (multi-turn rollout):
+```python
+env.reset(trajectory)
+result = env.step(tool_name, arguments)
+# ... repeat until result.done
+```
+
+**Batch evaluation mode**:
+```python
+rewards = env.evaluate(trajectories)
+```
+
+The `tools()` method returns OpenAI function-call schemas so the LLM knows what actions are available.
+
+## Layer 2: EnvTool Adapter
+
+`EnvTool` wraps any `Env` as a standard `Tool` for Twinkle's `ToolManager`:
+
+```python
+from twinkle_agentic.envs.env_tool import EnvTool
+
+# Wrap an env — creates one tool per env.tools() entry
+tools = EnvTool.from_env(my_env)
+for tool in tools:
+    tool_manager.register(tool)
+```
+
+When the LLM generates a tool call, `EnvTool.__call__` dispatches to `env.step()` and returns the observation string. The caller can inspect:
+- `tool.done` — whether the episode terminated
+- `tool.episode_reward` — cumulative reward from `info['episode_reward']`
+
+This design decouples environment implementation from the rollout engine — any `Env` can be plugged into the existing `MultiTurnRollout` without changes.
+
+## Layer 3: OpenEnv WebSocket Client
+
+`OpenEnv` is the concrete adapter for environments running as remote services:
+
+```python
+from twinkle_agentic.envs.openenv import OpenEnv
+
+env = OpenEnv(
+    base_url='http://localhost:8000',
+    env_cls='coding_env.CodingEnv',  # or None for GenericEnvClient
+    env_kwargs={'message_timeout_s': 30},
+    tool_schema=[...],               # optional tool definitions
+    action_mapper=my_mapper,         # optional action transformation
+)
+```
+
+### Lazy Client Initialization
+
+The WebSocket client is created **lazily** on first `reset()` or `step()` call:
+
+```python
+def _ensure_client(self):
+    if self._sync_client is not None:
+        return
+    client = self._env_cls(base_url=self._base_url, **self._env_kwargs)
+    self._sync_client = client.sync()  # async -> sync wrapper
+    self._sync_client.__enter__()
+```
+
+This means you can create `OpenEnv` instances during setup without establishing connections — useful when environments aren't ready yet.
+
+### Action Mapping
+
+By default, actions are sent as `{'tool_name': ..., 'arguments': ...}`. The optional `action_mapper` transforms LLM tool calls into environment-specific formats:
+
+```python
+def code_action_mapper(tool_name, arguments):
+    if tool_name == 'execute_code':
+        return {'code': arguments['code'], 'language': 'python'}
+    return {'tool_name': tool_name, 'arguments': arguments}
+
+env = OpenEnv(base_url=url, action_mapper=code_action_mapper)
+```
+
+### Observation Extraction
+
+`OpenEnv._format_observation()` handles diverse observation formats:
+- **String** — returned as-is
+- **Dict** — tries common keys (`result`, `output`, `content`, `text`, `message`), falls back to JSON serialization
+- **Typed objects** — tries common attributes, then JSON
+
+### Episode Reward Tracking
+
+Rewards are accumulated per-episode:
+
+```python
+self._episode_reward += reward
+return StepResult(
+    observation=obs,
+    reward=reward,
+    done=done,
+    info={'raw_result': result, 'episode_reward': self._episode_reward},
+)
+```
+
+This enables both per-step reward signals and end-of-episode cumulative rewards.
+
+## Putting It All Together
+
+A typical multi-turn RL training setup:
+
+```python
+from twinkle_agentic.envs.openenv import OpenEnv
+from twinkle_agentic.envs.env_tool import EnvTool
+
+# 1. Create environment
+env = OpenEnv(
+    base_url='http://sandbox:8000',
+    tool_schema=[
+        {'type': 'function', 'function': {
+            'name': 'execute_code',
+            'description': 'Run Python code in sandbox',
+            'parameters': {'type': 'object', 'properties': {
+                'code': {'type': 'string'}
+            }}
+        }}
+    ],
+)
+
+# 2. Wrap as tools
+tools = EnvTool.from_env(env)
+
+# 3. Register with ToolManager
+for tool in tools:
+    tool_manager.register(tool)
+
+# 4. Use in multi-turn rollout
+env.reset()
+while True:
+    action = model.generate(observation)  # LLM generates tool call
+    result = env.step(action.tool_name, action.arguments)
+    if result.done:
+        break
+
+# 5. Cleanup
+env.close()
+```
+
+## Supported Environment Types
+
+The `env_cls` parameter supports:
+- `None` — uses `GenericEnvClient` (works with any dict-based environment)
+- `'module.ClassName'` — dynamically imports a typed client class
+- Class object — uses the class directly
+
+The dynamic import system includes fallback logic for broken sub-imports, making it robust against partial OpenEnv installations.
+
+## Key Design Principles
+
+1. **Synchronous interface** — RL training loops don't need to manage async/await
+2. **Lazy connections** — environments created at config time, connected at runtime
+3. **Schema transparency** — LLM sees standard OpenAI function-call format
+4. **Reward flexibility** — per-step, sparse, or custom aggregation
+5. **Zero coupling** — `Env` implementations know nothing about Twinkle's training infrastructure
diff --git a/content/blog/openenv/index.zh.md b/content/blog/openenv/index.zh.md
new file mode 100644
index 0000000..02f6556
--- /dev/null
+++ b/content/blog/openenv/index.zh.md
@@ -0,0 +1,200 @@
+---
+title: "OpenEnv 集成：连接外部环境到 RL 训练"
+date: 2026-05-30
+tags:
+  - OpenEnv
+  - RL 训练
+  - 环境
+  - WebSocket
+  - 多轮 Rollout
+categories:
+  - 技术深度解析
+---
+
+Twinkle 的 `envs` 模块在**异步外部环境**（代码沙箱、浏览器、游戏引擎）和**同步 RL 训练循环**之间架起桥梁。本文介绍 Env 抽象、EnvTool 适配器以及 OpenEnv WebSocket 客户端的设计。
+
+<!--more-->
+
+## 问题背景
+
+使用工具调用的 LLM 进行 RL 训练需要交互式环境：模型生成工具调用，环境执行并返回观测，模型生成下一个动作。但是：
+
+- 外部环境通过 **WebSocket**（异步）通信
+- 训练循环在 torch distributed 中**同步**运行
+- 环境可能定义**不同的工具 schema**，LLM 需要理解
+- 奖励可能是**稀疏的**（仅在 episode 结束时）或**逐步的**
+
+Twinkle 的 `envs` 模块通过三层抽象解决所有这些问题。
+
+## 第一层：Env 基类
+
+```python
+from twinkle_agentic.envs.base import Env, StepResult
+
+@dataclass
+class StepResult:
+    observation: str = ''
+    reward: float = 0.0
+    done: bool = False
+    info: Dict[str, Any] = field(default_factory=dict)
+```
+
+`Env` 定义了标准接口，支持两种使用模式：
+
+**交互模式**（多轮 rollout）：
+```python
+env.reset(trajectory)
+result = env.step(tool_name, arguments)
+# ... 重复直到 result.done
+```
+
+**批量评估模式**：
+```python
+rewards = env.evaluate(trajectories)
+```
+
+`tools()` 方法返回 OpenAI function-call schema，让 LLM 知道有哪些可用动作。
+
+## 第二层：EnvTool 适配器
+
+`EnvTool` 将任何 `Env` 封装为标准 `Tool`，可注册到 Twinkle 的 `ToolManager`：
+
+```python
+from twinkle_agentic.envs.env_tool import EnvTool
+
+# 封装环境 — 为 env.tools() 的每个条目创建一个 tool
+tools = EnvTool.from_env(my_env)
+for tool in tools:
+    tool_manager.register(tool)
+```
+
+当 LLM 生成工具调用时，`EnvTool.__call__` 分发到 `env.step()` 并返回观测字符串。调用方可检查：
+- `tool.done` — episode 是否已结束
+- `tool.episode_reward` — 从 `info['episode_reward']` 获取的累积奖励
+
+这种设计将环境实现与 rollout 引擎解耦——任何 `Env` 无需修改即可接入现有的 `MultiTurnRollout`。
+
+## 第三层：OpenEnv WebSocket 客户端
+
+`OpenEnv` 是面向远程服务环境的具体适配器：
+
+```python
+from twinkle_agentic.envs.openenv import OpenEnv
+
+env = OpenEnv(
+    base_url='http://localhost:8000',
+    env_cls='coding_env.CodingEnv',  # 或 None 使用 GenericEnvClient
+    env_kwargs={'message_timeout_s': 30},
+    tool_schema=[...],               # 可选的工具定义
+    action_mapper=my_mapper,         # 可选的动作转换
+)
+```
+
+### 懒初始化
+
+WebSocket 客户端在首次 `reset()` 或 `step()` 调用时**懒创建**：
+
+```python
+def _ensure_client(self):
+    if self._sync_client is not None:
+        return
+    client = self._env_cls(base_url=self._base_url, **self._env_kwargs)
+    self._sync_client = client.sync()  # async -> sync 封装
+    self._sync_client.__enter__()
+```
+
+这意味着可以在配置阶段创建 `OpenEnv` 实例而不建立连接——在环境尚未就绪时非常有用。
+
+### 动作映射
+
+默认情况下，动作以 `{'tool_name': ..., 'arguments': ...}` 格式发送。可选的 `action_mapper` 将 LLM 工具调用转换为环境特定格式：
+
+```python
+def code_action_mapper(tool_name, arguments):
+    if tool_name == 'execute_code':
+        return {'code': arguments['code'], 'language': 'python'}
+    return {'tool_name': tool_name, 'arguments': arguments}
+
+env = OpenEnv(base_url=url, action_mapper=code_action_mapper)
+```
+
+### 观测提取
+
+`OpenEnv._format_observation()` 处理多种观测格式：
+- **字符串** — 直接返回
+- **字典** — 尝试常用键（`result`、`output`、`content`、`text`、`message`），回退到 JSON 序列化
+- **类型化对象** — 尝试常用属性，然后 JSON
+
+### Episode 奖励追踪
+
+奖励按 episode 累积：
+
+```python
+self._episode_reward += reward
+return StepResult(
+    observation=obs,
+    reward=reward,
+    done=done,
+    info={'raw_result': result, 'episode_reward': self._episode_reward},
+)
+```
+
+这同时支持逐步奖励信号和 episode 结束时的累积奖励。
+
+## 完整使用示例
+
+典型的多轮 RL 训练配置：
+
+```python
+from twinkle_agentic.envs.openenv import OpenEnv
+from twinkle_agentic.envs.env_tool import EnvTool
+
+# 1. 创建环境
+env = OpenEnv(
+    base_url='http://sandbox:8000',
+    tool_schema=[
+        {'type': 'function', 'function': {
+            'name': 'execute_code',
+            'description': '在沙箱中运行 Python 代码',
+            'parameters': {'type': 'object', 'properties': {
+                'code': {'type': 'string'}
+            }}
+        }}
+    ],
+)
+
+# 2. 封装为工具
+tools = EnvTool.from_env(env)
+
+# 3. 注册到 ToolManager
+for tool in tools:
+    tool_manager.register(tool)
+
+# 4. 在多轮 rollout 中使用
+env.reset()
+while True:
+    action = model.generate(observation)  # LLM 生成工具调用
+    result = env.step(action.tool_name, action.arguments)
+    if result.done:
+        break
+
+# 5. 清理
+env.close()
+```
+
+## 支持的环境类型
+
+`env_cls` 参数支持：
+- `None` — 使用 `GenericEnvClient`（适用于任何基于 dict 的环境）
+- `'module.ClassName'` — 动态导入类型化客户端类
+- 类对象 — 直接使用
+
+动态导入系统包含对损坏子导入的回退逻辑，对部分安装的 OpenEnv 具有鲁棒性。
+
+## 核心设计原则
+
+1. **同步接口** — RL 训练循环无需管理 async/await
+2. **懒连接** — 配置时创建环境，运行时建立连接
+3. **Schema 透明** — LLM 看到标准 OpenAI function-call 格式
+4. **奖励灵活性** — 支持逐步、稀疏或自定义聚合
+5. **零耦合** — `Env` 实现不需要了解 Twinkle 的训练基础设施
diff --git a/content/blog/sequence-parallel/index.md b/content/blog/sequence-parallel/index.md
new file mode 100644
index 0000000..0f156d4
--- /dev/null
+++ b/content/blog/sequence-parallel/index.md
@@ -0,0 +1,239 @@
+---
+title: "Sequence Parallel & Ring Attention: Training with Ultra-Long Contexts"
+date: 2026-06-22
+tags:
+  - Sequence Parallel
+  - Ring Attention
+  - Long Context
+  - Distributed Training
+  - FlashAttention
+categories:
+  - Technical Deep Dive
+---
+
+Modern LLMs demand ever-longer context windows — 128K, 256K, even 1M tokens. A single GPU cannot hold such long sequences in memory. Twinkle's **Sequence Parallel** module solves this by splitting the sequence dimension across multiple devices, combining **Ulysses-style All-to-All** parallelism with **ZigZag Ring Attention** to achieve near-linear scaling.
+
+<!--more-->
+
+## Why Sequence Parallel?
+
+Standard data parallelism replicates the full sequence on every device. If a 128K-token input lands on a GPU whose memory can only hold the KV cache and attention matrices for roughly 8K tokens, the sequence simply does not fit. Sequence parallel (SP) partitions the sequence across devices so each GPU only processes a shard.
+
+| Challenge | Solution in Twinkle |
+|---|---|
+| Attention needs full KV context | Ulysses All-to-All or Ring communication |
+| Causal masking with split sequences | ZigZag interleaving preserves causality |
+| Variable-length packed batches | `cu_seqlens`-based varlen FlashAttention |
+| MoE auxiliary loss needs global view | Post-forward router logit gathering |
+
+## Architecture Overview
+
+Twinkle's SP module lives at `src/twinkle/model/transformers/strategy/sequence_parallel/` and is composed of three layers:
+
+```
+┌─────────────────────────────────────────────────────────┐
+│            SequenceParallelStrategy (API)                │
+│   • initialize() • preprocess_inputs() • postprocess()  │
+├─────────────────────────────────────────────────────────┤
+│               SequenceParallel (Core)                    │
+│   • pad / split / gather   • DeviceMesh group init      │
+│   • Flash Attention hook   • Forward pre-hook (pad+split)│
+├───────────────────────┬─────────────────────────────────┤
+│  Ulysses (All-to-All) │  ZigZag Ring Attention          │
+│  _SeqAllToAll         │  RingComm P2P send/recv         │
+│  DistributedAttention │  zigzag_ring_flash_attn_varlen  │
+└───────────────────────┴─────────────────────────────────┘
+```
+
+## Two Parallelism Strategies
+
+### 1. Ulysses (All-to-All)
+
+Ulysses parallelism exploits the head dimension. Before attention, each GPU holds a **local sequence shard** with **all heads**. An All-to-All transpose converts this so that each GPU holds the **full sequence** for a **subset of heads** — enabling standard attention computation on those heads, after which a reverse All-to-All restores the sequence-sharded layout.
+
+```python
+# Scatter along head dim, gather along seq dim
+query_layer = _SeqAllToAll.apply(sp_group, query, scatter_idx=2, gather_idx=1)
+key_layer   = _SeqAllToAll.apply(sp_group, key,   scatter_idx=2, gather_idx=1)
+value_layer = _SeqAllToAll.apply(sp_group, value, scatter_idx=2, gather_idx=1)
+
+# Local attention on full seq, subset of heads
+context = local_flash_attention(query_layer, key_layer, value_layer)
+
+# Reverse: scatter along seq, gather along head
+output = _SeqAllToAll.apply(sp_group, context, gather_idx=2, scatter_idx=1)
+```
+
+**Constraint**: `num_kv_heads` must be divisible by `sp_world_size`.
+
+### 2. ZigZag Ring Attention
+
+When KV heads are fewer than the SP size (e.g., GQA with 8 KV heads but 16 GPUs), Twinkle automatically derives a **ring attention** group. Ring attention passes KV blocks between GPUs in a ring topology — no global All-to-All needed.
+
+The **ZigZag** pattern is key: instead of naive sequential splitting, each GPU holds two non-contiguous chunks — the i-th from the front and the i-th from the back:
+
+```
+Sequence:  [chunk_0 | chunk_1 | chunk_2 | chunk_3 | chunk_4 | chunk_5 | chunk_6 | chunk_7]
+
+GPU 0:     [chunk_0, chunk_7]   (front-0 + back-0)
+GPU 1:     [chunk_1, chunk_6]   (front-1 + back-1)
+GPU 2:     [chunk_2, chunk_5]   (front-2 + back-2)
+GPU 3:     [chunk_3, chunk_4]   (front-3 + back-3)
+```
+
+This ensures **load balance** for causal attention — each GPU computes roughly the same number of attention pairs, avoiding the idle-GPU problem of naive splits.
+
+### Hybrid: Ulysses + Ring
+
+When `seq_world_size > num_kv_heads`, Twinkle automatically computes:
+
+```python
+sp_world_size = gcd(num_kv_heads, seq_world_size)   # Ulysses group size
+rp_world_size = seq_world_size // sp_world_size      # Ring group size
+```
+
+This creates a two-level hierarchy: Ulysses All-to-All within sub-groups, Ring P2P across sub-groups.
+
+## Ring Communication: The `RingComm` Class
+
+The core P2P communication is handled by `RingComm`:
+
+```python
+class RingComm:
+    def __init__(self, process_group):
+        self.send_rank = (self.rank + 1) % self.world_size
+        self.recv_rank = (self.rank - 1) % self.world_size
+
+    def send_recv_kv(self, k, v, k_buffer=None, v_buffer=None):
+        """Asynchronously send KV to next rank, receive from previous."""
+        next_k = self.send_recv(k, k_buffer)
+        next_v = self.send_recv(v, v_buffer)
+        self.commit()  # batch_isend_irecv
+        return next_k, next_v
+```
+
+Each ring step:
+1. **Send** current KV to the next GPU
+2. **Receive** KV from the previous GPU
+3. **Compute** local attention block with received KV
+4. **Accumulate** output using log-sum-exp correction
+
+## Forward Pass: ZigZag Ring FlashAttention
+
+The forward iterates over `world_size` ring steps:
+
+```python
+for step in range(comm.world_size):
+    if step + 1 != comm.world_size:
+        next_k, next_v = comm.send_recv_kv(k, v)  # async
+
+    if step == 0:
+        # Self-attention (causal)
+        block_out, block_lse = flash_attn_varlen(q, k, v, causal=True)
+    elif step <= comm.rank:
+        # Full cross-attention with front-half of received KV
+        block_out, block_lse = flash_attn_varlen(q, k[front], v[front], causal=False)
+    else:
+        # Only back-half of Q attends to full received KV
+        block_out, block_lse = flash_attn_varlen(q[back], k, v, causal=False)
+
+    # Online softmax correction (numerically stable)
+    out, lse = update_out_and_lse(out, lse, block_out, block_lse)
+
+    comm.wait()  # sync communication
+    k, v = next_k, next_v
+```
+
+The `update_out_and_lse` function uses the **online softmax trick** — it incrementally merges attention outputs from different KV blocks using their log-sum-exp values:
+
+```python
+def update_out_and_lse(out, lse, block_out, block_lse):
+    diff = block_lse - lse
+    sig_diff = torch.sigmoid(diff)
+    out = out - sig_diff * (out - block_out)
+    lse = lse - F.logsigmoid(lse - block_lse)
+    return out, lse
+```
+
+## The Pad → Split → Compute → Gather Pipeline
+
+Before the model forward, a pre-hook automatically handles the SP lifecycle:
+
+```
+Input [B, S, D]
+    │
+    ▼
+Pad to multiple of (sp_size × rp_size × 2)
+    │
+    ▼
+Split along seq dim (ZigZag for ring, chunk for Ulysses)
+    │
+    ▼
+Model Forward (each GPU sees [B, S/(sp_size × rp_size), D])
+    │
+    ▼
+Gather logits / loss across SP group
+    │
+    ▼
+Trim padding → Output [B, S, V]
+```
+
+Key implementation detail: **padding uses `position_ids = -1`** to mark invalid tokens. The attention mask automatically excludes these positions.
+
+## Usage in Twinkle
+
+Enable sequence parallel with a single config parameter:
+
+```python
+# In your training config YAML:
+sequence_parallel_size: 4   # Split across 4 GPUs
+
+# Or programmatically:
+from twinkle.model.transformers.strategy.sequence_parallel import SequenceParallelStrategy
+
+strategy = SequenceParallelStrategy(
+    device_mesh=mesh,
+    sp_config={'ulysses_size': 4, 'gather_logits': True},
+    model=model,
+    tokenizer_id='Qwen/Qwen2.5-7B',
+)
+strategy.initialize()
+```
+
+The framework handles:
+- Automatic `sp_world_size` / `rp_world_size` derivation from `num_kv_heads`
+- FlashAttention2 and SDPA backend support (ring requires FA2)
+- Variable-length packed batches (`padding_free` mode)
+- MoE router logit gathering for correct auxiliary loss
+- Qwen3.5 linear attention (GatedDeltaNet) SP support
+
+## Performance Characteristics
+
+| Configuration | Communication | Memory per GPU | Best Use Case |
+|---|---|---|---|
+| Pure Ulysses (sp=4, rp=1) | All-to-All (needs high bandwidth) | S/4 per GPU | High KV-head models (≥ sp_size heads) |
+| Pure Ring (sp=1, rp=4) | P2P ring (modest bandwidth needs) | S/4 per GPU | GQA models with few KV heads |
+| Hybrid (sp=2, rp=2) | All-to-All + P2P | S/4 per GPU | Balanced models |
+
+**Key insight**: Ulysses requires high all-to-all bandwidth (best within NVLink domains), while Ring only needs point-to-point (works across nodes). Twinkle's automatic derivation picks the optimal split.
+
+## Backward Pass
+
+The backward pass recomputes attention block-by-block (to save memory) and uses the same ring communication pattern. Gradients for dQ accumulate locally, while dK/dV are communicated in reverse ring direction:
+
+```python
+# Forward ring: rank → rank+1
+# dK/dV ring: rank → rank-1 (reverse direction)
+next_dk, next_dv = d_kv_comm.send_recv_kv(dk, dv)
+```
+
+## Summary
+
+Twinkle's sequence parallel module provides:
+
+1. **Transparent integration** — a single `sequence_parallel_size` config enables SP with no code changes
+2. **Automatic strategy selection** — Ulysses vs Ring vs Hybrid based on model architecture
+3. **Production-ready** — supports packed batches, MoE, multimodal models (Qwen-VL), and linear attention (Qwen3.5)
+4. **Numerically correct** — online softmax correction yields results mathematically equivalent to single-GPU attention (up to floating-point rounding)
+
+For ultra-long context training (128K+ tokens), sequence parallel is the key enabler — scaling the context window linearly with the number of GPUs.
diff --git a/content/blog/sequence-parallel/index.zh.md b/content/blog/sequence-parallel/index.zh.md
new file mode 100644
index 0000000..a00d798
--- /dev/null
+++ b/content/blog/sequence-parallel/index.zh.md
@@ -0,0 +1,239 @@
+---
+title: "序列并行与 Ring Attention：超长上下文训练技术解析"
+date: 2026-06-22
+tags:
+  - Sequence Parallel
+  - Ring Attention
+  - Long Context
+  - 分布式训练
+  - FlashAttention
+categories:
+  - 技术深度解析
+---
+
+现代大语言模型对上下文窗口的需求不断增长——128K、256K 甚至 1M tokens。单卡 GPU 无法容纳如此长的序列。Twinkle 的 **Sequence Parallel** 模块通过在多设备间切分序列维度来解决这一问题，结合 **Ulysses All-to-All** 并行与 **ZigZag Ring Attention**，实现近线性扩展。
+
+<!--more-->
+
+## 为什么需要序列并行？
+
+标准数据并行在每个设备上复制完整序列。对于 128K token 输入，如果单卡只能容纳 8K 的 KV cache 和注意力矩阵，根本无法放下。序列并行（SP）将序列切分到多设备上，每张 GPU 只处理一个分片。
+
+| 挑战 | Twinkle 的解决方案 |
+|---|---|
+| Attention 需要完整 KV 上下文 | Ulysses All-to-All 或 Ring 通信 |
+| 切分后的因果掩码 | ZigZag 交错排布保持因果性 |
+| 变长 packed 批次 | 基于 `cu_seqlens` 的 varlen FlashAttention |
+| MoE 辅助 loss 需要全局视图 | 前向后聚合 router logits |
+
+## 架构概览
+
+Twinkle 的 SP 模块位于 `src/twinkle/model/transformers/strategy/sequence_parallel/`，由三层组成：
+
+```
+┌─────────────────────────────────────────────────────────┐
+│          SequenceParallelStrategy（API 层）              │
+│   • initialize() • preprocess_inputs() • postprocess()  │
+├─────────────────────────────────────────────────────────┤
+│             SequenceParallel（核心逻辑）                  │
+│   • pad / split / gather   • DeviceMesh 进程组初始化     │
+│   • Flash Attention hook   • Forward pre-hook（pad+split）│
+├───────────────────────┬─────────────────────────────────┤
+│  Ulysses (All-to-All) │  ZigZag Ring Attention          │
+│  _SeqAllToAll         │  RingComm P2P 收发              │
+│  DistributedAttention │  zigzag_ring_flash_attn_varlen  │
+└───────────────────────┴─────────────────────────────────┘
+```
+
+## 两种并行策略
+
+### 1. Ulysses（All-to-All）
+
+Ulysses 并行利用注意力头维度。在 attention 前，每张 GPU 持有**所有 head** 的局部序列分片。通过 All-to-All 转置，变为每张 GPU 持有完整序列长度的**本地 head 分片**——从而可以执行标准 attention 计算。
+
+```python
+# 沿 head 维度 scatter，沿 seq 维度 gather
+query_layer = _SeqAllToAll.apply(sp_group, query, scatter_idx=2, gather_idx=1)
+key_layer   = _SeqAllToAll.apply(sp_group, key,   scatter_idx=2, gather_idx=1)
+value_layer = _SeqAllToAll.apply(sp_group, value, scatter_idx=2, gather_idx=1)
+
+# 本地 attention：完整序列，head 子集
+context = local_flash_attention(query_layer, key_layer, value_layer)
+
+# 反向：沿 seq scatter，沿 head gather
+output = _SeqAllToAll.apply(sp_group, context, gather_idx=2, scatter_idx=1)
+```
+
+**约束**：`num_kv_heads` 必须能被 `sp_world_size` 整除。
+
+### 2. ZigZag Ring Attention
+
+当 KV head 数少于 SP 大小时（例如 GQA 有 8 个 KV head 但 16 张 GPU），Twinkle 自动派生 **ring attention** 组。Ring attention 在 GPU 之间以环形拓扑传递 KV block——无需全局 All-to-All。
+
+**ZigZag** 模式是关键：不是朴素的顺序切分，每张 GPU 持有两个不连续的块——从前面取第 i 块和从后面取第 i 块：
+
+```
+序列:  [chunk_0 | chunk_1 | chunk_2 | chunk_3 | chunk_4 | chunk_5 | chunk_6 | chunk_7]
+
+GPU 0: [chunk_0, chunk_7]   (前-0 + 后-0)
+GPU 1: [chunk_1, chunk_6]   (前-1 + 后-1)
+GPU 2: [chunk_2, chunk_5]   (前-2 + 后-2)
+GPU 3: [chunk_3, chunk_4]   (前-3 + 后-3)
+```
+
+这确保了因果 attention 的**负载均衡**——每张 GPU 计算大致相同数量的注意力对，避免朴素切分导致的 GPU 空闲问题。
+
+### 混合模式：Ulysses + Ring
+
+当 `seq_world_size > num_kv_heads` 时，Twinkle 自动计算：
+
+```python
+sp_world_size = gcd(num_kv_heads, seq_world_size)   # Ulysses 组大小
+rp_world_size = seq_world_size // sp_world_size      # Ring 组大小
+```
+
+形成两级层次：子组内部走 Ulysses All-to-All，子组之间走 Ring P2P。
+
+## Ring 通信：`RingComm` 类
+
+核心 P2P 通信由 `RingComm` 处理：
+
+```python
+class RingComm:
+    def __init__(self, process_group):
+        self.send_rank = (self.rank + 1) % self.world_size
+        self.recv_rank = (self.rank - 1) % self.world_size
+
+    def send_recv_kv(self, k, v, k_buffer=None, v_buffer=None):
+        """异步发送 KV 到下一个 rank，从上一个 rank 接收。"""
+        next_k = self.send_recv(k, k_buffer)
+        next_v = self.send_recv(v, v_buffer)
+        self.commit()  # batch_isend_irecv
+        return next_k, next_v
+```
+
+每个 ring step：
+1. **发送**当前 KV 到下一张 GPU
+2. **接收**上一张 GPU 的 KV
+3. **计算**本地 attention block
+4. **累积**输出（使用 log-sum-exp 校正）
+
+## 前向传播：ZigZag Ring FlashAttention
+
+前向过程迭代 `world_size` 个 ring step：
+
+```python
+for step in range(comm.world_size):
+    if step + 1 != comm.world_size:
+        next_k, next_v = comm.send_recv_kv(k, v)  # 异步
+
+    if step == 0:
+        # Self-attention（因果）
+        block_out, block_lse = flash_attn_varlen(q, k, v, causal=True)
+    elif step <= comm.rank:
+        # 与接收 KV 的前半部分做完整 cross-attention
+        block_out, block_lse = flash_attn_varlen(q, k[front], v[front], causal=False)
+    else:
+        # 只有 Q 的后半部分与完整接收 KV 做 attention
+        block_out, block_lse = flash_attn_varlen(q[back], k, v, causal=False)
+
+    # 在线 softmax 校正（数值稳定）
+    out, lse = update_out_and_lse(out, lse, block_out, block_lse)
+
+    comm.wait()  # 同步通信
+    k, v = next_k, next_v
+```
+
+`update_out_and_lse` 函数使用 **在线 softmax 技巧**——利用 log-sum-exp 值增量合并来自不同 KV block 的注意力输出：
+
+```python
+def update_out_and_lse(out, lse, block_out, block_lse):
+    diff = block_lse - lse
+    sig_diff = torch.sigmoid(diff)
+    out = out - sig_diff * (out - block_out)
+    lse = lse - F.logsigmoid(lse - block_lse)
+    return out, lse
+```
+
+## Pad → Split → Compute → Gather 流水线
+
+模型前向前，一个 pre-hook 自动处理 SP 生命周期：
+
+```
+输入 [B, S, D]
+    │
+    ▼
+Pad 到 (sp_size × rp_size × 2) 的倍数
+    │
+    ▼
+沿序列维度 Split（Ring 用 ZigZag，Ulysses 用 chunk）
+    │
+    ▼
+模型前向（每张 GPU 看到 [B, S/sp_size, D]）
+    │
+    ▼
+跨 SP 组 Gather logits / loss
+    │
+    ▼
+裁剪 padding → 输出 [B, S, V]
+```
+
+关键实现细节：**padding 使用 `position_ids = -1`** 标记无效 token，注意力掩码自动排除这些位置。
+
+## 在 Twinkle 中使用
+
+通过一个配置参数即可启用序列并行：
+
+```python
+# 训练配置 YAML：
+sequence_parallel_size: 4   # 在 4 张 GPU 间切分
+
+# 或编程方式：
+from twinkle.model.transformers.strategy.sequence_parallel import SequenceParallelStrategy
+
+strategy = SequenceParallelStrategy(
+    device_mesh=mesh,
+    sp_config={'ulysses_size': 4, 'gather_logits': True},
+    model=model,
+    tokenizer_id='Qwen/Qwen2.5-7B',
+)
+strategy.initialize()
+```
+
+框架自动处理：
+- 根据 `num_kv_heads` 自动推导 `sp_world_size` / `rp_world_size`
+- 支持 FlashAttention2 和 SDPA 后端（ring 要求 FA2）
+- 变长 packed 批次（`padding_free` 模式）
+- MoE router logit 聚合以计算正确的辅助 loss
+- Qwen3.5 线性注意力（GatedDeltaNet）SP 支持
+
+## 性能特性
+
+| 配置 | 通信模式 | 每卡显存 | 最佳场景 |
+|---|---|---|---|
+| 纯 Ulysses (sp=4, rp=1) | All-to-All（高带宽） | S/4 | KV head 多的模型（≥ sp_size 个 head） |
+| 纯 Ring (sp=1, rp=4) | P2P ring（低带宽） | S/4 | GQA 少量 KV head 的模型 |
+| 混合 (sp=2, rp=2) | All-to-All + P2P | S/4 | 均衡型模型 |
+
+**核心洞察**：Ulysses 需要高 All-to-All 带宽（最适合 NVLink 域内），而 Ring 只需点对点通信（可跨节点）。Twinkle 的自动推导会选择最优切分方案。
+
+## 反向传播
+
+反向过程逐 block 重计算 attention（节省显存），使用相同的 ring 通信模式。dQ 在本地累积，dK/dV 沿反向环形方向通信：
+
+```python
+# 前向 ring：rank → rank+1
+# dK/dV ring：rank → rank-1（反向）
+next_dk, next_dv = d_kv_comm.send_recv_kv(dk, dv)
+```
+
+## 总结
+
+Twinkle 的序列并行模块提供：
+
+1. **透明集成** —— 一个 `sequence_parallel_size` 配置即可启用 SP，无需修改代码
+2. **自动策略选择** —— 根据模型架构自动选择 Ulysses / Ring / 混合模式
+3. **生产就绪** —— 支持 packed 批次、MoE、多模态模型（Qwen-VL）和线性注意力（Qwen3.5）
+4. **数值正确** —— 在线 softmax 校正确保与单卡 attention 结果一致
+
+对于超长上下文训练（128K+ tokens），序列并行是关键使能技术——上下文窗口随 GPU 数量线性扩展。
diff --git a/content/blog/torchrun-ray/index.md b/content/blog/torchrun-ray/index.md
new file mode 100644
index 0000000..6ee5d9d
--- /dev/null
+++ b/content/blog/torchrun-ray/index.md
@@ -0,0 +1,174 @@
+---
+title: "Two Execution Modes: torchrun (Local) vs Ray (Distributed)"
+date: 2026-06-03
+tags:
+  - Infrastructure
+  - Ray
+  - torchrun
+  - Distributed Training
+  - Multi-Node
+categories:
+  - Technical Deep Dive
+---
+
+Twinkle's `infra` module provides a unified programming model that runs seamlessly in two modes: **local** (single-node via torchrun) and **ray** (multi-node via Ray cluster). This post explains the architecture, the decorator-based API, and when to use each mode.
+
+<!--more-->
+
+## The Two Modes at a Glance
+
+| | Local (torchrun) | Ray (Distributed) |
+|---|---|---|
+| Launch | `torchrun --nproc_per_node=N` | `ray start` + driver script |
+| Scope | Single node, shared filesystem | Multi-node cluster |
+| Process model | One process per GPU, torch.distributed | Ray actors with PlacementGroups |
+| Best for | Quick experiments, single-machine training | Production multi-node, heterogeneous resources |
+
+Both modes share the **same user code** — switching requires only changing the `twinkle.infra.initialize()` call: the `mode` parameter, plus explicit `DeviceGroup` definitions for Ray mode.
+
+## Initialization
+
+```python
+import twinkle.infra as infra
+
+# Local mode — auto-detects ranks and devices from torchrun env vars
+infra.initialize(mode='local', seed=42)
+
+# Ray mode — requires explicit DeviceGroup definitions
+infra.initialize(
+    mode='ray',
+    nproc_per_node=8,
+    groups=[
+        DeviceGroup(name='model', ranks=list(range(4)), device_type='cuda'),
+        DeviceGroup(name='sampler', ranks=list(range(4, 8)), device_type='cuda'),
+    ],
+    seed=42,
+)
+```
+
+In **local mode**, Twinkle reads `WORLD_SIZE`, `RANK`, and `LOCAL_RANK` from the environment (set by torchrun) and creates a single default `DeviceGroup` spanning all GPUs. A `DeviceMesh` is auto-constructed with a `dp` dimension.
+
+In **ray mode**, `RayHelper.initialize()` creates a `ResourceManager` that:
+1. Queries Ray cluster nodes for available GPUs/NPUs
+2. Creates `PlacementGroup` bundles — one per node — to guarantee co-located resources
+3. Maps each logical rank to a physical GPU via `visible_devices` discovery
+
+## The Decorator API
+
+Twinkle's key abstraction is two decorators that make any class distributed-transparent:
+
+### `@remote_class`
+
+Wraps a class so that instantiating it either runs `__init__` locally or creates Ray actors:
+
+```python
+@infra.remote_class(execute='all')
+class MyModel:
+    def __init__(self, device_mesh: DeviceMesh):
+        self.model = load_model()
+        ...
+```
+
+In local mode, `__init__` runs normally. In Ray mode, `RayHelper.create_workers()` spawns one Ray actor per GPU rank in the specified `DeviceGroup`, each with:
+- Isolated `CUDA_VISIBLE_DEVICES` pointing to its assigned physical GPU
+- `MASTER_ADDR` / `MASTER_PORT` for torch.distributed init
+- Proper `WORLD_SIZE` / `RANK` environment variables
+
+### `@remote_function`
+
+Wraps methods with dispatch, execution, and collection semantics:
+
+```python
+@infra.remote_function(dispatch='slice_dp', collect='mean')
+def train_step(self, batch):
+    loss = self.model(batch)
+    return {'loss': loss.item()}
+```
+
+Three knobs control distributed behavior:
+
+**dispatch** — how arguments are split across workers:
+- `'all'`: Every worker receives the same arguments
+- `'slice'`: Arguments are evenly partitioned across workers
+- `'slice_dp'`: Arguments are partitioned along the data-parallel dimension of the DeviceMesh (EP-aware)
+
+**execute** — which workers run:
+- `'all'`: All workers (default)
+- `'first'`: Only the first worker
+- `'peer'`: Only peer workers (for inter-group communication)
+
+**collect** — how results are aggregated:
+- `'none'`: Return raw list of results
+- `'mean'` / `'sum'`: Reduce numerically
+- `'first'`: Return first worker's result
+- `'last_pp'`: Return results from the last pipeline-parallel stage
+- `Callable`: Custom aggregation function
+
+## LazyCollect: Deferred Result Aggregation
+
+A key optimization in Ray mode is **LazyCollect**. Instead of blocking on `ray.get()` immediately after each remote call, Twinkle wraps the results in a `LazyCollect` callable:
+
+```python
+result = model.train_step(batch)   # returns LazyCollect (non-blocking)
+# ... do other work ...
+actual_result = result()           # blocks only when value is needed
+```
+
+This enables overlapping computation and communication — the driver can dispatch work to multiple groups (model, sampler, processor) and only block when results are actually consumed.
+
+LazyCollect also supports `__iter__` and `__len__`, making it transparent to most consumer code.
+
+## ResourceManager: GPU Allocation
+
+The `ResourceManager` handles the complexity of GPU-to-node mapping:
+
+1. **Node discovery** — Queries Ray for all live nodes and their GPU counts
+2. **PlacementGroup creation** — Creates one PG per node with `{GPU: N, CPU: node_cpu//2}` bundles
+3. **GPU mapping** — Discovers actual `CUDA_VISIBLE_DEVICES` on each node to correctly map logical ranks to physical GPUs
+4. **Multi-accelerator support** — Works with GPU, NPU, and other accelerators via `Platform` abstraction. Uses `RAY_EXPERIMENTAL_NOSET_*` env vars to prevent Ray from overriding device visibility
+5. **CPU worker support** — Separate PlacementGroups for CPU-only processes (data processors)
+
+## Device Placement Visualization
+
+Twinkle provides `get_device_placement()` to render the training topology:
+
+```
+╔══════════════════════════════════════════════════════════════════════════════╗
+║                           DEVICE PLACEMENT TOPOLOGY                        ║
+╚══════════════════════════════════════════════════════════════════════════════╝
+
+┌──────────────────────────────────────────────────────────────────────────────┐
+│ ◈ DeviceGroup: model                                                       │
+├──────────────────────────────────────────────────────────────────────────────┤
+│  ├─ Device Type : cuda                                                     │
+│  └─ Ranks       : [0, 1, 2, 3]                                            │
+│  ┌─ DeviceMesh: MyModel                                                    │
+│  │  Dimensions : dp=4                                                      │
+│  │  Parallelism: DP=4                                                      │
+└──────────────────────────────────────────────────────────────────────────────┘
+```
+
+## Error Handling and Notifications
+
+Remote functions automatically capture the **driver-side call site** and attach it to any exception raised inside workers:
+
+```
+[twinkle driver caller: train.py:42] CUDA out of memory
+```
+
+An optional `notifier` (e.g. DingTalk webhook) can be passed to `initialize()` to receive alerts when any remote function fails — useful for long-running distributed jobs.
+
+## When to Use Which Mode
+
+**Use local mode when:**
+- Single machine with 1-8 GPUs
+- Quick prototyping and debugging
+- Simple data-parallel training
+
+**Use Ray mode when:**
+- Multi-node clusters
+- Heterogeneous resource allocation (model GPUs + sampler GPUs + CPU processors)
+- Production training with fault tolerance needs
+- Multi-model deployments (training + inference in the same cluster)
+
+The beauty of Twinkle's design is that your training code stays the same — only the `initialize()` call changes.
diff --git a/content/blog/torchrun-ray/index.zh.md b/content/blog/torchrun-ray/index.zh.md
new file mode 100644
index 0000000..c9ce8d6
--- /dev/null
+++ b/content/blog/torchrun-ray/index.zh.md
@@ -0,0 +1,174 @@
+---
+title: "两种执行模式：torchrun（本地）与 Ray（分布式）"
+date: 2026-06-03
+tags:
+  - 基础设施
+  - Ray
+  - torchrun
+  - 分布式训练
+  - 多机训练
+categories:
+  - 技术深度解析
+---
+
+Twinkle 的 `infra` 模块提供统一的编程模型，无缝支持两种运行模式：**local**（单机 torchrun）和 **ray**（多机 Ray 集群）。本文介绍其架构设计、基于装饰器的 API，以及各模式的适用场景。
+
+<!--more-->
+
+## 两种模式概览
+
+| | Local (torchrun) | Ray (分布式) |
+|---|---|---|
+| 启动方式 | `torchrun --nproc_per_node=N` | `ray start` + 驱动脚本 |
+| 适用范围 | 单机，共享文件系统 | 多机集群 |
+| 进程模型 | 每 GPU 一个进程，torch.distributed | Ray Actor + PlacementGroup |
+| 最适合 | 快速实验、单机训练 | 生产环境多机、异构资源 |
+
+两种模式使用**完全相同的用户代码**——切换只需修改 `twinkle.infra.initialize()` 调用：调整 `mode` 参数，Ray 模式下还需显式定义 `DeviceGroup`。
+
+## 初始化
+
+```python
+import twinkle.infra as infra
+
+# Local 模式 — 从 torchrun 环境变量自动检测 ranks 和设备
+infra.initialize(mode='local', seed=42)
+
+# Ray 模式 — 需要显式定义 DeviceGroup
+infra.initialize(
+    mode='ray',
+    nproc_per_node=8,
+    groups=[
+        DeviceGroup(name='model', ranks=list(range(4)), device_type='cuda'),
+        DeviceGroup(name='sampler', ranks=list(range(4, 8)), device_type='cuda'),
+    ],
+    seed=42,
+)
+```
+
+**Local 模式**下，Twinkle 从环境变量读取 `WORLD_SIZE`、`RANK`、`LOCAL_RANK`（由 torchrun 设置），创建一个涵盖所有 GPU 的默认 `DeviceGroup`，并自动构建带 `dp` 维度的 `DeviceMesh`。
+
+**Ray 模式**下，`RayHelper.initialize()` 创建 `ResourceManager`：
+1. 查询 Ray 集群所有活跃节点的 GPU/NPU 资源
+2. 为每个节点创建 `PlacementGroup` 包，保证资源共置
+3. 通过 `visible_devices` 发现将逻辑 rank 映射到物理 GPU
+
+## 装饰器 API
+
+Twinkle 的核心抽象是两个装饰器，让任何类都能透明地分布式化：
+
+### `@remote_class`
+
+封装类的 `__init__`，在本地直接运行或创建 Ray Actor：
+
+```python
+@infra.remote_class(execute='all')
+class MyModel:
+    def __init__(self, device_mesh: DeviceMesh):
+        self.model = load_model()
+        ...
+```
+
+Local 模式下 `__init__` 正常执行。Ray 模式下，`RayHelper.create_workers()` 为每个 GPU rank 创建一个 Ray Actor，每个 Actor 具备：
+- 独立的 `CUDA_VISIBLE_DEVICES`，指向分配的物理 GPU
+- 用于 torch.distributed 初始化的 `MASTER_ADDR` / `MASTER_PORT`
+- 正确的 `WORLD_SIZE` / `RANK` 环境变量
+
+### `@remote_function`
+
+为方法添加分发、执行和聚合语义：
+
+```python
+@infra.remote_function(dispatch='slice_dp', collect='mean')
+def train_step(self, batch):
+    loss = self.model(batch)
+    return {'loss': loss.item()}
+```
+
+三个参数控制分布式行为：
+
+**dispatch** — 参数如何分配给 worker：
+- `'all'`：所有 worker 收到相同参数
+- `'slice'`：参数均匀分片
+- `'slice_dp'`：按 DeviceMesh 的数据并行维度分片（EP 感知）
+
+**execute** — 哪些 worker 执行：
+- `'all'`：所有 worker（默认）
+- `'first'`：仅第一个 worker
+- `'peer'`：仅对等 worker（用于跨组通信）
+
+**collect** — 结果如何聚合：
+- `'none'`：返回原始列表
+- `'mean'` / `'sum'`：数值归约
+- `'first'`：返回第一个 worker 的结果
+- `'last_pp'`：返回最后一个流水线并行阶段的结果
+- `Callable`：自定义聚合函数
+
+## LazyCollect：延迟结果聚合
+
+Ray 模式下的一个关键优化是 **LazyCollect**。远程调用后不会立即阻塞在 `ray.get()` 上，而是返回一个 `LazyCollect` 可调用对象：
+
+```python
+result = model.train_step(batch)   # 返回 LazyCollect（非阻塞）
+# ... 执行其他工作 ...
+actual_result = result()           # 需要值时才阻塞
+```
+
+这使得计算和通信可以重叠——驱动端可以同时向多个组（model、sampler、processor）分发任务，仅在真正消费结果时阻塞。
+
+LazyCollect 还支持 `__iter__` 和 `__len__`，对大部分消费代码完全透明。
+
+## ResourceManager：GPU 分配
+
+`ResourceManager` 处理 GPU 到节点映射的复杂逻辑：
+
+1. **节点发现** — 查询 Ray 获取所有活跃节点及 GPU 数量
+2. **PlacementGroup 创建** — 每节点一个 PG，包含 `{GPU: N, CPU: node_cpu//2}` 资源包
+3. **GPU 映射** — 发现每个节点的实际 `CUDA_VISIBLE_DEVICES`，正确映射逻辑 rank 到物理 GPU
+4. **多加速器支持** — 通过 `Platform` 抽象支持 GPU、NPU 等多种加速器。使用 `RAY_EXPERIMENTAL_NOSET_*` 环境变量防止 Ray 覆盖设备可见性
+5. **CPU Worker 支持** — 为纯 CPU 进程（数据处理器）创建独立的 PlacementGroup
+
+## 设备拓扑可视化
+
+Twinkle 提供 `get_device_placement()` 渲染训练拓扑：
+
+```
+╔══════════════════════════════════════════════════════════════════════════════╗
+║                           DEVICE PLACEMENT TOPOLOGY                        ║
+╚══════════════════════════════════════════════════════════════════════════════╝
+
+┌──────────────────────────────────────────────────────────────────────────────┐
+│ ◈ DeviceGroup: model                                                       │
+├──────────────────────────────────────────────────────────────────────────────┤
+│  ├─ Device Type : cuda                                                     │
+│  └─ Ranks       : [0, 1, 2, 3]                                            │
+│  ┌─ DeviceMesh: MyModel                                                    │
+│  │  Dimensions : dp=4                                                      │
+│  │  Parallelism: DP=4                                                      │
+└──────────────────────────────────────────────────────────────────────────────┘
+```
+
+## 错误处理与通知
+
+远程函数会自动捕获**驱动端调用位置**，并附加到 worker 内部抛出的异常中：
+
+```
+[twinkle driver caller: train.py:42] CUDA out of memory
+```
+
+可通过 `initialize()` 传入可选的 `notifier`（如钉钉 webhook），在任何远程函数失败时发送告警——适用于长时间运行的分布式任务。
+
+## 如何选择模式
+
+**使用 Local 模式：**
+- 单机 1-8 张 GPU
+- 快速原型验证和调试
+- 简单数据并行训练
+
+**使用 Ray 模式：**
+- 多机集群
+- 异构资源分配（模型 GPU + 采样器 GPU + CPU 处理器）
+- 生产级训练，需要容错机制
+- 多模型部署（训练 + 推理在同一集群）
+
+Twinkle 设计的优雅之处在于——你的训练代码保持不变，只需修改 `initialize()` 调用即可切换模式。
diff --git a/content/blog/tui-auto-research/index.md b/content/blog/tui-auto-research/index.md
new file mode 100644
index 0000000..018b419
--- /dev/null
+++ b/content/blog/tui-auto-research/index.md
@@ -0,0 +1,156 @@
+---
+title: "TUI & Auto-Research: An AI Agent for Training Control"
+date: 2026-06-01
+aliases:
+  - /blog/auto-research/
+tags:
+  - TUI
+  - Agent
+  - Auto-Research
+  - LLM Tools
+  - Terminal UI
+categories:
+  - Technical Deep Dive
+---
+
+Twinkle ships a terminal-based UI (TUI) powered by an embedded LLM agent that can autonomously start, monitor, pause, and debug ML training runs. This post covers the architecture of the TUI, the agent loop, and the tool system that makes "auto-research" possible.
+
+<!--more-->
+
+## Architecture Overview
+
+The TUI is built on [Textual](https://textual.textualize.io/) and consists of four panels arranged in a grid of 2 columns × 3 rows:
+
+| Panel | Position | Purpose |
+|-------|----------|---------|
+| **StatusBar** | Top, full width | Run ID, model, step counter, training state |
+| **MetricsPanel** | Middle left | Real-time loss/reward/grad_norm charts |
+| **LogPanel** | Right, spanning 2 rows | Streaming stdout from training process |
+| **ChatPanel** | Bottom left | Natural language interaction with the agent |
+
+```css
+Screen {
+    layout: grid;
+    grid-size: 2 3;
+    grid-rows: auto 2fr 3fr;
+    grid-columns: 2fr 1fr;
+}
+```
+
+## The Agent Loop
+
+At the heart of the TUI is `AgentLoop` — an async tool-calling agent that uses any **OpenAI-compatible API** (local Ollama, cloud API, etc.):
+
+```python
+agent = AgentLoop(
+    connection=connection,
+    llm_base_url='http://localhost:11434/v1',
+    llm_model='qwen3.5',
+    llm_api_key='not-needed',
+)
+```
+
+The loop follows a standard ReAct pattern:
+
+1. User sends a message via ChatPanel
+2. Agent calls LLM with conversation history + tool schemas
+3. LLM either responds directly or generates tool calls
+4. Tools are executed, results fed back to LLM
+5. Repeat until LLM produces a final text response (max 10 rounds)
+
+Key design decisions:
+- **Streaming**: Tokens are streamed to the UI in real-time. If tool calls are detected mid-stream, `on_stream_reset` discards partial output
+- **History pruning**: Conversation is capped at 50 messages (excluding system prompt), with cuts always at `user` message boundaries to avoid breaking tool-call sequences
+- **Async skills loading**: Skills are loaded in the background — the agent is usable immediately, skills are injected via `inject_skills()` when ready
+
+## Tool System
+
+The agent has access to 15+ tools organized into categories:
+
+### Training Lifecycle
+| Tool | Description |
+|------|-------------|
+| `start_server` | Launch Ray cluster + Twinkle Server (GPU partition, config generation) |
+| `shutdown_server` | Stop server and release GPU resources |
+| `start_training` | Write training script, launch process, begin monitoring |
+| `pause_training` | SIGKILL client process (server retains state) |
+| `resume_training` | Re-launch client script from saved state |
+| `stop_training` | Graceful stop with checkpoint saving |
+| `update_script` | Archive current script, write new version |
+
+### Discovery & Search
+| Tool | Description |
+|------|-------------|
+| `list_training_runs` | List active and historical runs |
+| `get_training_status` | Get run state + recent metrics |
+| `search_models` | Search ModelScope Hub for models |
+| `search_datasets` | Search ModelScope Hub for datasets |
+| `list_supported_models` | Query server for available models |
+| `get_cluster_info` | Detect GPU resources (Ray or nvidia-smi) |
+
+### Visualization
+| Tool | Description |
+|------|-------------|
+| `zoom_metrics` | Pan/zoom the metrics chart |
+| `select_metrics` | Choose which metrics to display (max 4) |
+| `select_run` | Switch monitoring to a different run |
+
+## Server Startup Pipeline
+
+The `start_server` tool orchestrates a complete server deployment:
+
+1. **Hardware detection** — `nvidia-smi` GPU count
+2. **GPU allocation** — Partition GPUs between training model and sampler/teacher models
+3. **Config generation** — Auto-generate `server_config.yaml` with Ray Serve applications
+4. **Ray cluster start** — Multi-node GPU partitioning with separate raylets per role
+5. **Server launch** — `python -m twinkle.server launch --config ...`
+6. **Health check** — Poll `/api/v1/healthz` + sampler engine readiness
+
+The config generator supports **multi-model topology**: one training model + N sampler/teacher models, sorted by GPU count in descending order (the largest PG deploys first to avoid scheduling deadlock).
+
+## Skills System
+
+The TUI supports extensible **skills** — pluggable capabilities loaded from three sources:
+
+1. **Bundled skills** — shipped with the `twinkle_client` package
+2. **Local skills** — user-defined in `~/.cache/twinkle/tui/skills/local/`
+3. **Community skills** — fetched from ModelScope (with 10s timeout)
+
+Skills are loaded asynchronously after the agent starts, so the TUI is interactive immediately.
+
+## TrainingRuntime: Script-Side Integration
+
+Training scripts integrate with the TUI via `TrainingRuntime`:
+
+```python
+from twinkle_client.tui.runtime import TrainingRuntime
+
+rt = TrainingRuntime(run_id='grpo-gsm8k')
+rt.start(model_id='Qwen/Qwen3.5-4B', config={...})
+rt.register_graceful_shutdown(model, dataloader)
+
+for step, batch in enumerate(dataloader):
+    loss = train(batch)
+    rt.log_metrics(step=step, loss=loss, reward=reward)
+    rt.log(f'Step {step}, loss={loss:.4f}')
+
+rt.finish()
+```
+
+Key features:
+- **metrics.jsonl** — structured metrics with auto-timestamp, streamed to TUI in real-time
+- **Graceful shutdown** — SIGTERM handler saves checkpoint (LoRA weights + optimizer state + dataloader position)
+- **Auto-resume** — `get_resume_info()` reads last saved step from `meta.json`
+- **Script archival** — each `update_script` call archives `train.py` as `train_v{N}.py`
+
+## Getting Started
+
+```bash
+# Start TUI with local LLM
+twinkle tui --llm-base-url http://localhost:11434/v1 --llm-model qwen3.5
+
+# Or with a specific run
+twinkle tui --run-id my-grpo-run
+```
+
+The TUI turns ML training into a conversation — describe what you want to train, and the agent handles server setup, script writing, monitoring, and troubleshooting.
diff --git a/content/blog/tui-auto-research/index.zh.md b/content/blog/tui-auto-research/index.zh.md
new file mode 100644
index 0000000..4209ca9
--- /dev/null
+++ b/content/blog/tui-auto-research/index.zh.md
@@ -0,0 +1,156 @@
+---
+title: "TUI 与 Auto-Research：用 AI Agent 控制训练"
+date: 2026-06-01
+aliases:
+  - /blog/auto-research/
+tags:
+  - TUI
+  - Agent
+  - Auto-Research
+  - LLM 工具
+  - 终端界面
+categories:
+  - 技术深度解析
+---
+
+Twinkle 内置了一个终端 UI（TUI），集成 LLM Agent，可以自主启动、监控、暂停和调试 ML 训练任务。本文介绍 TUI 的架构设计、Agent 循环以及让「自动化研究」成为可能的工具系统。
+
+<!--more-->
+
+## 架构概览
+
+TUI 基于 [Textual](https://textual.textualize.io/) 构建，由四个面板组成 2 列 × 3 行的网格布局：
+
+| 面板 | 位置 | 功能 |
+|------|------|------|
+| **StatusBar** | 顶部，全宽 | Run ID、模型、步数、训练状态 |
+| **MetricsPanel** | 中左 | 实时 loss/reward/grad_norm 图表 |
+| **LogPanel** | 右侧，跨 2 行 | 训练进程的流式 stdout 输出 |
+| **ChatPanel** | 左下 | 与 Agent 的自然语言交互 |
+
+```css
+Screen {
+    layout: grid;
+    grid-size: 2 3;
+    grid-rows: auto 2fr 3fr;
+    grid-columns: 2fr 1fr;
+}
+```
+
+## Agent 循环
+
+TUI 的核心是 `AgentLoop`——一个异步工具调用 Agent，支持任何 **OpenAI 兼容 API**（本地 Ollama、云端 API 等）：
+
+```python
+agent = AgentLoop(
+    connection=connection,
+    llm_base_url='http://localhost:11434/v1',
+    llm_model='qwen3.5',
+    llm_api_key='not-needed',
+)
+```
+
+循环遵循标准的 ReAct 模式：
+
+1. 用户通过 ChatPanel 发送消息
+2. Agent 携带对话历史 + 工具 schema 调用 LLM
+3. LLM 直接回复或生成工具调用
+4. 执行工具，将结果回传 LLM
+5. 重复直到 LLM 产生最终文本回复（最多 10 轮）
+
+关键设计决策：
+- **流式输出**：Token 实时流式传输到 UI。如果在流中检测到工具调用，`on_stream_reset` 会丢弃已部分显示的输出
+- **历史修剪**：对话上限 50 条消息（不含系统提示），始终在 `user` 消息边界裁剪，避免破坏工具调用序列
+- **异步技能加载**：技能在后台加载——Agent 立即可用，技能就绪后通过 `inject_skills()` 注入
+
+## 工具系统
+
+Agent 拥有 15+ 个工具，按类别组织：
+
+### 训练生命周期
+| 工具 | 说明 |
+|------|------|
+| `start_server` | 启动 Ray 集群 + Twinkle Server（GPU 分区、配置生成） |
+| `shutdown_server` | 关停 Server 并释放 GPU 资源 |
+| `start_training` | 编写训练脚本、启动进程、开始监控 |
+| `pause_training` | SIGKILL 客户端进程（Server 保留状态） |
+| `resume_training` | 从保存的状态重新启动客户端脚本 |
+| `stop_training` | 优雅停止并保存 checkpoint |
+| `update_script` | 归档当前脚本，写入新版本 |
+
+### 发现与搜索
+| 工具 | 说明 |
+|------|------|
+| `list_training_runs` | 列出活跃和历史训练任务 |
+| `get_training_status` | 获取任务状态 + 最近指标 |
+| `search_models` | 在 ModelScope Hub 搜索模型 |
+| `search_datasets` | 在 ModelScope Hub 搜索数据集 |
+| `list_supported_models` | 查询 Server 支持的模型 |
+| `get_cluster_info` | 检测 GPU 资源（Ray 或 nvidia-smi） |
+
+### 可视化
+| 工具 | 说明 |
+|------|------|
+| `zoom_metrics` | 平移/缩放指标图表 |
+| `select_metrics` | 选择显示哪些指标（最多 4 个） |
+| `select_run` | 切换监控的训练任务 |
+
+## Server 启动流水线
+
+`start_server` 工具编排完整的服务端部署：
+
+1. **硬件检测** — `nvidia-smi` 获取 GPU 数量
+2. **GPU 分配** — 在训练模型和采样器/教师模型之间分区
+3. **配置生成** — 自动生成包含 Ray Serve 应用的 `server_config.yaml`
+4. **Ray 集群启动** — 多节点 GPU 分区，每个角色使用独立 raylet
+5. **Server 启动** — `python -m twinkle.server launch --config ...`
+6. **健康检查** — 轮询 `/api/v1/healthz` + 采样器引擎就绪检测
+
+配置生成器支持**多模型拓扑**：一个训练模型 + N 个采样器/教师模型，按 GPU 数量降序排列（最大 PG 优先部署以避免调度死锁）。
+
+## 技能系统
+
+TUI 支持可扩展的 **Skills**——可插拔能力，从三个来源加载：
+
+1. **内置技能** — 随 `twinkle_client` 包一起发布
+2. **本地技能** — 用户自定义，放在 `~/.cache/twinkle/tui/skills/local/`
+3. **社区技能** — 从 ModelScope 获取（10 秒超时）
+
+技能在 Agent 启动后异步加载，因此 TUI 立即可交互。
+
+## TrainingRuntime：训练脚本集成
+
+训练脚本通过 `TrainingRuntime` 与 TUI 集成：
+
+```python
+from twinkle_client.tui.runtime import TrainingRuntime
+
+rt = TrainingRuntime(run_id='grpo-gsm8k')
+rt.start(model_id='Qwen/Qwen3.5-4B', config={...})
+rt.register_graceful_shutdown(model, dataloader)
+
+for step, batch in enumerate(dataloader):
+    loss = train(batch)
+    rt.log_metrics(step=step, loss=loss, reward=reward)
+    rt.log(f'Step {step}, loss={loss:.4f}')
+
+rt.finish()
+```
+
+核心功能：
+- **metrics.jsonl** — 结构化指标，自动时间戳，实时流式传输到 TUI
+- **优雅停机** — SIGTERM 处理器保存 checkpoint（LoRA 权重 + 优化器状态 + dataloader 位置）
+- **自动续训** — `get_resume_info()` 从 `meta.json` 读取最后保存的步数
+- **脚本归档** — 每次 `update_script` 调用将 `train.py` 归档为 `train_v{N}.py`
+
+## 快速开始
+
+```bash
+# 使用本地 LLM 启动 TUI
+twinkle tui --llm-base-url http://localhost:11434/v1 --llm-model qwen3.5
+
+# 或指定运行 ID
+twinkle tui --run-id my-grpo-run
+```
+
+TUI 将 ML 训练变成一场对话——描述你想训练什么，Agent 自动处理服务器部署、脚本编写、监控和排障。
diff --git a/content/blog/twinkle-vs-verl/index.md b/content/blog/twinkle-vs-verl/index.md
deleted file mode 100644
index d9f4a58..0000000
--- a/content/blog/twinkle-vs-verl/index.md
+++ /dev/null
@@ -1,185 +0,0 @@
----
-title: "Twinkle vs veRL: Two Approaches to LLM Post-Training"
-date: 2026-03-18
-tags:
-  - Reinforcement Learning
-  - GRPO
-  - veRL
-  - Comparison
-categories:
-  - Technical
----
-
-Reinforcement Learning from Human Feedback (RLHF) and its variants have become essential for aligning LLMs. Two excellent open-source frameworks in this space are **veRL** (from ByteDance Seed team) and **Twinkle** (from ModelScope). Both are production-ready and support diverse training scenarios. In this post, we compare their architectural philosophies and help you choose the right tool for your needs.
-
-<!--more-->
-
-## Overview
-
-Both veRL and Twinkle are mature, production-ready frameworks for LLM post-training. They share many capabilities but differ in architectural philosophy:
-
-| Aspect | veRL | Twinkle |
-|--------|------|---------|
-| Architecture | Hybrid-controller (HybridFlow) | Client-Server decoupled |
-| Core Strength | RL algorithm richness | Multi-tenant unified platform |
-| Backends | FSDP, Megatron-LM, vLLM, SGLang | Transformers, Megatron |
-| Hardware | NVIDIA, AMD, Ascend | NVIDIA, Ascend |
-| Deployment | Ray cluster | torchrun / Ray / HTTP (TaaS) |
-
-## Architecture Comparison
-
-### veRL: Hybrid-Controller Architecture
-
-veRL implements the HybridFlow paper's hybrid-controller design, optimizing dataflow between training and inference phases:
-
-```
-┌─────────────────────────────────────────────┐
-│            veRL Hybrid Controller            │
-│  ┌────────────┐  ┌────────────┐  ┌─────────┐ │
-│  │  Rollout   │  │  Training  │  │  Reward │ │
-│  │ (vLLM/SGL) │──│  (FSDP/   │──│  Model  │ │
-│  │            │  │ Megatron) │  │         │ │
-│  └────────────┘  └────────────┘  └─────────┘ │
-│       3D-HybridEngine: Efficient Resharding   │
-└─────────────────────────────────────────────┘
-```
-
-Key strengths:
-- **3D-HybridEngine**: Eliminates memory redundancy during training/generation transitions
-- **Rich RL algorithms**: PPO, GRPO, DAPO, VAPO, REINFORCE++, RLOO, PRIME, and more
-- **Inference engine integration**: First-class vLLM and SGLang support
-- **Proven at scale**: Used to train Doubao-1.5-pro, achieving O1-level math performance
-
-### Twinkle: Client-Server Decoupled Architecture
-
-Twinkle separates concerns into client (data/logic) and server (model/compute) components:
-
-```
-┌──────────────┐     ┌──────────────────────────┐
-│    Client    │     │      Server Cluster      │
-│  ┌────────┐  │     │  ┌─────────────────────┐ │
-│  │Dataset │  │────▶│  │    Base Model       │ │
-│  │Template│  │     │  ├─────────────────────┤ │
-│  │  Loss  │  │     │  │ LoRA A │ LoRA B │...│ │
-│  └────────┘  │     │  └─────────────────────┘ │
-└──────────────┘     └──────────────────────────┘
-```
-
-Key strengths:
-- **Multi-tenancy**: Multiple LoRA training jobs on a shared base model
-- **HTTP/TaaS mode**: Deploy as a service, train via API calls
-- **Unified platform**: SFT, PT, and RL on the same infrastructure
-- **Explicit training loop**: Full control over each training step
-
-## Feature Comparison
-
-### RL Algorithms
-
-| Algorithm | veRL | Twinkle |
-|-----------|------|---------|
-| PPO | ✅ | ✅ |
-| GRPO | ✅ | ✅ |
-| DAPO / VAPO | ✅ | - |
-| REINFORCE++ | ✅ | - |
-| RLOO | ✅ | ✅ |
-| GKD | ✅ | ✅ |
-| Multi-turn RL | ✅ | ✅ |
-
-### Training Capabilities
-
-| Feature | veRL | Twinkle |
-|---------|------|---------|
-| SFT | ✅ | ✅ |
-| Pre-training | ✅ | ✅ |
-| LoRA | ✅ | ✅ |
-| VLM / Multimodal | ✅ (Qwen2.5-VL, Kimi-VL) | Planned |
-| Multi-turn + Tools | ✅ | ✅ |
-| Multi-tenancy | - | ✅ |
-
-### Scale & Performance
-
-| Aspect | veRL | Twinkle |
-|--------|------|---------|
-| Max tested scale | 671B (DeepSeek), hundreds of GPUs | 72B+, Ray clusters |
-| Inference engines | vLLM, SGLang, HF | vLLM, HF |
-| Training backends | FSDP, FSDP2, Megatron-LM | Transformers, Megatron |
-
-## When to Choose veRL
-
-veRL excels when:
-- You need **state-of-the-art RL algorithms** (DAPO, VAPO, REINFORCE++)
-- **VLM/multimodal RL** is a requirement
-- You want **vLLM/SGLang** as your inference engine for rollouts
-- You're pushing the **frontier of RL research** for reasoning models
-- You need **proven scale** (671B models, O1-level results)
-
-## When to Choose Twinkle
-
-Twinkle excels when:
-- **Multi-tenancy** is critical (multiple teams, concurrent training jobs)
-- You need a **unified SFT → RL pipeline** with one infrastructure
-- **Training-as-a-Service (TaaS)** deployment via HTTP is important
-- You want **explicit training loop control** for custom logic
-- **Pre-training** is part of your workflow
-
-## Code Style Comparison
-
-### veRL: Declarative Trainer
-
-```python
-# veRL style - configure and run
-from verl import DataProto
-from verl.trainer.ppo import PPOTrainer
-
-trainer = PPOTrainer(
-    config=config,
-    actor_rollout_ref=actor,
-    critic=critic,
-    reward_model=reward_fn,
-)
-trainer.fit()
-```
-
-### Twinkle: Explicit Training Loop
-
-```python
-# Twinkle style - explicit control
-from twinkle import TransformersModel
-
-model = TransformersModel(model_id=model_id)
-model.add_adapter_to_model('default', lora_config)
-model.set_optimizer(optimizer_cls='AdamW', lr=1e-4)
-
-for batch in dataloader:
-    model.forward_backward(inputs=batch)
-    # Custom logic here
-    model.clip_grad_and_step()
-```
-
-## Conclusion
-
-Both veRL and Twinkle are excellent choices for LLM post-training. They represent different design philosophies:
-
-- **veRL**: Optimized for RL performance and algorithm diversity, with cutting-edge research support
-- **Twinkle**: Optimized for operational flexibility, multi-tenancy, and unified training workflows
-
-The good news? Both are open source, actively maintained, and production-ready. Choose based on your primary use case:
-
-| Your Priority | Recommended |
-|---------------|-------------|
-| Cutting-edge RL algorithms | veRL |
-| VLM/multimodal training | veRL |
-| Multi-tenant platform | Twinkle |
-| TaaS deployment | Twinkle |
-| Unified SFT+RL infra | Twinkle |
-
-## Resources
-
-**veRL**:
-- [GitHub](https://github.com/verl-project/verl)
-- [Documentation](https://verl.readthedocs.io/)
-
-**Twinkle**:
-- [GitHub](https://github.com/modelscope/twinkle)
-- [Documentation](https://twinkle-kit.readthedocs.io/)
-- [GRPO Cookbook](https://github.com/modelscope/twinkle/tree/main/cookbook/rl)
diff --git a/content/blog/twinkle-vs-verl/index.zh.md b/content/blog/twinkle-vs-verl/index.zh.md
deleted file mode 100644
index b252044..0000000
--- a/content/blog/twinkle-vs-verl/index.zh.md
+++ /dev/null
@@ -1,185 +0,0 @@
----
-title: "Twinkle vs veRL：LLM 后训练的两种方案"
-date: 2026-03-18
-tags:
-  - 强化学习
-  - GRPO
-  - veRL
-  - 对比
-categories:
-  - 技术
----
-
-基于人类反馈的强化学习（RLHF）及其变体已成为 LLM 对齐的必备技术。这一领域有两个优秀的开源框架：**veRL**（来自字节 Seed 团队）和 **Twinkle**（来自魔搭社区）。两者都是生产就绪的框架，支持多种训练场景。本文将比较它们的架构理念，帮助你选择合适的工具。
-
-<!--more-->
-
-## 概述
-
-veRL 和 Twinkle 都是成熟的、生产就绪的 LLM 后训练框架。它们共享许多能力，但架构理念不同：
-
-| 方面 | veRL | Twinkle |
-|------|------|---------|
-| 架构 | Hybrid-controller (HybridFlow) | Client-Server 解耦 |
-| 核心优势 | RL 算法丰富度 | 多租户统一平台 |
-| 后端支持 | FSDP, Megatron-LM, vLLM, SGLang | Transformers, Megatron |
-| 硬件支持 | NVIDIA, AMD, 昇腾 | NVIDIA, 昇腾 |
-| 部署方式 | Ray 集群 | torchrun / Ray / HTTP (TaaS) |
-
-## 架构对比
-
-### veRL：Hybrid-Controller 架构
-
-veRL 实现了 HybridFlow 论文的混合控制器设计，优化训练和推理阶段之间的数据流：
-
-```
-┌─────────────────────────────────────────────┐
-│            veRL Hybrid Controller            │
-│  ┌────────────┐  ┌────────────┐  ┌─────────┐ │
-│  │  Rollout   │  │  Training  │  │  Reward │ │
-│  │ (vLLM/SGL) │──│  (FSDP/   │──│  Model  │ │
-│  │            │  │ Megatron) │  │         │ │
-│  └────────────┘  └────────────┘  └─────────┘ │
-│       3D-HybridEngine: 高效重分片            │
-└─────────────────────────────────────────────┘
-```
-
-核心优势：
-- **3D-HybridEngine**：消除训练/生成转换时的内存冗余
-- **丰富的 RL 算法**：PPO, GRPO, DAPO, VAPO, REINFORCE++, RLOO, PRIME 等
-- **推理引擎集成**：一流的 vLLM 和 SGLang 支持
-- **规模化验证**：用于训练豆包-1.5-pro，达到 O1 级别的数学性能
-
-### Twinkle：Client-Server 解耦架构
-
-Twinkle 将关注点分离为客户端（数据/逻辑）和服务端（模型/算力）组件：
-
-```
-┌──────────────┐     ┌──────────────────────────┐
-│     客户端    │     │        服务器集群         │
-│  ┌────────┐  │     │  ┌─────────────────────┐ │
-│  │Dataset │  │────▶│  │       基座模型        │ │
-│  │Template│  │     │  ├─────────────────────┤ │
-│  │  Loss  │  │     │  │ LoRA A │ LoRA B │...│ │
-│  └────────┘  │     │  └─────────────────────┘ │
-└──────────────┘     └──────────────────────────┘
-```
-
-核心优势：
-- **多租户**：共享基座模型上同时训练多个 LoRA
-- **HTTP/TaaS 模式**：部署为服务，通过 API 调用训练
-- **统一平台**：SFT、PT 和 RL 在同一基础设施上
-- **显式训练循环**：完全控制每个训练步骤
-
-## 功能对比
-
-### RL 算法
-
-| 算法 | veRL | Twinkle |
-|------|------|---------|
-| PPO | ✅ | ✅ |
-| GRPO | ✅ | ✅ |
-| DAPO / VAPO | ✅ | - |
-| REINFORCE++ | ✅ | - |
-| RLOO | ✅ | ✅ |
-| GKD | ✅ | ✅ |
-| 多轮 RL | ✅ | ✅ |
-
-### 训练能力
-
-| 功能 | veRL | Twinkle |
-|------|------|---------|
-| SFT | ✅ | ✅ |
-| 预训练 | ✅ | ✅ |
-| LoRA | ✅ | ✅ |
-| VLM / 多模态 | ✅ (Qwen2.5-VL, Kimi-VL) | 规划中 |
-| 多轮 + 工具调用 | ✅ | ✅ |
-| 多租户 | - | ✅ |
-
-### 规模与性能
-
-| 方面 | veRL | Twinkle |
-|------|------|---------|
-| 最大测试规模 | 671B (DeepSeek)，数百 GPU | 72B+，Ray 集群 |
-| 推理引擎 | vLLM, SGLang, HF | vLLM, HF |
-| 训练后端 | FSDP, FSDP2, Megatron-LM | Transformers, Megatron |
-
-## 何时选择 veRL
-
-veRL 在以下场景中表现优异：
-- 需要**最前沿的 RL 算法**（DAPO, VAPO, REINFORCE++）
-- **VLM/多模态 RL** 是必要需求
-- 想使用 **vLLM/SGLang** 作为 rollout 的推理引擎
-- 正在探索**推理模型的 RL 前沿研究**
-- 需要**已验证的规模化**（671B 模型，O1 级别效果）
-
-## 何时选择 Twinkle
-
-Twinkle 在以下场景中表现优异：
-- **多租户**是关键需求（多团队、并发训练任务）
-- 需要统一的 **SFT → RL 流水线**
-- **训练即服务（TaaS）** 通过 HTTP 部署很重要
-- 想要**显式训练循环控制**以实现自定义逻辑
-- **预训练**是工作流的一部分
-
-## 代码风格对比
-
-### veRL：声明式 Trainer
-
-```python
-# veRL 风格 - 配置并运行
-from verl import DataProto
-from verl.trainer.ppo import PPOTrainer
-
-trainer = PPOTrainer(
-    config=config,
-    actor_rollout_ref=actor,
-    critic=critic,
-    reward_model=reward_fn,
-)
-trainer.fit()
-```
-
-### Twinkle：显式训练循环
-
-```python
-# Twinkle 风格 - 显式控制
-from twinkle import TransformersModel
-
-model = TransformersModel(model_id=model_id)
-model.add_adapter_to_model('default', lora_config)
-model.set_optimizer(optimizer_cls='AdamW', lr=1e-4)
-
-for batch in dataloader:
-    model.forward_backward(inputs=batch)
-    # 自定义逻辑
-    model.clip_grad_and_step()
-```
-
-## 结论
-
-veRL 和 Twinkle 都是 LLM 后训练的优秀选择。它们代表了不同的设计理念：
-
-- **veRL**：为 RL 性能和算法多样性优化，支持前沿研究
-- **Twinkle**：为运营灵活性、多租户和统一训练工作流优化
-
-好消息是：两者都是开源的、积极维护的、生产就绪的。根据你的主要用例选择：
-
-| 你的优先级 | 推荐 |
-|----------|------|
-| 前沿 RL 算法 | veRL |
-| VLM/多模态训练 | veRL |
-| 多租户平台 | Twinkle |
-| TaaS 部署 | Twinkle |
-| 统一 SFT+RL 基础设施 | Twinkle |
-
-## 资源
-
-**veRL**：
-- [GitHub](https://github.com/verl-project/verl)
-- [文档](https://verl.readthedocs.io/)
-
-**Twinkle**：
-- [GitHub](https://github.com/modelscope/twinkle)
-- [文档](https://twinkle-kit.readthedocs.io/)
-- [GRPO Cookbook](https://github.com/modelscope/twinkle/tree/main/cookbook/rl)
diff --git a/content/community/index.md b/content/community/index.md
index cf3a59c..653eee1 100644
--- a/content/community/index.md
+++ b/content/community/index.md
@@ -42,6 +42,20 @@ Twinkle is part of the [ModelScope](https://github.com/modelscope) open-source e
 - [ms-swift](https://github.com/modelscope/ms-swift) — Efficient fine-tuning framework
 - [Transformers](https://github.com/huggingface/transformers) — Model framework
 
+## Contributors {#contributors}
+
+**Twinkle✨** is designed, developed, and maintained by an **Open Workshop** composed of members from various open-source technology teams. We welcome more developers passionate about large model training to join us in building and improving this framework.
+
+The core members of the workshop currently come from:
+
+- [ModelScope](https://modelscope.cn/home) Open Source Community Project Team
+- [China Merchants Bank](https://www.cmbchina.com/) Open Source Technology Team
+- Technical staff from various compute hardware teams
+
+We are grateful to the open-source community, particularly the projects that inspired us, including [Transformers](https://github.com/huggingface/transformers), [MS-SWIFT](https://github.com/modelscope/ms-swift), [veRL](https://github.com/verl-project/verl), [Tinker](https://github.com/thinking-machines-lab/tinker), and many others.
+
+We welcome open contributions via [Issues](https://github.com/modelscope/twinkle/issues) and [Pull Requests](https://github.com/modelscope/twinkle/pulls).
+
 ## Further Resources
 
 - [Ray Documentation](https://docs.ray.io/) — The distributed computing framework
diff --git a/content/community/index.zh.md b/content/community/index.zh.md
index d5f6f6f..36d3268 100644
--- a/content/community/index.zh.md
+++ b/content/community/index.zh.md
@@ -42,6 +42,20 @@ Twinkle 是 [ModelScope](https://github.com/modelscope) 开源生态系统的一
 - [ms-swift](https://github.com/modelscope/ms-swift) — 高效微调框架
 - [Transformers](https://github.com/huggingface/transformers) — 模型框架
 
+## 贡献者 {#contributors}
+
+**Twinkle✨** 由来自多个开源技术团队成员组成的**开放工坊**（Open Workshop）设计、开发和维护。我们欢迎更多热衷于大模型训练的开发者加入，共同构建和完善这个框架。
+
+工坊核心成员目前来自：
+
+- [魔搭社区（ModelScope）](https://modelscope.cn/home) 开源社区项目组
+- [招商银行](https://www.cmbchina.com/) 开源技术团队
+- 多家算力硬件厂商技术人员
+
+我们感谢开源社区，特别是启发我们的项目，包括 [Transformers](https://github.com/huggingface/transformers)、[MS-SWIFT](https://github.com/modelscope/ms-swift)、[veRL](https://github.com/verl-project/verl)、[Tinker](https://github.com/thinking-machines-lab/tinker) 等。
+
+欢迎通过 [Issues](https://github.com/modelscope/twinkle/issues) 和 [Pull Requests](https://github.com/modelscope/twinkle/pulls) 参与开源贡献。
+
 ## 更多资源
 
 - [Ray 文档](https://docs.ray.io/) — 分布式计算框架
diff --git a/content/docs/_index.md b/content/docs/_index.md
index 05d9199..e709c20 100644
--- a/content/docs/_index.md
+++ b/content/docs/_index.md
@@ -13,6 +13,8 @@ Whether you are a researcher customizing training algorithms, a developer buildi
 
 {{< cards >}}
   {{< card url="getting-started" title="Quick Start" icon="rocket-launch" subtitle="Install and run your first training in 5 minutes" >}}
-  {{< card url="guide/" title="User Guide" icon="book-open" subtitle="Deep dive into architecture, components, and deployment" >}}
+  {{< card url="guide/" title="Concepts & Architecture" icon="book-open" subtitle="Deep dive into architecture, runtime modes, and deployment" >}}
+  {{< card url="usage-guide/" title="Tutorials" icon="document-text" subtitle="Step-by-step guides: Training Guide, Qwen3.5, Embedding Training" >}}
+  {{< card url="components/" title="Components" icon="puzzle-piece" subtitle="Dataset, Model, Sampler, Reward and more" >}}
   {{< card url="reference/" title="API Reference" icon="code-bracket" subtitle="Component APIs and customization" >}}
 {{< /cards >}}
diff --git a/content/docs/_index.zh.md b/content/docs/_index.zh.md
index 346179b..4fbfb6a 100644
--- a/content/docs/_index.zh.md
+++ b/content/docs/_index.zh.md
@@ -13,6 +13,8 @@ title: Twinkle 文档
 
 {{< cards >}}
   {{< card url="getting-started" title="快速入门" icon="rocket-launch" subtitle="5 分钟内安装并执行首次训练" >}}
-  {{< card url="guide/" title="用户指南" icon="book-open" subtitle="深入了解架构、组件和部署" >}}
+  {{< card url="guide/" title="概念与架构" icon="book-open" subtitle="深入了解架构、运行模式和部署" >}}
+  {{< card url="usage-guide/" title="实战教程" icon="document-text" subtitle="分步指南：训练指南、Qwen3.5、Embedding 训练" >}}
+  {{< card url="components/" title="组件" icon="puzzle-piece" subtitle="数据集、模型、采样器、奖励等" >}}
   {{< card url="reference/" title="API 参考" icon="code-bracket" subtitle="组件 API 和定制" >}}
 {{< /cards >}}
diff --git a/content/docs/getting-started.md b/content/docs/getting-started.md
index 47a9c98..2795678 100644
--- a/content/docs/getting-started.md
+++ b/content/docs/getting-started.md
@@ -187,6 +187,78 @@ for data in dataloader:
     break
 ```
 
+## After Deployment: OpenAI-Compatible API
+
+After deploying your model with Twinkle Server, you get an **OpenAI-compatible API** out of the box. Any OpenAI SDK or tool can directly call your model for inference:
+
+```bash
+# Start the server
+twinkle-server launch -c server_config.yaml
+```
+
+> For details on writing `server_config.yaml`, see [Server & Client Guide](../guide/server-client/).
+
+```python
+from openai import OpenAI
+
+client = OpenAI(
+    base_url='http://localhost:8000/api/v1',
+    api_key='your-token',
+)
+
+response = client.chat.completions.create(
+    model='Qwen/Qwen3.5-4B',
+    messages=[{'role': 'user', 'content': 'Hello!'}],
+    temperature=0.7,
+    stream=True,
+)
+for chunk in response:
+    print(chunk.choices[0].delta.content or '', end='')
+```
+
+Supported endpoints:
+
+| Endpoint | Method | Description |
+|----------|--------|-------------|
+| `/chat/completions` | POST | Chat completions (streaming & non-streaming) |
+| `/models` | GET | List available models |
+
+Features include full streaming support (SSE), sticky session routing for adapter isolation, automatic chat template initialization, and adapter-to-base-model resolution.
+
+## Auto Research: Train with Natural Language
+
+Auto Research is Twinkle's built-in LLM agent terminal that autonomously handles the entire training workflow through natural language — from cluster setup, script generation, and training launch to error diagnosis and auto-fix, without writing any shell commands:
+
+```bash
+# Install the client
+pip install twinkle-client
+
+# Launch Auto Research with a local LLM
+twinkle-tui --llm-base-url http://localhost:11434/v1 --llm-model qwen3.5
+
+# Or with a remote API
+twinkle-tui --llm-base-url https://api.example.com/v1 --llm-api-key sk-xxx --llm-model gpt-4o
+```
+
+**What you can do:**
+
+- *"Start a GRPO training with Qwen3.5-4B on gsm8k"* — auto-generates scripts and launches training
+- *"How is the training going?"* — real-time metrics and status monitoring
+- *"Show me the reward metrics, zoom into steps 100-200"* — interactive chart visualization
+- *"Search for math datasets on ModelScope"* — model and dataset discovery
+
+**Key capabilities:**
+
+| Feature | Description |
+|---------|-------------|
+| Training lifecycle | Start, pause, resume, stop with checkpoint saving |
+| Server management | Auto GPU partitioning, Ray cluster setup, health checks |
+| Auto-fix | Detects crashes, diagnoses errors, rewrites scripts, and restarts (up to 3 attempts) |
+| Real-time monitoring | ASCII metrics charts, log streaming, health checks every 30s |
+| Skills system | Extensible plugin architecture (bundled + local + community) |
+
+Auto Research turns ML training into a conversation — describe what you want to train, and the agent handles everything from server setup to troubleshooting.
+
 ## Supported Hardware
 
 | Hardware | Notes |
@@ -200,12 +272,12 @@ for data in dataloader:
 ## Next Steps
 
 {{< cards >}}
-  {{< card url="../guide/architecture" title="Architecture" icon="cpu-chip" subtitle="Understand the client-server architecture" >}}
   {{< card url="../guide/components" title="Components" icon="puzzle-piece" subtitle="Explore Dataset, Model, Sampler, and more" >}}
   {{< card url="../guide/runtime-modes" title="Runtime Modes" icon="server-stack" subtitle="torchrun, Ray, and HTTP deployment" >}}
-  {{< card url="../guide/server-client" title="Server & Client" icon="arrows-right-left" subtitle="HTTP training service architecture" >}}
   {{< card url="../guide/multi-tenancy" title="Multi-Tenancy" icon="user-group" subtitle="Train multiple LoRAs on shared base model" >}}
-  {{< card url="../guide/npu-support" title="NPU Support" icon="chip" subtitle="Ascend NPU training guide" >}}
+  {{< card url="../guide/server-client" title="Server & Client" icon="arrows-right-left" subtitle="HTTP training service architecture" >}}
   {{< card url="../guide/taas" title="Training as a Service" icon="cloud" subtitle="Deploy enterprise-grade training services" >}}
   {{< card url="../guide/cookbook" title="Cookbook" icon="book-open" subtitle="FSDP, MoE, RL training examples" >}}
+  {{< card url="../guide/npu-support" title="NPU Support" icon="chip" subtitle="Ascend NPU training guide" >}}
+  {{< card url="../guide/architecture" title="Architecture" icon="cpu-chip" subtitle="Understand the client-server architecture" >}}
 {{< /cards >}}
diff --git a/content/docs/getting-started.zh.md b/content/docs/getting-started.zh.md
index 7fa1887..9b82aca 100644
--- a/content/docs/getting-started.zh.md
+++ b/content/docs/getting-started.zh.md
@@ -187,6 +187,78 @@ for data in dataloader:
     break
 ```
 
+## 部署后：OpenAI 兼容 API
+
+使用 Twinkle Server 部署模型后，即可获得开箱即用的 **OpenAI 兼容 API**。任何 OpenAI SDK 或工具都可以直接调用你的模型进行推理：
+
+```bash
+# 启动 Server
+twinkle-server launch -c server_config.yaml
+```
+
+> `server_config.yaml` 的编写方式详见 [服务端与客户端指南](../guide/server-client/)。
+
+```python
+from openai import OpenAI
+
+client = OpenAI(
+    base_url='http://localhost:8000/api/v1',
+    api_key='your-token',
+)
+
+response = client.chat.completions.create(
+    model='Qwen/Qwen3.5-4B',
+    messages=[{'role': 'user', 'content': '你好！'}],
+    temperature=0.7,
+    stream=True,
+)
+for chunk in response:
+    print(chunk.choices[0].delta.content or '', end='')
+```
+
+支持的端点：
+
+| 端点 | 方法 | 说明 |
+|------|------|------|
+| `/chat/completions` | POST | 聊天补全（支持流式与非流式） |
+| `/models` | GET | 列出可用模型 |
+
+特性包括：完整流式响应支持（SSE）、粘性会话路由实现 adapter 隔离、自动聊天模板初始化、Adapter 到基座模型的自动解析。
+
+## Auto Research：用自然语言驱动训练
+
+Auto Research 是 Twinkle 内置的 LLM Agent 终端，通过自然语言对话自主完成训练全流程——从集群部署、脚本生成、启动训练到异常诊断和自动修复，无需手动编写任何 shell 命令：
+
+```bash
+# 安装客户端
+pip install twinkle-client
+
+# 使用本地 LLM 启动 Auto Research
+twinkle-tui --llm-base-url http://localhost:11434/v1 --llm-model qwen3.5
+
+# 或使用远程 API
+twinkle-tui --llm-base-url https://api.example.com/v1 --llm-api-key sk-xxx --llm-model gpt-4o
+```
+
+**你可以这样对话：**
+
+- *"用 Qwen3.5-4B 在 gsm8k 上启动一个 GRPO 训练"* — 自动生成脚本并启动训练
+- *"训练进展如何？"* — 实时指标和状态监控
+- *"显示 reward 指标，放大到 step 100-200"* — 交互式图表可视化
+- *"在 ModelScope 上搜索数学数据集"* — 模型和数据集发现
+
+**核心能力：**
+
+| 能力 | 说明 |
+|------|------|
+| 训练生命周期 | 启动、暂停、恢复、停止，自动保存 checkpoint |
+| Server 管理 | 自动 GPU 分区、Ray 集群搭建、健康检查 |
+| 自动修复 | 检测崩溃、诊断错误、改写脚本并重启（最多 3 次尝试） |
+| 实时监控 | ASCII 指标图表、日志流、每 30 秒健康检查 |
+| Skills 系统 | 可扩展的插件架构（内置 + 本地 + 社区） |
+
+Auto Research 将 ML 训练变成一场对话——描述你想训练什么，Agent 自动处理从服务器部署到排障的全部工作。
+
 ## 支持的硬件
 
 | 硬件 | 备注 |
@@ -200,12 +272,12 @@ for data in dataloader:
 ## 下一步
 
 {{< cards >}}
-  {{< card url="../guide/architecture" title="架构" icon="cpu-chip" subtitle="理解客户端-服务端架构" >}}
   {{< card url="../guide/components" title="组件" icon="puzzle-piece" subtitle="探索 Dataset、Model、Sampler 等" >}}
   {{< card url="../guide/runtime-modes" title="运行模式" icon="server-stack" subtitle="torchrun、Ray 和 HTTP 部署" >}}
-  {{< card url="../guide/server-client" title="服务端与客户端" icon="arrows-right-left" subtitle="HTTP 训练服务架构" >}}
   {{< card url="../guide/multi-tenancy" title="多租户" icon="user-group" subtitle="在共享基座模型上训练多个 LoRA" >}}
-  {{< card url="../guide/npu-support" title="NPU 支持" icon="chip" subtitle="昇腾 NPU 训练指南" >}}
+  {{< card url="../guide/server-client" title="服务端与客户端" icon="arrows-right-left" subtitle="HTTP 训练服务架构" >}}
   {{< card url="../guide/taas" title="训练即服务" icon="cloud" subtitle="部署企业级训练服务" >}}
   {{< card url="../guide/cookbook" title="Cookbook" icon="book-open" subtitle="FSDP、MoE、RL 训练示例" >}}
+  {{< card url="../guide/npu-support" title="NPU 支持" icon="chip" subtitle="昇腾 NPU 训练指南" >}}
+  {{< card url="../guide/architecture" title="架构" icon="cpu-chip" subtitle="理解客户端-服务端架构" >}}
 {{< /cards >}}
diff --git a/content/docs/guide/_index.md b/content/docs/guide/_index.md
index f54d00a..d5ee5e0 100644
--- a/content/docs/guide/_index.md
+++ b/content/docs/guide/_index.md
@@ -1,11 +1,11 @@
 ---
-title: User Guide
+title: Concepts & Architecture
 weight: 2
 sidebar:
   open: true
 ---
 
-Comprehensive guides for configuring, deploying, and extending Twinkle.
+Comprehensive guides for understanding Twinkle's architecture, runtime modes, and deployment.
 
 {{< cards >}}
   {{< card url="architecture" title="Architecture" icon="cpu-chip" >}}
diff --git a/content/docs/guide/_index.zh.md b/content/docs/guide/_index.zh.md
index c23f5b6..6b4c102 100644
--- a/content/docs/guide/_index.zh.md
+++ b/content/docs/guide/_index.zh.md
@@ -1,11 +1,11 @@
 ---
-title: 用户指南
+title: 概念与架构
 weight: 2
 sidebar:
   open: true
 ---
 
-Twinkle 配置、部署和扩展的综合指南。
+Twinkle 架构、运行模式与部署的综合指南。
 
 {{< cards >}}
   {{< card url="architecture" title="架构" icon="cpu-chip" >}}
diff --git a/content/docs/guide/npu-support.md b/content/docs/guide/npu-support.md
index 8de1c4d..0895339 100644
--- a/content/docs/guide/npu-support.md
+++ b/content/docs/guide/npu-support.md
@@ -89,7 +89,7 @@ python cookbook/transformers/fsdp2.py
 
 ### GRPO Reinforcement Learning (8-card)
 
-**Example**: [cookbook/rl/grpo.py](https://github.com/modelscope/twinkle/blob/main/cookbook/rl/grpo.py)
+**Example**: [cookbook/rl/grpo/grpo.py](https://github.com/modelscope/twinkle/blob/main/cookbook/rl/grpo/grpo.py)
 
 ```bash
 export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
diff --git a/content/docs/guide/npu-support.zh.md b/content/docs/guide/npu-support.zh.md
index 2b38b7d..520d209 100644
--- a/content/docs/guide/npu-support.zh.md
+++ b/content/docs/guide/npu-support.zh.md
@@ -89,7 +89,7 @@ python cookbook/transformers/fsdp2.py
 
 ### GRPO 强化学习（8 卡）
 
-**示例**：[cookbook/rl/grpo.py](https://github.com/modelscope/twinkle/blob/main/cookbook/rl/grpo.py)
+**示例**：[cookbook/rl/grpo/grpo.py](https://github.com/modelscope/twinkle/blob/main/cookbook/rl/grpo/grpo.py)
 
 ```bash
 export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
diff --git a/content/docs/guide/taas.md b/content/docs/guide/taas.md
index 2e76c89..b8b3260 100644
--- a/content/docs/guide/taas.md
+++ b/content/docs/guide/taas.md
@@ -88,28 +88,71 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 ray start --head --port=6379 --num-gpus=4
 CUDA_VISIBLE_DEVICES=4,5,6,7 ray start --address=head:6379 --num-gpus=4
 ```
 
-### 2. Start Training Server
-
-```python
-# server.py
-import twinkle
-from twinkle import DeviceGroup
-
-device_groups = [
-    DeviceGroup(name='model', ranks=4, device_type='cuda'),
-    DeviceGroup(name='sampler', ranks=4, device_type='cuda'),
-]
+### 2. Write server_config.yaml
+
+```yaml
+http_options:
+  host: 0.0.0.0
+  port: 8000
+
+applications:
+  - name: server
+    route_prefix: /api/v1
+    import_path: server
+    args:
+      supported_models:
+        - Qwen/Qwen3.5-4B
+    deployments:
+      - name: TinkerCompatServer
+        ray_actor_options:
+          num_cpus: 0.1
+
+  - name: models-Qwen3.5-4B
+    route_prefix: /api/v1/model/Qwen/Qwen3.5-4B
+    import_path: model
+    args:
+      backend: transformers
+      model_id: "ms://Qwen/Qwen3.5-4B"
+      nproc_per_node: 1
+      device_group:
+        name: model
+        ranks: 1
+        device_type: cuda
+      device_mesh:
+        device_type: cuda
+        dp_size: 1
+    deployments:
+      - name: ModelManagement
+        ray_actor_options:
+          num_cpus: 0.1
+
+  - name: processor
+    route_prefix: /api/v1/processor
+    import_path: processor
+    args:
+      ncpu_proc_per_node: 2
+      device_group:
+        name: model
+        ranks: 2
+        device_type: CPU
+      device_mesh:
+        device_type: CPU
+        dp_size: 2
+    deployments:
+      - name: ProcessorManagement
+        ray_actor_options:
+          num_cpus: 0.1
+```
 
-twinkle.initialize(mode='http', groups=device_groups)
+> For the full configuration reference, see the [Server & Client Guide](/docs/guide/server-client/).
 
-# Server starts on http://0.0.0.0:8000
-```
+### 3. Launch the Server
 
 ```bash
-python server.py
+twinkle-server launch -c server_config.yaml
 ```
 
-### 3. Connect Clients
+### 4. Connect Clients
 
 ```python
 from twinkle_client import init_twinkle_client
@@ -153,7 +196,7 @@ client = init_twinkle_client(
 
 ### Management
 
-- `GET /health` - Service health check
+- `GET /healthz` - Service health check
 - `GET /metrics` - Training metrics
 
 ## Monitoring
diff --git a/content/docs/guide/taas.zh.md b/content/docs/guide/taas.zh.md
index 5239ee0..eba32ae 100644
--- a/content/docs/guide/taas.zh.md
+++ b/content/docs/guide/taas.zh.md
@@ -88,28 +88,71 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 ray start --head --port=6379 --num-gpus=4
 CUDA_VISIBLE_DEVICES=4,5,6,7 ray start --address=head:6379 --num-gpus=4
 ```
 
-### 2. 启动训练服务端
-
-```python
-# server.py
-import twinkle
-from twinkle import DeviceGroup
-
-device_groups = [
-    DeviceGroup(name='model', ranks=4, device_type='cuda'),
-    DeviceGroup(name='sampler', ranks=4, device_type='cuda'),
-]
+### 2. 编写 server_config.yaml
+
+```yaml
+http_options:
+  host: 0.0.0.0
+  port: 8000
+
+applications:
+  - name: server
+    route_prefix: /api/v1
+    import_path: server
+    args:
+      supported_models:
+        - Qwen/Qwen3.5-4B
+    deployments:
+      - name: TinkerCompatServer
+        ray_actor_options:
+          num_cpus: 0.1
+
+  - name: models-Qwen3.5-4B
+    route_prefix: /api/v1/model/Qwen/Qwen3.5-4B
+    import_path: model
+    args:
+      backend: transformers
+      model_id: "ms://Qwen/Qwen3.5-4B"
+      nproc_per_node: 1
+      device_group:
+        name: model
+        ranks: 1
+        device_type: cuda
+      device_mesh:
+        device_type: cuda
+        dp_size: 1
+    deployments:
+      - name: ModelManagement
+        ray_actor_options:
+          num_cpus: 0.1
+
+  - name: processor
+    route_prefix: /api/v1/processor
+    import_path: processor
+    args:
+      ncpu_proc_per_node: 2
+      device_group:
+        name: model
+        ranks: 2
+        device_type: CPU
+      device_mesh:
+        device_type: CPU
+        dp_size: 2
+    deployments:
+      - name: ProcessorManagement
+        ray_actor_options:
+          num_cpus: 0.1
+```
 
-twinkle.initialize(mode='http', groups=device_groups)
+> 完整配置参考请见 [服务端与客户端指南](/zh/docs/guide/server-client/)。
 
-# 服务启动于 http://0.0.0.0:8000
-```
+### 3. 启动服务端
 
 ```bash
-python server.py
+twinkle-server launch -c server_config.yaml
 ```
 
-### 3. 连接客户端
+### 4. 连接客户端
 
 ```python
 from twinkle_client import init_twinkle_client
@@ -153,7 +196,7 @@ client = init_twinkle_client(
 
 ### 管理
 
-- `GET /health` - 服务健康检查
+- `GET /healthz` - 服务健康检查
 - `GET /metrics` - 训练指标
 
 ## 监控
diff --git a/content/showcase/_index.md b/content/showcase/_index.md
index 0f3f1a1..ad2dbf7 100644
--- a/content/showcase/_index.md
+++ b/content/showcase/_index.md
@@ -1,19 +1,20 @@
 ---
-title: Showcase
-description: "See Twinkle in action — Ray distributed training, multi-tenancy, and Training-as-a-Service."
+title: Cookbook
+linkTitle: Cookbook
+description: "Ready-to-run training recipes — from SFT to GRPO, Megatron to multi-turn RL."
 type: landing
 
 sections:
   - block: hero
     content:
       title: Twinkle in Action
-      text: 'See how Twinkle powers LLM training — from single GPU to multi-node Ray clusters with multi-tenant support.'
+      text: 'See how Twinkle powers LLM training — from single GPU to multi-node Ray clusters with multi-tenancy.'
       primary_action:
         icon: brands/github
         text: View on GitHub
         url: "https://github.com/modelscope/twinkle"
       secondary_action:
-        text: Read the Documentation
+        text: Read the Docs
         url: ../docs/
     design:
       no_padding: true
@@ -22,6 +23,7 @@ sections:
         margin: [0, 0, 0, 0]
   - block: collection
     content:
+      count: 0
       filters:
         folders:
           - showcase
diff --git a/content/showcase/_index.zh.md b/content/showcase/_index.zh.md
index bb32ce2..7420ebd 100644
--- a/content/showcase/_index.zh.md
+++ b/content/showcase/_index.zh.md
@@ -1,6 +1,7 @@
 ---
-title: 案例展示
-description: "查看 Twinkle 实际效果 — Ray 分布式训练、多租户和训练即服务。"
+title: Cookbook
+linkTitle: Cookbook
+description: "开箱即用的训练示例 — 从 SFT 到 GRPO，从 Megatron 到多轮 RL。"
 type: landing
 
 sections:
@@ -22,6 +23,7 @@ sections:
         margin: [0, 0, 0, 0]
   - block: collection
     content:
+      count: 0
       filters:
         folders:
           - showcase
diff --git a/content/showcase/dpo/index.md b/content/showcase/dpo/index.md
new file mode 100644
index 0000000..0c22871
--- /dev/null
+++ b/content/showcase/dpo/index.md
@@ -0,0 +1,53 @@
+---
+title: DPO (Preference Optimization)
+linkTitle: DPO
+weight: 80
+---
+
+Direct Preference Optimization — align models using human preference data without explicit reward modeling. Supports sigmoid/hinge/IPO/SimPO/ORPO/CPO variants.
+
+[View full source →](https://github.com/modelscope/twinkle/blob/main/cookbook/rl/dpo/dpo_full.py)
+
+```python
+import twinkle
+from twinkle import DeviceGroup, DeviceMesh
+from twinkle.dataloader import DataLoader
+from twinkle.dataset import Dataset, DatasetMeta
+from twinkle.loss import DPOLoss
+from twinkle.metric import DPOMetric
+from twinkle.model import TransformersModel
+from twinkle.processor import InputProcessor
+
+MODEL_ID = 'ms://Qwen/Qwen3-4B'
+
+device_groups = [
+    DeviceGroup(name='policy', ranks=list(range(4)), device_type='GPU'),
+    DeviceGroup(name='reference', ranks=list(range(4, 8)), device_type='GPU'),
+]
+twinkle.initialize(mode='ray', nproc_per_node=8, groups=device_groups)
+
+policy_model = TransformersModel(model_id=MODEL_ID, remote_group='policy')
+policy_model.set_loss(DPOLoss(beta=0.1, loss_type='sigmoid'))
+policy_model.add_metric(DPOMetric, beta=0.1)
+
+ref_model = TransformersModel(model_id=MODEL_ID, remote_group='reference')
+
+dataset = Dataset(dataset_meta=DatasetMeta('ms://hjh0119/shareAI-Llama3-DPO-zh-en-emoji'))
+dataset.set_template('Template', model_id=MODEL_ID)
+dataset.encode()
+dataloader = DataLoader(dataset=dataset, batch_size=8)
+
+def prepare_dpo_batch(batch):
+    """Interleave positive/negative pairs: [pos, neg, pos, neg, ...]"""
+    result = []
+    for row in batch:
+        result.append({**row, **row['positive']})
+        result.append({**row, **row['negative']})
+    return result
+
+for batch in dataloader:
+    dpo_batch = prepare_dpo_batch(batch)
+    ref_outputs = ref_model.forward_only(inputs=dpo_batch)
+    policy_model.forward_backward(inputs=dpo_batch, ref_outputs=ref_outputs)
+    policy_model.clip_grad_and_step()
+```
diff --git a/content/showcase/dpo/index.zh.md b/content/showcase/dpo/index.zh.md
new file mode 100644
index 0000000..48e6282
--- /dev/null
+++ b/content/showcase/dpo/index.zh.md
@@ -0,0 +1,53 @@
+---
+title: DPO (偏好优化)
+linkTitle: DPO
+weight: 80
+---
+
+Direct Preference Optimization — 使用人类偏好数据对齐模型，无需显式 reward 建模。支持 sigmoid/hinge/IPO/SimPO/ORPO/CPO 变体。
+
+[查看完整源码 →](https://github.com/modelscope/twinkle/blob/main/cookbook/rl/dpo/dpo_full.py)
+
+```python
+import twinkle
+from twinkle import DeviceGroup, DeviceMesh
+from twinkle.dataloader import DataLoader
+from twinkle.dataset import Dataset, DatasetMeta
+from twinkle.loss import DPOLoss
+from twinkle.metric import DPOMetric
+from twinkle.model import TransformersModel
+from twinkle.processor import InputProcessor
+
+MODEL_ID = 'ms://Qwen/Qwen3-4B'
+
+device_groups = [
+    DeviceGroup(name='policy', ranks=list(range(4)), device_type='GPU'),
+    DeviceGroup(name='reference', ranks=list(range(4, 8)), device_type='GPU'),
+]
+twinkle.initialize(mode='ray', nproc_per_node=8, groups=device_groups)
+
+policy_model = TransformersModel(model_id=MODEL_ID, remote_group='policy')
+policy_model.set_loss(DPOLoss(beta=0.1, loss_type='sigmoid'))
+policy_model.add_metric(DPOMetric, beta=0.1)
+
+ref_model = TransformersModel(model_id=MODEL_ID, remote_group='reference')
+
+dataset = Dataset(dataset_meta=DatasetMeta('ms://hjh0119/shareAI-Llama3-DPO-zh-en-emoji'))
+dataset.set_template('Template', model_id=MODEL_ID)
+dataset.encode()
+dataloader = DataLoader(dataset=dataset, batch_size=8)
+
+def prepare_dpo_batch(batch):
+    """将正/负样本交错排列：[pos, neg, pos, neg, ...]"""
+    result = []
+    for row in batch:
+        result.append({**row, **row['positive']})
+        result.append({**row, **row['negative']})
+    return result
+
+for batch in dataloader:
+    dpo_batch = prepare_dpo_batch(batch)
+    ref_outputs = ref_model.forward_only(inputs=dpo_batch)
+    policy_model.forward_backward(inputs=dpo_batch, ref_outputs=ref_outputs)
+    policy_model.clip_grad_and_step()
+```
diff --git a/content/showcase/embedding/index.md b/content/showcase/embedding/index.md
new file mode 100644
index 0000000..069fc46
--- /dev/null
+++ b/content/showcase/embedding/index.md
@@ -0,0 +1,39 @@
+---
+title: Embedding Training
+linkTitle: Embedding
+weight: 60
+---
+
+Train embedding models with InfoNCE contrastive loss. Supports both full-parameter and LoRA fine-tuning.
+
+[View full source →](https://github.com/modelscope/twinkle/blob/main/cookbook/exp/embedding/train_embedding_full_ddp.py)
+
+```python
+import twinkle
+from twinkle import DeviceMesh
+from twinkle.dataloader import DataLoader
+from twinkle.dataset import Dataset, DatasetMeta
+from twinkle.loss import InfonceLoss
+from twinkle.metric import EmbeddingMetric
+from twinkle.model import TransformersModel
+from twinkle.processor import InputProcessor
+
+device_mesh = DeviceMesh.from_sizes(fsdp_size=4, dp_size=4)
+twinkle.initialize(mode='ray', global_device_mesh=device_mesh)
+
+dataset = Dataset(dataset_meta=DatasetMeta('ms://your-embedding-dataset'))
+dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B')
+dataset.encode()
+dataloader = DataLoader(dataset=dataset, batch_size=32)
+
+model = TransformersModel(model_id='ms://Qwen/Qwen3.5-4B')
+model.set_processor(InputProcessor)
+model.set_loss(InfonceLoss, temperature=0.07, use_batch=True)
+model.set_optimizer(optimizer_cls='AdamW', lr=1e-5)
+model.add_metric(EmbeddingMetric, is_training=True)
+
+for batch in dataloader:
+    model.forward_backward(inputs=batch, task='embedding')
+    model.clip_grad_and_step()
+model.save('last-checkpoint', output_dir='./output/embedding')
+```
diff --git a/content/showcase/embedding/index.zh.md b/content/showcase/embedding/index.zh.md
new file mode 100644
index 0000000..beab8a3
--- /dev/null
+++ b/content/showcase/embedding/index.zh.md
@@ -0,0 +1,39 @@
+---
+title: Embedding 训练
+linkTitle: Embedding
+weight: 60
+---
+
+使用 InfoNCE 对比损失训练 Embedding 模型。支持全参数或 LoRA 微调。
+
+[查看完整源码 →](https://github.com/modelscope/twinkle/blob/main/cookbook/exp/embedding/train_embedding_full_ddp.py)
+
+```python
+import twinkle
+from twinkle import DeviceMesh
+from twinkle.dataloader import DataLoader
+from twinkle.dataset import Dataset, DatasetMeta
+from twinkle.loss import InfonceLoss
+from twinkle.metric import EmbeddingMetric
+from twinkle.model import TransformersModel
+from twinkle.processor import InputProcessor
+
+device_mesh = DeviceMesh.from_sizes(fsdp_size=4, dp_size=4)
+twinkle.initialize(mode='ray', global_device_mesh=device_mesh)
+
+dataset = Dataset(dataset_meta=DatasetMeta('ms://your-embedding-dataset'))
+dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B')
+dataset.encode()
+dataloader = DataLoader(dataset=dataset, batch_size=32)
+
+model = TransformersModel(model_id='ms://Qwen/Qwen3.5-4B')
+model.set_processor(InputProcessor)
+model.set_loss(InfonceLoss, temperature=0.07, use_batch=True)
+model.set_optimizer(optimizer_cls='AdamW', lr=1e-5)
+model.add_metric(EmbeddingMetric, is_training=True)
+
+for batch in dataloader:
+    model.forward_backward(inputs=batch, task='embedding')
+    model.clip_grad_and_step()
+model.save('last-checkpoint', output_dir='./output/embedding')
+```
diff --git a/content/showcase/ep-moe/index.md b/content/showcase/ep-moe/index.md
new file mode 100644
index 0000000..f001738
--- /dev/null
+++ b/content/showcase/ep-moe/index.md
@@ -0,0 +1,28 @@
+---
+title: EP + MoE (DeepSeek V4 / Qwen3.5 MoE)
+linkTitle: EP + MoE
+weight: 40
+---
+
+Expert-parallel + FSDP2 for Mixture-of-Experts models like DeepSeek V4 and Qwen3.5 MoE.
+
+[View full source →](https://github.com/modelscope/twinkle/blob/main/cookbook/transformers/ep_fsdp2_lora_deepseek_v4.py)
+
+```python
+import twinkle
+from twinkle import DeviceMesh, Platform, get_logger
+from twinkle.cli import CLI
+from twinkle.model import TransformersModel
+
+args = CLI.from_args()
+device_mesh = DeviceMesh.from_sizes(
+    fsdp_size=args.infra.fsdp_size,
+    dp_size=args.infra.dp_size,
+    ep_size=args.infra.ep_size,  # Expert Parallelism
+    device_type=Platform.get_platform().device_prefix(),
+)
+twinkle.initialize(mode=args.infra.mode, global_device_mesh=device_mesh)
+
+model = TransformersModel(model_id='ms://deepseek-ai/DeepSeek-V4')
+# ... standard training loop (same as SFT recipe)
+```
diff --git a/content/showcase/ep-moe/index.zh.md b/content/showcase/ep-moe/index.zh.md
new file mode 100644
index 0000000..544eaf2
--- /dev/null
+++ b/content/showcase/ep-moe/index.zh.md
@@ -0,0 +1,28 @@
+---
+title: EP + MoE (DeepSeek V4 / Qwen3.5 MoE)
+linkTitle: EP + MoE
+weight: 40
+---
+
+专家并行 + FSDP2，适用于 DeepSeek V4、Qwen3.5 MoE 等 MoE 模型。
+
+[查看完整源码 →](https://github.com/modelscope/twinkle/blob/main/cookbook/transformers/ep_fsdp2_lora_deepseek_v4.py)
+
+```python
+import twinkle
+from twinkle import DeviceMesh, Platform, get_logger
+from twinkle.cli import CLI
+from twinkle.model import TransformersModel
+
+args = CLI.from_args()
+device_mesh = DeviceMesh.from_sizes(
+    fsdp_size=args.infra.fsdp_size,
+    dp_size=args.infra.dp_size,
+    ep_size=args.infra.ep_size,  # Expert Parallelism
+    device_type=Platform.get_platform().device_prefix(),
+)
+twinkle.initialize(mode=args.infra.mode, global_device_mesh=device_mesh)
+
+model = TransformersModel(model_id='ms://deepseek-ai/DeepSeek-V4')
+# ... 标准训练循环（同 SFT 示例）
+```
diff --git a/content/showcase/gkd-distill/index.md b/content/showcase/gkd-distill/index.md
new file mode 100644
index 0000000..e77e94c
--- /dev/null
+++ b/content/showcase/gkd-distill/index.md
@@ -0,0 +1,56 @@
+---
+title: On-Policy Distillation (GKD)
+linkTitle: GKD Distill
+weight: 60
+---
+
+Generalized Knowledge Distillation: the student generates completions on-policy, the teacher provides top-k logprobs for them, and the student learns to match the teacher's distribution.
+
+[View full source →](https://github.com/modelscope/twinkle/blob/main/cookbook/rl/gkd/gkd_on_policy.py)
+
+```python
+import twinkle
+from twinkle import DeviceMesh, DeviceGroup
+from twinkle.checkpoint_engine import CheckpointEngineManager
+from twinkle.data_format import SamplingParams
+from twinkle.dataloader import DataLoader
+from twinkle.dataset import DatasetMeta, LazyDataset
+from twinkle.loss import GKDLoss
+from twinkle.model import TransformersModel
+from twinkle.sampler import vLLMSampler
+
+device_groups = [
+    DeviceGroup(name='student_model', ranks=4, device_type='cuda'),
+    DeviceGroup(name='student_sampler', ranks=2, device_type='cuda'),
+    DeviceGroup(name='teacher_sampler', ranks=2, device_type='cuda'),
+]
+twinkle.initialize(mode='ray', nproc_per_node=8, groups=device_groups)
+
+student_model = TransformersModel(model_id='ms://Qwen/Qwen3.5-4B', remote_group='student_model')
+student_model.set_loss(GKDLoss(beta=0.5, temperature=1.0))
+
+student_sampler = vLLMSampler(model_id='ms://Qwen/Qwen3.5-4B', remote_group='student_sampler')
+teacher_sampler = vLLMSampler(model_id='ms://Qwen/Qwen3.5-9B', remote_group='teacher_sampler')
+
+ckpt_manager = CheckpointEngineManager(model=student_model, sampler=student_sampler)
+
+dataset = LazyDataset(DatasetMeta('ms://AI-ModelScope/OlympiadBench'))
+dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B')
+dataloader = DataLoader(dataset=dataset, batch_size=4)
+
+for batch in dataloader:
+    ckpt_manager.sync_weights(merge_and_sync=False)
+    # Student generates on-policy completions
+    student_output = student_sampler.sample(batch, SamplingParams(max_tokens=2048))
+    input_data = [seq.new_input_feature for r in student_output for seq in r.sequences]
+    # Teacher scores the student's completions (top-k logprobs)
+    teacher_output = teacher_sampler.sample(
+        input_data, SamplingParams(max_tokens=0, prompt_logprobs=64))
+    # convert_topk_prompt_logprobs: utility defined in the full source
+    # converts vLLM topk_prompt_logprobs → {teacher_topk_logprobs, teacher_topk_indices} tensors
+    teacher_logprobs = convert_topk_prompt_logprobs(
+        [resp.topk_prompt_logprobs for resp in teacher_output])
+    # GKD backward
+    student_model.forward_backward(inputs=input_data, **teacher_logprobs)
+    student_model.clip_grad_and_step()
+```
diff --git a/content/showcase/gkd-distill/index.zh.md b/content/showcase/gkd-distill/index.zh.md
new file mode 100644
index 0000000..92e1e3e
--- /dev/null
+++ b/content/showcase/gkd-distill/index.zh.md
@@ -0,0 +1,56 @@
+---
+title: On-Policy 蒸馏 (GKD)
+linkTitle: GKD 蒸馏
+weight: 60
+---
+
+广义知识蒸馏：学生 on-policy 生成，教师提供 top-k logprobs，学生学习匹配教师分布。
+
+[查看完整源码 →](https://github.com/modelscope/twinkle/blob/main/cookbook/rl/gkd/gkd_on_policy.py)
+
+```python
+import twinkle
+from twinkle import DeviceMesh, DeviceGroup
+from twinkle.checkpoint_engine import CheckpointEngineManager
+from twinkle.data_format import SamplingParams
+from twinkle.dataloader import DataLoader
+from twinkle.dataset import DatasetMeta, LazyDataset
+from twinkle.loss import GKDLoss
+from twinkle.model import TransformersModel
+from twinkle.sampler import vLLMSampler
+
+device_groups = [
+    DeviceGroup(name='student_model', ranks=4, device_type='cuda'),
+    DeviceGroup(name='student_sampler', ranks=2, device_type='cuda'),
+    DeviceGroup(name='teacher_sampler', ranks=2, device_type='cuda'),
+]
+twinkle.initialize(mode='ray', nproc_per_node=8, groups=device_groups)
+
+student_model = TransformersModel(model_id='ms://Qwen/Qwen3.5-4B', remote_group='student_model')
+student_model.set_loss(GKDLoss(beta=0.5, temperature=1.0))
+
+student_sampler = vLLMSampler(model_id='ms://Qwen/Qwen3.5-4B', remote_group='student_sampler')
+teacher_sampler = vLLMSampler(model_id='ms://Qwen/Qwen3.5-9B', remote_group='teacher_sampler')
+
+ckpt_manager = CheckpointEngineManager(model=student_model, sampler=student_sampler)
+
+dataset = LazyDataset(DatasetMeta('ms://AI-ModelScope/OlympiadBench'))
+dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B')
+dataloader = DataLoader(dataset=dataset, batch_size=4)
+
+for batch in dataloader:
+    ckpt_manager.sync_weights(merge_and_sync=False)
+    # 学生 on-policy 生成
+    student_output = student_sampler.sample(batch, SamplingParams(max_tokens=2048))
+    input_data = [seq.new_input_feature for r in student_output for seq in r.sequences]
+    # 教师对学生生成内容打分（top-k logprobs）
+    teacher_output = teacher_sampler.sample(
+        input_data, SamplingParams(max_tokens=0, prompt_logprobs=64))
+    # convert_topk_prompt_logprobs: 完整源码中定义的工具函数
+    # 将 vLLM topk_prompt_logprobs 转换为 {teacher_topk_logprobs, teacher_topk_indices} 张量
+    teacher_logprobs = convert_topk_prompt_logprobs(
+        [resp.topk_prompt_logprobs for resp in teacher_output])
+    # GKD 反向传播
+    student_model.forward_backward(inputs=input_data, **teacher_logprobs)
+    student_model.clip_grad_and_step()
+```
diff --git a/content/showcase/grpo/index.md b/content/showcase/grpo/index.md
new file mode 100644
index 0000000..7909df6
--- /dev/null
+++ b/content/showcase/grpo/index.md
@@ -0,0 +1,64 @@
+---
+title: GRPO (Reinforcement Learning)
+linkTitle: GRPO
+weight: 50
+---
+
+Group Relative Policy Optimization with vLLM sampling and custom reward functions (e.g. GSM8K math).
+
+[View full source →](https://github.com/modelscope/twinkle/blob/main/cookbook/rl/grpo/grpo.py)
+
+```python
+import twinkle
+from twinkle import DeviceMesh, DeviceGroup, get_logger
+from twinkle.advantage import GRPOAdvantage
+from twinkle.checkpoint_engine import CheckpointEngineManager
+from twinkle.cli import CLI
+from twinkle.data_format import SamplingParams
+from twinkle.dataloader import DataLoader
+from twinkle.dataset import Dataset, DatasetMeta
+from twinkle.model import TransformersModel
+from twinkle.processor import InputProcessor
+from twinkle.reward import GSM8KAccuracyReward, GSM8KFormatReward
+from twinkle.sampler import vLLMSampler
+
+args = CLI.from_args()
+MODEL_GPUS, SAMPLER_GPUS = 4, 4
+
+device_groups = [
+    DeviceGroup(name='model', ranks=list(range(MODEL_GPUS)), device_type='GPU'),
+    DeviceGroup(name='sampler', ranks=list(range(MODEL_GPUS, MODEL_GPUS + SAMPLER_GPUS)), device_type='GPU'),
+]
+twinkle.initialize(mode='ray', nproc_per_node=MODEL_GPUS + SAMPLER_GPUS, groups=device_groups)
+
+model = TransformersModel(model_id='ms://Qwen/Qwen3.5-4B',
+                          device_mesh=DeviceMesh.from_sizes(world_size=MODEL_GPUS, dp_size=MODEL_GPUS),
+                          remote_group='model')
+model.set_loss('GRPOLoss', epsilon=0.2)
+
+sampler = vLLMSampler(model_id='ms://Qwen/Qwen3.5-4B',
+                      engine_args={'gpu_memory_utilization': 0.8, 'max_model_len': 4496},
+                      device_mesh=DeviceMesh.from_sizes(world_size=SAMPLER_GPUS, dp_size=SAMPLER_GPUS),
+                      remote_group='sampler')
+
+ckpt_manager = CheckpointEngineManager(model=model, sampler=sampler)
+advantage_fn = GRPOAdvantage()
+
+dataset = Dataset(dataset_meta=DatasetMeta('ms://modelscope/gsm8k'))
+dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B')
+dataset.encode(add_generation_prompt=True)
+dataloader = DataLoader(dataset=dataset, batch_size=8)
+
+for batch in dataloader:
+    ckpt_manager.sync_weights(merge_and_sync=False)
+    expand_prompts = [p for prompt in batch for p in [prompt] * 8]  # group sampling
+    responses = sampler.sample(expand_prompts, SamplingParams(max_tokens=4096, logprobs=1))
+    # Extract trajectories, old log-probs, and compute rewards from responses
+    inputs = [seq.new_input_feature for r in responses for seq in r.sequences]
+    old_logps = [[lp[0][1] for lp in seq.logprobs] for r in responses for seq in r.sequences]
+    rewards = [a + f for a, f in zip(
+        GSM8KAccuracyReward()(inputs), GSM8KFormatReward()(inputs))]
+    advantages = advantage_fn(rewards, num_generations=8, scale='group')
+    model.forward_backward(inputs=inputs, old_logps=old_logps, advantages=advantages)
+    model.clip_grad_and_step()
+```
diff --git a/content/showcase/grpo/index.zh.md b/content/showcase/grpo/index.zh.md
new file mode 100644
index 0000000..0a95a3d
--- /dev/null
+++ b/content/showcase/grpo/index.zh.md
@@ -0,0 +1,64 @@
+---
+title: GRPO (强化学习)
+linkTitle: GRPO
+weight: 50
+---
+
+Group Relative Policy Optimization — 使用 vLLM 采样 + 自定义奖励函数（如 GSM8K 数学）。
+
+[查看完整源码 →](https://github.com/modelscope/twinkle/blob/main/cookbook/rl/grpo/grpo.py)
+
+```python
+import twinkle
+from twinkle import DeviceMesh, DeviceGroup, get_logger
+from twinkle.advantage import GRPOAdvantage
+from twinkle.checkpoint_engine import CheckpointEngineManager
+from twinkle.cli import CLI
+from twinkle.data_format import SamplingParams
+from twinkle.dataloader import DataLoader
+from twinkle.dataset import Dataset, DatasetMeta
+from twinkle.model import TransformersModel
+from twinkle.processor import InputProcessor
+from twinkle.reward import GSM8KAccuracyReward, GSM8KFormatReward
+from twinkle.sampler import vLLMSampler
+
+args = CLI.from_args()
+MODEL_GPUS, SAMPLER_GPUS = 4, 4
+
+device_groups = [
+    DeviceGroup(name='model', ranks=list(range(MODEL_GPUS)), device_type='GPU'),
+    DeviceGroup(name='sampler', ranks=list(range(MODEL_GPUS, MODEL_GPUS + SAMPLER_GPUS)), device_type='GPU'),
+]
+twinkle.initialize(mode='ray', nproc_per_node=MODEL_GPUS + SAMPLER_GPUS, groups=device_groups)
+
+model = TransformersModel(model_id='ms://Qwen/Qwen3.5-4B',
+                          device_mesh=DeviceMesh.from_sizes(world_size=MODEL_GPUS, dp_size=MODEL_GPUS),
+                          remote_group='model')
+model.set_loss('GRPOLoss', epsilon=0.2)
+
+sampler = vLLMSampler(model_id='ms://Qwen/Qwen3.5-4B',
+                      engine_args={'gpu_memory_utilization': 0.8, 'max_model_len': 4496},
+                      device_mesh=DeviceMesh.from_sizes(world_size=SAMPLER_GPUS, dp_size=SAMPLER_GPUS),
+                      remote_group='sampler')
+
+ckpt_manager = CheckpointEngineManager(model=model, sampler=sampler)
+advantage_fn = GRPOAdvantage()
+
+dataset = Dataset(dataset_meta=DatasetMeta('ms://modelscope/gsm8k'))
+dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B')
+dataset.encode(add_generation_prompt=True)
+dataloader = DataLoader(dataset=dataset, batch_size=8)
+
+for batch in dataloader:
+    ckpt_manager.sync_weights(merge_and_sync=False)
+    expand_prompts = [p for prompt in batch for p in [prompt] * 8]  # 组采样
+    responses = sampler.sample(expand_prompts, SamplingParams(max_tokens=4096, logprobs=1))
+    # 从响应中提取轨迹、旧 log-probs 并计算奖励
+    inputs = [seq.new_input_feature for r in responses for seq in r.sequences]
+    old_logps = [[lp[0][1] for lp in seq.logprobs] for r in responses for seq in r.sequences]
+    rewards = [a + f for a, f in zip(
+        GSM8KAccuracyReward()(inputs), GSM8KFormatReward()(inputs))]
+    advantages = advantage_fn(rewards, num_generations=8, scale='group')
+    model.forward_backward(inputs=inputs, old_logps=old_logps, advantages=advantages)
+    model.clip_grad_and_step()
+```
diff --git a/content/showcase/megatron-tp/index.md b/content/showcase/megatron-tp/index.md
new file mode 100644
index 0000000..31cd2e8
--- /dev/null
+++ b/content/showcase/megatron-tp/index.md
@@ -0,0 +1,42 @@
+---
+title: Megatron TP Training
+linkTitle: Megatron TP
+weight: 30
+---
+
+Tensor-parallel training via Megatron backend — ideal for large models that don't fit on a single GPU.
+
+[View full source →](https://github.com/modelscope/twinkle/blob/main/cookbook/megatron/tp.py)
+
+```python
+from peft import LoraConfig
+
+import twinkle
+from twinkle import DeviceMesh, get_logger
+from twinkle.cli import CLI
+from twinkle.dataloader import DataLoader
+from twinkle.dataset import Dataset, DatasetMeta
+from twinkle.model import MegatronModel
+from twinkle.preprocessor import SelfCognitionProcessor
+
+args = CLI.from_args()
+device_mesh = DeviceMesh.from_sizes(dp_size=args.infra.dp_size, tp_size=args.infra.tp_size, pp_size=args.infra.pp_size)
+twinkle.initialize(mode=args.infra.mode, global_device_mesh=device_mesh)
+
+dataset = Dataset(dataset_meta=DatasetMeta(args.dataset.dataset_id))
+dataset.set_template(args.template.template_cls, model_id=args.model.model_id)
+dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区'))
+dataset.encode()
+
+dataloader = DataLoader(dataset=dataset, batch_size=args.training.batch_size)
+model = MegatronModel(model_id=args.model.model_id)
+model.add_adapter_to_model('default', LoraConfig(**args.get_lora_args()))
+model.set_optimizer(optimizer_cls='default', lr=args.optimizer.learning_rate)
+model.set_lr_scheduler(scheduler_cls='default', lr_warmup_steps=args.scheduler.num_warmup_steps,
+                       lr_decay_steps=len(dataloader))
+
+for batch in dataloader:
+    model.forward_backward(inputs=batch)
+    model.clip_grad_and_step()
+model.save('last-checkpoint', output_dir=args.training.output_dir)
+```
diff --git a/content/showcase/megatron-tp/index.zh.md b/content/showcase/megatron-tp/index.zh.md
new file mode 100644
index 0000000..10ead3d
--- /dev/null
+++ b/content/showcase/megatron-tp/index.zh.md
@@ -0,0 +1,42 @@
+---
+title: Megatron 张量并行训练
+linkTitle: Megatron TP
+weight: 30
+---
+
+通过 Megatron 后端进行张量并行训练 — 适用于单卡放不下的大模型。
+
+[查看完整源码 →](https://github.com/modelscope/twinkle/blob/main/cookbook/megatron/tp.py)
+
+```python
+from peft import LoraConfig
+
+import twinkle
+from twinkle import DeviceMesh, get_logger
+from twinkle.cli import CLI
+from twinkle.dataloader import DataLoader
+from twinkle.dataset import Dataset, DatasetMeta
+from twinkle.model import MegatronModel
+from twinkle.preprocessor import SelfCognitionProcessor
+
+args = CLI.from_args()
+device_mesh = DeviceMesh.from_sizes(dp_size=args.infra.dp_size, tp_size=args.infra.tp_size, pp_size=args.infra.pp_size)
+twinkle.initialize(mode=args.infra.mode, global_device_mesh=device_mesh)
+
+dataset = Dataset(dataset_meta=DatasetMeta(args.dataset.dataset_id))
+dataset.set_template(args.template.template_cls, model_id=args.model.model_id)
+dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区'))
+dataset.encode()
+
+dataloader = DataLoader(dataset=dataset, batch_size=args.training.batch_size)
+model = MegatronModel(model_id=args.model.model_id)
+model.add_adapter_to_model('default', LoraConfig(**args.get_lora_args()))
+model.set_optimizer(optimizer_cls='default', lr=args.optimizer.learning_rate)
+model.set_lr_scheduler(scheduler_cls='default', lr_warmup_steps=args.scheduler.num_warmup_steps,
+                       lr_decay_steps=len(dataloader))
+
+for batch in dataloader:
+    model.forward_backward(inputs=batch)
+    model.clip_grad_and_step()
+model.save('last-checkpoint', output_dir=args.training.output_dir)
+```
diff --git a/content/showcase/multi-tenancy/index.md b/content/showcase/multi-tenancy/index.md
deleted file mode 100644
index 1001ef7..0000000
--- a/content/showcase/multi-tenancy/index.md
+++ /dev/null
@@ -1,43 +0,0 @@
----
-title: "Multi-Tenancy Training"
-date: 2026-03-01
-summary: "Train multiple LoRAs concurrently on a single shared base model deployment."
-image:
-  filename: multi_lora.png
-  caption: "Twinkle Multi-Tenancy Architecture"
----
-
-Twinkle supports simultaneous multi-tenant training on a shared base model, dramatically reducing deployment costs while enabling flexible configurations per tenant.
-
-## Key Features
-
-- **Resource Efficiency**: Single base model serves multiple concurrent training sessions
-- **Complete Isolation**: Each tenant has separate LoRA weights, optimizers, and loss functions
-- **Heterogeneous Configs**: Different ranks, learning rates, and training objectives per tenant
-- **Concurrent Access**: No interference between training sessions
-
-## Use Cases
-
-| Tenant | Dataset | LoRA Rank | Training Type |
-|:-------|:--------|:----------|:--------------|
-| A | Private data | 8 | SFT |
-| B | Open-source | 32 | Pre-training |
-| C | RL dataset | 16 | GRPO |
-| D | Inference | - | Log probability |
-
-## Example
-
-```python
-from twinkle_client import init_twinkle_client
-from twinkle_client.model import MultiLoraTransformersModel
-
-client = init_twinkle_client(base_url='http://server:8000')
-
-model = MultiLoraTransformersModel(model_id='ms://Qwen/Qwen3.5-4B')
-model.add_adapter_to_model('tenant_a', LoraConfig(r=8))
-model.set_loss('GRPOLoss', epsilon=0.2)
-
-for batch in dataloader:
-    model.forward_backward(inputs=batch)
-    model.step()
-```
diff --git a/content/showcase/multi-tenancy/index.zh.md b/content/showcase/multi-tenancy/index.zh.md
deleted file mode 100644
index addabb1..0000000
--- a/content/showcase/multi-tenancy/index.zh.md
+++ /dev/null
@@ -1,43 +0,0 @@
----
-title: "多租户训练"
-date: 2026-03-01
-summary: "在单一共享基座模型上并发训练多个 LoRA。"
-image:
-  filename: multi_lora.png
-  caption: "Twinkle 多租户架构"
----
-
-Twinkle 支持在共享基座模型上同时进行多租户训练，大幅降低部署成本，同时允许每个租户使用灵活的配置。
-
-## 核心特性
-
-- **资源高效**: 单一基座模型服务多个并发训练会话
-- **完全隔离**: 每个租户拥有独立的 LoRA 权重、优化器和损失函数
-- **异构配置**: 每个租户可使用不同的 rank、学习率和训练目标
-- **并发访问**: 训练会话之间互不干扰
-
-## 使用场景
-
-| 租户 | 数据集 | LoRA Rank | 训练类型 |
-|:-----|:-------|:----------|:---------|
-| A | 私有数据 | 8 | SFT |
-| B | 开源数据 | 32 | 预训练 |
-| C | RL 数据 | 16 | GRPO |
-| D | 推理 | - | 对数概率 |
-
-## 示例
-
-```python
-from twinkle_client import init_twinkle_client
-from twinkle_client.model import MultiLoraTransformersModel
-
-client = init_twinkle_client(base_url='http://server:8000')
-
-model = MultiLoraTransformersModel(model_id='ms://Qwen/Qwen3.5-4B')
-model.add_adapter_to_model('tenant_a', LoraConfig(r=8))
-model.set_loss('GRPOLoss', epsilon=0.2)
-
-for batch in dataloader:
-    model.forward_backward(inputs=batch)
-    model.step()
-```
diff --git a/content/showcase/multi-turn-rl/index.md b/content/showcase/multi-turn-rl/index.md
new file mode 100644
index 0000000..b63a814
--- /dev/null
+++ b/content/showcase/multi-turn-rl/index.md
@@ -0,0 +1,54 @@
+---
+title: Multi-Turn RL (OpenEnv)
+linkTitle: Multi-Turn RL
+weight: 70
+---
+
+Multi-turn GRPO with interactive environments — the agent takes actions via tool calls and learns from episode rewards.
+
+[View full source →](https://github.com/modelscope/twinkle/blob/main/cookbook/rl/multi_turn/multi_turn_grpo.py)
+
+```python
+import twinkle
+from twinkle import DeviceMesh, DeviceGroup
+from twinkle.advantage import GRPOAdvantage
+from twinkle.data_format import SamplingParams
+from twinkle.sampler import vLLMSampler
+from twinkle.model import TransformersModel
+from twinkle_agentic.envs import OpenEnv, EnvTool
+from twinkle_agentic.rollout.multi_turn import MultiTurnRollout
+from twinkle_agentic.tools.tool_manager import ToolManager
+
+MODEL_ID = 'ms://Qwen/Qwen3.5-4B'
+MODEL_GPUS, SAMPLER_GPUS = 4, 4
+MAX_STEPS = 1000
+
+device_groups = [
+    DeviceGroup(name='model', ranks=list(range(MODEL_GPUS)), device_type='GPU'),
+    DeviceGroup(name='sampler', ranks=list(range(MODEL_GPUS, MODEL_GPUS + SAMPLER_GPUS)), device_type='GPU'),
+]
+twinkle.initialize(mode='ray', nproc_per_node=8, groups=device_groups)
+
+model = TransformersModel(model_id=MODEL_ID, remote_group='model')
+sampler = vLLMSampler(model_id=MODEL_ID, remote_group='sampler')
+
+sampling_params = SamplingParams(max_tokens=2048)
+rollout = MultiTurnRollout(sampler=sampler,
+                           sampling_params=sampling_params, max_turns=6)
+
+for step in range(MAX_STEPS):
+    # 1. Reset environments and get initial observations
+    # prepare_trajectories (not defined in this excerpt, like BATCH_SIZE, TOOL_SCHEMA and SYSTEM_PROMPT; see the full source): creates OpenEnv connections, resets envs, builds initial trajectory dicts
+    trajectories, tool_managers, env_tools = prepare_trajectories(
+        n_trajectories=BATCH_SIZE * 8, env_url='http://localhost:8000',
+        tool_schema=TOOL_SCHEMA, system_prompt=SYSTEM_PROMPT)
+    # 2. Multi-turn rollout: model generates tool calls, env responds
+    all_trajectories = rollout(trajectories, tool_manager=tool_managers)
+    # 3. Extract episode rewards from environments
+    # extract_rewards: reads cumulative reward from each EnvTool instance
+    rewards = extract_rewards(env_tools)
+    # 4. GRPO advantage → forward_backward → step
+    advantages = GRPOAdvantage()(rewards, num_generations=8, scale='group')
+    model.forward_backward(inputs=all_trajectories, advantages=advantages)
+    model.clip_grad_and_step()
+```
diff --git a/content/showcase/multi-turn-rl/index.zh.md b/content/showcase/multi-turn-rl/index.zh.md
new file mode 100644
index 0000000..d0ed472
--- /dev/null
+++ b/content/showcase/multi-turn-rl/index.zh.md
@@ -0,0 +1,54 @@
+---
+title: 多轮 RL (OpenEnv)
+linkTitle: 多轮 RL
+weight: 70
+---
+
+多轮 GRPO + 交互式环境 — Agent 通过 tool call 与环境交互，从 episode reward 中学习。
+
+[查看完整源码 →](https://github.com/modelscope/twinkle/blob/main/cookbook/rl/multi_turn/multi_turn_grpo.py)
+
+```python
+import twinkle
+from twinkle import DeviceMesh, DeviceGroup
+from twinkle.advantage import GRPOAdvantage
+from twinkle.data_format import SamplingParams
+from twinkle.sampler import vLLMSampler
+from twinkle.model import TransformersModel
+from twinkle_agentic.envs import OpenEnv, EnvTool
+from twinkle_agentic.rollout.multi_turn import MultiTurnRollout
+from twinkle_agentic.tools.tool_manager import ToolManager
+
+MODEL_ID = 'ms://Qwen/Qwen3.5-4B'
+MODEL_GPUS, SAMPLER_GPUS = 4, 4
+MAX_STEPS = 1000
+
+device_groups = [
+    DeviceGroup(name='model', ranks=list(range(MODEL_GPUS)), device_type='GPU'),
+    DeviceGroup(name='sampler', ranks=list(range(MODEL_GPUS, MODEL_GPUS + SAMPLER_GPUS)), device_type='GPU'),
+]
+twinkle.initialize(mode='ray', nproc_per_node=8, groups=device_groups)
+
+model = TransformersModel(model_id=MODEL_ID, remote_group='model')
+sampler = vLLMSampler(model_id=MODEL_ID, remote_group='sampler')
+
+sampling_params = SamplingParams(max_tokens=2048)
+rollout = MultiTurnRollout(sampler=sampler,
+                           sampling_params=sampling_params, max_turns=6)
+
+for step in range(MAX_STEPS):
+    # 1. 重置环境，获取初始观测
+    # prepare_trajectories（与 BATCH_SIZE、TOOL_SCHEMA、SYSTEM_PROMPT 一样未在此片段中定义，见完整源码）: 创建 OpenEnv 连接，重置环境，构建初始 trajectory 字典
+    trajectories, tool_managers, env_tools = prepare_trajectories(
+        n_trajectories=BATCH_SIZE * 8, env_url='http://localhost:8000',
+        tool_schema=TOOL_SCHEMA, system_prompt=SYSTEM_PROMPT)
+    # 2. 多轮 rollout：模型生成 tool call，环境返回观测
+    all_trajectories = rollout(trajectories, tool_manager=tool_managers)
+    # 3. 从环境提取 episode reward
+    # extract_rewards: 从每个 EnvTool 实例读取累积奖励
+    rewards = extract_rewards(env_tools)
+    # 4. GRPO advantage → forward_backward → step
+    advantages = GRPOAdvantage()(rewards, num_generations=8, scale='group')
+    model.forward_backward(inputs=all_trajectories, advantages=advantages)
+    model.clip_grad_and_step()
+```
diff --git a/content/showcase/multimodal/index.md b/content/showcase/multimodal/index.md
new file mode 100644
index 0000000..6940c06
--- /dev/null
+++ b/content/showcase/multimodal/index.md
@@ -0,0 +1,45 @@
+---
+title: Multimodal SFT (VLM)
+linkTitle: Multimodal
+weight: 90
+---
+
+Vision-language model fine-tuning with image inputs (e.g. LaTeX OCR, Gemma4).
+
+[View full source →](https://github.com/modelscope/twinkle/blob/main/cookbook/mm/fsdp2.py)
+
+```python
+import twinkle
+from twinkle import DeviceMesh
+from twinkle.cli import CLI
+from twinkle.data_format import Trajectory, Message
+from twinkle.dataloader import DataLoader
+from twinkle.dataset import LazyDataset, DatasetMeta
+from twinkle.model import TransformersModel
+from twinkle.preprocessor import Preprocessor
+
+args = CLI.from_args()
+device_mesh = DeviceMesh.from_sizes(fsdp_size=args.infra.fsdp_size, dp_size=args.infra.dp_size)
+twinkle.initialize(mode=args.infra.mode, global_device_mesh=device_mesh)
+
+class LatexOCRProcessor(Preprocessor):
+    def preprocess(self, row) -> Trajectory:
+        return Trajectory(messages=[
+            Message(role='user', content=[{'type': 'image', 'image': row['image']},
+                                          {'type': 'text', 'text': 'Convert to LaTeX.'}]),
+            Message(role='assistant', content=row['text']),
+        ])
+
+dataset = LazyDataset(DatasetMeta('ms://linxy/LaTeX_OCR'))
+dataset.map(LatexOCRProcessor())
+dataset.set_template('Qwen3_5Template', model_id=args.model.model_id, max_length=2048)
+dataloader = DataLoader(dataset=dataset, batch_size=args.training.batch_size)
+
+model = TransformersModel(model_id=args.model.model_id)
+model.set_optimizer(optimizer_cls='AdamW', lr=1e-4)
+
+for batch in dataloader:
+    model.forward_backward(inputs=batch)
+    model.clip_grad_and_step()
+model.save('last-checkpoint', output_dir=args.training.output_dir)
+```
diff --git a/content/showcase/multimodal/index.zh.md b/content/showcase/multimodal/index.zh.md
new file mode 100644
index 0000000..c47c868
--- /dev/null
+++ b/content/showcase/multimodal/index.zh.md
@@ -0,0 +1,45 @@
+---
+title: 多模态 SFT (VLM)
+linkTitle: 多模态
+weight: 90
+---
+
+视觉语言模型微调，支持图片输入（如 LaTeX OCR、Gemma4）。
+
+[查看完整源码 →](https://github.com/modelscope/twinkle/blob/main/cookbook/mm/fsdp2.py)
+
+```python
+import twinkle
+from twinkle import DeviceMesh
+from twinkle.cli import CLI
+from twinkle.data_format import Trajectory, Message
+from twinkle.dataloader import DataLoader
+from twinkle.dataset import LazyDataset, DatasetMeta
+from twinkle.model import TransformersModel
+from twinkle.preprocessor import Preprocessor
+
+args = CLI.from_args()
+device_mesh = DeviceMesh.from_sizes(fsdp_size=args.infra.fsdp_size, dp_size=args.infra.dp_size)
+twinkle.initialize(mode=args.infra.mode, global_device_mesh=device_mesh)
+
+class LatexOCRProcessor(Preprocessor):
+    def preprocess(self, row) -> Trajectory:
+        return Trajectory(messages=[
+            Message(role='user', content=[{'type': 'image', 'image': row['image']},
+                                          {'type': 'text', 'text': '转换为 LaTeX。'}]),
+            Message(role='assistant', content=row['text']),
+        ])
+
+dataset = LazyDataset(DatasetMeta('ms://linxy/LaTeX_OCR'))
+dataset.map(LatexOCRProcessor())
+dataset.set_template('Qwen3_5Template', model_id=args.model.model_id, max_length=2048)
+dataloader = DataLoader(dataset=dataset, batch_size=args.training.batch_size)
+
+model = TransformersModel(model_id=args.model.model_id)
+model.set_optimizer(optimizer_cls='AdamW', lr=1e-4)
+
+for batch in dataloader:
+    model.forward_backward(inputs=batch)
+    model.clip_grad_and_step()
+model.save('last-checkpoint', output_dir=args.training.output_dir)
+```
diff --git a/content/showcase/npu-ascend/index.md b/content/showcase/npu-ascend/index.md
new file mode 100644
index 0000000..e698325
--- /dev/null
+++ b/content/showcase/npu-ascend/index.md
@@ -0,0 +1,75 @@
+---
+title: Ascend NPU — Megatron on MindSpeed
+linkTitle: NPU (Ascend)
+weight: 35
+---
+
+Training on Huawei Ascend NPUs using the Megatron backend with MindSpeed integration.
+Twinkle automatically applies fused NPU operators (RMSNorm, RoPE, SwiGLU, SDPA) via `kernelize_model()`.
+
+Three recipes are provided — basic TP, MoE with EP, and MoE with Context Parallelism.
+
+## 1. Tensor Parallel (TP + PP + DP)
+
+Launch: `ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 torchrun --nproc_per_node=8 tp_npu.py`
+
+[View full source →](https://github.com/modelscope/twinkle/blob/main/cookbook/megatron/ascend/tp_npu.py)
+
+```python
+from twinkle import DeviceMesh
+from twinkle.dataloader import DataLoader
+from twinkle.dataset import Dataset, DatasetMeta
+from twinkle.model import MegatronModel
+import twinkle
+
+MODEL_ID = 'ms://Qwen/Qwen3-4B'
+
+# 8-card TP/PP/DP layout on NPU
+device_mesh = DeviceMesh.from_sizes(dp_size=2, tp_size=2, pp_size=2, device_type='npu')
+twinkle.initialize(mode='local', global_device_mesh=device_mesh)
+
+dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition'))
+dataset.set_template('Template', model_id=MODEL_ID)
+dataset.encode()
+dataloader = DataLoader(dataset=dataset, batch_size=8, num_workers=0)
+
+model = MegatronModel(model_id=MODEL_ID)
+# Full-parameter training by default; optionally add LoRA:
+# from peft import LoraConfig
+# model.add_adapter_to_model('default', LoraConfig(r=8, lora_alpha=32, target_modules='all-linear'))
+model.set_optimizer(optimizer_cls='default', lr=1e-4)
+
+for step, batch in enumerate(dataloader):
+    model.forward_backward(inputs=batch)
+    model.clip_grad_and_step()
+```
+
+## 2. MoE with Expert Parallel (TP + PP + DP + EP)
+
+Launch: `ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 torchrun --nproc_per_node=8 tp_moe_npu.py`
+
+[View full source →](https://github.com/modelscope/twinkle/blob/main/cookbook/megatron/ascend/tp_moe_npu.py)
+
+```python
+MODEL_ID = 'ms://Qwen/Qwen3-30B-A3B'
+
+# MoE layout: add ep_size=2 for expert parallelism
+device_mesh = DeviceMesh.from_sizes(dp_size=2, tp_size=2, pp_size=2, cp_size=1, ep_size=2, device_type='npu')
+twinkle.initialize(mode='local', global_device_mesh=device_mesh)
+```
+
+## 3. MoE + Context Parallelism (TP + PP + CP + EP)
+
+Launch: `ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 torchrun --nproc_per_node=8 tp_moe_cp_npu.py`
+
+[View full source →](https://github.com/modelscope/twinkle/blob/main/cookbook/megatron/ascend/tp_moe_cp_npu.py)
+
+```python
+MODEL_ID = 'ms://Qwen/Qwen3-30B-A3B'
+
+# Full parallelism: TP=2, PP=2, CP=2, EP=2
+device_mesh = DeviceMesh.from_sizes(dp_size=1, tp_size=2, pp_size=2, cp_size=2, ep_size=2, device_type='npu')
+twinkle.initialize(mode='local', global_device_mesh=device_mesh)
+```
+
+> **Note**: Use `ASCEND_RT_VISIBLE_DEVICES` instead of `CUDA_VISIBLE_DEVICES`. The `device_type='npu'` flag enables NPU-specific kernel patches automatically.
diff --git a/content/showcase/npu-ascend/index.zh.md b/content/showcase/npu-ascend/index.zh.md
new file mode 100644
index 0000000..30fedc7
--- /dev/null
+++ b/content/showcase/npu-ascend/index.zh.md
@@ -0,0 +1,75 @@
+---
+title: 昇腾 NPU — Megatron + MindSpeed
+linkTitle: NPU（昇腾）
+weight: 35
+---
+
+使用 Megatron 后端在华为昇腾 NPU 上训练，集成 MindSpeed 加速。
+Twinkle 通过 `kernelize_model()` 自动应用融合 NPU 算子（RMSNorm、RoPE、SwiGLU、SDPA）。
+
+提供三种配方——基础 TP、MoE + EP、MoE + Context Parallelism。
+
+## 1. 张量并行（TP + PP + DP）
+
+启动命令：`ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 torchrun --nproc_per_node=8 tp_npu.py`
+
+[查看完整源码 →](https://github.com/modelscope/twinkle/blob/main/cookbook/megatron/ascend/tp_npu.py)
+
+```python
+from twinkle import DeviceMesh
+from twinkle.dataloader import DataLoader
+from twinkle.dataset import Dataset, DatasetMeta
+from twinkle.model import MegatronModel
+import twinkle
+
+MODEL_ID = 'ms://Qwen/Qwen3-4B'
+
+# 8 卡 TP/PP/DP 布局，运行在 NPU 上
+device_mesh = DeviceMesh.from_sizes(dp_size=2, tp_size=2, pp_size=2, device_type='npu')
+twinkle.initialize(mode='local', global_device_mesh=device_mesh)
+
+dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition'))
+dataset.set_template('Template', model_id=MODEL_ID)
+dataset.encode()
+dataloader = DataLoader(dataset=dataset, batch_size=8, num_workers=0)
+
+model = MegatronModel(model_id=MODEL_ID)
+# 默认全参数训练；可选添加 LoRA：
+# from peft import LoraConfig
+# model.add_adapter_to_model('default', LoraConfig(r=8, lora_alpha=32, target_modules='all-linear'))
+model.set_optimizer(optimizer_cls='default', lr=1e-4)
+
+for step, batch in enumerate(dataloader):
+    model.forward_backward(inputs=batch)
+    model.clip_grad_and_step()
+```
+
+## 2. MoE + Expert Parallel（TP + PP + DP + EP）
+
+启动命令：`ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 torchrun --nproc_per_node=8 tp_moe_npu.py`
+
+[查看完整源码 →](https://github.com/modelscope/twinkle/blob/main/cookbook/megatron/ascend/tp_moe_npu.py)
+
+```python
+MODEL_ID = 'ms://Qwen/Qwen3-30B-A3B'
+
+# MoE 布局：添加 ep_size=2 启用专家并行
+device_mesh = DeviceMesh.from_sizes(dp_size=2, tp_size=2, pp_size=2, cp_size=1, ep_size=2, device_type='npu')
+twinkle.initialize(mode='local', global_device_mesh=device_mesh)
+```
+
+## 3. MoE + Context Parallelism（TP + PP + CP + EP）
+
+启动命令：`ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 torchrun --nproc_per_node=8 tp_moe_cp_npu.py`
+
+[查看完整源码 →](https://github.com/modelscope/twinkle/blob/main/cookbook/megatron/ascend/tp_moe_cp_npu.py)
+
+```python
+MODEL_ID = 'ms://Qwen/Qwen3-30B-A3B'
+
+# 全并行：TP=2, PP=2, CP=2, EP=2
+device_mesh = DeviceMesh.from_sizes(dp_size=1, tp_size=2, pp_size=2, cp_size=2, ep_size=2, device_type='npu')
+twinkle.initialize(mode='local', global_device_mesh=device_mesh)
+```
+
+> **注意**：使用 `ASCEND_RT_VISIBLE_DEVICES` 代替 `CUDA_VISIBLE_DEVICES`。`device_type='npu'` 标志会自动启用 NPU 专用 kernel patch。
diff --git a/content/showcase/ray-training/index.md b/content/showcase/ray-training/index.md
deleted file mode 100644
index c4f1965..0000000
--- a/content/showcase/ray-training/index.md
+++ /dev/null
@@ -1,34 +0,0 @@
----
-title: "Ray Distributed Training"
-date: 2026-03-01
-summary: "Scale LLM training from single GPU to multi-node Ray clusters with the same code."
-image:
-  filename: framework.jpg
-  caption: "Twinkle Ray Training Architecture"
----
-
-Twinkle enables seamless scaling from single GPU training to multi-node Ray clusters. The same training code runs across different backends with minimal configuration changes.
-
-## Key Features
-
-- **Unified API**: Same training code works with torchrun, Ray, and HTTP modes
-- **Flexible Parallelism**: Support for FSDP, tensor parallelism, pipeline parallelism
-- **Model-Sampler Coordination**: Efficient weight synchronization for RL training
-- **Dynamic Resource Management**: Ray handles GPU allocation automatically
-
-## Example
-
-```python
-import twinkle
-from twinkle import DeviceMesh, DeviceGroup
-
-device_groups = [
-    DeviceGroup(name='model', ranks=4, device_type='cuda'),
-    DeviceGroup(name='sampler', ranks=4, device_type='cuda'),
-]
-
-twinkle.initialize(mode='ray', nproc_per_node=8, groups=device_groups)
-
-# Training code remains the same as single GPU!
-model = TransformersModel(model_id='ms://Qwen/Qwen3.5-4B', remote_group='model')
-```
diff --git a/content/showcase/ray-training/index.zh.md b/content/showcase/ray-training/index.zh.md
deleted file mode 100644
index bc3d716..0000000
--- a/content/showcase/ray-training/index.zh.md
+++ /dev/null
@@ -1,34 +0,0 @@
----
-title: "Ray 分布式训练"
-date: 2026-03-01
-summary: "使用相同代码，从单卡扩展到多节点 Ray 集群。"
-image:
-  filename: framework.jpg
-  caption: "Twinkle Ray 训练架构"
----
-
-Twinkle 支持从单卡训练无缝扩展到多节点 Ray 集群。相同的训练代码可以在不同后端运行，只需最少的配置更改。
-
-## 核心特性
-
-- **统一 API**: 相同训练代码适用于 torchrun、Ray 和 HTTP 模式
-- **灵活并行**: 支持 FSDP、张量并行、流水线并行
-- **模型-采样器协调**: 高效的 RL 训练权重同步
-- **动态资源管理**: Ray 自动处理 GPU 分配
-
-## 示例
-
-```python
-import twinkle
-from twinkle import DeviceMesh, DeviceGroup
-
-device_groups = [
-    DeviceGroup(name='model', ranks=4, device_type='cuda'),
-    DeviceGroup(name='sampler', ranks=4, device_type='cuda'),
-]
-
-twinkle.initialize(mode='ray', nproc_per_node=8, groups=device_groups)
-
-# 训练代码与单卡完全相同！
-model = TransformersModel(model_id='ms://Qwen/Qwen3.5-4B', remote_group='model')
-```
diff --git a/content/showcase/sft-transformers/index.md b/content/showcase/sft-transformers/index.md
new file mode 100644
index 0000000..9d1f968
--- /dev/null
+++ b/content/showcase/sft-transformers/index.md
@@ -0,0 +1,46 @@
+---
+title: SFT — Transformers FSDP2
+linkTitle: SFT (FSDP2)
+weight: 20
+---
+
+Supervised fine-tuning with FSDP2 sharding and the Muon optimizer. Supports both full-parameter and LoRA training.
+
+[View full source →](https://github.com/modelscope/twinkle/blob/main/cookbook/transformers/fsdp2.py)
+
+```python
+from torch.optim import Muon
+
+import twinkle
+from twinkle import DeviceMesh, get_logger
+from twinkle.cli import CLI
+from twinkle.dataloader import DataLoader
+from twinkle.dataset import Dataset, DatasetMeta
+from twinkle.model import TransformersModel
+from twinkle.preprocessor import SelfCognitionProcessor
+
+args = CLI.from_args()
+device_mesh = DeviceMesh.from_sizes(fsdp_size=args.infra.fsdp_size, dp_size=args.infra.dp_size)
+twinkle.initialize(mode=args.infra.mode, global_device_mesh=device_mesh)
+
+dataset = Dataset(dataset_meta=DatasetMeta(args.dataset.dataset_id))
+dataset.set_template(args.template.template_cls, model_id=args.model.model_id)
+dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区'))
+dataset.encode()
+
+dataloader = DataLoader(dataset=dataset, batch_size=args.training.batch_size)
+model = TransformersModel(model_id=args.model.model_id)
+# Full-parameter training by default; optionally add LoRA:
+# from peft import LoraConfig
+# model.add_adapter_to_model('default', LoraConfig(**args.get_lora_args()),
+#                            gradient_accumulation_steps=args.training.gradient_accumulation_steps)
+model.set_optimizer(optimizer_cls=Muon, lr=args.optimizer.learning_rate, adjust_lr_fn='match_rms_adamw')
+model.set_lr_scheduler(scheduler_cls=args.scheduler.scheduler_cls,
+                       num_warmup_steps=args.scheduler.num_warmup_steps,
+                       num_training_steps=len(dataloader))
+
+for batch in dataloader:
+    model.forward_backward(inputs=batch)
+    model.clip_grad_and_step()
+model.save('last-checkpoint', output_dir=args.training.output_dir)
+```
diff --git a/content/showcase/sft-transformers/index.zh.md b/content/showcase/sft-transformers/index.zh.md
new file mode 100644
index 0000000..9222193
--- /dev/null
+++ b/content/showcase/sft-transformers/index.zh.md
@@ -0,0 +1,46 @@
+---
+title: SFT — Transformers FSDP2
+linkTitle: SFT (FSDP2)
+weight: 20
+---
+
+使用 FSDP2 分片 + Muon 优化器的监督微调。支持全参数和 LoRA 训练。
+
+[查看完整源码 →](https://github.com/modelscope/twinkle/blob/main/cookbook/transformers/fsdp2.py)
+
+```python
+from torch.optim import Muon
+
+import twinkle
+from twinkle import DeviceMesh, get_logger
+from twinkle.cli import CLI
+from twinkle.dataloader import DataLoader
+from twinkle.dataset import Dataset, DatasetMeta
+from twinkle.model import TransformersModel
+from twinkle.preprocessor import SelfCognitionProcessor
+
+args = CLI.from_args()
+device_mesh = DeviceMesh.from_sizes(fsdp_size=args.infra.fsdp_size, dp_size=args.infra.dp_size)
+twinkle.initialize(mode=args.infra.mode, global_device_mesh=device_mesh)
+
+dataset = Dataset(dataset_meta=DatasetMeta(args.dataset.dataset_id))
+dataset.set_template(args.template.template_cls, model_id=args.model.model_id)
+dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区'))
+dataset.encode()
+
+dataloader = DataLoader(dataset=dataset, batch_size=args.training.batch_size)
+model = TransformersModel(model_id=args.model.model_id)
+# 默认全参数训练；可选添加 LoRA：
+# from peft import LoraConfig
+# model.add_adapter_to_model('default', LoraConfig(**args.get_lora_args()),
+#                            gradient_accumulation_steps=args.training.gradient_accumulation_steps)
+model.set_optimizer(optimizer_cls=Muon, lr=args.optimizer.learning_rate, adjust_lr_fn='match_rms_adamw')
+model.set_lr_scheduler(scheduler_cls=args.scheduler.scheduler_cls,
+                       num_warmup_steps=args.scheduler.num_warmup_steps,
+                       num_training_steps=len(dataloader))
+
+for batch in dataloader:
+    model.forward_backward(inputs=batch)
+    model.clip_grad_and_step()
+model.save('last-checkpoint', output_dir=args.training.output_dir)
+```
diff --git a/content/showcase/shell-launch/index.md b/content/showcase/shell-launch/index.md
new file mode 100644
index 0000000..6104138
--- /dev/null
+++ b/content/showcase/shell-launch/index.md
@@ -0,0 +1,28 @@
+---
+title: Shell Launch (torchrun)
+linkTitle: Shell Launch
+weight: 10
+---
+
+The standard way to launch local multi-GPU training with torchrun:
+
+```bash
+#!/usr/bin/env bash
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+  torchrun --nproc_per_node=8 fsdp2.py \
+    --model-id ms://Qwen/Qwen3.5-4B \
+    --dataset-id ms://swift/self-cognition \
+    --template-cls Qwen3_5Template \
+    --fsdp-size 2 \
+    --dp-size 4 \
+    --batch-size 8 \
+    --lr 1e-4 \
+    --gradient-accumulation-steps 2 \
+    --output-dir ./output/fsdp2 \
+    --adapter-name default \
+    --scheduler-cls CosineWarmupScheduler \
+    --num-warmup-steps 5 \
+    --train-samples 1000
+```
+
+[View full source →](https://github.com/modelscope/twinkle/blob/main/cookbook/transformers/fsdp2.sh)
diff --git a/content/showcase/shell-launch/index.zh.md b/content/showcase/shell-launch/index.zh.md
new file mode 100644
index 0000000..3e7135e
--- /dev/null
+++ b/content/showcase/shell-launch/index.zh.md
@@ -0,0 +1,28 @@
+---
+title: Shell 启动 (torchrun)
+linkTitle: Shell 启动
+weight: 10
+---
+
+标准多卡本地训练的 torchrun 启动方式：
+
+```bash
+#!/usr/bin/env bash
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+  torchrun --nproc_per_node=8 fsdp2.py \
+    --model-id ms://Qwen/Qwen3.5-4B \
+    --dataset-id ms://swift/self-cognition \
+    --template-cls Qwen3_5Template \
+    --fsdp-size 2 \
+    --dp-size 4 \
+    --batch-size 8 \
+    --lr 1e-4 \
+    --gradient-accumulation-steps 2 \
+    --output-dir ./output/fsdp2 \
+    --adapter-name default \
+    --scheduler-cls CosineWarmupScheduler \
+    --num-warmup-steps 5 \
+    --train-samples 1000
+```
+
+[查看完整源码 →](https://github.com/modelscope/twinkle/blob/main/cookbook/transformers/fsdp2.sh)
diff --git a/data/authors/me.yaml b/data/authors/me.yaml
deleted file mode 100644
index 6e88113..0000000
--- a/data/authors/me.yaml
+++ /dev/null
@@ -1,23 +0,0 @@
-schema: hugoblox/author/v1
-slug: me
-is_owner: true
-name:
-  display: ModelScope Team
-  given: ModelScope
-  family: Team
-status:
-  icon: "🔬"
-role: AI Research & Engineering
-bio: Building open-source AI infrastructure for the community. Twinkle is our lightweight LLM training framework.
-affiliations:
-  - name: ModelScope
-    url: https://github.com/modelscope
-links:
-  - icon: at-symbol
-    url: mailto:contact@modelscope.cn
-    label: E-mail
-  - icon: brands/github
-    url: https://github.com/modelscope/twinkle
-  - icon: rss
-    url: ./post/index.xml
-    label: Subscribe to blog via RSS feed
diff --git a/layouts/_partials/components/headers/navbar.html b/layouts/_partials/components/headers/navbar.html
index 1693d55..5293222 100644
--- a/layouts/_partials/components/headers/navbar.html
+++ b/layouts/_partials/components/headers/navbar.html
@@ -120,7 +120,7 @@
           target="_blank" rel="noopener"
           {{ end }}
           href="{{$url}}"
-        >{{ .Name }}</a
+        >{{ if eq (.Params.icon | default "") "github" }}<svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="currentColor" role="img" aria-label="{{ .Name }}"><path d="M12 0c-6.626 0-12 5.373-12 12 0 5.302 3.438 9.8 8.207 11.387.599.111.793-.261.793-.577v-2.234c-3.338.726-4.033-1.416-4.033-1.416-.546-1.387-1.333-1.756-1.333-1.756-1.089-.745.083-.729.083-.729 1.205.084 1.839 1.237 1.839 1.237 1.07 1.834 2.807 1.304 3.492.997.107-.775.418-1.305.762-1.604-2.665-.305-5.467-1.334-5.467-5.931 0-1.311.469-2.381 1.236-3.221-.124-.303-.535-1.524.117-3.176 0 0 1.008-.322 3.301 1.23.957-.266 1.983-.399 3.003-.404 1.02.005 2.047.138 3.006.404 2.291-1.552 3.297-1.23 3.297-1.23.653 1.653.242 2.874.118 3.176.77.84 1.235 1.911 1.235 3.221 0 4.609-2.807 5.624-5.479 5.921.43.372.823 1.102.823 2.222v3.293c0 .319.192.694.801.576 4.765-1.589 8.199-6.086 8.199-11.386 0-6.627-5.373-12-12-12z"/></svg>{{ else }}{{ .Name }}{{ end }}</a
         >
       </li>
       {{ end }}
diff --git a/layouts/_partials/components/language-chooser.html b/layouts/_partials/components/language-chooser.html
new file mode 100644
index 0000000..61c7e5b
--- /dev/null
+++ b/layouts/_partials/components/language-chooser.html
@@ -0,0 +1,20 @@
+{{/*
+  Horizontal tab-style language switcher.
+  Expects a dict with a "page" key. Renders nothing unless the page is translated;
+  otherwise shows the current language as the active tab and one link per translation.
+  NOTE(review): styling relies on the .lang-switcher* classes, presumably defined in
+  assets/css/custom.css; confirm.
+*/}}
+{{ $page := .page }}
+{{- if $page.IsTranslated -}}
+<div class="lang-switcher">
+  <span class="lang-switcher-active">
+    {{- if eq $page.Lang "en" }}EN{{ else }}中文{{ end -}}
+  </span>
+  {{ range $page.Translations }}
+  <a href="{{ .Permalink }}" class="lang-switcher-link">
+    {{- if eq .Lang "en" }}EN{{ else }}中文{{ end -}}
+  </a>
+  {{ end }}
+</div>
+{{- end -}}
diff --git a/layouts/_partials/page_author.html b/layouts/_partials/page_author.html
new file mode 100644
index 0000000..817184e
--- /dev/null
+++ b/layouts/_partials/page_author.html
@@ -0,0 +1 @@
+{{/* Author profile box - intentionally empty to disable author cards site-wide */}}
diff --git a/layouts/shortcodes/list-children.html b/layouts/shortcodes/list-children.html
new file mode 100644
index 0000000..e81fdc2
--- /dev/null
+++ b/layouts/shortcodes/list-children.html
@@ -0,0 +1,17 @@
+{{- /*
+  Renders the current page's child pages as a grid of cards (title plus optional description).
+  Outputs nothing when the page has no children.
+  NOTE(review): styling relies on .list-children-grid / .list-children-card / .card-title /
+  .card-desc, presumably defined in assets/css/custom.css; confirm.
+*/ -}}
+{{ $pages := .Page.Pages }}
+{{ if $pages }}
+<div class="list-children-grid">
+{{ range $pages }}
+  <a href="{{ .RelPermalink }}" class="list-children-card">
+    <span class="card-title">{{ .Title }}</span>
+    {{ with .Description }}<span class="card-desc">{{ . }}</span>{{ end }}
+  </a>
+{{ end }}
+</div>
+{{ end }}
diff --git a/netlify.toml b/netlify.toml
index c4fad08..8cfe92b 100644
--- a/netlify.toml
+++ b/netlify.toml
@@ -10,6 +10,9 @@
     echo "=== Installing dependencies ==="
     pnpm install --verbose --no-frozen-lockfile
     
+    echo "=== Syncing docs from twinkle repo ==="
+    python3 scripts/sync-docs.py
+    
     echo "=== Running Hugo build ==="
     hugo --gc --minify -b $URL --logLevel debug --printI18nWarnings --printPathWarnings
     
@@ -41,6 +44,7 @@
     echo "=== Deploy Preview Build ==="
     echo "Deploy URL: $DEPLOY_PRIME_URL"
     pnpm install --verbose --no-frozen-lockfile
+    python3 scripts/sync-docs.py
     hugo --gc --minify --buildFuture -b $DEPLOY_PRIME_URL --logLevel debug --printI18nWarnings --printPathWarnings
     pnpm run pagefind
   """
@@ -51,6 +55,7 @@
     echo "=== Branch Deploy Build ==="
     echo "Deploy URL: $DEPLOY_PRIME_URL"
     pnpm install --verbose --no-frozen-lockfile
+    python3 scripts/sync-docs.py
     hugo --gc --minify -b $DEPLOY_PRIME_URL --logLevel debug --printI18nWarnings --printPathWarnings
     pnpm run pagefind
   """
diff --git a/package.json b/package.json
index c151b07..750dc2b 100644
--- a/package.json
+++ b/package.json
@@ -5,8 +5,9 @@
   "packageManager": "pnpm@10.14.0",
   "description": "Twinkle technical blog and documentation site powered by Hugo Blox",
   "scripts": {
-    "dev": "hugo server --disableFastRender",
-    "build": "hugo --minify && pnpm run pagefind",
+    "sync-docs": "python3 scripts/sync-docs.py",
+    "dev": "python3 scripts/sync-docs.py && hugo server --disableFastRender",
+    "build": "python3 scripts/sync-docs.py && hugo --minify && pnpm run pagefind",
     "pagefind": "pagefind --site public"
   },
   "dependencies": {
diff --git a/scripts/check-links.py b/scripts/check-links.py
new file mode 100644
index 0000000..0a17386
--- /dev/null
+++ b/scripts/check-links.py
@@ -0,0 +1,147 @@
+#!/usr/bin/env python3
+"""Check for broken internal links in Hugo content files.
+
+Scans all .md files under content/ for Markdown links and verifies:
+1. Internal relative links resolve to existing Hugo pages
+2. External links (http/https) are reachable (optional, off by default)
+
+Usage:
+    python3 scripts/check-links.py [--check-external]
+"""
+
+import re
+import sys
+from pathlib import Path
+from urllib.parse import unquote
+
+# ---------------------------------------------------------------------------
+# Configuration
+# ---------------------------------------------------------------------------
+
+WEB_ROOT = Path(__file__).resolve().parent.parent
+CONTENT_DIR = WEB_ROOT / "content"
+
+# Matches Markdown links [text](url) and [text](url "title"), but not images
+# ![alt](url). Group 1 captures the link text, group 2 the URL.
+LINK_RE = re.compile(r'(?<!!)\[([^\]]*)\]\(([^)" ]+)(?:\s+"[^"]*")?\)')
+
+# Targets starting with any of these prefixes are not resolved on disk
+SKIP_PREFIXES = (
+    "http://", "https://", "mailto:", "#", "{{",
+    "/",  # site-root absolute paths handled by Hugo, not filesystem
+)
+
+
+def resolve_link(md_file: Path, link_target: str) -> tuple[bool, str]:
+    """Decide whether a relative link in ``md_file`` points at real content.
+
+    The target is percent-decoded, its ``#fragment`` is dropped, and the rest
+    is resolved against the directory containing ``md_file``.
+
+    Args:
+        md_file: Markdown file in which the link appears.
+        link_target: Link destination as written in the Markdown source.
+
+    Returns:
+        ``(is_valid, reason)``; ``reason`` names the rule that matched, or
+        carries the resolved path when nothing was found.
+    """
+    # Everything from the first '#' onwards is an in-page anchor.
+    path_part = unquote(link_target).split("#")[0]
+    if not path_part:
+        return True, "anchor-only"
+
+    # Relative targets are interpreted from the linking file's directory.
+    resolved = (md_file.parent / path_part).resolve()
+
+    # Exact hit: the path exists as written (file or directory).
+    if resolved.exists():
+        return True, "file exists"
+
+    # Extension-less targets are retried, in this order, as
+    # <target>/_index.md, <target>.md and <target>.zh.md.
+    if not resolved.suffix:
+        fallbacks = (
+            (resolved / "_index.md", "directory index"),
+            (resolved.with_suffix(".md"), "md file"),
+            (resolved.parent / (resolved.name + ".zh.md"), "zh.md file"),
+        )
+        for candidate, label in fallbacks:
+            if candidate.exists():
+                return True, label
+
+    # A ".md" target also passes when the same path minus its suffix is a
+    # directory.
+    if path_part.endswith(".md") and resolved.with_suffix("").is_dir():
+        return True, "directory"
+
+    return False, f"not found: {resolved}"
+
+
+def check_external_link(url: str) -> tuple[bool, str]:
+    """HEAD-request ``url``; return ``(reachable, reason)`` and close the response."""
+    import urllib.request
+    import urllib.error
+    try:
+        req = urllib.request.Request(url, method='HEAD')
+        req.add_header('User-Agent', 'Mozilla/5.0 (link-checker)')
+        with urllib.request.urlopen(req, timeout=10) as resp:
+            return True, f"HTTP {resp.status}"
+    except urllib.error.HTTPError as e:
+        return False, f"HTTP {e.code}"
+    except Exception as e:
+        return False, str(e)
+
+
+def main():
+    """Scan every ``*.md`` file under ``CONTENT_DIR`` and report broken links.
+
+    Relative links are checked with ``resolve_link``. http(s) links are
+    probed with ``check_external_link`` only when ``--check-external`` is
+    on the command line; all other skipped prefixes are just counted.
+
+    Exits with status 1 if any broken link was found, otherwise 0.
+    """
+    probe_external = "--check-external" in sys.argv
+    failures = []
+    seen = 0
+
+    pages = sorted(CONTENT_DIR.rglob("*.md"))
+    print(f"[check-links] Scanning {len(pages)} markdown files in {CONTENT_DIR}")
+
+    for page in pages:
+        body = page.read_text(encoding="utf-8", errors="replace")
+        for label, href in LINK_RE.findall(body):
+            seen += 1
+            if href.startswith(SKIP_PREFIXES):
+                # Not a filesystem link; only http(s) is probed, and only on request.
+                if not (probe_external and href.startswith(("http://", "https://"))):
+                    continue
+                ok, why = check_external_link(href)
+            else:
+                ok, why = resolve_link(page, href)
+            if not ok:
+                failures.append((page.relative_to(WEB_ROOT), label, href, why))
+
+    # Report
+    print(f"\n[check-links] Total links scanned: {seen}")
+    print(f"[check-links] Broken links found: {len(failures)}")
+
+    if not failures:
+        print("\n[check-links] All internal links are valid! ✓")
+        sys.exit(0)
+
+    rule = "=" * 80
+    print("\n" + rule)
+    print("BROKEN LINKS:")
+    print(rule)
+    for file_path, text, target, reason in failures:
+        print(f"\n  File: {file_path}")
+        print(f"  Link: [{text}]({target})")
+        print(f"  Reason: {reason}")
+    print("\n" + rule)
+    sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/sync-docs.py b/scripts/sync-docs.py
new file mode 100644
index 0000000..0bad67e
--- /dev/null
+++ b/scripts/sync-docs.py
@@ -0,0 +1,550 @@
+#!/usr/bin/env python3
+"""Sync Sphinx docs into Hugo content directory.
+
+Reads docs/source_en and docs/source_zh, converts Markdown files to
+Hugo-compatible format (adds front matter), and writes them into
+twinkle-web/content/docs/usage-guide/ and twinkle-web/content/docs/components/.
+
+Run from twinkle-web/ directory:
+    python3 scripts/sync-docs.py
+
+This script is invoked automatically by `pnpm dev` and `pnpm build`.
+It should NOT be gitignored — only the generated output directories are.
+"""
+
+import os
+import re
+import shutil
+from pathlib import Path
+
+# ---------------------------------------------------------------------------
+# Path setup
+# ---------------------------------------------------------------------------
+# Directory that contains this script, with symlinks resolved.
+SCRIPT_DIR = Path(__file__).resolve().parent
+# Parent of the scripts directory, i.e. the web project root.
+WEB_ROOT = SCRIPT_DIR.parent                       # twinkle-web/
+
+
+def find_docs_root() -> Path:
+    """Locate the twinkle ``docs/`` directory that holds the Sphinx sources.
+
+    twinkle-web may live as a symlink inside the twinkle repo, or as a
+    sibling directory, so several locations are probed. A candidate is
+    accepted only if it exists and contains a ``source_en`` sub-directory.
+
+    Probe order:
+      1. ``TWINKLE_DOCS_DIR`` environment variable -- an explicit setting
+         wins over auto-detection.
+      2. ``twinkle/docs`` next to the (resolved) web root.
+      3. ``docs`` one and two levels above the *unresolved* web root, which
+         covers twinkle-web being a symlink inside the twinkle repo.
+      4. ``docs`` next to the (resolved) web root.
+
+    Returns:
+        Path of the first qualifying docs directory.
+
+    Raises:
+        FileNotFoundError: if no candidate qualifies.
+    """
+    def is_docs_root(path: Path) -> bool:
+        return path.exists() and (path / "source_en").exists()
+
+    # Explicit override first: the caller asked for this exact directory.
+    env_docs = os.environ.get("TWINKLE_DOCS_DIR")
+    if env_docs:
+        env_path = Path(env_docs)
+        if is_docs_root(env_path):
+            return env_path
+        # A wrong override must not be accepted silently: main() would clean
+        # the generated directories and then find nothing to regenerate.
+        print(f"  [WARN] TWINKLE_DOCS_DIR={env_docs!r} does not contain "
+              "source_en/; falling back to auto-detection")
+
+    # .absolute() keeps symlinks intact, unlike the resolved WEB_ROOT.
+    raw_web_root = Path(__file__).absolute().parent.parent
+    candidates = [
+        WEB_ROOT.parent / "twinkle" / "docs",
+        raw_web_root.parent / "docs",
+        raw_web_root.parent.parent / "docs",
+        WEB_ROOT.parent / "docs",
+    ]
+    for candidate in candidates:
+        if is_docs_root(candidate):
+            return candidate
+
+    raise FileNotFoundError(
+        "Cannot locate docs/ directory. Set TWINKLE_DOCS_DIR env variable "
+        "or ensure docs/ is a sibling of twinkle-web/."
+    )
+
+
+# Source locations; they stay None until _init_docs_paths() fills them in.
+DOCS_ROOT: Path | None = None
+DOCS_EN: Path | None = None
+DOCS_ZH: Path | None = None
+# Hugo content directory that receives the generated sections.
+OUT_DIR = WEB_ROOT / "content" / "docs"
+
+
+def _init_docs_paths():
+    """Lazily initialize docs paths (called from main to avoid import-time crash).
+
+    Rebinds the module-level DOCS_ROOT, DOCS_EN and DOCS_ZH. Any
+    FileNotFoundError raised by find_docs_root() propagates to the caller.
+    """
+    global DOCS_ROOT, DOCS_EN, DOCS_ZH
+    DOCS_ROOT = find_docs_root()
+    # English and Chinese sources live side by side under the docs root.
+    DOCS_EN = DOCS_ROOT / "source_en"
+    DOCS_ZH = DOCS_ROOT / "source_zh"
+
+# Sub-directories of OUT_DIR owned by this script. main() deletes each of
+# them before regenerating, so nothing hand-written should live there.
+GENERATED_DIRS = ["usage-guide", "components"]
+
+# ---------------------------------------------------------------------------
+# Mapping: English directory name  ->  Chinese directory name
+# Used to pair English sections with their Chinese counterparts and to
+# supply the Chinese title written into each section's _index.zh.md.
+# ---------------------------------------------------------------------------
+EN_ZH_DIR_MAP = {
+    "Usage Guide":              "使用指引",
+    "Components":               "组件",
+    "Server and Client":        "服务端和客户端",
+    "Dataset":                  "数据集",
+    "Data Format":              "数据格式",
+    "Data Loading":             "数据加载",
+    "Template":                 "模板",
+    "Preprocessor and Filter":  "预处理器和过滤器",
+    "Task Processor":           "任务处理器",
+    "Model":                    "模型",
+    "Sampler":                  "采样器",
+    "Reward":                   "奖励",
+    "Advantage":                "优势",
+    "Gym":                      "Gym",
+    "Hub":                      "Hub",
+    "Checkpoint Engine":        "检查点引擎",
+    "Metrics":                  "指标",
+    "Loss":                     "损失",
+    "Loss Scale":               "损失缩放",
+    "LRScheduler":              "LRScheduler",
+    "Patch":                    "补丁",
+    "Plugin":                   "组件化",
+    "Kernel":                   "Kernel",
+    "Training Middleware":       "训练中间件",
+    "CLI":                      "CLI",
+    "Notifier":                 "通知器",
+    "Agentic":                  "Agentic",
+    "TUI":                      "TUI",
+}
+
+# Chinese directory title -> English directory title (for display).
+# Built by inverting EN_ZH_DIR_MAP.
+ZH_SECTION_TITLES = {v: k for k, v in EN_ZH_DIR_MAP.items()}
+
+# ---------------------------------------------------------------------------
+# Title overrides: slug -> (en_title, zh_title)
+# The key is the output path relative to OUT_DIR, without the .md suffix.
+# Prevents sync from overwriting manually chosen titles.
+# ---------------------------------------------------------------------------
+TITLE_OVERRIDES = {
+    "usage-guide/quick-start": ("Training Guide", "训练指南"),
+}
+
+# ---------------------------------------------------------------------------
+# Hugo slug helpers
+# ---------------------------------------------------------------------------
+
+def slugify(name: str) -> str:
+    """Turn a directory or file name into a lowercase, hyphenated Hugo slug.
+
+    Only ``a-z``, ``0-9``, ``-`` and ``.`` survive; a name made up entirely
+    of other characters (for example CJK text) yields an empty string.
+    """
+    substitutions = (
+        (r"[_ ]+", "-"),          # runs of spaces / underscores -> one hyphen
+        (r"[^a-z0-9\-.]", ""),    # drop every other character
+        (r"-{2,}", "-"),          # collapse repeated hyphens
+    )
+    slug = name.lower().strip()
+    for pattern, replacement in substitutions:
+        slug = re.sub(pattern, replacement, slug)
+    return slug.strip("-")
+
+
+def extract_title_from_md(content: str) -> str:
+    """Return the text of the first H1 heading in Markdown content, or ''.
+
+    Lines inside ``` fenced code blocks are ignored, so a ``# comment`` in a
+    code sample is never mistaken for the page title. This mirrors the fence
+    handling in strip_first_heading(), keeping the extracted title and the
+    stripped heading the same line.
+    """
+    in_code_block = False
+    for line in content.splitlines():
+        if line.strip().startswith("```"):
+            in_code_block = not in_code_block
+            continue
+        if in_code_block:
+            continue
+        m = re.match(r"^#\s+(.+)", line)
+        if m:
+            return m.group(1).strip()
+    return ""
+
+
+def strip_first_heading(content: str) -> str:
+    """Drop the first H1 heading, since Hugo renders the front-matter title.
+
+    Headings inside ``` fenced code blocks are left alone. When the line
+    right after the removed heading is blank, it is removed as well.
+    Content without a qualifying heading is returned unchanged.
+    """
+    pieces = content.splitlines(keepends=True)
+    fenced = False
+    for idx, piece in enumerate(pieces):
+        if piece.strip().startswith("```"):
+            fenced = not fenced
+        elif not fenced and re.match(r"^#\s+", piece):
+            resume = idx + 1
+            # Swallow a single blank separator line after the heading.
+            if resume < len(pieces) and not pieces[resume].strip():
+                resume += 1
+            del pieces[idx:resume]
+            return "".join(pieces)
+    return content
+
+
+# ---------------------------------------------------------------------------
+# Link rewriting
+# ---------------------------------------------------------------------------
+
+# Mapping: broken relative link target -> corrected target.
+# Keys are compared by exact string equality against the link target and
+# against its URL-decoded form; no pattern matching is performed.
+LINK_REWRITES = {
+    # Chinese paths -> Hugo slugified paths
+    "./服务端和客户端/Twinkle客户端.md": "../server-and-client/twinkle-client/",
+    "../服务端和客户端/服务端.md": "../server-and-client/server/",
+    "./可观测化.md": "../observability/",
+    "./训练服务.md": "../../guide/taas/",
+    # English paths with spaces/wrong case -> slugified
+    "./Server%20and%20Client/Twinkle-Client.md": "../server-and-client/twinkle-client/",
+    "./Server and Client/Twinkle-Client.md": "../server-and-client/twinkle-client/",
+    "../Server%20and%20Client/Server.md": "../server-and-client/server/",
+    "../Server and Client/Server.md": "../server-and-client/server/",
+    "./Train-as-a-Service.md": "../../guide/taas/",
+    "./Observability.md": "../observability/",
+    # Cookbook source file -> GitHub URL
+    "../../../cookbook/server_mode/twinkle/self_host/self_cognition.py":
+        "https://github.com/modelscope/twinkle/blob/main/cookbook/client/twinkle/self_host/self_cognition.py",
+}
+
+
+def rewrite_links(content: str) -> str:
+    """Rewrite broken internal Markdown links to Hugo-style paths.
+
+    For every inline link ``[text](target)``:
+
+    1. If the target (or its URL-decoded form) is a key of LINK_REWRITES,
+       the mapped value is used.
+    2. Otherwise, a non-HTTP target ending in ``.md`` is slugified:
+       ``Dir/File.md`` -> ``dir/file/``. ``.`` and ``..`` segments are kept
+       as they are. Slugification works on the URL-decoded path, so
+       ``Server%20and%20Client`` becomes ``server-and-client``.
+    3. If the target carries a ``#fragment`` and is not matched as a whole,
+       rules 1-2 are applied to the path part and the fragment is
+       re-attached unchanged.
+
+    Anything else is returned untouched.
+    """
+    # Function-scope import: resolved once per call instead of once per link.
+    from urllib.parse import unquote
+
+    def rewrite_path(path: str):
+        """Return the rewritten path, or None when no rule applies."""
+        if path in LINK_REWRITES:
+            return LINK_REWRITES[path]
+        decoded = unquote(path)
+        if decoded in LINK_REWRITES:
+            return LINK_REWRITES[decoded]
+        if decoded.endswith(".md") and not decoded.startswith(("http://", "https://")):
+            *dirs, filename = decoded.split("/")
+            slug = slugify(filename[:-3])  # strip .md
+            if not dirs:
+                return f"{slug}/"
+            # Keep relative markers, slugify real directory names.
+            prefix = "/".join(p if p in (".", "..") else slugify(p) for p in dirs)
+            return f"{prefix}/{slug}/"
+        return None
+
+    def replace_link(match):
+        text = match.group(1)
+        target = match.group(2)
+
+        rewritten = rewrite_path(target)
+        if rewritten is not None:
+            return f"[{text}]({rewritten})"
+
+        # Split on the raw target so an encoded '%23' is not taken as '#'.
+        path, has_fragment, fragment = target.partition("#")
+        if has_fragment and path:
+            rewritten = rewrite_path(path)
+            if rewritten is not None:
+                return f"[{text}]({rewritten}#{fragment})"
+
+        return match.group(0)
+
+    return re.sub(r'\[([^\]]*)\]\(([^)]+)\)', replace_link, content)
+
+def parse_toctree(rst_path: Path) -> tuple[str, list[str]]:
+    """Read an index.rst and return ``(section_title, entry_paths)``.
+
+    The title is the first non-empty line that is neither a directive
+    (``..``), an option (``:``), a bullet (``*``) nor an ``=====`` underline;
+    it is ``""`` when no such line exists. ``entry_paths`` holds the raw
+    values of every toctree directive in file order, e.g. "Dataset.md" or
+    "Server and Client/index.rst".
+    """
+    stripped_lines = [
+        raw.strip()
+        for raw in rst_path.read_text(encoding="utf-8").splitlines()
+    ]
+
+    section_title = next(
+        (
+            s for s in stripped_lines
+            if s
+            and not s.startswith(("..", ":", "*"))
+            and "=====" not in s
+        ),
+        "",
+    )
+
+    entry_paths: list[str] = []
+    collecting = False    # currently inside a toctree body
+    seen_entry = False    # at least one entry read in the current toctree
+    for s in stripped_lines:
+        if ".. toctree::" in s:
+            collecting, seen_entry = True, False
+        elif not collecting or s.startswith(":"):
+            # Outside any toctree, or an option line such as ":maxdepth:".
+            continue
+        elif s:
+            entry_paths.append(s)
+            seen_entry = True
+        elif seen_entry:
+            # A blank line after the entries closes the directive; blank
+            # lines before the first entry are just skipped.
+            collecting = False
+
+    return section_title, entry_paths
+
+
+# ---------------------------------------------------------------------------
+# Core sync logic
+# ---------------------------------------------------------------------------
+
+def add_front_matter(content: str, title: str, weight: int,
+                     extra: dict | None = None) -> str:
+    """Prepend Hugo YAML front matter to Markdown content.
+
+    Args:
+        content: Markdown body; appended right after the closing ``---`` line.
+        title: Page title, written as a YAML double-quoted scalar.
+        weight: Hugo ordering weight.
+        extra: Optional additional keys. Booleans are written as
+            ``true``/``false``, strings as double-quoted scalars, everything
+            else via ``str()``.
+
+    Returns:
+        The front-matter block followed by ``content``.
+    """
+    def yaml_quote(value: str) -> str:
+        # Backslash must be escaped first; otherwise the backslashes added
+        # for the quotes would be doubled, and a literal "\n" in a title
+        # would be read by YAML as an escape sequence.
+        escaped = value.replace("\\", "\\\\").replace('"', '\\"')
+        return f'"{escaped}"'
+
+    fm_lines = ["---", f"title: {yaml_quote(title)}", f"weight: {weight}"]
+    for key, value in (extra or {}).items():
+        # bool is tested before the generic fallback so True does not
+        # render as Python's "True".
+        if isinstance(value, bool):
+            fm_lines.append(f"{key}: {'true' if value else 'false'}")
+        elif isinstance(value, str):
+            fm_lines.append(f"{key}: {yaml_quote(value)}")
+        else:
+            fm_lines.append(f"{key}: {value}")
+    # The trailing "" makes the join end with a newline after the closing ---.
+    fm_lines += ["---", ""]
+    return "\n".join(fm_lines) + content
+
+
+def write_index(out_dir: Path, title: str, weight: int,
+                subtitle: str = "", zh_title: str = "",
+                sidebar_open: bool = True):
+    """Write _index.md and _index.zh.md for a Hugo section.
+
+    Args:
+        out_dir: Section directory; created (with parents) if missing.
+        title: English section title.
+        weight: Hugo ordering weight, shared by both language files.
+        subtitle: Optional text placed between the front matter and the
+            ``list-children`` shortcode, in both files.
+        zh_title: Chinese section title; falls back to ``title`` when empty.
+        sidebar_open: When true, emits ``sidebar: {open: true}`` in the
+            front matter.
+    """
+    out_dir.mkdir(parents=True, exist_ok=True)
+
+    body = f"\n{subtitle}\n" if subtitle else "\n"
+
+    def render(page_title: str) -> str:
+        # Escape backslashes and quotes so the title stays a valid YAML
+        # double-quoted scalar.
+        safe_title = page_title.replace("\\", "\\\\").replace('"', '\\"')
+        lines = ["---", f'title: "{safe_title}"', f"weight: {weight}"]
+        if sidebar_open:
+            lines += ["sidebar:", "  open: true"]
+        # The shortcode lists the section's child pages.
+        lines += ["---", "", body, "{{< list-children >}}", ""]
+        return "\n".join(lines)
+
+    # Both language variants share one layout; only the title differs.
+    (out_dir / "_index.md").write_text(render(title), encoding="utf-8")
+    (out_dir / "_index.zh.md").write_text(render(zh_title or title),
+                                          encoding="utf-8")
+
+
+def sync_md_file(src_en: Path, src_zh: Path | None,
+                 dest_dir: Path, weight: int):
+    """Convert one English .md (and its Chinese counterpart) into dest_dir.
+
+    Each file has its first H1 removed, its links rewritten and Hugo front
+    matter prepended. The English page is written as ``<slug>.md`` and the
+    Chinese page, when ``src_zh`` exists, as ``<slug>.zh.md``; both slugs
+    derive from the English file name.
+
+    Args:
+        src_en: English source file.
+        src_zh: Chinese source file, or None when there is no translation.
+        dest_dir: Output directory under OUT_DIR; created if missing.
+        weight: Hugo ordering weight used for both language versions.
+    """
+    dest_dir.mkdir(parents=True, exist_ok=True)
+
+    content_en = src_en.read_text(encoding="utf-8")
+    # Fall back to the file name when the document has no H1.
+    title = extract_title_from_md(content_en) or src_en.stem
+
+    page_slug = slugify(src_en.stem)
+
+    # Title overrides are keyed by the path under OUT_DIR without .md.
+    # as_posix() keeps the key forward-slashed on every platform; str() of a
+    # Path uses backslashes on Windows and would never match.
+    override_key = (dest_dir.relative_to(OUT_DIR) / page_slug).as_posix()
+    override = TITLE_OVERRIDES.get(override_key)
+    if override is not None:
+        title = override[0]
+
+    # Strip the first heading since Hugo uses the front-matter title.
+    body = rewrite_links(strip_first_heading(content_en))
+    out_en = add_front_matter(body, title, weight)
+    (dest_dir / (page_slug + ".md")).write_text(out_en, encoding="utf-8")
+
+    # Chinese version
+    if src_zh and src_zh.exists():
+        content_zh = src_zh.read_text(encoding="utf-8")
+        # Without a Chinese H1 the (possibly overridden) English title is used.
+        title_zh = extract_title_from_md(content_zh) or title
+        if override is not None:
+            title_zh = override[1]
+        body_zh = rewrite_links(strip_first_heading(content_zh))
+        out_zh = add_front_matter(body_zh, title_zh, weight)
+        (dest_dir / (page_slug + ".zh.md")).write_text(out_zh, encoding="utf-8")
+
+
+def build_en_zh_file_map() -> dict:
+    """Map every English source file to its Chinese counterpart.
+
+    Starts from the two root index.rst files, groups their toctree entries
+    by top-level directory, pairs English and Chinese groups through
+    EN_ZH_DIR_MAP and then matches entries by position. Paired
+    ``index.rst`` entries are followed recursively.
+
+    Returns: {Path(en_md_file): Path(zh_md_file)}; empty when either root
+    index.rst is missing.
+    """
+    mapping: dict[Path, Path] = {}
+
+    en_root_index = DOCS_EN / "index.rst"
+    zh_root_index = DOCS_ZH / "index.rst"
+    if not (en_root_index.exists() and zh_root_index.exists()):
+        return mapping
+
+    def entries_by_section(index_path: Path) -> dict[str, list[str]]:
+        # Bucket toctree entries by their first path component.
+        buckets: dict[str, list[str]] = {}
+        for entry in parse_toctree(index_path)[1]:
+            buckets.setdefault(entry.split("/", 1)[0], []).append(entry)
+        return buckets
+
+    en_groups = entries_by_section(en_root_index)
+    zh_groups = entries_by_section(zh_root_index)
+
+    for en_section, en_list in en_groups.items():
+        zh_section = EN_ZH_DIR_MAP.get(en_section)
+        if zh_section and zh_section in zh_groups:
+            zh_list = zh_groups[zh_section]
+            if len(en_list) != len(zh_list):
+                print(f"  [WARN] EN/ZH entry count mismatch in '{en_section}': "
+                      f"EN={len(en_list)}, ZH={len(zh_list)}")
+            # Positional pairing; zip() stops at the shorter list.
+            for en_entry, zh_entry in zip(en_list, zh_list):
+                pair = (en_entry, zh_entry)
+                en_path = DOCS_EN / en_entry
+                zh_path = DOCS_ZH / zh_entry
+                if all(e.endswith(".md") for e in pair):
+                    if en_path.exists() and zh_path.exists():
+                        mapping[en_path] = zh_path
+                elif all(e.endswith("index.rst") for e in pair):
+                    # Descend into the matched sub-section.
+                    _recurse_section_map(en_path, zh_path, mapping)
+
+    return mapping
+
+
+def _recurse_section_map(en_index: Path, zh_index: Path,
+                          mapping: dict[Path, Path]):
+    """Add the file pairs of one matched EN/ZH sub-section to ``mapping``.
+
+    Entries of the two index.rst files are paired by position; paired
+    ``index.rst`` entries are followed recursively. Does nothing when
+    either index file is missing.
+    """
+    if not (en_index.exists() and zh_index.exists()):
+        return
+
+    en_entries = parse_toctree(en_index)[1]
+    zh_entries = parse_toctree(zh_index)[1]
+    if len(en_entries) != len(zh_entries):
+        print(f"  [WARN] EN/ZH sub-section mismatch in '{en_index.parent.name}': "
+              f"EN={len(en_entries)}, ZH={len(zh_entries)}")
+
+    # Entries are relative to the directory that holds each index.rst.
+    for en_entry, zh_entry in zip(en_entries, zh_entries):
+        pair = (en_entry, zh_entry)
+        en_path = en_index.parent / en_entry
+        zh_path = zh_index.parent / zh_entry
+        if all(e.endswith(".md") for e in pair):
+            if en_path.exists() and zh_path.exists():
+                mapping[en_path] = zh_path
+        elif all(e.endswith("index.rst") for e in pair):
+            _recurse_section_map(en_path, zh_path, mapping)
+
+
+def process_section(en_section_dir: Path, out_base: Path,
+                    section_weight: int, en_zh_map: dict):
+    """Generate the Hugo section for one English source directory.
+
+    Writes the section's index files, then walks its toctree: ``.md``
+    entries are converted in place and ``index.rst`` entries are processed
+    recursively as nested sections. An entry's 1-based toctree position
+    becomes its weight. Directories without an index.rst are ignored.
+    """
+    index_rst = en_section_dir / "index.rst"
+    if not index_rst.exists():
+        return
+
+    section_name = en_section_dir.name
+    out_dir = out_base / slugify(section_name)
+
+    rst_title, entries = parse_toctree(index_rst)
+    write_index(out_dir, rst_title or section_name, section_weight,
+                zh_title=EN_ZH_DIR_MAP.get(section_name, ""))
+
+    # Every entry consumes a weight slot, including ones that are skipped.
+    for position, entry in enumerate(entries, start=1):
+        source = en_section_dir / entry
+        if entry.endswith("index.rst"):
+            # Nested sub-section
+            process_section(source.parent, out_dir, position, en_zh_map)
+        elif entry.endswith(".md") and source.exists():
+            sync_md_file(source, en_zh_map.get(source), out_dir, position)
+
+
+def process_top_level_section(section_name: str, en_base: Path,
+                              out_base: Path, section_weight: int,
+                              en_zh_map: dict):
+    """Process a top-level section (Usage Guide or Components).
+
+    The section's children are the entries of ``<en_base>/index.rst`` that
+    start with ``"<section_name>/"``. ``.md`` entries are converted
+    directly; ``index.rst`` entries are handed to process_section().
+
+    Args:
+        section_name: English directory name of the section.
+        en_base: English docs root holding the root index.rst and the
+            section directory.
+        out_base: Directory under which the slugified section is created.
+        section_weight: Hugo weight of the section itself.
+        en_zh_map: English path -> Chinese path mapping.
+    """
+    en_dir = en_base / section_name
+    if not en_dir.exists():
+        print(f"  [SKIP] {en_dir} does not exist")
+        return
+
+    out_dir = out_base / slugify(section_name)
+    zh_name = EN_ZH_DIR_MAP.get(section_name, section_name)
+
+    # Read the root index from en_base so the function honours its own
+    # argument instead of the module-level DOCS_EN.
+    _, top_entries = parse_toctree(en_base / "index.rst")
+    section_entries = [e for e in top_entries
+                       if e.startswith(f"{section_name}/")]
+
+    write_index(out_dir, section_name, section_weight,
+                zh_title=zh_name, sidebar_open=True)
+
+    # Every entry consumes a weight slot, including ones that are skipped.
+    for sub_weight, entry in enumerate(section_entries, start=1):
+        entry_path = en_base / entry
+        if entry.endswith("index.rst"):
+            # It's a sub-section directory
+            process_section(entry_path.parent, out_dir, sub_weight, en_zh_map)
+        elif entry.endswith(".md") and entry_path.exists():
+            sync_md_file(entry_path, en_zh_map.get(entry_path),
+                         out_dir, sub_weight)
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+
+def main():
+    """Regenerate the Hugo docs sections from the Sphinx sources.
+
+    Steps: locate the docs sources, build the EN -> ZH file mapping, delete
+    the previously generated directories, regenerate "Usage Guide" and
+    "Components", then report how many Markdown files were produced.
+    """
+    _init_docs_paths()
+    print(f"[sync-docs] Syncing docs from {DOCS_EN} and {DOCS_ZH}")
+    print(f"[sync-docs] Output: {OUT_DIR}")
+
+    # Build EN -> ZH file mapping up-front
+    en_zh_map = build_en_zh_file_map()
+    print(f"  [MAP] Built EN->ZH mapping: {len(en_zh_map)} entries")
+
+    # Clean generated directories
+    for d in GENERATED_DIRS:
+        target = OUT_DIR / d
+        if target.exists():
+            shutil.rmtree(target)
+            print(f"  [CLEAN] {target}")
+
+    # Process Usage Guide (weight=10, after getting-started at weight=1)
+    process_top_level_section("Usage Guide", DOCS_EN,
+                              OUT_DIR, section_weight=10,
+                              en_zh_map=en_zh_map)
+
+    # Process Components (weight=20)
+    process_top_level_section("Components", DOCS_EN,
+                              OUT_DIR, section_weight=20,
+                              en_zh_map=en_zh_map)
+
+    # Count output files. Walk each generated directory rather than testing
+    # for a substring of the absolute path, which would also count
+    # hand-written pages whenever some path component happens to contain
+    # "components" or "usage-guide".
+    total = 0
+    for d in GENERATED_DIRS:
+        generated = OUT_DIR / d
+        if generated.is_dir():
+            total += sum(1 for _ in generated.rglob("*.md"))
+    print(f"[sync-docs] Done! Generated {total} files.")
+
+
+if __name__ == "__main__":
+    main()