fix trailer V2 quality: portrait aspect ratio, inference params, ncnn upscale

charliewwdev · claude · charliewwdev · commit 5a88d7fa1b55 · 2026-02-11T15:42:09.000+03:00
- Fix character stretching: change portrait generation from 480x832 to
  832x480 so portraits match the landscape I2V shots, and add cinematic
  wide-angle composition cues to the portrait prompts
- Increase I2V inference steps 30 → 50 and reduce guidance_scale 6.0 → 5.0
  to match Wan 2.2 TI2V-5B defaults for better quality
- Add realesrgan-ncnn-vulkan backend fallback to VideoUpscaler for when
  Python realesrgan/basicsr won't compile (e.g. Python 3.14)
- Update .gitignore for tools/ dir and new scripts

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/.gitignore b/.gitignore
@@ -16,12 +16,16 @@ build/
 
 scripts/*
 !scripts/animate.py
+!scripts/story.py
+!scripts/produce_trailer_v2.py
 !scripts/__init__.py
 
 *.ipynb
 *.safetensors
 *.ckpt
 
+tools/
+
 models/*
 !models/StableDiffusion/
 models/StableDiffusion/*
diff --git a/animatediff/postprocess/upscale.py b/animatediff/postprocess/upscale.py
@@ -6,24 +6,66 @@
 - RealESRGAN_x4plus_anime_6B: 17MB, anime images
 - RealESRGAN_x4plus: 64MB, general purpose
 
-MPS/Apple Silicon: works with half=False and tiling.
+Backends:
+- python: Uses realesrgan pip package (requires basicsr, may not compile on Python 3.14)
+- ncnn:   Uses realesrgan-ncnn-vulkan binary (standalone, no Python deps)
+- auto:   Tries python first, falls back to ncnn
+
+MPS/Apple Silicon: Python backend uses half=False + tiling. ncnn uses Vulkan GPU natively.
 """
 
 import logging
-from typing import List, Optional, Literal
+import os
+import shutil
+import subprocess
+import tempfile
+from pathlib import Path
+from typing import List, Literal
 
 import numpy as np
 from PIL import Image
 
 logger = logging.getLogger(__name__)
 
-# Model download URLs
+# Model download URLs (Python backend)
 MODEL_URLS = {
     "animevideov3": "https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.5.0/realesr-animevideov3.pth",
     "anime_6B": "https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.2.4/RealESRGAN_x4plus_anime_6B.pth",
     "general": "https://github.com/xinntao/Real-ESRGAN/releases/download/v0.1.0/RealESRGAN_x4plus.pth",
 }
 
+# ncnn model name mapping
+NCNN_MODEL_NAMES = {
+    "animevideov3": "realesr-animevideov3",
+    "anime_6B": "realesrgan-x4plus-anime",
+    "general": "realesrgan-x4plus",
+}
+
+# Default ncnn binary location (relative to project root)
+PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent
+NCNN_DIR = PROJECT_ROOT / "tools" / "realesrgan-ncnn-vulkan"
+
+
+def _find_ncnn_binary() -> str | None:
+    """Find realesrgan-ncnn-vulkan binary."""
+    # Check project tools/ directory
+    candidate = NCNN_DIR / "realesrgan-ncnn-vulkan"
+    if candidate.is_file() and os.access(str(candidate), os.X_OK):
+        return str(candidate)
+    # Check PATH
+    which = shutil.which("realesrgan-ncnn-vulkan")
+    if which:
+        return which
+    return None
+
+
+def _find_ncnn_models_dir() -> str | None:
+    """Find ncnn models directory."""
+    candidate = NCNN_DIR / "models"
+    if candidate.is_dir():
+        return str(candidate)
+    return None
+
 
 class VideoUpscaler:
     """Upscale video frames using Real-ESRGAN."""
@@ -34,25 +76,50 @@ def __init__(
         scale: int = 2,
         tile: int = 0,
         device: str = "cpu",
+        backend: Literal["auto", "python", "ncnn"] = "auto",
     ):
         self.model_name = model_name
         self.scale = scale
         self.tile = tile
         self.device = device
+        self.backend = backend
         self._upsampler = None
+        self._resolved_backend = None
+
+    def _resolve_backend(self) -> str:
+        """Determine which backend to use."""
+        if self._resolved_backend:
+            return self._resolved_backend
+
+        if self.backend == "python":
+            self._resolved_backend = "python"
+        elif self.backend == "ncnn":
+            self._resolved_backend = "ncnn"
+        else:  # auto
+            try:
+                import realesrgan  # noqa: F401
+                import basicsr  # noqa: F401
+                self._resolved_backend = "python"
+                logger.info("Upscale backend: python (realesrgan pip package)")
+            except ImportError:
+                if _find_ncnn_binary():
+                    self._resolved_backend = "ncnn"
+                    logger.info("Upscale backend: ncnn-vulkan (Python realesrgan unavailable)")
+                else:
+                    raise ImportError(
+                        "No upscale backend available. Either:\n"
+                        "  1. pip install realesrgan (requires basicsr)\n"
+                        "  2. Place realesrgan-ncnn-vulkan binary in tools/realesrgan-ncnn-vulkan/"
+                    )
+        return self._resolved_backend
 
     def _ensure_loaded(self):
-        """Lazy-load the upscaler model."""
+        """Lazy-load the Python upscaler model."""
         if self._upsampler is not None:
             return
 
-        try:
-            from realesrgan import RealESRGANer
-            from basicsr.utils.download_util import load_file_from_url
-        except ImportError:
-            raise ImportError(
-                "Real-ESRGAN not installed. Install with: pip install realesrgan"
-            )
+        from realesrgan import RealESRGANer
+        from basicsr.utils.download_util import load_file_from_url
 
         model_url = MODEL_URLS.get(self.model_name, MODEL_URLS["animevideov3"])
         model_path = load_file_from_url(url=model_url, model_dir="weights/realesrgan", progress=True)
@@ -110,26 +177,91 @@ def _ensure_loaded(self):
 
         logger.info(f"Loaded Real-ESRGAN model: {self.model_name} (device={self.device}, tile={tile})")
 
-    def upscale_frame(self, frame: Image.Image) -> Image.Image:
-        """Upscale a single frame."""
+    def _upscale_frame_python(self, frame: Image.Image) -> Image.Image:
+        """Upscale a single frame using Python backend."""
         self._ensure_loaded()
         import cv2
 
         img = np.array(frame)
-        # Real-ESRGAN expects BGR
         img_bgr = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
-
         output, _ = self._upsampler.enhance(img_bgr, outscale=self.scale)
-
         output_rgb = cv2.cvtColor(output, cv2.COLOR_BGR2RGB)
         return Image.fromarray(output_rgb)
 
+    def _upscale_frames_ncnn(self, frames: List[Image.Image]) -> List[Image.Image]:
+        """Upscale frames using realesrgan-ncnn-vulkan binary (batch directory mode)."""
+        binary = _find_ncnn_binary()
+        if not binary:
+            raise RuntimeError("realesrgan-ncnn-vulkan binary not found")
+
+        models_dir = _find_ncnn_models_dir()
+        if not models_dir:
+            raise RuntimeError(
+                f"ncnn models not found. Expected at: {NCNN_DIR / 'models'}"
+            )
+
+        ncnn_model_name = NCNN_MODEL_NAMES.get(self.model_name, "realesr-animevideov3")
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            in_dir = os.path.join(tmpdir, "input")
+            out_dir = os.path.join(tmpdir, "output")
+            os.makedirs(in_dir)
+            os.makedirs(out_dir)
+
+            # Save input frames as PNG
+            for i, frame in enumerate(frames):
+                frame.save(os.path.join(in_dir, f"{i:06d}.png"))
+
+            # Run ncnn binary
+            cmd = [
+                binary,
+                "-i", in_dir,
+                "-o", out_dir,
+                "-s", str(self.scale),
+                "-n", ncnn_model_name,
+                "-m", models_dir,
+                "-f", "png",
+            ]
+            logger.info(f"Running ncnn upscale: {' '.join(cmd)}")
+
+            result = subprocess.run(cmd, capture_output=True, text=True, timeout=3600)
+            if result.returncode != 0:
+                raise RuntimeError(
+                    f"realesrgan-ncnn-vulkan failed (exit {result.returncode}): "
+                    f"{result.stderr[:500]}"
+                )
+
+            # Load output frames
+            upscaled = []
+            for i in range(len(frames)):
+                out_path = os.path.join(out_dir, f"{i:06d}.png")
+                if not os.path.exists(out_path):
+                    raise RuntimeError(f"ncnn output missing: {out_path}")
+                upscaled.append(Image.open(out_path).convert("RGB"))
+
+            return upscaled
+
+    def upscale_frame(self, frame: Image.Image) -> Image.Image:
+        """Upscale a single frame."""
+        backend = self._resolve_backend()
+        if backend == "python":
+            return self._upscale_frame_python(frame)
+        else:
+            # ncnn: batch of 1
+            return self._upscale_frames_ncnn([frame])[0]
+
     def upscale_frames(self, frames: List[Image.Image]) -> List[Image.Image]:
         """Upscale all frames in a video."""
-        logger.info(f"Upscaling {len(frames)} frames with {self.model_name} ({self.scale}x)")
+        backend = self._resolve_backend()
+        logger.info(f"Upscaling {len(frames)} frames with {self.model_name} ({self.scale}x, backend={backend})")
+
+        if backend == "ncnn":
+            return self._upscale_frames_ncnn(frames)
+
+        # Python backend: frame-by-frame
         result = []
         for i, frame in enumerate(frames):
-            result.append(self.upscale_frame(frame))
+            result.append(self._upscale_frame_python(frame))
             if (i + 1) % 10 == 0:
                 logger.info(f"  Upscaled {i+1}/{len(frames)} frames")
         return result
diff --git a/examples/fanren_trailer_v2.json b/examples/fanren_trailer_v2.json
@@ -23,28 +23,28 @@
     "韩立_early": {
       "description": "young thin boy age 14, simple brown cloth, messy black hair, poor but determined dark eyes, thin face",
       "image_path": null,
-      "portrait_prompt": "portrait of a young thin Chinese boy age 14, simple ragged brown cloth, messy black hair, poor but determined dark eyes, thin face, anime style, xianxia, high detail, upper body, looking at camera",
+      "portrait_prompt": "cinematic wide angle portrait of a young thin Chinese boy age 14, simple ragged brown cloth, messy black hair, poor but determined dark eyes, thin face, anime style, xianxia, high detail, upper body centered in frame, landscape composition, looking at camera",
       "lora_path": null,
       "lora_scale": 1.0
     },
     "韩立_late": {
       "description": "young male cultivator age 16, grey disciple robes, hair tied in topknot, determined gaze, faint golden aura",
       "image_path": null,
-      "portrait_prompt": "portrait of a young Chinese male cultivator age 16, grey disciple robes with dark trim, hair tied in topknot, determined gaze, faint golden spiritual aura, anime style, xianxia, high detail, upper body, looking at camera",
+      "portrait_prompt": "cinematic wide angle portrait of a young Chinese male cultivator age 16, grey disciple robes with dark trim, hair tied in topknot, determined gaze, faint golden spiritual aura, anime style, xianxia, high detail, upper body centered in frame, landscape composition, looking at camera",
       "lora_path": null,
       "lora_scale": 1.0
     },
     "墨大夫": {
       "description": "sinister old man with long white beard, dark green robes, sharp cunning eyes, hunched posture, alchemist",
       "image_path": null,
-      "portrait_prompt": "portrait of a sinister old Chinese man alchemist, long white beard, dark green robes, sharp cunning eyes, hunched posture, dim candlelit background, anime style, xianxia, high detail, upper body, looking at camera",
+      "portrait_prompt": "cinematic wide angle portrait of a sinister old Chinese man alchemist, long white beard, dark green robes, sharp cunning eyes, hunched posture, dim candlelit background, anime style, xianxia, high detail, upper body centered in frame, landscape composition, looking at camera",
       "lora_path": null,
       "lora_scale": 1.0
     },
     "厉飞雨": {
       "description": "handsome young man in grey disciple robes, short black hair, warm smile, loyal friend, carrying a sword",
       "image_path": null,
-      "portrait_prompt": "portrait of a handsome young Chinese man in grey disciple robes, short black hair, warm friendly smile, sword on back, anime style, xianxia, high detail, upper body, looking at camera",
+      "portrait_prompt": "cinematic wide angle portrait of a handsome young Chinese man in grey disciple robes, short black hair, warm friendly smile, sword on back, anime style, xianxia, high detail, upper body centered in frame, landscape composition, looking at camera",
       "lora_path": null,
       "lora_scale": 1.0
     }
diff --git a/scripts/produce_trailer_v2.py b/scripts/produce_trailer_v2.py
@@ -288,8 +288,8 @@ def phase2_generate_on_gpu():
             output = pipeline.generate(
                 prompt=prompt,
                 negative_prompt="blurry, low quality, distorted, deformed, ugly, watermark, multiple people",
-                width=480,
-                height=832,
+                width=832,
+                height=480,
                 num_frames=17,
                 num_inference_steps=30,
                 guidance_scale=6.0,
@@ -361,10 +361,10 @@ def phase2_generate_on_gpu():
                 print(f"  Shot {i:2d}: I2V with ref={char_name}")
             else:
                 print(f"  Shot {i:2d}: ⚠️ No portrait for {char_name}, using black frame")
-                ref_image = Image.new("RGB", (480, 832), (0, 0, 0))
+                ref_image = Image.new("RGB", (832, 480), (0, 0, 0))
         else:
             print(f"  Shot {i:2d}: No characters, using black frame")
-            ref_image = Image.new("RGB", (480, 832), (0, 0, 0))
+            ref_image = Image.new("RGB", (832, 480), (0, 0, 0))
 
         num_frames = shot.get("num_frames", 0)
         if num_frames == 0:
@@ -382,8 +382,8 @@ def phase2_generate_on_gpu():
             width=832,
             height=480,
             num_frames=num_frames,
-            num_inference_steps=30,
-            guidance_scale=6.0,
+            num_inference_steps=50,
+            guidance_scale=5.0,
             seed=shot.get("seed", -1),
             image=ref_image,
         )