Skip to content

Commit bc5c4f0

Browse files
committed
feat!: next data를 처리하는 방식 및 API 변경
1 parent 1e8da3e commit bc5c4f0

4 files changed

Lines changed: 89 additions & 88 deletions

File tree

src/httpc/__main__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -182,7 +182,7 @@ def _handle_next_data(args) -> None:
182182
from httpc import ParseTool
183183

184184
console = Console()
185-
text = ParseTool(text).extract_next_data()
185+
text = ParseTool(text)._extract_next_data()
186186

187187
if not args:
188188
for item in text:
@@ -208,7 +208,7 @@ def _handle_next_data(args) -> None:
208208
if not args.include_prefixed and item.prefix:
209209
continue
210210
data_raw = json.dumps(item.value, ensure_ascii=False)
211-
truncated_limit = int(args.overview) or 80
211+
truncated_limit = int(args.overview or 80)
212212
truncated = data_raw[:truncated_limit]
213213
if len(data_raw) < truncated_limit:
214214
truncated = truncated + " " + "." * (truncated_limit - len(truncated))

src/httpc/_next_data.py

Lines changed: 35 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,61 +1,58 @@
1+
# 편의상 "next data"라고 부르나 정식 명칭은 RSC payload임.
2+
3+
from __future__ import annotations
4+
15
import json
2-
from operator import attrgetter
36
import re
47
import typing
58

9+
from ._base import logger
10+
611
next_f_data = re.compile(r"self\.__next_f\.push\(\[\d+,\s*(.*)\]\)", re.DOTALL)
12+
# HL, I, "$"가 각각 어떤 역할을 하는지 알려면 https://roy-jung.github.io/250323-react-server-components/ 이 코드 참고
713
line_regex = re.compile(r"^\s*(?P<hexdigit>[0-9a-fA-F]+):(?P<data_prefix>[A-Z]*)(?P<data_raw>.*)")
814

915

1016
class NextData(typing.NamedTuple):
11-
script_no: int
1217
line_no: int
1318
hexdigit: str
1419
prefix: str
1520
value: typing.Any
21+
parsed: bool
1622

1723

18-
def extract_next_data(scripts: typing.Iterable[str], prefix_to_ignore: typing.Container[str] | None = None) -> list[NextData]:
24+
def extract_next_data(scripts: typing.Iterable[str], prefix_to_ignore: typing.Container[str] | None = None, warn_not_parsed: bool = False) -> list[NextData]:
1925
line: str
20-
to_be_continued: str | None = None
2126
next_data = []
22-
for script_no, script in enumerate(scripts):
27+
joined = ""
28+
for script in scripts:
2329
matched = next_f_data.match(script)
2430
if not matched:
25-
# assert "self.__next_f.push(1" not in script, script
2631
continue
27-
for line_no, line in enumerate(json.loads(matched[1]).split("\n")):
28-
if not line:
29-
continue
30-
matched = line_regex.match(line)
31-
if not matched:
32-
if to_be_continued is None:
33-
raise ValueError(f"Line {line_no} in script {script_no} does not match the expected format: {line!r}")
34-
35-
data_raw = to_be_continued + line
36-
to_be_continued = None
37-
if prefix_to_ignore and data_prefix in prefix_to_ignore: # noqa: F821
38-
continue
39-
# script_no와 line_no 데이터는 continuation의 데이터가 사용되고,
40-
# hexdigit과 data_prefix는 이전 matched의 데이터를 사용
41-
# 가능하면 script_no와 line_no 데이터도 이전 matched의 데이터를 사용하면 좋지만,
42-
# 굳이 중요한 건 아니니 이렇게 구현함
43-
next_data.append(NextData(script_no, line_no, hexdigit, data_prefix, json.loads(data_raw))) # noqa: F821
44-
continue
45-
elif to_be_continued is not None:
46-
raise ValueError(f"Line {line_no} in script {script_no} does not match the expected format: {line!r}")
47-
48-
hexdigit = matched["hexdigit"]
49-
data_prefix = matched["data_prefix"]
50-
data_raw = matched["data_raw"]
51-
try:
52-
json_data = json.loads(data_raw)
53-
except json.JSONDecodeError:
54-
to_be_continued = data_raw
55-
continue
56-
if prefix_to_ignore and data_prefix in prefix_to_ignore:
57-
continue
58-
next_data.append(NextData(script_no, line_no, hexdigit, data_prefix, json_data))
32+
joined += json.loads(matched[1])
33+
34+
for line_no, line in enumerate(joined.split("\n")):
35+
if not line:
36+
continue
37+
matched = line_regex.match(line)
38+
if not matched:
39+
raise ValueError(f"Line {line_no} does not match the expected format: {line!r}")
40+
41+
hexdigit = matched["hexdigit"]
42+
data_prefix = matched["data_prefix"]
43+
data_raw = matched["data_raw"]
44+
if prefix_to_ignore and data_prefix in prefix_to_ignore:
45+
continue
46+
try:
47+
json_data = json.loads(data_raw)
48+
except json.JSONDecodeError:
49+
if warn_not_parsed:
50+
logger.warning(f"Failed to parse following data to JSON: {data_raw}")
51+
json_data = data_raw
52+
parsed = False
53+
else:
54+
parsed = True
55+
next_data.append(NextData(line_no, hexdigit, data_prefix, json_data, parsed))
5956

6057
next_data.sort(key=lambda x: int(x.hexdigit, 16))
6158
return next_data

src/httpc/_parse.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from selectolax.lexbor import LexborNode as Node
1010

1111
from ._broadcaster import BroadcastList
12+
from ._base import logger
1213

1314
if typing.TYPE_CHECKING:
1415
from ._broadcaster import NodeBroadcastList
@@ -64,16 +65,19 @@ def single(self, query, default=_ABSENT, *, remain_ok=False, new: bool = False):
6465
else:
6566
raise ValueError(f"Query {query!r} matched with {length} nodes{self._get_url_note()}.")
6667

67-
def _extract_next_data(self, prefix_to_ignore: typing.Container | None = None) -> list[NextData]:
    """Collect RSC payload entries from every script tag in the document.

    Private as of this commit (renamed from ``extract_next_data``); use
    :meth:`next_data` for the public, dict-shaped API.

    Args:
        prefix_to_ignore: prefixes whose payload lines are dropped, forwarded
            to :func:`extract_next_data`.

    Returns:
        Entries sorted by hexadecimal id.
    """
    # strip=True removes surrounding whitespace so the push-call regex,
    # which is anchored at the start of the text, can match.
    scripts = [script.text(strip=True) for script in self.match("script")]
    next_data = extract_next_data(scripts, prefix_to_ignore=prefix_to_ignore)
    return next_data
7172

72-
def next_data(self, *, exclude_prefixed: bool = True, warn_unparsed: bool = True) -> dict[str, NextData]:
    """Return RSC payload entries keyed by their hexadecimal line id.

    Args:
        exclude_prefixed: when True, drop "HL"- and "I"-prefixed entries.
        warn_unparsed: when True, log a warning for each entry whose data
            part was not valid JSON (``parsed`` is False).

    Returns:
        Mapping of hex id to the full :class:`NextData` tuple (as of this
        commit the whole tuple is returned, not just ``value``).
    """
    prefix_to_ignore = ("HL", "I") if exclude_prefixed else None
    next_data = self._extract_next_data(prefix_to_ignore=prefix_to_ignore)
    # Hoisted invariant: only scan for unparsed entries when warnings are wanted.
    if warn_unparsed:
        for data in next_data:
            if not data.parsed:
                logger.warning(f"Failed to parse following data: {data.value}")
    return {
        data.hexdigit: data
        for data in next_data
    }
7983

0 commit comments

Comments
 (0)