Skip to content

Commit bc5c4f0

Browse files
committed
feat!: next data를 처리하는 방식 및 API 변경
1 parent 1e8da3e commit bc5c4f0

4 files changed

Lines changed: 89 additions & 88 deletions

File tree

src/httpc/__main__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -182,7 +182,7 @@ def _handle_next_data(args) -> None:
182182
from httpc import ParseTool
183183

184184
console = Console()
185-
text = ParseTool(text).extract_next_data()
185+
text = ParseTool(text)._extract_next_data()
186186

187187
if not args:
188188
for item in text:
@@ -208,7 +208,7 @@ def _handle_next_data(args) -> None:
208208
if not args.include_prefixed and item.prefix:
209209
continue
210210
data_raw = json.dumps(item.value, ensure_ascii=False)
211-
truncated_limit = int(args.overview) or 80
211+
truncated_limit = int(args.overview or 80)
212212
truncated = data_raw[:truncated_limit]
213213
if len(data_raw) < truncated_limit:
214214
truncated = truncated + " " + "." * (truncated_limit - len(truncated))

src/httpc/_next_data.py

Lines changed: 35 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,61 +1,58 @@
1+
# 편의상 "next data"라고 부르나 정식 명칭은 RSC payload임.
2+
3+
from __future__ import annotations
4+
15
import json
2-
from operator import attrgetter
36
import re
47
import typing
58

9+
from ._base import logger
10+
611
next_f_data = re.compile(r"self\.__next_f\.push\(\[\d+,\s*(.*)\]\)", re.DOTALL)
12+
# HL, I, "$"가 각각 어떤 역할을 하는지 알려면 https://roy-jung.github.io/250323-react-server-components/ 이 코드 참고
713
line_regex = re.compile(r"^\s*(?P<hexdigit>[0-9a-fA-F]+):(?P<data_prefix>[A-Z]*)(?P<data_raw>.*)")
814

915

1016
class NextData(typing.NamedTuple):
11-
script_no: int
1217
line_no: int
1318
hexdigit: str
1419
prefix: str
1520
value: typing.Any
21+
parsed: bool
1622

1723

18-
def extract_next_data(scripts: typing.Iterable[str], prefix_to_ignore: typing.Container[str] | None = None) -> list[NextData]:
24+
def extract_next_data(scripts: typing.Iterable[str], prefix_to_ignore: typing.Container[str] | None = None, warn_not_parsed: bool = False) -> list[NextData]:
1925
line: str
20-
to_be_continued: str | None = None
2126
next_data = []
22-
for script_no, script in enumerate(scripts):
27+
joined = ""
28+
for script in scripts:
2329
matched = next_f_data.match(script)
2430
if not matched:
25-
# assert "self.__next_f.push(1" not in script, script
2631
continue
27-
for line_no, line in enumerate(json.loads(matched[1]).split("\n")):
28-
if not line:
29-
continue
30-
matched = line_regex.match(line)
31-
if not matched:
32-
if to_be_continued is None:
33-
raise ValueError(f"Line {line_no} in script {script_no} does not match the expected format: {line!r}")
34-
35-
data_raw = to_be_continued + line
36-
to_be_continued = None
37-
if prefix_to_ignore and data_prefix in prefix_to_ignore: # noqa: F821
38-
continue
39-
# script_no와 line_no 데이터는 continuation의 데이터가 사용되고,
40-
# hexdigit과 data_prefix는 이전 matched의 데이터를 사용
41-
# 가능하면 script_no와 line_no 데이터도 이전 matched의 데이터를 사용하면 좋지만,
42-
# 굳이 중요한 건 아니니 이렇게 구현함
43-
next_data.append(NextData(script_no, line_no, hexdigit, data_prefix, json.loads(data_raw))) # noqa: F821
44-
continue
45-
elif to_be_continued is not None:
46-
raise ValueError(f"Line {line_no} in script {script_no} does not match the expected format: {line!r}")
47-
48-
hexdigit = matched["hexdigit"]
49-
data_prefix = matched["data_prefix"]
50-
data_raw = matched["data_raw"]
51-
try:
52-
json_data = json.loads(data_raw)
53-
except json.JSONDecodeError:
54-
to_be_continued = data_raw
55-
continue
56-
if prefix_to_ignore and data_prefix in prefix_to_ignore:
57-
continue
58-
next_data.append(NextData(script_no, line_no, hexdigit, data_prefix, json_data))
32+
joined += json.loads(matched[1])
33+
34+
for line_no, line in enumerate(joined.split("\n")):
35+
if not line:
36+
continue
37+
matched = line_regex.match(line)
38+
if not matched:
39+
raise ValueError(f"Line {line_no} does not match the expected format: {line!r}")
40+
41+
hexdigit = matched["hexdigit"]
42+
data_prefix = matched["data_prefix"]
43+
data_raw = matched["data_raw"]
44+
if prefix_to_ignore and data_prefix in prefix_to_ignore:
45+
continue
46+
try:
47+
json_data = json.loads(data_raw)
48+
except json.JSONDecodeError:
49+
if warn_not_parsed:
50+
logger.warning(f"Failed to parse following data to JSON: {data_raw}")
51+
json_data = data_raw
52+
parsed = False
53+
else:
54+
parsed = True
55+
next_data.append(NextData(line_no, hexdigit, data_prefix, json_data, parsed))
5956

6057
next_data.sort(key=lambda x: int(x.hexdigit, 16))
6158
return next_data

src/httpc/_parse.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from selectolax.lexbor import LexborNode as Node
1010

1111
from ._broadcaster import BroadcastList
12+
from ._base import logger
1213

1314
if typing.TYPE_CHECKING:
1415
from ._broadcaster import NodeBroadcastList
@@ -64,16 +65,19 @@ def single(self, query, default=_ABSENT, *, remain_ok=False, new: bool = False):
6465
else:
6566
raise ValueError(f"Query {query!r} matched with {length} nodes{self._get_url_note()}.")
6667

67-
def _extract_next_data(self, prefix_to_ignore: typing.Container | None = None) -> list[NextData]:
    """Collect RSC payload entries from every script tag in the document.

    Private as of this commit (renamed from ``extract_next_data``); use
    :meth:`next_data` for the public, dict-shaped API.

    Args:
        prefix_to_ignore: prefixes whose payload lines are dropped, forwarded
            to :func:`extract_next_data`.

    Returns:
        Entries sorted by hexadecimal id.
    """
    # strip=True removes surrounding whitespace so the push-call regex,
    # which is anchored at the start of the text, can match.
    scripts = [script.text(strip=True) for script in self.match("script")]
    next_data = extract_next_data(scripts, prefix_to_ignore=prefix_to_ignore)
    return next_data
7172

72-
def next_data(self, *, exclude_prefixed: bool = True, warn_unparsed: bool = True) -> dict[str, NextData]:
    """Return RSC payload entries keyed by their hexadecimal line id.

    Args:
        exclude_prefixed: when True, drop "HL"- and "I"-prefixed entries.
        warn_unparsed: when True, log a warning for each entry whose data
            part was not valid JSON (``parsed`` is False).

    Returns:
        Mapping of hex id to the full :class:`NextData` tuple (as of this
        commit the whole tuple is returned, not just ``value``).
    """
    prefix_to_ignore = ("HL", "I") if exclude_prefixed else None
    next_data = self._extract_next_data(prefix_to_ignore=prefix_to_ignore)
    # Hoisted invariant: only scan for unparsed entries when warnings are wanted.
    if warn_unparsed:
        for data in next_data:
            if not data.parsed:
                logger.warning(f"Failed to parse following data: {data.value}")
    return {
        data.hexdigit: data
        for data in next_data
    }
7983

0 commit comments

Comments
 (0)