Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
a139dc1
docs(devforge): investigation and proposed fix for issue #479
claude Jul 1, 2026
cbab1e3
fix: decode response header values as UTF-8 with ISO-8859-1 fallback
claude Jul 1, 2026
7c52daf
fix: correct header-decode doctest and formatting
claude Jul 1, 2026
5233130
chore(devforge): record iter-2 reviewer pass and final-review phase
claude Jul 1, 2026
db37ee1
refactor: avoid redundant copy in header-value decode
claude Jul 1, 2026
0d0ec07
chore(devforge): record final-review pass (loop converged)
claude Jul 1, 2026
d236997
docs(devforge): revise design to per-ecosystem decode + raw-bytes acc…
claude Jul 1, 2026
4a0e40e
feat: per-ecosystem header decoding + raw-header-bytes accessor
claude Jul 1, 2026
1b52315
test: cover Python from_async header decode + raw_headers over a real…
claude Jul 1, 2026
b54d354
chore(devforge): enter final review (inner loop converged, rev 2)
claude Jul 1, 2026
750e31a
chore(devforge): record thermonuclear final review (rev 2, round 1)
claude Jul 1, 2026
82d8756
fix: preserve rawHeaders across clone(), declare it in .d.ts, correct…
claude Jul 1, 2026
6a40868
docs: align raw-header field comments with the accessor's documented …
claude Jul 1, 2026
e8289e5
chore(devforge): final review converged (rev 2, both reviewers PASS)
claude Jul 1, 2026
3bc991f
chore(devforge): record create-PR approval
claude Jul 1, 2026
dd95522
chore(devforge): record PR #492 and finish run
claude Jul 1, 2026
b773362
chore: remove stray napi-0.2.1.zip committed by mistake
claude Jul 1, 2026
f64e62b
chore: drop .devforge working files from the PR and gitignore them
claude Jul 1, 2026
deac557
fix(python): declare raw_headers in the type stub
claude Jul 1, 2026
060101a
chore: drop .devforge from committed .gitignore
claude Jul 1, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions impit-node/index.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,18 @@ export declare class ImpitResponse {
* In case of redirects, this will be the final URL after all redirects have been followed.
*/
url: string
/**
 * Raw, undecoded response header values as `[name, bytes]` pairs.
 *
 * Unlike {@link headers}, whose values are decoded as ISO-8859-1 strings (matching the Fetch
 * API), this exposes the exact value bytes received on the wire. Use it when a header carries
 * UTF-8 (e.g. a `Content-Disposition` filename) or when verifying a header signature/HMAC.
 *
 * @example
 * ```ts
 * // Names are already lowercase; `find` yields `undefined` when the header is absent.
 * const entry = response.rawHeaders.find(([name]) => name === 'content-disposition');
 * const value = entry ? new TextDecoder('utf-8').decode(entry[1]) : undefined;
 * ```
 *
 * Names are lowercased and the original wire order is not preserved (the underlying HTTP client
 * normalizes header names into a map); duplicate values for a name are kept. This is an impit
 * extension - the standard Fetch `Response` has no raw-header accessor.
 */
get rawHeaders(): Array<[string, Uint8Array]>
/** @ignore */
decodeBuffer(buffer: Buffer): string
/**
Expand Down
5 changes: 5 additions & 0 deletions impit-node/index.wrapper.js
Original file line number Diff line number Diff line change
Expand Up @@ -480,6 +480,11 @@ class Impit extends native.Impit {
value: this.url,
enumerable: true,
});
// Preserve the impit-specific raw header bytes across clone().
Object.defineProperty(clone, 'rawHeaders', {
value: this.rawHeaders,
enumerable: true,
});
Object.defineProperty(clone, 'text', {
value: async function () {
const buffer = await clone.arrayBuffer();
Expand Down
39 changes: 39 additions & 0 deletions impit-node/src/response.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ use napi::bindgen_prelude::JsObjectValue;
use napi::{
bindgen_prelude::{
BufferSlice, FromNapiValue, Function, Object, ReadableStream, Result, This, ToNapiValue,
Uint8Array,
},
sys, Env, JsValue, Unknown,
};
Expand Down Expand Up @@ -61,6 +62,9 @@ pub struct ImpitResponse {
///
/// In case of redirects, this will be the final URL after all redirects have been followed.
pub url: String,
// Raw, undecoded header name/value byte pairs (values exact; names lowercased, order not the
// original wire order - see the `rawHeaders` getter docs). Exposed via the `rawHeaders` getter.
raw_header_pairs: Vec<(String, Vec<u8>)>,
// Shared sender used to immediately signal abort to the JS ReadableStream without polling.
abort_receiver: Arc<tokio::sync::Mutex<Option<tokio::sync::mpsc::Receiver<()>>>>,
abort_sender: Arc<tokio::sync::Mutex<Option<tokio::sync::mpsc::Sender<()>>>>,
Expand Down Expand Up @@ -89,12 +93,17 @@ impl<'env> ImpitResponse {
.canonical_reason()
.unwrap_or("")
.to_string();
// JS Fetch semantics: header values are decoded as ISO-8859-1 (each byte 0x00..=0xFF maps to
// the code point U+0000..=U+00FF). This keeps the string form byte-recoverable via
// `Buffer.from(value, 'latin1')`; callers needing exact UTF-8 use the `rawHeaders` accessor.
Comment on lines +96 to +98

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This (ISO-8859-1 being a bijection) imo means we don't need the rawHeaders field in impit-node bindings.

Note that `rawHeaders` is not part of the Fetch interface that impit implements. Adding extra fields might bite us later in maintenance costs.

let mut headers_vec: Vec<(String, String)> = Vec::new();
let mut raw_header_pairs: Vec<(String, Vec<u8>)> = Vec::new();
for (k, v) in response.headers().iter() {
headers_vec.push((
k.as_str().to_string(),
v.as_bytes().iter().map(|&b| b as char).collect(),
));
raw_header_pairs.push((k.as_str().to_string(), v.as_bytes().to_vec()));
}
let headers = Headers(headers_vec);
let ok = response.status().is_success();
Expand All @@ -107,11 +116,41 @@ impl<'env> ImpitResponse {
headers,
ok,
url,
raw_header_pairs,
abort_receiver: Arc::new(tokio::sync::Mutex::new(None)),
abort_sender: Arc::new(tokio::sync::Mutex::new(None)),
})
}

/// Raw, undecoded response header values as `[name, bytes]` pairs.
///
/// The string values in {@link headers} are decoded as ISO-8859-1 (matching the Fetch API);
/// this accessor instead returns the exact value bytes received on the wire. Reach for it when
/// a header carries UTF-8 (e.g. a `Content-Disposition` filename) or when the precise bytes
/// matter, such as verifying a header signature/HMAC.
///
/// @example
/// ```ts
/// // Names are already lowercase; `find` yields `undefined` when the header is absent.
/// const entry = response.rawHeaders.find(([name]) => name === 'content-disposition');
/// const value = entry ? new TextDecoder('utf-8').decode(entry[1]) : undefined;
/// ```
///
/// Header names are lowercased and the original wire order is not preserved (the underlying
/// HTTP client normalizes header names into a map); duplicate values for a name are kept. This
/// is an impit extension; the standard Fetch `Response` has no raw-header accessor.
#[napi(
  getter,
  js_name = "rawHeaders",
  ts_return_type = "Array<[string, Uint8Array]>"
)]
pub fn raw_headers(&self) -> Vec<(String, Uint8Array)> {
  let mut pairs = Vec::with_capacity(self.raw_header_pairs.len());
  for (name, bytes) in &self.raw_header_pairs {
    // Name and bytes are both cloned, so each access returns a copy independent of the stored pairs.
    pairs.push((name.clone(), Uint8Array::from(bytes.clone())));
  }
  pairs
}

fn get_inner_response(&self, env: &Env, mut this: This<Object>) -> Result<Object<'_>> {
let cached_response = this.get::<Object>(INNER_RESPONSE_PROPERTY_NAME)?;

Expand Down
23 changes: 23 additions & 0 deletions impit-node/test/basics.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -571,6 +571,29 @@ describe.each([
t.expect(response.headers.get('x-non-ascii')).toBe(routes.nonAsciiHeader.headerValue);
});

test('raw header bytes preserve the exact wire value while the string stays ISO-8859-1 (Fetch-style)', async (t) => {
    const { path, headerValue } = routes.utf8Header;
    // Shared predicate: picks the `x-utf8` entry out of a `[name, bytes]` list.
    const isUtf8Header = ([name]: [string, Uint8Array]) => name.toLowerCase() === 'x-utf8';

    const response = await impit.fetch(new URL(path, "http://127.0.0.1:3001").href);

    // Fetch semantics: the string form is ISO-8859-1, so the UTF-8 value comes back as mojibake.
    const latin1Value = response.headers.get('x-utf8');
    t.expect(latin1Value).not.toBe(headerValue);

    // The raw accessor carries the exact wire bytes, which decode to the real UTF-8 value.
    const wirePair = response.rawHeaders.find(isUtf8Header);
    t.expect(wirePair).toBeDefined();
    const wireBytes = Buffer.from(wirePair![1]);
    t.expect(wireBytes.toString('utf8')).toBe(headerValue);

    // Re-encoding the ISO-8859-1 string as latin1 recovers the same bytes (the standard Fetch workaround).
    const recovered = Buffer.from(latin1Value!, 'latin1');
    t.expect(recovered.equals(wireBytes)).toBe(true);

    // clone() returns a Fetch Response augmented by impit; rawHeaders must still be there.
    const cloned = response.clone() as unknown as { rawHeaders: Array<[string, Uint8Array]> };
    const clonedPair = cloned.rawHeaders.find(isUtf8Header);
    t.expect(clonedPair).toBeDefined();
    t.expect(Buffer.from(clonedPair![1]).equals(wireBytes)).toBe(true);
});

test('.json() method works', async (t) => {
const response = await impit.fetch(getHttpBinUrl('/json'));
const json = await response.json();
Expand Down
20 changes: 20 additions & 0 deletions impit-node/test/mock.server.ts
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@ export const routes = {
path: '/non-ascii-header',
headerValue: 'Dienstag, 31. März 2026',
},
utf8Header: {
path: '/utf8-header',
headerValue: 'attachment; filename="naïve.pdf"',
},
}

function parseMultipart(body: Buffer, boundary: string): Record<string, string> {
Expand Down Expand Up @@ -117,6 +121,22 @@ export async function runServer(port: number): Promise<Server> {
socket.end();
});

app.get(routes.utf8Header.path, (req, res) => {
    // Write straight to the socket so the header value goes out as raw bytes.
    const socket = res.socket!;

    // Header line whose value carries UTF-8 bytes (the ï is 0xC3 0xAF).
    const utf8HeaderLine = Buffer.concat([
        Buffer.from('X-Utf8: '),
        Buffer.from(routes.utf8Header.headerValue, 'utf-8'),
        Buffer.from('\r\n'),
    ]);

    // Status line, headers, blank line and body, written in this exact order.
    const chunks: Array<string | Buffer> = [
        'HTTP/1.1 200 OK\r\n',
        'Content-Type: text/plain\r\n',
        utf8HeaderLine,
        'Content-Length: 2\r\n',
        '\r\n',
        'ok',
    ];
    for (const chunk of chunks) {
        socket.write(chunk);
    }
    socket.end();
});

app.get('/socket', (req, res) => {
const socket = req.socket;
const clientAddress = socket.remoteAddress;
Expand Down
16 changes: 16 additions & 0 deletions impit-python/python/impit/impit.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,22 @@ class Response:
print(response.headers) # {'content-type': 'text/html; charset=utf-8', ... }
"""

@property
def raw_headers(self) -> list[tuple[bytes, bytes]]:
    """Raw, undecoded header name/value pairs as ``(bytes, bytes)``.

    Similar to httpx's ``Response.headers.raw``, but note two differences imposed by the
    underlying HTTP client: header names are normalized to lowercase and the original wire
    order is not preserved (duplicate values for a name are kept). Header *values* are the
    exact bytes received - useful when a header carries UTF-8 or when verifying a header
    signature/HMAC.

    A :class:`Response` constructed manually (rather than received from the network) has no
    wire bytes: its pairs are the UTF-8 encoding of the header strings passed to the
    constructor, with names kept exactly as given (not lowercased).

    .. code-block:: python

        response = await client.get("https://crawlee.dev")
        print(response.raw_headers)  # [(b'content-type', b'text/html; charset=utf-8'), ... ]
    """

text: str
"""Response body as text. Decoded from :attr:`content` using :attr:`encoding`.

Expand Down
50 changes: 43 additions & 7 deletions impit-python/src/response.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,12 @@ use tokio::sync::Mutex as AsyncMutex;
use bytes::Bytes;
use encoding::label::encoding_from_whatwg_label;
use futures::{Stream, StreamExt};
use impit::{errors::ImpitError, utils::ContentType};
use impit::{
errors::ImpitError,
utils::{decode_header_value, ContentType},
};
use pyo3::prelude::*;
use pyo3::types::PyBytes;
use reqwest::{Response, StatusCode, Version};
use std::pin::Pin;

Expand Down Expand Up @@ -223,6 +227,9 @@ pub struct ImpitPyResponse {
content: Option<Vec<u8>>,
inner: Option<Response>,
inner_state: InnerResponseState,
// Raw, undecoded header name/value byte pairs (values exact; names lowercased, order not the
// original wire order - see the `raw_headers` getter docs). Exposed via the `raw_headers` getter.
raw_headers: Vec<(Vec<u8>, Vec<u8>)>,
Comment on lines +230 to +232

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Any chance we can follow the httpx interface and expose Response.headers.raw instead (source code)?

I suppose it would mean implementing the Headers class, but, imo, it's worth it for the interface alignment and for future-proofing impit. Note that we can borrow the Headers class implementation from httpx and keep the implementation in Python (as is done for, e.g., Cookies here).

}

#[pymethods]
Expand All @@ -238,6 +245,12 @@ impl ImpitPyResponse {
) -> Self {
let headers = headers.unwrap_or_default();

// No wire bytes for a manually constructed response; use the UTF-8 bytes of the strings.
let raw_headers: Vec<(Vec<u8>, Vec<u8>)> = headers
.iter()
.map(|(k, v)| (k.clone().into_bytes(), v.clone().into_bytes()))
.collect();

let encoding = match headers
.iter()
.find(|(k, _)| k.to_lowercase() == "content-type")
Expand Down Expand Up @@ -267,6 +280,7 @@ impl ImpitPyResponse {
content: Some(content.unwrap_or_default()),
inner: None,
inner_state: InnerResponseState::Read,
raw_headers,
}
}

Expand Down Expand Up @@ -439,6 +453,21 @@ impl ImpitPyResponse {
Ok(())
}

/// Raw, undecoded header name/value pairs as `(bytes, bytes)`.
///
/// Comparable to httpx's `Response.headers.raw`, with two differences imposed by the underlying
/// HTTP client: header names are normalized to lowercase, and the original wire order is not
/// preserved (duplicate values for a name are kept).
///
/// Whereas `headers` holds `str` values (decoded UTF-8-first for network responses), the values
/// here are the exact bytes received, for callers that need them - e.g. verifying a header
/// signature/HMAC. For a manually constructed response there are no wire bytes; the pairs are
/// the UTF-8 bytes of the header strings passed to the constructor.
#[getter]
fn raw_headers<'py>(&self, py: Python<'py>) -> Vec<(Bound<'py, PyBytes>, Bound<'py, PyBytes>)> {
    let mut pairs = Vec::with_capacity(self.raw_headers.len());
    for (name, value) in &self.raw_headers {
        // Each access builds new Python `bytes` objects from the stored byte vectors.
        pairs.push((PyBytes::new(py, name), PyBytes::new(py, value)));
    }
    pairs
}

#[getter]
fn content(&mut self, py: Python<'_>) -> PyResult<Vec<u8>> {
self.read(py)
Expand Down Expand Up @@ -536,12 +565,18 @@ impl ImpitPyResponse {
_ => "Unknown".to_string(),
};
let is_redirect = val.status().is_redirection();
let headers = HashMap::from_iter(val.headers().iter().map(|(k, v)| {
(
k.as_str().to_string(),
v.as_bytes().iter().map(|&b| b as char).collect::<String>(),
)
}));
// Python/httpx semantics: decode header values UTF-8-first with an ISO-8859-1 fallback.
let headers = HashMap::from_iter(
val.headers()
.iter()
.map(|(k, v)| (k.as_str().to_string(), decode_header_value(v.as_bytes()))),
);
// Exact wire bytes for callers that need them (httpx `Headers.raw` equivalent).
let raw_headers: Vec<(Vec<u8>, Vec<u8>)> = val
.headers()
.iter()
.map(|(k, v)| (k.as_str().as_bytes().to_vec(), v.as_bytes().to_vec()))
.collect();

let content_type_charset = headers
.get("content-type")
Expand Down Expand Up @@ -597,6 +632,7 @@ impl ImpitPyResponse {
is_stream_consumed,
inner_state,
inner,
raw_headers,
})
}
}
56 changes: 56 additions & 0 deletions impit-python/test/async_client_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,39 @@ def truncating_server(port_holder: list[int]) -> None:
server.close()


def header_encoding_server(port_holder: list[int]) -> None:
    """Serve one HTTP response carrying a UTF-8 header value and a lone ISO-8859-1 byte.

    Binds an IPv6 listener (with ``IPV6_V6ONLY`` disabled) on an ephemeral port, stores the
    chosen port in ``port_holder[0]``, answers the first connection and then shuts down.

    Args:
        port_holder: Single-element list used to hand the bound port back to the caller's thread.
    """
    body = b'ok'
    response = b''.join(
        [
            b'HTTP/1.1 200 OK\r\n',
            b'Content-Type: text/plain\r\n',
            b'X-Utf8: ',
            'attachment; filename="naïve.pdf"'.encode(),
            b'\r\n',
            b'X-Latin1: M',
            bytes([0xE4]),  # 'a' with diaeresis in ISO-8859-1; not valid UTF-8 on its own
            b'rz\r\n',
            b'Content-Length: ',
            str(len(body)).encode(),
            b'\r\n\r\n',
            body,
        ]
    )

    # Context managers close both sockets even if accept/recv/send raises.
    with socket.socket(socket.AF_INET6, socket.SOCK_STREAM) as server:
        server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        server.setsockopt(socket.IPPROTO_IPV6, socket.IPV6_V6ONLY, 0)
        server.bind(('::', 0))
        server.listen(1)
        # Publish the port only once the socket is listening, so a client that reads it can connect.
        port_holder[0] = server.getsockname()[1]

        conn, _ = server.accept()
        with conn:
            conn.recv(1024)
            # sendall() retries until every byte is written; send() may stop after a partial write.
            conn.sendall(response)


@pytest.mark.asyncio
@pytest.mark.parametrize(
('browser', 'ja4'),
Expand Down Expand Up @@ -425,6 +458,29 @@ async def test_local_address(self, browser: Browser, addresses: tuple[str, str])
assert response.status_code == 200
thread.join()

@pytest.mark.asyncio
async def test_header_value_decoding_and_raw_bytes(self, browser: Browser) -> None:
    # Values the one-shot header_encoding_server puts on the wire.
    expected_utf8 = 'attachment; filename="naïve.pdf"'
    expected_latin1_bytes = b'M' + bytes([0xE4]) + b'rz'

    port_holder = [0]
    server_thread = threading.Thread(target=header_encoding_server, args=(port_holder,))
    server_thread.start()
    # Give the server thread time to bind and publish its port.
    await asyncio.sleep(0.1)

    client = AsyncClient(browser=browser)
    url = f'http://127.0.0.1:{port_holder[0]}/'
    response = await client.get(url, timeout=5)
    server_thread.join()

    # Python follows httpx semantics: a UTF-8 header value decodes correctly as str...
    assert response.headers['x-utf8'] == expected_utf8
    # ...and a lone non-UTF-8 byte falls back to ISO-8859-1.
    assert response.headers['x-latin1'] == 'März'

    # raw_headers exposes the exact wire bytes (httpx Headers.raw equivalent).
    raw_by_name = dict(response.raw_headers)
    assert raw_by_name[b'x-utf8'] == expected_utf8.encode('utf-8')
    assert raw_by_name[b'x-latin1'] == expected_latin1_bytes


@pytest.mark.parametrize(
('browser'),
Expand Down
13 changes: 13 additions & 0 deletions impit-python/test/response_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,19 @@ def test_response_constructor_with_headers() -> None:
assert response.headers['Content-Type'] == 'application/json'


def test_response_raw_headers() -> None:
    # raw_headers exposes header name/value pairs as bytes (httpx Headers.raw equivalent).
    response = Response(200, headers={'Content-Type': 'application/json', 'X-Unicode': 'naïve'})

    raw = response.raw_headers

    assert isinstance(raw, list)
    for name, value in raw:
        assert isinstance(name, bytes)
        assert isinstance(value, bytes)

    expected_pairs = [
        (b'Content-Type', b'application/json'),
        # A non-ASCII value is preserved as its exact UTF-8 bytes.
        (b'X-Unicode', 'naïve'.encode()),
    ]
    for pair in expected_pairs:
        assert pair in raw


def test_response_headers_encoding() -> None:
response = Response(
200, headers={'Content-Type': 'text/plain; charset=cp1250'}, content=b'\x9e\x64\xe1\xf8\x65\x6e\xed'
Expand Down
1 change: 1 addition & 0 deletions impit/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ pub mod fingerprint;
/// Various utility functions and types.
pub mod utils {
pub use crate::response_parsing::decode;
pub use crate::response_parsing::decode_header_value;
pub use crate::response_parsing::determine_encoding;
pub use crate::response_parsing::ContentType;
pub use encoding::all as encodings;
Expand Down
Loading
Loading