diff --git a/impit-node/index.d.ts b/impit-node/index.d.ts index 08b21fb9..9bce9d6e 100644 --- a/impit-node/index.d.ts +++ b/impit-node/index.d.ts @@ -143,6 +143,18 @@ export declare class ImpitResponse { * In case of redirects, this will be the final URL after all redirects have been followed. */ url: string + /** + * Raw, undecoded response header values as `[name, bytes]` pairs. + * + * Unlike {@link headers}, whose values are decoded as ISO-8859-1 strings (matching the Fetch + * API), this exposes the exact value bytes received on the wire. Use it when a header carries + * UTF-8 (e.g. a `Content-Disposition` filename) or when verifying a header signature/HMAC. + * + * Names are lowercased and the original wire order is not preserved (the underlying HTTP client + * normalizes header names into a map); duplicate values for a name are kept. This is an impit + * extension - the standard Fetch `Response` has no raw-header accessor. + */ + get rawHeaders(): Array<[string, Uint8Array]> /** @ignore */ decodeBuffer(buffer: Buffer): string /** diff --git a/impit-node/index.wrapper.js b/impit-node/index.wrapper.js index 3e9ed22e..1c5dc6a7 100644 --- a/impit-node/index.wrapper.js +++ b/impit-node/index.wrapper.js @@ -480,6 +480,11 @@ class Impit extends native.Impit { value: this.url, enumerable: true, }); + // Preserve the impit-specific raw header bytes across clone(). + Object.defineProperty(clone, 'rawHeaders', { + value: this.rawHeaders, + enumerable: true, + }); Object.defineProperty(clone, 'text', { value: async function () { const buffer = await clone.arrayBuffer(); diff --git a/impit-node/src/response.rs b/impit-node/src/response.rs index e1a0ba0d..a0ad04d8 100644 --- a/impit-node/src/response.rs +++ b/impit-node/src/response.rs @@ -5,6 +5,7 @@ use napi::bindgen_prelude::JsObjectValue; use napi::{ bindgen_prelude::{ BufferSlice, FromNapiValue, Function, Object, ReadableStream, Result, This, ToNapiValue, + Uint8Array, }, sys, Env, JsValue, Unknown, }; @@ -61,6 +62,9 @@ pub struct ImpitResponse { /// /// In case of redirects, this will be the final URL after all redirects have been followed. pub url: String, + // Raw, undecoded header name/value byte pairs (values exact; names lowercased, order not the + // original wire order - see the `rawHeaders` getter docs). Exposed via the `rawHeaders` getter. + raw_header_pairs: Vec<(String, Vec)>, // Shared sender used to immediately signal abort to the JS ReadableStream without polling. abort_receiver: Arc>>>, abort_sender: Arc>>>, @@ -89,12 +93,17 @@ impl<'env> ImpitResponse { .canonical_reason() .unwrap_or("") .to_string(); + // JS Fetch semantics: header values are decoded as ISO-8859-1 (each byte 0x00..=0xFF maps to + // the code point U+0000..=U+00FF). This keeps the string form byte-recoverable via + // `Buffer.from(value, 'latin1')`; callers needing exact UTF-8 use the `rawHeaders` accessor. let mut headers_vec: Vec<(String, String)> = Vec::new(); + let mut raw_header_pairs: Vec<(String, Vec)> = Vec::new(); for (k, v) in response.headers().iter() { headers_vec.push(( k.as_str().to_string(), v.as_bytes().iter().map(|&b| b as char).collect(), )); + raw_header_pairs.push((k.as_str().to_string(), v.as_bytes().to_vec())); } let headers = Headers(headers_vec); let ok = response.status().is_success(); @@ -107,11 +116,41 @@ impl<'env> ImpitResponse { headers, ok, url, + raw_header_pairs, abort_receiver: Arc::new(tokio::sync::Mutex::new(None)), abort_sender: Arc::new(tokio::sync::Mutex::new(None)), }) } + /// Raw, undecoded response header values as `[name, bytes]` pairs. + /// + /// Unlike {@link headers}, whose values are decoded as ISO-8859-1 strings (matching the Fetch + /// API), this exposes the exact value bytes received on the wire. Use it when a header carries + /// UTF-8 (e.g. a `Content-Disposition` filename) or when verifying a header signature/HMAC, + /// where the precise bytes matter: + /// + /// @example + /// ```ts + /// const [, raw] = response.rawHeaders.find(([k]) => k.toLowerCase() === 'content-disposition'); + /// const value = new TextDecoder('utf-8').decode(raw); + /// ``` + /// + /// Header names are lowercased and the original wire order is not preserved (the underlying + /// HTTP client normalizes header names into a map); duplicate values for a name are kept. This + /// is an impit extension; the standard Fetch `Response` has no raw-header accessor. + #[napi( + getter, + js_name = "rawHeaders", + ts_return_type = "Array<[string, Uint8Array]>" + )] + pub fn raw_headers(&self) -> Vec<(String, Uint8Array)> { + self + .raw_header_pairs + .iter() + .map(|(name, value)| (name.clone(), Uint8Array::from(value.clone()))) + .collect() + } + fn get_inner_response(&self, env: &Env, mut this: This) -> Result> { let cached_response = this.get::(INNER_RESPONSE_PROPERTY_NAME)?; diff --git a/impit-node/test/basics.test.ts b/impit-node/test/basics.test.ts index 7f7df808..15813088 100644 --- a/impit-node/test/basics.test.ts +++ b/impit-node/test/basics.test.ts @@ -571,6 +571,29 @@ describe.each([ t.expect(response.headers.get('x-non-ascii')).toBe(routes.nonAsciiHeader.headerValue); }); + test('raw header bytes preserve the exact wire value while the string stays ISO-8859-1 (Fetch-style)', async (t) => { + const response = await impit.fetch(new URL(routes.utf8Header.path, "http://127.0.0.1:3001").href); + + // Fetch semantics: the string form is ISO-8859-1, so a UTF-8 value reads back as mojibake. + const latin1 = response.headers.get('x-utf8'); + t.expect(latin1).not.toBe(routes.utf8Header.headerValue); + + // rawHeaders exposes the exact value bytes, which decode to the real UTF-8 value. + const rawPair = response.rawHeaders.find(([k]) => k.toLowerCase() === 'x-utf8'); + t.expect(rawPair).toBeDefined(); + const rawBytes = Buffer.from(rawPair![1]); + t.expect(rawBytes.toString('utf8')).toBe(routes.utf8Header.headerValue); + + // The ISO-8859-1 string also round-trips back to those exact bytes (the standard Fetch workaround). + t.expect(Buffer.from(latin1!, 'latin1').equals(rawBytes)).toBe(true); + + // rawHeaders survives clone() (clone() returns a Fetch Response augmented by impit). + const cloned = response.clone() as unknown as { rawHeaders: Array<[string, Uint8Array]> }; + const clonedPair = cloned.rawHeaders.find(([k]) => k.toLowerCase() === 'x-utf8'); + t.expect(clonedPair).toBeDefined(); + t.expect(Buffer.from(clonedPair![1]).equals(rawBytes)).toBe(true); + }); + test('.json() method works', async (t) => { const response = await impit.fetch(getHttpBinUrl('/json')); const json = await response.json(); diff --git a/impit-node/test/mock.server.ts b/impit-node/test/mock.server.ts index 3c2ddf6f..1ebba4bb 100644 --- a/impit-node/test/mock.server.ts +++ b/impit-node/test/mock.server.ts @@ -24,6 +24,10 @@ export const routes = { path: '/non-ascii-header', headerValue: 'Dienstag, 31. März 2026', }, + utf8Header: { + path: '/utf8-header', + headerValue: 'attachment; filename="naïve.pdf"', + }, } function parseMultipart(body: Buffer, boundary: string): Record { @@ -117,6 +121,22 @@ export async function runServer(port: number): Promise { socket.end(); }); + app.get(routes.utf8Header.path, (req, res) => { + const socket = res.socket!; + socket.write('HTTP/1.1 200 OK\r\n'); + socket.write('Content-Type: text/plain\r\n'); + // Header value carrying UTF-8 bytes (the ï is 0xC3 0xAF). + socket.write(Buffer.concat([ + Buffer.from('X-Utf8: '), + Buffer.from(routes.utf8Header.headerValue, 'utf-8'), + Buffer.from('\r\n'), + ])); + socket.write('Content-Length: 2\r\n'); + socket.write('\r\n'); + socket.write('ok'); + socket.end(); + }); + app.get('/socket', (req, res) => { const socket = req.socket; const clientAddress = socket.remoteAddress; diff --git a/impit-python/python/impit/impit.pyi b/impit-python/python/impit/impit.pyi index 7be1fbf1..06fb6dcd 100644 --- a/impit-python/python/impit/impit.pyi +++ b/impit-python/python/impit/impit.pyi @@ -180,6 +180,22 @@ class Response: print(response.headers) # {'content-type': 'text/html; charset=utf-8', ... } """ + @property + def raw_headers(self) -> list[tuple[bytes, bytes]]: + """Raw, undecoded header name/value pairs as ``(bytes, bytes)``. + + Similar to httpx's ``Response.headers.raw``, but note two differences imposed by the + underlying HTTP client: header names are normalized to lowercase and the original wire + order is not preserved (duplicate values for a name are kept). Header *values* are the + exact bytes received - useful when a header carries UTF-8 or when verifying a header + signature/HMAC. + + .. code-block:: python + + response = await client.get("https://crawlee.dev") + print(response.raw_headers) # [(b'content-type', b'text/html; charset=utf-8'), ... ] + """ + text: str """Response body as text. Decoded from :attr:`content` using :attr:`encoding`. diff --git a/impit-python/src/response.rs b/impit-python/src/response.rs index a5cb5a15..2f7b0685 100644 --- a/impit-python/src/response.rs +++ b/impit-python/src/response.rs @@ -5,8 +5,12 @@ use tokio::sync::Mutex as AsyncMutex; use bytes::Bytes; use encoding::label::encoding_from_whatwg_label; use futures::{Stream, StreamExt}; -use impit::{errors::ImpitError, utils::ContentType}; +use impit::{ + errors::ImpitError, + utils::{decode_header_value, ContentType}, +}; use pyo3::prelude::*; +use pyo3::types::PyBytes; use reqwest::{Response, StatusCode, Version}; use std::pin::Pin; @@ -223,6 +227,9 @@ pub struct ImpitPyResponse { content: Option>, inner: Option, inner_state: InnerResponseState, + // Raw, undecoded header name/value byte pairs (values exact; names lowercased, order not the + // original wire order - see the `raw_headers` getter docs). Exposed via the `raw_headers` getter. + raw_headers: Vec<(Vec, Vec)>, } #[pymethods] @@ -238,6 +245,12 @@ impl ImpitPyResponse { ) -> Self { let headers = headers.unwrap_or_default(); + // No wire bytes for a manually constructed response; use the UTF-8 bytes of the strings. + let raw_headers: Vec<(Vec, Vec)> = headers + .iter() + .map(|(k, v)| (k.clone().into_bytes(), v.clone().into_bytes())) + .collect(); + let encoding = match headers .iter() .find(|(k, _)| k.to_lowercase() == "content-type") @@ -267,6 +280,7 @@ impl ImpitPyResponse { content: Some(content.unwrap_or_default()), inner: None, inner_state: InnerResponseState::Read, + raw_headers, } } @@ -439,6 +453,21 @@ impl ImpitPyResponse { Ok(()) } + /// Raw, undecoded header name/value pairs as `(bytes, bytes)`. Similar to httpx's + /// `Response.headers.raw`, but note two differences imposed by the underlying HTTP client: + /// header names are normalized to lowercase and the original wire order is not preserved + /// (duplicate values for a name are kept). Header *values* are the exact bytes received. + /// + /// Unlike `headers` (str values decoded UTF-8-first), this returns the exact value bytes, for + /// callers that need them - e.g. verifying a header signature/HMAC. + #[getter] + fn raw_headers<'py>(&self, py: Python<'py>) -> Vec<(Bound<'py, PyBytes>, Bound<'py, PyBytes>)> { + self.raw_headers + .iter() + .map(|(name, value)| (PyBytes::new(py, name), PyBytes::new(py, value))) + .collect() + } + #[getter] fn content(&mut self, py: Python<'_>) -> PyResult> { self.read(py) @@ -536,12 +565,18 @@ impl ImpitPyResponse { _ => "Unknown".to_string(), }; let is_redirect = val.status().is_redirection(); - let headers = HashMap::from_iter(val.headers().iter().map(|(k, v)| { - ( - k.as_str().to_string(), - v.as_bytes().iter().map(|&b| b as char).collect::(), - ) - })); + // Python/httpx semantics: decode header values UTF-8-first with an ISO-8859-1 fallback. + let headers = HashMap::from_iter( + val.headers() + .iter() + .map(|(k, v)| (k.as_str().to_string(), decode_header_value(v.as_bytes()))), + ); + // Exact wire bytes for callers that need them (httpx `Headers.raw` equivalent). + let raw_headers: Vec<(Vec, Vec)> = val + .headers() + .iter() + .map(|(k, v)| (k.as_str().as_bytes().to_vec(), v.as_bytes().to_vec())) + .collect(); let content_type_charset = headers .get("content-type") @@ -597,6 +632,7 @@ impl ImpitPyResponse { is_stream_consumed, inner_state, inner, + raw_headers, }) } } diff --git a/impit-python/test/async_client_test.py b/impit-python/test/async_client_test.py index 8a0b11b5..6ea26e5b 100644 --- a/impit-python/test/async_client_test.py +++ b/impit-python/test/async_client_test.py @@ -46,6 +46,39 @@ def truncating_server(port_holder: list[int]) -> None: server.close() +def header_encoding_server(port_holder: list[int]) -> None: + """Send a response carrying a UTF-8 header value and a lone ISO-8859-1 byte.""" + server = socket.socket(socket.AF_INET6, socket.SOCK_STREAM) + server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + server.setsockopt(socket.IPPROTO_IPV6, socket.IPV6_V6ONLY, 0) + server.bind(('::', 0)) + port_holder[0] = server.getsockname()[1] + server.listen(1) + + conn, _ = server.accept() + conn.recv(1024) + body = b'ok' + response = b''.join( + [ + b'HTTP/1.1 200 OK\r\n', + b'Content-Type: text/plain\r\n', + b'X-Utf8: ', + 'attachment; filename="naïve.pdf"'.encode(), + b'\r\n', + b'X-Latin1: M', + bytes([0xE4]), # 'a' with diaeresis in ISO-8859-1; not valid UTF-8 on its own + b'rz\r\n', + b'Content-Length: ', + str(len(body)).encode(), + b'\r\n\r\n', + body, + ] + ) + conn.send(response) + conn.close() + server.close() + + @pytest.mark.asyncio @pytest.mark.parametrize( ('browser', 'ja4'), @@ -425,6 +458,29 @@ async def test_local_address(self, browser: Browser, addresses: tuple[str, str]) assert response.status_code == 200 thread.join() + @pytest.mark.asyncio + async def test_header_value_decoding_and_raw_bytes(self, browser: Browser) -> None: + port_holder = [0] + thread = threading.Thread(target=header_encoding_server, args=(port_holder,)) + thread.start() + await asyncio.sleep(0.1) + + impit = AsyncClient(browser=browser) + response = await impit.get(f'http://127.0.0.1:{port_holder[0]}/', timeout=5) + thread.join() + + utf8_value = 'attachment; filename="naïve.pdf"' + + # Python follows httpx semantics: a UTF-8 header value decodes correctly as str... + assert response.headers['x-utf8'] == utf8_value + # ...and a lone non-UTF-8 byte falls back to ISO-8859-1. + assert response.headers['x-latin1'] == 'März' + + # raw_headers exposes the exact wire bytes (httpx Headers.raw equivalent). + raw = dict(response.raw_headers) + assert raw[b'x-utf8'] == utf8_value.encode('utf-8') + assert raw[b'x-latin1'] == b'M' + bytes([0xE4]) + b'rz' + @pytest.mark.parametrize( ('browser'), diff --git a/impit-python/test/response_test.py b/impit-python/test/response_test.py index ba09610f..a72af162 100644 --- a/impit-python/test/response_test.py +++ b/impit-python/test/response_test.py @@ -40,6 +40,19 @@ def test_response_constructor_with_headers() -> None: assert response.headers['Content-Type'] == 'application/json' +def test_response_raw_headers() -> None: + # raw_headers exposes header name/value pairs as exact bytes (httpx Headers.raw equivalent). + response = Response(200, headers={'Content-Type': 'application/json', 'X-Unicode': 'naïve'}) + + raw = response.raw_headers + + assert isinstance(raw, list) + assert all(isinstance(k, bytes) and isinstance(v, bytes) for k, v in raw) + assert (b'Content-Type', b'application/json') in raw + # A non-ASCII value is preserved as its exact UTF-8 bytes. + assert (b'X-Unicode', 'naïve'.encode()) in raw + + def test_response_headers_encoding() -> None: response = Response( 200, headers={'Content-Type': 'text/plain; charset=cp1250'}, content=b'\x9e\x64\xe1\xf8\x65\x6e\xed' diff --git a/impit/src/lib.rs b/impit/src/lib.rs index 0af8cae7..463d42e4 100644 --- a/impit/src/lib.rs +++ b/impit/src/lib.rs @@ -78,6 +78,7 @@ pub mod fingerprint; /// Various utility functions and types. pub mod utils { pub use crate::response_parsing::decode; + pub use crate::response_parsing::decode_header_value; pub use crate::response_parsing::determine_encoding; pub use crate::response_parsing::ContentType; pub use encoding::all as encodings; diff --git a/impit/src/response_parsing/mod.rs b/impit/src/response_parsing/mod.rs index 9874e1de..d8eed727 100644 --- a/impit/src/response_parsing/mod.rs +++ b/impit/src/response_parsing/mod.rs @@ -133,6 +133,36 @@ pub fn determine_encoding(bytes: &[u8]) -> Option { None } +/// Decodes an HTTP header value into a [`String`]. +/// +/// Header values arrive as raw bytes with no charset declaration. Per RFC 9110 §5.5 they are +/// nominally ISO-8859-1 (the `obs-text` range), but in practice modern servers routinely send +/// UTF-8 (for example `Content-Disposition: attachment; filename="naïve.pdf"`). +/// +/// This function decodes the bytes as UTF-8 when they form valid UTF-8, and otherwise falls back +/// to a byte-for-byte ISO-8859-1 decode (each byte `0x00..=0xFF` maps to the code point +/// `U+0000..=U+00FF`). This fixes the common UTF-8 case without corrupting genuine ISO-8859-1 +/// values, never fails, and never emits `U+FFFD` replacement characters — so no header value can +/// crash a caller or come back empty. +/// +/// ### Example +/// +/// ```rust +/// use impit::utils::decode_header_value; +/// +/// // Valid UTF-8 is decoded as UTF-8 (the ï is the two UTF-8 bytes 0xC3 0xAF). +/// assert_eq!(decode_header_value(&[b'n', b'a', 0xC3, 0xAF, b'v', b'e']), "naïve"); +/// +/// // A lone 0xE4 is not valid UTF-8, so it falls back to ISO-8859-1 ('ä'). +/// assert_eq!(decode_header_value(&[b'M', 0xE4, b'r', b'z']), "März"); +/// ``` +pub fn decode_header_value(bytes: &[u8]) -> String { + match std::str::from_utf8(bytes) { + Ok(valid) => valid.to_owned(), + Err(_) => bytes.iter().map(|&b| b as char).collect(), + } +} + /// A struct that represents the contents of the `Content-Type` header. /// /// The struct is used to extract the charset from the `Content-Type` header and convert it to an [`encoding::EncodingRef`]. @@ -173,3 +203,48 @@ impl From for Option { encoding::label::encoding_from_whatwg_label(val.charset.as_str()) } } + +#[cfg(test)] +mod tests { + use super::decode_header_value; + + #[test] + fn ascii_is_unchanged() { + assert_eq!(decode_header_value(b"application/json"), "application/json"); + } + + #[test] + fn empty_is_empty() { + assert_eq!(decode_header_value(b""), ""); + } + + #[test] + fn utf8_is_decoded_as_utf8() { + // "naïve.pdf" — the ï is UTF-8 bytes 0xC3 0xAF (issue #479). + let bytes = "attachment; filename=\"naïve.pdf\"".as_bytes(); + assert_eq!( + decode_header_value(bytes), + "attachment; filename=\"naïve.pdf\"" + ); + } + + #[test] + fn invalid_utf8_falls_back_to_iso_8859_1() { + // Lone 0xE4 ('ä' in ISO-8859-1) is not valid UTF-8 (PR #434 / issue #430). + let bytes = [ + b'D', b'i', b'e', b'n', b's', b't', b'a', b'g', b',', b' ', b'3', b'1', b'.', b' ', + b'M', 0xE4, b'r', b'z', b' ', b'2', b'0', b'2', b'6', + ]; + assert_eq!(decode_header_value(&bytes), "Dienstag, 31. März 2026"); + } + + #[test] + fn iso_8859_1_fallback_never_produces_replacement_char() { + // Every non-UTF-8 byte maps to exactly one char, so the result round-trips back to bytes. + let bytes = [0xE4, 0xF6, 0xFC, 0xFF]; + let decoded = decode_header_value(&bytes); + assert!(!decoded.contains('\u{FFFD}')); + let roundtrip: Vec = decoded.chars().map(|c| c as u8).collect(); + assert_eq!(roundtrip, bytes); + } +}