From 7b1f505090be3a686d5672ada12c2f09a5860ab0 Mon Sep 17 00:00:00 2001 From: Adrian Chaves Date: Wed, 10 Jun 2026 17:12:01 +0200 Subject: [PATCH 1/2] Add multi-part support --- docs/api.rst | 4 + docs/usage.rst | 48 +++++++++-- form2request/__init__.py | 3 +- form2request/_base.py | 98 ++++++++++++++++------- tests/test_main.py | 168 ++++++++++++++++++++++++++++++++------- 5 files changed, 255 insertions(+), 66 deletions(-) diff --git a/docs/api.rst b/docs/api.rst index 18f8d4b..574bfb8 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -7,3 +7,7 @@ API reference .. autoclass:: form2request.Request :members: :undoc-members: + +.. autoclass:: form2request.FileField + :members: + :undoc-members: diff --git a/docs/usage.rst b/docs/usage.rst index d50507c..7e8413d 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -32,9 +32,8 @@ output to build requests with any HTTP client software. It also provides :func:`~form2request.form2request` supports :ref:`user-defined form data -`, :ref:`choosing a specific submit button (or none) `, and -:ref:`overriding form attributes `. - +`, :ref:`file uploads `, :ref:`choosing a specific submit button +(or none) `, and :ref:`overriding form attributes `. .. _form: @@ -93,7 +92,6 @@ ML-based solution that can can automatically find a form of a specified type :ref:`submit button `. Its :ref:`formasaurus:usage` documentation includes an example featuring form2request. - .. _data: Setting form data @@ -142,6 +140,46 @@ To remove a field value, set it to ``None``: >>> form2request(form, {"foo": None}) Request(url='https://example.com', method='GET', headers=[], body=b'') +.. _uploads: + +Uploading files +=============== + +Forms that upload files use ``enctype="multipart/form-data"``. Pass a +:class:`~form2request.FileField` instance as the value for any file input +field: + +>>> from form2request import FileField, form2request +>>> html = b""" +...
+... +... +...
""" +>>> selector = Selector(body=html, base_url="https://example.com") +>>> form = selector.css("form") +>>> request_data = form2request(form, { +... "description": "quarterly report", +... "attachment": FileField( +... content=b"col1,col2\n1,2\n", +... filename="report.csv", +... content_type="text/csv", +... ), +... }) +>>> request_data.method +'POST' +>>> request_data.url +'https://example.com' +>>> request_data.headers[0][1].startswith("multipart/form-data") +True + +The ``filename`` and ``content_type`` arguments of +:class:`~form2request.FileField` are optional. When omitted, ``filename`` +defaults to an empty string and ``content_type`` defaults to +``application/octet-stream``. + +For non-file fields, :func:`~form2request.form2request` handles encoding +automatically — regular text fields are sent as plain text parts within the +multipart body, without needing any special wrapping. .. _click: @@ -188,7 +226,6 @@ To change that, set ``click`` to the element that should be clicked: >>> form2request(form, click=submit_baz) Request(url='https://example.com?foo=baz', method='GET', headers=[], body=b'') - .. _override: Overriding form attributes @@ -202,7 +239,6 @@ You can override the method_ and enctype_ attributes of a form: >>> form2request(form, method="POST", enctype="text/plain") Request(url='https://example.com', method='POST', headers=[('Content-Type', 'text/plain')], body=b'foo=bar') - .. _request: Using request data diff --git a/form2request/__init__.py b/form2request/__init__.py index f2cb5c7..45718a5 100644 --- a/form2request/__init__.py +++ b/form2request/__init__.py @@ -1,8 +1,9 @@ """Build HTTP requests out of HTML forms.""" -from ._base import Request, form2request +from ._base import FileField, Request, form2request __all__ = [ + "FileField", "Request", "form2request", ] diff --git a/form2request/_base.py b/form2request/_base.py index fb2dcd2..a7e9b3f 100644 --- a/form2request/_base.py +++ b/form2request/_base.py @@ -11,6 +11,7 @@ cast, ) from urllib.parse import urlencode, urljoin, urlsplit, urlunsplit +import uuid from parsel import Selector, SelectorList from w3lib.html import strip_html5_whitespace @@ -18,7 +19,17 @@ if TYPE_CHECKING: from lxml.html import FormElement, HtmlElement -FormdataVType = Union[str, Iterable[str]] + +@dataclass +class FileField: + """A file upload value for use with multipart/form-data forms.""" + + content: bytes + filename: str = "" + content_type: str = "application/octet-stream" + + +FormdataVType = Union[str, FileField, Iterable[str]] FormdataKVType = tuple[str, FormdataVType] FormdataType = Optional[Union[dict[str, FormdataVType], Iterable[FormdataKVType]]] @@ -38,7 +49,11 @@ def _enctype( ) -> str: if enctype: enctype = enctype.lower() - if enctype not in {"application/x-www-form-urlencoded", "text/plain"}: + if enctype not in { + "application/x-www-form-urlencoded", + "text/plain", + "multipart/form-data", + }: raise ValueError( f"The specified form enctype ({enctype!r}) is not supported " f"for forms with the POST method." @@ -46,19 +61,9 @@ def _enctype( elif click_element is not None and ( enctype := (click_element.get("formenctype") or "").lower() ): - if enctype == "multipart/form-data": - raise NotImplementedError( - f"{click_element} has formenctype set to {enctype!r}, which " - f"form2request does not currently support for forms with the " - f"POST method." - ) - elif ( - enctype := (form.get("enctype") or "").lower() - ) and enctype == "multipart/form-data": - raise NotImplementedError( - f"{form} has enctype set to {enctype!r}, which form2request does " - f"not currently support for forms with the POST method." - ) + pass + elif enctype := (form.get("enctype") or "").lower(): + pass return enctype @@ -131,7 +136,7 @@ def _click_element( def _data( form: FormElement, data: FormdataType, click_element: HtmlElement | None -) -> list[tuple[str, str]]: +) -> list[tuple[str, str | FileField]]: data = data or {} if click_element is not None and (name := click_element.get("name")): click_data = (name, cast("str", click_element.get("value"))) @@ -171,10 +176,35 @@ def _data( return [ (k, v) for k, vs in values - for v in ([vs] if isinstance(vs, (str, bytes)) else vs) + for v in ([vs] if isinstance(vs, (str, bytes, FileField)) else vs) ] +def _build_multipart_body( + data: list[tuple[str, str | FileField]], boundary: str +) -> bytes: + parts = [] + for name, value in data: + if isinstance(value, FileField): + filename_part = f'; filename="{value.filename}"' if value.filename else "" + header = ( + f"--{boundary}\r\n" + f'Content-Disposition: form-data; name="{name}"{filename_part}\r\n' + f"Content-Type: {value.content_type}\r\n" + f"\r\n" + ).encode() + parts.append(header + value.content + b"\r\n") + else: + header = ( + f"--{boundary}\r\n" + f'Content-Disposition: form-data; name="{name}"\r\n' + f"\r\n" + ).encode() + parts.append(header + value.encode() + b"\r\n") + parts.append(f"--{boundary}--\r\n".encode()) + return b"".join(parts) + + @dataclass class Request: """HTTP request data.""" @@ -282,23 +312,33 @@ def form2request( click_element = _click_element(form_el, click) url = _url(form_el, click_element) method = _method(form_el, click_element, method) - headers = [] - body = "" data = _data(form_el, data, click_element) if method == "GET": url = urlunsplit(urlsplit(url)._replace(query=urlencode(data, doseq=True))) - else: - assert method == "POST" - enctype = _enctype(form_el, click_element, enctype) - if enctype == "text/plain": - headers = [("Content-Type", "text/plain")] - body = "\n".join(f"{k}={v}" for k, v in data) - else: - headers = [("Content-Type", "application/x-www-form-urlencoded")] - body = urlencode(data, doseq=True) + return Request(url=url, method=method, headers=[], body=b"") + assert method == "POST" + enctype = _enctype(form_el, click_element, enctype) + if enctype == "multipart/form-data": + boundary = uuid.uuid4().hex + headers = [("Content-Type", f'multipart/form-data; boundary="{boundary}"')] + return Request( + url=url, + method=method, + headers=headers, + body=_build_multipart_body(data, boundary), + ) + if enctype == "text/plain": + body = "\n".join(f"{k}={v}" for k, v in data) + return Request( + url=url, + method=method, + headers=[("Content-Type", "text/plain")], + body=body.encode(), + ) + body = urlencode(data, doseq=True) return Request( url=url, method=method, - headers=headers, + headers=[("Content-Type", "application/x-www-form-urlencoded")], body=body.encode(), ) diff --git a/tests/test_main.py b/tests/test_main.py index 79a60dd..d2fb8f1 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -1,8 +1,33 @@ +import email + import pytest from lxml.html import fromstring from parsel import Selector -from form2request import Request, form2request +from form2request import FileField, Request, form2request + + +def _parse_multipart(request): + """Parse a multipart/form-data Request body. + + Returns a list of dicts with keys: name, filename, content_type, content. + """ + ct = next(v for k, v in request.headers if k == "Content-Type") + raw = f"Content-Type: {ct}\r\n\r\n".encode() + request.body + msg = email.message_from_bytes(raw) + if not msg.is_multipart(): + return [] + parts = [] + for part in msg.get_payload(): + parts.append( + { + "name": part.get_param("name", header="content-disposition"), + "filename": part.get_param("filename", header="content-disposition"), + "content_type": part.get_content_type(), + "content": part.get_payload(decode=True), + } + ) + return parts @pytest.mark.parametrize( @@ -226,14 +251,6 @@ "foo", ) ), - # multipart/form-data raises a NotImplementedError exception when the - # method is POST. - ( - "https://example.com", - b"""
""", - {}, - NotImplementedError, - ), # multipart/form-data does work when method is GET (default). ( "https://example.com", @@ -273,21 +290,8 @@ b"", ), ), - ( - "https://example.com", - b"""
-
""", - {}, - NotImplementedError, - ), # enctype may be overridden, in which case it raises ValueError for - # both unknown and unsupported values when method is POST. - ( - "https://example.com", - b"""
""", - {"enctype": "multipart/form-data"}, - ValueError, - ), + # unknown values when method is POST. ( "https://example.com", b"""
""", @@ -769,13 +773,6 @@ b"a+ /=b+ /\nc+ /=d+ /", ), ), - ( - "https://example.com", - b"""
-
""", - {}, - NotImplementedError, - ), ], ) def test_form2request(base_url, html, kwargs, expected): @@ -820,3 +817,114 @@ def test_form2request_parsel(): assert form2request(form, click=submit_baz) == expected assert form2request(form, click=submit_baz[0]) == expected assert form2request(form, click=submit_baz[0].root) == expected + + +def _multipart_request(html, **kwargs): + root = fromstring(html, base_url="https://example.com") + form = root.xpath("//form")[0] + return form2request(form, **kwargs) + + +def test_multipart_empty_form(): + request = _multipart_request( + b'
' + ) + assert request.method == "POST" + assert request.url == "https://example.com" + ct = next(v for k, v in request.headers if k == "Content-Type") + assert ct.startswith("multipart/form-data; boundary=") + assert _parse_multipart(request) == [] + + +def test_multipart_text_fields(): + request = _multipart_request( + b"""
+ + +
""" + ) + parts = _parse_multipart(request) + assert parts == [ + {"name": "a", "filename": None, "content_type": "text/plain", "content": b"hello"}, + {"name": "b", "filename": None, "content_type": "text/plain", "content": b"world"}, + ] + + +def test_multipart_file_field(): + request = _multipart_request( + b"""
+ +
""", + data={"upload": FileField(content=b"file content", filename="test.txt", content_type="text/plain")}, + ) + parts = _parse_multipart(request) + assert len(parts) == 1 + assert parts[0]["name"] == "upload" + assert parts[0]["filename"] == "test.txt" + assert parts[0]["content_type"] == "text/plain" + assert parts[0]["content"] == b"file content" + + +def test_multipart_file_field_default_content_type(): + request = _multipart_request( + b'
', + data={"f": FileField(content=b"\x00\x01\x02", filename="data.bin")}, + ) + parts = _parse_multipart(request) + assert parts[0]["content_type"] == "application/octet-stream" + assert parts[0]["content"] == b"\x00\x01\x02" + + +def test_multipart_mixed_fields(): + request = _multipart_request( + b"""
+ +
""", + data={"attachment": FileField(content=b"data", filename="a.bin")}, + ) + parts = _parse_multipart(request) + assert len(parts) == 2 + names = {p["name"] for p in parts} + assert names == {"note", "attachment"} + + +def test_multipart_formenctype_button(): + # formenctype="multipart/form-data" on the submit button triggers multipart. + request = _multipart_request( + b"""
+ + +
""" + ) + ct = next(v for k, v in request.headers if k == "Content-Type") + assert ct.startswith("multipart/form-data; boundary=") + parts = _parse_multipart(request) + assert parts[0]["name"] == "x" + assert parts[0]["content"] == b"y" + + +def test_multipart_formenctype_case_insensitive(): + request = _multipart_request( + b"""
+ +
""" + ) + ct = next(v for k, v in request.headers if k == "Content-Type") + assert ct.startswith("multipart/form-data; boundary=") + + +def test_multipart_enctype_override(): + # enctype parameter override to multipart/form-data. + request = _multipart_request( + b'
', + enctype="multipart/form-data", + ) + ct = next(v for k, v in request.headers if k == "Content-Type") + assert ct.startswith("multipart/form-data; boundary=") + parts = _parse_multipart(request) + assert parts[0] == { + "name": "k", + "filename": None, + "content_type": "text/plain", + "content": b"v", + } From 9ccea86ca5fc73105001712b6e8b581c4bb5d9c8 Mon Sep 17 00:00:00 2001 From: Adrian Chaves Date: Wed, 10 Jun 2026 17:50:51 +0200 Subject: [PATCH 2/2] Address issues --- form2request/_base.py | 23 +++++++++-------------- tests/test_main.py | 41 +++++++++++++++++++++++++++-------------- 2 files changed, 36 insertions(+), 28 deletions(-) diff --git a/form2request/_base.py b/form2request/_base.py index a7e9b3f..43e9f6d 100644 --- a/form2request/_base.py +++ b/form2request/_base.py @@ -1,5 +1,6 @@ from __future__ import annotations +import uuid from collections.abc import Iterable from dataclasses import dataclass from typing import ( @@ -11,7 +12,6 @@ cast, ) from urllib.parse import urlencode, urljoin, urlsplit, urlunsplit -import uuid from parsel import Selector, SelectorList from w3lib.html import strip_html5_whitespace @@ -58,13 +58,12 @@ def _enctype( f"The specified form enctype ({enctype!r}) is not supported " f"for forms with the POST method." ) - elif click_element is not None and ( - enctype := (click_element.get("formenctype") or "").lower() - ): - pass - elif enctype := (form.get("enctype") or "").lower(): + elif ( + click_element is not None + and (enctype := (click_element.get("formenctype") or "").lower()) + ) or (enctype := (form.get("enctype") or "").lower()): pass - return enctype + return enctype or "" def _url(form: FormElement, click_element: HtmlElement | None) -> str: @@ -196,9 +195,7 @@ def _build_multipart_body( parts.append(header + value.content + b"\r\n") else: header = ( - f"--{boundary}\r\n" - f'Content-Disposition: form-data; name="{name}"\r\n' - f"\r\n" + f'--{boundary}\r\nContent-Disposition: form-data; name="{name}"\r\n\r\n' ).encode() parts.append(header + value.encode() + b"\r\n") parts.append(f"--{boundary}--\r\n".encode()) @@ -248,11 +245,9 @@ def to_requests(self, **kwargs: Any): return request.prepare() def to_scrapy(self, callback: Callable, **kwargs: Any): - """Convert the request to :class:`scrapy.Request - `. + """Convert the request to :class:`scrapy.Request`. - All *kwargs* are passed to :class:`scrapy.Request - ` as is. + All *kwargs* are passed to :class:`scrapy.Request` as is. """ import scrapy diff --git a/tests/test_main.py b/tests/test_main.py index d2fb8f1..56d8c3e 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -17,17 +17,16 @@ def _parse_multipart(request): msg = email.message_from_bytes(raw) if not msg.is_multipart(): return [] - parts = [] - for part in msg.get_payload(): - parts.append( - { - "name": part.get_param("name", header="content-disposition"), - "filename": part.get_param("filename", header="content-disposition"), - "content_type": part.get_content_type(), - "content": part.get_payload(decode=True), - } - ) - return parts + return [ + { + "name": part.get_param("name", header="content-disposition"), + "filename": part.get_param("filename", header="content-disposition"), + "content_type": part.get_content_type(), + "content": part.get_payload(decode=True), + } + for part in msg.get_payload() + if isinstance(part, email.message.Message) + ] @pytest.mark.parametrize( @@ -845,8 +844,18 @@ def test_multipart_text_fields(): ) parts = _parse_multipart(request) assert parts == [ - {"name": "a", "filename": None, "content_type": "text/plain", "content": b"hello"}, - {"name": "b", "filename": None, "content_type": "text/plain", "content": b"world"}, + { + "name": "a", + "filename": None, + "content_type": "text/plain", + "content": b"hello", + }, + { + "name": "b", + "filename": None, + "content_type": "text/plain", + "content": b"world", + }, ] @@ -855,7 +864,11 @@ def test_multipart_file_field(): b"""
""", - data={"upload": FileField(content=b"file content", filename="test.txt", content_type="text/plain")}, + data={ + "upload": FileField( + content=b"file content", filename="test.txt", content_type="text/plain" + ) + }, ) parts = _parse_multipart(request) assert len(parts) == 1