diff --git a/.gitignore b/.gitignore
index e2da7cc59..7301e0ec2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -92,5 +92,10 @@ cert.txt
 # local MCP server config (contains auth tokens)
 .mcp.json
 mcp-server/create-agent-key.ts
-.gcloudignore
+
+# Claude
 CLAUDE.md
+
+#gcloud
+.gcloudignore
+
diff --git a/docs/lobbying-disclosure-ingestion.md b/docs/lobbying-disclosure-ingestion.md
new file mode 100644
index 000000000..99c9652a7
--- /dev/null
+++ b/docs/lobbying-disclosure-ingestion.md
@@ -0,0 +1,1038 @@
+# Lobbying Disclosure Ingestion Pipeline
+
+## Overview
+
+The MA Secretary of State lobbying portal
+([sec.state.ma.us/LobbyistPublicSearch](https://www.sec.state.ma.us/LobbyistPublicSearch/))
+publishes semi-annual disclosure filings for all registered lobbyists and
+lobbying entities. This document describes the plan for scraping that data and
+storing it in Firestore in a way that allows joining to MAPLE bill data.
+
+The portal has three levels of pages:
+
+1. **Search page** → one row per registrant per year
+2. **Summary page** → registrant metadata + links to semi-annual disclosure
+   filings
+3. **CompleteDisclosure page** → per-client compensation table + per-client bill
+   activity tables
+
+Historical data goes back to 2005. MAPLE has bill data only from ~2020 onward,
+so bill joins will only resolve for filings from the 192nd General Court (2021)
+and later. All historical filings are ingested regardless.
+
+---
+
+## Terminology
+
+The portal has two registrant types:
+
+- **Lobbyist** — an individual person who lobbies directly on behalf of clients.
+- **Employer** — a lobbying firm that employs individual lobbyists and is
+  retained by clients. Called "Lobbyist Entity" on the portal.
+
+In both cases, the registrant reports compensation received from each **client**
+(the organization that hired them) and which bills they lobbied for that client.
+
+---
+
+## Firestore Data Model
+
+Two top-level collections, normalized by registrant and by lobbying activity
+record.
+
+### `/lobbyingRegistrants/{registrantId}`
+
+`registrantId` is a SHA-256 hash (first 40 hex chars) of
+`{entityName}_{year}`. Hashing avoids sanitizing arbitrary Unicode and
+punctuation in entity names to fit Firestore ID constraints while remaining
+stable and dedup-safe across runs.
+
+One model covers both individual lobbyists and lobbying firms. A separate model
+is not needed because the portal search returns both under the same schema, and
+per-filing detail pages do not expose which individual lobbyists within a firm
+worked on which bill.
+
+```typescript
+interface LobbyingRegistrant {
+  registrantId: string // SHA-256 hash of "{entityName}_{year}"
+  entityName: string // firm name or individual lobbyist name (raw portal value)
+  entityNameNorm: string // normalized form; see Normalization section
+  year: number
+  generalCourt: number // computed from year
+  regType: "Lobbyist" | "Employer"
+  clients: LobbyingClient[]
+  disclosureUrls: string[] // source portal URLs, for audit trail
+  fetchedAt: Timestamp
+}
+
+interface LobbyingClient {
+  clientName: string
+  clientNameNorm: string // normalized form
+  compensation: number | null
+}
+```
+
+**Example document** (`lobbyingRegistrants/0f11970cc6f17e35cc02685b794cb63a655c13b3`):
+
+```json
+{
+  "registrantId": "0f11970cc6f17e35cc02685b794cb63a655c13b3",
+  "entityName": "27 South Strategies, LLC",
+  "entityNameNorm": "27 SOUTH STRATEGIES",
+  "year": 2024,
+  "generalCourt": 193,
+  "regType": "Employer",
+  "clients": [
+    {
+      "clientName": "The Massachusetts International Festival of the Arts, Inc.",
+      "clientNameNorm": "MASSACHUSETTS INTERNATIONAL FESTIVAL OF ARTS",
+      "compensation": 24000.0
+    },
+    {
+      "clientName": "Veterinary Emergency Group, LLC",
+      "clientNameNorm": "VETERINARY EMERGENCY GROUP",
+      "compensation": 33000.0
+    },
+    {
+      "clientName": "Gobrands, Inc",
+      "clientNameNorm": "GOBRANDS",
+      "compensation": 60000.0
+    }
+  ],
+  "disclosureUrls": [
+    "https://www.sec.state.ma.us/LobbyistPublicSearch/CompleteDisclosure.aspx?sysvalue=hTpfuAXM..."
+  ]
+}
+```
+
+### `/lobbyingFilings/{filingId}`
+
+`filingId` is a SHA-256 hash (first 40 hex chars) of the logical key
+`{entityName}_{clientName}_{chamber}_{activityRef}_{generalCourt}`.
+
+```typescript
+type LobbyingChamber =
+  | "House Bill"
+  | "Senate Bill"
+  | "House Docket"
+  | "Senate Docket"
+  | "Executive" // lobbying of executive branch agencies
+  | "Other" // catch-all for rare legacy codes (FY, CMR, etc.)
+
+interface LobbyingFiling {
+  filingId: string
+  entityName: string // raw portal value
+  entityNameNorm: string // normalized form
+  clientName: string // raw portal value; "_total_salary_" sentinel for pre-2013
+  clientNameNorm: string // normalized form
+  year: number
+  generalCourt: number
+  chamber: LobbyingChamber
+  // For legislative chambers: the bill number string (e.g. "H1234", "HD56").
+  // For Executive: the agency name. Not a bill reference.
+  billId: string | null
+  activityTitle: string // bill title (legislative) or meeting description (executive)
+  position: string // "Support" | "Oppose" | "Neutral" | etc.; empty for executive
+  amount: number | null // compensation allocated to this activity
+  fetchedAt: Timestamp
+}
+```
+
+**Example documents** — query: `lobbyingFilings` where `generalCourt == 193` and `billId == "H1"`:
+
+```json
+[
+  {
+    "filingId": "0bac34faf2f624083daaaea57ee55f5364d2be29",
+    "entityName": "Christina Ascolillo",
+    "entityNameNorm": "CHRISTINA ASCOLILLO",
+    "clientName": "American Council of Engineering Companies of Massachusetts",
+    "clientNameNorm": "AMERICAN COUNCIL OF ENGINEERING COMPANIES OF MASSACHUSETTS",
+    "year": 2023,
+    "generalCourt": 193,
+    "chamber": "House Bill",
+    "billId": "H1",
+    "activityTitle": "An Act making appropriations for the Fiscal Year 2024...",
+    "position": "Neutral",
+    "amount": 0.0
+  },
+  {
+    "filingId": "109600251ca8fd279e8bb7562bc9d234c39f63a8",
+    "entityName": "Christina Ascolillo",
+    "entityNameNorm": "CHRISTINA ASCOLILLO",
+    "clientName": "St. Mary's Center for Women and Children, Inc.",
+    "clientNameNorm": "ST MARY S CENTER FOR WOMEN AND CHILDREN",
+    "year": 2023,
+    "generalCourt": 193,
+    "chamber": "House Bill",
+    "billId": "H1",
+    "activityTitle": "An Act making appropriations for the Fiscal Year 2024...",
+    "position": "Neutral",
+    "amount": 0.0
+  }
+]
+```
+
+### Constructing `billId` from Raw Portal Data
+
+The portal stores bill numbers as bare integers and records the chamber
+separately. The `billId` field — which maps to `Bill.id` in MAPLE — is
+constructed during ingest by combining chamber prefix and integer:
+
+| `chamber`       | Prefix | Example raw | `billId` |
+| --------------- | ------ | ----------- | -------- |
+| `House Bill`    | `H`    | `1234`      | `H1234`  |
+| `Senate Bill`   | `S`    | `1234`      | `S1234`  |
+| `House Docket`  | `HD`   | `56`        | `HD56`   |
+| `Senate Docket` | `SD`   | `56`        | `SD56`   |
+| `Executive`     | —      | agency name | `null`   |
+| `Other`         | —      | varies      | `null`   |
+
+Note: `H1234` and `S1234` are distinct bills even though they share the same
+integer. The prefix is required to disambiguate. `billId` is `null` for
+non-legislative chambers.
+
+#### Legacy chamber code normalization
+
+The portal uses short-form codes in older filings, normalized during ingest:
+
+| Raw value | Stored as     |
+| --------- | ------------- |
+| `HB`      | `House Bill`  |
+| `SB`      | `Senate Bill` |
+
+Rare codes (`FY`, `C`, `CMR`, `HR`, etc.) are stored as `Other`.
+
+### Joining to Bill Data
+
+**The join only applies to legislative chambers** (`House Bill`, `Senate Bill`,
+`House Docket`, `Senate Docket`) where `billId` is non-null. For `Executive`
+and `Other`, no join should be attempted.
+
+```typescript
+// Only valid when filing.billId !== null
+db.collection(`/generalCourts/${filing.generalCourt}/bills`).doc(filing.billId)
+```
+
+---
+
+## Entity Name Normalization
+
+The portal does not enforce consistent name formatting. The same client or
+registrant may appear as "Acme Corp.", "ACME CORPORATION", "Acme, Inc. d/b/a
+Acme Consulting", etc. across filings and years. Without normalization,
+grouping by entity is unreliable.
+
+Both `entityName` and `clientName` are normalized using the following pipeline,
+applied in order. The raw portal value is always preserved alongside the
+normalized form.
+
+### Normalization pipeline
+
+1. **Uppercase** — convert the entire string to upper case.
+2. **Strip d/b/a suffix** — remove everything from the first occurrence of
+   `D/B/A`, `D/B/A`, `DBA` (and spacing variants) onward, so the registered
+   name is used rather than a trade name.
+3. **Hyphen → space** — replace `-` with ` ` so `LAN-TEL` and `LAN TEL`
+   collapse to the same key.
+4. **Punctuation → space** — replace `,`, `.`, `'`, `'`, `'`, `(`, `)` with
+   space. Replacement with space (not empty string) prevents adjacent tokens
+   from concatenating (e.g. `,INC` becomes ` INC`, which is then caught by step
+   5).
+5. **Remove legal entity type words** — whole-word removal of: `LLC`, `LLP`,
+   `INC`, `INCORPORATED`, `CORPORATION`, `CORP`, `LTD`, `LIMITED`, `PC`,
+   `PLLC`.
+6. **Remove "THE"** — whole-word removal anywhere in the string (not just as a
+   leading prefix).
+7. **Ampersand → AND** — replace `&` with `AND`.
+8. **Fix known typo** — replace `ASSICIATES` with `ASSOCIATES` (legacy portal
+   data).
+9. **Remove professional suffix phrases** — whole-phrase removal of: `LAW
+OFFICE OF`, `AND ASSOCIATES`, `& ASSOCIATES`, `AND ASSOC`, `ATTORNEY AT
+LAW`, `ATTORNEY@LAW`, `ATTORNET AT LAW`, `AND PARTNERS`, `PUBLIC POLICY
+GROUP`, `LEGISLATIVE SERVICES`, `POLICY GROUP`, `ASSOCIATES`, `COUNSELLORS
+AT LAW`.
+10. **Collapse whitespace** — replace runs of whitespace with a single space and
+    strip leading/trailing whitespace.
+
+### Usage
+
+`entityNameNorm` and `clientNameNorm` are stored on every document and filing.
+They should be used for grouping, deduplication, and display-level matching.
+Raw names are preserved for provenance and audit.
+
+---
+
+## Deduplication and Amount Aggregation
+
+### Does lobbying the same bill multiple times mean we should sum amounts?
+
+The portal collects two semi-annual disclosure filings per registrant per year
+(one for each 6-month period). In theory, a registrant could report the same
+bill in both H1 and H2 filings with separate compensation amounts that should
+be summed. Analysis of the actual data shows this does not occur: after
+processing, zero rows share the same `(entityName, clientName, year,
+generalCourt, billId, position)` — each (registrant, client, bill, year)
+combination appears exactly once. The semi-annual periods report different
+activity, not the same activity twice.
+
+The same registrant can lobby the same bill across multiple General Courts
+(observed up to 6 times across years). These are stored as separate documents
+per `generalCourt` and should not be summed — each court is a distinct
+legislative session.
+
+### Null-bill row deduplication
+
+The one real duplication artifact in the portal data is **null-bill rows** —
+entries filed when a registrant had no specific bills to report for a client in
+a period. These appear in both the H1 and H2 disclosures as identical rows and
+should be collapsed. During ingest, if the same `(entityName, clientName, year,
+generalCourt, chamber, position)` with a null `billId` is encountered more than
+once, keep the row with the highest `amount` so no spend is lost if the two
+copies carry different values (in practice amounts are usually both zero).
+
+### Ingest strategy
+
+When processing multiple disclosure URLs for the same registrant+year, write
+`lobbyingFilings` documents using the logical key as the document ID. A
+subsequent disclosure URL that produces the same document ID will naturally
+upsert (overwrite) rather than duplicate. For null-bill rows, since `billId` is
+null, include `chamber` in the document ID to avoid false merges between
+executive and legislative null rows.
+
+---
+
+## Scraper Architecture
+
+### Why a standalone Cloud Run container
+
+The MA SoS portal is protected by Imperva WAF, which uses TLS fingerprinting to
+classify HTTP clients at the network layer before examining any headers. Node.js
+produces a TLS fingerprint that Imperva challenges with a JavaScript
+verification page; Python's `requests` library produces a fingerprint that
+Imperva allows through without challenge. This is a runtime-level constraint
+that cannot be addressed by header configuration or cipher reordering alone.
+
+The scraper therefore runs as a standalone **Cloud Run container** written in
+Python, deployed alongside the existing MCP server container. All data modeling,
+Firestore collection/field names, and normalization logic are documented here and
+kept consistent between the Python container and the TypeScript type definitions
+in `functions/src/lobbying/types.ts`.
+
+### Cloud Run container: `lobbying-scraper/`
+
+**Files:** `lobbying-scraper/{scrape,portal,normalize,writer}.py`
+
+- Scheduled weekly by Cloud Scheduler
+- Runs an incremental check: fetches the current and prior year's summary links
+  (one POST), compares disc URLs against the Firestore cursor, and **exits
+  immediately if nothing is new** (fast path, typically seconds)
+- When new or updated disclosures are found, fetches and processes them
+- Persists a cursor in `scrapers/lobbying`:
+  - `processedDiscUrls: string[]` — disc URLs already written; skipped on
+    re-runs
+  - `summaryDiscCache: {[summaryUrl]: string[]}` — maps summary page URLs to
+    their disc URLs so summary page GETs are skipped for prior-year registrants
+    whose disclosures are all already processed
+- For each new disclosure URL:
+  - Parse registrant + client compensation rows → upsert `lobbyingRegistrants`
+  - Parse bill activity rows → batch-write `lobbyingFilings`
+- 1s delay between requests; exponential backoff on transient failures
+
+### Incremental strategy
+
+In steady state (after the initial backfill), each weekly run:
+
+1. One POST to fetch all summary links for current + prior year
+2. For prior-year registrants with all disc URLs in the cursor: zero GETs
+3. For current-year registrants: one GET per summary page to check for new
+   disclosure periods
+4. For any new disc URLs: one GET per disclosure page
+
+New filings arrive twice a year (semi-annual reporting periods). Between
+periods, the run completes in under a minute.
+
+The backfill script (`--mode backfill`) uses a separate subcollection cursor at
+`scrapers/lobbyingBackfill/processedUrls/{urlHash}` so it does not interfere
+with the live scraper state.
+
+### Portal HTML format eras
+
+The `CompleteDisclosure.aspx` page has changed layout several times. The parser
+handles all four eras:
+
+| Era      | Years        | Compensation source                                   | Notes                                                                                               |
+| -------- | ------------ | ----------------------------------------------------- | --------------------------------------------------------------------------------------------------- |
+| Modern   | 2019–present | `grdvClientPaidToEntity` grid                         | Per-client rows; both employer and individual registrant variants                                   |
+| Hybrid   | 2014–2018    | `Panel1_N` div blocks                                 | Compensation in collapsible panels; bill activity in a separate grid                                |
+| Legacy B | 2009–2013    | "Compensation received" column in bill-activity table | Per-client amounts; deduplication required (same (client, amount) pair can appear in multiple rows) |
+| Legacy A | 2005–2008    | `grdvSalaryPaid` entity-total row                     | No per-client breakdown; stored under sentinel `clientName: "_total_salary_"`                       |
+
+Pre-2009 **individual** lobbyist summary pages link to `RegVersionLobbyist.aspx`
+rather than `CompleteDisclosure.aspx`. These produce no disclosure detail and are
+skipped — expected portal behavior. Employer/entity registrants use
+`CompleteDisclosure.aspx` correctly across all years.
+
+For Legacy A filings (2005–2008), `clientName: "_total_salary_"` signals to
+callers that per-client compensation is unavailable. No bill-level compensation
+amount is available for these years.
+
+---
+
+## New Files
+
+```
+functions/src/lobbying/
+  types.ts     — Runtypes schema definitions for LobbyingRegistrant, LobbyingFiling
+  normalize.ts — Entity name normalization pipeline (also used client-side)
+  index.ts     — Re-exports
+
+lobbying-scraper/
+  scrape.py           — Entry point: --mode weekly (incremental) | --mode backfill
+  portal.py           — HTTP fetch wrappers + pure HTML parsers for all 4 format eras
+  normalize.py        — Port of normalize.ts
+  writer.py           — Firestore document construction + writes
+  archive.py          — GCS raw-HTML archive (write-only; enabled by ARCHIVE_RAW=1)
+  reparse_archive.py  — Offline driver to re-ingest archived HTML into Firestore
+  requirements.txt    — requests, beautifulsoup4, google-cloud-firestore, google-cloud-storage
+  Dockerfile          — Python 3.12-slim image
+```
+
+The TypeScript lobbying module (`functions/src/lobbying/`) contains only the
+schema types and normalization logic. There is no TypeScript scraper or
+Firebase Function — ingestion is handled entirely by the Cloud Run container.
+This follows the same pattern as the MCP server and avoids the complexity of
+running multiple language runtimes in the same Firebase Functions deployment.
+
+---
+
+## Deploying the Cloud Run Container
+
+Follows the same pattern as the MCP server. The Artifact Registry repo
+(`maple-lobbying`) and Cloud Run job (`maple-lobbying-scraper`) are already
+created in `digital-testimony-dev`.
+
+```bash
+cd lobbying-scraper
+IMAGE=us-central1-docker.pkg.dev/digital-testimony-dev/maple-lobbying/scraper:latest
+docker build -t $IMAGE . && docker push $IMAGE
+
+gcloud run jobs update maple-lobbying-scraper \
+  --image=$IMAGE \
+  --project=digital-testimony-dev \
+  --region=us-central1
+```
+
+For a new project (prod), create the job first:
+
+```bash
+gcloud artifacts repositories create maple-lobbying \
+  --repository-format=docker --location=us-central1 --project=<project>
+
+gcloud run jobs create maple-lobbying-scraper \
+  --image=$IMAGE \
+  --project=<project> \
+  --region=us-central1 \
+  --task-timeout=30m \
+  --max-retries=0
+
+# Schedule weekly (Mondays 6am UTC)
+gcloud scheduler jobs create http maple-lobbying-weekly \
+  --schedule="0 6 * * 1" \
+  --uri="https://us-central1-run.googleapis.com/apis/run.googleapis.com/v1/namespaces/<project>/jobs/maple-lobbying-scraper:run" \
+  --http-method=POST \
+  --oauth-service-account-email=<scheduler-sa>@<project>.iam.gserviceaccount.com \
+  --location=us-central1
+```
+
+## Historical Backfill
+
+Runs `scrape.py --mode backfill` directly. Resumable — the subcollection
+cursor at `scrapers/lobbyingBackfill/processedUrls` tracks progress.
+
+Always set `ARCHIVE_RAW=1` for real runs so every fetched page is preserved
+for offline reparsing. Create the GCS archive bucket first if it does not
+exist (see Step 7 of the test plan).
+
+```bash
+cd lobbying-scraper
+
+# Test a single year with no writes
+GOOGLE_CLOUD_PROJECT=digital-testimony-dev \
+  GOOGLE_APPLICATION_CREDENTIALS=~/.config/gcloud/application_default_credentials.json \
+  python3 scrape.py --mode backfill --year 2024 --limit 3 --dry-run
+
+# Run a single year for real (with archiving)
+GOOGLE_CLOUD_PROJECT=digital-testimony-dev \
+  GOOGLE_APPLICATION_CREDENTIALS=~/.config/gcloud/application_default_credentials.json \
+  ARCHIVE_RAW=1 \
+  python3 scrape.py --mode backfill --year 2024
+
+# Full history (2005–present, resumable, with archiving)
+GOOGLE_CLOUD_PROJECT=digital-testimony-dev \
+  GOOGLE_APPLICATION_CREDENTIALS=~/.config/gcloud/application_default_credentials.json \
+  ARCHIVE_RAW=1 \
+  python3 scrape.py --mode backfill
+```
+
+---
+
+## Firestore Rules
+
+Add read-only public rules alongside the existing `generalCourts` rule:
+
+```
+match /lobbyingRegistrants/{doc} { allow read: if true; }
+match /lobbyingFilings/{doc}     { allow read: if true; }
+```
+
+---
+
+## Firestore Indexes
+
+Add composite indexes for common query patterns:
+
+| Collection        | Fields                                 | Use case                                 |
+| ----------------- | -------------------------------------- | ---------------------------------------- |
+| `lobbyingFilings` | `generalCourt ASC, billId ASC`         | Fetch all legislative filings for a bill |
+| `lobbyingFilings` | `generalCourt ASC, chamber ASC`        | Filter by chamber within a court         |
+| `lobbyingFilings` | `generalCourt ASC, entityNameNorm ASC` | Fetch all filings for a registrant       |
+| `lobbyingFilings` | `generalCourt ASC, clientNameNorm ASC` | Fetch all filings for a client           |
+
+Note: bill-join queries should always filter on `chamber` (or check
+`billId !== null`) to exclude `Executive` and `Other` rows before treating
+`billId` as a MAPLE bill reference.
+
+---
+
+## Implementation Status
+
+| File                                  | Status     | Notes                                                            |
+| ------------------------------------- | ---------- | ---------------------------------------------------------------- |
+| `functions/src/lobbying/types.ts`     | ✅ Done    | Firestore schema types; imported by future frontend code         |
+| `functions/src/lobbying/normalize.ts` | ✅ Done    | Normalization pipeline; also ported to `normalize.py`            |
+| `functions/src/lobbying/index.ts`     | ✅ Done    | Re-exports types and normalize                                   |
+| `firestore.rules`                     | ✅ Done    |                                                                  |
+| `firestore.indexes.json`              | ✅ Done    |                                                                  |
+| `lobbying-scraper/normalize.py`       | ✅ Done    | Port of normalize.ts                                             |
+| `lobbying-scraper/portal.py`          | ✅ Done    | All 4 format eras; pure parsers separated from fetch wrappers    |
+| `lobbying-scraper/writer.py`          | ✅ Done    | Firestore document construction                                  |
+| `lobbying-scraper/scrape.py`          | ✅ Done    | Entry point; `--mode weekly` and `--mode backfill`               |
+| `lobbying-scraper/archive.py`         | ✅ Done    | GCS write-only archive; enabled via `ARCHIVE_RAW=1`              |
+| `lobbying-scraper/reparse_archive.py` | ✅ Done    | Offline reparse driver; looks up registrant meta from Firestore  |
+| `lobbying-scraper/Dockerfile`         | ⚠️ Rebuild | Needs rebuild after `google-cloud-storage` added to requirements |
+
+### Document ID scheme
+
+Both `registrantId` and `filingId` are SHA-256 hashes (first 40 hex chars) of
+their respective logical keys. Hashes are used rather than slugified strings
+because entity names and client names contain arbitrary Unicode and punctuation
+that would require aggressive sanitization to fit Firestore ID constraints. The
+hash is stable across runs for the same logical record.
+
+---
+
+## Future Work (Subsequent PRs)
+
+### Frontend
+
+- **Dedicated lobbying pages**
+
+  - `/lobbyists` index: searchable list of registrants with total compensation,
+    client count, and year filter
+  - `/lobbyists/{registrantId}` profile: full client list, all bills lobbied,
+    compensation over time
+  - `/clients/{clientNameNorm}` profile: registrants hired, bills lobbied,
+    total spend per year
+
+- **Bill page integration** (`/bills/{court}/{billId}`)
+
+  - "Lobbying activity" section listing registrants + clients that lobbied this
+    bill, with position (Support / Oppose / Neutral) and compensation where
+    available
+  - Link to registrant profile pages
+
+- **Organization profile page integration**
+  - If an organization's normalized name matches a `clientNameNorm` in
+    `lobbyingFilings`, surface a "Lobbying history" panel showing which bills
+    they lobbied and which registrants they hired
+
+### MCP Tools
+
+Expose lobbying data via the MAPLE MCP server so that AI agents and Claude can
+answer questions like "who lobbied bill H1234?" or "what did Acme Corp lobby
+for in 2024?".
+
+- **`get_lobbying_filings_for_bill`** — given `generalCourt` + `billId`, return
+  all `lobbyingFilings` for that bill with registrant, client, position, and
+  amount
+- **`get_lobbying_registrant`** — given `registrantId`, return the registrant
+  document with client list and disclosure URLs
+- **`search_lobbying_by_client`** — given a client name (raw or normalized),
+  return matching filings across all courts
+- **`get_lobbying_summary_for_bill`** — aggregate view: unique registrant count,
+  unique client count, total compensation (where non-null), position breakdown
+
+---
+
+## Incremental Test Plan
+
+Testing proceeds from the inside out: unit logic first, then live portal
+fetches against the real site, then a small Firestore write, then a full
+backfill year, then steady-state function operation.
+
+### Step 1 — Unit tests: parser, normalization, bill construction
+
+Run the pytest suite against all 4 HTML format eras using committed fixture
+pages:
+
+```bash
+cd lobbying-scraper
+python -m pytest tests/ -v
+```
+
+Expected: 26 tests pass. Covers compensation totals and client/bill counts for
+all eras (2007, 2011, 2016, 2024 — employer and individual), normalization edge
+cases, bill ID construction, and specific bug regressions (semicolon bill
+separator, "Total amount" artifact row, null billId for executive rows).
+
+### Step 2 — Live portal fetch: dry run
+
+Verify the portal is reachable and the parser returns valid data without writing
+to Firestore:
+
+```bash
+cd lobbying-scraper
+GOOGLE_CLOUD_PROJECT=digital-testimony-dev \
+  GOOGLE_APPLICATION_CREDENTIALS=~/.config/gcloud/application_default_credentials.json \
+  python3 scrape.py --mode backfill --year 2024 --limit 3 --dry-run
+```
+
+Expected: 3 registrants fetched, compensation and bill rows printed, no
+Firestore writes.
+
+### Step 3 — Firestore write: single year, small limit
+
+Write a small batch to the dev project and verify results:
+
+```bash
+cd lobbying-scraper
+GOOGLE_CLOUD_PROJECT=digital-testimony-dev \
+  GOOGLE_APPLICATION_CREDENTIALS=~/.config/gcloud/application_default_credentials.json \
+  python3 scrape.py --mode backfill --year 2024 --limit 3
+```
+
+Verify in Firestore console:
+
+- `lobbyingRegistrants` has 3 documents with `entityName`, `entityNameNorm`,
+  `regType`, `clients`, `generalCourt`
+- `lobbyingFilings` has documents with `billId` non-null for legislative rows
+  and `null` for Executive rows
+- `scrapers/lobbyingBackfill/processedUrls` has entries with `url` and
+  `processedAt` fields
+- Re-running skips already-processed URLs (output shows 0 new disclosures)
+
+### Step 4 — Spot-check: bill join
+
+Pick a `lobbyingFiling` document with a non-null `billId` and a `generalCourt`
+≥ 192. Verify the bill exists in MAPLE:
+
+```
+/generalCourts/{filing.generalCourt}/bills/{filing.billId}
+```
+
+If the bill is found, the join key is correct. If not found, check: (a) whether
+MAPLE has data for that court, (b) whether the bill number format matches
+(prefix + integer, no leading zeros).
+
+### Step 5 — Create GCS archive bucket (once per project)
+
+The archive bucket name is derived from `GOOGLE_CLOUD_PROJECT`. Create it once
+with the Archive storage class (written once, read rarely):
+
+```bash
+gsutil mb -p digital-testimony-dev -l us-central1 \
+  -c archive gs://digital-testimony-dev-lobbying-archive
+```
+
+Repeat for prod when running the prod backfill:
+
+```bash
+gsutil mb -p digital-testimony-prod -l us-central1 \
+  -c archive gs://digital-testimony-prod-lobbying-archive
+```
+
+Each project uses its own bucket. Dev and prod data are isolated; the Cloud Run
+service account for each project only needs access to its own bucket.
+
+### Step 6 — Backfill: full current year (or partial across all years)
+
+Always run with `ARCHIVE_RAW=1` so every fetched page is preserved for offline
+reparsing. For a large-scale dev test before a full prod run, `--limit N` is
+applied per year — e.g. `--limit 50` across all years fetches ~10% of history:
+
+```bash
+cd lobbying-scraper
+
+# Full current year
+GOOGLE_CLOUD_PROJECT=digital-testimony-dev \
+  GOOGLE_APPLICATION_CREDENTIALS=~/.config/gcloud/application_default_credentials.json \
+  ARCHIVE_RAW=1 \
+  python3 scrape.py --mode backfill --year 2024
+
+# ~10% partial across all years (good large-scale dev validation)
+GOOGLE_CLOUD_PROJECT=digital-testimony-dev \
+  GOOGLE_APPLICATION_CREDENTIALS=~/.config/gcloud/application_default_credentials.json \
+  ARCHIVE_RAW=1 \
+  python3 scrape.py --mode backfill --limit 50
+```
+
+Expected for full year: ~500–600 registrants, ~1,000 disclosure pages, several
+thousand filing documents written.
+
+### Step 7 — Backfill: full history (2005–present)
+
+Run without `--year` to process all years. Can be interrupted and resumed:
+
+```bash
+GOOGLE_CLOUD_PROJECT=digital-testimony-dev \
+  GOOGLE_APPLICATION_CREDENTIALS=~/.config/gcloud/application_default_credentials.json \
+  ARCHIVE_RAW=1 \
+  python3 scrape.py --mode backfill
+```
+
+Expected runtime: several hours at ~1s/request. The subcollection cursor at
+`scrapers/lobbyingBackfill/processedUrls` allows safe interruption and
+resumption.
+
+### Step 8 — Deploy and verify Cloud Run scraper
+
+Build and push the container image, then update the Cloud Run job:
+
+```bash
+cd lobbying-scraper
+IMAGE=us-central1-docker.pkg.dev/digital-testimony-dev/maple-lobbying/scraper:latest
+docker build -t $IMAGE . && docker push $IMAGE
+
+gcloud run jobs update maple-lobbying-scraper \
+  --image=$IMAGE \
+  --project=digital-testimony-dev \
+  --region=us-central1
+```
+
+Trigger a manual run via the Cloud Run console or:
+
+```bash
+gcloud run jobs execute maple-lobbying-scraper \
+  --project=digital-testimony-dev \
+  --region=us-central1
+```
+
+Verify via Cloud Run logs: the run should find near-zero new disclosures if the
+backfill already completed, confirming the weekly fast-path works correctly.
+
+---
+
+## Design Decisions
+
+| Decision                    | Choice                                                                       | Rationale                                                                                                                                                                |
+| --------------------------- | ---------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| Collection placement        | Top-level `/lobbyingRegistrants`, `/lobbyingFilings`                         | Lobbying data spans multiple General Courts and is not scoped to a single court like bills/members                                                                       |
+| Single registrant model     | One type, `regType: "Lobbyist" \| "Employer"`                                | Individual lobbyists and firms share the same portal schema; per-bill individual attribution is not available                                                            |
+| `billId` construction       | `{chamberPrefix}{billNumber}` at ingest time                                 | Raw portal data stores chamber and integer separately; the composite is what matches MAPLE's `Bill.id`                                                                   |
+| `billId` null for Executive | `null` instead of agency name                                                | Prevents accidental bill lookups; makes join guard explicit at the type level                                                                                            |
+| Normalized name fields      | Store both raw and `*Norm` fields                                            | Raw names preserved for provenance; normalized names used for grouping and matching                                                                                      |
+| HTML parser                 | `beautifulsoup4` (Python)                                                    | Runs in the Cloud Run container alongside the scraper; no JavaScript runtime needed                                                                                      |
+| Live scraper cursor         | Array in `/scrapers/lobbying` doc                                            | ~1,000 URLs/year fits well within the 1 MB Firestore doc limit; simple and atomic with other scraper state                                                               |
+| Backfill cursor             | Firestore subcollection `/scrapers/lobbyingBackfill/processedUrls/{urlHash}` | Full 2005-present history (~50,000 URLs) would exceed the 1 MB doc limit; subcollection scales without bound and is durable, inspectable, and resumable from any machine |
+| Incremental strategy        | Skip already-processed disclosure URLs; write docs by logical key (upsert)   | Survives function restarts and re-runs without re-fetching already-scraped pages; natural upsert prevents duplicates without an explicit dedup pass                      |
+| Legacy format (pre-2013)    | Store with `clientName: "_total_salary_"` sentinel                           | Preserves data completeness; callers can filter on this value                                                                                                            |
+| Historical data             | Admin backfill script (2005 → present)                                       | Full history is ingested once; Cloud Function maintains current+prior year going forward                                                                                 |
+
+---
+
+## Appendix: Phase 2 — OCPF Contribution Ingestion
+
+This appendix tracks planned additions to the lobbying pipeline in a subsequent
+PR. The changes link OCPF (Office of Campaign and Political Finance) campaign
+contribution records to lobbying registrants, enabling questions like "how much
+did this firm contribute to politicians while lobbying on this bill?"
+
+### Background
+
+The MA Secretary of State portal (Phase 1, implemented) covers who lobbied which
+bills and for how much compensation. OCPF covers who donated to which political
+candidates. Linking the two surfaces the intersection: lobbying firms that also
+made political contributions.
+
+OCPF publishes bulk data files at
+`ocpf2.blob.core.windows.net/downloads/data2/` (the same host used by
+`matchOcpfMembers.ts` for the filers file). The contribution records file URL
+needs to be confirmed before implementation begins.
+
+### New Data
+
+#### `/lobbyingContributionRecipients/{recipientId}`
+
+One document per deduplicated contribution recipient (politician/PAC). The
+deduplication pipeline is described below.
+
+```typescript
+interface LobbyingContributionRecipient {
+  recipientId: string // SHA-256(recipientName + officeSought)[:40]
+  recipientName: string // canonical display name after dedup
+  recipientSlug: string // slugify("{recipientName} {officeSought}")
+  officeSought: string // canonical office value (see normalization below)
+  totalReceived: number
+  nContributions: number
+  years: number[]
+  nameVariants: string[] // all raw names that resolved to this record
+  manuallyMerged: boolean // true if recipient_merges.json was applied
+  topFirms: [string, string, number][] // [entityName, entityNameNorm, amount]
+  fetchedAt: Timestamp
+}
+```
+
+#### New fields on `LobbyingRegistrant` (optional, written by contribution run)
+
+```typescript
+nBills?: number             // total filing rows across all clients and years
+nContributions?: number     // total OCPF contribution records from this registrant
+totalContributions?: number // total dollar amount contributed
+```
+
+`nBills` is computed during the disclosure scrape (bill rows are already in
+memory) and written as a side effect of `--mode weekly` and `--mode backfill`.
+`nContributions` and `totalContributions` are written by `--mode contributions`.
+
+#### Pure-contribution firms (new registrant subtype)
+
+Firms that made OCPF contributions but have zero lobbying activity (no filings)
+are written as `LobbyingRegistrant` documents with `nBills: 0`, `clients: []`,
+`totalContributions ≥ 500`. These were previously invisible. The $500 threshold
+filters noise; adjust after reviewing the data.
+
+### Contribution Recipient Deduplication
+
+The same politician may appear under many name variants in OCPF data ("Joe
+Curtatone", "Joseph Curtatone", "Curtatone"). The dedup pipeline runs in five
+passes in-memory after loading all contribution records.
+
+All rules are systemic (handle a class of inputs); individual special cases are
+handled via `recipient_merges.json` only.
+
+#### New file: `lobbying-scraper/recipient_normalize.py`
+
+**Step 1 — `_extract_recipient_name(raw)`**
+
+Strips committee wrapper names using 14 regex patterns (e.g. "Cte. to Elect Ron
+Mariano" → "Ron Mariano"). Then applies in order:
+
+- Last-first flip: `"Mariano, Ronald"` → `"Ronald Mariano"`
+- Honorific strip: `"Sen. Marc Rodrigues"` → `"Marc Rodrigues"`
+- Party label strip: `"Ron Mariano Democrat"` → `"Ron Mariano"`
+- Initial-dot prefix strip: `"R.Mariano"` → `"Mariano"`
+
+**Step 2 — `_recipient_dedup_key(name, office)` → `(first, last, office)`**
+
+Key is a `(first_token, last_token, canonical_office)` tuple. Before keying,
+`first_token` is passed through `_NICKNAME_MAP`:
+
+| Raw              | Canonical |
+| ---------------- | --------- |
+| joe              | joseph    |
+| mike             | michael   |
+| bob              | robert    |
+| bill             | william   |
+| jim              | james     |
+| tom              | thomas    |
+| dan              | daniel    |
+| dave             | david     |
+| steve            | steven    |
+| ron              | ronald    |
+| tim              | timothy   |
+| rick, rich, dick | richard   |
+| charlie, chuck   | charles   |
+| ben              | benjamin  |
+| tony             | anthony   |
+| marty            | martin    |
+| don              | donald    |
+| ken              | kenneth   |
+| ed               | edward    |
+| ray              | raymond   |
+| nick             | nicholas  |
+
+If either token is a generic word (`the`, `comm`, `committee`, `democratic`,
+`republican`, `house`, `senate`, `friends`, `cte`, `pac`), the key falls back
+to the full canonical text.
+
+**Step 3 — Manual merges (`recipient_merges.json`)**
+
+JSON file maps `(name, office)` alias pairs to a canonical record. Start as an
+empty object `{}`; populate incrementally as data review finds misses. Applied
+after key assignment.
+
+**Step 4 — `_merge_bare_surnames(groups)`**
+
+Single-name entries where `first == last` (e.g. `"Curtatone"`) are merged into
+the highest-total multi-token peer with the same `(last, office)`.
+
+**Step 5 — `_fuzzy_merge_keys(groups)`**
+
+Groups keys by `(first, office)`. Pairs where last tokens differ by Levenshtein
+distance ≤ 1 are merged into the higher-total key. Minimum 6 chars on both last
+tokens (protects short surnames: Walsh, Jones, etc.).
+
+### Office Normalization — `_normalize_office(raw)` in `recipient_normalize.py`
+
+Canonical office values:
+
+`State Representative` · `State Senator` · `Governor` · `Lt. Governor` ·
+`Attorney General` · `Treasurer` · `Auditor` · `Secretary of State` · `Mayor` ·
+`City Council` · `District Attorney` · `Sheriff` · `PAC`
+
+Normalization pipeline (applied in order):
+
+1. Strip `"Candidate for [Massachusetts] X"` prefix → `"X"`
+2. Strip leading `MA` / `Mass.` / `Massachusetts`
+3. Strip trailing junk punctuation (`:;/`)
+4. Strip parenthetical suffixes
+5. Strip `"of the …"` / `"from the …"` suffixes
+6. Strip after dash or slash
+7. Strip after comma
+8. Strip ordinal suffixes (e.g. `" 3rd Norfolk …"`)
+9. Re-strip trailing junk (ordinal strip can leave dangling colons)
+10. `OFFICE_MAP` lookup (canonical string → canonical value)
+11. Fallback — district lookup: ordinal + MA county → `State Representative`
+12. Fallback — county suffix: `"Representative Suffolk"` → strip county → retry map
+13. Fallback — location qualifier: strip `"of <city>"`, trailing word, or leading
+    word → retry map (handles `"Mayor of Somerville"`, `"Somerville Mayor"`,
+    `"Mayor Somerville"` → `"Mayor"`)
+
+### New Files
+
+```
+lobbying-scraper/
+  ocpf_contributions.py   — OCPF bulk data download, parse, match to registrants
+  recipient_normalize.py  — office normalization + 5-step recipient dedup pipeline
+  recipient_merges.json   — manual merge overrides (start as {})
+```
+
+### Infrastructure Changes
+
+**`firestore.rules`** — add:
+
+```
+match /lobbyingContributionRecipients/{id} { allow read: if true; allow write: if false; }
+```
+
+**`firestore.indexes.json`** — add:
+
+| Collection                       | Fields                                 | Use case                    |
+| -------------------------------- | -------------------------------------- | --------------------------- |
+| `lobbyingContributionRecipients` | `officeSought ASC, totalReceived DESC` | Office-filtered leaderboard |
+| `lobbyingContributionRecipients` | `officeSought ASC, recipientName ASC`  | Alphabetical browse         |
+
+### What Is Not Implemented (Phase 2)
+
+**AI bill topics (`tags.json` equivalent)** — requires a bill embeddings parquet
+file. MAPLE does not produce this file. The topics feature is deferred; MAPLE's
+existing bill search index could be used as a substitute in a future PR.
+
+**`recipient_merges.json`** — starts empty. Requires domain review of the
+deduplicated output to find cases the automated rules miss. Populate
+incrementally.
+
+### Order of Work
+
+1. Confirm OCPF bulk contribution file URL and column schema
+2. Confirm or incorporate HTML archiving pattern (see open question below)
+3. Implement `recipient_normalize.py` (self-contained, testable in isolation)
+4. Implement `ocpf_contributions.py`
+5. Update `functions/src/lobbying/types.ts` schema
+6. Update `writer.py` and `scrape.py`
+7. Update `firestore.rules` and `firestore.indexes.json`
+
+### HTML Archiving for Fast Reparsing
+
+The current scraper fetches and immediately parses SoS portal HTML, storing
+nothing but the parsed Firestore documents. If parsing logic changes, the full
+portal must be re-scraped — slow and fragile given the Imperva TLS constraint.
+Phase 2 adds GCS archiving of raw HTML as a side effect of every portal fetch.
+
+#### Design principles
+
+The archive is **write-only cold storage**, not a cache. It is fully decoupled
+from both the incremental cursor (which stays Firestore-only) and the parse
+path. Fetching from the portal is always driven by the Firestore cursor; the
+archive is a downstream side effect of a successful fetch.
+
+Parsers must be **pure functions with no I/O** — `parse_summary(soup)`,
+`parse_disclosure_detail(soup, year)` — so the identical code runs in the live
+scrape and in the offline reparse script without modification.
+
+#### GCS key scheme
+
+One object per fetched page: `raw_html/{sha1(url)}.html`. URL-hash is
+collision-free without modeling the ASP.NET URL key space. Individual `.html`
+objects (not batch tarballs) are appropriate here because our scraper runs
+weekly in small increments rather than in large historical sweeps; per-object
+GCS is simpler and makes the reparse path a flat list + get.
+
+Bucket: `gs://<project>-lobbying-archive/raw_html/`
+Storage class: Archive (written once, read rarely).
+
+#### Write timing
+
+Archived immediately after `raise_for_status()` passes, before parsing, and
+**not gated on parse success**:
+
+```python
+def _get(session, url):
+    r = session.get(url, ...)
+    r.raise_for_status()
+    if "Summary.aspx" in url or "CompleteDisclosure" in url:
+        _archive_page(url, r.text)   # side effect; never blocks parse
+    return BeautifulSoup(r.text, "html.parser")
+```
+
+Gating on parse success would defeat the purpose: the archive exists precisely
+to recover from parser gaps later, so we want the bytes even when the current
+parser does not fully understand them. The search/results page is excluded
+(same URL across all years, trivially regenerable).
+
+#### Reparse path
+
+A dedicated offline script `lobbying-scraper/reparse_archive.py`:
+
+```
+python3 reparse_archive.py [--limit N] [--dry-run]
+```
+
+Lists `raw_html/*.html` objects from GCS, downloads them, resolves each
+`sha1.html` back to its original URL (stored as object metadata at write time),
+and runs the pure parser functions against the archived soup. Tracks completed
+objects via a `processed_archive.txt` marker so reparse is resumable.
+
+#### Cursor interaction
+
+No change to the Firestore cursor. The archive is never consulted to skip a
+portal fetch. "Have I processed this disclosure?" is still answered by the
+Firestore cursor; the archive is a consequence of fetching, not an input to the
+decision.
+
+#### New infrastructure required
+
+- GCS bucket `<project>-lobbying-archive` (Archive class, no public access)
+- `google-cloud-storage` added to `lobbying-scraper/requirements.txt`
+- `GOOGLE_CLOUD_PROJECT` env var (already available on Cloud Run) used to
+  derive bucket name
+- IAM: Cloud Run service account needs `storage.objects.create` on the bucket
+- New file: `lobbying-scraper/reparse_archive.py`
+
+#### Order of work within Phase 2
+
+This is a prerequisite for `ocpf_contributions.py` only in the sense that we
+want the archive in place before running the full historical backfill so we do
+not have to re-scrape. It is otherwise independent and can be implemented
+alongside the contribution pipeline rather than before it.
diff --git a/firestore.indexes.json b/firestore.indexes.json
index 83cb3fa6d..c267a6868 100644
--- a/firestore.indexes.json
+++ b/firestore.indexes.json
@@ -788,25 +788,46 @@
       "collectionGroup": "ballotQuestions",
       "queryScope": "COLLECTION",
       "fields": [
-        { "fieldPath": "electionYear", "order": "ASCENDING" },
-        { "fieldPath": "ballotStatus", "order": "ASCENDING" }
+        {
+          "fieldPath": "electionYear",
+          "order": "ASCENDING"
+        },
+        {
+          "fieldPath": "ballotStatus",
+          "order": "ASCENDING"
+        }
       ]
     },
     {
       "collectionGroup": "publishedTestimony",
       "queryScope": "COLLECTION_GROUP",
       "fields": [
-        { "fieldPath": "ballotQuestionId", "order": "ASCENDING" },
-        { "fieldPath": "publishedAt", "order": "DESCENDING" }
+        {
+          "fieldPath": "ballotQuestionId",
+          "order": "ASCENDING"
+        },
+        {
+          "fieldPath": "publishedAt",
+          "order": "DESCENDING"
+        }
       ]
     },
     {
       "collectionGroup": "publishedTestimony",
       "queryScope": "COLLECTION",
       "fields": [
-        { "fieldPath": "billId", "order": "ASCENDING" },
-        { "fieldPath": "court", "order": "ASCENDING" },
-        { "fieldPath": "ballotQuestionId", "order": "ASCENDING" }
+        {
+          "fieldPath": "billId",
+          "order": "ASCENDING"
+        },
+        {
+          "fieldPath": "court",
+          "order": "ASCENDING"
+        },
+        {
+          "fieldPath": "ballotQuestionId",
+          "order": "ASCENDING"
+        }
       ]
     },
     {
@@ -898,6 +919,62 @@
           }
         }
       ]
+    },
+    {
+      "collectionGroup": "lobbyingFilings",
+      "queryScope": "COLLECTION",
+      "fields": [
+        {
+          "fieldPath": "generalCourt",
+          "order": "ASCENDING"
+        },
+        {
+          "fieldPath": "billId",
+          "order": "ASCENDING"
+        }
+      ]
+    },
+    {
+      "collectionGroup": "lobbyingFilings",
+      "queryScope": "COLLECTION",
+      "fields": [
+        {
+          "fieldPath": "generalCourt",
+          "order": "ASCENDING"
+        },
+        {
+          "fieldPath": "chamber",
+          "order": "ASCENDING"
+        }
+      ]
+    },
+    {
+      "collectionGroup": "lobbyingFilings",
+      "queryScope": "COLLECTION",
+      "fields": [
+        {
+          "fieldPath": "generalCourt",
+          "order": "ASCENDING"
+        },
+        {
+          "fieldPath": "entityNameNorm",
+          "order": "ASCENDING"
+        }
+      ]
+    },
+    {
+      "collectionGroup": "lobbyingFilings",
+      "queryScope": "COLLECTION",
+      "fields": [
+        {
+          "fieldPath": "generalCourt",
+          "order": "ASCENDING"
+        },
+        {
+          "fieldPath": "clientNameNorm",
+          "order": "ASCENDING"
+        }
+      ]
     }
   ],
   "fieldOverrides": [
diff --git a/firestore.rules b/firestore.rules
index a95586279..42db67276 100644
--- a/firestore.rules
+++ b/firestore.rules
@@ -103,6 +103,14 @@ service cloud.firestore {
       allow read: if true;
       allow write: if false;
     }
+    match /lobbyingRegistrants/{id} {
+      allow read: if true;
+      allow write: if false;
+    }
+    match /lobbyingFilings/{id} {
+      allow read: if true;
+      allow write: if false;
+    }
     match /transcriptions/{tid} {
       // public, read-only
       allow read: if true
diff --git a/functions/src/lobbying/index.ts b/functions/src/lobbying/index.ts
new file mode 100644
index 000000000..6d039ae51
--- /dev/null
+++ b/functions/src/lobbying/index.ts
@@ -0,0 +1,2 @@
+export * from "./types"
+export { normalizeEntityName } from "./normalize"
diff --git a/functions/src/lobbying/normalize.ts b/functions/src/lobbying/normalize.ts
new file mode 100644
index 000000000..a7beb338f
--- /dev/null
+++ b/functions/src/lobbying/normalize.ts
@@ -0,0 +1,72 @@
+/**
+ * Entity name normalization pipeline.
+ *
+ * The SoS portal does not enforce consistent name formatting. The same client or
+ * registrant may appear as "Acme Corp.", "ACME CORPORATION", "Acme, Inc. d/b/a
+ * Acme Consulting", etc. across filings and years.
+ *
+ *  The steps must be applied in the exact order
+ * listed here; changing the order produces different (incorrect) output.
+ */
+
+// Step 2: strip d/b/a trade-name suffix before any other transforms so the
+// trade name doesn't bleed into the canonical form.
+const DBA_RE = /\s+D\s*\/+B\s*\/+A?\s+.*|\s+DBA\s+.*/i
+
+// Step 5: remove legal entity type words with whole-word matching so
+// "INCORPORATED" and "CORP" are caught in addition to "LLC"/"INC".
+const LEGAL_ENTITY_RE =
+  /\b(LLC|LLP|INC|INCORPORATED|CORPORATION|CORP|LTD|LIMITED|PC|PLLC)\b/g
+
+// Step 6: remove "THE" as a whole word anywhere (not just as a leading prefix).
+const THE_RE = /\bTHE\b/g
+
+// Step 9: professional suffix phrases to remove wholesale.
+const MISC_PHRASES = [
+  "LAW OFFICE OF",
+  "AND ASSOCIATES",
+  "& ASSOCIATES",
+  "AND ASSOC",
+  "ATTORNEY AT LAW",
+  "ATTORNEY@LAW",
+  "ATTORNET AT LAW", // known portal typo
+  "AND PARTNERS",
+  "PUBLIC POLICY GROUP",
+  "LEGISLATIVE SERVICES",
+  "POLICY GROUP",
+  "ASSOCIATES",
+  "COUNSELLORS AT LAW"
+]
+
+export function normalizeEntityName(raw: string | null | undefined): string {
+  if (!raw) return ""
+
+  let x = raw.toUpperCase() // Step 1: uppercase
+
+  x = x.replace(DBA_RE, "") // Step 2: strip d/b/a suffix
+
+  x = x.replace(/-/g, " ") // Step 3: hyphen → space
+
+  // Step 4: punctuation → space (not empty string, so ",INC" → " INC" → caught
+  // by step 5's whole-word removal).
+  for (const ch of [",", ".", "'", "‘", "’", "(", ")"]) {
+    x = x.split(ch).join(" ")
+  }
+
+  x = x.replace(LEGAL_ENTITY_RE, " ") // Step 5: remove legal entity type words
+
+  x = x.replace(THE_RE, " ") // Step 6: remove THE anywhere
+
+  x = x.replace(/&/g, "AND") // Step 7: ampersand → AND
+
+  x = x.replace("ASSICIATES", "ASSOCIATES") // Step 8: fix known portal typo
+
+  // Step 9: remove professional suffix phrases
+  for (const phrase of MISC_PHRASES) {
+    x = x.split(phrase).join(" ")
+  }
+
+  x = x.replace(/\s+/g, " ").trim() // Step 10: collapse whitespace
+
+  return x
+}
diff --git a/functions/src/lobbying/types.ts b/functions/src/lobbying/types.ts
new file mode 100644
index 000000000..83eaab761
--- /dev/null
+++ b/functions/src/lobbying/types.ts
@@ -0,0 +1,101 @@
+import {
+  Array,
+  InstanceOf,
+  Literal,
+  Number,
+  Null,
+  Record,
+  Static,
+  String,
+  Union
+} from "runtypes"
+import { Timestamp } from "../firebase"
+
+export type LobbyingChamber = Static<typeof LobbyingChamber>
+export const LobbyingChamber = Union(
+  Literal("House Bill"),
+  Literal("Senate Bill"),
+  Literal("House Docket"),
+  Literal("Senate Docket"),
+  Literal("Executive"),
+  Literal("Other")
+)
+
+export type LobbyingClient = Static<typeof LobbyingClient>
+export const LobbyingClient = Record({
+  clientName: String,
+  clientNameNorm: String,
+  compensation: Null.Or(Number)
+})
+
+export type LobbyingRegistrant = Static<typeof LobbyingRegistrant>
+export const LobbyingRegistrant = Record({
+  registrantId: String,
+  entityName: String,
+  entityNameNorm: String,
+  year: Number,
+  generalCourt: Number,
+  regType: Union(Literal("Lobbyist"), Literal("Employer")),
+  clients: Array(LobbyingClient),
+  disclosureUrls: Array(String),
+  fetchedAt: InstanceOf(Timestamp)
+})
+
+export type LobbyingFiling = Static<typeof LobbyingFiling>
+export const LobbyingFiling = Record({
+  filingId: String,
+  entityName: String,
+  entityNameNorm: String,
+  clientName: String,
+  clientNameNorm: String,
+  year: Number,
+  generalCourt: Number,
+  chamber: LobbyingChamber,
+  // Non-null only for legislative chambers (House Bill, Senate Bill, House Docket,
+  // Senate Docket). For Executive and Other, no bill join should be attempted.
+  billId: Null.Or(String),
+  activityTitle: String,
+  position: String,
+  amount: Null.Or(Number),
+  fetchedAt: InstanceOf(Timestamp)
+})
+
+/** Firestore path for lobbying registrant documents */
+export const REGISTRANTS_COLLECTION = "lobbyingRegistrants"
+
+/** Firestore path for lobbying filing documents */
+export const FILINGS_COLLECTION = "lobbyingFilings"
+
+/** Firestore path for the live scraper cursor document */
+export const SCRAPER_DOC = "/scrapers/lobbying"
+
+/** Firestore path for the backfill cursor subcollection */
+export const BACKFILL_DOC = "/scrapers/lobbyingBackfill"
+export const BACKFILL_URLS_COLLECTION = "processedUrls"
+
+/** Earliest year with portal data */
+export const FIRST_LOBBYING_YEAR = 2005
+
+/**
+ * Sentinel clientName used for pre-2013 legacy filings where compensation is
+ * reported as a single total rather than broken down per client.
+ */
+export const LEGACY_TOTAL_CLIENT = "_total_salary_"
+
+/**
+ * Chamber prefix map for constructing billId values that match MAPLE's Bill.id.
+ * Typed as a plain index signature so portal.ts can look up any LobbyingChamber
+ * without triggering "Property X does not exist" on the Partial.
+ */
+export const CHAMBER_PREFIXES: { [chamber: string]: string | undefined } = {
+  "House Bill": "H",
+  "Senate Bill": "S",
+  "House Docket": "HD",
+  "Senate Docket": "SD"
+}
+
+/** Canonical chamber values for legacy short-form codes found in older filings */
+export const LEGACY_CHAMBER_MAP: { [raw: string]: LobbyingChamber } = {
+  HB: "House Bill",
+  SB: "Senate Bill"
+}
diff --git a/lobbying-scraper/.dockerignore b/lobbying-scraper/.dockerignore
new file mode 100644
index 000000000..9460c99c4
--- /dev/null
+++ b/lobbying-scraper/.dockerignore
@@ -0,0 +1,4 @@
+__pycache__/
+*.pyc
+*.pyo
+.env
diff --git a/lobbying-scraper/Dockerfile b/lobbying-scraper/Dockerfile
new file mode 100644
index 000000000..4b2da65b5
--- /dev/null
+++ b/lobbying-scraper/Dockerfile
@@ -0,0 +1,16 @@
+FROM python:3.12-slim
+
+WORKDIR /app
+
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY normalize.py portal.py writer.py scrape.py ./
+
+# Cloud Run sets PORT; we don't use it (this is a job, not a server).
+# Cloud Scheduler invokes the container via HTTP POST to /; handle it minimally.
+ENV PYTHONUNBUFFERED=1
+
+# ENTRYPOINT is the fixed executable; CMD provides default args that --args overrides.
+ENTRYPOINT ["python3", "scrape.py"]
+CMD ["--mode", "weekly"]
diff --git a/lobbying-scraper/__pycache__/normalize.cpython-37.pyc b/lobbying-scraper/__pycache__/normalize.cpython-37.pyc
new file mode 100644
index 000000000..94efb861e
Binary files /dev/null and b/lobbying-scraper/__pycache__/normalize.cpython-37.pyc differ
diff --git a/lobbying-scraper/__pycache__/portal.cpython-37.pyc b/lobbying-scraper/__pycache__/portal.cpython-37.pyc
new file mode 100644
index 000000000..d515175f3
Binary files /dev/null and b/lobbying-scraper/__pycache__/portal.cpython-37.pyc differ
diff --git a/lobbying-scraper/archive.py b/lobbying-scraper/archive.py
new file mode 100644
index 000000000..ede5d6656
--- /dev/null
+++ b/lobbying-scraper/archive.py
@@ -0,0 +1,60 @@
+"""GCS raw-HTML archive for the MA SoS lobbying scraper.
+
+Write-only cold storage: every fetched Summary/CompleteDisclosure page is
+saved as gs://{bucket}/raw_html/{sha1(url)}.html with the original URL stored
+as object metadata. This enables offline reparsing when parser logic changes
+without re-scraping the portal (which is rate-limited and Imperva-protected).
+
+Enabled by setting ARCHIVE_RAW=1 in the environment. When disabled (default),
+save_page() is a no-op so local runs and tests work without GCS credentials.
+
+The bucket is named {GOOGLE_CLOUD_PROJECT}-lobbying-archive and should be
+created with the Archive storage class (written once, read rarely).
+"""
+
+from __future__ import annotations
+
+import hashlib
+import os
+
+_ENABLED = os.environ.get("ARCHIVE_RAW", "").lower() in ("1", "true", "yes")
+_bucket_name: str | None = None
+_client = None  # google.cloud.storage.Client, lazily initialized
+
+
+def _gcs():
+    global _client
+    if _client is None:
+        from google.cloud import storage  # noqa: PLC0415
+        _client = storage.Client()
+    return _client
+
+
+def _get_bucket_name() -> str:
+    global _bucket_name
+    if _bucket_name is None:
+        project = os.environ.get("GOOGLE_CLOUD_PROJECT") or _gcs().project
+        _bucket_name = f"{project}-lobbying-archive"
+    return _bucket_name
+
+
+def blob_name(url: str) -> str:
+    """Return the GCS object name for a given URL."""
+    return "raw_html/" + hashlib.sha1(url.encode()).hexdigest() + ".html"
+
+
+def save_page(url: str, html: str) -> None:
+    """Write raw HTML to GCS. No-op when ARCHIVE_RAW is not set."""
+    if not _ENABLED:
+        return
+    try:
+        bucket = _gcs().bucket(_get_bucket_name())
+        blob = bucket.blob(blob_name(url))
+        blob.metadata = {"source-url": url}
+        blob.upload_from_string(
+            html.encode("utf-8"),
+            content_type="text/html; charset=utf-8",
+        )
+    except Exception as exc:
+        # Archive failures must never interrupt the live scrape path.
+        print(f"  [archive] WARNING: failed to save {url[:80]!r}: {exc}")
diff --git a/lobbying-scraper/normalize.py b/lobbying-scraper/normalize.py
new file mode 100644
index 000000000..6e6f7418e
--- /dev/null
+++ b/lobbying-scraper/normalize.py
@@ -0,0 +1,50 @@
+"""Entity name normalization pipeline.
+
+Direct port of functions/src/lobbying/normalize.ts. Steps must be applied in
+this exact order — changing the order produces different (incorrect) output.
+"""
+
+from __future__ import annotations
+
+import re
+
+_DBA_RE = re.compile(r"\s+D\s*/+B\s*/+A?\s+.*|\s+DBA\s+.*", re.IGNORECASE)
+_LEGAL_RE = re.compile(
+    r"\b(LLC|LLP|INC|INCORPORATED|CORPORATION|CORP|LTD|LIMITED|PC|PLLC)\b"
+)
+_THE_RE = re.compile(r"\bTHE\b")
+_WS_RE = re.compile(r"\s+")
+
+_MISC_PHRASES = [
+    "LAW OFFICE OF",
+    "AND ASSOCIATES",
+    "& ASSOCIATES",
+    "AND ASSOC",
+    "ATTORNEY AT LAW",
+    "ATTORNEY@LAW",
+    "ATTORNET AT LAW",  # known portal typo
+    "AND PARTNERS",
+    "PUBLIC POLICY GROUP",
+    "LEGISLATIVE SERVICES",
+    "POLICY GROUP",
+    "ASSOCIATES",
+    "COUNSELLORS AT LAW",
+]
+
+
+def normalize_entity_name(raw: str | None) -> str:
+    if not raw:
+        return ""
+    x = raw.upper()                          # 1. uppercase
+    x = _DBA_RE.sub("", x)                  # 2. strip d/b/a suffix
+    x = x.replace("-", " ")                 # 3. hyphen → space
+    for ch in (",", ".", "'", "‘", "’", "(", ")"):
+        x = x.replace(ch, " ")             # 4. punctuation → space
+    x = _LEGAL_RE.sub(" ", x)              # 5. remove legal entity words
+    x = _THE_RE.sub(" ", x)               # 6. remove THE anywhere
+    x = x.replace("&", "AND")             # 7. ampersand → AND
+    x = x.replace("ASSICIATES", "ASSOCIATES")  # 8. fix known typo
+    for phrase in _MISC_PHRASES:           # 9. remove professional suffix phrases
+        x = x.replace(phrase, " ")
+    x = _WS_RE.sub(" ", x).strip()        # 10. collapse whitespace
+    return x
diff --git a/lobbying-scraper/portal.py b/lobbying-scraper/portal.py
new file mode 100644
index 000000000..3a5528898
--- /dev/null
+++ b/lobbying-scraper/portal.py
@@ -0,0 +1,512 @@
+"""HTTP client and HTML parser for the MA SoS lobbying portal.
+
+Portal: https://www.sec.state.ma.us/LobbyistPublicSearch/
+
+Page flow:
+  1. Search POST  -> summary links table
+  2. Summary.aspx -> registrant name/year/type + CompleteDisclosure links
+  3. CompleteDisclosure.aspx -> per-client compensation + per-client bill activity
+
+Four HTML format eras for CompleteDisclosure pages (detected by table IDs):
+  Modern  (2019+):     grdvClientPaidToEntity + grdvActivitiesNew{year}_{n}
+  Hybrid  (2014-2018): no comp table; Panel1_{n} divs + grdvActivitiesNew_{n}
+  Legacy  (2009-2013): grdvActivities with Compensation received column
+  Legacy  (2005-2008): grdvSalaryPaid (entity total) + grdvActivities (no comp col)
+
+parse_summary() and parse_disclosure_detail() are pure functions (no I/O) so
+they can be called from both the live scraper and the offline reparse driver.
+fetch_* are thin wrappers that handle HTTP and raw-HTML archiving.
+"""
+
+from __future__ import annotations
+
+import hashlib
+import re
+import time
+from dataclasses import dataclass, field
+from typing import Optional
+
+import requests
+from bs4 import BeautifulSoup, Tag
+
+import archive
+
+# ── Constants ─────────────────────────────────────────────────────────────────
+
+BASE_URL = "https://www.sec.state.ma.us/LobbyistPublicSearch/"
+SEARCH_URL = BASE_URL + "Default.aspx"
+
+_UA = (
+    "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) "
+    "AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148"
+)
+_REQUEST_DELAY = 1.0
+_MAX_RETRIES = 6
+_RETRY_STATUS = {429, 500, 502, 503, 504}
+
+# Lobby disclosure data begins in 2005; GC 183 started Jan 2003.
+FIRST_YEAR = 2005
+FIRST_GC = 183
+FIRST_GC_START_YEAR = 2003
+
+# clientName sentinel for pre-2009 filings where compensation is a single entity total
+LEGACY_TOTAL_CLIENT = "_total_salary_"
+
+# Maps canonical chamber names to the bill-ID prefix used in MAPLE's Bill.id
+CHAMBER_PREFIXES: dict[str, str] = {
+    "House Bill": "H",
+    "Senate Bill": "S",
+    "House Docket": "HD",
+    "Senate Docket": "SD",
+}
+
+# Legacy short-form chamber codes found in older filings
+LEGACY_CHAMBER_MAP: dict[str, str] = {
+    "HB": "House Bill",
+    "SB": "Senate Bill",
+}
+
+# ── Data types ────────────────────────────────────────────────────────────────
+
+
+@dataclass
+class Compensation:
+    client_name: str
+    amount: Optional[float]
+
+
+@dataclass
+class BillActivity:
+    client_name: str
+    chamber: str           # canonical LobbyingChamber value
+    raw_bill_number: str
+    bill_id: Optional[str]  # e.g. "H1234"; null for Executive/Other
+    activity_title: str
+    position: str
+    amount: Optional[float]
+
+
+@dataclass
+class DisclosureMeta:
+    entity_name: str
+    year: Optional[int]
+    reg_type: str          # "Lobbyist" | "Employer"
+    disclosure_urls: list[str] = field(default_factory=list)
+
+
+@dataclass
+class DisclosureDetail:
+    compensation: list[Compensation] = field(default_factory=list)
+    bills: list[BillActivity] = field(default_factory=list)
+
+
+# ── Derived-value helpers ─────────────────────────────────────────────────────
+
+
+def year_to_general_court(year: int) -> int:
+    return FIRST_GC + (year - FIRST_GC_START_YEAR) // 2
+
+
+def normalize_chamber(raw: str) -> str:
+    t = raw.strip()
+    if t in LEGACY_CHAMBER_MAP:
+        return LEGACY_CHAMBER_MAP[t]
+    known = {"House Bill", "Senate Bill", "House Docket", "Senate Docket", "Executive"}
+    return t if t in known else "Other"
+
+
+def construct_bill_id(chamber: str, raw_bill_number: str) -> Optional[str]:
+    """Construct the MAPLE-compatible billId from chamber + raw integer.
+
+    Returns None for Executive and Other chambers where no bill join is possible.
+    H1234 and S1234 are distinct bills even though they share the same integer;
+    the prefix is required to disambiguate.
+    """
+    prefix = CHAMBER_PREFIXES.get(chamber)
+    if not prefix:
+        return None
+    try:
+        return f"{prefix}{int(raw_bill_number)}"
+    except (ValueError, TypeError):
+        return None
+
+
+def registrant_id(entity_name: str, year: int) -> str:
+    key = f"{year}|{entity_name}"
+    return hashlib.sha256(key.encode()).hexdigest()[:40]
+
+
+def filing_id(
+    entity_name: str,
+    client_name: str,
+    chamber: str,
+    bill_id: Optional[str],
+    general_court: int,
+    position: str,
+) -> str:
+    key = "|".join([
+        entity_name, client_name, chamber,
+        bill_id or "__null__", str(general_court), position,
+    ])
+    return hashlib.sha256(key.encode()).hexdigest()[:40]
+
+
+# ── HTTP session ──────────────────────────────────────────────────────────────
+
+
+def make_session() -> requests.Session:
+    s = requests.Session()
+    s.headers.update({
+        "User-Agent": _UA,
+        "Accept": "*/*",
+        "Accept-Encoding": "gzip, deflate, br",
+        "Connection": "keep-alive",
+    })
+    return s
+
+
+def _get(session: requests.Session, url: str) -> BeautifulSoup:
+    for attempt in range(_MAX_RETRIES):
+        time.sleep(_REQUEST_DELAY * (2 ** attempt) if attempt else _REQUEST_DELAY)
+        try:
+            r = session.get(url, timeout=60)
+        except (requests.exceptions.Timeout, requests.exceptions.ConnectionError) as e:
+            if attempt == _MAX_RETRIES - 1:
+                raise
+            print(f"  GET network error (attempt {attempt + 1}): {e}")
+            continue
+        if r.status_code in _RETRY_STATUS:
+            print(f"  GET HTTP {r.status_code} (attempt {attempt + 1}) — retrying")
+            if attempt == _MAX_RETRIES - 1:
+                r.raise_for_status()
+            continue
+        r.raise_for_status()
+        # Archive content pages before parsing; excludes the search page which
+        # shares one URL across all years and is trivially regenerable.
+        if "Summary.aspx" in url or "CompleteDisclosure" in url:
+            archive.save_page(url, r.text)
+        return BeautifulSoup(r.text, "html.parser")
+    raise RuntimeError("_get: exhausted retries")  # unreachable
+
+
+def _post(session: requests.Session, url: str, data: dict) -> BeautifulSoup:
+    for attempt in range(_MAX_RETRIES):
+        time.sleep(_REQUEST_DELAY * (2 ** attempt) if attempt else _REQUEST_DELAY)
+        try:
+            r = session.post(url, data=data, timeout=180)
+        except (requests.exceptions.Timeout, requests.exceptions.ConnectionError) as e:
+            if attempt == _MAX_RETRIES - 1:
+                raise
+            print(f"  POST network error (attempt {attempt + 1}): {e}")
+            continue
+        if r.status_code in _RETRY_STATUS:
+            print(f"  POST HTTP {r.status_code} (attempt {attempt + 1}) — retrying")
+            if attempt == _MAX_RETRIES - 1:
+                r.raise_for_status()
+            continue
+        r.raise_for_status()
+        return BeautifulSoup(r.text, "html.parser")
+    raise RuntimeError("_post: exhausted retries")  # unreachable
+
+
+# ── Portal scraping ───────────────────────────────────────────────────────────
+
+
+def _viewstate(soup: BeautifulSoup) -> dict:
+    return {
+        inp["name"]: inp.get("value", "")
+        for inp in soup.find_all("input", type="hidden")
+        if inp.get("name")
+    }
+
+
+def fetch_summary_links(session: requests.Session, year: int) -> list[str]:
+    """Return all Summary.aspx URLs for a given year via a single search POST."""
+    soup = _get(session, SEARCH_URL)
+    data = {
+        **_viewstate(soup),
+        "__EVENTTARGET": "",
+        "__EVENTARGUMENT": "",
+        "ctl00$ContentPlaceHolder1$Search": "rdbSearchByType",
+        "ctl00$ContentPlaceHolder1$ucSearchCriteriaByType$ddlYear": str(year),
+        "ctl00$ContentPlaceHolder1$ucSearchCriteriaByType$txtN_ame": "",
+        "ctl00$ContentPlaceHolder1$ucSearchCriteriaByType$lddSearchType$DropDown": "3",
+        "ctl00$ContentPlaceHolder1$ucSearchCriteriaByType$drpType": "L",
+        "ctl00$ContentPlaceHolder1$drpPageSize": "20000",
+        "ctl00$ContentPlaceHolder1$btnSearch": "Search",
+    }
+    results = _post(session, SEARCH_URL, data)
+    table = results.find(
+        "table",
+        id=lambda x: x and "grdvSearchResultByTypeAndCategory" in x,
+    )
+    if not table:
+        return []
+    return [
+        BASE_URL + a["href"] if not a["href"].startswith("http") else a["href"]
+        for a in table.find_all("a", href=True)
+        if "Summary.aspx" in a["href"]
+    ]
+
+
+def parse_summary(soup: BeautifulSoup) -> DisclosureMeta:
+    """Parse a Summary.aspx page. Pure function — no I/O."""
+    def text(el_id: str) -> str:
+        el = soup.find(id=el_id)
+        return el.get_text(strip=True) if el else ""
+
+    entity_name = text("ContentPlaceHolder1_lblRegistrantName")
+    year_text = text("ContentPlaceHolder1_lblYear")
+    reg_type_raw = text("ContentPlaceHolder1_lblRegType")
+
+    try:
+        year = int(year_text)
+    except ValueError:
+        year = None
+
+    reg_type = "Employer" if "Entity" in reg_type_raw else "Lobbyist"
+
+    disc_urls = [
+        BASE_URL + a["href"] if not a["href"].startswith("http") else a["href"]
+        for a in soup.find_all("a", href=True)
+        if "CompleteDisclosure" in a["href"]
+    ]
+
+    return DisclosureMeta(
+        entity_name=entity_name,
+        year=year,
+        reg_type=reg_type,
+        disclosure_urls=disc_urls,
+    )
+
+
+def fetch_disclosure_meta(session: requests.Session, summary_url: str) -> DisclosureMeta:
+    return parse_summary(_get(session, summary_url))
+
+
+def _parse_amount(text: str) -> Optional[float]:
+    cleaned = text.replace("$", "").replace(",", "").strip()
+    try:
+        return float(cleaned)
+    except ValueError:
+        return None
+
+
+def _grid_rows(table: Tag) -> list:
+    return table.find_all("tr", class_=lambda c: c and "Grid" in c and "Header" not in c)
+
+
+def parse_disclosure_detail(soup: BeautifulSoup, year: int) -> DisclosureDetail:
+    """Parse a CompleteDisclosure page. Pure function — no I/O.
+
+    Four HTML format eras (detected by table IDs):
+
+    Modern (2019+): grdvClientPaidToEntity holds per-client compensation;
+      bills in grdvActivitiesNew{year}_{n} (one table per client).
+
+    Hybrid (2014-2018): no grdvClientPaidToEntity. Bills in grdvActivitiesNew_{n}
+      (no year suffix). Per-client compensation is in id-less Panel1_{n} divs
+      ("Total amount paid by client...: $X") indexed by lblClientName_{n} spans.
+      Each client reports either a Panel1 total OR activity-level amounts —
+      summing both sources is safe because the unused one is always zero.
+      Omitting this path silently drops ~99% of 2014-2018 compensation.
+
+    Legacy (2009-2013): single grdvActivities table with a "Compensation received"
+      column carrying a per-client total repeated on every bill row for that client.
+      Must deduplicate distinct (client, amount) pairs before summing — never sum
+      raw rows or the total is multiplied by bill count.
+
+    Legacy (2005-2008): grdvActivities has no compensation column; fall back to
+      grdvSalaryPaid entity total under the _total_salary_ placeholder client.
+    """
+    compensation: list[Compensation] = []
+    bills: list[BillActivity] = []
+    gc = year_to_general_court(year)
+
+    # ── Modern / Hybrid: per-client activity tables ───────────────────────────
+    comp_table = soup.find(
+        "table",
+        id=lambda x: x and "grdvClientPaidToEntity" in (x or ""),
+    )
+    if comp_table:
+        for row in _grid_rows(comp_table):
+            cells = [td.get_text(strip=True) for td in row.find_all("td")]
+            if len(cells) >= 2:
+                compensation.append(Compensation(
+                    client_name=cells[0],
+                    amount=_parse_amount(cells[1]),
+                ))
+
+    # Activity tables: ID pattern grdvActivitiesNew{year}_{n} (Modern, 2019+)
+    # or grdvActivitiesNew_{n} with no year (Hybrid, 2014-2018). The same loop
+    # handles both; compensation source differs (see Hybrid block below).
+    activity_by_client: dict[str, float] = {}
+    for act_table in soup.find_all(
+        "table",
+        id=lambda x: x and re.search(r"grdvActivitiesNew(\d{4})?_\d+", x or ""),
+    ):
+        client_span = act_table.find_previous(
+            "span",
+            id=lambda x: x and "lblClientName" in (x or ""),
+        )
+        client_name = client_span.get_text(strip=True) if client_span else ""
+
+        for row in _grid_rows(act_table):
+            cells = [td.get_text(strip=True) for td in row.find_all("td")]
+            if len(cells) < 4:
+                continue
+            chamber = normalize_chamber(cells[0])
+            raw_num = cells[1]
+            bill_id = construct_bill_id(chamber, raw_num)
+            amt = _parse_amount(cells[4]) if len(cells) > 4 else None
+            bills.append(BillActivity(
+                client_name=client_name,
+                chamber=chamber,
+                raw_bill_number=raw_num,
+                bill_id=bill_id,
+                activity_title=cells[2] if len(cells) > 2 else "",
+                position=cells[3] if len(cells) > 3 else "",
+                amount=amt,
+            ))
+            if amt:
+                activity_by_client[client_name] = (
+                    activity_by_client.get(client_name, 0.0) + amt
+                )
+
+    # Hybrid (2014-2018): no modern comp table — reconstruct per-client amounts
+    # from Panel1 "Total amount paid by client" divs indexed by client-name spans.
+    if not comp_table and bills:
+        client_by_idx = {
+            sp["id"].split("_")[-1]: sp.get_text(strip=True)
+            for sp in soup.find_all(
+                "span", id=lambda x: x and "lblClientName_" in (x or "")
+            )
+        }
+        panel_by_client: dict[str, float] = {}
+        for div in soup.find_all("div", id=lambda x: x and "Panel1_" in (x or "")):
+            idx = div["id"].split("_")[-1]
+            cn = client_by_idx.get(idx)
+            if not cn:
+                continue
+            m = re.search(r"\$([\d,]+\.\d\d)", div.get_text(" ", strip=True))
+            panel_by_client[cn] = float(m.group(1).replace(",", "")) if m else 0.0
+        for cn in set(panel_by_client) | set(activity_by_client):
+            amt = panel_by_client.get(cn, 0.0) + activity_by_client.get(cn, 0.0)
+            if amt:
+                compensation.append(Compensation(client_name=cn, amount=amt))
+
+    if comp_table or bills:
+        return DisclosureDetail(compensation=compensation, bills=bills)
+
+    # ── Legacy format (2005-2013): single grdvActivities table ───────────────
+    act_table = soup.find("table", id=lambda x: x and x.endswith("grdvActivities"))
+    # Distinct (client, amount) pairs from the "Compensation received" column
+    # (2009-2013). The portal repeats the per-client total on every bill row for
+    # that client, so we deduplicate before summing to avoid multiplying by bill count.
+    legacy_comp_pairs: set = set()
+    comp_col: Optional[int] = None
+
+    if act_table:
+        all_rows = act_table.find_all("tr")
+        headers = [
+            th.get_text(strip=True)
+            for th in (all_rows[0].find_all(["th", "td"]) if all_rows else [])
+        ]
+
+        if headers and "Activity" in headers[0]:
+            # 6-col entity layout has Lobbyist as second header
+            if len(headers) >= 2 and "Lobbyist" in headers[1]:
+                bill_col, pos_col, client_col = 0, 2, 4
+            else:
+                bill_col, pos_col, client_col = 0, 1, 3
+        else:
+            bill_col, pos_col, client_col = 1, None, 3
+
+        if any("Compensation" in h for h in headers):
+            comp_col = len(headers) - 1
+
+        chamber_map = {
+            "H": "House Bill", "S": "Senate Bill",
+            "HD": "House Docket", "SD": "Senate Docket",
+        }
+        skip = {"Activity or Bill No and Title", "N/A", "None", "", "Total amount"}
+
+        for row in all_rows[1:]:
+            cells = [td.get_text(strip=True) for td in row.find_all("td")]
+            if len(cells) <= max(bill_col, client_col):
+                continue
+            client_name = cells[client_col]
+            bill_cell = cells[bill_col]
+            amt = (
+                _parse_amount(cells[comp_col])
+                if comp_col is not None and len(cells) > comp_col
+                else None
+            )
+            # Exclude "Total amount" summary rows appended by legacy individual
+            # disclosures — these are not real clients.
+            if amt is not None and client_name not in ("Total amount", "Total", ""):
+                legacy_comp_pairs.add((client_name, amt))
+            if not bill_cell or bill_cell in skip:
+                continue
+            # Bill token may use a semicolon separator ("H73; Title") or a space
+            parts = re.split(r"[;\s]", bill_cell, maxsplit=1)
+            bill_no = parts[0].rstrip(";")
+            m = re.match(r"^([A-Z]+)(\d+)$", bill_no)
+            if not m:
+                continue
+            prefix, number = m.group(1), m.group(2)
+            chamber = chamber_map.get(prefix, "Other")
+            bill_id = construct_bill_id(chamber, number)
+            bills.append(BillActivity(
+                client_name=client_name,
+                chamber=chamber,
+                raw_bill_number=number,
+                bill_id=bill_id,
+                activity_title=parts[1].strip() if len(parts) > 1 else "",
+                position=(
+                    cells[pos_col]
+                    if pos_col is not None and len(cells) > pos_col
+                    else ""
+                ),
+                amount=amt,
+            ))
+
+    # Per-client compensation from the "Compensation received" column (2009-2013).
+    # Fall back to grdvSalaryPaid entity total when no compensation column exists (2005-2008).
+    if comp_col is not None:
+        per_client: dict[str, float] = {}
+        for cn, amt in legacy_comp_pairs:
+            per_client[cn] = per_client.get(cn, 0.0) + amt
+        for cn, amt in per_client.items():
+            if amt:
+                compensation.append(Compensation(client_name=cn, amount=amt))
+    else:
+        salary_table = soup.find(
+            "table", id=lambda x: x and "grdvSalaryPaid" in (x or "")
+        )
+        if salary_table:
+            total = 0.0
+            for row in salary_table.find_all("tr"):
+                cells = [td.get_text(strip=True) for td in row.find_all("td")]
+                if len(cells) >= 2 and "Total" not in cells[0]:
+                    amt = _parse_amount(cells[1])
+                    if amt:
+                        total += amt
+            if total:
+                compensation.append(
+                    Compensation(client_name=LEGACY_TOTAL_CLIENT, amount=total)
+                )
+
+    return DisclosureDetail(compensation=compensation, bills=bills)
+
+
+def fetch_disclosure_detail(
+    session: requests.Session, disc_url: str, year: int
+) -> DisclosureDetail:
+    return parse_disclosure_detail(_get(session, disc_url), year)
+
+
+def year_from_disc_url(url: str) -> Optional[int]:
+    """Extract the filing year from a CompleteDisclosure URL query string."""
+    m = re.search(r"[?&]FilingYear=(\d{4})", url)
+    return int(m.group(1)) if m else None
diff --git a/lobbying-scraper/reparse_archive.py b/lobbying-scraper/reparse_archive.py
new file mode 100644
index 000000000..266ff1ad6
--- /dev/null
+++ b/lobbying-scraper/reparse_archive.py
@@ -0,0 +1,155 @@
+"""Offline reparse driver: re-ingests raw HTML from the GCS archive.
+
+Downloads archived CompleteDisclosure pages from GCS, re-runs the pure parsers
+against them, and writes results back to Firestore. Use this when parser logic
+has changed and historical data needs to be re-ingested without re-scraping.
+
+For each archived disclosure page the driver looks up the corresponding
+registrant document in Firestore (via the disclosureUrls array) to obtain the
+entity name needed to construct filing document IDs. Registrant documents must
+therefore already exist before running a reparse.
+
+Usage:
+    GOOGLE_APPLICATION_CREDENTIALS=~/.config/gcloud/application_default_credentials.json \\
+      python3 reparse_archive.py [--limit N] [--dry-run]
+
+Progress is tracked in Firestore at /scrapers/lobbyingReparse so the run is
+resumable: restarting skips blobs already marked as processed.
+"""
+
+from __future__ import annotations
+
+import argparse
+import os
+
+from bs4 import BeautifulSoup
+from google.cloud import firestore, storage
+
+import archive
+from portal import DisclosureMeta, parse_disclosure_detail, year_from_disc_url
+from writer import REGISTRANTS_COLLECTION, write_filings
+
+REPARSE_DOC = "scrapers/lobbyingReparse"
+
+
+def _meta_for_disc_url(db: firestore.Client, disc_url: str) -> DisclosureMeta | None:
+    """Look up the registrant that owns this disclosure URL."""
+    results = (
+        db.collection(REGISTRANTS_COLLECTION)
+        .where("disclosureUrls", "array_contains", disc_url)
+        .limit(1)
+        .get()
+    )
+    if not results:
+        return None
+    data = results[0].to_dict()
+    return DisclosureMeta(
+        entity_name=data.get("entityName", ""),
+        year=data.get("year"),
+        reg_type=data.get("regType", "Lobbyist"),
+        disclosure_urls=data.get("disclosureUrls", []),
+    )
+
+
+def _is_processed(db: firestore.Client, blob_name: str) -> bool:
+    doc = db.document(REPARSE_DOC).get()
+    if not doc.exists:
+        return False
+    return blob_name in doc.to_dict().get("processedBlobs", [])
+
+
+def _mark_processed(db: firestore.Client, blob_name: str) -> None:
+    db.document(REPARSE_DOC).set(
+        {"processedBlobs": firestore.ArrayUnion([blob_name])},
+        merge=True,
+    )
+
+
+def run(limit: int | None, dry_run: bool) -> None:
+    gcs = storage.Client()
+    bucket_name = archive._get_bucket_name()
+    bucket = gcs.bucket(bucket_name)
+
+    db: firestore.Client | None = None if dry_run else firestore.Client()
+
+    blobs = list(bucket.list_blobs(prefix="raw_html/"))
+    print(f"Found {len(blobs)} archived pages")
+
+    processed = 0
+    skipped = 0
+    errors = 0
+
+    for blob in blobs:
+        if limit is not None and processed >= limit:
+            break
+
+        blob.reload()
+        url = (blob.metadata or {}).get("source-url", "")
+
+        if "CompleteDisclosure" not in url:
+            skipped += 1
+            continue
+
+        if db is not None and _is_processed(db, blob.name):
+            skipped += 1
+            continue
+
+        year = year_from_disc_url(url)
+        if year is None:
+            print(f"  SKIP {blob.name}: cannot extract year from {url!r}")
+            skipped += 1
+            continue
+
+        meta: DisclosureMeta | None = None
+        if db is not None:
+            meta = _meta_for_disc_url(db, url)
+            if meta is None:
+                print(f"  SKIP {blob.name}: no registrant found for {url!r}")
+                skipped += 1
+                continue
+
+        try:
+            html = blob.download_as_text(encoding="utf-8")
+            soup = BeautifulSoup(html, "html.parser")
+            detail = parse_disclosure_detail(soup, year)
+        except Exception as exc:
+            print(f"  ERROR parsing {url}: {exc}")
+            errors += 1
+            continue
+
+        print(
+            f"  {url[:80]!r} — {len(detail.compensation)} comp,"
+            f" {len(detail.bills)} bills"
+        )
+
+        if not dry_run and db is not None and meta is not None:
+            write_filings(db, meta, detail)
+            _mark_processed(db, blob.name)
+
+        processed += 1
+
+    print(
+        f"\nDone: {processed} reparsed, {skipped} skipped, {errors} errors"
+        + (" (dry run — nothing written)" if dry_run else "")
+    )
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description="Reparse raw HTML archive into Firestore"
+    )
+    parser.add_argument("--limit", type=int, default=None, help="Stop after N pages")
+    parser.add_argument(
+        "--dry-run", action="store_true", help="Parse but do not write to Firestore"
+    )
+    args = parser.parse_args()
+
+    # Ensure archive module can resolve the bucket name even in dry-run mode
+    if not os.environ.get("ARCHIVE_RAW"):
+        os.environ["ARCHIVE_RAW"] = "1"
+
+    run(limit=args.limit, dry_run=args.dry_run)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/lobbying-scraper/requirements.txt b/lobbying-scraper/requirements.txt
new file mode 100644
index 000000000..d92c075e4
--- /dev/null
+++ b/lobbying-scraper/requirements.txt
@@ -0,0 +1,4 @@
+requests>=2.28
+beautifulsoup4>=4.12
+google-cloud-firestore>=2.14
+google-cloud-storage>=2.10
diff --git a/lobbying-scraper/scrape.py b/lobbying-scraper/scrape.py
new file mode 100644
index 000000000..657c59603
--- /dev/null
+++ b/lobbying-scraper/scrape.py
@@ -0,0 +1,271 @@
+"""Lobbying disclosure scraper — Cloud Run entry point.
+
+Runs on a weekly Cloud Scheduler trigger. Checks for new or amended disclosures
+and exits immediately if none are found (fast path). When new disclosures exist,
+fetches and writes them to Firestore.
+
+Also serves as the library used by the TypeScript backfill admin script via
+subprocess.
+
+Environment variables:
+  GOOGLE_CLOUD_PROJECT  — GCP project ID (set automatically in Cloud Run)
+  FIRESTORE_EMULATOR_HOST — set to use the local emulator (e.g. localhost:8080)
+
+CLI flags (for local / backfill use):
+  --year YEAR     Only process this year (default: current + prior)
+  --limit N       Max registrants per year (for testing)
+  --dry-run       Fetch and parse but do not write to Firestore
+"""
+
+from __future__ import annotations
+
+import argparse
+import hashlib
+import json
+import os
+import sys
+from datetime import datetime, timezone
+
+from google.cloud import firestore
+
+from portal import (
+    FIRST_YEAR,
+    fetch_disclosure_detail,
+    fetch_disclosure_meta,
+    fetch_summary_links,
+    make_session,
+)
+from writer import (
+    BACKFILL_DOC,
+    BACKFILL_URLS_COLLECTION,
+    SCRAPER_DOC,
+    write_filings,
+    write_registrant,
+)
+
+
+# ── Cursor helpers ────────────────────────────────────────────────────────────
+
+
+def _load_live_cursor(db: firestore.Client) -> tuple[set[str], dict[str, list[str]]]:
+    """Return (processedDiscUrls, summaryDiscCache) from the live scraper doc."""
+    doc = db.document(SCRAPER_DOC).get()
+    data = doc.to_dict() or {}
+    return (
+        set(data.get("processedDiscUrls", [])),
+        data.get("summaryDiscCache", {}),
+    )
+
+
+def _save_live_cursor(
+    db: firestore.Client,
+    processed: set[str],
+    cache: dict[str, list[str]],
+) -> None:
+    db.document(SCRAPER_DOC).set(
+        {"processedDiscUrls": list(processed), "summaryDiscCache": cache},
+        merge=True,
+    )
+
+
+def _is_backfill_processed(db: firestore.Client, disc_url: str) -> bool:
+    h = hashlib.sha256(disc_url.encode()).hexdigest()[:40]
+    return db.document(BACKFILL_DOC).collection(BACKFILL_URLS_COLLECTION).document(h).get().exists
+
+
+def _mark_backfill_processed(db: firestore.Client, disc_url: str) -> None:
+    h = hashlib.sha256(disc_url.encode()).hexdigest()[:40]
+    db.document(BACKFILL_DOC).collection(BACKFILL_URLS_COLLECTION).document(h).set(
+        {"url": disc_url, "processedAt": datetime.now(tz=timezone.utc).isoformat()}
+    )
+
+
+# ── Core processing ───────────────────────────────────────────────────────────
+
+
+def process_disclosure(
+    db: firestore.Client | None,
+    session,
+    summary_url: str,
+    disc_url: str,
+    year: int,
+    dry_run: bool = False,
+) -> tuple[int, int]:
+    """Fetch one disclosure page and write registrant + filing documents.
+
+    Returns (compensation_rows, filing_rows).
+    """
+    meta = fetch_disclosure_meta(session, summary_url)
+    detail = fetch_disclosure_detail(session, disc_url, year)
+
+    if dry_run or db is None:
+        return len(detail.compensation), len(detail.bills)
+
+    write_registrant(db, meta, detail, disc_url)
+    n_filings = write_filings(db, meta, detail)
+    return len(detail.compensation), n_filings
+
+
+# ── Weekly incremental run ────────────────────────────────────────────────────
+
+
+def run_weekly(
+    db: "firestore.Client | None",
+    years: list[int],
+    limit: int | None = None,
+    dry_run: bool = False,
+) -> int:
+    """Incremental weekly check. Returns number of new disclosures processed."""
+    current_year = datetime.now(tz=timezone.utc).year
+    processed, cache = _load_live_cursor(db) if db is not None else (set(), {})
+
+    session = make_session()
+    new_count = 0
+
+    for year in years:
+        print(f"\n── {year} ──")
+        try:
+            summary_urls = fetch_summary_links(session, year)
+        except Exception as e:
+            print(f"  failed to fetch summary links: {e}", file=sys.stderr)
+            continue
+
+        if limit:
+            summary_urls = summary_urls[:limit]
+
+        print(f"  {len(summary_urls)} registrants on portal")
+
+        for summary_url in summary_urls:
+            # Use cached disc URLs for prior years; always re-check current year
+            disc_urls = cache.get(summary_url)
+            if disc_urls is None or year == current_year:
+                try:
+                    meta = fetch_disclosure_meta(session, summary_url)
+                    disc_urls = meta.disclosure_urls
+                    cache[summary_url] = disc_urls
+                    if not dry_run:
+                        _save_live_cursor(db, processed, cache)
+                except Exception as e:
+                    print(f"  failed to fetch summary {summary_url}: {e}", file=sys.stderr)
+                    continue
+
+            new_disc_urls = [u for u in disc_urls if u not in processed]
+            if not new_disc_urls:
+                continue
+
+            for disc_url in new_disc_urls:
+                try:
+                    comp_n, filing_n = process_disclosure(
+                        db, session, summary_url, disc_url, year, dry_run=dry_run
+                    )
+                    processed.add(disc_url)
+                    new_count += 1
+                    print(f"  processed: {comp_n} clients, {filing_n} filings")
+                    if not dry_run:
+                        _save_live_cursor(db, processed, cache)
+                except Exception as e:
+                    print(f"  failed to process {disc_url}: {e}", file=sys.stderr)
+
+    return new_count
+
+
+# ── Historical backfill ───────────────────────────────────────────────────────
+
+
+def run_backfill(
+    db: "firestore.Client | None",
+    years: list[int],
+    limit: int | None = None,
+    dry_run: bool = False,
+) -> int:
+    """Full historical backfill using the subcollection cursor. Resumable."""
+    session = make_session()
+    total_new = 0
+
+    for year in years:
+        print(f"\n── {year} ──")
+        try:
+            summary_urls = fetch_summary_links(session, year)
+        except Exception as e:
+            print(f"  failed to fetch summary links: {e}", file=sys.stderr)
+            continue
+
+        if limit:
+            summary_urls = summary_urls[:limit]
+
+        print(f"  {len(summary_urls)} registrants on portal")
+        year_new = 0
+
+        for i, summary_url in enumerate(summary_urls):
+            try:
+                meta = fetch_disclosure_meta(session, summary_url)
+            except Exception as e:
+                print(f"  [{i+1}/{len(summary_urls)}] failed to fetch summary: {e}", file=sys.stderr)
+                continue
+
+            for disc_url in meta.disclosure_urls:
+                if db is not None and not dry_run and _is_backfill_processed(db, disc_url):
+                    continue
+                try:
+                    comp_n, filing_n = process_disclosure(
+                        db, session, summary_url, disc_url, year, dry_run=dry_run
+                    )
+                    if not dry_run:
+                        _mark_backfill_processed(db, disc_url)
+                    total_new += 1
+                    year_new += 1
+                except Exception as e:
+                    print(f"  failed to process {disc_url}: {e}", file=sys.stderr)
+
+            if (i + 1) % 50 == 0 or i + 1 == len(summary_urls):
+                print(f"  [{i+1}/{len(summary_urls)}] {year_new} new disclosures so far")
+
+        print(f"  {year} complete: {year_new} new disclosures")
+
+    return total_new
+
+
+# ── Entry point ───────────────────────────────────────────────────────────────
+
+
+def main() -> None:
+    p = argparse.ArgumentParser()
+    p.add_argument("--year", type=int, default=None)
+    p.add_argument("--limit", type=int, default=None)
+    p.add_argument("--dry-run", action="store_true")
+    p.add_argument(
+        "--mode",
+        choices=["weekly", "backfill"],
+        default="weekly",
+        help="weekly: incremental check; backfill: full history with subcollection cursor",
+    )
+    args = p.parse_args()
+
+    current_year = datetime.now(tz=timezone.utc).year
+
+    if args.year:
+        years = [args.year]
+    elif args.mode == "weekly":
+        years = [current_year, current_year - 1]
+    else:
+        years = list(range(FIRST_YEAR, current_year + 1))
+
+    project = os.environ.get("GOOGLE_CLOUD_PROJECT")
+    db = firestore.Client(project=project) if not args.dry_run else None
+
+    if args.mode == "weekly":
+        n = run_weekly(db, years, limit=args.limit, dry_run=args.dry_run)
+        if n == 0:
+            print("\nNo new disclosures found.")
+        else:
+            print(f"\nDone: {n} new disclosures written.")
+    else:
+        n = run_backfill(db, years, limit=args.limit, dry_run=args.dry_run)
+        print(f"\nBackfill complete: {n} new disclosures written.")
+
+    # Emit structured result for callers (e.g. TypeScript backfill script)
+    print(json.dumps({"newDisclosures": n}), file=sys.stderr)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/lobbying-scraper/tests/__init__.py b/lobbying-scraper/tests/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/lobbying-scraper/tests/__pycache__/__init__.cpython-37.pyc b/lobbying-scraper/tests/__pycache__/__init__.cpython-37.pyc
new file mode 100644
index 000000000..33712a9e9
Binary files /dev/null and b/lobbying-scraper/tests/__pycache__/__init__.cpython-37.pyc differ
diff --git a/lobbying-scraper/tests/__pycache__/test_portal_parser.cpython-37-pytest-7.4.4.pyc b/lobbying-scraper/tests/__pycache__/test_portal_parser.cpython-37-pytest-7.4.4.pyc
new file mode 100644
index 000000000..e18d528fe
Binary files /dev/null and b/lobbying-scraper/tests/__pycache__/test_portal_parser.cpython-37-pytest-7.4.4.pyc differ
diff --git a/lobbying-scraper/tests/fixtures/2007e_disc.html.gz b/lobbying-scraper/tests/fixtures/2007e_disc.html.gz
new file mode 100644
index 000000000..4311a97da
Binary files /dev/null and b/lobbying-scraper/tests/fixtures/2007e_disc.html.gz differ
diff --git a/lobbying-scraper/tests/fixtures/2007e_summ.html.gz b/lobbying-scraper/tests/fixtures/2007e_summ.html.gz
new file mode 100644
index 000000000..58d40abbe
Binary files /dev/null and b/lobbying-scraper/tests/fixtures/2007e_summ.html.gz differ
diff --git a/lobbying-scraper/tests/fixtures/2011e_disc.html.gz b/lobbying-scraper/tests/fixtures/2011e_disc.html.gz
new file mode 100644
index 000000000..2a01a7963
Binary files /dev/null and b/lobbying-scraper/tests/fixtures/2011e_disc.html.gz differ
diff --git a/lobbying-scraper/tests/fixtures/2011e_summ.html.gz b/lobbying-scraper/tests/fixtures/2011e_summ.html.gz
new file mode 100644
index 000000000..d7b525310
Binary files /dev/null and b/lobbying-scraper/tests/fixtures/2011e_summ.html.gz differ
diff --git a/lobbying-scraper/tests/fixtures/2011i_disc.html.gz b/lobbying-scraper/tests/fixtures/2011i_disc.html.gz
new file mode 100644
index 000000000..84736b4a7
Binary files /dev/null and b/lobbying-scraper/tests/fixtures/2011i_disc.html.gz differ
diff --git a/lobbying-scraper/tests/fixtures/2011i_summ.html.gz b/lobbying-scraper/tests/fixtures/2011i_summ.html.gz
new file mode 100644
index 000000000..430f68591
Binary files /dev/null and b/lobbying-scraper/tests/fixtures/2011i_summ.html.gz differ
diff --git a/lobbying-scraper/tests/fixtures/2016e_disc.html.gz b/lobbying-scraper/tests/fixtures/2016e_disc.html.gz
new file mode 100644
index 000000000..279fd59c0
Binary files /dev/null and b/lobbying-scraper/tests/fixtures/2016e_disc.html.gz differ
diff --git a/lobbying-scraper/tests/fixtures/2016e_summ.html.gz b/lobbying-scraper/tests/fixtures/2016e_summ.html.gz
new file mode 100644
index 000000000..a0bc28b85
Binary files /dev/null and b/lobbying-scraper/tests/fixtures/2016e_summ.html.gz differ
diff --git a/lobbying-scraper/tests/fixtures/2024e_disc.html.gz b/lobbying-scraper/tests/fixtures/2024e_disc.html.gz
new file mode 100644
index 000000000..9074511f5
Binary files /dev/null and b/lobbying-scraper/tests/fixtures/2024e_disc.html.gz differ
diff --git a/lobbying-scraper/tests/fixtures/2024e_summ.html.gz b/lobbying-scraper/tests/fixtures/2024e_summ.html.gz
new file mode 100644
index 000000000..cb6181711
Binary files /dev/null and b/lobbying-scraper/tests/fixtures/2024e_summ.html.gz differ
diff --git a/lobbying-scraper/tests/fixtures/2024i_disc.html.gz b/lobbying-scraper/tests/fixtures/2024i_disc.html.gz
new file mode 100644
index 000000000..44dddc2ef
Binary files /dev/null and b/lobbying-scraper/tests/fixtures/2024i_disc.html.gz differ
diff --git a/lobbying-scraper/tests/fixtures/2024i_summ.html.gz b/lobbying-scraper/tests/fixtures/2024i_summ.html.gz
new file mode 100644
index 000000000..e6dc3b1b6
Binary files /dev/null and b/lobbying-scraper/tests/fixtures/2024i_summ.html.gz differ
diff --git a/lobbying-scraper/tests/test_portal_parser.py b/lobbying-scraper/tests/test_portal_parser.py
new file mode 100644
index 000000000..03f73aaee
--- /dev/null
+++ b/lobbying-scraper/tests/test_portal_parser.py
@@ -0,0 +1,172 @@
+"""Regression tests for the MA SoS lobbying disclosure parser.
+
+The portal HTML has four distinct format eras; the parser is the most likely
+thing to silently break when the portal changes its markup. These tests parse
+committed fixture pages (one per era, employer + individual) and assert known-
+correct compensation totals, client/bill counts, era detection, and specific
+bug fixes (the "Total amount" summary-row artifact; the "H73;" semicolon bill
+separator; hybrid-era Panel1 compensation).
+
+Fixtures: tests/fixtures/*.html.gz (gzipped real disclosure + summary pages).
+"""
+
+import gzip
+import sys
+from pathlib import Path
+
+import pytest
+from bs4 import BeautifulSoup
+
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+from portal import (
+    _parse_amount,
+    parse_disclosure_detail,
+    parse_summary,
+    year_to_general_court,
+)
+
+FIXTURES = Path(__file__).parent / "fixtures"
+
+
+def _soup(name: str) -> BeautifulSoup:
+    with gzip.open(FIXTURES / f"{name}.html.gz", "rt", encoding="utf-8") as fh:
+        return BeautifulSoup(fh.read(), "html.parser")
+
+
+def _comp_total(detail) -> float:
+    return sum(c.amount for c in detail.compensation if c.amount)
+
+
+# ── Disclosure parsing ────────────────────────────────────────────────────────
+
+# (fixture, year, expected_comp, n_clients, n_bills, era_label)
+DISCLOSURE_CASES = [
+    ("2007e", 2007, 112_500.00,   1,    2, "legacy 2005-2008: entity total under _total_salary_"),
+    ("2011e", 2011, 641_243.00,  23,    4, "legacy 2009-2013: per-client Compensation received column"),
+    ("2016e", 2016, 990_474.00,  30, 1357, "hybrid 2014-2018: Panel1 div totals"),
+    ("2024e", 2024, 115_000.00,   5,   22, "modern 2019+: grdvClientPaidToEntity"),
+    ("2024i", 2024, 1_095_200.0, 17,  135, "modern 2019+ individual"),
+    ("2011i", 2011,  18_518.00,   1,    0, "legacy 2009-2013 individual"),
+]
+
+
+@pytest.mark.parametrize("fix,year,exp_comp,n_clients,n_bills,era", DISCLOSURE_CASES)
+def test_compensation_total_and_counts(fix, year, exp_comp, n_clients, n_bills, era):
+    detail = parse_disclosure_detail(_soup(f"{fix}_disc"), year)
+    assert _comp_total(detail) == pytest.approx(exp_comp, abs=1.0), f"{fix} ({era}) comp total"
+    assert len(detail.compensation) == n_clients, f"{fix} ({era}) client count"
+    assert len(detail.bills) == n_bills, f"{fix} ({era}) bill count"
+
+
+@pytest.mark.parametrize("fix,year,_c,_n,_b,_e", DISCLOSURE_CASES)
+def test_no_total_amount_artifact(fix, year, _c, _n, _b, _e):
+    """The legacy individual summary row (client_name == 'Total amount') must
+    never be captured as a real client — that bug inflated 2010-2013 comp rows."""
+    detail = parse_disclosure_detail(_soup(f"{fix}_disc"), year)
+    bad = [
+        c for c in detail.compensation
+        if c.client_name in ("Total amount", "Total", "")
+    ]
+    assert not bad, f"{fix} produced summary-row artifacts: {bad}"
+
+
+def test_legacy_2007_uses_total_salary_placeholder():
+    """2005-2008 has no per-client comp column; comp falls back to the entity
+    salary total stored under the _total_salary_ placeholder client."""
+    detail = parse_disclosure_detail(_soup("2007e_disc"), 2007)
+    assert [c.client_name for c in detail.compensation] == ["_total_salary_"]
+
+
+def test_legacy_2011_is_per_client_not_placeholder():
+    """2009-2013 has a per-client 'Compensation received' column, so comp is
+    stored under real client names — never the _total_salary_ placeholder."""
+    detail = parse_disclosure_detail(_soup("2011e_disc"), 2011)
+    names = [c.client_name for c in detail.compensation]
+    assert "_total_salary_" not in names
+    assert len(names) == len(set(names)), "per-client compensation should be deduplicated"
+
+
+def test_hybrid_2016_has_nonzero_compensation():
+    """2014-2018 compensation comes from Panel1 divs; was silently $0 before fix."""
+    detail = parse_disclosure_detail(_soup("2016e_disc"), 2016)
+    assert _comp_total(detail) == pytest.approx(990_474.00, abs=1.0)
+    assert len(detail.compensation) == 30
+
+
+def test_semicolon_bill_separator_parsed():
+    """Legacy bill tokens may use 'H73; Title' (semicolon separator) instead of
+    a space; the bill number must still be parsed correctly."""
+    detail = parse_disclosure_detail(_soup("2011e_disc"), 2011)
+    house_numbers = {b.raw_bill_number for b in detail.bills if b.chamber == "House Bill"}
+    assert "73" in house_numbers, "H73 (semicolon-separated) should be parsed"
+
+
+def test_modern_individual_per_client_comp():
+    """Modern individual registrants report per-client compensation in
+    grdvClientPaidToEntity — verify it is captured."""
+    detail = parse_disclosure_detail(_soup("2024i_disc"), 2024)
+    assert _comp_total(detail) > 0
+    assert all(
+        c.client_name not in ("Total amount", "") for c in detail.compensation
+    )
+
+
+# ── Summary page parsing ──────────────────────────────────────────────────────
+
+# (fixture, entity_name, year, reg_type, n_disc_urls)
+SUMMARY_CASES = [
+    ("2007e_summ", "Ventry Associates, LLP",   2007, "Employer",  2),
+    ("2011e_summ", "ML Strategies, LLC",        2011, "Employer",  7),
+    ("2024e_summ", "21c, LLC",                  2024, "Employer",  2),
+    ("2024i_summ", "Anthony Arthur Abdelahad",  2024, "Lobbyist",  2),
+    ("2011i_summ", "Aaron Judd Agulnek",        2011, "Lobbyist",  4),
+]
+
+
+@pytest.mark.parametrize("fix,name,year,reg_type,n_urls", SUMMARY_CASES)
+def test_summary_metadata(fix, name, year, reg_type, n_urls):
+    meta = parse_summary(_soup(fix))
+    assert meta.entity_name == name
+    assert meta.year == year
+    assert meta.reg_type == reg_type
+    assert len(meta.disclosure_urls) == n_urls
+    assert all("CompleteDisclosure" in u for u in meta.disclosure_urls)
+
+
+# ── Helper functions ──────────────────────────────────────────────────────────
+
+def test_parse_amount():
+    assert _parse_amount("$1,234.56") == 1234.56
+    assert _parse_amount("$0.00") == 0.0
+    assert _parse_amount("") is None
+    assert _parse_amount("N/A") is None
+
+
+def test_year_to_general_court():
+    assert year_to_general_court(2003) == 183
+    assert year_to_general_court(2004) == 183
+    assert year_to_general_court(2005) == 184
+    assert year_to_general_court(2023) == 193
+    assert year_to_general_court(2025) == 194
+
+
+# ── Bill ID construction ──────────────────────────────────────────────────────
+
+def test_bill_ids_in_modern_disclosure():
+    """Verify bill_id is correctly constructed for each chamber prefix."""
+    detail = parse_disclosure_detail(_soup("2024e_disc"), 2024)
+    house = [b for b in detail.bills if b.chamber == "House Bill"]
+    senate = [b for b in detail.bills if b.chamber == "Senate Bill"]
+    if house:
+        assert all(b.bill_id and b.bill_id.startswith("H") for b in house)
+    if senate:
+        assert all(b.bill_id and b.bill_id.startswith("S") for b in senate)
+
+
+def test_executive_rows_have_null_bill_id():
+    """Executive chamber rows must produce null billId so no accidental bill join occurs."""
+    detail = parse_disclosure_detail(_soup("2024i_disc"), 2024)
+    executive = [b for b in detail.bills if b.chamber == "Executive"]
+    if executive:
+        assert all(b.bill_id is None for b in executive)
diff --git a/lobbying-scraper/writer.py b/lobbying-scraper/writer.py
new file mode 100644
index 000000000..d49d12424
--- /dev/null
+++ b/lobbying-scraper/writer.py
@@ -0,0 +1,123 @@
+"""Firestore document construction and write helpers.
+
+Mirrors the data model in functions/src/lobbying/types.ts. All collection
+names and field names must stay in sync with that file.
+"""
+
+from __future__ import annotations
+
+from datetime import datetime, timezone
+
+from google.cloud import firestore
+from normalize import normalize_entity_name
+from portal import (
+    BillActivity,
+    Compensation,
+    DisclosureDetail,
+    DisclosureMeta,
+    filing_id,
+    registrant_id,
+    year_to_general_court,
+)
+
+REGISTRANTS_COLLECTION = "lobbyingRegistrants"
+FILINGS_COLLECTION = "lobbyingFilings"
+SCRAPER_DOC = "scrapers/lobbying"
+BACKFILL_DOC = "scrapers/lobbyingBackfill"
+BACKFILL_URLS_COLLECTION = "processedUrls"
+
+
+def _now() -> datetime:
+    return datetime.now(tz=timezone.utc)
+
+
+def write_registrant(
+    db: firestore.Client,
+    meta: DisclosureMeta,
+    detail: DisclosureDetail,
+    disc_url: str,
+) -> None:
+    """Upsert a LobbyingRegistrant document."""
+    if not meta.entity_name or meta.year is None:
+        return
+
+    doc_id = registrant_id(meta.entity_name, meta.year)
+    ref = db.collection(REGISTRANTS_COLLECTION).document(doc_id)
+
+    clients = [
+        {
+            "clientName": c.client_name,
+            "clientNameNorm": normalize_entity_name(c.client_name),
+            "compensation": c.amount,
+        }
+        for c in detail.compensation
+    ]
+
+    data = {
+        "registrantId": doc_id,
+        "entityName": meta.entity_name,
+        "entityNameNorm": normalize_entity_name(meta.entity_name),
+        "year": meta.year,
+        "generalCourt": year_to_general_court(meta.year),
+        "regType": meta.reg_type,
+        "clients": clients,
+        "disclosureUrls": firestore.ArrayUnion([disc_url]),
+        "fetchedAt": _now(),
+    }
+    ref.set(data, merge=True)
+
+
+def write_filings(
+    db: firestore.Client,
+    meta: DisclosureMeta,
+    detail: DisclosureDetail,
+) -> int:
+    """Batch-write LobbyingFiling documents. Returns the number written."""
+    if not meta.entity_name or meta.year is None or not detail.bills:
+        return 0
+
+    gc = year_to_general_court(meta.year)
+    entity_name = meta.entity_name
+    entity_norm = normalize_entity_name(entity_name)
+    now = _now()
+
+    batch = db.batch()
+    count = 0
+
+    for bill in detail.bills:
+        fid = filing_id(
+            entity_name,
+            bill.client_name,
+            bill.chamber,
+            bill.bill_id,
+            gc,
+            bill.position,
+        )
+        ref = db.collection(FILINGS_COLLECTION).document(fid)
+        doc = {
+            "filingId": fid,
+            "entityName": entity_name,
+            "entityNameNorm": entity_norm,
+            "clientName": bill.client_name,
+            "clientNameNorm": normalize_entity_name(bill.client_name),
+            "year": meta.year,
+            "generalCourt": gc,
+            "chamber": bill.chamber,
+            "billId": bill.bill_id,
+            "activityTitle": bill.activity_title,
+            "position": bill.position,
+            "amount": bill.amount,
+            "fetchedAt": now,
+        }
+        batch.set(ref, doc)
+        count += 1
+
+        # Firestore batch limit is 500 writes
+        if count % 400 == 0:
+            batch.commit()
+            batch = db.batch()
+
+    if count % 400 != 0:
+        batch.commit()
+
+    return count