diff --git a/.gitignore b/.gitignore index e2da7cc59..7301e0ec2 100644 --- a/.gitignore +++ b/.gitignore @@ -92,5 +92,10 @@ cert.txt # local MCP server config (contains auth tokens) .mcp.json mcp-server/create-agent-key.ts -.gcloudignore + +# Claude CLAUDE.md + +#gcloud +.gcloudignore + diff --git a/docs/lobbying-disclosure-ingestion.md b/docs/lobbying-disclosure-ingestion.md new file mode 100644 index 000000000..99c9652a7 --- /dev/null +++ b/docs/lobbying-disclosure-ingestion.md @@ -0,0 +1,1038 @@ +# Lobbying Disclosure Ingestion Pipeline + +## Overview + +The MA Secretary of State lobbying portal +([sec.state.ma.us/LobbyistPublicSearch](https://www.sec.state.ma.us/LobbyistPublicSearch/)) +publishes semi-annual disclosure filings for all registered lobbyists and +lobbying entities. This document describes the plan for scraping that data and +storing it in Firestore in a way that allows joining to MAPLE bill data. + +The portal has three levels of pages: + +1. **Search page** → one row per registrant per year +2. **Summary page** → registrant metadata + links to semi-annual disclosure + filings +3. **CompleteDisclosure page** → per-client compensation table + per-client bill + activity tables + +Historical data goes back to 2005. MAPLE has bill data only from ~2020 onward, +so bill joins will only resolve for filings from the 192nd General Court (2021) +and later. All historical filings are ingested regardless. + +--- + +## Terminology + +The portal has two registrant types: + +- **Lobbyist** — an individual person who lobbies directly on behalf of clients. +- **Employer** — a lobbying firm that employs individual lobbyists and is + retained by clients. Called "Lobbyist Entity" on the portal. + +In both cases, the registrant reports compensation received from each **client** +(the organization that hired them) and which bills they lobbied for that client. + +--- + +## Firestore Data Model + +Two top-level collections, normalized by registrant and by lobbying activity +record. + +### `/lobbyingRegistrants/{registrantId}` + +`registrantId` is a SHA-256 hash (first 40 hex chars) of +`{entityName}_{year}`. Hashing avoids sanitizing arbitrary Unicode and +punctuation in entity names to fit Firestore ID constraints while remaining +stable and dedup-safe across runs. + +One model covers both individual lobbyists and lobbying firms. A separate model +is not needed because the portal search returns both under the same schema, and +per-filing detail pages do not expose which individual lobbyists within a firm +worked on which bill. + +```typescript +interface LobbyingRegistrant { + registrantId: string // SHA-256 hash of "{entityName}_{year}" + entityName: string // firm name or individual lobbyist name (raw portal value) + entityNameNorm: string // normalized form; see Normalization section + year: number + generalCourt: number // computed from year + regType: "Lobbyist" | "Employer" + clients: LobbyingClient[] + disclosureUrls: string[] // source portal URLs, for audit trail + fetchedAt: Timestamp +} + +interface LobbyingClient { + clientName: string + clientNameNorm: string // normalized form + compensation: number | null +} +``` + +**Example document** (`lobbyingRegistrants/0f11970cc6f17e35cc02685b794cb63a655c13b3`): + +```json +{ + "registrantId": "0f11970cc6f17e35cc02685b794cb63a655c13b3", + "entityName": "27 South Strategies, LLC", + "entityNameNorm": "27 SOUTH STRATEGIES", + "year": 2024, + "generalCourt": 193, + "regType": "Employer", + "clients": [ + { + "clientName": "The Massachusetts International Festival of the Arts, Inc.", + "clientNameNorm": "MASSACHUSETTS INTERNATIONAL FESTIVAL OF ARTS", + "compensation": 24000.0 + }, + { + "clientName": "Veterinary Emergency Group, LLC", + "clientNameNorm": "VETERINARY EMERGENCY GROUP", + "compensation": 33000.0 + }, + { + "clientName": "Gobrands, Inc", + "clientNameNorm": "GOBRANDS", + "compensation": 60000.0 + } + ], + "disclosureUrls": [ + "https://www.sec.state.ma.us/LobbyistPublicSearch/CompleteDisclosure.aspx?sysvalue=hTpfuAXM..." + ] +} +``` + +### `/lobbyingFilings/{filingId}` + +`filingId` is a SHA-256 hash (first 40 hex chars) of the logical key +`{entityName}_{clientName}_{chamber}_{activityRef}_{generalCourt}`. + +```typescript +type LobbyingChamber = + | "House Bill" + | "Senate Bill" + | "House Docket" + | "Senate Docket" + | "Executive" // lobbying of executive branch agencies + | "Other" // catch-all for rare legacy codes (FY, CMR, etc.) + +interface LobbyingFiling { + filingId: string + entityName: string // raw portal value + entityNameNorm: string // normalized form + clientName: string // raw portal value; "_total_salary_" sentinel for pre-2013 + clientNameNorm: string // normalized form + year: number + generalCourt: number + chamber: LobbyingChamber + // For legislative chambers: the bill number string (e.g. "H1234", "HD56"). + // For Executive: the agency name. Not a bill reference. + billId: string | null + activityTitle: string // bill title (legislative) or meeting description (executive) + position: string // "Support" | "Oppose" | "Neutral" | etc.; empty for executive + amount: number | null // compensation allocated to this activity + fetchedAt: Timestamp +} +``` + +**Example documents** — query: `lobbyingFilings` where `generalCourt == 193` and `billId == "H1"`: + +```json +[ + { + "filingId": "0bac34faf2f624083daaaea57ee55f5364d2be29", + "entityName": "Christina Ascolillo", + "entityNameNorm": "CHRISTINA ASCOLILLO", + "clientName": "American Council of Engineering Companies of Massachusetts", + "clientNameNorm": "AMERICAN COUNCIL OF ENGINEERING COMPANIES OF MASSACHUSETTS", + "year": 2023, + "generalCourt": 193, + "chamber": "House Bill", + "billId": "H1", + "activityTitle": "An Act making appropriations for the Fiscal Year 2024...", + "position": "Neutral", + "amount": 0.0 + }, + { + "filingId": "109600251ca8fd279e8bb7562bc9d234c39f63a8", + "entityName": "Christina Ascolillo", + "entityNameNorm": "CHRISTINA ASCOLILLO", + "clientName": "St. Mary's Center for Women and Children, Inc.", + "clientNameNorm": "ST MARY S CENTER FOR WOMEN AND CHILDREN", + "year": 2023, + "generalCourt": 193, + "chamber": "House Bill", + "billId": "H1", + "activityTitle": "An Act making appropriations for the Fiscal Year 2024...", + "position": "Neutral", + "amount": 0.0 + } +] +``` + +### Constructing `billId` from Raw Portal Data + +The portal stores bill numbers as bare integers and records the chamber +separately. The `billId` field — which maps to `Bill.id` in MAPLE — is +constructed during ingest by combining chamber prefix and integer: + +| `chamber` | Prefix | Example raw | `billId` | +| --------------- | ------ | ----------- | -------- | +| `House Bill` | `H` | `1234` | `H1234` | +| `Senate Bill` | `S` | `1234` | `S1234` | +| `House Docket` | `HD` | `56` | `HD56` | +| `Senate Docket` | `SD` | `56` | `SD56` | +| `Executive` | — | agency name | `null` | +| `Other` | — | varies | `null` | + +Note: `H1234` and `S1234` are distinct bills even though they share the same +integer. The prefix is required to disambiguate. `billId` is `null` for +non-legislative chambers. + +#### Legacy chamber code normalization + +The portal uses short-form codes in older filings, normalized during ingest: + +| Raw value | Stored as | +| --------- | ------------- | +| `HB` | `House Bill` | +| `SB` | `Senate Bill` | + +Rare codes (`FY`, `C`, `CMR`, `HR`, etc.) are stored as `Other`. + +### Joining to Bill Data + +**The join only applies to legislative chambers** (`House Bill`, `Senate Bill`, +`House Docket`, `Senate Docket`) where `billId` is non-null. For `Executive` +and `Other`, no join should be attempted. + +```typescript +// Only valid when filing.billId !== null +db.collection(`/generalCourts/${filing.generalCourt}/bills`).doc(filing.billId) +``` + +--- + +## Entity Name Normalization + +The portal does not enforce consistent name formatting. The same client or +registrant may appear as "Acme Corp.", "ACME CORPORATION", "Acme, Inc. d/b/a +Acme Consulting", etc. across filings and years. Without normalization, +grouping by entity is unreliable. + +Both `entityName` and `clientName` are normalized using the following pipeline, +applied in order. The raw portal value is always preserved alongside the +normalized form. + +### Normalization pipeline + +1. **Uppercase** — convert the entire string to upper case. +2. **Strip d/b/a suffix** — remove everything from the first occurrence of + `D/B/A`, `D/B/A`, `DBA` (and spacing variants) onward, so the registered + name is used rather than a trade name. +3. **Hyphen → space** — replace `-` with ` ` so `LAN-TEL` and `LAN TEL` + collapse to the same key. +4. **Punctuation → space** — replace `,`, `.`, `'`, `'`, `'`, `(`, `)` with + space. Replacement with space (not empty string) prevents adjacent tokens + from concatenating (e.g. `,INC` becomes ` INC`, which is then caught by step + 5). +5. **Remove legal entity type words** — whole-word removal of: `LLC`, `LLP`, + `INC`, `INCORPORATED`, `CORPORATION`, `CORP`, `LTD`, `LIMITED`, `PC`, + `PLLC`. +6. **Remove "THE"** — whole-word removal anywhere in the string (not just as a + leading prefix). +7. **Ampersand → AND** — replace `&` with `AND`. +8. **Fix known typo** — replace `ASSICIATES` with `ASSOCIATES` (legacy portal + data). +9. **Remove professional suffix phrases** — whole-phrase removal of: `LAW +OFFICE OF`, `AND ASSOCIATES`, `& ASSOCIATES`, `AND ASSOC`, `ATTORNEY AT +LAW`, `ATTORNEY@LAW`, `ATTORNET AT LAW`, `AND PARTNERS`, `PUBLIC POLICY +GROUP`, `LEGISLATIVE SERVICES`, `POLICY GROUP`, `ASSOCIATES`, `COUNSELLORS +AT LAW`. +10. **Collapse whitespace** — replace runs of whitespace with a single space and + strip leading/trailing whitespace. + +### Usage + +`entityNameNorm` and `clientNameNorm` are stored on every document and filing. +They should be used for grouping, deduplication, and display-level matching. +Raw names are preserved for provenance and audit. + +--- + +## Deduplication and Amount Aggregation + +### Does lobbying the same bill multiple times mean we should sum amounts? + +The portal collects two semi-annual disclosure filings per registrant per year +(one for each 6-month period). In theory, a registrant could report the same +bill in both H1 and H2 filings with separate compensation amounts that should +be summed. Analysis of the actual data shows this does not occur: after +processing, zero rows share the same `(entityName, clientName, year, +generalCourt, billId, position)` — each (registrant, client, bill, year) +combination appears exactly once. The semi-annual periods report different +activity, not the same activity twice. + +The same registrant can lobby the same bill across multiple General Courts +(observed up to 6 times across years). These are stored as separate documents +per `generalCourt` and should not be summed — each court is a distinct +legislative session. + +### Null-bill row deduplication + +The one real duplication artifact in the portal data is **null-bill rows** — +entries filed when a registrant had no specific bills to report for a client in +a period. These appear in both the H1 and H2 disclosures as identical rows and +should be collapsed. During ingest, if the same `(entityName, clientName, year, +generalCourt, chamber, position)` with a null `billId` is encountered more than +once, keep the row with the highest `amount` so no spend is lost if the two +copies carry different values (in practice amounts are usually both zero). + +### Ingest strategy + +When processing multiple disclosure URLs for the same registrant+year, write +`lobbyingFilings` documents using the logical key as the document ID. A +subsequent disclosure URL that produces the same document ID will naturally +upsert (overwrite) rather than duplicate. For null-bill rows, since `billId` is +null, include `chamber` in the document ID to avoid false merges between +executive and legislative null rows. + +--- + +## Scraper Architecture + +### Why a standalone Cloud Run container + +The MA SoS portal is protected by Imperva WAF, which uses TLS fingerprinting to +classify HTTP clients at the network layer before examining any headers. Node.js +produces a TLS fingerprint that Imperva challenges with a JavaScript +verification page; Python's `requests` library produces a fingerprint that +Imperva allows through without challenge. This is a runtime-level constraint +that cannot be addressed by header configuration or cipher reordering alone. + +The scraper therefore runs as a standalone **Cloud Run container** written in +Python, deployed alongside the existing MCP server container. All data modeling, +Firestore collection/field names, and normalization logic are documented here and +kept consistent between the Python container and the TypeScript type definitions +in `functions/src/lobbying/types.ts`. + +### Cloud Run container: `lobbying-scraper/` + +**Files:** `lobbying-scraper/{scrape,portal,normalize,writer}.py` + +- Scheduled weekly by Cloud Scheduler +- Runs an incremental check: fetches the current and prior year's summary links + (one POST), compares disc URLs against the Firestore cursor, and **exits + immediately if nothing is new** (fast path, typically seconds) +- When new or updated disclosures are found, fetches and processes them +- Persists a cursor in `scrapers/lobbying`: + - `processedDiscUrls: string[]` — disc URLs already written; skipped on + re-runs + - `summaryDiscCache: {[summaryUrl]: string[]}` — maps summary page URLs to + their disc URLs so summary page GETs are skipped for prior-year registrants + whose disclosures are all already processed +- For each new disclosure URL: + - Parse registrant + client compensation rows → upsert `lobbyingRegistrants` + - Parse bill activity rows → batch-write `lobbyingFilings` +- 1s delay between requests; exponential backoff on transient failures + +### Incremental strategy + +In steady state (after the initial backfill), each weekly run: + +1. One POST to fetch all summary links for current + prior year +2. For prior-year registrants with all disc URLs in the cursor: zero GETs +3. For current-year registrants: one GET per summary page to check for new + disclosure periods +4. For any new disc URLs: one GET per disclosure page + +New filings arrive twice a year (semi-annual reporting periods). Between +periods, the run completes in under a minute. + +The backfill script (`--mode backfill`) uses a separate subcollection cursor at +`scrapers/lobbyingBackfill/processedUrls/{urlHash}` so it does not interfere +with the live scraper state. + +### Portal HTML format eras + +The `CompleteDisclosure.aspx` page has changed layout several times. The parser +handles all four eras: + +| Era | Years | Compensation source | Notes | +| -------- | ------------ | ----------------------------------------------------- | --------------------------------------------------------------------------------------------------- | +| Modern | 2019–present | `grdvClientPaidToEntity` grid | Per-client rows; both employer and individual registrant variants | +| Hybrid | 2014–2018 | `Panel1_N` div blocks | Compensation in collapsible panels; bill activity in a separate grid | +| Legacy B | 2009–2013 | "Compensation received" column in bill-activity table | Per-client amounts; deduplication required (same (client, amount) pair can appear in multiple rows) | +| Legacy A | 2005–2008 | `grdvSalaryPaid` entity-total row | No per-client breakdown; stored under sentinel `clientName: "_total_salary_"` | + +Pre-2009 **individual** lobbyist summary pages link to `RegVersionLobbyist.aspx` +rather than `CompleteDisclosure.aspx`. These produce no disclosure detail and are +skipped — expected portal behavior. Employer/entity registrants use +`CompleteDisclosure.aspx` correctly across all years. + +For Legacy A filings (2005–2008), `clientName: "_total_salary_"` signals to +callers that per-client compensation is unavailable. No bill-level compensation +amount is available for these years. + +--- + +## New Files + +``` +functions/src/lobbying/ + types.ts — Runtypes schema definitions for LobbyingRegistrant, LobbyingFiling + normalize.ts — Entity name normalization pipeline (also used client-side) + index.ts — Re-exports + +lobbying-scraper/ + scrape.py — Entry point: --mode weekly (incremental) | --mode backfill + portal.py — HTTP fetch wrappers + pure HTML parsers for all 4 format eras + normalize.py — Port of normalize.ts + writer.py — Firestore document construction + writes + archive.py — GCS raw-HTML archive (write-only; enabled by ARCHIVE_RAW=1) + reparse_archive.py — Offline driver to re-ingest archived HTML into Firestore + requirements.txt — requests, beautifulsoup4, google-cloud-firestore, google-cloud-storage + Dockerfile — Python 3.12-slim image +``` + +The TypeScript lobbying module (`functions/src/lobbying/`) contains only the +schema types and normalization logic. There is no TypeScript scraper or +Firebase Function — ingestion is handled entirely by the Cloud Run container. +This follows the same pattern as the MCP server and avoids the complexity of +running multiple language runtimes in the same Firebase Functions deployment. + +--- + +## Deploying the Cloud Run Container + +Follows the same pattern as the MCP server. The Artifact Registry repo +(`maple-lobbying`) and Cloud Run job (`maple-lobbying-scraper`) are already +created in `digital-testimony-dev`. + +```bash +cd lobbying-scraper +IMAGE=us-central1-docker.pkg.dev/digital-testimony-dev/maple-lobbying/scraper:latest +docker build -t $IMAGE . && docker push $IMAGE + +gcloud run jobs update maple-lobbying-scraper \ + --image=$IMAGE \ + --project=digital-testimony-dev \ + --region=us-central1 +``` + +For a new project (prod), create the job first: + +```bash +gcloud artifacts repositories create maple-lobbying \ + --repository-format=docker --location=us-central1 --project= + +gcloud run jobs create maple-lobbying-scraper \ + --image=$IMAGE \ + --project= \ + --region=us-central1 \ + --task-timeout=30m \ + --max-retries=0 + +# Schedule weekly (Mondays 6am UTC) +gcloud scheduler jobs create http maple-lobbying-weekly \ + --schedule="0 6 * * 1" \ + --uri="https://us-central1-run.googleapis.com/apis/run.googleapis.com/v1/namespaces//jobs/maple-lobbying-scraper:run" \ + --http-method=POST \ + --oauth-service-account-email=@.iam.gserviceaccount.com \ + --location=us-central1 +``` + +## Historical Backfill + +Runs `scrape.py --mode backfill` directly. Resumable — the subcollection +cursor at `scrapers/lobbyingBackfill/processedUrls` tracks progress. + +Always set `ARCHIVE_RAW=1` for real runs so every fetched page is preserved +for offline reparsing. Create the GCS archive bucket first if it does not +exist (see Step 7 of the test plan). + +```bash +cd lobbying-scraper + +# Test a single year with no writes +GOOGLE_CLOUD_PROJECT=digital-testimony-dev \ + GOOGLE_APPLICATION_CREDENTIALS=~/.config/gcloud/application_default_credentials.json \ + python3 scrape.py --mode backfill --year 2024 --limit 3 --dry-run + +# Run a single year for real (with archiving) +GOOGLE_CLOUD_PROJECT=digital-testimony-dev \ + GOOGLE_APPLICATION_CREDENTIALS=~/.config/gcloud/application_default_credentials.json \ + ARCHIVE_RAW=1 \ + python3 scrape.py --mode backfill --year 2024 + +# Full history (2005–present, resumable, with archiving) +GOOGLE_CLOUD_PROJECT=digital-testimony-dev \ + GOOGLE_APPLICATION_CREDENTIALS=~/.config/gcloud/application_default_credentials.json \ + ARCHIVE_RAW=1 \ + python3 scrape.py --mode backfill +``` + +--- + +## Firestore Rules + +Add read-only public rules alongside the existing `generalCourts` rule: + +``` +match /lobbyingRegistrants/{doc} { allow read: if true; } +match /lobbyingFilings/{doc} { allow read: if true; } +``` + +--- + +## Firestore Indexes + +Add composite indexes for common query patterns: + +| Collection | Fields | Use case | +| ----------------- | -------------------------------------- | ---------------------------------------- | +| `lobbyingFilings` | `generalCourt ASC, billId ASC` | Fetch all legislative filings for a bill | +| `lobbyingFilings` | `generalCourt ASC, chamber ASC` | Filter by chamber within a court | +| `lobbyingFilings` | `generalCourt ASC, entityNameNorm ASC` | Fetch all filings for a registrant | +| `lobbyingFilings` | `generalCourt ASC, clientNameNorm ASC` | Fetch all filings for a client | + +Note: bill-join queries should always filter on `chamber` (or check +`billId !== null`) to exclude `Executive` and `Other` rows before treating +`billId` as a MAPLE bill reference. + +--- + +## Implementation Status + +| File | Status | Notes | +| ------------------------------------- | ---------- | ---------------------------------------------------------------- | +| `functions/src/lobbying/types.ts` | ✅ Done | Firestore schema types; imported by future frontend code | +| `functions/src/lobbying/normalize.ts` | ✅ Done | Normalization pipeline; also ported to `normalize.py` | +| `functions/src/lobbying/index.ts` | ✅ Done | Re-exports types and normalize | +| `firestore.rules` | ✅ Done | | +| `firestore.indexes.json` | ✅ Done | | +| `lobbying-scraper/normalize.py` | ✅ Done | Port of normalize.ts | +| `lobbying-scraper/portal.py` | ✅ Done | All 4 format eras; pure parsers separated from fetch wrappers | +| `lobbying-scraper/writer.py` | ✅ Done | Firestore document construction | +| `lobbying-scraper/scrape.py` | ✅ Done | Entry point; `--mode weekly` and `--mode backfill` | +| `lobbying-scraper/archive.py` | ✅ Done | GCS write-only archive; enabled via `ARCHIVE_RAW=1` | +| `lobbying-scraper/reparse_archive.py` | ✅ Done | Offline reparse driver; looks up registrant meta from Firestore | +| `lobbying-scraper/Dockerfile` | ⚠️ Rebuild | Needs rebuild after `google-cloud-storage` added to requirements | + +### Document ID scheme + +Both `registrantId` and `filingId` are SHA-256 hashes (first 40 hex chars) of +their respective logical keys. Hashes are used rather than slugified strings +because entity names and client names contain arbitrary Unicode and punctuation +that would require aggressive sanitization to fit Firestore ID constraints. The +hash is stable across runs for the same logical record. + +--- + +## Future Work (Subsequent PRs) + +### Frontend + +- **Dedicated lobbying pages** + + - `/lobbyists` index: searchable list of registrants with total compensation, + client count, and year filter + - `/lobbyists/{registrantId}` profile: full client list, all bills lobbied, + compensation over time + - `/clients/{clientNameNorm}` profile: registrants hired, bills lobbied, + total spend per year + +- **Bill page integration** (`/bills/{court}/{billId}`) + + - "Lobbying activity" section listing registrants + clients that lobbied this + bill, with position (Support / Oppose / Neutral) and compensation where + available + - Link to registrant profile pages + +- **Organization profile page integration** + - If an organization's normalized name matches a `clientNameNorm` in + `lobbyingFilings`, surface a "Lobbying history" panel showing which bills + they lobbied and which registrants they hired + +### MCP Tools + +Expose lobbying data via the MAPLE MCP server so that AI agents and Claude can +answer questions like "who lobbied bill H1234?" or "what did Acme Corp lobby +for in 2024?". + +- **`get_lobbying_filings_for_bill`** — given `generalCourt` + `billId`, return + all `lobbyingFilings` for that bill with registrant, client, position, and + amount +- **`get_lobbying_registrant`** — given `registrantId`, return the registrant + document with client list and disclosure URLs +- **`search_lobbying_by_client`** — given a client name (raw or normalized), + return matching filings across all courts +- **`get_lobbying_summary_for_bill`** — aggregate view: unique registrant count, + unique client count, total compensation (where non-null), position breakdown + +--- + +## Incremental Test Plan + +Testing proceeds from the inside out: unit logic first, then live portal +fetches against the real site, then a small Firestore write, then a full +backfill year, then steady-state function operation. + +### Step 1 — Unit tests: parser, normalization, bill construction + +Run the pytest suite against all 4 HTML format eras using committed fixture +pages: + +```bash +cd lobbying-scraper +python -m pytest tests/ -v +``` + +Expected: 26 tests pass. Covers compensation totals and client/bill counts for +all eras (2007, 2011, 2016, 2024 — employer and individual), normalization edge +cases, bill ID construction, and specific bug regressions (semicolon bill +separator, "Total amount" artifact row, null billId for executive rows). + +### Step 2 — Live portal fetch: dry run + +Verify the portal is reachable and the parser returns valid data without writing +to Firestore: + +```bash +cd lobbying-scraper +GOOGLE_CLOUD_PROJECT=digital-testimony-dev \ + GOOGLE_APPLICATION_CREDENTIALS=~/.config/gcloud/application_default_credentials.json \ + python3 scrape.py --mode backfill --year 2024 --limit 3 --dry-run +``` + +Expected: 3 registrants fetched, compensation and bill rows printed, no +Firestore writes. + +### Step 3 — Firestore write: single year, small limit + +Write a small batch to the dev project and verify results: + +```bash +cd lobbying-scraper +GOOGLE_CLOUD_PROJECT=digital-testimony-dev \ + GOOGLE_APPLICATION_CREDENTIALS=~/.config/gcloud/application_default_credentials.json \ + python3 scrape.py --mode backfill --year 2024 --limit 3 +``` + +Verify in Firestore console: + +- `lobbyingRegistrants` has 3 documents with `entityName`, `entityNameNorm`, + `regType`, `clients`, `generalCourt` +- `lobbyingFilings` has documents with `billId` non-null for legislative rows + and `null` for Executive rows +- `scrapers/lobbyingBackfill/processedUrls` has entries with `url` and + `processedAt` fields +- Re-running skips already-processed URLs (output shows 0 new disclosures) + +### Step 4 — Spot-check: bill join + +Pick a `lobbyingFiling` document with a non-null `billId` and a `generalCourt` +≥ 192. Verify the bill exists in MAPLE: + +``` +/generalCourts/{filing.generalCourt}/bills/{filing.billId} +``` + +If the bill is found, the join key is correct. If not found, check: (a) whether +MAPLE has data for that court, (b) whether the bill number format matches +(prefix + integer, no leading zeros). + +### Step 5 — Create GCS archive bucket (once per project) + +The archive bucket name is derived from `GOOGLE_CLOUD_PROJECT`. Create it once +with the Archive storage class (written once, read rarely): + +```bash +gsutil mb -p digital-testimony-dev -l us-central1 \ + -c archive gs://digital-testimony-dev-lobbying-archive +``` + +Repeat for prod when running the prod backfill: + +```bash +gsutil mb -p digital-testimony-prod -l us-central1 \ + -c archive gs://digital-testimony-prod-lobbying-archive +``` + +Each project uses its own bucket. Dev and prod data are isolated; the Cloud Run +service account for each project only needs access to its own bucket. + +### Step 6 — Backfill: full current year (or partial across all years) + +Always run with `ARCHIVE_RAW=1` so every fetched page is preserved for offline +reparsing. For a large-scale dev test before a full prod run, `--limit N` is +applied per year — e.g. `--limit 50` across all years fetches ~10% of history: + +```bash +cd lobbying-scraper + +# Full current year +GOOGLE_CLOUD_PROJECT=digital-testimony-dev \ + GOOGLE_APPLICATION_CREDENTIALS=~/.config/gcloud/application_default_credentials.json \ + ARCHIVE_RAW=1 \ + python3 scrape.py --mode backfill --year 2024 + +# ~10% partial across all years (good large-scale dev validation) +GOOGLE_CLOUD_PROJECT=digital-testimony-dev \ + GOOGLE_APPLICATION_CREDENTIALS=~/.config/gcloud/application_default_credentials.json \ + ARCHIVE_RAW=1 \ + python3 scrape.py --mode backfill --limit 50 +``` + +Expected for full year: ~500–600 registrants, ~1,000 disclosure pages, several +thousand filing documents written. + +### Step 7 — Backfill: full history (2005–present) + +Run without `--year` to process all years. Can be interrupted and resumed: + +```bash +GOOGLE_CLOUD_PROJECT=digital-testimony-dev \ + GOOGLE_APPLICATION_CREDENTIALS=~/.config/gcloud/application_default_credentials.json \ + ARCHIVE_RAW=1 \ + python3 scrape.py --mode backfill +``` + +Expected runtime: several hours at ~1s/request. The subcollection cursor at +`scrapers/lobbyingBackfill/processedUrls` allows safe interruption and +resumption. + +### Step 8 — Deploy and verify Cloud Run scraper + +Build and push the container image, then update the Cloud Run job: + +```bash +cd lobbying-scraper +IMAGE=us-central1-docker.pkg.dev/digital-testimony-dev/maple-lobbying/scraper:latest +docker build -t $IMAGE . && docker push $IMAGE + +gcloud run jobs update maple-lobbying-scraper \ + --image=$IMAGE \ + --project=digital-testimony-dev \ + --region=us-central1 +``` + +Trigger a manual run via the Cloud Run console or: + +```bash +gcloud run jobs execute maple-lobbying-scraper \ + --project=digital-testimony-dev \ + --region=us-central1 +``` + +Verify via Cloud Run logs: the run should find near-zero new disclosures if the +backfill already completed, confirming the weekly fast-path works correctly. + +--- + +## Design Decisions + +| Decision | Choice | Rationale | +| --------------------------- | ---------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| Collection placement | Top-level `/lobbyingRegistrants`, `/lobbyingFilings` | Lobbying data spans multiple General Courts and is not scoped to a single court like bills/members | +| Single registrant model | One type, `regType: "Lobbyist" \| "Employer"` | Individual lobbyists and firms share the same portal schema; per-bill individual attribution is not available | +| `billId` construction | `{chamberPrefix}{billNumber}` at ingest time | Raw portal data stores chamber and integer separately; the composite is what matches MAPLE's `Bill.id` | +| `billId` null for Executive | `null` instead of agency name | Prevents accidental bill lookups; makes join guard explicit at the type level | +| Normalized name fields | Store both raw and `*Norm` fields | Raw names preserved for provenance; normalized names used for grouping and matching | +| HTML parser | `beautifulsoup4` (Python) | Runs in the Cloud Run container alongside the scraper; no JavaScript runtime needed | +| Live scraper cursor | Array in `/scrapers/lobbying` doc | ~1,000 URLs/year fits well within the 1 MB Firestore doc limit; simple and atomic with other scraper state | +| Backfill cursor | Firestore subcollection `/scrapers/lobbyingBackfill/processedUrls/{urlHash}` | Full 2005-present history (~50,000 URLs) would exceed the 1 MB doc limit; subcollection scales without bound and is durable, inspectable, and resumable from any machine | +| Incremental strategy | Skip already-processed disclosure URLs; write docs by logical key (upsert) | Survives function restarts and re-runs without re-fetching already-scraped pages; natural upsert prevents duplicates without an explicit dedup pass | +| Legacy format (pre-2013) | Store with `clientName: "_total_salary_"` sentinel | Preserves data completeness; callers can filter on this value | +| Historical data | Admin backfill script (2005 → present) | Full history is ingested once; Cloud Function maintains current+prior year going forward | + +--- + +## Appendix: Phase 2 — OCPF Contribution Ingestion + +This appendix tracks planned additions to the lobbying pipeline in a subsequent +PR. The changes link OCPF (Office of Campaign and Political Finance) campaign +contribution records to lobbying registrants, enabling questions like "how much +did this firm contribute to politicians while lobbying on this bill?" + +### Background + +The MA Secretary of State portal (Phase 1, implemented) covers who lobbied which +bills and for how much compensation. OCPF covers who donated to which political +candidates. Linking the two surfaces the intersection: lobbying firms that also +made political contributions. + +OCPF publishes bulk data files at +`ocpf2.blob.core.windows.net/downloads/data2/` (the same host used by +`matchOcpfMembers.ts` for the filers file). The contribution records file URL +needs to be confirmed before implementation begins. + +### New Data + +#### `/lobbyingContributionRecipients/{recipientId}` + +One document per deduplicated contribution recipient (politician/PAC). The +deduplication pipeline is described below. + +```typescript +interface LobbyingContributionRecipient { + recipientId: string // SHA-256(recipientName + officeSought)[:40] + recipientName: string // canonical display name after dedup + recipientSlug: string // slugify("{recipientName} {officeSought}") + officeSought: string // canonical office value (see normalization below) + totalReceived: number + nContributions: number + years: number[] + nameVariants: string[] // all raw names that resolved to this record + manuallyMerged: boolean // true if recipient_merges.json was applied + topFirms: [string, string, number][] // [entityName, entityNameNorm, amount] + fetchedAt: Timestamp +} +``` + +#### New fields on `LobbyingRegistrant` (optional, written by contribution run) + +```typescript +nBills?: number // total filing rows across all clients and years +nContributions?: number // total OCPF contribution records from this registrant +totalContributions?: number // total dollar amount contributed +``` + +`nBills` is computed during the disclosure scrape (bill rows are already in +memory) and written as a side effect of `--mode weekly` and `--mode backfill`. +`nContributions` and `totalContributions` are written by `--mode contributions`. + +#### Pure-contribution firms (new registrant subtype) + +Firms that made OCPF contributions but have zero lobbying activity (no filings) +are written as `LobbyingRegistrant` documents with `nBills: 0`, `clients: []`, +`totalContributions ≥ 500`. These were previously invisible. The $500 threshold +filters noise; adjust after reviewing the data. + +### Contribution Recipient Deduplication + +The same politician may appear under many name variants in OCPF data ("Joe +Curtatone", "Joseph Curtatone", "Curtatone"). The dedup pipeline runs in five +passes in-memory after loading all contribution records. + +All rules are systemic (handle a class of inputs); individual special cases are +handled via `recipient_merges.json` only. + +#### New file: `lobbying-scraper/recipient_normalize.py` + +**Step 1 — `_extract_recipient_name(raw)`** + +Strips committee wrapper names using 14 regex patterns (e.g. "Cte. to Elect Ron +Mariano" → "Ron Mariano"). Then applies in order: + +- Last-first flip: `"Mariano, Ronald"` → `"Ronald Mariano"` +- Honorific strip: `"Sen. Marc Rodrigues"` → `"Marc Rodrigues"` +- Party label strip: `"Ron Mariano Democrat"` → `"Ron Mariano"` +- Initial-dot prefix strip: `"R.Mariano"` → `"Mariano"` + +**Step 2 — `_recipient_dedup_key(name, office)` → `(first, last, office)`** + +Key is a `(first_token, last_token, canonical_office)` tuple. Before keying, +`first_token` is passed through `_NICKNAME_MAP`: + +| Raw | Canonical | +| ---------------- | --------- | +| joe | joseph | +| mike | michael | +| bob | robert | +| bill | william | +| jim | james | +| tom | thomas | +| dan | daniel | +| dave | david | +| steve | steven | +| ron | ronald | +| tim | timothy | +| rick, rich, dick | richard | +| charlie, chuck | charles | +| ben | benjamin | +| tony | anthony | +| marty | martin | +| don | donald | +| ken | kenneth | +| ed | edward | +| ray | raymond | +| nick | nicholas | + +If either token is a generic word (`the`, `comm`, `committee`, `democratic`, +`republican`, `house`, `senate`, `friends`, `cte`, `pac`), the key falls back +to the full canonical text. + +**Step 3 — Manual merges (`recipient_merges.json`)** + +JSON file maps `(name, office)` alias pairs to a canonical record. Start as an +empty object `{}`; populate incrementally as data review finds misses. Applied +after key assignment. + +**Step 4 — `_merge_bare_surnames(groups)`** + +Single-name entries where `first == last` (e.g. `"Curtatone"`) are merged into +the highest-total multi-token peer with the same `(last, office)`. + +**Step 5 — `_fuzzy_merge_keys(groups)`** + +Groups keys by `(first, office)`. Pairs where last tokens differ by Levenshtein +distance ≤ 1 are merged into the higher-total key. Minimum 6 chars on both last +tokens (protects short surnames: Walsh, Jones, etc.). + +### Office Normalization — `_normalize_office(raw)` in `recipient_normalize.py` + +Canonical office values: + +`State Representative` · `State Senator` · `Governor` · `Lt. Governor` · +`Attorney General` · `Treasurer` · `Auditor` · `Secretary of State` · `Mayor` · +`City Council` · `District Attorney` · `Sheriff` · `PAC` + +Normalization pipeline (applied in order): + +1. Strip `"Candidate for [Massachusetts] X"` prefix → `"X"` +2. Strip leading `MA` / `Mass.` / `Massachusetts` +3. Strip trailing junk punctuation (`:;/`) +4. Strip parenthetical suffixes +5. Strip `"of the …"` / `"from the …"` suffixes +6. Strip after dash or slash +7. Strip after comma +8. Strip ordinal suffixes (e.g. `" 3rd Norfolk …"`) +9. Re-strip trailing junk (ordinal strip can leave dangling colons) +10. `OFFICE_MAP` lookup (canonical string → canonical value) +11. Fallback — district lookup: ordinal + MA county → `State Representative` +12. Fallback — county suffix: `"Representative Suffolk"` → strip county → retry map +13. Fallback — location qualifier: strip `"of "`, trailing word, or leading + word → retry map (handles `"Mayor of Somerville"`, `"Somerville Mayor"`, + `"Mayor Somerville"` → `"Mayor"`) + +### New Files + +``` +lobbying-scraper/ + ocpf_contributions.py — OCPF bulk data download, parse, match to registrants + recipient_normalize.py — office normalization + 5-step recipient dedup pipeline + recipient_merges.json — manual merge overrides (start as {}) +``` + +### Infrastructure Changes + +**`firestore.rules`** — add: + +``` +match /lobbyingContributionRecipients/{id} { allow read: if true; allow write: if false; } +``` + +**`firestore.indexes.json`** — add: + +| Collection | Fields | Use case | +| -------------------------------- | -------------------------------------- | --------------------------- | +| `lobbyingContributionRecipients` | `officeSought ASC, totalReceived DESC` | Office-filtered leaderboard | +| `lobbyingContributionRecipients` | `officeSought ASC, recipientName ASC` | Alphabetical browse | + +### What Is Not Implemented (Phase 2) + +**AI bill topics (`tags.json` equivalent)** — requires a bill embeddings parquet +file. MAPLE does not produce this file. The topics feature is deferred; MAPLE's +existing bill search index could be used as a substitute in a future PR. + +**`recipient_merges.json`** — starts empty. Requires domain review of the +deduplicated output to find cases the automated rules miss. Populate +incrementally. + +### Order of Work + +1. Confirm OCPF bulk contribution file URL and column schema +2. Confirm or incorporate HTML archiving pattern (see open question below) +3. Implement `recipient_normalize.py` (self-contained, testable in isolation) +4. Implement `ocpf_contributions.py` +5. Update `functions/src/lobbying/types.ts` schema +6. Update `writer.py` and `scrape.py` +7. Update `firestore.rules` and `firestore.indexes.json` + +### HTML Archiving for Fast Reparsing + +The current scraper fetches and immediately parses SoS portal HTML, storing +nothing but the parsed Firestore documents. If parsing logic changes, the full +portal must be re-scraped — slow and fragile given the Imperva TLS constraint. +Phase 2 adds GCS archiving of raw HTML as a side effect of every portal fetch. + +#### Design principles + +The archive is **write-only cold storage**, not a cache. It is fully decoupled +from both the incremental cursor (which stays Firestore-only) and the parse +path. Fetching from the portal is always driven by the Firestore cursor; the +archive is a downstream side effect of a successful fetch. + +Parsers must be **pure functions with no I/O** — `parse_summary(soup)`, +`parse_disclosure_detail(soup, year)` — so the identical code runs in the live +scrape and in the offline reparse script without modification. + +#### GCS key scheme + +One object per fetched page: `raw_html/{sha1(url)}.html`. URL-hash is +collision-free without modeling the ASP.NET URL key space. Individual `.html` +objects (not batch tarballs) are appropriate here because our scraper runs +weekly in small increments rather than in large historical sweeps; per-object +GCS is simpler and makes the reparse path a flat list + get. + +Bucket: `gs://-lobbying-archive/raw_html/` +Storage class: Archive (written once, read rarely). + +#### Write timing + +Archived immediately after `raise_for_status()` passes, before parsing, and +**not gated on parse success**: + +```python +def _get(session, url): + r = session.get(url, ...) + r.raise_for_status() + if "Summary.aspx" in url or "CompleteDisclosure" in url: + _archive_page(url, r.text) # side effect; never blocks parse + return BeautifulSoup(r.text, "html.parser") +``` + +Gating on parse success would defeat the purpose: the archive exists precisely +to recover from parser gaps later, so we want the bytes even when the current +parser does not fully understand them. The search/results page is excluded +(same URL across all years, trivially regenerable). + +#### Reparse path + +A dedicated offline script `lobbying-scraper/reparse_archive.py`: + +``` +python3 reparse_archive.py [--limit N] [--dry-run] +``` + +Lists `raw_html/*.html` objects from GCS, downloads them, resolves each +`sha1.html` back to its original URL (stored as object metadata at write time), +and runs the pure parser functions against the archived soup. Tracks completed +objects via a `processed_archive.txt` marker so reparse is resumable. + +#### Cursor interaction + +No change to the Firestore cursor. The archive is never consulted to skip a +portal fetch. "Have I processed this disclosure?" is still answered by the +Firestore cursor; the archive is a consequence of fetching, not an input to the +decision. + +#### New infrastructure required + +- GCS bucket `-lobbying-archive` (Archive class, no public access) +- `google-cloud-storage` added to `lobbying-scraper/requirements.txt` +- `GOOGLE_CLOUD_PROJECT` env var (already available on Cloud Run) used to + derive bucket name +- IAM: Cloud Run service account needs `storage.objects.create` on the bucket +- New file: `lobbying-scraper/reparse_archive.py` + +#### Order of work within Phase 2 + +This is a prerequisite for `ocpf_contributions.py` only in the sense that we +want the archive in place before running the full historical backfill so we do +not have to re-scrape. It is otherwise independent and can be implemented +alongside the contribution pipeline rather than before it. diff --git a/firestore.indexes.json b/firestore.indexes.json index 83cb3fa6d..c267a6868 100644 --- a/firestore.indexes.json +++ b/firestore.indexes.json @@ -788,25 +788,46 @@ "collectionGroup": "ballotQuestions", "queryScope": "COLLECTION", "fields": [ - { "fieldPath": "electionYear", "order": "ASCENDING" }, - { "fieldPath": "ballotStatus", "order": "ASCENDING" } + { + "fieldPath": "electionYear", + "order": "ASCENDING" + }, + { + "fieldPath": "ballotStatus", + "order": "ASCENDING" + } ] }, { "collectionGroup": "publishedTestimony", "queryScope": "COLLECTION_GROUP", "fields": [ - { "fieldPath": "ballotQuestionId", "order": "ASCENDING" }, - { "fieldPath": "publishedAt", "order": "DESCENDING" } + { + "fieldPath": "ballotQuestionId", + "order": "ASCENDING" + }, + { + "fieldPath": "publishedAt", + "order": "DESCENDING" + } ] }, { "collectionGroup": "publishedTestimony", "queryScope": "COLLECTION", "fields": [ - { "fieldPath": "billId", "order": "ASCENDING" }, - { "fieldPath": "court", "order": "ASCENDING" }, - { "fieldPath": "ballotQuestionId", "order": "ASCENDING" } + { + "fieldPath": "billId", + "order": "ASCENDING" + }, + { + "fieldPath": "court", + "order": "ASCENDING" + }, + { + "fieldPath": "ballotQuestionId", + "order": "ASCENDING" + } ] }, { @@ -898,6 +919,62 @@ } } ] + }, + { + "collectionGroup": "lobbyingFilings", + "queryScope": "COLLECTION", + "fields": [ + { + "fieldPath": "generalCourt", + "order": "ASCENDING" + }, + { + "fieldPath": "billId", + "order": "ASCENDING" + } + ] + }, + { + "collectionGroup": "lobbyingFilings", + "queryScope": "COLLECTION", + "fields": [ + { + "fieldPath": "generalCourt", + "order": "ASCENDING" + }, + { + "fieldPath": "chamber", + "order": "ASCENDING" + } + ] + }, + { + "collectionGroup": "lobbyingFilings", + "queryScope": "COLLECTION", + "fields": [ + { + "fieldPath": "generalCourt", + "order": "ASCENDING" + }, + { + "fieldPath": "entityNameNorm", + "order": "ASCENDING" + } + ] + }, + { + "collectionGroup": "lobbyingFilings", + "queryScope": "COLLECTION", + "fields": [ + { + "fieldPath": "generalCourt", + "order": "ASCENDING" + }, + { + "fieldPath": "clientNameNorm", + "order": "ASCENDING" + } + ] } ], "fieldOverrides": [ diff --git a/firestore.rules b/firestore.rules index a95586279..42db67276 100644 --- a/firestore.rules +++ b/firestore.rules @@ -103,6 +103,14 @@ service cloud.firestore { allow read: if true; allow write: if false; } + match /lobbyingRegistrants/{id} { + allow read: if true; + allow write: if false; + } + match /lobbyingFilings/{id} { + allow read: if true; + allow write: if false; + } match /transcriptions/{tid} { // public, read-only allow read: if true diff --git a/functions/src/lobbying/index.ts b/functions/src/lobbying/index.ts new file mode 100644 index 000000000..6d039ae51 --- /dev/null +++ b/functions/src/lobbying/index.ts @@ -0,0 +1,2 @@ +export * from "./types" +export { normalizeEntityName } from "./normalize" diff --git a/functions/src/lobbying/normalize.ts b/functions/src/lobbying/normalize.ts new file mode 100644 index 000000000..a7beb338f --- /dev/null +++ b/functions/src/lobbying/normalize.ts @@ -0,0 +1,72 @@ +/** + * Entity name normalization pipeline. + * + * The SoS portal does not enforce consistent name formatting. The same client or + * registrant may appear as "Acme Corp.", "ACME CORPORATION", "Acme, Inc. d/b/a + * Acme Consulting", etc. across filings and years. + * + * The steps must be applied in the exact order + * listed here; changing the order produces different (incorrect) output. + */ + +// Step 2: strip d/b/a trade-name suffix before any other transforms so the +// trade name doesn't bleed into the canonical form. +const DBA_RE = /\s+D\s*\/+B\s*\/+A?\s+.*|\s+DBA\s+.*/i + +// Step 5: remove legal entity type words with whole-word matching so +// "INCORPORATED" and "CORP" are caught in addition to "LLC"/"INC". +const LEGAL_ENTITY_RE = + /\b(LLC|LLP|INC|INCORPORATED|CORPORATION|CORP|LTD|LIMITED|PC|PLLC)\b/g + +// Step 6: remove "THE" as a whole word anywhere (not just as a leading prefix). +const THE_RE = /\bTHE\b/g + +// Step 9: professional suffix phrases to remove wholesale. +const MISC_PHRASES = [ + "LAW OFFICE OF", + "AND ASSOCIATES", + "& ASSOCIATES", + "AND ASSOC", + "ATTORNEY AT LAW", + "ATTORNEY@LAW", + "ATTORNET AT LAW", // known portal typo + "AND PARTNERS", + "PUBLIC POLICY GROUP", + "LEGISLATIVE SERVICES", + "POLICY GROUP", + "ASSOCIATES", + "COUNSELLORS AT LAW" +] + +export function normalizeEntityName(raw: string | null | undefined): string { + if (!raw) return "" + + let x = raw.toUpperCase() // Step 1: uppercase + + x = x.replace(DBA_RE, "") // Step 2: strip d/b/a suffix + + x = x.replace(/-/g, " ") // Step 3: hyphen → space + + // Step 4: punctuation → space (not empty string, so ",INC" → " INC" → caught + // by step 5's whole-word removal). + for (const ch of [",", ".", "'", "‘", "’", "(", ")"]) { + x = x.split(ch).join(" ") + } + + x = x.replace(LEGAL_ENTITY_RE, " ") // Step 5: remove legal entity type words + + x = x.replace(THE_RE, " ") // Step 6: remove THE anywhere + + x = x.replace(/&/g, "AND") // Step 7: ampersand → AND + + x = x.replace("ASSICIATES", "ASSOCIATES") // Step 8: fix known portal typo + + // Step 9: remove professional suffix phrases + for (const phrase of MISC_PHRASES) { + x = x.split(phrase).join(" ") + } + + x = x.replace(/\s+/g, " ").trim() // Step 10: collapse whitespace + + return x +} diff --git a/functions/src/lobbying/types.ts b/functions/src/lobbying/types.ts new file mode 100644 index 000000000..83eaab761 --- /dev/null +++ b/functions/src/lobbying/types.ts @@ -0,0 +1,101 @@ +import { + Array, + InstanceOf, + Literal, + Number, + Null, + Record, + Static, + String, + Union +} from "runtypes" +import { Timestamp } from "../firebase" + +export type LobbyingChamber = Static +export const LobbyingChamber = Union( + Literal("House Bill"), + Literal("Senate Bill"), + Literal("House Docket"), + Literal("Senate Docket"), + Literal("Executive"), + Literal("Other") +) + +export type LobbyingClient = Static +export const LobbyingClient = Record({ + clientName: String, + clientNameNorm: String, + compensation: Null.Or(Number) +}) + +export type LobbyingRegistrant = Static +export const LobbyingRegistrant = Record({ + registrantId: String, + entityName: String, + entityNameNorm: String, + year: Number, + generalCourt: Number, + regType: Union(Literal("Lobbyist"), Literal("Employer")), + clients: Array(LobbyingClient), + disclosureUrls: Array(String), + fetchedAt: InstanceOf(Timestamp) +}) + +export type LobbyingFiling = Static +export const LobbyingFiling = Record({ + filingId: String, + entityName: String, + entityNameNorm: String, + clientName: String, + clientNameNorm: String, + year: Number, + generalCourt: Number, + chamber: LobbyingChamber, + // Non-null only for legislative chambers (House Bill, Senate Bill, House Docket, + // Senate Docket). For Executive and Other, no bill join should be attempted. + billId: Null.Or(String), + activityTitle: String, + position: String, + amount: Null.Or(Number), + fetchedAt: InstanceOf(Timestamp) +}) + +/** Firestore path for lobbying registrant documents */ +export const REGISTRANTS_COLLECTION = "lobbyingRegistrants" + +/** Firestore path for lobbying filing documents */ +export const FILINGS_COLLECTION = "lobbyingFilings" + +/** Firestore path for the live scraper cursor document */ +export const SCRAPER_DOC = "/scrapers/lobbying" + +/** Firestore path for the backfill cursor subcollection */ +export const BACKFILL_DOC = "/scrapers/lobbyingBackfill" +export const BACKFILL_URLS_COLLECTION = "processedUrls" + +/** Earliest year with portal data */ +export const FIRST_LOBBYING_YEAR = 2005 + +/** + * Sentinel clientName used for pre-2013 legacy filings where compensation is + * reported as a single total rather than broken down per client. + */ +export const LEGACY_TOTAL_CLIENT = "_total_salary_" + +/** + * Chamber prefix map for constructing billId values that match MAPLE's Bill.id. + * Typed as a plain index signature so portal.ts can look up any LobbyingChamber + * without triggering "Property X does not exist" on the Partial. + */ +export const CHAMBER_PREFIXES: { [chamber: string]: string | undefined } = { + "House Bill": "H", + "Senate Bill": "S", + "House Docket": "HD", + "Senate Docket": "SD" +} + +/** Canonical chamber values for legacy short-form codes found in older filings */ +export const LEGACY_CHAMBER_MAP: { [raw: string]: LobbyingChamber } = { + HB: "House Bill", + SB: "Senate Bill" +} diff --git a/lobbying-scraper/.dockerignore b/lobbying-scraper/.dockerignore new file mode 100644 index 000000000..9460c99c4 --- /dev/null +++ b/lobbying-scraper/.dockerignore @@ -0,0 +1,4 @@ +__pycache__/ +*.pyc +*.pyo +.env diff --git a/lobbying-scraper/Dockerfile b/lobbying-scraper/Dockerfile new file mode 100644 index 000000000..4b2da65b5 --- /dev/null +++ b/lobbying-scraper/Dockerfile @@ -0,0 +1,16 @@ +FROM python:3.12-slim + +WORKDIR /app + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +COPY normalize.py portal.py writer.py scrape.py ./ + +# Cloud Run sets PORT; we don't use it (this is a job, not a server). +# Cloud Scheduler invokes the container via HTTP POST to /; handle it minimally. +ENV PYTHONUNBUFFERED=1 + +# ENTRYPOINT is the fixed executable; CMD provides default args that --args overrides. +ENTRYPOINT ["python3", "scrape.py"] +CMD ["--mode", "weekly"] diff --git a/lobbying-scraper/__pycache__/normalize.cpython-37.pyc b/lobbying-scraper/__pycache__/normalize.cpython-37.pyc new file mode 100644 index 000000000..94efb861e Binary files /dev/null and b/lobbying-scraper/__pycache__/normalize.cpython-37.pyc differ diff --git a/lobbying-scraper/__pycache__/portal.cpython-37.pyc b/lobbying-scraper/__pycache__/portal.cpython-37.pyc new file mode 100644 index 000000000..d515175f3 Binary files /dev/null and b/lobbying-scraper/__pycache__/portal.cpython-37.pyc differ diff --git a/lobbying-scraper/archive.py b/lobbying-scraper/archive.py new file mode 100644 index 000000000..ede5d6656 --- /dev/null +++ b/lobbying-scraper/archive.py @@ -0,0 +1,60 @@ +"""GCS raw-HTML archive for the MA SoS lobbying scraper. + +Write-only cold storage: every fetched Summary/CompleteDisclosure page is +saved as gs://{bucket}/raw_html/{sha1(url)}.html with the original URL stored +as object metadata. This enables offline reparsing when parser logic changes +without re-scraping the portal (which is rate-limited and Imperva-protected). + +Enabled by setting ARCHIVE_RAW=1 in the environment. When disabled (default), +save_page() is a no-op so local runs and tests work without GCS credentials. + +The bucket is named {GOOGLE_CLOUD_PROJECT}-lobbying-archive and should be +created with the Archive storage class (written once, read rarely). +""" + +from __future__ import annotations + +import hashlib +import os + +_ENABLED = os.environ.get("ARCHIVE_RAW", "").lower() in ("1", "true", "yes") +_bucket_name: str | None = None +_client = None # google.cloud.storage.Client, lazily initialized + + +def _gcs(): + global _client + if _client is None: + from google.cloud import storage # noqa: PLC0415 + _client = storage.Client() + return _client + + +def _get_bucket_name() -> str: + global _bucket_name + if _bucket_name is None: + project = os.environ.get("GOOGLE_CLOUD_PROJECT") or _gcs().project + _bucket_name = f"{project}-lobbying-archive" + return _bucket_name + + +def blob_name(url: str) -> str: + """Return the GCS object name for a given URL.""" + return "raw_html/" + hashlib.sha1(url.encode()).hexdigest() + ".html" + + +def save_page(url: str, html: str) -> None: + """Write raw HTML to GCS. No-op when ARCHIVE_RAW is not set.""" + if not _ENABLED: + return + try: + bucket = _gcs().bucket(_get_bucket_name()) + blob = bucket.blob(blob_name(url)) + blob.metadata = {"source-url": url} + blob.upload_from_string( + html.encode("utf-8"), + content_type="text/html; charset=utf-8", + ) + except Exception as exc: + # Archive failures must never interrupt the live scrape path. + print(f" [archive] WARNING: failed to save {url[:80]!r}: {exc}") diff --git a/lobbying-scraper/normalize.py b/lobbying-scraper/normalize.py new file mode 100644 index 000000000..6e6f7418e --- /dev/null +++ b/lobbying-scraper/normalize.py @@ -0,0 +1,50 @@ +"""Entity name normalization pipeline. + +Direct port of functions/src/lobbying/normalize.ts. Steps must be applied in +this exact order — changing the order produces different (incorrect) output. +""" + +from __future__ import annotations + +import re + +_DBA_RE = re.compile(r"\s+D\s*/+B\s*/+A?\s+.*|\s+DBA\s+.*", re.IGNORECASE) +_LEGAL_RE = re.compile( + r"\b(LLC|LLP|INC|INCORPORATED|CORPORATION|CORP|LTD|LIMITED|PC|PLLC)\b" +) +_THE_RE = re.compile(r"\bTHE\b") +_WS_RE = re.compile(r"\s+") + +_MISC_PHRASES = [ + "LAW OFFICE OF", + "AND ASSOCIATES", + "& ASSOCIATES", + "AND ASSOC", + "ATTORNEY AT LAW", + "ATTORNEY@LAW", + "ATTORNET AT LAW", # known portal typo + "AND PARTNERS", + "PUBLIC POLICY GROUP", + "LEGISLATIVE SERVICES", + "POLICY GROUP", + "ASSOCIATES", + "COUNSELLORS AT LAW", +] + + +def normalize_entity_name(raw: str | None) -> str: + if not raw: + return "" + x = raw.upper() # 1. uppercase + x = _DBA_RE.sub("", x) # 2. strip d/b/a suffix + x = x.replace("-", " ") # 3. hyphen → space + for ch in (",", ".", "'", "‘", "’", "(", ")"): + x = x.replace(ch, " ") # 4. punctuation → space + x = _LEGAL_RE.sub(" ", x) # 5. remove legal entity words + x = _THE_RE.sub(" ", x) # 6. remove THE anywhere + x = x.replace("&", "AND") # 7. ampersand → AND + x = x.replace("ASSICIATES", "ASSOCIATES") # 8. fix known typo + for phrase in _MISC_PHRASES: # 9. remove professional suffix phrases + x = x.replace(phrase, " ") + x = _WS_RE.sub(" ", x).strip() # 10. collapse whitespace + return x diff --git a/lobbying-scraper/portal.py b/lobbying-scraper/portal.py new file mode 100644 index 000000000..3a5528898 --- /dev/null +++ b/lobbying-scraper/portal.py @@ -0,0 +1,512 @@ +"""HTTP client and HTML parser for the MA SoS lobbying portal. + +Portal: https://www.sec.state.ma.us/LobbyistPublicSearch/ + +Page flow: + 1. Search POST -> summary links table + 2. Summary.aspx -> registrant name/year/type + CompleteDisclosure links + 3. CompleteDisclosure.aspx -> per-client compensation + per-client bill activity + +Four HTML format eras for CompleteDisclosure pages (detected by table IDs): + Modern (2019+): grdvClientPaidToEntity + grdvActivitiesNew{year}_{n} + Hybrid (2014-2018): no comp table; Panel1_{n} divs + grdvActivitiesNew_{n} + Legacy (2009-2013): grdvActivities with Compensation received column + Legacy (2005-2008): grdvSalaryPaid (entity total) + grdvActivities (no comp col) + +parse_summary() and parse_disclosure_detail() are pure functions (no I/O) so +they can be called from both the live scraper and the offline reparse driver. +fetch_* are thin wrappers that handle HTTP and raw-HTML archiving. +""" + +from __future__ import annotations + +import hashlib +import re +import time +from dataclasses import dataclass, field +from typing import Optional + +import requests +from bs4 import BeautifulSoup, Tag + +import archive + +# ── Constants ───────────────────────────────────────────────────────────────── + +BASE_URL = "https://www.sec.state.ma.us/LobbyistPublicSearch/" +SEARCH_URL = BASE_URL + "Default.aspx" + +_UA = ( + "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) " + "AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148" +) +_REQUEST_DELAY = 1.0 +_MAX_RETRIES = 6 +_RETRY_STATUS = {429, 500, 502, 503, 504} + +# Lobby disclosure data begins in 2005; GC 183 started Jan 2003. +FIRST_YEAR = 2005 +FIRST_GC = 183 +FIRST_GC_START_YEAR = 2003 + +# clientName sentinel for pre-2009 filings where compensation is a single entity total +LEGACY_TOTAL_CLIENT = "_total_salary_" + +# Maps canonical chamber names to the bill-ID prefix used in MAPLE's Bill.id +CHAMBER_PREFIXES: dict[str, str] = { + "House Bill": "H", + "Senate Bill": "S", + "House Docket": "HD", + "Senate Docket": "SD", +} + +# Legacy short-form chamber codes found in older filings +LEGACY_CHAMBER_MAP: dict[str, str] = { + "HB": "House Bill", + "SB": "Senate Bill", +} + +# ── Data types ──────────────────────────────────────────────────────────────── + + +@dataclass +class Compensation: + client_name: str + amount: Optional[float] + + +@dataclass +class BillActivity: + client_name: str + chamber: str # canonical LobbyingChamber value + raw_bill_number: str + bill_id: Optional[str] # e.g. "H1234"; null for Executive/Other + activity_title: str + position: str + amount: Optional[float] + + +@dataclass +class DisclosureMeta: + entity_name: str + year: Optional[int] + reg_type: str # "Lobbyist" | "Employer" + disclosure_urls: list[str] = field(default_factory=list) + + +@dataclass +class DisclosureDetail: + compensation: list[Compensation] = field(default_factory=list) + bills: list[BillActivity] = field(default_factory=list) + + +# ── Derived-value helpers ───────────────────────────────────────────────────── + + +def year_to_general_court(year: int) -> int: + return FIRST_GC + (year - FIRST_GC_START_YEAR) // 2 + + +def normalize_chamber(raw: str) -> str: + t = raw.strip() + if t in LEGACY_CHAMBER_MAP: + return LEGACY_CHAMBER_MAP[t] + known = {"House Bill", "Senate Bill", "House Docket", "Senate Docket", "Executive"} + return t if t in known else "Other" + + +def construct_bill_id(chamber: str, raw_bill_number: str) -> Optional[str]: + """Construct the MAPLE-compatible billId from chamber + raw integer. + + Returns None for Executive and Other chambers where no bill join is possible. + H1234 and S1234 are distinct bills even though they share the same integer; + the prefix is required to disambiguate. + """ + prefix = CHAMBER_PREFIXES.get(chamber) + if not prefix: + return None + try: + return f"{prefix}{int(raw_bill_number)}" + except (ValueError, TypeError): + return None + + +def registrant_id(entity_name: str, year: int) -> str: + key = f"{year}|{entity_name}" + return hashlib.sha256(key.encode()).hexdigest()[:40] + + +def filing_id( + entity_name: str, + client_name: str, + chamber: str, + bill_id: Optional[str], + general_court: int, + position: str, +) -> str: + key = "|".join([ + entity_name, client_name, chamber, + bill_id or "__null__", str(general_court), position, + ]) + return hashlib.sha256(key.encode()).hexdigest()[:40] + + +# ── HTTP session ────────────────────────────────────────────────────────────── + + +def make_session() -> requests.Session: + s = requests.Session() + s.headers.update({ + "User-Agent": _UA, + "Accept": "*/*", + "Accept-Encoding": "gzip, deflate, br", + "Connection": "keep-alive", + }) + return s + + +def _get(session: requests.Session, url: str) -> BeautifulSoup: + for attempt in range(_MAX_RETRIES): + time.sleep(_REQUEST_DELAY * (2 ** attempt) if attempt else _REQUEST_DELAY) + try: + r = session.get(url, timeout=60) + except (requests.exceptions.Timeout, requests.exceptions.ConnectionError) as e: + if attempt == _MAX_RETRIES - 1: + raise + print(f" GET network error (attempt {attempt + 1}): {e}") + continue + if r.status_code in _RETRY_STATUS: + print(f" GET HTTP {r.status_code} (attempt {attempt + 1}) — retrying") + if attempt == _MAX_RETRIES - 1: + r.raise_for_status() + continue + r.raise_for_status() + # Archive content pages before parsing; excludes the search page which + # shares one URL across all years and is trivially regenerable. + if "Summary.aspx" in url or "CompleteDisclosure" in url: + archive.save_page(url, r.text) + return BeautifulSoup(r.text, "html.parser") + raise RuntimeError("_get: exhausted retries") # unreachable + + +def _post(session: requests.Session, url: str, data: dict) -> BeautifulSoup: + for attempt in range(_MAX_RETRIES): + time.sleep(_REQUEST_DELAY * (2 ** attempt) if attempt else _REQUEST_DELAY) + try: + r = session.post(url, data=data, timeout=180) + except (requests.exceptions.Timeout, requests.exceptions.ConnectionError) as e: + if attempt == _MAX_RETRIES - 1: + raise + print(f" POST network error (attempt {attempt + 1}): {e}") + continue + if r.status_code in _RETRY_STATUS: + print(f" POST HTTP {r.status_code} (attempt {attempt + 1}) — retrying") + if attempt == _MAX_RETRIES - 1: + r.raise_for_status() + continue + r.raise_for_status() + return BeautifulSoup(r.text, "html.parser") + raise RuntimeError("_post: exhausted retries") # unreachable + + +# ── Portal scraping ─────────────────────────────────────────────────────────── + + +def _viewstate(soup: BeautifulSoup) -> dict: + return { + inp["name"]: inp.get("value", "") + for inp in soup.find_all("input", type="hidden") + if inp.get("name") + } + + +def fetch_summary_links(session: requests.Session, year: int) -> list[str]: + """Return all Summary.aspx URLs for a given year via a single search POST.""" + soup = _get(session, SEARCH_URL) + data = { + **_viewstate(soup), + "__EVENTTARGET": "", + "__EVENTARGUMENT": "", + "ctl00$ContentPlaceHolder1$Search": "rdbSearchByType", + "ctl00$ContentPlaceHolder1$ucSearchCriteriaByType$ddlYear": str(year), + "ctl00$ContentPlaceHolder1$ucSearchCriteriaByType$txtN_ame": "", + "ctl00$ContentPlaceHolder1$ucSearchCriteriaByType$lddSearchType$DropDown": "3", + "ctl00$ContentPlaceHolder1$ucSearchCriteriaByType$drpType": "L", + "ctl00$ContentPlaceHolder1$drpPageSize": "20000", + "ctl00$ContentPlaceHolder1$btnSearch": "Search", + } + results = _post(session, SEARCH_URL, data) + table = results.find( + "table", + id=lambda x: x and "grdvSearchResultByTypeAndCategory" in x, + ) + if not table: + return [] + return [ + BASE_URL + a["href"] if not a["href"].startswith("http") else a["href"] + for a in table.find_all("a", href=True) + if "Summary.aspx" in a["href"] + ] + + +def parse_summary(soup: BeautifulSoup) -> DisclosureMeta: + """Parse a Summary.aspx page. Pure function — no I/O.""" + def text(el_id: str) -> str: + el = soup.find(id=el_id) + return el.get_text(strip=True) if el else "" + + entity_name = text("ContentPlaceHolder1_lblRegistrantName") + year_text = text("ContentPlaceHolder1_lblYear") + reg_type_raw = text("ContentPlaceHolder1_lblRegType") + + try: + year = int(year_text) + except ValueError: + year = None + + reg_type = "Employer" if "Entity" in reg_type_raw else "Lobbyist" + + disc_urls = [ + BASE_URL + a["href"] if not a["href"].startswith("http") else a["href"] + for a in soup.find_all("a", href=True) + if "CompleteDisclosure" in a["href"] + ] + + return DisclosureMeta( + entity_name=entity_name, + year=year, + reg_type=reg_type, + disclosure_urls=disc_urls, + ) + + +def fetch_disclosure_meta(session: requests.Session, summary_url: str) -> DisclosureMeta: + return parse_summary(_get(session, summary_url)) + + +def _parse_amount(text: str) -> Optional[float]: + cleaned = text.replace("$", "").replace(",", "").strip() + try: + return float(cleaned) + except ValueError: + return None + + +def _grid_rows(table: Tag) -> list: + return table.find_all("tr", class_=lambda c: c and "Grid" in c and "Header" not in c) + + +def parse_disclosure_detail(soup: BeautifulSoup, year: int) -> DisclosureDetail: + """Parse a CompleteDisclosure page. Pure function — no I/O. + + Four HTML format eras (detected by table IDs): + + Modern (2019+): grdvClientPaidToEntity holds per-client compensation; + bills in grdvActivitiesNew{year}_{n} (one table per client). + + Hybrid (2014-2018): no grdvClientPaidToEntity. Bills in grdvActivitiesNew_{n} + (no year suffix). Per-client compensation is in id-less Panel1_{n} divs + ("Total amount paid by client...: $X") indexed by lblClientName_{n} spans. + Each client reports either a Panel1 total OR activity-level amounts — + summing both sources is safe because the unused one is always zero. + Omitting this path silently drops ~99% of 2014-2018 compensation. + + Legacy (2009-2013): single grdvActivities table with a "Compensation received" + column carrying a per-client total repeated on every bill row for that client. + Must deduplicate distinct (client, amount) pairs before summing — never sum + raw rows or the total is multiplied by bill count. + + Legacy (2005-2008): grdvActivities has no compensation column; fall back to + grdvSalaryPaid entity total under the _total_salary_ placeholder client. + """ + compensation: list[Compensation] = [] + bills: list[BillActivity] = [] + gc = year_to_general_court(year) + + # ── Modern / Hybrid: per-client activity tables ─────────────────────────── + comp_table = soup.find( + "table", + id=lambda x: x and "grdvClientPaidToEntity" in (x or ""), + ) + if comp_table: + for row in _grid_rows(comp_table): + cells = [td.get_text(strip=True) for td in row.find_all("td")] + if len(cells) >= 2: + compensation.append(Compensation( + client_name=cells[0], + amount=_parse_amount(cells[1]), + )) + + # Activity tables: ID pattern grdvActivitiesNew{year}_{n} (Modern, 2019+) + # or grdvActivitiesNew_{n} with no year (Hybrid, 2014-2018). The same loop + # handles both; compensation source differs (see Hybrid block below). + activity_by_client: dict[str, float] = {} + for act_table in soup.find_all( + "table", + id=lambda x: x and re.search(r"grdvActivitiesNew(\d{4})?_\d+", x or ""), + ): + client_span = act_table.find_previous( + "span", + id=lambda x: x and "lblClientName" in (x or ""), + ) + client_name = client_span.get_text(strip=True) if client_span else "" + + for row in _grid_rows(act_table): + cells = [td.get_text(strip=True) for td in row.find_all("td")] + if len(cells) < 4: + continue + chamber = normalize_chamber(cells[0]) + raw_num = cells[1] + bill_id = construct_bill_id(chamber, raw_num) + amt = _parse_amount(cells[4]) if len(cells) > 4 else None + bills.append(BillActivity( + client_name=client_name, + chamber=chamber, + raw_bill_number=raw_num, + bill_id=bill_id, + activity_title=cells[2] if len(cells) > 2 else "", + position=cells[3] if len(cells) > 3 else "", + amount=amt, + )) + if amt: + activity_by_client[client_name] = ( + activity_by_client.get(client_name, 0.0) + amt + ) + + # Hybrid (2014-2018): no modern comp table — reconstruct per-client amounts + # from Panel1 "Total amount paid by client" divs indexed by client-name spans. + if not comp_table and bills: + client_by_idx = { + sp["id"].split("_")[-1]: sp.get_text(strip=True) + for sp in soup.find_all( + "span", id=lambda x: x and "lblClientName_" in (x or "") + ) + } + panel_by_client: dict[str, float] = {} + for div in soup.find_all("div", id=lambda x: x and "Panel1_" in (x or "")): + idx = div["id"].split("_")[-1] + cn = client_by_idx.get(idx) + if not cn: + continue + m = re.search(r"\$([\d,]+\.\d\d)", div.get_text(" ", strip=True)) + panel_by_client[cn] = float(m.group(1).replace(",", "")) if m else 0.0 + for cn in set(panel_by_client) | set(activity_by_client): + amt = panel_by_client.get(cn, 0.0) + activity_by_client.get(cn, 0.0) + if amt: + compensation.append(Compensation(client_name=cn, amount=amt)) + + if comp_table or bills: + return DisclosureDetail(compensation=compensation, bills=bills) + + # ── Legacy format (2005-2013): single grdvActivities table ─────────────── + act_table = soup.find("table", id=lambda x: x and x.endswith("grdvActivities")) + # Distinct (client, amount) pairs from the "Compensation received" column + # (2009-2013). The portal repeats the per-client total on every bill row for + # that client, so we deduplicate before summing to avoid multiplying by bill count. + legacy_comp_pairs: set = set() + comp_col: Optional[int] = None + + if act_table: + all_rows = act_table.find_all("tr") + headers = [ + th.get_text(strip=True) + for th in (all_rows[0].find_all(["th", "td"]) if all_rows else []) + ] + + if headers and "Activity" in headers[0]: + # 6-col entity layout has Lobbyist as second header + if len(headers) >= 2 and "Lobbyist" in headers[1]: + bill_col, pos_col, client_col = 0, 2, 4 + else: + bill_col, pos_col, client_col = 0, 1, 3 + else: + bill_col, pos_col, client_col = 1, None, 3 + + if any("Compensation" in h for h in headers): + comp_col = len(headers) - 1 + + chamber_map = { + "H": "House Bill", "S": "Senate Bill", + "HD": "House Docket", "SD": "Senate Docket", + } + skip = {"Activity or Bill No and Title", "N/A", "None", "", "Total amount"} + + for row in all_rows[1:]: + cells = [td.get_text(strip=True) for td in row.find_all("td")] + if len(cells) <= max(bill_col, client_col): + continue + client_name = cells[client_col] + bill_cell = cells[bill_col] + amt = ( + _parse_amount(cells[comp_col]) + if comp_col is not None and len(cells) > comp_col + else None + ) + # Exclude "Total amount" summary rows appended by legacy individual + # disclosures — these are not real clients. + if amt is not None and client_name not in ("Total amount", "Total", ""): + legacy_comp_pairs.add((client_name, amt)) + if not bill_cell or bill_cell in skip: + continue + # Bill token may use a semicolon separator ("H73; Title") or a space + parts = re.split(r"[;\s]", bill_cell, maxsplit=1) + bill_no = parts[0].rstrip(";") + m = re.match(r"^([A-Z]+)(\d+)$", bill_no) + if not m: + continue + prefix, number = m.group(1), m.group(2) + chamber = chamber_map.get(prefix, "Other") + bill_id = construct_bill_id(chamber, number) + bills.append(BillActivity( + client_name=client_name, + chamber=chamber, + raw_bill_number=number, + bill_id=bill_id, + activity_title=parts[1].strip() if len(parts) > 1 else "", + position=( + cells[pos_col] + if pos_col is not None and len(cells) > pos_col + else "" + ), + amount=amt, + )) + + # Per-client compensation from the "Compensation received" column (2009-2013). + # Fall back to grdvSalaryPaid entity total when no compensation column exists (2005-2008). + if comp_col is not None: + per_client: dict[str, float] = {} + for cn, amt in legacy_comp_pairs: + per_client[cn] = per_client.get(cn, 0.0) + amt + for cn, amt in per_client.items(): + if amt: + compensation.append(Compensation(client_name=cn, amount=amt)) + else: + salary_table = soup.find( + "table", id=lambda x: x and "grdvSalaryPaid" in (x or "") + ) + if salary_table: + total = 0.0 + for row in salary_table.find_all("tr"): + cells = [td.get_text(strip=True) for td in row.find_all("td")] + if len(cells) >= 2 and "Total" not in cells[0]: + amt = _parse_amount(cells[1]) + if amt: + total += amt + if total: + compensation.append( + Compensation(client_name=LEGACY_TOTAL_CLIENT, amount=total) + ) + + return DisclosureDetail(compensation=compensation, bills=bills) + + +def fetch_disclosure_detail( + session: requests.Session, disc_url: str, year: int +) -> DisclosureDetail: + return parse_disclosure_detail(_get(session, disc_url), year) + + +def year_from_disc_url(url: str) -> Optional[int]: + """Extract the filing year from a CompleteDisclosure URL query string.""" + m = re.search(r"[?&]FilingYear=(\d{4})", url) + return int(m.group(1)) if m else None diff --git a/lobbying-scraper/reparse_archive.py b/lobbying-scraper/reparse_archive.py new file mode 100644 index 000000000..266ff1ad6 --- /dev/null +++ b/lobbying-scraper/reparse_archive.py @@ -0,0 +1,155 @@ +"""Offline reparse driver: re-ingests raw HTML from the GCS archive. + +Downloads archived CompleteDisclosure pages from GCS, re-runs the pure parsers +against them, and writes results back to Firestore. Use this when parser logic +has changed and historical data needs to be re-ingested without re-scraping. + +For each archived disclosure page the driver looks up the corresponding +registrant document in Firestore (via the disclosureUrls array) to obtain the +entity name needed to construct filing document IDs. Registrant documents must +therefore already exist before running a reparse. + +Usage: + GOOGLE_APPLICATION_CREDENTIALS=~/.config/gcloud/application_default_credentials.json \\ + python3 reparse_archive.py [--limit N] [--dry-run] + +Progress is tracked in Firestore at /scrapers/lobbyingReparse so the run is +resumable: restarting skips blobs already marked as processed. +""" + +from __future__ import annotations + +import argparse +import os + +from bs4 import BeautifulSoup +from google.cloud import firestore, storage + +import archive +from portal import DisclosureMeta, parse_disclosure_detail, year_from_disc_url +from writer import REGISTRANTS_COLLECTION, write_filings + +REPARSE_DOC = "scrapers/lobbyingReparse" + + +def _meta_for_disc_url(db: firestore.Client, disc_url: str) -> DisclosureMeta | None: + """Look up the registrant that owns this disclosure URL.""" + results = ( + db.collection(REGISTRANTS_COLLECTION) + .where("disclosureUrls", "array_contains", disc_url) + .limit(1) + .get() + ) + if not results: + return None + data = results[0].to_dict() + return DisclosureMeta( + entity_name=data.get("entityName", ""), + year=data.get("year"), + reg_type=data.get("regType", "Lobbyist"), + disclosure_urls=data.get("disclosureUrls", []), + ) + + +def _is_processed(db: firestore.Client, blob_name: str) -> bool: + doc = db.document(REPARSE_DOC).get() + if not doc.exists: + return False + return blob_name in doc.to_dict().get("processedBlobs", []) + + +def _mark_processed(db: firestore.Client, blob_name: str) -> None: + db.document(REPARSE_DOC).set( + {"processedBlobs": firestore.ArrayUnion([blob_name])}, + merge=True, + ) + + +def run(limit: int | None, dry_run: bool) -> None: + gcs = storage.Client() + bucket_name = archive._get_bucket_name() + bucket = gcs.bucket(bucket_name) + + db: firestore.Client | None = None if dry_run else firestore.Client() + + blobs = list(bucket.list_blobs(prefix="raw_html/")) + print(f"Found {len(blobs)} archived pages") + + processed = 0 + skipped = 0 + errors = 0 + + for blob in blobs: + if limit is not None and processed >= limit: + break + + blob.reload() + url = (blob.metadata or {}).get("source-url", "") + + if "CompleteDisclosure" not in url: + skipped += 1 + continue + + if db is not None and _is_processed(db, blob.name): + skipped += 1 + continue + + year = year_from_disc_url(url) + if year is None: + print(f" SKIP {blob.name}: cannot extract year from {url!r}") + skipped += 1 + continue + + meta: DisclosureMeta | None = None + if db is not None: + meta = _meta_for_disc_url(db, url) + if meta is None: + print(f" SKIP {blob.name}: no registrant found for {url!r}") + skipped += 1 + continue + + try: + html = blob.download_as_text(encoding="utf-8") + soup = BeautifulSoup(html, "html.parser") + detail = parse_disclosure_detail(soup, year) + except Exception as exc: + print(f" ERROR parsing {url}: {exc}") + errors += 1 + continue + + print( + f" {url[:80]!r} — {len(detail.compensation)} comp," + f" {len(detail.bills)} bills" + ) + + if not dry_run and db is not None and meta is not None: + write_filings(db, meta, detail) + _mark_processed(db, blob.name) + + processed += 1 + + print( + f"\nDone: {processed} reparsed, {skipped} skipped, {errors} errors" + + (" (dry run — nothing written)" if dry_run else "") + ) + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Reparse raw HTML archive into Firestore" + ) + parser.add_argument("--limit", type=int, default=None, help="Stop after N pages") + parser.add_argument( + "--dry-run", action="store_true", help="Parse but do not write to Firestore" + ) + args = parser.parse_args() + + # Ensure archive module can resolve the bucket name even in dry-run mode + if not os.environ.get("ARCHIVE_RAW"): + os.environ["ARCHIVE_RAW"] = "1" + + run(limit=args.limit, dry_run=args.dry_run) + + +if __name__ == "__main__": + main() diff --git a/lobbying-scraper/requirements.txt b/lobbying-scraper/requirements.txt new file mode 100644 index 000000000..d92c075e4 --- /dev/null +++ b/lobbying-scraper/requirements.txt @@ -0,0 +1,4 @@ +requests>=2.28 +beautifulsoup4>=4.12 +google-cloud-firestore>=2.14 +google-cloud-storage>=2.10 diff --git a/lobbying-scraper/scrape.py b/lobbying-scraper/scrape.py new file mode 100644 index 000000000..657c59603 --- /dev/null +++ b/lobbying-scraper/scrape.py @@ -0,0 +1,271 @@ +"""Lobbying disclosure scraper — Cloud Run entry point. + +Runs on a weekly Cloud Scheduler trigger. Checks for new or amended disclosures +and exits immediately if none are found (fast path). When new disclosures exist, +fetches and writes them to Firestore. + +Also serves as the library used by the TypeScript backfill admin script via +subprocess. + +Environment variables: + GOOGLE_CLOUD_PROJECT — GCP project ID (set automatically in Cloud Run) + FIRESTORE_EMULATOR_HOST — set to use the local emulator (e.g. localhost:8080) + +CLI flags (for local / backfill use): + --year YEAR Only process this year (default: current + prior) + --limit N Max registrants per year (for testing) + --dry-run Fetch and parse but do not write to Firestore +""" + +from __future__ import annotations + +import argparse +import hashlib +import json +import os +import sys +from datetime import datetime, timezone + +from google.cloud import firestore + +from portal import ( + FIRST_YEAR, + fetch_disclosure_detail, + fetch_disclosure_meta, + fetch_summary_links, + make_session, +) +from writer import ( + BACKFILL_DOC, + BACKFILL_URLS_COLLECTION, + SCRAPER_DOC, + write_filings, + write_registrant, +) + + +# ── Cursor helpers ──────────────────────────────────────────────────────────── + + +def _load_live_cursor(db: firestore.Client) -> tuple[set[str], dict[str, list[str]]]: + """Return (processedDiscUrls, summaryDiscCache) from the live scraper doc.""" + doc = db.document(SCRAPER_DOC).get() + data = doc.to_dict() or {} + return ( + set(data.get("processedDiscUrls", [])), + data.get("summaryDiscCache", {}), + ) + + +def _save_live_cursor( + db: firestore.Client, + processed: set[str], + cache: dict[str, list[str]], +) -> None: + db.document(SCRAPER_DOC).set( + {"processedDiscUrls": list(processed), "summaryDiscCache": cache}, + merge=True, + ) + + +def _is_backfill_processed(db: firestore.Client, disc_url: str) -> bool: + h = hashlib.sha256(disc_url.encode()).hexdigest()[:40] + return db.document(BACKFILL_DOC).collection(BACKFILL_URLS_COLLECTION).document(h).get().exists + + +def _mark_backfill_processed(db: firestore.Client, disc_url: str) -> None: + h = hashlib.sha256(disc_url.encode()).hexdigest()[:40] + db.document(BACKFILL_DOC).collection(BACKFILL_URLS_COLLECTION).document(h).set( + {"url": disc_url, "processedAt": datetime.now(tz=timezone.utc).isoformat()} + ) + + +# ── Core processing ─────────────────────────────────────────────────────────── + + +def process_disclosure( + db: firestore.Client | None, + session, + summary_url: str, + disc_url: str, + year: int, + dry_run: bool = False, +) -> tuple[int, int]: + """Fetch one disclosure page and write registrant + filing documents. + + Returns (compensation_rows, filing_rows). + """ + meta = fetch_disclosure_meta(session, summary_url) + detail = fetch_disclosure_detail(session, disc_url, year) + + if dry_run or db is None: + return len(detail.compensation), len(detail.bills) + + write_registrant(db, meta, detail, disc_url) + n_filings = write_filings(db, meta, detail) + return len(detail.compensation), n_filings + + +# ── Weekly incremental run ──────────────────────────────────────────────────── + + +def run_weekly( + db: "firestore.Client | None", + years: list[int], + limit: int | None = None, + dry_run: bool = False, +) -> int: + """Incremental weekly check. Returns number of new disclosures processed.""" + current_year = datetime.now(tz=timezone.utc).year + processed, cache = _load_live_cursor(db) if db is not None else (set(), {}) + + session = make_session() + new_count = 0 + + for year in years: + print(f"\n── {year} ──") + try: + summary_urls = fetch_summary_links(session, year) + except Exception as e: + print(f" failed to fetch summary links: {e}", file=sys.stderr) + continue + + if limit: + summary_urls = summary_urls[:limit] + + print(f" {len(summary_urls)} registrants on portal") + + for summary_url in summary_urls: + # Use cached disc URLs for prior years; always re-check current year + disc_urls = cache.get(summary_url) + if disc_urls is None or year == current_year: + try: + meta = fetch_disclosure_meta(session, summary_url) + disc_urls = meta.disclosure_urls + cache[summary_url] = disc_urls + if not dry_run: + _save_live_cursor(db, processed, cache) + except Exception as e: + print(f" failed to fetch summary {summary_url}: {e}", file=sys.stderr) + continue + + new_disc_urls = [u for u in disc_urls if u not in processed] + if not new_disc_urls: + continue + + for disc_url in new_disc_urls: + try: + comp_n, filing_n = process_disclosure( + db, session, summary_url, disc_url, year, dry_run=dry_run + ) + processed.add(disc_url) + new_count += 1 + print(f" processed: {comp_n} clients, {filing_n} filings") + if not dry_run: + _save_live_cursor(db, processed, cache) + except Exception as e: + print(f" failed to process {disc_url}: {e}", file=sys.stderr) + + return new_count + + +# ── Historical backfill ─────────────────────────────────────────────────────── + + +def run_backfill( + db: "firestore.Client | None", + years: list[int], + limit: int | None = None, + dry_run: bool = False, +) -> int: + """Full historical backfill using the subcollection cursor. Resumable.""" + session = make_session() + total_new = 0 + + for year in years: + print(f"\n── {year} ──") + try: + summary_urls = fetch_summary_links(session, year) + except Exception as e: + print(f" failed to fetch summary links: {e}", file=sys.stderr) + continue + + if limit: + summary_urls = summary_urls[:limit] + + print(f" {len(summary_urls)} registrants on portal") + year_new = 0 + + for i, summary_url in enumerate(summary_urls): + try: + meta = fetch_disclosure_meta(session, summary_url) + except Exception as e: + print(f" [{i+1}/{len(summary_urls)}] failed to fetch summary: {e}", file=sys.stderr) + continue + + for disc_url in meta.disclosure_urls: + if db is not None and not dry_run and _is_backfill_processed(db, disc_url): + continue + try: + comp_n, filing_n = process_disclosure( + db, session, summary_url, disc_url, year, dry_run=dry_run + ) + if not dry_run: + _mark_backfill_processed(db, disc_url) + total_new += 1 + year_new += 1 + except Exception as e: + print(f" failed to process {disc_url}: {e}", file=sys.stderr) + + if (i + 1) % 50 == 0 or i + 1 == len(summary_urls): + print(f" [{i+1}/{len(summary_urls)}] {year_new} new disclosures so far") + + print(f" {year} complete: {year_new} new disclosures") + + return total_new + + +# ── Entry point ─────────────────────────────────────────────────────────────── + + +def main() -> None: + p = argparse.ArgumentParser() + p.add_argument("--year", type=int, default=None) + p.add_argument("--limit", type=int, default=None) + p.add_argument("--dry-run", action="store_true") + p.add_argument( + "--mode", + choices=["weekly", "backfill"], + default="weekly", + help="weekly: incremental check; backfill: full history with subcollection cursor", + ) + args = p.parse_args() + + current_year = datetime.now(tz=timezone.utc).year + + if args.year: + years = [args.year] + elif args.mode == "weekly": + years = [current_year, current_year - 1] + else: + years = list(range(FIRST_YEAR, current_year + 1)) + + project = os.environ.get("GOOGLE_CLOUD_PROJECT") + db = firestore.Client(project=project) if not args.dry_run else None + + if args.mode == "weekly": + n = run_weekly(db, years, limit=args.limit, dry_run=args.dry_run) + if n == 0: + print("\nNo new disclosures found.") + else: + print(f"\nDone: {n} new disclosures written.") + else: + n = run_backfill(db, years, limit=args.limit, dry_run=args.dry_run) + print(f"\nBackfill complete: {n} new disclosures written.") + + # Emit structured result for callers (e.g. TypeScript backfill script) + print(json.dumps({"newDisclosures": n}), file=sys.stderr) + + +if __name__ == "__main__": + main() diff --git a/lobbying-scraper/tests/__init__.py b/lobbying-scraper/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/lobbying-scraper/tests/__pycache__/__init__.cpython-37.pyc b/lobbying-scraper/tests/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 000000000..33712a9e9 Binary files /dev/null and b/lobbying-scraper/tests/__pycache__/__init__.cpython-37.pyc differ diff --git a/lobbying-scraper/tests/__pycache__/test_portal_parser.cpython-37-pytest-7.4.4.pyc b/lobbying-scraper/tests/__pycache__/test_portal_parser.cpython-37-pytest-7.4.4.pyc new file mode 100644 index 000000000..e18d528fe Binary files /dev/null and b/lobbying-scraper/tests/__pycache__/test_portal_parser.cpython-37-pytest-7.4.4.pyc differ diff --git a/lobbying-scraper/tests/fixtures/2007e_disc.html.gz b/lobbying-scraper/tests/fixtures/2007e_disc.html.gz new file mode 100644 index 000000000..4311a97da Binary files /dev/null and b/lobbying-scraper/tests/fixtures/2007e_disc.html.gz differ diff --git a/lobbying-scraper/tests/fixtures/2007e_summ.html.gz b/lobbying-scraper/tests/fixtures/2007e_summ.html.gz new file mode 100644 index 000000000..58d40abbe Binary files /dev/null and b/lobbying-scraper/tests/fixtures/2007e_summ.html.gz differ diff --git a/lobbying-scraper/tests/fixtures/2011e_disc.html.gz b/lobbying-scraper/tests/fixtures/2011e_disc.html.gz new file mode 100644 index 000000000..2a01a7963 Binary files /dev/null and b/lobbying-scraper/tests/fixtures/2011e_disc.html.gz differ diff --git a/lobbying-scraper/tests/fixtures/2011e_summ.html.gz b/lobbying-scraper/tests/fixtures/2011e_summ.html.gz new file mode 100644 index 000000000..d7b525310 Binary files /dev/null and b/lobbying-scraper/tests/fixtures/2011e_summ.html.gz differ diff --git a/lobbying-scraper/tests/fixtures/2011i_disc.html.gz b/lobbying-scraper/tests/fixtures/2011i_disc.html.gz new file mode 100644 index 000000000..84736b4a7 Binary files /dev/null and b/lobbying-scraper/tests/fixtures/2011i_disc.html.gz differ diff --git a/lobbying-scraper/tests/fixtures/2011i_summ.html.gz b/lobbying-scraper/tests/fixtures/2011i_summ.html.gz new file mode 100644 index 000000000..430f68591 Binary files /dev/null and b/lobbying-scraper/tests/fixtures/2011i_summ.html.gz differ diff --git a/lobbying-scraper/tests/fixtures/2016e_disc.html.gz b/lobbying-scraper/tests/fixtures/2016e_disc.html.gz new file mode 100644 index 000000000..279fd59c0 Binary files /dev/null and b/lobbying-scraper/tests/fixtures/2016e_disc.html.gz differ diff --git a/lobbying-scraper/tests/fixtures/2016e_summ.html.gz b/lobbying-scraper/tests/fixtures/2016e_summ.html.gz new file mode 100644 index 000000000..a0bc28b85 Binary files /dev/null and b/lobbying-scraper/tests/fixtures/2016e_summ.html.gz differ diff --git a/lobbying-scraper/tests/fixtures/2024e_disc.html.gz b/lobbying-scraper/tests/fixtures/2024e_disc.html.gz new file mode 100644 index 000000000..9074511f5 Binary files /dev/null and b/lobbying-scraper/tests/fixtures/2024e_disc.html.gz differ diff --git a/lobbying-scraper/tests/fixtures/2024e_summ.html.gz b/lobbying-scraper/tests/fixtures/2024e_summ.html.gz new file mode 100644 index 000000000..cb6181711 Binary files /dev/null and b/lobbying-scraper/tests/fixtures/2024e_summ.html.gz differ diff --git a/lobbying-scraper/tests/fixtures/2024i_disc.html.gz b/lobbying-scraper/tests/fixtures/2024i_disc.html.gz new file mode 100644 index 000000000..44dddc2ef Binary files /dev/null and b/lobbying-scraper/tests/fixtures/2024i_disc.html.gz differ diff --git a/lobbying-scraper/tests/fixtures/2024i_summ.html.gz b/lobbying-scraper/tests/fixtures/2024i_summ.html.gz new file mode 100644 index 000000000..e6dc3b1b6 Binary files /dev/null and b/lobbying-scraper/tests/fixtures/2024i_summ.html.gz differ diff --git a/lobbying-scraper/tests/test_portal_parser.py b/lobbying-scraper/tests/test_portal_parser.py new file mode 100644 index 000000000..03f73aaee --- /dev/null +++ b/lobbying-scraper/tests/test_portal_parser.py @@ -0,0 +1,172 @@ +"""Regression tests for the MA SoS lobbying disclosure parser. + +The portal HTML has four distinct format eras; the parser is the most likely +thing to silently break when the portal changes its markup. These tests parse +committed fixture pages (one per era, employer + individual) and assert known- +correct compensation totals, client/bill counts, era detection, and specific +bug fixes (the "Total amount" summary-row artifact; the "H73;" semicolon bill +separator; hybrid-era Panel1 compensation). + +Fixtures: tests/fixtures/*.html.gz (gzipped real disclosure + summary pages). +""" + +import gzip +import sys +from pathlib import Path + +import pytest +from bs4 import BeautifulSoup + +sys.path.insert(0, str(Path(__file__).parent.parent)) + +from portal import ( + _parse_amount, + parse_disclosure_detail, + parse_summary, + year_to_general_court, +) + +FIXTURES = Path(__file__).parent / "fixtures" + + +def _soup(name: str) -> BeautifulSoup: + with gzip.open(FIXTURES / f"{name}.html.gz", "rt", encoding="utf-8") as fh: + return BeautifulSoup(fh.read(), "html.parser") + + +def _comp_total(detail) -> float: + return sum(c.amount for c in detail.compensation if c.amount) + + +# ── Disclosure parsing ──────────────────────────────────────────────────────── + +# (fixture, year, expected_comp, n_clients, n_bills, era_label) +DISCLOSURE_CASES = [ + ("2007e", 2007, 112_500.00, 1, 2, "legacy 2005-2008: entity total under _total_salary_"), + ("2011e", 2011, 641_243.00, 23, 4, "legacy 2009-2013: per-client Compensation received column"), + ("2016e", 2016, 990_474.00, 30, 1357, "hybrid 2014-2018: Panel1 div totals"), + ("2024e", 2024, 115_000.00, 5, 22, "modern 2019+: grdvClientPaidToEntity"), + ("2024i", 2024, 1_095_200.0, 17, 135, "modern 2019+ individual"), + ("2011i", 2011, 18_518.00, 1, 0, "legacy 2009-2013 individual"), +] + + +@pytest.mark.parametrize("fix,year,exp_comp,n_clients,n_bills,era", DISCLOSURE_CASES) +def test_compensation_total_and_counts(fix, year, exp_comp, n_clients, n_bills, era): + detail = parse_disclosure_detail(_soup(f"{fix}_disc"), year) + assert _comp_total(detail) == pytest.approx(exp_comp, abs=1.0), f"{fix} ({era}) comp total" + assert len(detail.compensation) == n_clients, f"{fix} ({era}) client count" + assert len(detail.bills) == n_bills, f"{fix} ({era}) bill count" + + +@pytest.mark.parametrize("fix,year,_c,_n,_b,_e", DISCLOSURE_CASES) +def test_no_total_amount_artifact(fix, year, _c, _n, _b, _e): + """The legacy individual summary row (client_name == 'Total amount') must + never be captured as a real client — that bug inflated 2010-2013 comp rows.""" + detail = parse_disclosure_detail(_soup(f"{fix}_disc"), year) + bad = [ + c for c in detail.compensation + if c.client_name in ("Total amount", "Total", "") + ] + assert not bad, f"{fix} produced summary-row artifacts: {bad}" + + +def test_legacy_2007_uses_total_salary_placeholder(): + """2005-2008 has no per-client comp column; comp falls back to the entity + salary total stored under the _total_salary_ placeholder client.""" + detail = parse_disclosure_detail(_soup("2007e_disc"), 2007) + assert [c.client_name for c in detail.compensation] == ["_total_salary_"] + + +def test_legacy_2011_is_per_client_not_placeholder(): + """2009-2013 has a per-client 'Compensation received' column, so comp is + stored under real client names — never the _total_salary_ placeholder.""" + detail = parse_disclosure_detail(_soup("2011e_disc"), 2011) + names = [c.client_name for c in detail.compensation] + assert "_total_salary_" not in names + assert len(names) == len(set(names)), "per-client compensation should be deduplicated" + + +def test_hybrid_2016_has_nonzero_compensation(): + """2014-2018 compensation comes from Panel1 divs; was silently $0 before fix.""" + detail = parse_disclosure_detail(_soup("2016e_disc"), 2016) + assert _comp_total(detail) == pytest.approx(990_474.00, abs=1.0) + assert len(detail.compensation) == 30 + + +def test_semicolon_bill_separator_parsed(): + """Legacy bill tokens may use 'H73; Title' (semicolon separator) instead of + a space; the bill number must still be parsed correctly.""" + detail = parse_disclosure_detail(_soup("2011e_disc"), 2011) + house_numbers = {b.raw_bill_number for b in detail.bills if b.chamber == "House Bill"} + assert "73" in house_numbers, "H73 (semicolon-separated) should be parsed" + + +def test_modern_individual_per_client_comp(): + """Modern individual registrants report per-client compensation in + grdvClientPaidToEntity — verify it is captured.""" + detail = parse_disclosure_detail(_soup("2024i_disc"), 2024) + assert _comp_total(detail) > 0 + assert all( + c.client_name not in ("Total amount", "") for c in detail.compensation + ) + + +# ── Summary page parsing ────────────────────────────────────────────────────── + +# (fixture, entity_name, year, reg_type, n_disc_urls) +SUMMARY_CASES = [ + ("2007e_summ", "Ventry Associates, LLP", 2007, "Employer", 2), + ("2011e_summ", "ML Strategies, LLC", 2011, "Employer", 7), + ("2024e_summ", "21c, LLC", 2024, "Employer", 2), + ("2024i_summ", "Anthony Arthur Abdelahad", 2024, "Lobbyist", 2), + ("2011i_summ", "Aaron Judd Agulnek", 2011, "Lobbyist", 4), +] + + +@pytest.mark.parametrize("fix,name,year,reg_type,n_urls", SUMMARY_CASES) +def test_summary_metadata(fix, name, year, reg_type, n_urls): + meta = parse_summary(_soup(fix)) + assert meta.entity_name == name + assert meta.year == year + assert meta.reg_type == reg_type + assert len(meta.disclosure_urls) == n_urls + assert all("CompleteDisclosure" in u for u in meta.disclosure_urls) + + +# ── Helper functions ────────────────────────────────────────────────────────── + +def test_parse_amount(): + assert _parse_amount("$1,234.56") == 1234.56 + assert _parse_amount("$0.00") == 0.0 + assert _parse_amount("") is None + assert _parse_amount("N/A") is None + + +def test_year_to_general_court(): + assert year_to_general_court(2003) == 183 + assert year_to_general_court(2004) == 183 + assert year_to_general_court(2005) == 184 + assert year_to_general_court(2023) == 193 + assert year_to_general_court(2025) == 194 + + +# ── Bill ID construction ────────────────────────────────────────────────────── + +def test_bill_ids_in_modern_disclosure(): + """Verify bill_id is correctly constructed for each chamber prefix.""" + detail = parse_disclosure_detail(_soup("2024e_disc"), 2024) + house = [b for b in detail.bills if b.chamber == "House Bill"] + senate = [b for b in detail.bills if b.chamber == "Senate Bill"] + if house: + assert all(b.bill_id and b.bill_id.startswith("H") for b in house) + if senate: + assert all(b.bill_id and b.bill_id.startswith("S") for b in senate) + + +def test_executive_rows_have_null_bill_id(): + """Executive chamber rows must produce null billId so no accidental bill join occurs.""" + detail = parse_disclosure_detail(_soup("2024i_disc"), 2024) + executive = [b for b in detail.bills if b.chamber == "Executive"] + if executive: + assert all(b.bill_id is None for b in executive) diff --git a/lobbying-scraper/writer.py b/lobbying-scraper/writer.py new file mode 100644 index 000000000..d49d12424 --- /dev/null +++ b/lobbying-scraper/writer.py @@ -0,0 +1,123 @@ +"""Firestore document construction and write helpers. + +Mirrors the data model in functions/src/lobbying/types.ts. All collection +names and field names must stay in sync with that file. +""" + +from __future__ import annotations + +from datetime import datetime, timezone + +from google.cloud import firestore +from normalize import normalize_entity_name +from portal import ( + BillActivity, + Compensation, + DisclosureDetail, + DisclosureMeta, + filing_id, + registrant_id, + year_to_general_court, +) + +REGISTRANTS_COLLECTION = "lobbyingRegistrants" +FILINGS_COLLECTION = "lobbyingFilings" +SCRAPER_DOC = "scrapers/lobbying" +BACKFILL_DOC = "scrapers/lobbyingBackfill" +BACKFILL_URLS_COLLECTION = "processedUrls" + + +def _now() -> datetime: + return datetime.now(tz=timezone.utc) + + +def write_registrant( + db: firestore.Client, + meta: DisclosureMeta, + detail: DisclosureDetail, + disc_url: str, +) -> None: + """Upsert a LobbyingRegistrant document.""" + if not meta.entity_name or meta.year is None: + return + + doc_id = registrant_id(meta.entity_name, meta.year) + ref = db.collection(REGISTRANTS_COLLECTION).document(doc_id) + + clients = [ + { + "clientName": c.client_name, + "clientNameNorm": normalize_entity_name(c.client_name), + "compensation": c.amount, + } + for c in detail.compensation + ] + + data = { + "registrantId": doc_id, + "entityName": meta.entity_name, + "entityNameNorm": normalize_entity_name(meta.entity_name), + "year": meta.year, + "generalCourt": year_to_general_court(meta.year), + "regType": meta.reg_type, + "clients": clients, + "disclosureUrls": firestore.ArrayUnion([disc_url]), + "fetchedAt": _now(), + } + ref.set(data, merge=True) + + +def write_filings( + db: firestore.Client, + meta: DisclosureMeta, + detail: DisclosureDetail, +) -> int: + """Batch-write LobbyingFiling documents. Returns the number written.""" + if not meta.entity_name or meta.year is None or not detail.bills: + return 0 + + gc = year_to_general_court(meta.year) + entity_name = meta.entity_name + entity_norm = normalize_entity_name(entity_name) + now = _now() + + batch = db.batch() + count = 0 + + for bill in detail.bills: + fid = filing_id( + entity_name, + bill.client_name, + bill.chamber, + bill.bill_id, + gc, + bill.position, + ) + ref = db.collection(FILINGS_COLLECTION).document(fid) + doc = { + "filingId": fid, + "entityName": entity_name, + "entityNameNorm": entity_norm, + "clientName": bill.client_name, + "clientNameNorm": normalize_entity_name(bill.client_name), + "year": meta.year, + "generalCourt": gc, + "chamber": bill.chamber, + "billId": bill.bill_id, + "activityTitle": bill.activity_title, + "position": bill.position, + "amount": bill.amount, + "fetchedAt": now, + } + batch.set(ref, doc) + count += 1 + + # Firestore batch limit is 500 writes + if count % 400 == 0: + batch.commit() + batch = db.batch() + + if count % 400 != 0: + batch.commit() + + return count