Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -92,5 +92,10 @@ cert.txt
# local MCP server config (contains auth tokens)
.mcp.json
mcp-server/create-agent-key.ts
.gcloudignore

# Claude
CLAUDE.md

#gcloud
.gcloudignore

1,038 changes: 1,038 additions & 0 deletions docs/lobbying-disclosure-ingestion.md

Large diffs are not rendered by default.

91 changes: 84 additions & 7 deletions firestore.indexes.json
Original file line number Diff line number Diff line change
Expand Up @@ -788,25 +788,46 @@
"collectionGroup": "ballotQuestions",
"queryScope": "COLLECTION",
"fields": [
{ "fieldPath": "electionYear", "order": "ASCENDING" },
{ "fieldPath": "ballotStatus", "order": "ASCENDING" }
{
"fieldPath": "electionYear",
"order": "ASCENDING"
},
{
"fieldPath": "ballotStatus",
"order": "ASCENDING"
}
]
},
{
"collectionGroup": "publishedTestimony",
"queryScope": "COLLECTION_GROUP",
"fields": [
{ "fieldPath": "ballotQuestionId", "order": "ASCENDING" },
{ "fieldPath": "publishedAt", "order": "DESCENDING" }
{
"fieldPath": "ballotQuestionId",
"order": "ASCENDING"
},
{
"fieldPath": "publishedAt",
"order": "DESCENDING"
}
]
},
{
"collectionGroup": "publishedTestimony",
"queryScope": "COLLECTION",
"fields": [
{ "fieldPath": "billId", "order": "ASCENDING" },
{ "fieldPath": "court", "order": "ASCENDING" },
{ "fieldPath": "ballotQuestionId", "order": "ASCENDING" }
{
"fieldPath": "billId",
"order": "ASCENDING"
},
{
"fieldPath": "court",
"order": "ASCENDING"
},
{
"fieldPath": "ballotQuestionId",
"order": "ASCENDING"
}
]
},
{
Expand Down Expand Up @@ -898,6 +919,62 @@
}
}
]
},
{
"collectionGroup": "lobbyingFilings",
"queryScope": "COLLECTION",
"fields": [
{
"fieldPath": "generalCourt",
"order": "ASCENDING"
},
{
"fieldPath": "billId",
"order": "ASCENDING"
}
]
},
{
"collectionGroup": "lobbyingFilings",
"queryScope": "COLLECTION",
"fields": [
{
"fieldPath": "generalCourt",
"order": "ASCENDING"
},
{
"fieldPath": "chamber",
"order": "ASCENDING"
}
]
},
{
"collectionGroup": "lobbyingFilings",
"queryScope": "COLLECTION",
"fields": [
{
"fieldPath": "generalCourt",
"order": "ASCENDING"
},
{
"fieldPath": "entityNameNorm",
"order": "ASCENDING"
}
]
},
{
"collectionGroup": "lobbyingFilings",
"queryScope": "COLLECTION",
"fields": [
{
"fieldPath": "generalCourt",
"order": "ASCENDING"
},
{
"fieldPath": "clientNameNorm",
"order": "ASCENDING"
}
]
}
],
"fieldOverrides": [
Expand Down
8 changes: 8 additions & 0 deletions firestore.rules
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,14 @@ service cloud.firestore {
allow read: if true;
allow write: if false;
}
match /lobbyingRegistrants/{id} {
// public, read-only: any client may read; every client write is rejected
allow read: if true;
allow write: if false;
}
match /lobbyingFilings/{id} {
// public, read-only: any client may read; every client write is rejected
allow read: if true;
allow write: if false;
}
match /transcriptions/{tid} {
// public, read-only
allow read: if true
Expand Down
2 changes: 2 additions & 0 deletions functions/src/lobbying/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
export * from "./types"
export { normalizeEntityName } from "./normalize"
72 changes: 72 additions & 0 deletions functions/src/lobbying/normalize.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
/**
* Entity name normalization pipeline.
*
* The SoS portal does not enforce consistent name formatting. The same client or
* registrant may appear as "Acme Corp.", "ACME CORPORATION", "Acme, Inc. d/b/a
* Acme Consulting", etc. across filings and years.
*
* The steps are numbered in the comments below (Step 1 … Step 10) and must be
* applied in exactly that order; changing the order produces different
* (incorrect) output.
*/

// Step 2: strip d/b/a trade-name suffix before any other transforms so the
// trade name doesn't bleed into the canonical form. Matches " D/B/A <rest>"
// (with optional spaces around the slashes) or " DBA <rest>" and everything
// after it on the line.
const DBA_RE = /\s+D\s*\/+B\s*\/+A?\s+.*|\s+DBA\s+.*/i

// Step 5: remove legal entity type words with whole-word matching so
// "INCORPORATED" and "CORP" are caught in addition to "LLC"/"INC".
const LEGAL_ENTITY_RE =
  /\b(LLC|LLP|INC|INCORPORATED|CORPORATION|CORP|LTD|LIMITED|PC|PLLC)\b/g

// Step 6: remove "THE" as a whole word anywhere (not just as a leading prefix).
const THE_RE = /\bTHE\b/g

// Step 9: professional suffix phrases to remove wholesale. They are removed in
// array order by plain substring match, so longer phrases must stay ahead of
// the shorter phrases they contain (e.g. "PUBLIC POLICY GROUP" before
// "POLICY GROUP", "AND ASSOCIATES" before "ASSOCIATES").
const MISC_PHRASES = [
  "LAW OFFICE OF",
  "AND ASSOCIATES",
  // Step 7 has already rewritten every "&" to "AND" by the time step 9 runs,
  // so this entry cannot match; it is kept so the list stays unchanged.
  "& ASSOCIATES",
  "AND ASSOC",
  "ATTORNEY AT LAW",
  "ATTORNEY@LAW",
  "ATTORNET AT LAW", // known portal typo
  "AND PARTNERS",
  "PUBLIC POLICY GROUP",
  "LEGISLATIVE SERVICES",
  "POLICY GROUP",
  "ASSOCIATES",
  "COUNSELLORS AT LAW"
]

/**
 * Reduce a registrant/client name to a canonical comparison key.
 *
 * The pipeline upper-cases the name, drops any d/b/a trade name, turns hyphens
 * and common punctuation into spaces, removes legal-entity words, "THE", and a
 * fixed list of professional suffix phrases, and finally collapses whitespace.
 * The steps are order-dependent; do not reorder them.
 *
 * @param raw - Name as it appears in the source data; may be null/undefined.
 * @returns The normalized name, or "" when `raw` is null, undefined or empty.
 *   The result can also be "" when every word of the input is stripped.
 */
export function normalizeEntityName(raw: string | null | undefined): string {
  if (!raw) return ""

  let x = raw.toUpperCase() // Step 1: uppercase

  x = x.replace(DBA_RE, "") // Step 2: strip d/b/a suffix

  x = x.replace(/-/g, " ") // Step 3: hyphen → space

  // Step 4: punctuation → space (not empty string, so ",INC" → " INC" → caught
  // by step 5's whole-word removal).
  for (const ch of [",", ".", "'", "‘", "’", "(", ")"]) {
    x = x.split(ch).join(" ")
  }

  x = x.replace(LEGAL_ENTITY_RE, " ") // Step 5: remove legal entity type words

  x = x.replace(THE_RE, " ") // Step 6: remove THE anywhere

  x = x.replace(/&/g, "AND") // Step 7: ampersand → AND

  // Step 8: fix known portal typo. String.prototype.replace with a string
  // pattern only rewrites the FIRST occurrence, which would let a second
  // "ASSICIATES" slip past the "ASSOCIATES" removal in step 9. split/join
  // rewrites every occurrence, like steps 4 and 9.
  x = x.split("ASSICIATES").join("ASSOCIATES")

  // Step 9: remove professional suffix phrases
  for (const phrase of MISC_PHRASES) {
    x = x.split(phrase).join(" ")
  }

  x = x.replace(/\s+/g, " ").trim() // Step 10: collapse whitespace

  return x
}
101 changes: 101 additions & 0 deletions functions/src/lobbying/types.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
import {
Array,
InstanceOf,
Literal,
Number,
Null,
Record,
Static,
String,
Union
} from "runtypes"
import { Timestamp } from "../firebase"

/**
 * Category of a lobbying activity. The runtype accepts exactly the six string
 * literals below; the static type of the same name is derived from it.
 */
export type LobbyingChamber = Static<typeof LobbyingChamber>
export const LobbyingChamber = Union(
Literal("House Bill"),
Literal("Senate Bill"),
Literal("House Docket"),
Literal("Senate Docket"),
Literal("Executive"),
Literal("Other")
)

/**
 * One client entry embedded in a LobbyingRegistrant's `clients` array.
 */
export type LobbyingClient = Static<typeof LobbyingClient>
export const LobbyingClient = Record({
// Client name as scraped.
clientName: String,
// Normalized form of clientName — presumably the output of
// normalizeEntityName; confirm against the writer.
clientNameNorm: String,
// null when no compensation figure is available.
compensation: Null.Or(Number)
})

/**
 * Shape of a lobbying registrant document (runtype + derived static type).
 */
export type LobbyingRegistrant = Static<typeof LobbyingRegistrant>
export const LobbyingRegistrant = Record({
registrantId: String,
// Registrant name as scraped, and its normalized form (presumably from
// normalizeEntityName — confirm against the writer).
entityName: String,
entityNameNorm: String,
year: Number,
generalCourt: Number,
// Only these two registration types are accepted.
regType: Union(Literal("Lobbyist"), Literal("Employer")),
clients: Array(LobbyingClient),
disclosureUrls: Array(String),
// Must be an instance of the Timestamp class exported by ../firebase.
fetchedAt: InstanceOf(Timestamp)
})

/**
 * Shape of a lobbying filing document (runtype + derived static type).
 * Each filing carries both the raw and normalized registrant/client names.
 */
export type LobbyingFiling = Static<typeof LobbyingFiling>
export const LobbyingFiling = Record({
filingId: String,
entityName: String,
entityNameNorm: String,
clientName: String,
clientNameNorm: String,
year: Number,
generalCourt: Number,
chamber: LobbyingChamber,
// Non-null only for legislative chambers (House Bill, Senate Bill, House Docket,
// Senate Docket). For Executive and Other, no bill join should be attempted.
billId: Null.Or(String),
activityTitle: String,
// Free-form string; this schema does not restrict the allowed values.
position: String,
// null when no amount is available.
amount: Null.Or(Number),
// Must be an instance of the Timestamp class exported by ../firebase.
fetchedAt: InstanceOf(Timestamp)
})

/** Firestore collection name for lobbying registrant documents */
export const REGISTRANTS_COLLECTION = "lobbyingRegistrants"

/** Firestore collection name for lobbying filing documents */
export const FILINGS_COLLECTION = "lobbyingFilings"

/** Firestore document path for the live scraper cursor */
export const SCRAPER_DOC = "/scrapers/lobbying"

/**
 * Firestore document path for the backfill cursor, and the name of the
 * collection of processed URLs (presumably a subcollection of that document —
 * confirm against the backfill writer).
 */
export const BACKFILL_DOC = "/scrapers/lobbyingBackfill"
export const BACKFILL_URLS_COLLECTION = "processedUrls"

/** Earliest year with portal data */
export const FIRST_LOBBYING_YEAR = 2005

/**
 * Sentinel clientName used for pre-2013 legacy filings where compensation is
 * reported as a single total rather than broken down per client.
 */
export const LEGACY_TOTAL_CLIENT = "_total_salary_"

/**
 * Chamber prefix map for constructing billId values that match MAPLE's Bill.id.
 *
 * Only the four legislative chambers have an entry; "Executive" and "Other"
 * (and any other key) look up as undefined, which the value type makes
 * explicit so callers must handle the missing case.
 *
 * Typed as a plain string index signature so that any LobbyingChamber can be
 * used as a key without a "Property X does not exist" error.
 */
export const CHAMBER_PREFIXES: { [chamber: string]: string | undefined } = {
"House Bill": "H",
"Senate Bill": "S",
"House Docket": "HD",
"Senate Docket": "SD"
}

/**
 * Canonical chamber values for legacy short-form codes found in older filings.
 *
 * NOTE(review): the index signature types every lookup as LobbyingChamber, but
 * only "HB" and "SB" are present; any other key yields undefined at runtime
 * (and the compiler only reports that when noUncheckedIndexedAccess is on).
 * Callers should guard the result.
 */
export const LEGACY_CHAMBER_MAP: { [raw: string]: LobbyingChamber } = {
HB: "House Bill",
SB: "Senate Bill"
}
4 changes: 4 additions & 0 deletions lobbying-scraper/.dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
__pycache__/
*.pyc
*.pyo
.env
16 changes: 16 additions & 0 deletions lobbying-scraper/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Container image for the lobbying scraper.
FROM python:3.12-slim

WORKDIR /app

# Copy and install requirements before the sources so the dependency layer is
# reused from cache when only the scripts change.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Only these four modules are shipped in the image.
COPY normalize.py portal.py writer.py scrape.py ./

# Cloud Run sets PORT; we don't use it (this is a job, not a server).
# NOTE(review): an earlier note here said Cloud Scheduler invokes the container
# via HTTP POST to /, which conflicts with "not a server" — this Dockerfile
# starts no HTTP listener. Confirm whether this is deployed as a Cloud Run job
# (triggered through the Jobs API) or as a service, and update accordingly.
# PYTHONUNBUFFERED makes Python write stdout/stderr without buffering, so log
# lines appear as they are emitted.
ENV PYTHONUNBUFFERED=1

# ENTRYPOINT is the fixed executable; CMD provides default args that --args overrides.
ENTRYPOINT ["python3", "scrape.py"]
CMD ["--mode", "weekly"]
Binary file not shown.
Binary file not shown.
Loading
Loading