From 66d5582ee64f386734e5a08a9819cb24d1edc314 Mon Sep 17 00:00:00 2001 From: Sam DeMarrais Date: Tue, 16 Jun 2026 20:05:20 -0400 Subject: [PATCH 1/4] Election scraping --- functions/src/legislators/elections.ts | 262 +++++++++++++++++++++++++ 1 file changed, 262 insertions(+) create mode 100644 functions/src/legislators/elections.ts diff --git a/functions/src/legislators/elections.ts b/functions/src/legislators/elections.ts new file mode 100644 index 000000000..254b81308 --- /dev/null +++ b/functions/src/legislators/elections.ts @@ -0,0 +1,262 @@ +import { JSDOM } from 'jsdom' +import { Array as ArrayType, Runtype, Union, Literal, String, Boolean, Number, Optional, Static, Record } from 'runtypes' + +const officeIds = { + "President": 1, + "U.S. Senate": 6, + "U.S. House": 5, + "Governor": 3, + "Lieutenant Governor": 4, + "Attorney General": 12, + "Secretary of the Commonwealth": 45, + "Treasurer": 53, + "Auditor": 90, + "Governor's Council": 529, + "State Senate": 9, + "State Representative": 8, + "Party State Committee Man": 521, + "Party State Committee Woman": 522, + "Delegate to the National Convention": 543, + "Alternate Delegate to the National Convention": 544, + "District Attorney": 530, + "Clerk of Courts": 15, + "Clerk of Superior Court (Civil)": 534, + "Clerk of Superior Court (Criminal)": 535, + "Clerk of Supreme Judicial Court": 536, + "County Charter Commission": 532, + "Register of Deeds": 384, + "Sheriff": 386, + "County Treasurer": 389, + "Probate Judge": 434, + "Register of Probate": 537, + "Council of Governments Executive Committee": 531 +} as const +export const offices = Object.keys(officeIds) as (keyof typeof officeIds)[]; +export type Office = keyof typeof officeIds + +export const parties = [ + "General", + "American", + "Democratic", + "Green-Rainbow", + "Independent Voters", + "Libertarian", + "Republican", + "Working Families", + "United Independent Party", + "United Independent", + "Independent", + "Green", + "Workers Party" +] as const; +export type Party = (typeof parties)[number]; +export const Party = Union( + Literal(parties[0]), ...parties.slice(1).map(Literal) +); + +const stages = [ + "Primaries", + ...parties +] + +export const ElectionCandidate = Record({ + name: String, + writeIn: Boolean, + votes: Number, + percent: Number, + // Note: During a primary election, no candidate is assigned a party + party: Optional(String), +}); + +export type ElectionCandidate = Static; + +export const ElectionResult = Record({ + candidates: ArrayType(ElectionCandidate), + otherVotes: Number, + blankVotes: Number, + totalVotes: Number, + electionDetailsUrl: String, // If we want votes by town +}); + +export type ElectionStage = Static; + +export const ElectionStage = Record({ + party: Party, + special: Boolean, +}) + +export type ElectionResult = Static; + +export const ElectionInfo = Record({ + year: Number, + office: String, + districts: String, + stage: ElectionStage, + result: Optional(ElectionResult), +}); + +export type ElectionInfo = Static; + +const baseURL = 'https://electionstats.state.ma.us' + +function parseElectionStage(input: string): ElectionStage | null { + const escape = (s: string) => s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); + + const partyPattern = parties + .map(escape) + .sort((a, b) => b.length - a.length) + .join("|"); + + const regex = new RegExp( + `^(Special )?(${partyPattern}) (Primary|Election)$` + ); + + const match = input.match(regex); + if (!match) { + return null; + } + + const [, special, party, stage] = match; + + // Only "General Election" is valid. + if (stage === "Election" && party !== "General") { + return null; + } + + // Only non-General parties have primaries. + if (stage === "Primary" && party === "General") { + return null; + } + + return { + party: Party.check(party), + special: special !== undefined, + }; +} + +function parseElectionTable(table: Element): ElectionResult { + const candidateRows = table.querySelectorAll( + ':scope > tr:not(.non_candidate):not(.more_info)' + ); + + const candidates = Array.from(candidateRows).map(row => { + const name = row.querySelector('.candidate .name a')?.textContent?.trim() || ''; + + let affiliationText = + row.querySelector('.candidate .party')?.textContent?.trim() || ''; + + const writeIn = /\bwrite-in\b/i.test(affiliationText); + + let party; + affiliationText = affiliationText.replace(/\(write-in\)/i, '').replace(/unenrolled/i, '').trim() + if (affiliationText) { + party = affiliationText + } + + const voteText = row.querySelector('td:nth-child(2)')?.textContent?.replace(/,/g, '') + const percentText = row.querySelector('td:nth-child(3)')?.textContent?.trim()?.slice(0, -1) + if (!name || !voteText || !percentText) { + throw new Error(row.outerHTML) + } + + const candidate = { + name, + writeIn, + votes: parseInt(voteText, 10), + percent: parseFloat(percentText), + ...(party ? { party } : {}) + }; + + return candidate; + }); + + candidates.sort((a,b) => b.votes - a.votes) + + const getSummaryValue = (selector: string): number => { + const row = table.querySelector(selector); + + const text = row?.querySelector('td:nth-child(2)')?.textContent?.replace(/,/g, ''); + if (!text) { + return 0 + } + + return parseInt(text, 10); + }; + + const link = table.querySelector('tr.more_info a')?.href + if (!link) { + throw new Error("Link not present") + } + + return { + candidates, + otherVotes: getSummaryValue('tr.n_all_other_votes'), + blankVotes: getSummaryValue('tr.n_blank_votes'), + totalVotes: getSummaryValue('tr.n_total_votes'), + electionDetailsUrl: `${baseURL}${link}` + }; +} + +function info(dom: JSDOM): ElectionInfo[] { + const elements = Array.from(dom.window.document.querySelectorAll( + '[id^="election-id-"]' + )) + const info = elements.map(electionElem => { + const electTDs = Array.from(electionElem.children).filter(child => child.tagName === 'TD') + const yearText = electTDs[0].textContent + if (!yearText) { + throw new Error("eh") + } + const year = parseInt(yearText, 10) + const office = electTDs[1].textContent?.trim() + const districts = electTDs[2].textContent?.trim() + const stage = parseElectionStage(electTDs[3].textContent?.trim() ?? '') + if (!stage) { + throw new Error(`${stage} is not a recognized election stage`) + } + if (electTDs[4].querySelector(':scope > .no_candidates')) { + return ElectionInfo.check({ + year, + office, + districts, + stage + }) + } + const candidateTable = electTDs[4].querySelector(':scope tbody') + if (!candidateTable) { + throw new Error("Election results expects table or no candidates") + } + const result = parseElectionTable(candidateTable) + + return ElectionInfo.check({ + year, + office, + districts, + stage, + result + }) + }) + return info +} + +export async function fetchElectionData( + startYear: number, + endYear: number, + office?: Office, + stage: null | string = "General" +): Promise { + if (stage !== null && !stages.includes(stage)) { + throw new Error("Unrecognized election stage") + } + const officeId = office ? `/office_id:${officeIds[office]}` : '' + const electionStage = stage ? `/stage:${stage}` : '' + const url = `${baseURL}/elections/search/year_from:${startYear}/year_to:${endYear}${officeId}${electionStage}` + const dom = new JSDOM(await (await fetch(url)).text()) + return info(dom) +} + +import { writeFileSync } from 'fs' + +(async () => { + writeFileSync('elections.json', JSON.stringify(await fetchElectionData(2022, 2023), undefined, 2)) +})() \ No newline at end of file From cdadf480178c6207ea9458eb76710e9da51377f9 Mon Sep 17 00:00:00 2001 From: Sam DeMarrais Date: Sat, 27 Jun 2026 22:40:52 -0400 Subject: [PATCH 2/4] Further election scraping --- functions/src/index.ts | 1 + functions/src/legislators/ElectionScraper.ts | 51 +++ functions/src/legislators/electionTypes.ts | 127 +++++++ functions/src/legislators/elections.ts | 262 --------------- functions/src/legislators/index.ts | 1 + functions/src/legislators/scrapeElections.ts | 331 +++++++++++++++++++ scripts/firebase-admin/backfillElections.ts | 22 ++ 7 files changed, 533 insertions(+), 262 deletions(-) create mode 100644 functions/src/legislators/ElectionScraper.ts create mode 100644 functions/src/legislators/electionTypes.ts delete mode 100644 functions/src/legislators/elections.ts create mode 100644 functions/src/legislators/index.ts create mode 100644 functions/src/legislators/scrapeElections.ts create mode 100644 scripts/firebase-admin/backfillElections.ts diff --git a/functions/src/index.ts b/functions/src/index.ts index 641255bf4..fc9a911ed 100644 --- a/functions/src/index.ts +++ b/functions/src/index.ts @@ -57,6 +57,7 @@ export { unfollowUser, getFollowers } from "./subscriptions" +export { scrapeElections } from "./legislators" export { transcription } from "./webhooks" diff --git a/functions/src/legislators/ElectionScraper.ts b/functions/src/legislators/ElectionScraper.ts new file mode 100644 index 000000000..9adcbbaf8 --- /dev/null +++ b/functions/src/legislators/ElectionScraper.ts @@ -0,0 +1,51 @@ +import { runWith, RuntimeOptions } from "firebase-functions" +import { db } from "../firebase" +import { electionId } from "./electionTypes" +import { fetchElectionsData } from "./scrapeElections" + +export class ElectionScraper { + private schedule + private timeout + private memory + + constructor( + schedule: string = "every 24 hours", + timeout: number = 480, + memory: RuntimeOptions["memory"] = "256MB" + ) { + this.schedule = schedule + this.timeout = timeout + this.memory = memory + } + + get function() { + return runWith({ + timeoutSeconds: this.timeout, + memory: this.memory, + maxInstances: 1 + }) + .pubsub.schedule(this.schedule) + .onRun(() => this.run()) + } + + private async run(yearTo?: number, yearFrom?: number) { + const date = new Date() + yearTo = yearTo ?? date.getFullYear() + yearFrom = yearFrom ?? (date.getMonth() < 6 ? yearTo - 1 : yearTo) + + const list = await fetchElectionsData(yearFrom, yearTo) + + if (!list) return + + const writer = db.bulkWriter() + + for (let item of list) { + const id = electionId(item) + writer.set(db.doc(`/electionResults/${id}`), item, { merge: true }) + } + + await writer.close() + } +} + +export const scrapeElections = new ElectionScraper().function diff --git a/functions/src/legislators/electionTypes.ts b/functions/src/legislators/electionTypes.ts new file mode 100644 index 000000000..9dd1ebfc2 --- /dev/null +++ b/functions/src/legislators/electionTypes.ts @@ -0,0 +1,127 @@ +import { sha256 } from "js-sha256" +import { + Array, + Union, + Literal, + String, + Boolean, + Number, + Optional, + Static, + Record +} from "runtypes" + +export const officeIds = { + President: 1, + "U.S. Senate": 6, + "U.S. House": 5, + Governor: 3, + "Lieutenant Governor": 4, + "Attorney General": 12, + "Secretary of the Commonwealth": 45, + Treasurer: 53, + Auditor: 90, + "Governor's Council": 529, + "State Senate": 9, + "State Representative": 8, + "Party State Committee Man": 521, + "Party State Committee Woman": 522, + "Delegate to the National Convention": 543, + "Alternate Delegate to the National Convention": 544, + "District Attorney": 530, + "Clerk of Courts": 15, + "Clerk of Superior Court (Civil)": 534, + "Clerk of Superior Court (Criminal)": 535, + "Clerk of Supreme Judicial Court": 536, + "County Charter Commission": 532, + "Register of Deeds": 384, + Sheriff: 386, + "County Treasurer": 389, + "Probate Judge": 434, + "Register of Probate": 537, + "Council of Governments Executive Committee": 531 +} as const +export const offices = Object.keys(officeIds) as (keyof typeof officeIds)[] +export type Office = keyof typeof officeIds + +export const parties = [ + "General", + "American", + "Democratic", + "Green-rainbow", // Green-rainbow has the case Green-Rainbow in some scenarios + "Independent Voters", + "Libertarian", + "Republican", + "Working Families", + "United Independent Party", + "United Independent", + "Independent", + "Green", + "Workers Party" +] as const +export type Party = (typeof parties)[number] +export const Party = Union( + Literal(parties[0]), + ...parties.slice(1).map(Literal) +) + +export const stages = ["Primaries", ...parties] +export type StageSelection = (typeof stages)[number] +export const StageSelection = Union( + Literal(stages[0]), + ...stages.slice(1).map(Literal) +) + +export const ElectionCandidate = Record({ + name: String, + writeIn: Boolean, + votes: Number, + // Note: During a primary election, no candidate is assigned a party + party: Optional(String) +}) + +export type ElectionCandidate = Static + +export const ElectionResult = Record({ + candidates: Array(ElectionCandidate), + otherVotes: Number, + blankVotes: Number, + noPreferenceVotes: Number.optional(), + totalVotes: Number, + electionDetailsUrl: String // Can also provide votes by town/ward +}) + +export type ElectionStage = Static + +export const ElectionStage = Record({ + party: Party, + special: Boolean +}) + +export type ElectionResult = Static + +export const ElectionInfo = Record({ + // Aligned with Candidates[], for use with Firestore array-contains + // More specific than name; for example, a dual election (such as for president/vice president) + // has a name "Harris and Walz", but the link is to the page for Kamala Harris + candidateUrls: Array(String), + // As far as I can tell, the only place exact date is shown + // is the search menu and PDFs + year: Number, + office: String, + // Seemingly non-standardized + districts: String, + // For general elections, party === "General" + party: Party, + special: Boolean, + // If this is missing, candidateUrls is deliberately [] + result: Optional(ElectionResult) +}) + +export type ElectionInfo = Static + +export function electionId(election: ElectionInfo): string { + return sha256( + `${election.office},${election.year},${election.special},${election.party},${election.districts}` + ) +} diff --git a/functions/src/legislators/elections.ts b/functions/src/legislators/elections.ts deleted file mode 100644 index 254b81308..000000000 --- a/functions/src/legislators/elections.ts +++ /dev/null @@ -1,262 +0,0 @@ -import { JSDOM } from 'jsdom' -import { Array as ArrayType, Runtype, Union, Literal, String, Boolean, Number, Optional, Static, Record } from 'runtypes' - -const officeIds = { - "President": 1, - "U.S. Senate": 6, - "U.S. House": 5, - "Governor": 3, - "Lieutenant Governor": 4, - "Attorney General": 12, - "Secretary of the Commonwealth": 45, - "Treasurer": 53, - "Auditor": 90, - "Governor's Council": 529, - "State Senate": 9, - "State Representative": 8, - "Party State Committee Man": 521, - "Party State Committee Woman": 522, - "Delegate to the National Convention": 543, - "Alternate Delegate to the National Convention": 544, - "District Attorney": 530, - "Clerk of Courts": 15, - "Clerk of Superior Court (Civil)": 534, - "Clerk of Superior Court (Criminal)": 535, - "Clerk of Supreme Judicial Court": 536, - "County Charter Commission": 532, - "Register of Deeds": 384, - "Sheriff": 386, - "County Treasurer": 389, - "Probate Judge": 434, - "Register of Probate": 537, - "Council of Governments Executive Committee": 531 -} as const -export const offices = Object.keys(officeIds) as (keyof typeof officeIds)[]; -export type Office = keyof typeof officeIds - -export const parties = [ - "General", - "American", - "Democratic", - "Green-Rainbow", - "Independent Voters", - "Libertarian", - "Republican", - "Working Families", - "United Independent Party", - "United Independent", - "Independent", - "Green", - "Workers Party" -] as const; -export type Party = (typeof parties)[number]; -export const Party = Union( - Literal(parties[0]), ...parties.slice(1).map(Literal) -); - -const stages = [ - "Primaries", - ...parties -] - -export const ElectionCandidate = Record({ - name: String, - writeIn: Boolean, - votes: Number, - percent: Number, - // Note: During a primary election, no candidate is assigned a party - party: Optional(String), -}); - -export type ElectionCandidate = Static; - -export const ElectionResult = Record({ - candidates: ArrayType(ElectionCandidate), - otherVotes: Number, - blankVotes: Number, - totalVotes: Number, - electionDetailsUrl: String, // If we want votes by town -}); - -export type ElectionStage = Static; - -export const ElectionStage = Record({ - party: Party, - special: Boolean, -}) - -export type ElectionResult = Static; - -export const ElectionInfo = Record({ - year: Number, - office: String, - districts: String, - stage: ElectionStage, - result: Optional(ElectionResult), -}); - -export type ElectionInfo = Static; - -const baseURL = 'https://electionstats.state.ma.us' - -function parseElectionStage(input: string): ElectionStage | null { - const escape = (s: string) => s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); - - const partyPattern = parties - .map(escape) - .sort((a, b) => b.length - a.length) - .join("|"); - - const regex = new RegExp( - `^(Special )?(${partyPattern}) (Primary|Election)$` - ); - - const match = input.match(regex); - if (!match) { - return null; - } - - const [, special, party, stage] = match; - - // Only "General Election" is valid. - if (stage === "Election" && party !== "General") { - return null; - } - - // Only non-General parties have primaries. - if (stage === "Primary" && party === "General") { - return null; - } - - return { - party: Party.check(party), - special: special !== undefined, - }; -} - -function parseElectionTable(table: Element): ElectionResult { - const candidateRows = table.querySelectorAll( - ':scope > tr:not(.non_candidate):not(.more_info)' - ); - - const candidates = Array.from(candidateRows).map(row => { - const name = row.querySelector('.candidate .name a')?.textContent?.trim() || ''; - - let affiliationText = - row.querySelector('.candidate .party')?.textContent?.trim() || ''; - - const writeIn = /\bwrite-in\b/i.test(affiliationText); - - let party; - affiliationText = affiliationText.replace(/\(write-in\)/i, '').replace(/unenrolled/i, '').trim() - if (affiliationText) { - party = affiliationText - } - - const voteText = row.querySelector('td:nth-child(2)')?.textContent?.replace(/,/g, '') - const percentText = row.querySelector('td:nth-child(3)')?.textContent?.trim()?.slice(0, -1) - if (!name || !voteText || !percentText) { - throw new Error(row.outerHTML) - } - - const candidate = { - name, - writeIn, - votes: parseInt(voteText, 10), - percent: parseFloat(percentText), - ...(party ? { party } : {}) - }; - - return candidate; - }); - - candidates.sort((a,b) => b.votes - a.votes) - - const getSummaryValue = (selector: string): number => { - const row = table.querySelector(selector); - - const text = row?.querySelector('td:nth-child(2)')?.textContent?.replace(/,/g, ''); - if (!text) { - return 0 - } - - return parseInt(text, 10); - }; - - const link = table.querySelector('tr.more_info a')?.href - if (!link) { - throw new Error("Link not present") - } - - return { - candidates, - otherVotes: getSummaryValue('tr.n_all_other_votes'), - blankVotes: getSummaryValue('tr.n_blank_votes'), - totalVotes: getSummaryValue('tr.n_total_votes'), - electionDetailsUrl: `${baseURL}${link}` - }; -} - -function info(dom: JSDOM): ElectionInfo[] { - const elements = Array.from(dom.window.document.querySelectorAll( - '[id^="election-id-"]' - )) - const info = elements.map(electionElem => { - const electTDs = Array.from(electionElem.children).filter(child => child.tagName === 'TD') - const yearText = electTDs[0].textContent - if (!yearText) { - throw new Error("eh") - } - const year = parseInt(yearText, 10) - const office = electTDs[1].textContent?.trim() - const districts = electTDs[2].textContent?.trim() - const stage = parseElectionStage(electTDs[3].textContent?.trim() ?? '') - if (!stage) { - throw new Error(`${stage} is not a recognized election stage`) - } - if (electTDs[4].querySelector(':scope > .no_candidates')) { - return ElectionInfo.check({ - year, - office, - districts, - stage - }) - } - const candidateTable = electTDs[4].querySelector(':scope tbody') - if (!candidateTable) { - throw new Error("Election results expects table or no candidates") - } - const result = parseElectionTable(candidateTable) - - return ElectionInfo.check({ - year, - office, - districts, - stage, - result - }) - }) - return info -} - -export async function fetchElectionData( - startYear: number, - endYear: number, - office?: Office, - stage: null | string = "General" -): Promise { - if (stage !== null && !stages.includes(stage)) { - throw new Error("Unrecognized election stage") - } - const officeId = office ? `/office_id:${officeIds[office]}` : '' - const electionStage = stage ? `/stage:${stage}` : '' - const url = `${baseURL}/elections/search/year_from:${startYear}/year_to:${endYear}${officeId}${electionStage}` - const dom = new JSDOM(await (await fetch(url)).text()) - return info(dom) -} - -import { writeFileSync } from 'fs' - -(async () => { - writeFileSync('elections.json', JSON.stringify(await fetchElectionData(2022, 2023), undefined, 2)) -})() \ No newline at end of file diff --git a/functions/src/legislators/index.ts b/functions/src/legislators/index.ts new file mode 100644 index 000000000..7cd688a9a --- /dev/null +++ b/functions/src/legislators/index.ts @@ -0,0 +1 @@ +export { scrapeElections } from "./ElectionScraper" diff --git a/functions/src/legislators/scrapeElections.ts b/functions/src/legislators/scrapeElections.ts new file mode 100644 index 000000000..8309bd32c --- /dev/null +++ b/functions/src/legislators/scrapeElections.ts @@ -0,0 +1,331 @@ +import { JSDOM, VirtualConsole } from "jsdom" +import { + ElectionStage, + parties, + Party, + ElectionInfo, + StageSelection, + ElectionResult, + ElectionCandidate, + Office, + officeIds +} from "./electionTypes" + +const baseURL = "https://electionstats.state.ma.us" + +function parsePartyString(affiliationText: string): { + writeIn: boolean + party?: string +} { + const writeIn = /\bwrite-in\b/i.test(affiliationText) + + let party + affiliationText = affiliationText + .replace(/\(?write-in\)?/i, "") + .replace(/unenrolled/i, "") + .trim() + if (affiliationText) { + party = affiliationText + } + + return { + writeIn, + ...(party ? { party } : {}) + } +} + +function precinctHeaderText(th: Element | undefined): string | undefined { + const a = th?.querySelector("a[title]") ?? th?.querySelector("a[oldtitle]") + return ( + a?.getAttribute("title") ?? + a?.getAttribute("oldtitle") ?? + th?.textContent?.trim() + ) +} + +async function fetchElectionData( + url: string +): Promise<[ElectionResult, string[]]> { + const text = await (await fetch(url)).text() + + const virtualConsole = new VirtualConsole() + virtualConsole.on("jsdomError", error => { + if (error.message.includes("Could not parse CSS stylesheet")) { + return + } + console.error(error) + }) + + const dom = new JSDOM(text, { virtualConsole }) + const document = dom.window.document + + const table = document.querySelector("table.precinct_data") + if (!table) { + throw new Error(`No result table in ${url}`) + } + const headers = Array.from(table.querySelectorAll("thead tr th")).map( + th => precinctHeaderText(th) ?? "" + ) + const totalRow = table.querySelector("tbody tr.total") + if (!totalRow) { + throw new Error(`${url} has no table row for 'total'`) + } + const cells = Array.from(totalRow.querySelectorAll("td")) + const values = new Map() + // Avoid leftward descriptive titles + headers.reverse() + cells.reverse() + headers.forEach((header, i) => { + const text = cells[i].textContent?.replace(/,/g, "").trim() + if (text && /^\d+$/.test(text)) { + values.set(header, parseInt(text)) + } + }) + + const candidates = Array.from( + document.querySelectorAll(".candidate_key .item") + ).map(item => { + const nameElem = item.querySelector(".display_name a") + const name = nameElem?.textContent?.trim() + const votes = values.get(name ?? "") + if (!nameElem || !name || !nameElem.href || !votes) { + throw new Error( + `${item.outerHTML} does not have one of ".display_name a", name, or votes (from ${values})` + ) + } + return { + name, + party: parsePartyString( + item.querySelector(".party")?.textContent?.trim() ?? "" + ), + votes, + candidateUrl: `${baseURL}${nameElem.href}` + } + }) + + candidates.sort((a, b) => b.votes - a.votes) + + const candidateVotes = candidates.map(candidate => { + return { + name: candidate.name, + votes: candidate.votes, + ...candidate.party + } + }) + + const noPreference = values.has("No Preference") + ? { noPreferenceVotes: values.get("No Preference") } + : {} + + const [otherVotes, blankVotes, totalVotes] = [ + values.get("All Others"), + values.get("Blanks"), + values.get("Total Votes Cast") + ] + if (!totalVotes) { + throw new Error(`${url} has no 'Total' column`) + } + return [ + { + candidates: candidateVotes, + otherVotes: otherVotes ?? 0, + blankVotes: blankVotes ?? 0, + totalVotes, + electionDetailsUrl: url, + ...noPreference + }, + candidates.map(candidate => candidate.candidateUrl) + ] +} + +function parseElectionStage(input: string): ElectionStage | null { + const escape = (s: string) => s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&") + + const partyPattern = parties + .map(escape) + .sort((a, b) => b.length - a.length) + .join("|") + + const regex = new RegExp(`^(Special )?(${partyPattern}) (Primary|Election)$`) + + const match = input.match(regex) + if (!match) { + return null + } + + const [, special, party, stage] = match + + // Only "General Election" is valid. + if (stage === "Election" && party !== "General") { + return null + } + + // Only non-General parties have primaries. + if (stage === "Primary" && party === "General") { + return null + } + + return { + party: Party.check(party), + special: special !== undefined + } +} + +async function parseElectionTable( + table: Element +): Promise<[ElectionResult, string[]]> { + if (table.querySelector(".other-candidates")) { + // The search page does not include all details on this election; secondary fetch + const link = table.querySelector("tr.more_info a")?.href + if (!link) { + throw new Error(`${table.outerHTML} has no 'more info' link`) + } + const electionDetailsUrl = `${baseURL}${link}` + return await fetchElectionData(electionDetailsUrl) + } + const candidateRows = table.querySelectorAll( + ":scope > tr:not(.non_candidate):not(.more_info)" + ) + + const candidates = Array.from(candidateRows).map(row => { + const nameElem = row.querySelector(".candidate .name a") + const name = nameElem?.textContent?.trim() || "" + const partyText = + row.querySelector(".candidate .party")?.textContent?.trim() || "" + const link = nameElem?.href + + const voteText = row + .querySelector("td:nth-child(2)") + ?.textContent?.replace(/,/g, "") + if (!name || !voteText || !link) { + throw new Error( + `One of name, voteText, or candidate link is missing from ${row.outerHTML}` + ) + } + + const candidate = { + name, + votes: parseInt(voteText, 10), + ...parsePartyString(partyText) + } + + const ret: [ElectionCandidate, string] = [candidate, `${baseURL}${link}`] + return ret + }) + + candidates.sort((a, b) => b[0].votes - a[0].votes) + + const getSummaryValue = (selector: string): number | null => { + const row = table.querySelector(selector) + + const text = row + ?.querySelector("td:nth-child(2)") + ?.textContent?.replace(/,/g, "") + if (!text) { + return null + } + + return parseInt(text, 10) + } + + const link = table.querySelector("tr.more_info a")?.href + if (!link) { + throw new Error(`More info link missing from ${table.outerHTML}`) + } + + const [otherVotes, blankVotes, totalVotes] = [ + getSummaryValue("tr.n_all_other_votes"), + getSummaryValue("tr.n_blank_votes"), + getSummaryValue("tr.n_total_votes") + ] + const noPreference = getSummaryValue("tr.n_no_preference_votes") + if (!totalVotes) { + throw new Error(`No total votes row in ${table.outerHTML}`) + } + return [ + { + candidates: candidates.map(item => item[0]), + otherVotes: otherVotes ?? 0, + blankVotes: blankVotes ?? 0, + totalVotes, + electionDetailsUrl: `${baseURL}${link}`, + ...(noPreference ? { noPreferenceVotes: noPreference } : {}) + }, + candidates.map(item => item[1]) + ] +} + +async function electionsPageInfo(dom: JSDOM): Promise { + const elements = Array.from( + dom.window.document.querySelectorAll('[id^="election-id-"]') + ) + const info = elements.map(async electionElem => { + const electTDs = Array.from(electionElem.children).filter( + child => child.tagName === "TD" + ) + const yearText = electTDs[0].textContent + if (!yearText) { + throw new Error(`Year not present in ${electionElem.outerHTML}`) + } + const year = parseInt(yearText, 10) + const office = electTDs[1].textContent?.trim() + const districts = electTDs[2].textContent?.trim() + if (!year || !office || !districts) { + throw new Error( + `Year, office, or districts not present in ${electionElem.outerHTML}` + ) + } + const stage = parseElectionStage(electTDs[3].textContent?.trim() ?? "") + if (!stage) { + throw new Error( + `${stage} is not a recognized election stage: ${electTDs[3].outerHTML}` + ) + } + if (electTDs[4].querySelector(":scope > .no_candidates")) { + return ElectionInfo.check({ + year, + office, + districts, + candidateUrls: [], + ...stage + }) + } + const candidateTable = electTDs[4].querySelector(":scope tbody") + if (!candidateTable) { + throw new Error(`No candidate table in ${electionElem.outerHTML}`) + } + const [result, candidateUrls] = await parseElectionTable(candidateTable) + + return ElectionInfo.check({ + year, + office, + districts, + ...stage, + candidateUrls, + result + }) + }) + return Promise.all(info) +} + +export async function fetchElectionsData( + startYear: number, + endYear: number, + office?: Office, + stage: StageSelection | null = "General" +): Promise { + const officeId = office ? `/office_id:${officeIds[office]}` : "" + const electionStage = stage ? `/stage:${stage}` : "" + const url = `${baseURL}/elections/search/year_from:${startYear}/year_to:${endYear}${officeId}${electionStage}` + const page = await fetch(url) + const text = await page.text() + const virtualConsole = new VirtualConsole() + virtualConsole.on("jsdomError", error => { + if (error.message.includes("Could not parse CSS stylesheet")) { + return + } + console.error(error) + }) + const dom = new JSDOM(text, { virtualConsole }) + return electionsPageInfo(dom) +} diff --git a/scripts/firebase-admin/backfillElections.ts b/scripts/firebase-admin/backfillElections.ts new file mode 100644 index 000000000..45ce63b18 --- /dev/null +++ b/scripts/firebase-admin/backfillElections.ts @@ -0,0 +1,22 @@ +import { Record, Number } from "runtypes" +import { Script } from "./types" +import { fetchElectionsData } from "functions/src/legislators/scrapeElections" +import { electionId } from "functions/src/legislators/electionTypes" + +const Args = Record({ + startYear: Number +}) + +export const script: Script = async ({ db, args }) => { + const { startYear } = Args.check(args) + const currentYear = new Date().getFullYear() + const writer = db.bulkWriter() + for (let year = startYear; year <= currentYear; year++) { + const data = await fetchElectionsData(year, year) + for (const item of data) { + const id = electionId(item) + writer.set(db.doc(`/electionResults/${id}`), item, { merge: true }) + } + } + await writer.close() +} From b48fa4559aa522272bc24f6b0d32a63af15ba54f Mon Sep 17 00:00:00 2001 From: Sam DeMarrais Date: Sat, 27 Jun 2026 23:01:43 -0400 Subject: [PATCH 3/4] More robust elections --- functions/src/legislators/scrapeElections.ts | 83 +++++++++++--------- 1 file changed, 44 insertions(+), 39 deletions(-) diff --git a/functions/src/legislators/scrapeElections.ts b/functions/src/legislators/scrapeElections.ts index 8309bd32c..fe7da8e63 100644 --- a/functions/src/legislators/scrapeElections.ts +++ b/functions/src/legislators/scrapeElections.ts @@ -255,55 +255,60 @@ async function parseElectionTable( ] } -async function electionsPageInfo(dom: JSDOM): Promise { +async function electionsPageInfo(dom: JSDOM): Promise<(ElectionInfo | null)[]> { const elements = Array.from( dom.window.document.querySelectorAll('[id^="election-id-"]') ) const info = elements.map(async electionElem => { - const electTDs = Array.from(electionElem.children).filter( - child => child.tagName === "TD" - ) - const yearText = electTDs[0].textContent - if (!yearText) { - throw new Error(`Year not present in ${electionElem.outerHTML}`) - } - const year = parseInt(yearText, 10) - const office = electTDs[1].textContent?.trim() - const districts = electTDs[2].textContent?.trim() - if (!year || !office || !districts) { - throw new Error( - `Year, office, or districts not present in ${electionElem.outerHTML}` - ) - } - const stage = parseElectionStage(electTDs[3].textContent?.trim() ?? "") - if (!stage) { - throw new Error( - `${stage} is not a recognized election stage: ${electTDs[3].outerHTML}` + try { + const electTDs = Array.from(electionElem.children).filter( + child => child.tagName === "TD" ) - } - if (electTDs[4].querySelector(":scope > .no_candidates")) { + const yearText = electTDs[0].textContent + if (!yearText) { + throw new Error(`Year not present in ${electionElem.outerHTML}`) + } + const year = parseInt(yearText, 10) + const office = electTDs[1].textContent?.trim() + const districts = electTDs[2].textContent?.trim() + if (!year || !office || !districts) { + throw new Error( + `Year, office, or districts not present in ${electionElem.outerHTML}` + ) + } + const stage = parseElectionStage(electTDs[3].textContent?.trim() ?? "") + if (!stage) { + throw new Error( + `${stage} is not a recognized election stage: ${electTDs[3].outerHTML}` + ) + } + if (electTDs[4].querySelector(":scope > .no_candidates")) { + return ElectionInfo.check({ + year, + office, + districts, + candidateUrls: [], + ...stage + }) + } + const candidateTable = electTDs[4].querySelector(":scope tbody") + if (!candidateTable) { + throw new Error(`No candidate table in ${electionElem.outerHTML}`) + } + const [result, candidateUrls] = await parseElectionTable(candidateTable) + return ElectionInfo.check({ year, office, districts, - candidateUrls: [], - ...stage + ...stage, + candidateUrls, + result }) + } catch (error) { + console.error(error) + return null } - const candidateTable = electTDs[4].querySelector(":scope tbody") - if (!candidateTable) { - throw new Error(`No candidate table in ${electionElem.outerHTML}`) - } - const [result, candidateUrls] = await parseElectionTable(candidateTable) - - return ElectionInfo.check({ - year, - office, - districts, - ...stage, - candidateUrls, - result - }) }) return Promise.all(info) } @@ -327,5 +332,5 @@ export async function fetchElectionsData( console.error(error) }) const dom = new JSDOM(text, { virtualConsole }) - return electionsPageInfo(dom) + return (await electionsPageInfo(dom)).filter((item): item is ElectionInfo => item !== null) } From 99af751c38caf92613e2de67b830eee04628f14d Mon Sep 17 00:00:00 2001 From: Sam DeMarrais Date: Sat, 27 Jun 2026 23:18:33 -0400 Subject: [PATCH 4/4] Prettier --- functions/src/legislators/scrapeElections.ts | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/functions/src/legislators/scrapeElections.ts b/functions/src/legislators/scrapeElections.ts index fe7da8e63..be4010561 100644 --- a/functions/src/legislators/scrapeElections.ts +++ b/functions/src/legislators/scrapeElections.ts @@ -332,5 +332,7 @@ export async function fetchElectionsData( console.error(error) }) const dom = new JSDOM(text, { virtualConsole }) - return (await electionsPageInfo(dom)).filter((item): item is ElectionInfo => item !== null) + return (await electionsPageInfo(dom)).filter( + (item): item is ElectionInfo => item !== null + ) }