diff --git a/functions/src/index.ts b/functions/src/index.ts
index 641255bf4..fc9a911ed 100644
--- a/functions/src/index.ts
+++ b/functions/src/index.ts
@@ -57,6 +57,7 @@ export {
   unfollowUser,
   getFollowers
 } from "./subscriptions"
+export { scrapeElections } from "./legislators"
 export { transcription } from "./webhooks"


diff --git a/functions/src/legislators/ElectionScraper.ts b/functions/src/legislators/ElectionScraper.ts
new file mode 100644
index 000000000..9adcbbaf8
--- /dev/null
+++ b/functions/src/legislators/ElectionScraper.ts
@@ -0,0 +1,69 @@
+import { runWith, RuntimeOptions } from "firebase-functions"
+import { db } from "../firebase"
+import { electionId } from "./electionTypes"
+import { fetchElectionsData } from "./scrapeElections"
+
+/**
+ * Scheduled job that scrapes recent election results and upserts each one
+ * into `/electionResults/{electionId}`.
+ */
+export class ElectionScraper {
+  private readonly schedule: string
+  private readonly timeout: number
+  private readonly memory: RuntimeOptions["memory"]
+
+  /**
+   * @param schedule Pub/Sub schedule expression for the job.
+   * @param timeout Function timeout, in seconds.
+   * @param memory Memory allocated to the function.
+   */
+  constructor(
+    schedule: string = "every 24 hours",
+    timeout: number = 480,
+    memory: RuntimeOptions["memory"] = "256MB"
+  ) {
+    this.schedule = schedule
+    this.timeout = timeout
+    this.memory = memory
+  }
+
+  /** Builds the scheduled function, capped at one running instance. */
+  get function() {
+    return runWith({
+      timeoutSeconds: this.timeout,
+      memory: this.memory,
+      maxInstances: 1
+    })
+      .pubsub.schedule(this.schedule)
+      .onRun(() => this.run())
+  }
+
+  /**
+   * Scrapes elections for `yearFrom`..`yearTo` and writes them to Firestore.
+   *
+   * `yearTo` defaults to the current year. `yearFrom` defaults to the previous
+   * year while `getMonth()` (zero-based) is below 6, i.e. January-June, and to
+   * `yearTo` otherwise.
+   */
+  private async run(yearTo?: number, yearFrom?: number) {
+    const date = new Date()
+    yearTo = yearTo ?? date.getFullYear()
+    yearFrom = yearFrom ?? (date.getMonth() < 6 ? yearTo - 1 : yearTo)
+
+    const list = await fetchElectionsData(yearFrom, yearTo)
+
+    if (!list) return
+
+    const writer = db.bulkWriter()
+
+    for (const item of list) {
+      const id = electionId(item)
+      // Merge so fields already stored on the document are kept.
+      writer.set(db.doc(`/electionResults/${id}`), item, { merge: true })
+    }
+
+    await writer.close()
+  }
+}
+
+export const scrapeElections = new ElectionScraper().function
diff --git a/functions/src/legislators/electionTypes.ts b/functions/src/legislators/electionTypes.ts
new file mode 100644
index 000000000..9dd1ebfc2
--- /dev/null
+++ b/functions/src/legislators/electionTypes.ts
@@ -0,0 +1,134 @@
+import { sha256 } from "js-sha256"
+import {
+  Array,
+  Union,
+  Literal,
+  String,
+  Boolean,
+  Number,
+  Optional,
+  Static,
+  Record
+} from "runtypes"
+
+/** Office names mapped to the numeric ids used in `office_id:` search filters. */
+export const officeIds = {
+  President: 1,
+  "U.S. Senate": 6,
+  "U.S. House": 5,
+  Governor: 3,
+  "Lieutenant Governor": 4,
+  "Attorney General": 12,
+  "Secretary of the Commonwealth": 45,
+  Treasurer: 53,
+  Auditor: 90,
+  "Governor's Council": 529,
+  "State Senate": 9,
+  "State Representative": 8,
+  "Party State Committee Man": 521,
+  "Party State Committee Woman": 522,
+  "Delegate to the National Convention": 543,
+  "Alternate Delegate to the National Convention": 544,
+  "District Attorney": 530,
+  "Clerk of Courts": 15,
+  "Clerk of Superior Court (Civil)": 534,
+  "Clerk of Superior Court (Criminal)": 535,
+  "Clerk of Supreme Judicial Court": 536,
+  "County Charter Commission": 532,
+  "Register of Deeds": 384,
+  Sheriff: 386,
+  "County Treasurer": 389,
+  "Probate Judge": 434,
+  "Register of Probate": 537,
+  "Council of Governments Executive Committee": 531
+} as const
+export const offices = Object.keys(officeIds) as (keyof typeof officeIds)[]
+export type Office = keyof typeof officeIds
+
+/** Party labels accepted for an election; "General" marks a general election. */
+export const parties = [
+  "General",
+  "American",
+  "Democratic",
+  "Green-rainbow", // Green-rainbow has the case Green-Rainbow in some scenarios
+  "Independent Voters",
+  "Libertarian",
+  "Republican",
+  "Working Families",
+  "United Independent Party",
+  "United Independent",
+  "Independent",
+  "Green",
+  "Workers Party"
+] as const
+export type Party = (typeof parties)[number]
+export const Party = Union(
+  Literal(parties[0]),
+  ...parties.slice(1).map(Literal)
+)
+
+/** Values accepted by the `stage:` search filter. */
+export const stages = ["Primaries", ...parties]
+export type StageSelection = (typeof stages)[number]
+export const StageSelection = Union(
+  Literal(stages[0]),
+  ...stages.slice(1).map(Literal)
+)
+
+/** One candidate's line in an election result. */
+export const ElectionCandidate = Record({
+  name: String,
+  writeIn: Boolean,
+  votes: Number,
+  // Note: During a primary election, no candidate is assigned a party
+  party: Optional(String)
+})
+export type ElectionCandidate = Static<typeof ElectionCandidate>
+
+/** Vote totals for one election. */
+export const ElectionResult = Record({
+  candidates: Array(ElectionCandidate),
+  otherVotes: Number,
+  blankVotes: Number,
+  noPreferenceVotes: Number.optional(),
+  totalVotes: Number,
+  electionDetailsUrl: String // Can also provide votes by town/ward
+})
+export type ElectionResult = Static<typeof ElectionResult>
+
+/** Party and special-election flag parsed from an election's stage label. */
+export const ElectionStage = Record({
+  party: Party,
+  special: Boolean
+})
+export type ElectionStage = Static<typeof ElectionStage>
+
+/** One election as stored in `/electionResults`. */
+export const ElectionInfo = Record({
+  // Aligned with Candidates[], for use with Firestore array-contains
+  // More specific than name; for example, a dual election (such as for president/vice president)
+  // has a name "Harris and Walz", but the link is to the page for Kamala Harris
+  candidateUrls: Array(String),
+  // As far as I can tell, the only place exact date is shown
+  // is the search menu and PDFs
+  year: Number,
+  office: String,
+  // Seemingly non-standardized
+  districts: String,
+  // For general elections, party === "General"
+  party: Party,
+  special: Boolean,
+  // If this is missing, candidateUrls is deliberately []
+  result: Optional(ElectionResult)
+})
+export type ElectionInfo = Static<typeof ElectionInfo>
+
+/**
+ * Deterministic document id for an election: the SHA-256 hex digest of its
+ * office, year, special flag, party and districts.
+ */
+export function electionId(election: ElectionInfo): string {
+  return sha256(
+    `${election.office},${election.year},${election.special},${election.party},${election.districts}`
+  )
+}
diff --git a/functions/src/legislators/index.ts b/functions/src/legislators/index.ts
new file mode 100644
index 000000000..7cd688a9a
--- /dev/null
+++ b/functions/src/legislators/index.ts
@@ -0,0 +1 @@
+export { scrapeElections } from "./ElectionScraper"
diff --git a/functions/src/legislators/scrapeElections.ts b/functions/src/legislators/scrapeElections.ts
new file mode 100644
index 000000000..be4010561
--- /dev/null
+++ b/functions/src/legislators/scrapeElections.ts
@@ -0,0 +1,398 @@
+import { JSDOM, VirtualConsole } from "jsdom"
+import {
+  ElectionStage,
+  parties,
+  Party,
+  ElectionInfo,
+  StageSelection,
+  ElectionResult,
+  ElectionCandidate,
+  Office,
+  officeIds
+} from "./electionTypes"
+
+const baseURL = "https://electionstats.state.ma.us"
+
+// "<party> Primary" or "General Election", optionally prefixed with "Special ".
+const stagePattern = /^(Special )?(.+) (Primary|Election)$/
+
+/**
+ * Fetches `url` and parses the response body with jsdom. jsdom's
+ * "Could not parse CSS stylesheet" errors are dropped; others are logged.
+ *
+ * @throws Error when the response status is not OK.
+ */
+async function fetchDom(url: string): Promise<JSDOM> {
+  const response = await fetch(url)
+  if (!response.ok) {
+    throw new Error(`Request for ${url} failed with status ${response.status}`)
+  }
+  const text = await response.text()
+
+  const virtualConsole = new VirtualConsole()
+  virtualConsole.on("jsdomError", error => {
+    if (error.message.includes("Could not parse CSS stylesheet")) {
+      return
+    }
+    console.error(error)
+  })
+
+  return new JSDOM(text, { virtualConsole })
+}
+
+/**
+ * Splits a candidate's affiliation text into a write-in flag and a party.
+ * "write-in" (optionally parenthesized) and "unenrolled" are stripped, and
+ * `party` is omitted when nothing else remains.
+ */
+function parsePartyString(affiliationText: string): {
+  writeIn: boolean
+  party?: string
+} {
+  const writeIn = /\bwrite-in\b/i.test(affiliationText)
+  const party = affiliationText
+    .replace(/\(?write-in\)?/i, "")
+    .replace(/unenrolled/i, "")
+    .trim()
+
+  return {
+    writeIn,
+    ...(party ? { party } : {})
+  }
+}
+
+/**
+ * Label for a results-table header cell: the `title` (or `oldtitle`) of its
+ * link when there is one, otherwise the cell's trimmed text.
+ */
+function precinctHeaderText(th: Element | undefined): string | undefined {
+  const a = th?.querySelector("a[title]") ?? th?.querySelector("a[oldtitle]")
+  return (
+    a?.getAttribute("title") ??
+    a?.getAttribute("oldtitle") ??
+    th?.textContent?.trim()
+  )
+}
+
+/**
+ * Scrapes one election's detail page.
+ *
+ * @returns The result plus the candidates' page URLs; both lists are ordered
+ *   by descending vote count, so index i refers to the same candidate.
+ * @throws Error when the page lacks the results table, its totals row, a
+ *   candidate's link or vote count, or the "Total Votes Cast" value.
+ */
+async function fetchElectionData(
+  url: string
+): Promise<[ElectionResult, string[]]> {
+  const dom = await fetchDom(url)
+  const document = dom.window.document
+
+  const table = document.querySelector("table.precinct_data")
+  if (!table) {
+    throw new Error(`No result table in ${url}`)
+  }
+  const headers = Array.from(table.querySelectorAll("thead tr th")).map(
+    th => precinctHeaderText(th) ?? ""
+  )
+  const totalRow = table.querySelector("tbody tr.total")
+  if (!totalRow) {
+    throw new Error(`${url} has no table row for 'total'`)
+  }
+  const cells = Array.from(totalRow.querySelectorAll("td"))
+  const values = new Map<string, number>()
+  // Avoid leftward descriptive titles: pair headers and cells from the right.
+  headers.reverse()
+  cells.reverse()
+  headers.forEach((header, i) => {
+    // `cells` can be shorter than `headers`; skip headers without a cell.
+    const text = cells[i]?.textContent?.replace(/,/g, "").trim()
+    if (text && /^\d+$/.test(text)) {
+      values.set(header, parseInt(text, 10))
+    }
+  })
+
+  const candidates = Array.from(
+    document.querySelectorAll(".candidate_key .item")
+  ).map(item => {
+    const nameElem = item.querySelector<HTMLAnchorElement>(".display_name a")
+    const name = nameElem?.textContent?.trim()
+    const votes = values.get(name ?? "")
+    // Compare against undefined: a candidate with zero votes is still valid.
+    if (!nameElem || !name || !nameElem.href || votes === undefined) {
+      const columns = Array.from(values.keys()).join(", ")
+      throw new Error(
+        `${item.outerHTML} does not have one of ".display_name a", name, or votes (from ${columns})`
+      )
+    }
+    return {
+      name,
+      party: parsePartyString(
+        item.querySelector(".party")?.textContent?.trim() ?? ""
+      ),
+      votes,
+      candidateUrl: `${baseURL}${nameElem.href}`
+    }
+  })
+
+  candidates.sort((a, b) => b.votes - a.votes)
+
+  const candidateVotes = candidates.map(candidate => {
+    return {
+      name: candidate.name,
+      votes: candidate.votes,
+      ...candidate.party
+    }
+  })
+
+  const noPreference = values.has("No Preference")
+    ? { noPreferenceVotes: values.get("No Preference") }
+    : {}
+
+  const [otherVotes, blankVotes, totalVotes] = [
+    values.get("All Others"),
+    values.get("Blanks"),
+    values.get("Total Votes Cast")
+  ]
+  if (totalVotes === undefined) {
+    throw new Error(`${url} has no 'Total' column`)
+  }
+  return [
+    {
+      candidates: candidateVotes,
+      otherVotes: otherVotes ?? 0,
+      blankVotes: blankVotes ?? 0,
+      totalVotes,
+      electionDetailsUrl: url,
+      ...noPreference
+    },
+    candidates.map(candidate => candidate.candidateUrl)
+  ]
+}
+
+/**
+ * Parses a stage label such as "Special Democratic Primary" or
+ * "General Election". Party names are matched case-insensitively and returned
+ * in the capitalization used by `parties`.
+ *
+ * @returns null when the label has another shape, names an unknown party, or
+ *   pairs "Election" with a party other than "General" (or "Primary" with
+ *   "General").
+ */
+function parseElectionStage(input: string): ElectionStage | null {
+  const match = input.match(stagePattern)
+  if (!match) {
+    return null
+  }
+  const [, special, partyText, stage] = match
+
+  const wanted = partyText.toLowerCase()
+  const party = parties.find(known => known.toLowerCase() === wanted)
+  if (!party) {
+    return null
+  }
+
+  // Only "General Election" is valid.
+  if (stage === "Election" && party !== "General") {
+    return null
+  }
+
+  // Only non-General parties have primaries.
+  if (stage === "Primary" && party === "General") {
+    return null
+  }
+
+  return {
+    party: Party.check(party),
+    special: special !== undefined
+  }
+}
+
+/**
+ * Reads an election's result from its candidate table on the search page. When
+ * the table carries an `.other-candidates` element the search page is
+ * incomplete, so the election's detail page is fetched instead.
+ *
+ * @returns The result plus the candidates' page URLs, both ordered by
+ *   descending vote count.
+ * @throws Error when the "more info" link, a candidate's name, link or vote
+ *   count, or the total-votes row is missing.
+ */
+async function parseElectionTable(
+  table: Element
+): Promise<[ElectionResult, string[]]> {
+  const link = table.querySelector<HTMLAnchorElement>("tr.more_info a")?.href
+  if (!link) {
+    throw new Error(`More info link missing from ${table.outerHTML}`)
+  }
+  const electionDetailsUrl = `${baseURL}${link}`
+
+  if (table.querySelector(".other-candidates")) {
+    // The search page does not include all details on this election; secondary fetch
+    return await fetchElectionData(electionDetailsUrl)
+  }
+
+  const candidateRows = table.querySelectorAll(
+    ":scope > tr:not(.non_candidate):not(.more_info)"
+  )
+
+  const candidates = Array.from(candidateRows).map(row => {
+    const nameElem = row.querySelector<HTMLAnchorElement>(".candidate .name a")
+    const name = nameElem?.textContent?.trim() || ""
+    const partyText =
+      row.querySelector(".candidate .party")?.textContent?.trim() || ""
+    const candidateLink = nameElem?.href
+
+    const voteText = row
+      .querySelector("td:nth-child(2)")
+      ?.textContent?.replace(/,/g, "")
+    const votes = parseInt(voteText ?? "", 10)
+    // Reject missing or non-numeric vote text rather than storing NaN.
+    if (!name || Number.isNaN(votes) || !candidateLink) {
+      throw new Error(
+        `One of name, voteText, or candidate link is missing from ${row.outerHTML}`
+      )
+    }
+
+    const candidate = {
+      name,
+      votes,
+      ...parsePartyString(partyText)
+    }
+
+    const ret: [ElectionCandidate, string] = [
+      candidate,
+      `${baseURL}${candidateLink}`
+    ]
+    return ret
+  })
+
+  candidates.sort((a, b) => b[0].votes - a[0].votes)
+
+  // Parses the count in the second cell of a summary row; null when the row
+  // is absent or its text is not a number.
+  const getSummaryValue = (selector: string): number | null => {
+    const text = table
+      .querySelector(selector)
+      ?.querySelector("td:nth-child(2)")
+      ?.textContent?.replace(/,/g, "")
+    const value = parseInt(text ?? "", 10)
+    return Number.isNaN(value) ? null : value
+  }
+
+  const [otherVotes, blankVotes, totalVotes] = [
+    getSummaryValue("tr.n_all_other_votes"),
+    getSummaryValue("tr.n_blank_votes"),
+    getSummaryValue("tr.n_total_votes")
+  ]
+  const noPreference = getSummaryValue("tr.n_no_preference_votes")
+  if (totalVotes === null) {
+    throw new Error(`No total votes row in ${table.outerHTML}`)
+  }
+  return [
+    {
+      candidates: candidates.map(item => item[0]),
+      otherVotes: otherVotes ?? 0,
+      blankVotes: blankVotes ?? 0,
+      totalVotes,
+      electionDetailsUrl,
+      ...(noPreference ? { noPreferenceVotes: noPreference } : {})
+    },
+    candidates.map(item => item[1])
+  ]
+}
+
+/**
+ * Extracts every election row (`[id^="election-id-"]`) from a search-results
+ * page. Rows are processed concurrently; a row that cannot be parsed is
+ * logged and yields null instead of failing the whole page.
+ */
+async function electionsPageInfo(dom: JSDOM): Promise<(ElectionInfo | null)[]> {
+  const elements = Array.from(
+    dom.window.document.querySelectorAll('[id^="election-id-"]')
+  )
+  const info = elements.map(async electionElem => {
+    try {
+      const electTDs = Array.from(electionElem.children).filter(
+        child => child.tagName === "TD"
+      )
+      if (electTDs.length < 5) {
+        throw new Error(
+          `Expected at least 5 cells in ${electionElem.outerHTML}`
+        )
+      }
+      const yearText = electTDs[0].textContent
+      if (!yearText) {
+        throw new Error(`Year not present in ${electionElem.outerHTML}`)
+      }
+      const year = parseInt(yearText, 10)
+      const office = electTDs[1].textContent?.trim()
+      const districts = electTDs[2].textContent?.trim()
+      if (!year || !office || !districts) {
+        throw new Error(
+          `Year, office, or districts not present in ${electionElem.outerHTML}`
+        )
+      }
+      const stageText = electTDs[3].textContent?.trim() ?? ""
+      const stage = parseElectionStage(stageText)
+      if (!stage) {
+        // Report the text that failed to parse; `stage` itself is null here.
+        throw new Error(
+          `"${stageText}" is not a recognized election stage: ${electTDs[3].outerHTML}`
+        )
+      }
+      if (electTDs[4].querySelector(":scope > .no_candidates")) {
+        return ElectionInfo.check({
+          year,
+          office,
+          districts,
+          candidateUrls: [],
+          ...stage
+        })
+      }
+      const candidateTable = electTDs[4].querySelector(":scope tbody")
+      if (!candidateTable) {
+        throw new Error(`No candidate table in ${electionElem.outerHTML}`)
+      }
+      const [result, candidateUrls] = await parseElectionTable(candidateTable)
+
+      return ElectionInfo.check({
+        year,
+        office,
+        districts,
+        ...stage,
+        candidateUrls,
+        result
+      })
+    } catch (error) {
+      console.error(error)
+      return null
+    }
+  })
+  return Promise.all(info)
+}
+
+/**
+ * Searches electionstats.state.ma.us for elections in the
+ * `startYear`..`endYear` range and scrapes each one found.
+ *
+ * @param office Restricts the search to one office when given.
+ * @param stage Stage filter; pass null to omit the filter from the search.
+ * @returns The elections that parsed successfully; failures are logged and
+ *   omitted.
+ * @throws Error when the search request does not return an OK status.
+ */
+export async function fetchElectionsData(
+  startYear: number,
+  endYear: number,
+  office?: Office,
+  stage: StageSelection | null = "General"
+): Promise<ElectionInfo[]> {
+  const officeId = office ? `/office_id:${officeIds[office]}` : ""
+  const electionStage = stage ? `/stage:${stage}` : ""
+  const url = `${baseURL}/elections/search/year_from:${startYear}/year_to:${endYear}${officeId}${electionStage}`
+  const dom = await fetchDom(url)
+  return (await electionsPageInfo(dom)).filter(
+    (item): item is ElectionInfo => item !== null
+  )
+}
diff --git a/scripts/firebase-admin/backfillElections.ts b/scripts/firebase-admin/backfillElections.ts
new file mode 100644
index 000000000..45ce63b18
--- /dev/null
+++ b/scripts/firebase-admin/backfillElections.ts
@@ -0,0 +1,22 @@
+import { Record, Number } from "runtypes"
+import { Script } from "./types"
+import { fetchElectionsData } from "functions/src/legislators/scrapeElections"
+import { electionId } from "functions/src/legislators/electionTypes"
+
+const Args = Record({ startYear: Number })
+
+/** Backfills `/electionResults`, scraping one year at a time from `startYear` to now. */
+export const script: Script = async ({ db, args }) => {
+  const firstYear = Args.check(args).startYear
+  const lastYear = new Date().getFullYear()
+  const bulk = db.bulkWriter()
+  for (let year = firstYear; year <= lastYear; year++) {
+    const elections = await fetchElectionsData(year, year)
+    elections.forEach(election => {
+      // Merge so fields already stored on a document are kept.
+      const ref = db.doc(`/electionResults/${electionId(election)}`)
+      bulk.set(ref, election, { merge: true })
+    })
+  }
+  await bulk.close()
+}