diff --git a/functions/src/bills/bills.test.ts b/functions/src/bills/bills.test.ts index 1d99b8712..ede7bb307 100644 --- a/functions/src/bills/bills.test.ts +++ b/functions/src/bills/bills.test.ts @@ -5,9 +5,25 @@ jest.mock("../malegislature", () => ({ jest.mock("./pdfText", () => ({ extractBillTextFromPdf: jest.fn() })) +jest.mock("firebase-functions", () => ({ + logger: { warn: jest.fn(), info: jest.fn() } +})) +jest.mock("../scraper", () => ({ + createScraper: jest.fn(() => ({ fetchBatch: {}, startBatches: {} })) +})) +jest.mock("../firebase", () => ({ + Timestamp: { fromMillis: jest.fn(() => ({})), now: jest.fn(() => ({})) }, + FieldValue: { delete: jest.fn() }, + FieldPath: {} +})) +jest.mock("@google-cloud/firestore", () => ({ + FieldValue: { delete: jest.fn() } +})) import { getDocumentWithPdfTextFallback } from "./documentTextFallback" import { extractBillTextFromPdf } from "./pdfText" +import { dropDocumentTextIfTooLarge, MAX_FIRESTORE_DOC_BYTES } from "./bills" +import { logger } from "firebase-functions" const mockedApi = jest.requireMock("../malegislature") as { getDocument: jest.Mock @@ -80,3 +96,47 @@ describe("getDocumentWithPdfTextFallback", () => { }) }) }) + +describe("dropDocumentTextIfTooLarge", () => { + const mockedLogger = logger as jest.Mocked + + beforeEach(() => { + jest.resetAllMocks() + }) + + it("leaves DocumentText intact when resource is within the size limit", () => { + const content = { DocumentText: "short text", Cosponsors: [] } as any + const resource = { content } as any + + dropDocumentTextIfTooLarge(resource, 194, "H1") + + expect(content.DocumentText).toBe("short text") + expect(mockedLogger.warn).not.toHaveBeenCalled() + }) + + it("drops DocumentText and warns when resource exceeds 1 MiB", () => { + const longText = "x".repeat(MAX_FIRESTORE_DOC_BYTES + 100) + const content = { DocumentText: longText, Cosponsors: [] } as any + const resource = { content } as any + + dropDocumentTextIfTooLarge(resource, 194, "H5500") + + 
/** Firestore's maximum document size (1 MiB), in bytes. */
export const MAX_FIRESTORE_DOC_BYTES = 1_048_576

/**
 * Returns the UTF-8 byte length of `value` once serialized as JSON.
 *
 * `JSON.stringify` yields `undefined` at runtime for values it cannot
 * represent at the top level (e.g. `undefined`); those are measured as 0 bytes
 * instead of letting `Buffer.byteLength` throw.
 */
function serializedByteLength(value: unknown): number {
  const json: string | undefined = JSON.stringify(value)
  return json === undefined ? 0 : Buffer.byteLength(json, "utf8")
}

/**
 * Drops DocumentText from the bill content when the serialized resource would
 * exceed Firestore's 1 MiB document size limit. This prevents write failures
 * for bills with very long extracted PDF text (e.g. H5500 in court 194).
 *
 * The size is estimated from the resource's JSON serialization, which is an
 * approximation of the stored size rather than Firestore's own accounting.
 *
 * The resource is mutated in place. A warning is logged in each of these cases:
 * - the resource is too large and DocumentText is dropped;
 * - the resource is too large but there is no DocumentText to drop;
 * - the resource is still too large after DocumentText has been dropped.
 *
 * @param resource Bill fields about to be written; `content` may be mutated.
 * @param court General court number, used only in log messages.
 * @param id Bill id, used only in log messages.
 */
export function dropDocumentTextIfTooLarge(
  resource: Partial<Bill>,
  court: number,
  id: string
): void {
  const byteLength = serializedByteLength(resource)
  if (byteLength <= MAX_FIRESTORE_DOC_BYTES) return

  const content = resource.content
  if (!content || !("DocumentText" in content)) {
    // Nothing to remove: say so rather than claiming a drop that never happens.
    logger.warn(
      `Bill ${court}/${id} exceeds Firestore size limit (${byteLength} bytes) but has no DocumentText to drop`
    )
    return
  }

  logger.warn(
    `Bill ${court}/${id} exceeds Firestore size limit (${byteLength} bytes), dropping DocumentText`
  )
  delete content.DocumentText

  // Re-measure so a bill that is oversized for other reasons is still reported.
  const remainingByteLength = serializedByteLength(resource)
  if (remainingByteLength > MAX_FIRESTORE_DOC_BYTES) {
    logger.warn(
      `Bill ${court}/${id} still exceeds Firestore size limit after dropping DocumentText (${remainingByteLength} bytes)`
    )
  }
}
With 8 batches per day, 20 parallel * scrapers, and 50 documents per batch, we will process all documents once per @@ -53,6 +76,7 @@ export const { fetchBatch: fetchBillBatch, startBatches: startBillBatches } = nextHearingAt: current?.nextHearingAt ?? MISSING_TIMESTAMP } + dropDocumentTextIfTooLarge(resource, court, id) return resource }, listIds: (court: number) =>