codeforboston · akash-m-2001 · Jun 29, 2026
@@ -5,9 +5,25 @@ jest.mock("../malegislature", () => ({
 jest.mock("./pdfText", () => ({
   extractBillTextFromPdf: jest.fn()
 }))
+jest.mock("firebase-functions", () => ({ // stub the logger so tests can assert on logger.warn calls
+  logger: { warn: jest.fn(), info: jest.fn() }
+}))
+jest.mock("../scraper", () => ({ // NOTE(review): presumably keeps importing ./bills from building a real scraper — confirm
+  createScraper: jest.fn(() => ({ fetchBatch: {}, startBatches: {} }))
+}))
+jest.mock("../firebase", () => ({ // NOTE(review): presumably replaces Firebase helpers pulled in via ./bills — confirm
+  Timestamp: { fromMillis: jest.fn(() => ({})), now: jest.fn(() => ({})) },
+  FieldValue: { delete: jest.fn() },
+  FieldPath: {}
+}))
+jest.mock("@google-cloud/firestore", () => ({ // only FieldValue.delete is stubbed; other exports are undefined under this mock
+  FieldValue: { delete: jest.fn() }
+}))
 
 import { getDocumentWithPdfTextFallback } from "./documentTextFallback"
 import { extractBillTextFromPdf } from "./pdfText"
+import { dropDocumentTextIfTooLarge, MAX_FIRESTORE_DOC_BYTES } from "./bills"
+import { logger } from "firebase-functions"
 
 const mockedApi = jest.requireMock("../malegislature") as {
   getDocument: jest.Mock
@@ -80,3 +96,47 @@ describe("getDocumentWithPdfTextFallback", () => {
     })
   })
 })
+
+// Unit tests for dropDocumentTextIfTooLarge: the resource is mutated in place
+// and a warning is logged only when the serialized size exceeds the limit.
+describe("dropDocumentTextIfTooLarge", () => {
+  const mockedLogger = logger as jest.Mocked<typeof logger>
+
+  beforeEach(() => {
+    // Clear recorded calls only. resetAllMocks() would also strip the
+    // implementations supplied by the jest.mock factories at the top of this
+    // file, leaving those mocks returning undefined for any later test.
+    jest.clearAllMocks()
+  })
+
+  it("leaves DocumentText intact when resource is within the size limit", () => {
+    const content = { DocumentText: "short text", Cosponsors: [] } as any
+    const resource = { content } as any
+
+    dropDocumentTextIfTooLarge(resource, 194, "H1")
+
+    expect(content.DocumentText).toBe("short text")
+    expect(mockedLogger.warn).not.toHaveBeenCalled()
+  })
+
+  it("drops DocumentText and warns when resource exceeds 1 MiB", () => {
+    const longText = "x".repeat(MAX_FIRESTORE_DOC_BYTES + 100)
+    const content = { DocumentText: longText, Cosponsors: [] } as any
+    const resource = { content } as any
+
+    dropDocumentTextIfTooLarge(resource, 194, "H5500")
+
+    expect(content).not.toHaveProperty("DocumentText")
+    expect(mockedLogger.warn).toHaveBeenCalledWith(
+      expect.stringContaining("H5500")
+    )
+    expect(mockedLogger.warn).toHaveBeenCalledWith(
+      expect.stringContaining("dropping DocumentText")
+    )
+  })
+
+  it("mutates the same content object referenced by the resource", () => {
+    const longText = "x".repeat(MAX_FIRESTORE_DOC_BYTES + 100)
+    const content = { DocumentText: longText, Cosponsors: [] } as any
+    const resource = { content } as any
+
+    dropDocumentTextIfTooLarge(resource, 194, "H5500")
+
+    // The content object must be edited in place, not replaced by a copy,
+    // and every field other than DocumentText must survive.
+    expect(resource.content).toBe(content)
+    expect(resource.content).toEqual({ Cosponsors: [] })
+  })
+
+  it("does not throw when an oversized resource has no content", () => {
+    const resource = { id: "x".repeat(MAX_FIRESTORE_DOC_BYTES + 100) } as any
+
+    expect(() =>
+      dropDocumentTextIfTooLarge(resource, 194, "H5500")
+    ).not.toThrow()
+
+    expect(resource).not.toHaveProperty("content")
+    expect(mockedLogger.warn).toHaveBeenCalledTimes(1)
+  })
+})
@@ -6,8 +6,31 @@ import { createScraper } from "../scraper"
 import { getDocumentWithPdfTextFallback } from "./documentTextFallback"
 import { Bill, MISSING_TIMESTAMP } from "./types"
 
+export const MAX_FIRESTORE_DOC_BYTES = 1_048_576 // 1 MiB (2^20 bytes): the Firestore document size limit enforced by dropDocumentTextIfTooLarge
+
 export { getDocumentWithPdfTextFallback } from "./documentTextFallback"
 
+/**
+ * Drops `DocumentText` from the bill content when the serialized resource
+ * would exceed Firestore's 1 MiB document size limit. This prevents write
+ * failures for bills with very long extracted PDF text (e.g. H5500 in court
+ * 194).
+ *
+ * The size is estimated as the UTF-8 byte length of `JSON.stringify(resource)`.
+ * The resource is mutated in place and nothing is returned. A warning is
+ * logged whenever the estimate exceeds the limit; the message states whether
+ * `DocumentText` was actually dropped, and a second warning is logged if the
+ * resource is still over the limit afterwards.
+ *
+ * @param resource - Bill document to check; its `content` is mutated in place.
+ * @param court - Court number, used only in log messages.
+ * @param id - Bill id, used only in log messages.
+ */
+export function dropDocumentTextIfTooLarge(
+  resource: Partial<Bill>,
+  court: number,
+  id: string
+): void {
+  const serializedBytes = () =>
+    Buffer.byteLength(JSON.stringify(resource), "utf8")
+
+  const byteLength = serializedBytes()
+  if (byteLength <= MAX_FIRESTORE_DOC_BYTES) return
+
+  const content = resource.content
+  if (!content || !("DocumentText" in content)) {
+    // Nothing to drop: say so instead of claiming DocumentText was removed.
+    logger.warn(
+      `Bill ${court}/${id} exceeds Firestore size limit (${byteLength} bytes), but has no DocumentText to drop`
+    )
+    return
+  }
+
+  logger.warn(
+    `Bill ${court}/${id} exceeds Firestore size limit (${byteLength} bytes), dropping DocumentText`
+  )
+  delete content.DocumentText
+
+  // Re-measure so an oversized remainder is reported here rather than going
+  // unnoticed until the document is written.
+  const remainingBytes = serializedBytes()
+  if (remainingBytes > MAX_FIRESTORE_DOC_BYTES) {
+    logger.warn(
+      `Bill ${court}/${id} still exceeds Firestore size limit after dropping DocumentText (${remainingBytes} bytes)`
+    )
+  }
+}
+
 /**
  * There are around 8000 documents. With 8 batches per day, 20 parallel
  * scrapers, and 50 documents per batch, we will process all documents once per
@@ -53,6 +76,7 @@ export const { fetchBatch: fetchBillBatch, startBatches: startBillBatches } =
         nextHearingAt: current?.nextHearingAt ?? MISSING_TIMESTAMP
       }
 
+      dropDocumentTextIfTooLarge(resource, court, id)
       return resource
     },
     listIds: (court: number) =>