Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 60 additions & 0 deletions functions/src/bills/bills.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,25 @@ jest.mock("../malegislature", () => ({
// Replace the PDF text extractor with a bare mock function.
jest.mock("./pdfText", () => ({
extractBillTextFromPdf: jest.fn()
}))
// Replace the firebase-functions logger with mock warn/info functions; the
// dropDocumentTextIfTooLarge tests below assert on logger.warn.
jest.mock("firebase-functions", () => ({
logger: { warn: jest.fn(), info: jest.fn() }
}))
// Stub createScraper so it returns placeholder fetchBatch/startBatches values.
// NOTE(review): presumably needed because importing ./bills calls
// createScraper at module load — confirm against ./bills.
jest.mock("../scraper", () => ({
createScraper: jest.fn(() => ({ fetchBatch: {}, startBatches: {} }))
}))
// Minimal stand-ins for the Timestamp, FieldValue and FieldPath exports of
// ../firebase. NOTE(review): presumably these keep the real Firebase module
// from being loaded when ./bills is imported — confirm.
jest.mock("../firebase", () => ({
Timestamp: { fromMillis: jest.fn(() => ({})), now: jest.fn(() => ({})) },
FieldValue: { delete: jest.fn() },
FieldPath: {}
}))
// Minimal stand-in for the FieldValue export of @google-cloud/firestore.
// NOTE(review): the importing module is not visible in this file — confirm
// which dependency of ./bills needs it.
jest.mock("@google-cloud/firestore", () => ({
FieldValue: { delete: jest.fn() }
}))

import { getDocumentWithPdfTextFallback } from "./documentTextFallback"
import { extractBillTextFromPdf } from "./pdfText"
import { dropDocumentTextIfTooLarge, MAX_FIRESTORE_DOC_BYTES } from "./bills"
import { logger } from "firebase-functions"

const mockedApi = jest.requireMock("../malegislature") as {
getDocument: jest.Mock
Expand Down Expand Up @@ -80,3 +96,47 @@ describe("getDocumentWithPdfTextFallback", () => {
})
})
})

describe("dropDocumentTextIfTooLarge", () => {
  // The firebase-functions logger is replaced by a jest.mock factory in this
  // file; the cast only exposes the mock API to the type checker.
  const loggerMock = logger as jest.Mocked<typeof logger>
  const court = 194

  /** Builds a resource whose content carries the given DocumentText. */
  const buildResource = (documentText: string) => {
    const content = { DocumentText: documentText, Cosponsors: [] } as any
    const resource = { content } as any
    return { content, resource }
  }

  /** Text that is, on its own, larger than the Firestore document limit. */
  const oversizedText = () => "x".repeat(MAX_FIRESTORE_DOC_BYTES + 100)

  beforeEach(() => {
    jest.resetAllMocks()
  })

  it("leaves DocumentText intact when resource is within the size limit", () => {
    const { content, resource } = buildResource("short text")

    dropDocumentTextIfTooLarge(resource, court, "H1")

    expect(content.DocumentText).toBe("short text")
    expect(loggerMock.warn).not.toHaveBeenCalled()
  })

  it("drops DocumentText and warns when resource exceeds 1 MiB", () => {
    const { content, resource } = buildResource(oversizedText())

    dropDocumentTextIfTooLarge(resource, court, "H5500")

    expect(content).not.toHaveProperty("DocumentText")
    // The warning must name the bill and say what was done about it.
    for (const fragment of ["H5500", "dropping DocumentText"]) {
      expect(loggerMock.warn).toHaveBeenCalledWith(
        expect.stringContaining(fragment)
      )
    }
  })

  it("mutates the same content object referenced by the resource", () => {
    const { resource } = buildResource(oversizedText())

    dropDocumentTextIfTooLarge(resource, court, "H5500")

    expect(resource.content).not.toHaveProperty("DocumentText")
  })
})
24 changes: 24 additions & 0 deletions functions/src/bills/bills.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,31 @@ import { createScraper } from "../scraper"
import { getDocumentWithPdfTextFallback } from "./documentTextFallback"
import { Bill, MISSING_TIMESTAMP } from "./types"

/** Firestore's document size limit of 1 MiB, expressed in bytes. */
export const MAX_FIRESTORE_DOC_BYTES = 1024 * 1024

export { getDocumentWithPdfTextFallback } from "./documentTextFallback"

/**
 * Drops `DocumentText` from the bill content when the serialized resource
 * would exceed Firestore's 1 MiB document size limit. This prevents write
 * failures for bills with very long extracted PDF text (e.g. H5500 in court
 * 194).
 *
 * The size is measured as the UTF-8 byte length of `JSON.stringify(resource)`
 * and compared against `MAX_FIRESTORE_DOC_BYTES`. The resource is mutated in
 * place; nothing is returned.
 *
 * A warning is logged in each oversized case:
 * - `DocumentText` is present: it is removed and the drop is logged.
 * - there is no `DocumentText` to remove: the oversize is logged without
 *   claiming that anything was dropped.
 * - the resource is still over the limit after the drop: a second warning is
 *   logged, since the subsequent write is still expected to be too large.
 *
 * @param resource - Bill data about to be written; `resource.content` may be
 *   modified.
 * @param court - Court number, used only in log messages.
 * @param id - Bill id, used only in log messages.
 */
export function dropDocumentTextIfTooLarge(
  resource: Partial<Bill>,
  court: number,
  id: string
): void {
  const measure = (): number =>
    Buffer.byteLength(JSON.stringify(resource), "utf8")

  const byteLength = measure()
  if (byteLength <= MAX_FIRESTORE_DOC_BYTES) return

  const content = resource.content
  if (!content || !("DocumentText" in content)) {
    // Nothing this function can remove; report the problem accurately rather
    // than claiming a drop that did not happen.
    logger.warn(
      `Bill ${court}/${id} exceeds Firestore size limit (${byteLength} bytes), but has no DocumentText to drop`
    )
    return
  }

  logger.warn(
    `Bill ${court}/${id} exceeds Firestore size limit (${byteLength} bytes), dropping DocumentText`
  )
  delete content.DocumentText

  // Re-measure only on this rare path: other fields alone may still be too big.
  const remainingBytes = measure()
  if (remainingBytes > MAX_FIRESTORE_DOC_BYTES) {
    logger.warn(
      `Bill ${court}/${id} still exceeds Firestore size limit after dropping DocumentText (${remainingBytes} bytes)`
    )
  }
}

/**
* There are around 8000 documents. With 8 batches per day, 20 parallel
* scrapers, and 50 documents per batch, we will process all documents once per
Expand Down Expand Up @@ -53,6 +76,7 @@ export const { fetchBatch: fetchBillBatch, startBatches: startBillBatches } =
nextHearingAt: current?.nextHearingAt ?? MISSING_TIMESTAMP
}

dropDocumentTextIfTooLarge(resource, court, id)
return resource
},
listIds: (court: number) =>
Expand Down