Backfill Hearing Transcriptions Script (#1951)

kiminkim724 · web-flow · commit bfb5023d743b · 2025-10-14T16:39:58.000-04:00
* Initial version of backfill

* Add bucketName arguments to specify Firebase bucket

* Change to sequential processing instead of parallel to prevent memory overload

* Add limit to prevent transcription overload
diff --git a/functions/src/events/scrapeEvents.ts b/functions/src/events/scrapeEvents.ts
@@ -148,7 +148,8 @@ class SessionScraper extends EventScraper<SessionContent, Session> {
 
 const extractAudioFromVideo = async (
   EventId: number,
-  videoUrl: string
+  videoUrl: string,
+  bucketName?: string
 ): Promise<string> => {
   const tmpFilePath = `/tmp/hearing-${EventId}-${Date.now()}.m4a`
 
@@ -182,7 +183,7 @@ const extractAudioFromVideo = async (
   })
 
   // Upload the audio file
-  const bucket = storage.bucket()
+  const bucket = bucketName ? storage.bucket(bucketName) : storage.bucket()
   const audioFileName = `hearing-${EventId}-${Date.now()}.m4a`
   const file = bucket.file(audioFileName)
 
@@ -217,19 +218,25 @@ const extractAudioFromVideo = async (
   return url
 }
 
-const submitTranscription = async ({
+export const submitTranscription = async ({
   EventId,
-  maybeVideoUrl
+  maybeVideoUrl,
+  bucketName
 }: {
   EventId: number
   maybeVideoUrl: string
+  bucketName?: string
 }) => {
   const assembly = new AssemblyAI({
     apiKey: process.env.ASSEMBLY_API_KEY ? process.env.ASSEMBLY_API_KEY : ""
   })
 
   const newToken = randomBytes(16).toString("hex")
-  const audioUrl = await extractAudioFromVideo(EventId, maybeVideoUrl)
+  const audioUrl = await extractAudioFromVideo(
+    EventId,
+    maybeVideoUrl,
+    bucketName
+  )
 
   const transcript = await assembly.transcripts.submit({
     audio:
@@ -258,7 +265,7 @@ const submitTranscription = async ({
   return transcript.id
 }
 
-const getHearingVideoUrl = async (EventId: number) => {
+export const getHearingVideoUrl = async (EventId: number) => {
   const req = await fetch(
     `https://malegislature.gov/Events/Hearings/Detail/${EventId}`
   )
diff --git a/scripts/firebase-admin/backfillHearingTranscription.ts b/scripts/firebase-admin/backfillHearingTranscription.ts
@@ -0,0 +1,101 @@
+import { Timestamp } from "../../functions/src/firebase"
+import { Record, Number, String } from "runtypes"
+import { Script } from "./types"
+import { getHearingVideoUrl, submitTranscription } from "functions/src/events"
+
+/** Default cap on transcriptions submitted by a single batch run. */
+const DEFAULT_LIMIT = 100
+
+/**
+ * CLI arguments: `eventId` restricts the run to one hearing, `bucketName` is
+ * forwarded to `submitTranscription`, and `limit` overrides DEFAULT_LIMIT.
+ */
+const Args = Record({
+  eventId: Number.optional(),
+  bucketName: String.optional(),
+  limit: Number.optional()
+})
+
+/**
+ * Submits hearing videos that have no `videoTranscriptionId` for transcription
+ * and records the resulting transcript id on each hearing document.
+ *
+ * With `eventId`, only `events/hearing-<eventId>` is processed. Otherwise every
+ * event of type "hearing" is visited one at a time (sequentially, to avoid
+ * overloading the transcription service) until `limit` submissions succeed.
+ */
+export const script: Script = async ({ db, args }) => {
+  const { eventId, bucketName, limit = DEFAULT_LIMIT } = Args.check(args)
+  const events = db.collection("events")
+
+  /**
+   * Finds the hearing's video, submits it, and stores the transcript id on
+   * `ref`. Errors are logged rather than thrown so a batch keeps going.
+   * Resolves to true only when a transcription was submitted and saved.
+   */
+  const transcribe = async (
+    EventId: number,
+    ref: ReturnType<typeof events.doc>
+  ): Promise<boolean> => {
+    try {
+      const maybeVideoUrl = await getHearingVideoUrl(EventId)
+      if (!maybeVideoUrl) {
+        console.log(`No valid video URL found for hearing ${EventId}`)
+        return false
+      }
+      const transcriptId = await submitTranscription({
+        maybeVideoUrl,
+        EventId,
+        bucketName
+      })
+      await ref.update({
+        videoURL: maybeVideoUrl,
+        videoFetchedAt: Timestamp.now(),
+        videoTranscriptionId: transcriptId
+      })
+      console.log(
+        `Transcription submitted for hearing ${EventId}: ${transcriptId}`
+      )
+      return true
+    } catch (error) {
+      console.error(`Failed to process hearing ${EventId}:`, error)
+      return false
+    }
+  }
+
+  // Compare against undefined so an explicit eventId of 0 is not mistaken for
+  // "no eventId given", which would start a full batch run.
+  if (eventId !== undefined) {
+    const docRef = events.doc(`hearing-${eventId}`)
+    const doc = await docRef.get()
+    if (!doc.exists) {
+      console.log(`No hearing found with EventId ${eventId}`)
+      return
+    }
+    if (doc.data()?.videoTranscriptionId) {
+      console.log(`Hearing ${eventId} already has a transcription.`)
+      return
+    }
+    await transcribe(eventId, docRef)
+    return
+  }
+
+  const hearings = await events.where("type", "==", "hearing").get()
+  let submitted = 0
+  for (const doc of hearings.docs) {
+    if (submitted >= limit) {
+      break // Only successful submissions count toward the limit
+    }
+    // Take the id from doc.id so the skip log matches the processing logs.
+    const EventId = parseInt(doc.id.replace("hearing-", ""), 10)
+    if (isNaN(EventId)) {
+      console.warn(`Skipping ${doc.id}: document id has no numeric EventId.`)
+    } else if (doc.data().videoTranscriptionId) {
+      console.log(`Skipping hearing ${EventId}, already has transcription.`)
+    } else {
+      console.log(`Processing hearing ${EventId}...`)
+      if (await transcribe(EventId, doc.ref)) submitted++
+    }
+  }
+  console.log("Done processing hearings without transcriptions.")
+}