Skip to content

Commit bfb5023

Browse files
authored
Backfill Hearing Transcriptions Script (#1951)
* Initial version of backfill
* Add bucketName argument to specify the Firebase bucket
* Change to sequential processing instead of parallel to prevent memory overload
* Add limit to prevent transcription overload
1 parent a13c33c commit bfb5023

2 files changed

Lines changed: 114 additions & 6 deletions

File tree

functions/src/events/scrapeEvents.ts

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,8 @@ class SessionScraper extends EventScraper<SessionContent, Session> {
148148

149149
const extractAudioFromVideo = async (
150150
EventId: number,
151-
videoUrl: string
151+
videoUrl: string,
152+
bucketName?: string
152153
): Promise<string> => {
153154
const tmpFilePath = `/tmp/hearing-${EventId}-${Date.now()}.m4a`
154155

@@ -182,7 +183,7 @@ const extractAudioFromVideo = async (
182183
})
183184

184185
// Upload the audio file
185-
const bucket = storage.bucket()
186+
const bucket = bucketName ? storage.bucket(bucketName) : storage.bucket()
186187
const audioFileName = `hearing-${EventId}-${Date.now()}.m4a`
187188
const file = bucket.file(audioFileName)
188189

@@ -217,19 +218,25 @@ const extractAudioFromVideo = async (
217218
return url
218219
}
219220

220-
const submitTranscription = async ({
221+
export const submitTranscription = async ({
221222
EventId,
222-
maybeVideoUrl
223+
maybeVideoUrl,
224+
bucketName
223225
}: {
224226
EventId: number
225227
maybeVideoUrl: string
228+
bucketName?: string
226229
}) => {
227230
const assembly = new AssemblyAI({
228231
apiKey: process.env.ASSEMBLY_API_KEY ? process.env.ASSEMBLY_API_KEY : ""
229232
})
230233

231234
const newToken = randomBytes(16).toString("hex")
232-
const audioUrl = await extractAudioFromVideo(EventId, maybeVideoUrl)
235+
const audioUrl = await extractAudioFromVideo(
236+
EventId,
237+
maybeVideoUrl,
238+
bucketName
239+
)
233240

234241
const transcript = await assembly.transcripts.submit({
235242
audio:
@@ -258,7 +265,7 @@ const submitTranscription = async ({
258265
return transcript.id
259266
}
260267

261-
const getHearingVideoUrl = async (EventId: number) => {
268+
export const getHearingVideoUrl = async (EventId: number) => {
262269
const req = await fetch(
263270
`https://malegislature.gov/Events/Hearings/Detail/${EventId}`
264271
)
Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
import { Timestamp } from "../../functions/src/firebase"
2+
import { Record, Number, String } from "runtypes"
3+
import { Script } from "./types"
4+
import { getHearingVideoUrl, submitTranscription } from "functions/src/events"
5+
6+
/** Default cap on the number of transcriptions submitted by one batch run. */
const DEFAULT_TRANSCRIPTION_LIMIT = 100

/**
 * Arguments accepted by the backfill script.
 *
 * - `eventId`: when given, only the hearing stored as `hearing-<eventId>` is
 *   processed.
 * - `bucketName`: forwarded to `submitTranscription`; when omitted the callee
 *   decides which bucket to use.
 * - `limit`: maximum number of transcriptions submitted in batch mode
 *   (defaults to `DEFAULT_TRANSCRIPTION_LIMIT`).
 *
 * NOTE: `Record`, `Number` and `String` in this file are the runtypes
 * imports and shadow the JavaScript globals of the same name.
 */
const Args = Record({
  eventId: Number.optional(),
  bucketName: String.optional(),
  limit: Number.optional()
})

/**
 * Backfills video transcriptions for hearing events.
 *
 * With `eventId`, processes that single hearing. Without it, walks every
 * document in `events` whose `type` is `"hearing"` and submits a
 * transcription for each one that has no `videoTranscriptionId`, one at a
 * time, stopping once `limit` transcriptions have been submitted.
 *
 * Failures for an individual hearing are logged and do not abort the run.
 */
export const script: Script = async ({ db, args }) => {
  const {
    eventId,
    bucketName,
    limit = DEFAULT_TRANSCRIPTION_LIMIT
  } = Args.check(args)

  const events = db.collection("events")
  type HearingRef = ReturnType<typeof events.doc>

  /**
   * Looks up the hearing's video URL, submits it for transcription and
   * records the result on the hearing document.
   *
   * @returns `true` when a transcription was submitted and stored, `false`
   *   when no video URL was found or any step failed (the error is logged).
   */
  const backfillHearing = async (
    hearingId: number,
    ref: HearingRef
  ): Promise<boolean> => {
    try {
      const maybeVideoUrl = await getHearingVideoUrl(hearingId)
      if (!maybeVideoUrl) {
        console.log(`No valid video URL found for hearing ${hearingId}`)
        return false
      }

      const transcriptId = await submitTranscription({
        maybeVideoUrl,
        EventId: hearingId,
        bucketName
      })

      await ref.update({
        videoURL: maybeVideoUrl,
        videoFetchedAt: Timestamp.now(),
        videoTranscriptionId: transcriptId
      })

      console.log(
        `Transcription submitted for hearing ${hearingId}: ${transcriptId}`
      )
      return true
    } catch (error) {
      console.error(`Failed to process hearing ${hearingId}:`, error)
      return false
    }
  }

  // Process a single event by eventId. Compare against undefined rather than
  // relying on truthiness so an explicit id of 0 does not silently fall
  // through to the "process everything" branch.
  if (eventId !== undefined) {
    const docRef = events.doc(`hearing-${eventId}`)
    const doc = await docRef.get()
    if (!doc.exists) {
      console.log(`No hearing found with EventId ${eventId}`)
      return
    }
    if (doc.data()?.videoTranscriptionId) {
      console.log(`Hearing ${eventId} already has a transcription.`)
      return
    }
    await backfillHearing(eventId, docRef)
    return
  }

  // Run events sequentially to avoid overloading the transcription service
  const hearingsSnapshot = await events.where("type", "==", "hearing").get()
  let submitted = 0

  for (const doc of hearingsSnapshot.docs) {
    if (submitted >= limit) {
      break // Limit the number of submissions for this run
    }

    // The document id is the source of truth for the event id; it is the
    // same `hearing-<id>` key used by the single-event lookup above.
    const hearingId = parseInt(doc.id.replace("hearing-", ""), 10)
    if (isNaN(hearingId)) {
      console.error(`Skipping document ${doc.id}: cannot parse an EventId.`)
      continue
    }

    if (doc.data().videoTranscriptionId) {
      console.log(`Skipping hearing ${hearingId}, already has transcription.`)
      continue
    }

    console.log(`Processing hearing ${hearingId}...`)
    // Only successful submissions count towards the limit.
    if (await backfillHearing(hearingId, doc.ref)) {
      submitted++
    }
  }
  console.log("Done processing hearings without transcriptions.")
}

0 commit comments

Comments
 (0)