-
Notifications
You must be signed in to change notification settings - Fork 731
fix: data sink worker improvements (CM-1054) #3996
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from 4 commits
Commits
Show all changes
11 commits
Select commit
Hold shift + click to select a range
1589183
fix: monitoring for results, incoming webhooks handling improved, fi…
themarolt b409a2c
Merge branch 'main' into fix/data-sink-worker-improvements-CM-1054
themarolt 84493c7
fix: comments
themarolt ec60e44
fix: comments
themarolt 969567e
fix: comments
themarolt ba8b4c2
fix: small ones
themarolt 896e55d
fix: small ones
themarolt 9fa141e
fix: small ones
themarolt fd0b6cd
Merge branch 'main' into fix/data-sink-worker-improvements-CM-1054
themarolt 2952e09
fix: address PR review comments
themarolt ff79db6
fix: comment
themarolt File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Some comments aren't visible on the classic Files Changed page.
There are no files selected for viewing
79 changes: 79 additions & 0 deletions
79
services/apps/cron_service/src/jobs/incomingWebhooksCheck.job.ts
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,79 @@ | ||
| import CronTime from 'cron-time-generator' | ||
|
|
||
| import { IS_PROD_ENV } from '@crowd/common' | ||
| import { IntegrationStreamWorkerEmitter } from '@crowd/common_services' | ||
| import { WRITE_DB_CONFIG, getDbConnection } from '@crowd/data-access-layer/src/database' | ||
| import { QUEUE_CONFIG, getKafkaClient, getKafkaMessageCounts } from '@crowd/queue' | ||
| import { KafkaQueueService } from '@crowd/queue/src/vendors/kafka/client' | ||
| import { WebhookState } from '@crowd/types' | ||
|
|
||
| import { IJobDefinition } from '../types' | ||
|
|
||
| const TOPIC = 'integration-stream-worker-high-production' | ||
| const GROUP_ID = 'integration-stream-worker-high-production' | ||
| const MAX_UNCONSUMED = 50000 | ||
|
|
||
| const job: IJobDefinition = { | ||
| name: 'incoming-webhooks-check', | ||
| cronTime: CronTime.everyDay(), | ||
| timeout: 30 * 60, // 30 minutes | ||
| enabled: async () => IS_PROD_ENV, | ||
| process: async (ctx) => { | ||
| const kafkaClient = getKafkaClient(QUEUE_CONFIG()) | ||
| const admin = kafkaClient.admin() | ||
| await admin.connect() | ||
|
|
||
| const counts = await getKafkaMessageCounts(ctx.log, admin, TOPIC, GROUP_ID) | ||
|
|
||
| if (counts.unconsumed >= MAX_UNCONSUMED) { | ||
|
themarolt marked this conversation as resolved.
|
||
| ctx.log.info( | ||
| `Integration stream worker queue has ${counts.unconsumed} unconsumed messages, skipping!`, | ||
| ) | ||
| return | ||
| } | ||
|
|
||
| const dbConnection = await getDbConnection(WRITE_DB_CONFIG()) | ||
|
|
||
| const count = ( | ||
|
themarolt marked this conversation as resolved.
Outdated
|
||
| await dbConnection.one( | ||
| `select count(*)::int as count from "incomingWebhooks" where state = $(state) and "createdAt" < now() - interval '1 day'`, | ||
| { state: WebhookState.PENDING }, | ||
| ) | ||
| ).count | ||
|
themarolt marked this conversation as resolved.
|
||
|
|
||
| if (count <= counts.unconsumed) { | ||
| ctx.log.info(`All ${count} stuck pending webhooks are already in the queue, skipping!`) | ||
| return | ||
| } | ||
|
|
||
| const webhooks = await dbConnection.any<{ id: string; platform: string }>( | ||
| ` | ||
| select iw.id, i.platform | ||
| from "incomingWebhooks" iw | ||
| join integrations i on iw."integrationId" = i.id | ||
| where iw.state = $(state) | ||
| and iw."createdAt" < now() - interval '1 day' | ||
| order by iw."createdAt" asc | ||
| limit 10000 | ||
| `, | ||
| { state: WebhookState.PENDING }, | ||
| ) | ||
|
cursor[bot] marked this conversation as resolved.
|
||
|
|
||
| if (webhooks.length === 0) { | ||
| ctx.log.info('No stuck pending webhooks found!') | ||
| return | ||
| } | ||
|
|
||
| ctx.log.info(`Found ${webhooks.length} stuck pending webhooks, re-triggering!`) | ||
|
|
||
| const queueService = new KafkaQueueService(kafkaClient, ctx.log) | ||
| const emitter = new IntegrationStreamWorkerEmitter(queueService, ctx.log) | ||
| await emitter.init() | ||
|
|
||
| await emitter.triggerWebhookProcessingBatch(webhooks.map((w) => w.id)) | ||
|
|
||
| ctx.log.info(`Re-triggered ${webhooks.length} stuck pending webhooks in total!`) | ||
| }, | ||
| } | ||
|
|
||
| export default job | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
159 changes: 159 additions & 0 deletions
159
services/apps/cron_service/src/jobs/integrationResultsReporting.job.ts
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,159 @@ | ||
| import CronTime from 'cron-time-generator' | ||
|
|
||
| import { IS_DEV_ENV } from '@crowd/common' | ||
| import { READ_DB_CONFIG, getDbConnection } from '@crowd/data-access-layer/src/database' | ||
| import { | ||
| SlackChannel, | ||
| SlackMessageSection, | ||
| SlackPersona, | ||
| sendSlackNotificationAsync, | ||
| } from '@crowd/slack' | ||
| import { IntegrationResultState } from '@crowd/types' | ||
|
|
||
| import { IJobDefinition } from '../types' | ||
|
|
||
| interface IResultStateCount { | ||
| state: string | ||
| count: number | ||
| } | ||
|
|
||
| interface IErrorGroup { | ||
| errorMessage: string | ||
| location: string | ||
| message: string | ||
| count: number | ||
| avgRetries: number | ||
| maxRetries: number | ||
| oldest: Date | ||
| newest: Date | ||
| platforms: string | null | ||
| } | ||
|
|
||
| const job: IJobDefinition = { | ||
| name: 'integration-results-reporting', | ||
| cronTime: IS_DEV_ENV ? CronTime.everyMinute() : CronTime.everyDayAt(8, 30), | ||
|
themarolt marked this conversation as resolved.
Outdated
|
||
| timeout: 10 * 60, // 10 minutes | ||
| process: async (ctx) => { | ||
| ctx.log.info('Running integration-results-reporting job...') | ||
|
|
||
| const dbConnection = await getDbConnection(READ_DB_CONFIG(), 3, 0) | ||
|
|
||
| // Count results per state | ||
| const stateCounts = await dbConnection.any<IResultStateCount>( | ||
| `SELECT state, count(*)::int AS count FROM integration.results GROUP BY state ORDER BY count DESC`, | ||
| ) | ||
|
|
||
| const countByState: Record<string, number> = {} | ||
| for (const row of stateCounts) { | ||
| countByState[row.state] = row.count | ||
| } | ||
|
|
||
| const pending = countByState[IntegrationResultState.PENDING] ?? 0 | ||
| const processing = countByState[IntegrationResultState.PROCESSING] ?? 0 | ||
| const processed = countByState[IntegrationResultState.PROCESSED] ?? 0 | ||
| const delayed = countByState[IntegrationResultState.DELAYED] ?? 0 | ||
| const errorCount = countByState[IntegrationResultState.ERROR] ?? 0 | ||
| const total = pending + processing + processed + delayed + errorCount | ||
|
|
||
| // How many delayed results are overdue (i.e. should already be processed) | ||
| const overdueDelayed = ( | ||
| await dbConnection.one<{ count: number }>( | ||
| `SELECT count(*)::int AS count FROM integration.results WHERE state = 'delayed' AND "delayedUntil" < now()`, | ||
| ) | ||
| ).count | ||
|
|
||
| // Break down errors by errorMessage + location, enriched with platform info | ||
| const errorGroups = await dbConnection.any<IErrorGroup>( | ||
| ` | ||
| SELECT | ||
| COALESCE(r.error->>'errorMessage', '[no errorMessage]') AS "errorMessage", | ||
| COALESCE(r.error->>'location', '[no location]') AS location, | ||
| COALESCE(r.error->>'message', '[no message]') AS message, | ||
| count(*)::int AS count, | ||
| round(avg(r.retries), 1)::float AS "avgRetries", | ||
| max(r.retries)::int AS "maxRetries", | ||
| min(r."createdAt") AS oldest, | ||
| max(r."updatedAt") AS newest, | ||
| string_agg(DISTINCT i.platform, ', ' ORDER BY i.platform) AS platforms | ||
| FROM integration.results r | ||
| LEFT JOIN integrations i ON i.id = r."integrationId" | ||
| WHERE r.state = 'error' | ||
| GROUP BY | ||
| r.error->>'errorMessage', | ||
| r.error->>'location', | ||
| r.error->>'message' | ||
| ORDER BY count DESC | ||
| LIMIT 20 | ||
| `, | ||
| ) | ||
|
|
||
| const sections: SlackMessageSection[] = [] | ||
|
|
||
| sections.push({ | ||
| title: 'Integration Results Summary', | ||
| text: [ | ||
| `*Total:* ${total.toLocaleString()}`, | ||
| '', | ||
| `⏳ Pending: *${pending.toLocaleString()}*`, | ||
| `⚙️ Processing: *${processing.toLocaleString()}*`, | ||
| `✅ Processed: *${processed.toLocaleString()}*`, | ||
| `🕐 Delayed: *${delayed.toLocaleString()}*${overdueDelayed > 0 ? ` (${overdueDelayed.toLocaleString()} overdue)` : ''}`, | ||
| `❌ Error: *${errorCount.toLocaleString()}*`, | ||
| ].join('\n'), | ||
| }) | ||
|
|
||
| if (errorCount > 0 && errorGroups.length > 0) { | ||
| const lines: string[] = [ | ||
| `Top ${errorGroups.length} error group${errorGroups.length !== 1 ? 's' : ''} out of *${errorCount.toLocaleString()}* total errors:`, | ||
| '', | ||
| ] | ||
|
|
||
| for (const group of errorGroups) { | ||
| const oldestHoursAgo = Math.round( | ||
| (Date.now() - new Date(group.oldest).getTime()) / 3_600_000, | ||
| ) | ||
| const newestHoursAgo = Math.round( | ||
| (Date.now() - new Date(group.newest).getTime()) / 3_600_000, | ||
| ) | ||
| const ageLabel = | ||
| oldestHoursAgo === newestHoursAgo | ||
| ? formatHoursAgo(oldestHoursAgo) | ||
| : `${formatHoursAgo(newestHoursAgo)} – ${formatHoursAgo(oldestHoursAgo)}` | ||
|
|
||
| lines.push( | ||
| `• *${group.count}x* \`${group.errorMessage}\``, | ||
| ` _Location:_ \`${group.location}\` | _retries avg/max:_ ${group.avgRetries}/${group.maxRetries}${group.platforms ? ` | _platforms:_ \`${group.platforms}\`` : ''}`, | ||
| ` _Age:_ ${ageLabel}`, | ||
| ` _Detail:_ ${group.message}`, | ||
| '', | ||
| ) | ||
| } | ||
|
|
||
| sections.push({ | ||
| title: `Error Breakdown (top ${errorGroups.length})`, | ||
| text: lines.join('\n'), | ||
| }) | ||
| } | ||
|
|
||
| const persona = errorCount > 0 ? SlackPersona.WARNING_PROPAGATOR : SlackPersona.INFO_NOTIFIER | ||
|
|
||
| await sendSlackNotificationAsync( | ||
| SlackChannel.CDP_INTEGRATIONS_ALERTS, | ||
| persona, | ||
| 'Integration Results Daily Report', | ||
| sections, | ||
| ) | ||
|
|
||
| ctx.log.info( | ||
| `Integration results report sent: pending=${pending}, delayed=${delayed} (${overdueDelayed} overdue), errors=${errorCount}`, | ||
| ) | ||
| }, | ||
| } | ||
|
|
||
| function formatHoursAgo(hours: number): string { | ||
| if (hours < 1) return 'just now' | ||
| if (hours < 24) return `${hours}h ago` | ||
| return `${Math.round(hours / 24)}d ago` | ||
| } | ||
|
|
||
| export default job | ||
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.