Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions Cotabby.xcodeproj/project.pbxproj
Original file line number Diff line number Diff line change
Expand Up @@ -446,6 +446,7 @@
A736C52C0D280A35946E37A3 /* SurfaceContextComposer.swift in Sources */ = {isa = PBXBuildFile; fileRef = C602357DDED5D11C8B4567FB /* SurfaceContextComposer.swift */; };
A773D96AC9EC16231633542C /* DownloadOutcomeClassifier.swift in Sources */ = {isa = PBXBuildFile; fileRef = 3DE1975F3B5F4A70478DBF41 /* DownloadOutcomeClassifier.swift */; };
A7A1B9BE242959B3EE396D27 /* LaunchAtLogin in Frameworks */ = {isa = PBXBuildFile; productRef = 7B0278A63FEEE8DEDB6C50DB /* LaunchAtLogin */; };
A81E3481B0067E9D6F9D1325 /* ScreenTextExtractorTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 6C34A98E5B05DB3F4FE81E70 /* ScreenTextExtractorTests.swift */; };
A87978083EBE1AC294377F7C /* HuggingFaceSearchService.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8F426127917FCB1096134732 /* HuggingFaceSearchService.swift */; };
A8A294534897C461CA5968F3 /* UnitConversionEvaluator.swift in Sources */ = {isa = PBXBuildFile; fileRef = 82E7794DF60664B1FA8F6E7B /* UnitConversionEvaluator.swift */; };
A93A741C8C3973687D28F0B6 /* SuggestionEngineModels.swift in Sources */ = {isa = PBXBuildFile; fileRef = ADBE3E6CC585C1683787C877 /* SuggestionEngineModels.swift */; };
Expand Down Expand Up @@ -823,6 +824,7 @@
684737E62EE6495A71344923 /* DeepGeometryWalkThrottle.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = DeepGeometryWalkThrottle.swift; sourceTree = "<group>"; };
6A44BEC8C23FF227731DD0CD /* FocusCapabilityFlickerGate.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = FocusCapabilityFlickerGate.swift; sourceTree = "<group>"; };
6B2D97BAA3618A7D0357AC44 /* SuggestionWorkController.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SuggestionWorkController.swift; sourceTree = "<group>"; };
6C34A98E5B05DB3F4FE81E70 /* ScreenTextExtractorTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ScreenTextExtractorTests.swift; sourceTree = "<group>"; };
6CF1FBAABEF545B620AF8D78 /* ru-100k.txt */ = {isa = PBXFileReference; lastKnownFileType = text; path = "ru-100k.txt"; sourceTree = "<group>"; };
6D4C1EF008B9DFA753D561D3 /* LlamaEvalScoringTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LlamaEvalScoringTests.swift; sourceTree = "<group>"; };
6DB982BF30B3601F57277776 /* fr-100k.txt */ = {isa = PBXFileReference; lastKnownFileType = text; path = "fr-100k.txt"; sourceTree = "<group>"; };
Expand Down Expand Up @@ -1468,6 +1470,7 @@
1909DF39C47A113382BB53B6 /* RequestIDTests.swift */,
B1E6D9CCC0AA3674FEE57AE0 /* RuntimeBootstrapModelTests.swift */,
B2BFD19A159680A495EE02FD /* ScreenshotContextGeneratorTests.swift */,
6C34A98E5B05DB3F4FE81E70 /* ScreenTextExtractorTests.swift */,
474560E524C1D74BAB1570DA /* SecureFieldDetectorTests.swift */,
D5A5591BEB9EE7B6E9064412 /* SelfCaptureGateTests.swift */,
2D7360A6D4261989A66658ED /* SentenceBoundaryClassifierTests.swift */,
Expand Down Expand Up @@ -2489,6 +2492,7 @@
7EB20783E0D36715D1230A5C /* PromptSectionBudgetTests.swift in Sources */,
1C46642846D8FD1475AA5CCF /* RequestIDTests.swift in Sources */,
AD6E005ABE34AB7EBD92A30D /* RuntimeBootstrapModelTests.swift in Sources */,
A81E3481B0067E9D6F9D1325 /* ScreenTextExtractorTests.swift in Sources */,
1B3FFCB9A979F49BF86EAAD4 /* ScreenshotContextGeneratorTests.swift in Sources */,
4FC52FB28AFC013F000D8FF9 /* SecureFieldDetectorTests.swift in Sources */,
AF26E77871200BB1FAAEBE79 /* SelfCaptureGateTests.swift in Sources */,
Expand Down
91 changes: 70 additions & 21 deletions Cotabby/Services/Visual/ScreenTextExtractor.swift
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,33 @@ enum ScreenTextExtractionError: LocalizedError {
}
}

/// Guards the callback-to-async bridge for a single OCR request.
///
/// Vision can report a request failure through `VNRecognizeTextRequest`'s completion handler and
/// then rethrow that same failure from `VNImageRequestHandler.perform(_:)`. Swift checked
/// continuations must resume exactly once, so both paths share this short-lived gate.
private final class OCRContinuationResumer {
private let lock = NSLock()
private var hasResumed = false

func resume(_ action: () -> Void) {
lock.lock()
let shouldResume = !hasResumed
if shouldResume {
hasResumed = true
}
lock.unlock()

guard shouldResume else { return }
action()
}
}
Comment on lines +57 to +72

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 OCRContinuationResumer missing @unchecked Sendable

OCRContinuationResumer is captured by reference inside a DispatchQueue.global(qos:).async closure, which requires its captured values to be Sendable in Swift 5.7+ with -strict-concurrency. The class uses NSLock to protect all mutable state, so it is safe for concurrent access, but without declaring @unchecked Sendable it may produce a warning (or error under stricter project settings) like "capture of 'resumer' with non-sendable type 'OCRContinuationResumer' in an isolated closure". Adding the conformance makes the intent explicit and future-proofs the code.

Fix in Codex Fix in Claude Code


struct ScreenTextExtractor: ScreenTextExtracting {
/// Vision cannot produce useful text from near-zero-sized request images. Treating those as
/// empty OCR keeps degenerate screenshots on the same unavailable-context path as blank windows.
private static let minimumOCRImageDimension = 4

let maxImageDimension: Int
let maxRecognizedCharacters: Int

Expand All @@ -72,13 +98,28 @@ struct ScreenTextExtractor: ScreenTextExtracting {
"downsampled=\(wasDownsampled)"
)

guard preparedImage.width >= Self.minimumOCRImageDimension,
preparedImage.height >= Self.minimumOCRImageDimension else {
log(
"ocr-skipped-too-small input=\(image.width)x\(image.height) " +
"prepared=\(preparedImage.width)x\(preparedImage.height)"
)
throw ScreenTextExtractionError.noRecognizedText
}

return try await withCheckedThrowingContinuation { continuation in
let resumer = OCRContinuationResumer()

DispatchQueue.global(qos: .userInitiated).async {
let request = VNRecognizeTextRequest { request, error in
if let error {
let elapsedMilliseconds = Int(Date().timeIntervalSince(startedAt) * 1000)
self.log("ocr-failed elapsed_ms=\(elapsedMilliseconds) reason=\(error.localizedDescription)")
continuation.resume(throwing: ScreenTextExtractionError.ocrFailed(error.localizedDescription))
resumer.resume {
let elapsedMilliseconds = Int(Date().timeIntervalSince(startedAt) * 1000)
self.log("ocr-failed elapsed_ms=\(elapsedMilliseconds) reason=\(error.localizedDescription)")
continuation.resume(
throwing: ScreenTextExtractionError.ocrFailed(error.localizedDescription)
)
}
return
}

Expand All @@ -105,25 +146,29 @@ struct ScreenTextExtractor: ScreenTextExtracting {
let cappedText = String(joinedText.prefix(maxRecognizedCharacters))

guard !cappedText.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty else {
let elapsedMilliseconds = Int(Date().timeIntervalSince(startedAt) * 1000)
self.log("ocr-empty elapsed_ms=\(elapsedMilliseconds) lines=\(recognizedLines.count)")
continuation.resume(throwing: ScreenTextExtractionError.noRecognizedText)
resumer.resume {
let elapsedMilliseconds = Int(Date().timeIntervalSince(startedAt) * 1000)
self.log("ocr-empty elapsed_ms=\(elapsedMilliseconds) lines=\(recognizedLines.count)")
continuation.resume(throwing: ScreenTextExtractionError.noRecognizedText)
}
return
}

let elapsedMilliseconds = Int(Date().timeIntervalSince(startedAt) * 1000)
self.log(
"ocr-success elapsed_ms=\(elapsedMilliseconds) lines=\(recognizedLines.count) chars=\(cappedText.count) " +
"preview=\(self.preview(cappedText))"
)

continuation.resume(
returning: ExtractedScreenText(
text: cappedText,
lineCount: recognizedLines.count,
lines: recognizedLines
resumer.resume {
let elapsedMilliseconds = Int(Date().timeIntervalSince(startedAt) * 1000)
self.log(
"ocr-success elapsed_ms=\(elapsedMilliseconds) lines=\(recognizedLines.count) chars=\(cappedText.count) " +
"preview=\(self.preview(cappedText))"
)

continuation.resume(
returning: ExtractedScreenText(
text: cappedText,
lineCount: recognizedLines.count,
lines: recognizedLines
)
)
)
}
}

// Accurate OCR is slower, but visual context is only captured once per focused
Expand All @@ -139,9 +184,13 @@ struct ScreenTextExtractor: ScreenTextExtracting {
let handler = VNImageRequestHandler(cgImage: preparedImage, options: [:])
try handler.perform([request])
} catch {
let elapsedMilliseconds = Int(Date().timeIntervalSince(startedAt) * 1000)
self.log("ocr-failed elapsed_ms=\(elapsedMilliseconds) reason=\(error.localizedDescription)")
continuation.resume(throwing: ScreenTextExtractionError.ocrFailed(error.localizedDescription))
resumer.resume {
let elapsedMilliseconds = Int(Date().timeIntervalSince(startedAt) * 1000)
self.log("ocr-failed elapsed_ms=\(elapsedMilliseconds) reason=\(error.localizedDescription)")
continuation.resume(
throwing: ScreenTextExtractionError.ocrFailed(error.localizedDescription)
)
}
}
}
}
Expand Down
59 changes: 59 additions & 0 deletions CotabbyTests/ScreenTextExtractorTests.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import CoreGraphics
import XCTest
@testable import Cotabby

/// Direct tests for the Vision-backed OCR service boundary.
///
/// Screenshot-context tests usually stub OCR so they can focus on orchestration. This file exists
/// separately because the crash happened inside `ScreenTextExtractor`'s callback-to-async bridge,
/// which only runs when the real Vision request path is exercised.
final class ScreenTextExtractorTests: XCTestCase {
func test_extractText_tinyImagesReturnNoRecognizedTextWithoutCrashing() async throws {
let extractor = ScreenTextExtractor()

for dimension in [1, 2] {
let image = try makeSolidImage(width: dimension, height: dimension)
await assertNoRecognizedText(from: image, extractor: extractor)
}
Comment on lines +14 to +17

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Tiny-image test misses the 3px boundary case

The guard threshold is >= 4, which means 1, 2, and 3 px all short-circuit to .noRecognizedText, but the loop only iterates over [1, 2]. A 3px input is the edge case closest to the threshold and is currently untested. Adding 3 to the array would cover all values immediately below the minimum without meaningfully increasing test runtime.

Note: If this suggestion doesn't match your team's coding style, reply to this and let me know. I'll remember it for next time!

Fix in Codex Fix in Claude Code

}

func test_extractText_blankNormalImageReturnsNoRecognizedText() async throws {
let image = try makeSolidImage(width: 128, height: 128)

await assertNoRecognizedText(from: image, extractor: ScreenTextExtractor())
}

private func assertNoRecognizedText(
from image: CGImage,
extractor: ScreenTextExtractor,
file: StaticString = #filePath,
line: UInt = #line
) async {
do {
_ = try await extractor.extractText(from: image)
XCTFail("Expected OCR to report no recognized text.", file: file, line: line)
} catch ScreenTextExtractionError.noRecognizedText {
// Expected: blank or degenerate screenshots should be treated as unavailable context.
} catch {
XCTFail("Expected noRecognizedText, got \(error).", file: file, line: line)
}
}

private func makeSolidImage(width: Int, height: Int) throws -> CGImage {
let colorSpace = CGColorSpaceCreateDeviceRGB()
let context = try XCTUnwrap(
CGContext(
data: nil,
width: width,
height: height,
bitsPerComponent: 8,
bytesPerRow: width * 4,
space: colorSpace,
bitmapInfo: CGImageAlphaInfo.premultipliedLast.rawValue
)
)
context.setFillColor(CGColor(gray: 1, alpha: 1))
context.fill(CGRect(x: 0, y: 0, width: width, height: height))
return try XCTUnwrap(context.makeImage())
}
}