diff --git a/.github/workflows/README.md b/.github/workflows/README.md new file mode 100644 index 0000000..e870fdb --- /dev/null +++ b/.github/workflows/README.md @@ -0,0 +1,83 @@ +# CI: Build & TestFlight + +Workflow `testflight.yml` archives the iOS app on the self-hosted +`Mac-mini-GK` runner (`self-hosted`, `macOS`, `ARM64`) and uploads the build +to TestFlight via the App Store Connect API. + +## One-time setup + +### 1. Apple Developer / App Store Connect + +1. In **Apple Developer → Certificates, Identifiers & Profiles**, register + both bundle IDs and enable the required capabilities: + - `com.truvvor.turnbridge` — App Groups (`group.com.truvvor.turnbridge`), + Network Extensions. + - `com.truvvor.turnbridge.network-extension` — App Groups + (`group.com.truvvor.turnbridge`), Network Extensions + (Packet Tunnel Provider). +2. In **App Store Connect → My Apps**, create the app record for + `com.truvvor.turnbridge` (needed before the first TestFlight upload). +3. In **App Store Connect → Users and Access → Integrations → App Store + Connect API**, create an API key with the **App Manager** role. Save the + downloaded `.p8` file — it is shown only once. + +### 2. Self-hosted runner (Mac-mini-GK) + +Make sure the runner has: + +- Xcode (matching the project's deployment target, iOS 16.6+) installed and + selected: `sudo xcode-select -s /Applications/Xcode.app`. +- Command line tools and a logged-in Apple ID in Xcode are **not** required — + signing is driven by the App Store Connect API key. +- Homebrew + Go (`brew install go`) — the `WireGuardKitGo` build phase needs + it. The script looks in `/opt/homebrew/bin`. +- The runner user's login keychain must be usable non-interactively (no + password prompt). The workflow does this itself: it unlocks the keychain with + the `MAC_KEYCHAIN_PASSWORD` repository secret (the runner user's macOS login + password) and runs `security set-key-partition-list`, so no manual unlock is needed. + +### 3.
GitHub repository secrets + +In `truvvor/turnbridge` → **Settings → Secrets and variables → Actions**, add: + +| Secret | Value | +| -------------------- | ------------------------------------------------------------- | +| `APPLE_TEAM_ID` | 10-character Team ID (e.g. `ABCDE12345`) | +| `ASC_ISSUER_ID` | Issuer ID UUID from App Store Connect → Integrations | +| `ASC_KEY_ID` | 10-character Key ID from the same page | +| `ASC_KEY_P8_BASE64` | `base64 -i AuthKey_XXXXXXXXXX.p8` output (single line, no wrap) | + +On macOS, generate the base64 secret with: + +```bash +base64 -i AuthKey_XXXXXXXXXX.p8 | pbcopy +``` + +## Triggering + +- Manually: **Actions → Build & TestFlight → Run workflow**, choose `upload` + to send to TestFlight or `build-only` to just produce the IPA artifact. +- Automatically: every push to `claude/build-project-br5tJ` runs the + workflow and uploads (no path filter is configured). + +## Build number + +`CFBundleVersion` is overridden to `100 + GITHUB_RUN_NUMBER` so each run is +strictly higher than the previous one. To raise the floor (e.g. after +manually uploading some builds outside CI), set repo variable +`TESTFLIGHT_BUILD_BASE` to a larger number. + +`MARKETING_VERSION` (1.3.29 today) stays as committed in +`TurnBridge.xcodeproj/project.pbxproj` — bump it there when releasing a new +TestFlight version family. + +## Notes + +- Code signing uses Xcode automatic signing with `-allowProvisioningUpdates` + + the App Store Connect API key. The ASC API key is enough — no `.p12` + cert or provisioning profile secrets needed. +- `ENABLE_USER_SCRIPT_SANDBOXING=NO` is passed at archive time because the + WireGuardKitGo build phase writes outside of its declared inputs/outputs. +- The IPA is also published as a workflow artifact (`TurnBridge-ipa`) so you + can download the exported IPA without re-running the upload + step.
diff --git a/.github/workflows/testflight.yml b/.github/workflows/testflight.yml new file mode 100644 index 0000000..7f9a5ed --- /dev/null +++ b/.github/workflows/testflight.yml @@ -0,0 +1,326 @@ +name: Build & TestFlight + +on: + workflow_dispatch: + inputs: + lane: + description: "What to do" + type: choice + default: upload + options: + - build-only + - upload + push: + branches: + - claude/build-project-br5tJ + +concurrency: + group: testflight-${{ github.ref }} + cancel-in-progress: true + +jobs: + build: + name: Archive & upload to TestFlight + runs-on: [self-hosted, macOS, ARM64] + + env: + SCHEME: TurnBridge + CONFIGURATION: Release + WORKSPACE_PROJECT: TurnBridge.xcodeproj + APP_BUNDLE_ID: com.truvvor.turnbridge + EXT_BUNDLE_ID: com.truvvor.turnbridge.network-extension + ARCHIVE_PATH: build/TurnBridge.xcarchive + EXPORT_DIR: build/export + DERIVED_DATA: build/DerivedData + + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Show toolchain + run: | + set -eux + sw_vers + xcode-select -p + xcodebuild -version + /usr/bin/which go && go version || true + /usr/bin/which make + + - name: Ensure Go is on PATH for build script + run: | + set -eux + if [ ! -x /opt/homebrew/bin/go ] && ! command -v go >/dev/null 2>&1; then + echo "Go not found on PATH or in /opt/homebrew/bin, installing via Homebrew" + env PATH="/opt/homebrew/bin:$PATH" brew install go + fi + echo "/opt/homebrew/bin" >> "$GITHUB_PATH" + + - name: Decide whether to upload + id: mode + run: | + set -eux + if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then + echo "upload=${{ github.event.inputs.lane == 'upload' }}" >> "$GITHUB_OUTPUT" + else + echo "upload=true" >> "$GITHUB_OUTPUT" + fi + + - name: Compute build number + id: ver + run: | + set -eux + # Derive a monotonically increasing build number from the run number. + # GITHUB_RUN_NUMBER is per-workflow and resets if the workflow is + # recreated, so add a base offset to stay above the App Store Connect + # high-water mark.
+ BASE=${{ vars.TESTFLIGHT_BUILD_BASE || 100 }} + BUILD_NUMBER=$((BASE + GITHUB_RUN_NUMBER)) + echo "build_number=$BUILD_NUMBER" >> "$GITHUB_OUTPUT" + echo "Using CFBundleVersion=$BUILD_NUMBER" + + - name: Provision App Store Connect API key + # No upload-only condition: the Archive step needs ASC_KEY_PATH in build-only runs too. + env: + ASC_KEY_ID: ${{ secrets.ASC_KEY_ID }} + ASC_KEY_P8_BASE64: ${{ secrets.ASC_KEY_P8_BASE64 }} + run: | + set -eu + if [ -z "${ASC_KEY_ID}" ] || [ -z "${ASC_KEY_P8_BASE64}" ]; then + echo "Missing ASC_KEY_ID or ASC_KEY_P8_BASE64 secret" >&2 + exit 1 + fi + KEY_DIR="$RUNNER_TEMP/asc_keys" + mkdir -p "$KEY_DIR" + KEY_PATH="$KEY_DIR/AuthKey_${ASC_KEY_ID}.p8" + printf '%s' "$ASC_KEY_P8_BASE64" | base64 --decode > "$KEY_PATH" + chmod 600 "$KEY_PATH" + echo "ASC_KEY_PATH=$KEY_PATH" >> "$GITHUB_ENV" + # Also place a copy where xcrun altool / Transporter look by default + mkdir -p "$HOME/.appstoreconnect/private_keys" + cp "$KEY_PATH" "$HOME/.appstoreconnect/private_keys/AuthKey_${ASC_KEY_ID}.p8" + chmod 600 "$HOME/.appstoreconnect/private_keys/AuthKey_${ASC_KEY_ID}.p8" + + - name: Sanity-check required secrets + env: + APPLE_TEAM_ID: ${{ secrets.APPLE_TEAM_ID }} + ASC_ISSUER_ID: ${{ secrets.ASC_ISSUER_ID }} + ASC_KEY_ID: ${{ secrets.ASC_KEY_ID }} + run: | + set -eu + missing=0 + for v in APPLE_TEAM_ID ASC_ISSUER_ID ASC_KEY_ID; do + if [ -z "$(printenv $v)" ]; then + echo "::error::Secret $v is not set" + missing=1 + fi + done + [ $missing -eq 0 ] + + - name: Unlock login keychain for codesign + env: + MAC_KEYCHAIN_PASSWORD: ${{ secrets.MAC_KEYCHAIN_PASSWORD }} + run: | + set -u + if [ -z "${MAC_KEYCHAIN_PASSWORD:-}" ]; then + echo "::error::MAC_KEYCHAIN_PASSWORD secret is not set" + echo "::error::Add the macOS login password of the runner user '$USER' as repo secret MAC_KEYCHAIN_PASSWORD" + exit 1 + fi + + KEYCHAIN_PATH="$HOME/Library/Keychains/login.keychain-db" + if [ !
-f "$KEYCHAIN_PATH" ]; then + KEYCHAIN_PATH=$(security list-keychains -d user | sed -e 's/^[[:space:]]*"//' -e 's/"$//' | head -n1) + fi + echo "Using keychain: $KEYCHAIN_PATH" + echo "Current user search list:" + security list-keychains -d user + + # 1. Unlock first — every other security op needs an unlocked keychain + security unlock-keychain -p "$MAC_KEYCHAIN_PASSWORD" "$KEYCHAIN_PATH" + rc=$? + echo "unlock-keychain rc=$rc" + if [ $rc -ne 0 ]; then + echo "::error::Failed to unlock keychain. The MAC_KEYCHAIN_PASSWORD secret does not match the password of $KEYCHAIN_PATH" + echo "::error::Verify by running on the runner: security unlock-keychain $KEYCHAIN_PATH (interactive). If that succeeds, refresh the secret value." + exit $rc + fi + + # 2. Keep the keychain unlocked for the rest of the build (6h) + security set-keychain-settings -lut 21600 "$KEYCHAIN_PATH" || \ + echo "::warning::set-keychain-settings rc=$? (non-fatal)" + + # 3. Make sure the login keychain is in the user search list (additive — do not drop System.keychain etc) + EXISTING=$(security list-keychains -d user | tr -d '"' | xargs) + if ! printf '%s\n' $EXISTING | grep -Fxq "$KEYCHAIN_PATH"; then + security list-keychains -d user -s $EXISTING "$KEYCHAIN_PATH" + echo "Added $KEYCHAIN_PATH to user search list" + fi + security default-keychain -d user -s "$KEYCHAIN_PATH" || true + + # 4. Allow codesign / productbuild / other Apple tools to access private keys without UI prompts + set +e + security set-key-partition-list \ + -S apple-tool:,apple:,codesign:,productbuild: \ + -s -k "$MAC_KEYCHAIN_PASSWORD" \ + "$KEYCHAIN_PATH" >/dev/null 2>&1 + rc=$? 
+ set -e + echo "set-key-partition-list rc=$rc" + if [ $rc -ne 0 ]; then + echo "::warning::set-key-partition-list rc=$rc — codesign may still prompt; continuing" + fi + + echo "Keychain ready" + + - name: Ensure Distribution cert and App Store profiles + env: + APPLE_TEAM_ID: ${{ secrets.APPLE_TEAM_ID }} + ASC_KEY_ID: ${{ secrets.ASC_KEY_ID }} + ASC_ISSUER_ID: ${{ secrets.ASC_ISSUER_ID }} + MAC_KEYCHAIN_PASSWORD: ${{ secrets.MAC_KEYCHAIN_PASSWORD }} + run: | + set -euxo pipefail + ruby --version + ruby script/ci_setup_signing.rb + + - name: Resolve Swift package dependencies + run: | + set -euxo pipefail + mkdir -p build/logs + xcodebuild \ + -project "$WORKSPACE_PROJECT" \ + -scheme "$SCHEME" \ + -configuration "$CONFIGURATION" \ + -derivedDataPath "$DERIVED_DATA" \ + -resolvePackageDependencies 2>&1 | tee build/logs/resolve.log + + - name: Archive + env: + APPLE_TEAM_ID: ${{ secrets.APPLE_TEAM_ID }} + ASC_KEY_ID: ${{ secrets.ASC_KEY_ID }} + ASC_ISSUER_ID: ${{ secrets.ASC_ISSUER_ID }} + run: | + set -euxo pipefail + mkdir -p build/logs + xcodebuild \ + -project "$WORKSPACE_PROJECT" \ + -scheme "$SCHEME" \ + -configuration "$CONFIGURATION" \ + -destination 'generic/platform=iOS' \ + -derivedDataPath "$DERIVED_DATA" \ + -archivePath "$ARCHIVE_PATH" \ + -allowProvisioningUpdates \ + -authenticationKeyPath "$ASC_KEY_PATH" \ + -authenticationKeyID "$ASC_KEY_ID" \ + -authenticationKeyIssuerID "$ASC_ISSUER_ID" \ + DEVELOPMENT_TEAM="$APPLE_TEAM_ID" \ + CODE_SIGN_STYLE=Automatic \ + CURRENT_PROJECT_VERSION="${{ steps.ver.outputs.build_number }}" \ + ENABLE_USER_SCRIPT_SANDBOXING=NO \ + archive 2>&1 | tee build/logs/archive.log + + - name: Tail archive log on failure + if: failure() + run: | + echo "::group::archive.log (last 400 lines)" + tail -n 400 build/logs/archive.log || true + echo "::endgroup::" + echo "::group::error/warning lines from archive.log" + grep -nE 'error:|warning:|Code Sign|Provisioning|fatal|failed|^\*\* ' build/logs/archive.log | tail -n 200 || true 
+ echo "::endgroup::" + + - name: Generate exportOptions.plist + env: + APPLE_TEAM_ID: ${{ secrets.APPLE_TEAM_ID }} + run: | + set -eu + cat > build/exportOptions.plist <<PLIST + <?xml version="1.0" encoding="UTF-8"?> + <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd"> + <plist version="1.0"> + <dict> + <key>method</key> + <string>app-store-connect</string> + <key>destination</key> + <string>export</string> + <key>signingStyle</key> + <string>manual</string> + <key>teamID</key> + <string>${APPLE_TEAM_ID}</string> + <key>signingCertificate</key> + <string>${SIGNING_CERT_SHA1}</string> + <key>provisioningProfiles</key> + <dict> + <key>${APP_BUNDLE_ID}</key> + <string>${PROFILE_APP_NAME}</string> + <key>${EXT_BUNDLE_ID}</key> + <string>${PROFILE_EXT_NAME}</string> + </dict> + <key>stripSwiftSymbols</key> + <true/> + <key>uploadSymbols</key> + <true/> + </dict> + </plist> + PLIST + echo "exportOptions.plist:" + cat build/exportOptions.plist + + - name: Export IPA + run: | + set -euxo pipefail + mkdir -p build/logs + xcodebuild \ + -exportArchive \ + -archivePath "$ARCHIVE_PATH" \ + -exportOptionsPlist build/exportOptions.plist \ + -exportPath "$EXPORT_DIR" 2>&1 | tee build/logs/export.log + ls -la "$EXPORT_DIR" + + - name: Upload build logs artifact + if: always() + uses: actions/upload-artifact@v4 + with: + name: build-logs + path: | + build/logs/** + build/exportOptions.plist + if-no-files-found: ignore + retention-days: 14 + + - name: Upload IPA artifact + if: success() + uses: actions/upload-artifact@v4 + with: + name: TurnBridge-ipa + path: build/export/*.ipa + if-no-files-found: error + retention-days: 14 + + - name: Upload to TestFlight + if: steps.mode.outputs.upload == 'true' + env: + ASC_KEY_ID: ${{ secrets.ASC_KEY_ID }} + ASC_ISSUER_ID: ${{ secrets.ASC_ISSUER_ID }} + run: | + set -eux + IPA_PATH=$(ls "$EXPORT_DIR"/*.ipa | head -n1) + if [ -z "$IPA_PATH" ]; then + echo "::error::No IPA produced in $EXPORT_DIR" + exit 1 + fi + xcrun altool \ + --upload-app \ + --type ios \ + --file "$IPA_PATH" \ + --apiKey "$ASC_KEY_ID" \ + --apiIssuer "$ASC_ISSUER_ID" + + - name: Cleanup ASC key + if: always() + run: | + set -eu + rm -f "$HOME/.appstoreconnect/private_keys/AuthKey_${{ secrets.ASC_KEY_ID }}.p8" || true + rm -rf "$RUNNER_TEMP/asc_keys" || true diff --git a/TurnBridge.xcodeproj/project.pbxproj b/TurnBridge.xcodeproj/project.pbxproj index 2a56d9c..fcb4f68
100755 --- a/TurnBridge.xcodeproj/project.pbxproj +++ b/TurnBridge.xcodeproj/project.pbxproj @@ -424,6 +424,7 @@ DEVELOPMENT_TEAM = ""; ENABLE_PREVIEWS = YES; GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_KEY_ITSAppUsesNonExemptEncryption = NO; INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; INFOPLIST_KEY_UILaunchScreen_Generation = YES; @@ -434,8 +435,8 @@ "$(inherited)", "@executable_path/Frameworks", ); - MARKETING_VERSION = 1.2.6; - PRODUCT_BUNDLE_IDENTIFIER = com.netlab.TurnBridge; + MARKETING_VERSION = 1.3.29; + PRODUCT_BUNDLE_IDENTIFIER = com.truvvor.turnbridge; PRODUCT_NAME = "$(TARGET_NAME)"; STRING_CATALOG_GENERATE_SYMBOLS = YES; SWIFT_APPROACHABLE_CONCURRENCY = YES; @@ -458,6 +459,7 @@ DEVELOPMENT_TEAM = ""; ENABLE_PREVIEWS = YES; GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_KEY_ITSAppUsesNonExemptEncryption = NO; INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; INFOPLIST_KEY_UILaunchScreen_Generation = YES; @@ -468,8 +470,8 @@ "$(inherited)", "@executable_path/Frameworks", ); - MARKETING_VERSION = 1.2.6; - PRODUCT_BUNDLE_IDENTIFIER = com.netlab.TurnBridge; + MARKETING_VERSION = 1.3.29; + PRODUCT_BUNDLE_IDENTIFIER = com.truvvor.turnbridge; PRODUCT_NAME = "$(TARGET_NAME)"; STRING_CATALOG_GENERATE_SYMBOLS = YES; SWIFT_APPROACHABLE_CONCURRENCY = YES; @@ -498,8 +500,8 @@ "@executable_path/Frameworks", "@executable_path/../../Frameworks", ); - MARKETING_VERSION = 1.2.6; - PRODUCT_BUNDLE_IDENTIFIER = "com.netlab.TurnBridge.network-extension"; + MARKETING_VERSION = 1.3.29; + PRODUCT_BUNDLE_IDENTIFIER = "com.truvvor.turnbridge.network-extension"; PRODUCT_NAME = "$(TARGET_NAME)"; SKIP_INSTALL = YES; STRING_CATALOG_GENERATE_SYMBOLS = YES; @@ -528,8 +530,8 @@ "@executable_path/Frameworks", "@executable_path/../../Frameworks", ); - MARKETING_VERSION = 1.2.6; - PRODUCT_BUNDLE_IDENTIFIER = 
"com.netlab.TurnBridge.network-extension"; + MARKETING_VERSION = 1.3.29; + PRODUCT_BUNDLE_IDENTIFIER = "com.truvvor.turnbridge.network-extension"; PRODUCT_NAME = "$(TARGET_NAME)"; SKIP_INSTALL = YES; STRING_CATALOG_GENERATE_SYMBOLS = YES; diff --git a/TurnBridge/CaptchaIPC.swift b/TurnBridge/CaptchaIPC.swift new file mode 100644 index 0000000..0b4230d --- /dev/null +++ b/TurnBridge/CaptchaIPC.swift @@ -0,0 +1,40 @@ +import Foundation + +/// IPC constants shared with the network extension's CaptchaBridge.swift. +/// Keep these strings in sync between the two targets. +enum CaptchaIPC { + static let appGroupID = "group.com.truvvor.turnbridge" + static let requestUserDefaultsKey = "captcha.pendingRequest" + static let requestDarwinNotification = "com.truvvor.turnbridge.captcha.request" + static let cancelDarwinNotification = "com.truvvor.turnbridge.captcha.cancel" + + struct AppMessage: Codable { + let type: String + let requestId: String + let successToken: String? + let reason: String? + /// New (1.3.24+): when the WebView replayed the failing VK API + /// call inside its own browser session and got the full JSON + /// response back, the app forwards that response here instead + /// of a bare success_token. The extension routes it through + /// TurnBridgeSubmitManualCaptchaResponse so getCreds skips its + /// own retry — VK never sees a session switch between captcha + /// solve and token redemption. Both fields are optional; the + /// extension prefers responseJson when present. + let responseJson: String? + } + + struct PendingRequest: Codable { + let requestId: String + let redirectUri: String + let createdAt: TimeInterval + /// New (1.3.24+): if non-empty, the WebView should POST + /// retryBody to retryUrl after extracting success_token, with + /// the literal string "__TOKEN__" inside retryBody replaced by + /// the actual token. The HTTP response is what the extension + /// wants back via AppMessage.responseJson. 
Empty fields mean + /// legacy "just send the token" path is fine. + let retryUrl: String? + let retryBody: String? + } +} diff --git a/TurnBridge/CaptchaManager.swift b/TurnBridge/CaptchaManager.swift new file mode 100644 index 0000000..98df26f --- /dev/null +++ b/TurnBridge/CaptchaManager.swift @@ -0,0 +1,159 @@ +import Foundation +import NetworkExtension +import UIKit +import Combine + +@MainActor +final class CaptchaManager: ObservableObject { + + static let shared = CaptchaManager() + + @Published var pending: CaptchaIPC.PendingRequest? + + private var registered = false + + private init() {} + + func start() { + guard !registered else { return } + registered = true + + // Darwin notification fired from the extension. + let name = CaptchaIPC.requestDarwinNotification as CFString + let observer = Unmanaged.passUnretained(self).toOpaque() + CFNotificationCenterAddObserver( + CFNotificationCenterGetDarwinNotifyCenter(), + observer, + { _, observer, _, _, _ in + guard let observer = observer else { return } + let mgr = Unmanaged.fromOpaque(observer).takeUnretainedValue() + Task { @MainActor in mgr.refresh() } + }, + name, + nil, + .deliverImmediately + ) + + // Cancel notification fired from the extension when the tunnel + // stops: the pending prompt is unanswerable, drop it so the + // sheet dismisses instead of trapping the user on a dead page. + let cancelName = CaptchaIPC.cancelDarwinNotification as CFString + CFNotificationCenterAddObserver( + CFNotificationCenterGetDarwinNotifyCenter(), + observer, + { _, observer, _, _, _ in + guard let observer = observer else { return } + let mgr = Unmanaged.fromOpaque(observer).takeUnretainedValue() + Task { @MainActor in mgr.refresh() } + }, + cancelName, + nil, + .deliverImmediately + ) + + // Also refresh on becoming active in case the notification arrived while + // the app was suspended. 
+ NotificationCenter.default.addObserver( + forName: UIApplication.didBecomeActiveNotification, + object: nil, + queue: .main + ) { [weak self] _ in + guard let self = self else { return } + Task { @MainActor in self.refresh() } + } + + refresh() + } + + func refresh() { + guard let defaults = UserDefaults(suiteName: CaptchaIPC.appGroupID), + let data = defaults.data(forKey: CaptchaIPC.requestUserDefaultsKey), + let req = try? JSONDecoder().decode(CaptchaIPC.PendingRequest.self, from: data) else { + pending = nil + return + } + // Drop stale requests (5 minutes). + if Date().timeIntervalSince1970 - req.createdAt > 300 { + clearPending() + return + } + if pending?.requestId != req.requestId { + SharedLogger.info("Captcha UI: picked up pending request \(req.requestId)", source: .app) + } + pending = req + } + + /// WebView extracted just the success_token (legacy or fallback + /// when the in-WebView retry failed). The extension will do the + /// VK API retry itself — VK may reject because of session + /// mismatch. + /// + /// clearPending() runs BEFORE sendMessage so the sheet dismisses + /// the instant the WebView reports a result, even if the IPC + /// (NETunnelProviderSession.sendProviderMessage) is slow or + /// hangs for some reason. We've observed sheets getting stuck + /// on "Got response, finishing…" when the IPC took its time; + /// users had no way out except kill the app. + func submit(token: String) async { + guard let req = pending else { return } + let msg = CaptchaIPC.AppMessage(type: "captcha_answer", + requestId: req.requestId, + successToken: token, + reason: nil, + responseJson: nil) + clearPending() + await sendMessage(msg) + } + + /// WebView solved the captcha AND replayed the failing VK API + /// call in the same browser session. This is what we want: VK + /// sees a single coherent session for both the solve and the + /// redemption, no fingerprint switch. 
+ func submit(response: String) async { + guard let req = pending else { return } + let msg = CaptchaIPC.AppMessage(type: "captcha_answer", + requestId: req.requestId, + successToken: nil, + reason: nil, + responseJson: response) + clearPending() + await sendMessage(msg) + } + + func cancel(reason: String = "user cancelled") async { + guard let req = pending else { return } + let msg = CaptchaIPC.AppMessage(type: "captcha_cancel", + requestId: req.requestId, + successToken: nil, + reason: reason, + responseJson: nil) + clearPending() + await sendMessage(msg) + } + + // MARK: - Private + + private func clearPending() { + UserDefaults(suiteName: CaptchaIPC.appGroupID)? + .removeObject(forKey: CaptchaIPC.requestUserDefaultsKey) + pending = nil + } + + private func sendMessage(_ msg: CaptchaIPC.AppMessage) async { + do { + let managers = try await NETunnelProviderManager.loadAllFromPreferences() + guard let session = managers.first?.connection as? NETunnelProviderSession else { + SharedLogger.warning("Captcha UI: no active tunnel session to deliver answer", source: .app) + return + } + let payload = try JSONEncoder().encode(msg) + try session.sendProviderMessage(payload) { reply in + if let reply = reply, let text = String(data: reply, encoding: .utf8) { + SharedLogger.debug("Captcha UI: extension reply \(text)", source: .app) + } + } + } catch { + SharedLogger.error("Captcha UI: failed to deliver answer: \(error.localizedDescription)", source: .app) + } + } +} diff --git a/TurnBridge/CaptchaStatsBadge.swift b/TurnBridge/CaptchaStatsBadge.swift new file mode 100644 index 0000000..0815e05 --- /dev/null +++ b/TurnBridge/CaptchaStatsBadge.swift @@ -0,0 +1,162 @@ +import SwiftUI +import Combine + +/// Polls captcha solve counters published by PacketTunnelProvider into +/// the App Group's shared UserDefaults. Surfaces two numbers in the UI: +/// +/// "Direct" — captchas solved from the user's mobile IP. Bounded +/// above by VK's per-IP rate-limit (~16 in practice). 
+/// "Tunnel" — captchas solved from the WG server's egress IP, which +/// kicks in for sessions spawned AFTER WG handshake +/// completes through the bootstrap fleet. Independent +/// budget from direct, so e.g. N=30 can yield 16 direct +/// + 14 tunnel and all 30 sessions come up. +@MainActor +final class CaptchaStatsState: ObservableObject { + @Published private(set) var direct: Int = 0 + @Published private(set) var tunnel: Int = 0 + @Published private(set) var remote: Int = 0 + @Published private(set) var directAttempts: Int = 0 + @Published private(set) var tunnelAttempts: Int = 0 + @Published private(set) var remoteAttempts: Int = 0 + @Published private(set) var directInFlight: Int = 0 + @Published private(set) var tunnelInFlight: Int = 0 + @Published private(set) var remoteInFlight: Int = 0 + @Published private(set) var sessionsReady: Int = 0 + @Published private(set) var sessionsTarget: Int = 0 + @Published private(set) var directSaturated: Bool = false + @Published private(set) var tunnelSaturated: Bool = false + + private var timer: AnyCancellable? 
+ + func start() { + guard timer == nil else { return } + refresh() + timer = Timer.publish(every: 1, on: .main, in: .common) + .autoconnect() + .sink { [weak self] _ in self?.refresh() } + } + + func stop() { + timer?.cancel() + timer = nil + direct = 0 + tunnel = 0 + remote = 0 + directAttempts = 0 + tunnelAttempts = 0 + remoteAttempts = 0 + directInFlight = 0 + tunnelInFlight = 0 + remoteInFlight = 0 + sessionsReady = 0 + sessionsTarget = 0 + directSaturated = false + tunnelSaturated = false + } + + private func refresh() { + guard let defaults = UserDefaults(suiteName: CaptchaIPC.appGroupID) else { + return + } + direct = defaults.integer(forKey: "captchaDirectCount") + tunnel = defaults.integer(forKey: "captchaTunnelCount") + remote = defaults.integer(forKey: "captchaRemoteCount") + directAttempts = defaults.integer(forKey: "captchaDirectAttempts") + tunnelAttempts = defaults.integer(forKey: "captchaTunnelAttempts") + remoteAttempts = defaults.integer(forKey: "captchaRemoteAttempts") + directInFlight = defaults.integer(forKey: "captchaDirectInFlight") + tunnelInFlight = defaults.integer(forKey: "captchaTunnelInFlight") + remoteInFlight = defaults.integer(forKey: "captchaRemoteInFlight") + sessionsReady = defaults.integer(forKey: "sessionsReady") + sessionsTarget = defaults.integer(forKey: "sessionsTarget") + directSaturated = defaults.bool(forKey: "captchaDirectSaturated") + tunnelSaturated = defaults.bool(forKey: "captchaTunnelSaturated") + } +} + +struct CaptchaStatsBadge: View { + @ObservedObject var stats: CaptchaStatsState + + var body: some View { + VStack(spacing: 6) { + // Top row: Sessions ready/target — the single most useful + // number while connecting (are we making progress?).
+ HStack(spacing: 6) { + Image(systemName: "antenna.radiowaves.left.and.right") + .font(.system(size: 11)) + .foregroundColor(.secondary) + Text("\(stats.sessionsReady)/\(stats.sessionsTarget) sessions ready") + .font(.system(size: 13, weight: .medium, design: .rounded)) + .foregroundColor(.primary) + } + + Divider().padding(.horizontal, 4) + + // Bottom row: per-egress counters with in-flight indicators. + // Three buckets — Direct (phone IP), Tunnel (WG egress + // routed through utun), Server (captcha-service cluster). + // The Server cell is the entire reason the total session + // count exceeds Direct+Tunnel: server-side solves never + // touch this phone's HTTP stack so they don't show up in + // the other two buckets. With multi-link configured, the + // Server cell is typically the largest of the three. + HStack(spacing: 10) { + cell(label: "Direct", + ok: stats.direct, + attempts: stats.directAttempts, + inFlight: stats.directInFlight, + saturated: stats.directSaturated, + accent: .blue) + Divider().frame(height: 30) + cell(label: "Tunnel", + ok: stats.tunnel, + attempts: stats.tunnelAttempts, + inFlight: stats.tunnelInFlight, + saturated: stats.tunnelSaturated, + accent: .green) + Divider().frame(height: 30) + cell(label: "Server", + ok: stats.remote, + attempts: stats.remoteAttempts, + inFlight: stats.remoteInFlight, + saturated: false, + accent: .purple) + } + } + .padding(.horizontal, 14) + .padding(.vertical, 8) + .background(.regularMaterial) + .clipShape(RoundedRectangle(cornerRadius: 12)) + .overlay( + RoundedRectangle(cornerRadius: 12) + .strokeBorder(Color.secondary.opacity(0.25), lineWidth: 1) + ) + } + + private func cell(label: String, ok: Int, attempts: Int, inFlight: Int, saturated: Bool, accent: Color) -> some View { + VStack(spacing: 1) { + HStack(spacing: 4) { + Text("\(ok)") + .font(.system(size: 18, weight: .semibold, design: .rounded)) + .foregroundColor(accent) + Text("/\(attempts)") + .font(.system(size: 12, weight: .regular, design: 
.rounded)) + .foregroundColor(.secondary) + if inFlight > 0 { + Text("·\(inFlight)⟳") + .font(.system(size: 11, weight: .medium, design: .rounded)) + .foregroundColor(.orange) + } + if saturated { + Image(systemName: "exclamationmark.octagon.fill") + .font(.system(size: 11)) + .foregroundColor(.orange) + } + } + Text(label) + .font(.system(size: 11, weight: .medium, design: .rounded)) + .foregroundColor(.secondary) + } + } +} diff --git a/TurnBridge/CaptchaWebView.swift b/TurnBridge/CaptchaWebView.swift new file mode 100644 index 0000000..c146430 --- /dev/null +++ b/TurnBridge/CaptchaWebView.swift @@ -0,0 +1,664 @@ +import SwiftUI +import WebKit +import Combine + +/// Sheet that loads the VK captcha page in a WKWebView, watches XHR / URL +/// activity for a `success_token`, and reports the result back via +/// CaptchaManager. +struct CaptchaWebView: View { + let redirectUri: String + /// When non-nil, the injected JS will, after extracting + /// success_token, replay this request inside the WebView's session + /// (POST `retryBody` with literal "__TOKEN__" swapped for the + /// actual token, to `retryUrl`). The full JSON response goes back + /// via `onResponse`. nil = legacy token-only flow. + let retryUrl: String? + let retryBody: String? 
+ @ObservedObject var manager: CaptchaManager = .shared + @Environment(\.dismiss) private var dismiss + + @State private var status: String = "Solve the VK challenge below" + @State private var didFinish = false + + var body: some View { + NavigationView { + VStack(spacing: 0) { + Text(status) + .font(.footnote) + .foregroundStyle(.secondary) + .padding(.horizontal) + .padding(.vertical, 8) + .frame(maxWidth: .infinity) + .background(Color(.secondarySystemBackground)) + + CaptchaWKWebView( + url: URL(string: redirectUri), + retryUrl: retryUrl, + retryBody: retryBody, + onToken: { token in + guard !didFinish else { return } + didFinish = true + status = "Got token, finishing…" + Task { + await manager.submit(token: token) + dismiss() + } + }, + onResponse: { responseJson in + guard !didFinish else { return } + didFinish = true + status = "Got response, finishing…" + Task { + await manager.submit(response: responseJson) + dismiss() + } + }, + onTerminal: { reason in + // VK rendered a terminal failure page ("Attempt + // limit reached" etc). No way for the user to + // recover from inside the sheet — cancel and + // let Go bail to identity recycling. + guard !didFinish else { return } + didFinish = true + status = "VK refused: \(reason.prefix(80))" + SharedLogger.info("CaptchaWebView terminal page detected: \(reason)", source: .app) + Task { + await manager.cancel(reason: "vk terminal: \(reason.prefix(120))") + dismiss() + } + }, + onStatus: { s in status = s } + ) + .frame(maxWidth: .infinity, maxHeight: .infinity) + } + .frame(maxWidth: .infinity, maxHeight: .infinity) + .navigationTitle("Verify human") + .navigationBarTitleDisplayMode(.inline) + .toolbar { + ToolbarItem(placement: .cancellationAction) { + Button("Cancel") { + // Cancel ALWAYS works, even if didFinish is + // already true. 
This is the user's escape + // hatch when something downstream wedges — + // we'd rather double-cancel (idempotent on + // both Swift and Go sides) than trap the + // user staring at a frozen sheet. Log BEFORE setting the flag so the message records the prior state. + SharedLogger.info("CaptchaWebView: user pressed Cancel (state.didFinish was \(didFinish))", source: .app) + didFinish = true + Task { + await manager.cancel() + dismiss() + } + } + } + } + .onAppear { + SharedLogger.info("Captcha sheet appeared. redirect_uri=\(redirectUri)", source: .app) + // Watchdog: if nothing — solve, terminal, user cancel + // — happens in 175 s, cancel ourselves. Go's + // requestManualCaptcha times out at 180 s; firing 5 s + // earlier on our side means the UI never lingers past + // a backend already-gave-up state. + Task { + try? await Task.sleep(nanoseconds: 175_000_000_000) + guard !didFinish else { return } + didFinish = true + status = "Timed out waiting for solve" + SharedLogger.warning("CaptchaWebView watchdog timeout (175 s) — auto-cancelling", source: .app) + await manager.cancel(reason: "ui watchdog timeout") + dismiss() + } + } + } + .navigationViewStyle(.stack) + } +} + +private struct CaptchaWKWebView: UIViewRepresentable { + let url: URL? + let retryUrl: String? + let retryBody: String? + let onToken: (String) -> Void + let onResponse: (String) -> Void + let onTerminal: (String) -> Void + let onStatus: (String) -> Void + + func makeCoordinator() -> Coordinator { + Coordinator(onToken: onToken, + onResponse: onResponse, + onTerminal: onTerminal, + onStatus: onStatus) + } + + func makeUIView(context: Context) -> WKWebView { + let userContent = WKUserContentController() + userContent.add(context.coordinator, name: "captcha") + + // Inject the bot-tell scrubbers BEFORE any page JS runs. + // - safariUAOverride: navigator.* spoofing so JS-visible UA + // matches the HTTP UA set via customUserAgent.
+ // - retry config: small script that defines window.__capRetry + // with the URL + body template before the captcha helper + // reads it. JSON-stringified to handle the body's special + // chars safely. + // - injectedJS: the helper that hooks fetch/XHR for + // success_token and either fires onResponse (if retry params + // are present and the in-WebView retry succeeded) or + // onToken (legacy). + userContent.addUserScript(WKUserScript( + source: Self.safariUAOverride, + injectionTime: .atDocumentStart, + forMainFrameOnly: false + )) + if let url = retryUrl, let body = retryBody, !url.isEmpty { + let urlJSON = Self.jsString(url) + let bodyJSON = Self.jsString(body) + let retryScript = "window.__capRetry = {url: \(urlJSON), body: \(bodyJSON)};" + userContent.addUserScript(WKUserScript( + source: retryScript, + injectionTime: .atDocumentStart, + forMainFrameOnly: false + )) + } + userContent.addUserScript(WKUserScript( + source: Self.injectedJS, + injectionTime: .atDocumentStart, + forMainFrameOnly: false + )) + + let config = WKWebViewConfiguration() + config.userContentController = userContent + if #available(iOS 14.0, *) { + config.defaultWebpagePreferences.allowsContentJavaScript = true + } + // Persistent data store: VK's classifier treats a captcha + // session with zero prior vk.com cookies / localStorage as a + // signal of a freshly-spun-up automation environment. Sharing + // state across captcha sheets within the app gives real users + // the same "I've been here before" signal that web Safari has. + // We don't share with the system Safari (that requires + // ASWebAuthenticationSession), but app-scoped persistence is + // enough for the classifier. + config.websiteDataStore = .default() + + let webView = WKWebView(frame: UIScreen.main.bounds, configuration: config) + // Send the EXACT Mobile Safari UA. WKWebView's default UA is + // missing the "Version/X.Y Safari/604.1" suffix — that gap is + // one of the cheapest bot tells VK has. 
iOS 18 is the current + // production version; if Apple ships 19 the suffix updates + // organically but anything in the 17-18-19 range matches what + // VK sees from real Safari iOS users. + webView.customUserAgent = "Mozilla/5.0 (iPhone; CPU iPhone OS 18_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.0 Mobile/15E148 Safari/604.1" + webView.navigationDelegate = context.coordinator + webView.allowsBackForwardNavigationGestures = true + webView.backgroundColor = .systemBackground + webView.scrollView.backgroundColor = .systemBackground + webView.isOpaque = true + + if let url = url { + context.coordinator.captchaURL = url + // Cookie / state warm-up. VK's classifier reads a captcha + // session with zero prior vk.com cookies + localStorage as a + // freshly-spun-up automation environment — exactly the BOT + // signal we're trying to avoid on the first hand-solved + // sessions. So before navigating to the captcha, briefly + // load m.vk.com so the persistent data store picks up the + // organic "I've been here before" state a real Safari user + // would have. The real captcha load is kicked off from the + // warm-up's didFinish (or a 3 s hard cap, whichever first) + // so a blocked / slow warm-up never strands the user. 
+ SharedLogger.info("CaptchaWebView warming up vk.com cookies before captcha load", source: .app) + onStatus("Preparing…") + if let warmURL = URL(string: "https://m.vk.com/") { + webView.load(URLRequest(url: warmURL)) + DispatchQueue.main.asyncAfter(deadline: .now() + 3.0) { [weak webView] in + guard let webView = webView else { return } + context.coordinator.loadCaptchaIfNeeded(webView, reason: "warmup cap 3s") + } + } else { + context.coordinator.loadCaptchaIfNeeded(webView, reason: "no warmup url") + } + } else { + SharedLogger.error("CaptchaWebView: URL is nil — won't load", source: .app) + onStatus("Bad captcha URL") + } + return webView + } + + func updateUIView(_ uiView: WKWebView, context: Context) {} + + final class Coordinator: NSObject, WKNavigationDelegate, WKScriptMessageHandler { + let onToken: (String) -> Void + let onResponse: (String) -> Void + let onStatus: (String) -> Void + + let onTerminal: (String) -> Void + + /// The real captcha URL, loaded only AFTER the cookie warm-up + /// (or its 3 s cap) so VK sees an aged vk.com session rather + /// than a cold one. Set in makeUIView. + var captchaURL: URL? + private var captchaLoadStarted = false + + init(onToken: @escaping (String) -> Void, + onResponse: @escaping (String) -> Void, + onTerminal: @escaping (String) -> Void, + onStatus: @escaping (String) -> Void) { + self.onToken = onToken + self.onResponse = onResponse + self.onTerminal = onTerminal + self.onStatus = onStatus + } + + /// Navigate to the real captcha page exactly once. Called from + /// the warm-up's didFinish and from a 3 s fallback timer — + /// whichever fires first wins; the flag makes the loser a no-op. 
+ func loadCaptchaIfNeeded(_ webView: WKWebView, reason: String) { + guard !captchaLoadStarted, let url = captchaURL else { return } + captchaLoadStarted = true + SharedLogger.info("CaptchaWebView loading captcha after warmup (\(reason)): \(url.absoluteString)", source: .app) + onStatus("Loading…") + webView.load(URLRequest(url: url)) + } + + func userContentController(_ userContentController: WKUserContentController, + didReceive message: WKScriptMessage) { + guard message.name == "captcha", + let body = message.body as? [String: Any], + let type = body["type"] as? String else { return } + switch type { + case "final_response": + // Preferred path: WebView did the VK API replay + // inside the same browser session that solved the + // captcha, and the response is the raw JSON the + // extension would have gotten by doing the redemption + // itself. No session switch. + if let json = body["json"] as? String, !json.isEmpty { + onResponse(json) + } + case "success_token": + // Legacy / fallback path: the in-WebView retry didn't + // happen (no retry params, fetch threw, etc). Pass + // the raw token; the extension does the retry from Go + // and hopes VK accepts the session switch. + if let token = body["token"] as? String, !token.isEmpty { + onToken(token) + } + case "status": + if let s = body["text"] as? String { + // Mirror WebView-side status into the shared log so a + // device sysdiagnose shows whether the in-session + // replay actually produced a final_response or quietly + // fell back to the bot-prone raw-token path (CORS/CSP + // on the cross-origin api.vk.com fetch is the usual + // culprit). See injectedJS handleSuccessToken. + SharedLogger.debug("CaptchaWebView JS: \(s)", source: .app) + onStatus(s) + } + case "terminal": + // Server-rendered failure page ("Attempt limit + // reached" etc). The captcha isn't going anywhere, + // and the user has no way to recover from inside + // the sheet — close it. 
The Cancel path in the + // CaptchaManager fires the cancel IPC so Go's + // requestManualCaptcha unblocks immediately rather + // than waiting for its 180s timeout. + let reason = (body["reason"] as? String) ?? "terminal page" + onTerminal(reason) + default: + break + } + } + + func webView(_ webView: WKWebView, + decidePolicyFor navigationAction: WKNavigationAction, + decisionHandler: @escaping (WKNavigationActionPolicy) -> Void) { + // Watch top-level navigations for `?success_token=...` or + // `#success_token=...` — some flows put it in the URL. + if let url = navigationAction.request.url { + let token = tokenFromURL(url) + if !token.isEmpty { + onToken(token) + decisionHandler(.cancel) + return + } + } + decisionHandler(.allow) + } + + func webView(_ webView: WKWebView, didStartProvisionalNavigation navigation: WKNavigation!) { + onStatus("Loading captcha…") + } + + func webView(_ webView: WKWebView, didFinish navigation: WKNavigation!) { + SharedLogger.info("CaptchaWebView: page finished loading: \(webView.url?.absoluteString ?? "?")", source: .app) + // The first finished navigation is the cookie warm-up + // (m.vk.com). Now that the data store has organic vk.com + // state, navigate to the real captcha. Subsequent didFinish + // calls (the captcha page itself) are no-ops via the flag. 
+ if captchaURL != nil { + loadCaptchaIfNeeded(webView, reason: "warmup finished") + } + onStatus("Solve the VK challenge below") + } + + func webView(_ webView: WKWebView, + didFail navigation: WKNavigation!, + withError error: Error) { + SharedLogger.error("CaptchaWebView: navigation failed: \(error.localizedDescription)", source: .app) + onStatus("Failed: \(error.localizedDescription)") + } + + func webView(_ webView: WKWebView, + didFailProvisionalNavigation navigation: WKNavigation!, + withError error: Error) { + SharedLogger.error("CaptchaWebView: provisional navigation failed: \(error.localizedDescription)", source: .app) + onStatus("Failed: \(error.localizedDescription)") + } + + private func tokenFromURL(_ url: URL) -> String { + if let comps = URLComponents(url: url, resolvingAgainstBaseURL: false), + let item = comps.queryItems?.first(where: { $0.name == "success_token" }), + let v = item.value { + return v + } + if let fragment = url.fragment { + for part in fragment.split(separator: "&") { + let kv = part.split(separator: "=", maxSplits: 1).map(String.init) + if kv.count == 2, kv[0] == "success_token" { + return kv[1].removingPercentEncoding ?? kv[1] + } + } + } + return "" + } + } + + /// JSON-quote a Swift string for embedding into JS source code. + /// Handles backslashes, quotes, newlines, etc. + fileprivate static func jsString(_ s: String) -> String { + let data = try? JSONSerialization.data(withJSONObject: [s], options: []) + guard let data = data, + let json = String(data: data, encoding: .utf8) else { + return "\"\"" + } + // Strip the surrounding [ ... ] so we get just the quoted string. + let inner = json.dropFirst().dropLast() + // Belt-and-braces: escape " + // tag if the source ever ends up embedded in HTML directly. 
+ return String(inner).replacingOccurrences(of: " ua, configurable: true }); + } catch (e) {} + try { + Object.defineProperty(navigator, 'appVersion', { get: () => ua.replace(/^Mozilla\\//, ''), configurable: true }); + } catch (e) {} + try { + Object.defineProperty(navigator, 'vendor', { get: () => 'Apple Computer, Inc.', configurable: true }); + } catch (e) {} + try { + Object.defineProperty(navigator, 'platform', { get: () => 'iPhone', configurable: true }); + } catch (e) {} + try { + Object.defineProperty(navigator, 'languages', { get: () => ['en-US','en'], configurable: true }); + } catch (e) {} + // Touch surface: real Mobile Safari on iPhone reports + // maxTouchPoints = 5. WKWebView sometimes reports 0/1, which is + // an obvious "this isn't a phone browser" tell to a fingerprinter. + try { + Object.defineProperty(navigator, 'maxTouchPoints', { get: () => 5, configurable: true }); + } catch (e) {} + // Real Safari iOS doesn't expose userAgentData (Client Hints). + // WKWebView under some configurations does — strip it to match. + try { delete navigator.userAgentData; } catch (e) {} + // Drop the webdriver flag entirely. Real Safari has no such + // property; WKWebView sets it (usually false). Presence ≠ + // absence to a fingerprinter. + try { delete navigator.webdriver; } catch (e) {} + try { + Object.defineProperty(navigator, 'webdriver', { get: () => undefined, configurable: true }); + } catch (e) {} + })(); + """ + + // Injected as document-start so we patch fetch/XHR before VK's page code + // gets a chance to fire. Looks for any response from `captchaNotRobot.*` + // that carries `success_token`, and also polls the URL / page text as a + // belt-and-braces fallback. + private static let injectedJS = """ + (function() { + function send(payload) { + try { window.webkit.messageHandlers.captcha.postMessage(payload); } catch (e) {} + } + + // Once true, no more sends — first solve wins. 
+ let solved = false; + + // When the captcha helper grabs success_token, this runs. + // If window.__capRetry is set (retryUrl + retryBody from the + // extension), do the follow-up VK API call inside this + // browser session — same cookies, same TLS, same IP, no + // session switch for VK to flag. Send the JSON response as + // 'final_response'. Fall back to sending the raw token on any + // hiccup so the legacy Go-side retry still gets a chance. + function handleSuccessToken(token) { + if (solved) return; + solved = true; + const cfg = window.__capRetry; + if (!cfg || !cfg.url || !cfg.body) { + send({type: 'success_token', token: token}); + return; + } + const body = cfg.body.replace(/__TOKEN__/g, encodeURIComponent(token)); + send({type: 'status', text: 'replay: redeeming token in-session (POST ' + cfg.url + ')'}); + fetch(cfg.url, { + method: 'POST', + credentials: 'include', + headers: {'Content-Type': 'application/x-www-form-urlencoded'}, + body: body + }).then(function(r) { return r.text(); }) + .then(function(text) { + if (text && text.length > 0) { + send({type: 'status', text: 'replay OK: final_response (' + text.length + ' bytes) — single coherent session'}); + send({type: 'final_response', json: text}); + } else { + // 2xx but empty body — treat as replay miss so the + // log makes the silent-degrade-to-raw-token explicit. + send({type: 'status', text: 'replay FALLBACK: empty response body → raw token (Go will redeem, session switch risk)'}); + send({type: 'success_token', token: token}); + } + }) + .catch(function(e) { + // The common cause here is the cross-origin fetch to + // api.vk.com being blocked by the captcha page's CSP + // connect-src or by missing CORS — which silently + // demotes us to the bot-prone Go redemption path. 
+ send({type: 'status', text: 'replay FALLBACK: fetch threw (' + e + ') → raw token (likely CORS/CSP; Go will redeem, session switch risk)'}); + send({type: 'success_token', token: token}); + }); + } + + function maybeTokenFromText(text) { + if (!text) return null; + try { + const json = JSON.parse(text); + if (json && json.response && json.response.success_token) { + return json.response.success_token; + } + } catch (e) {} + const m = String(text).match(/"success_token"\\s*:\\s*"([^"]+)"/); + return m ? m[1] : null; + } + + // Diagnostic: surface VK's verdict on every captchaNotRobot.* + // response, even when there's no success_token. status:BOT here + // means the solve was rejected IN the WebView (fingerprint / + // dirty IP) — a different failure from "token captured but + // redemption fell back". Without this the sheet just hangs to + // the watchdog and the device log says nothing. Deduped so the + // 250 ms pollers don't spam identical lines. + let verdictLogged = {}; + function logVerdict(url, text) { + try { + if (!url || String(url).indexOf('captchaNotRobot') === -1) return; + const after = String(url).split('captchaNotRobot.')[1] || ''; + const tag = after.split('?')[0].split('&')[0].split('/')[0]; + let json = null; try { json = JSON.parse(text); } catch (e) {} + const r = (json && (json.response || json)) || {}; + let status = r.status || ''; + if (!status && json && json.error) status = 'error:' + (json.error.error_code || '?'); + const showType = r.show_captcha_type || r.show_type || ''; + const key = tag + '|' + status + '|' + showType; + if (verdictLogged[key]) return; + verdictLogged[key] = true; + send({type: 'status', text: 'verdict ' + tag + ': status=' + (status || '?') + (showType ? (' show_type=' + showType) : '')}); + } catch (e) {} + } + + // fetch hook + const origFetch = window.fetch; + if (origFetch) { + window.fetch = function(input, init) { + const url = (typeof input === 'string') ? 
input : (input && input.url) || ''; + const p = origFetch.apply(this, arguments); + if (url && url.indexOf('captchaNotRobot') !== -1) { + p.then(function(res) { + try { + res.clone().text().then(function(text) { + logVerdict(url, text); + const t = maybeTokenFromText(text); + if (t) handleSuccessToken(t); + }); + } catch (e) {} + }).catch(function() {}); + } + return p; + }; + } + + // XHR hook. Two hooks because VK sites use both: + // - Override of xhr.onreadystatechange catches code that + // sets the handler via property assignment. + // - addEventListener('load', ...) catches code that + // subscribes via the event listener API. Without this + // second hook, sites that prefer addEventListener (which + // is increasingly the norm for SPA frameworks) fire their + // handlers without our knowledge — we miss the response + // and the captcha sheet looks stuck even though VK has + // already issued the success_token. + const origOpen = XMLHttpRequest.prototype.open; + const origSend = XMLHttpRequest.prototype.send; + XMLHttpRequest.prototype.open = function(method, url) { + this.__cap_url = url; + return origOpen.apply(this, arguments); + }; + XMLHttpRequest.prototype.send = function() { + const xhr = this; + // load-event hook (independent of any onreadystatechange). + try { + xhr.addEventListener('load', function() { + try { + if (xhr.__cap_url && + String(xhr.__cap_url).indexOf('captchaNotRobot') !== -1) { + logVerdict(xhr.__cap_url, xhr.responseText); + const t = maybeTokenFromText(xhr.responseText); + if (t) handleSuccessToken(t); + } + } catch (e) {} + }); + } catch (e) {} + // onreadystatechange wrap (catches direct property assignment). 
+ const prev = xhr.onreadystatechange; + xhr.onreadystatechange = function() { + if (xhr.readyState === 4 && xhr.__cap_url && + String(xhr.__cap_url).indexOf('captchaNotRobot') !== -1) { + logVerdict(xhr.__cap_url, xhr.responseText); + const t = maybeTokenFromText(xhr.responseText); + if (t) handleSuccessToken(t); + } + if (typeof prev === 'function') return prev.apply(this, arguments); + }; + return origSend.apply(this, arguments); + }; + + // postMessage relay + window.addEventListener('message', function(ev) { + try { + const data = ev.data; + if (data && typeof data === 'object') { + if (data.success_token) handleSuccessToken(data.success_token); + if (data.type === 'captcha_success' && data.token) { + handleSuccessToken(data.token); + } + } else if (typeof data === 'string') { + const t = maybeTokenFromText(data); + if (t) handleSuccessToken(t); + } + } catch (e) {} + }); + + // URL / location polling — sometimes VK reflects token in hash on success. + let lastUrl = ''; + setInterval(function() { + if (location.href !== lastUrl) { + lastUrl = location.href; + try { + const u = new URL(location.href); + let t = u.searchParams.get('success_token'); + if (!t && u.hash) { + const params = new URLSearchParams(u.hash.replace(/^#/, '')); + t = params.get('success_token'); + } + if (t) handleSuccessToken(t); + } catch (e) {} + } + }, 250); + + // Terminal-state polling. VK renders some failure pages + // server-side as plain HTML — no XHR for our fetch/XHR hooks + // to catch — so the only way to detect them is to inspect + // the rendered DOM text. When found, fire 'terminal' so + // native dismisses the sheet instead of leaving the user + // staring at a dead end (the most common: "Attempt limit + // reached", which the user has to currently kill the whole + // app to escape). 
+ const terminalPatterns = [ + /attempt[\\s_]?limit[\\s_]?reached/i, + /превышен[оа]?\\s*колич/i, + /попыток.*исчерпан/i, + /please\\s*try\\s*again\\s*later/i, + /повторите\\s*попытку\\s*позже/i, + ]; + let terminalFired = false; + setInterval(function() { + if (solved || terminalFired || !document.body) return; + const txt = document.body.innerText || ''; + for (let i = 0; i < terminalPatterns.length; i++) { + if (terminalPatterns[i].test(txt)) { + terminalFired = true; + send({type:'terminal', reason: txt.slice(0, 200)}); + return; + } + } + }, 750); + + send({type:'status', text:'Loaded captcha helper'}); + })(); + """ +} diff --git a/TurnBridge/CapturedCaptchasView.swift b/TurnBridge/CapturedCaptchasView.swift new file mode 100644 index 0000000..e9c3ae3 --- /dev/null +++ b/TurnBridge/CapturedCaptchasView.swift @@ -0,0 +1,306 @@ +import SwiftUI + +/// Browser for the captcha trap directory. The network extension drops +/// a folder per FAILED captcha solve into the App Group container +/// (/captcha_trap/_ diff --git a/TurnBridge/TurnBridgeApp.swift b/TurnBridge/TurnBridgeApp.swift index 11455d9..56b7c01 100755 --- a/TurnBridge/TurnBridgeApp.swift +++ b/TurnBridge/TurnBridgeApp.swift @@ -7,14 +7,42 @@ import NetworkExtension @main struct TurnBridge: App { + @StateObject private var captchaManager = CaptchaManager.shared + var body: some Scene { WindowGroup { ContentView(app: self) + .onAppear { captchaManager.start() } + .sheet(item: Binding( + get: { captchaManager.pending.map(IdentifiedCaptcha.init) }, + set: { newValue in + if newValue == nil { + Task { await captchaManager.cancel(reason: "sheet dismissed") } + } + } + )) { identified in + CaptchaWebView(redirectUri: identified.request.redirectUri, + retryUrl: identified.request.retryUrl, + retryBody: identified.request.retryBody, + manager: captchaManager) + .interactiveDismissDisabled() + } } } + + private struct IdentifiedCaptcha: Identifiable { + let request: CaptchaIPC.PendingRequest + var id: String { 
request.requestId } + } - func turnOnTunnel(vkLink: String, peerAddr: String, listenAddr: String, nValue: Int, wgQuickConfig: String, completionHandler: @escaping (Bool) -> Void) { - SharedLogger.info("Connecting... peer=\(peerAddr), listen=\(listenAddr), n=\(nValue)") + func turnOnTunnel(vkLink: String, peerAddr: String, listenAddr: String, nValue: Int, useUDP: Bool, streamAggregation: Bool, wrapKey: String, wgQuickConfig: String, completionHandler: @escaping (Bool) -> Void) { + // Strip whitespace (including Unicode thin space U+2009 that + // sneaks in from web copy-paste). Field log 1.3.14 showed the + // proxy aborting at startup with `port "56010 " invalid` + // — the trailing thin space lived inside the saved profile. + let peerAddr = peerAddr.trimmingCharacters(in: .whitespacesAndNewlines) + let listenAddr = listenAddr.trimmingCharacters(in: .whitespacesAndNewlines) + SharedLogger.info("Connecting... peer=\(peerAddr), listen=\(listenAddr), n=\(nValue), udp=\(useUDP), streamAgg=\(streamAggregation)") NETunnelProviderManager.loadAllFromPreferences { tunnelManagersInSettings, error in if let error = error { @@ -29,7 +57,7 @@ struct TurnBridge: App { SharedLogger.debug("Using \(preExistingTunnelManager != nil ? "existing" : "new") tunnel manager") let protocolConfiguration = NETunnelProviderProtocol() - let currentAppBundleId = Bundle.main.bundleIdentifier ?? "com.netlab.TurnBridge" + let currentAppBundleId = Bundle.main.bundleIdentifier ?? "com.truvvor.turnbridge" protocolConfiguration.providerBundleIdentifier = "\(currentAppBundleId).network-extension" let cleanIP = peerAddr.components(separatedBy: ":").first ?? 
peerAddr @@ -40,7 +68,10 @@ struct TurnBridge: App { "vkLink": vkLink, "peerAddr": peerAddr, "listenAddr": listenAddr, - "nValue": nValue + "nValue": nValue, + "useUDP": useUDP, + "streamAggregation": streamAggregation, + "wrapKey": wrapKey ] let defaults = UserDefaults.standard @@ -48,12 +79,32 @@ struct TurnBridge: App { let excludeCellular = defaults.object(forKey: "excludeCellularServices") as? Bool ?? false let excludeLAN = defaults.object(forKey: "excludeLocalNetworks") as? Bool ?? true - protocolConfiguration.includeAllNetworks = true + // includeAllNetworks acts as a kill-switch — it installs + // the tunnel's default route BEFORE the tunnel is up, so + // outbound traffic in the Connecting phase has nowhere + // to go and the kernel returns "no route to host" for + // EVERY destination (including 1.1.1.1 / hardcoded VK + // IPs / DoH endpoints). + // + // That's fatal for both captcha modes: + // manual — the in-app WebView can't load id.vk.com + // auto — the in-extension captcha solver can't reach + // login.vk.com / api.vk.com / 1.1.1.1 + // + // So we turn the kill-switch off in both modes. The user + // loses fail-closed behaviour during a mid-session + // tunnel break, but in exchange the tunnel can actually + // come up. Real kill-switch parity would require + // changing includeAllNetworks DYNAMICALLY after WG + // handshake — iOS doesn't support that, the flag is + // saveToPreferences-time only. 
+ let manualCaptcha = ManualCaptchaSetting.isEnabled + protocolConfiguration.includeAllNetworks = false protocolConfiguration.excludeAPNs = excludeAPNs protocolConfiguration.excludeCellularServices = excludeCellular protocolConfiguration.excludeLocalNetworks = excludeLAN - SharedLogger.debug("Routing: LAN=\(excludeLAN), APNs=\(excludeAPNs), Cellular=\(excludeCellular)") + SharedLogger.debug("Routing: includeAll=false (manualCaptcha=\(manualCaptcha)), LAN=\(excludeLAN), APNs=\(excludeAPNs), Cellular=\(excludeCellular)") tunnelManager.protocolConfiguration = protocolConfiguration tunnelManager.isEnabled = true diff --git a/TurnBridge/VPNProfile.swift b/TurnBridge/VPNProfile.swift index 271c928..a561d44 100644 --- a/TurnBridge/VPNProfile.swift +++ b/TurnBridge/VPNProfile.swift @@ -7,15 +7,54 @@ struct VPNProfile: Codable, Identifiable, Equatable { var peerAddr: String var listenAddr: String var nValue: Int + /// Transport from client to TURN server. + /// true = UDP (faster, default; what upstream turnbridge has hardcoded) + /// false = TCP (more reliable over flaky cellular; survives short blips) + var useUDP: Bool + /// Enables the kiper292/vk-turn-proxy 17-byte Session-ID handshake + /// on every DTLS stream so the server-side aggregator can fuse the + /// N parallel TURN allocations into a single stable endpoint for + /// WireGuard. Must ONLY be enabled when the WG server is running a + /// compatible aggregator; otherwise the 17 bytes corrupt the very + /// first WG handshake and the tunnel never comes up. + var streamAggregation: Bool + /// 32-byte ChaCha20-Poly1305 key as 64 hex chars. Empty = disabled. + /// When set, each session wraps its DTLS-over-TURN payload to look + /// like an SRTP/Opus voice stream so VK's relay DPI can't fingerprint + /// us as non-call traffic. The matching server (vk-turn-proxy with + /// -wrap -wrap-key=) MUST have the same key configured. 
+ var wrapKey: String var wgQuickConfig: String - init(id: UUID = UUID(), name: String = "", vkLink: String = "", peerAddr: String = "", listenAddr: String = "127.0.0.1:9000", nValue: Int = 1, wgQuickConfig: String = "") { + init(id: UUID = UUID(), name: String = "", vkLink: String = "", peerAddr: String = "", listenAddr: String = "127.0.0.1:9000", nValue: Int = 1, useUDP: Bool = true, streamAggregation: Bool = false, wrapKey: String = "", wgQuickConfig: String = "") { self.id = id self.name = name self.vkLink = vkLink self.peerAddr = peerAddr self.listenAddr = listenAddr self.nValue = nValue + self.useUDP = useUDP + self.streamAggregation = streamAggregation + self.wrapKey = wrapKey self.wgQuickConfig = wgQuickConfig } + + // Backwards compatibility: older saved profiles in UserDefaults won't have useUDP / streamAggregation / wrapKey. + enum CodingKeys: String, CodingKey { + case id, name, vkLink, peerAddr, listenAddr, nValue, useUDP, streamAggregation, wrapKey, wgQuickConfig + } + + init(from decoder: Decoder) throws { + let c = try decoder.container(keyedBy: CodingKeys.self) + id = try c.decode(UUID.self, forKey: .id) + name = try c.decode(String.self, forKey: .name) + vkLink = try c.decode(String.self, forKey: .vkLink) + peerAddr = try c.decode(String.self, forKey: .peerAddr) + listenAddr = try c.decode(String.self, forKey: .listenAddr) + nValue = try c.decode(Int.self, forKey: .nValue) + useUDP = (try? c.decode(Bool.self, forKey: .useUDP)) ?? true + streamAggregation = (try? c.decode(Bool.self, forKey: .streamAggregation)) ?? false + wrapKey = (try? c.decode(String.self, forKey: .wrapKey)) ?? "" + wgQuickConfig = try c.decode(String.self, forKey: .wgQuickConfig) + } } diff --git a/captcha-service/.gitignore b/captcha-service/.gitignore new file mode 100644 index 0000000..2993ecf --- /dev/null +++ b/captcha-service/.gitignore @@ -0,0 +1,3 @@ +# `go build .` produces a binary named after the directory. 
+captcha-service +trap/ diff --git a/captcha-service/Dockerfile b/captcha-service/Dockerfile new file mode 100644 index 0000000..1722a49 --- /dev/null +++ b/captcha-service/Dockerfile @@ -0,0 +1,29 @@ +# Build stage +FROM golang:1.22-alpine AS build + +WORKDIR /src +COPY go.mod go.sum ./ +RUN go mod download +COPY . . +RUN CGO_ENABLED=0 GOOS=linux go build -ldflags='-s -w' -o /out/captcha-service . + +# Runtime stage +FROM alpine:3.20 + +RUN apk add --no-cache ca-certificates tzdata && \ + addgroup -S app && adduser -S app -G app && \ + mkdir -p /var/trap && chown app:app /var/trap + +USER app +WORKDIR /app +COPY --from=build /out/captcha-service /app/captcha-service + +ENV LISTEN_ADDR=:8080 +ENV CAPTCHA_TRAP_DIR=/var/trap + +EXPOSE 8080 + +HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \ + CMD wget -qO- http://127.0.0.1:8080/healthz || exit 1 + +ENTRYPOINT ["/app/captcha-service"] diff --git a/captcha-service/README.md b/captcha-service/README.md new file mode 100644 index 0000000..a178542 --- /dev/null +++ b/captcha-service/README.md @@ -0,0 +1,253 @@ +# captcha-service + +Server-side companion to the iOS TurnBridge NetworkExtension. Hosts +the VK captcha + identity-registration pipeline outside the +50-100 MB iOS sandbox so the heavy work (slider rendering, PoW +solving, the 8-RT VK auth flow) runs on a real machine with proper +memory and proper DNS. + +The server does **not** open TURN allocations on behalf of the +client — TURN allocations are bound by RFC 5766 to the 5-tuple that +issued them and can't be handed off. What the server does is the +captcha + auth flow, then returns the `{user, pass, addr}` triple +from VK's `vchat.joinConversationByLink` response. The iOS client +uses those credentials to issue its own `Allocate` from the +phone's source IP, within VK's ~50 s rotation window. + +## Why this exists + +Per-IP captcha rate-limit (`ERROR_LIMIT`) caps the iOS client at +roughly 8 unique identities per `connect`. 
The other ceiling — the +49-candidate slider ranker allocating ~70 MB transiently — was +already softened in F5 but still constrains how many solves can run +in parallel. Moving captcha solving to a server with its own IP +budget and gigabytes of RAM lifts both ceilings. + +Each server instance gives the fleet **one** fresh per-IP budget +(`directSaturated` cools down 60 s after `ERROR_LIMIT`). Run several +behind a round-robin load balancer or rotating residential proxies +for multiplicative throughput. + +## Running locally + +```sh +go run . # requires API_KEY env +``` + +```sh +API_KEY=$(openssl rand -hex 16) go run . +``` + +The service listens on `:8080` by default. Healthcheck at +`GET /healthz`, stats snapshot at `GET /stats` (unauthenticated). + +## API + +`POST /cred` — solve one captcha + register one VK identity, return +ready-to-use TURN creds. + +```http +POST /cred HTTP/1.1 +Authorization: Bearer <API_KEY> +Content-Type: application/json + +{"link": "<VK call link>"} +``` + +Successful response (typical 2-10 s): + +```json +{ + "user": "1715792025:guest", + "pass": "abc123…", + "addr": "95.163.34.151:3478", + "expires_at": "2026-05-15T07:45:00Z" +} +``` + +Error responses: +- `400` — missing/invalid `link`. +- `401` — missing or wrong bearer token. +- `429` — egress is in the 60 s `ERROR_LIMIT` cooldown. Clients + should back off for the number of seconds given in the `Retry-After` header. +- `502` — captcha pipeline failed (downstream VK returned + something we can't parse, or captcha solving exhausted retries). +- `503` — solve queue full (more than `maxConcurrentCaptchaSolves` + in flight; clients should back off and retry). + +## Environment + +| Var | Default | Purpose | +| --- | --- | --- | +| `LISTEN_ADDR` | `:8080` | bind address | +| `API_KEY` | *(required)* | bearer token clients must send | +| `CAPTCHA_TRAP_DIR` | `./trap` | where to write debug artefacts for failed solves (slider images, etc.)
| +| `WARP_INTERFACE` | *(unset)* | name of a pre-existing WireGuard interface (typically Cloudflare WARP) to pin VK-bound HTTP sockets to. See **WARP egress** below | + +## WARP egress + +In mid-2026 VK rolled out per-source-IP traffic-shaping on captcha +endpoints. Hosts that solve many captchas in a short window get +throttled or temp-blacklisted by source IP. Routing the captcha +service's outbound HTTP through Cloudflare's WARP (free WireGuard +tunnel) moves egress to Cloudflare's IP space, which VK has not been +observed to throttle (so far). + +The captcha-service does NOT manage the WireGuard interface itself — +key generation, registration with Cloudflare, and `wg-quick` +lifecycle stay at the OS layer. We only consume an interface name and +bind sockets to it via `SO_BINDTODEVICE`. + +### Operator setup + +1. **Get WARP credentials** with `wgcf` (https://github.com/ViRb3/wgcf): + + ```sh + wgcf register + wgcf generate + mv wgcf-profile.conf /etc/wireguard/wgcf.conf + ``` + +2. **Edit `/etc/wireguard/wgcf.conf`** and add `Table = off` to the + `[Interface]` section so wg-quick doesn't install default routes + (we only want VK traffic on WARP, not all egress): + + ```ini + [Interface] + PrivateKey = ... + Address = 172.16.0.2/32, 2606:4700:110:.../128 + Table = off # <-- add this + MTU = 1280 + ``` + +3. **Bring the interface up**: + + ```sh + sudo wg-quick up /etc/wireguard/wgcf.conf + ip link show wgcf # confirm it exists + ``` + +4. **Run captcha-service with `WARP_INTERFACE=wgcf`** and grant it + `CAP_NET_RAW` so non-root processes can call `SO_BINDTODEVICE`: + + ```sh + docker run -d \ + -p 8080:8080 \ + -e API_KEY=$(openssl rand -hex 32) \ + -e WARP_INTERFACE=wgcf \ + --cap-add=NET_RAW \ + --network=host \ + turnbridge/captcha-service + ``` + + `--network=host` is required so the container shares the host's + network namespace and can see the `wgcf` interface. + +5. 
**Verify**: + + ```sh + curl -s http://localhost:8080/stats | jq .warp + # "on:wgcf" + ``` + + Then trigger a `/cred` call and check the WARP interface counters + went up (`wg show wgcf transfer`). + +### Falling back + +Unset `WARP_INTERFACE` (or simply don't pass it) and outbound goes via +the host's default route again. No code change, no restart of the +WireGuard interface — captcha-service just doesn't pin sockets. + +## Deployment (Docker) + +```sh +docker build -t turnbridge/captcha-service . +docker run -d \ + -p 8080:8080 \ + -e API_KEY=$(openssl rand -hex 16) \ + -v $(pwd)/trap:/var/trap \ + --name captcha-service \ + turnbridge/captcha-service +``` + +The container uses a non-root `app` user and exposes a `wget`-based +healthcheck on `/healthz`. + +## Concurrency + +`maxConcurrentCaptchaSolves = 5` matches the iOS client's pacing. +VK trips `ERROR_LIMIT` more aggressively when more than ~5-6 solves +land on the same source IP in the same 60 s window. Scale capacity +by adding more peers on different IPs (see Cluster mode below), +not by raising this limit on one instance. + +## Cluster mode (V2) + +Every binary is symmetric. The peer the client hits acts as +**master** for that request: it round-robins through the configured +peer list (including itself), forwards to `/internal/cred` when +round-robin lands on a different peer, and falls through to the +next peer on `429`. Single-node mode (no `PEERS`) is the default +and matches V1 behaviour. + +### Deployment + +Run one captcha-service per IP. Each binary needs to know all +peers in the fleet. 
Example for 3 nodes on different VPS: + +```sh +# Node A (77.90.8.199) +docker run -d -p 8080:8080 \ + -e API_KEY=$SHARED_KEY \ + -e SELF_URL=http://77.90.8.199:8080 \ + -e PEERS='http://77.90.8.199:8080|'$SHARED_KEY',http://77.90.8.200:8080|'$SHARED_KEY',http://77.90.8.201:8080|'$SHARED_KEY \ + --name cs turnbridge/captcha-service + +# Node B (77.90.8.200) — same PEERS list, different SELF_URL +docker run -d -p 8080:8080 \ + -e API_KEY=$SHARED_KEY \ + -e SELF_URL=http://77.90.8.200:8080 \ + -e PEERS='http://77.90.8.199:8080|'$SHARED_KEY',http://77.90.8.200:8080|'$SHARED_KEY',http://77.90.8.201:8080|'$SHARED_KEY \ + --name cs turnbridge/captcha-service + +# Node C (77.90.8.201) — analogous +``` + +`PEERS` format: comma-separated `URL|API_KEY` entries. `SELF_URL` +must exactly match one of the `PEERS` URLs so the binary recognises +itself and bypasses HTTP when round-robin picks it. Different +peers can use different API keys (each entry carries its own); the +common pattern is one shared key across the whole fleet. + +### Saturation propagation + +Each peer tracks its own VK rate-limit cooldown locally +(`directSaturated()` flips on `ERROR_LIMIT` and auto-clears after +60 s). When a master forwards to a peer and the peer returns `429` +or sets `X-Captcha-Self-Saturated: 1` on its response, the master +records "peer X cool down until now + 60 s" and skips X in +subsequent rounds. No gossip / heartbeats — saturation is learned +passively from response headers. + +`GET /stats` includes the master's view of each peer's availability. + +### Client config + +The client always talks to ONE master URL. To survive a single-node +outage, either: + +- Front the cluster with a load balancer / DNS round-robin (single + client URL → many backend nodes). +- Or accept that "master" is whichever node the client points at + and live-edit the URL when needed. + +### HTTP path summary + +- `POST /cred` — public, client-facing. Master logic; forwards. 
+- `POST /internal/cred` — peer-only. Same auth as `/cred` but + never forwards. Sets `X-Captcha-Self-Saturated: 1` when this + peer is in its 60 s cooldown. +- `GET /stats` — counters + peer-view snapshot. +- `GET /healthz` — for Docker HEALTHCHECK. + diff --git a/captcha-service/captcha_client.go b/captcha-service/captcha_client.go new file mode 100644 index 0000000..c318a7e --- /dev/null +++ b/captcha-service/captcha_client.go @@ -0,0 +1,73 @@ +// captcha_client.go — server-side equivalent of the iOS-side +// captcha_client.go. Builds a Safari iOS 18.0 TLS+HTTP/2 impersonator +// for VK API calls. See the iOS file for the full rationale of why +// we use bogdanfinn/tls-client + fhttp instead of net/http. +// +// Server-side specific: the underlying net.Dialer carries the WARP +// SO_BINDTODEVICE control hook so outbound captcha traffic egresses +// via Cloudflare's WARP edge when WARP_INTERFACE is configured. See +// warp_dialer.go. + +package main + +import ( + "context" + + fhttp "github.com/bogdanfinn/fhttp" + "github.com/bogdanfinn/fhttp/cookiejar" + tlsclient "github.com/bogdanfinn/tls-client" + tlsprofiles "github.com/bogdanfinn/tls-client/profiles" +) + +// Header / pseudo-header order Safari iOS 18 emits. Order is a +// classifier signal even though HTTP/2 is order-insensitive in the +// spec. Mirror of the iOS-side list — keep in sync. 
var safariHeaderOrder = []string{
	"host",
	"accept",
	"sec-fetch-site",
	"accept-encoding",
	"sec-fetch-mode",
	"user-agent",
	"accept-language",
	"sec-fetch-dest",
	"referer",
	"priority",
	"cookie",
	"content-type",
	"content-length",
	"origin",
}

// safariPHeaderOrder is the HTTP/2 pseudo-header order installed on
// outgoing requests together with safariHeaderOrder (see
// applySafariHeaderOrder).
var safariPHeaderOrder = []string{
	":method",
	":scheme",
	":path",
	":authority",
}

// newTLSCaptchaClient builds the tls-client HTTP client used for VK
// captcha calls: 20 s timeout, the Safari_IOS_18_0 client profile, a
// fresh in-memory cookie jar, HTTP/3 disabled, and the dialer returned
// by newWARPNetDialer. It returns an error if the cookie jar or the
// client itself cannot be constructed.
func newTLSCaptchaClient() (tlsclient.HttpClient, error) {
	jar, err := cookiejar.New(nil)
	if err != nil {
		return nil, err
	}
	opts := []tlsclient.HttpClientOption{
		tlsclient.WithTimeoutSeconds(20),
		tlsclient.WithClientProfile(tlsprofiles.Safari_IOS_18_0),
		tlsclient.WithCookieJar(jar),
		tlsclient.WithDisableHttp3(),
		// WARP-pinned dialer if WARP_INTERFACE is set; otherwise a
		// vanilla net.Dialer with a no-op control hook.
		tlsclient.WithDialer(newWARPNetDialer()),
	}
	return tlsclient.NewHttpClient(tlsclient.NewNoopLogger(), opts...)
}

// applySafariHeaderOrder stores the Safari header and pseudo-header
// orderings on req under fhttp's order keys. It overwrites any ordering
// already present on the request.
func applySafariHeaderOrder(req *fhttp.Request) {
	req.Header[fhttp.HeaderOrderKey] = safariHeaderOrder
	req.Header[fhttp.PHeaderOrderKey] = safariPHeaderOrder
}

// withCaptchaCtx returns a copy of req bound to ctx (thin wrapper over
// Request.WithContext).
func withCaptchaCtx(ctx context.Context, req *fhttp.Request) *fhttp.Request {
	return req.WithContext(ctx)
}
diff --git a/captcha-service/captcha_debug_info.go b/captcha-service/captcha_debug_info.go new file mode 100644 index 0000000..2d80c3b --- /dev/null +++ b/captcha-service/captcha_debug_info.go @@ -0,0 +1,70 @@
// captcha_debug_info.go — mirror of the iOS-side file. See that file
// for the full rationale.
+ +package main + +import ( + "context" + "fmt" + "io" + "regexp" + "sync" + + fhttp "github.com/bogdanfinn/fhttp" + tlsclient "github.com/bogdanfinn/tls-client" +) + +var debugInfoCache sync.Map + +var scriptURLRe = regexp.MustCompile(`]+src="([^"]+not_robot_captcha\.js[^"]*)"`) + +var debugInfoRe = regexp.MustCompile(`debug_info\s*:\s*(?:[^"]*\|\|\s*)?"([a-fA-F0-9]{64})"`) + +func extractScriptURL(html string) string { + if m := scriptURLRe.FindStringSubmatch(html); len(m) >= 2 { + return m[1] + } + return "" +} + +func fetchDebugInfo(ctx context.Context, client tlsclient.HttpClient, profile Profile, scriptURL string) (string, error) { + if scriptURL == "" { + return "", fmt.Errorf("empty scriptURL") + } + if cached, ok := debugInfoCache.Load(scriptURL); ok { + return cached.(string), nil + } + + req, err := fhttp.NewRequest("GET", scriptURL, nil) + if err != nil { + return "", err + } + req = withCaptchaCtx(ctx, req) + req.Header.Set("User-Agent", profile.UserAgent) + req.Header.Set("Accept", "text/javascript,application/javascript,*/*;q=0.1") + req.Header.Set("Accept-Language", "en-US,en;q=0.9") + req.Header.Set("Referer", "https://id.vk.com/") + req.Header.Set("Sec-Fetch-Site", "same-site") + req.Header.Set("Sec-Fetch-Mode", "no-cors") + req.Header.Set("Sec-Fetch-Dest", "script") + applySafariHeaderOrder(req) + + resp, err := client.Do(req) + if err != nil { + return "", fmt.Errorf("fetch script: %w", err) + } + defer resp.Body.Close() + + body, err := io.ReadAll(resp.Body) + if err != nil { + return "", fmt.Errorf("read script: %w", err) + } + + m := debugInfoRe.FindSubmatch(body) + if len(m) < 2 { + return "", fmt.Errorf("debug_info constant not found in %s", scriptURL) + } + di := string(m[1]) + debugInfoCache.Store(scriptURL, di) + return di, nil +} diff --git a/captcha-service/captcha_slider.go b/captcha-service/captcha_slider.go new file mode 100644 index 0000000..beef098 --- /dev/null +++ b/captcha-service/captcha_slider.go @@ -0,0 +1,651 @@ +package 
main

import (
	"bytes"
	"encoding/base64"
	"encoding/json"
	"fmt"
	"image"
	"image/color"
	_ "image/jpeg"
	"log"
	neturl "net/url"
	"sort"
	"strconv"
	"strings"
	"time"
)

const (
	// sliderCaptchaType is the captcha_settings "type" value (or map
	// key) that identifies the slider captcha.
	sliderCaptchaType = "slider"
	// defaultSliderAttempts is used when the steps payload carries no
	// positive attempts count.
	defaultSliderAttempts = 4
)

// sliderRankSlot bounds the parallelism of the scoring step in
// rankSliderCandidates. Before F5 each ranking materialised the full
// rearranged image per candidate (49 × 600×600 × 4 = ~70 MB transient
// per solver), so cap-2 was a hard memory ceiling. F5 scores directly
// on the source image without materialising swaps — peak transient
// drops to a few KB — so the slot now exists only to bound CPU when
// many slider captchas land at once. Matches maxConcurrentCaptchaSolves
// so the captcha pipeline isn't artificially throttled below its own
// concurrency cap.
var sliderRankSlot = make(chan struct{}, maxConcurrentCaptchaSolves)

// vkReqFunc is the type for the VK API request helper from callCaptchaNotRobotAPI.
type vkReqFunc func(method, postData string) (map[string]interface{}, error)

// sliderCaptchaContent is the parsed form of a captchaNotRobot.getContent
// response.
type sliderCaptchaContent struct {
	Image    image.Image
	GridW    int   // tile columns
	GridH    int   // tile rows
	Steps    []int // swap pairs
	Attempts int   // max submit attempts
}

// sliderCandidate is one possible slider position together with its
// seam-energy score.
type sliderCandidate struct {
	Index       int   // 1-based candidate position (number of swap pairs applied)
	ActiveSteps []int // flattened swap pairs applied for this candidate
	Score       int64 // seam energy; lower is a better match
}

// solveSliderCaptcha attempts to solve a VK slider captcha automatically.
// It fetches the scrambled image via captchaNotRobot.getContent, analyzes
// tile border continuity to find the correct permutation, and submits the answer.
//
// vkReq performs the API calls ("captchaNotRobot.getContent" and
// "captchaNotRobot.check"); baseParams is the common prefix of every
// POST body. browserFp, hash and debugInfo are interpolated into the
// check request verbatim. settingsResp is the settings API response
// the slider captcha_settings are extracted from, and isTunnel is
// forwarded to markCaptchaSaturated.
//
// On success it returns the success_token from the check response.
// Every return path commits the captcha trap with a reason string
// (including the solved case), so each solve leaves an artefact.
func solveSliderCaptcha(
	vkReq vkReqFunc,
	baseParams string,
	browserFp string,
	hash string,
	debugInfo string,
	settingsResp map[string]interface{},
	isTunnel bool,
) (string, error) {
	// Extract slider settings from the settings response
	sliderSettings := extractSliderSettings(settingsResp)

	log.Printf("slider: fetching captcha content (settings=%q)", sliderSettings)

	// Open a captcha trap. Every artefact we collect during the solve
	// is buffered in memory and written out by an explicit Commit on
	// each return path below. The deferred Discard is the safety net —
	// explicit Commit calls run first, and Commit/Discard are
	// idempotent.
	trap := newCaptchaTrap("slider")
	defer trap.Discard()
	trap.Note("settings_raw=%q", sliderSettings)

	// Get scrambled image and swap instructions
	getContentData := baseParams
	if sliderSettings != "" {
		getContentData += "&captcha_settings=" + neturl.QueryEscape(sliderSettings)
	}

	resp, err := vkReq("captchaNotRobot.getContent", getContentData)
	if err != nil {
		trap.Note("getContent transport error: %v", err)
		trap.Commit("getContent_transport_err")
		return "", fmt.Errorf("slider getContent: %w", err)
	}

	// Save the raw getContent response and the image bytes as soon as
	// we have them, BEFORE parsing — that way a new captcha variant
	// that breaks parseSliderContent still leaves us a self-contained
	// artefact to inspect.
	if rawJSON, jerr := json.MarshalIndent(resp, "", " "); jerr == nil {
		trap.Save("getContent_response.json", rawJSON)
	}
	if respMap, ok := resp["response"].(map[string]interface{}); ok {
		if imgStr, ok := respMap["image"].(string); ok && imgStr != "" {
			if rawBytes, derr := base64.StdEncoding.DecodeString(imgStr); derr == nil {
				// File extension follows the response's "extension"
				// field; "bin" when it is absent.
				ext := "bin"
				if e, ok := respMap["extension"].(string); ok && e != "" {
					ext = strings.ToLower(e)
				}
				trap.Save("image."+ext, rawBytes)
			}
		}
	}

	content, err := parseSliderContent(resp)
	if err != nil {
		// status:ERROR / status:ERROR_LIMIT from slider getContent
		// is VK rate-limiting us at the slider gate — count as a
		// saturation hit so a high-N run doesn't keep spawning more
		// sessions that will all hit the same wall. The fail streak
		// resets on the next success.
		// NOTE(review): this branch also runs for non-rate-limit parse
		// failures (unsupported image format, missing steps, ...), which
		// are marked as saturation too — confirm that is intended.
		markCaptchaSaturated(isTunnel)
		trap.Note("parseSliderContent failed: %v", err)
		trap.Commit("unparseable_response")
		return "", fmt.Errorf("slider parse: %w", err)
	}
	trap.Note("parsed grid=%dx%d swaps=%d attempts=%d",
		content.GridW, content.GridH, len(content.Steps)/2, content.Attempts)

	log.Printf("slider: image=%dx%d grid=%dx%d steps=%d attempts=%d",
		content.Image.Bounds().Dx(), content.Image.Bounds().Dy(),
		content.GridW, content.GridH, len(content.Steps)/2, content.Attempts)

	// Rank candidate positions by pixel border continuity. The scoring
	// step is gated by sliderRankSlot so that many simultaneous solves
	// cannot all rank at once (the slot started out as a memory guard
	// on the iOS side; see the comment on sliderRankSlot). Plain
	// blocking send is fine — each ranking finishes in ~100 ms, so a
	// stuck sender waits at most that long for a slot to free.
	sliderRankSlot <- struct{}{}
	candidates, err := rankSliderCandidates(content.Image, content.GridW, content.GridH, content.Steps)
	<-sliderRankSlot
	if err != nil {
		trap.Note("rank failed: %v", err)
		trap.Commit("rank_failed")
		return "", fmt.Errorf("slider rank: %w", err)
	}

	// Never submit more guesses than VK allows or than we have.
	maxTries := content.Attempts
	if maxTries > len(candidates) {
		maxTries = len(candidates)
	}

	log.Printf("slider: ranked %d positions, trying top %d", len(candidates), maxTries)

	// Try each candidate
	for i := 0; i < maxTries; i++ {
		c := candidates[i]
		log.Printf("slider: guess %d/%d position=%d score=%d", i+1, maxTries, c.Index, c.Score)

		answer, err := encodeSliderAnswer(c.ActiveSteps)
		if err != nil {
			trap.Note("encodeSliderAnswer failed: %v", err)
			trap.Commit("encode_answer_err")
			return "", err
		}

		// Generate slider cursor (simulates drag from left to position)
		cursor := generateSliderCursor(c.Index, len(candidates))

		// NOTE(review): browserFp, hash and debugInfo are inserted
		// without QueryEscape — presumably already URL-safe; confirm
		// against the caller.
		checkData := baseParams + fmt.Sprintf(
			"&accelerometer=%s&gyroscope=%s&motion=%s&cursor=%s&taps=%s&connectionRtt=%s&connectionDownlink=%s"+
				"&browser_fp=%s&hash=%s&answer=%s&debug_info=%s",
			neturl.QueryEscape("[]"), neturl.QueryEscape("[]"), neturl.QueryEscape("[]"),
			neturl.QueryEscape(cursor),
			neturl.QueryEscape("[]"), neturl.QueryEscape("[]"), neturl.QueryEscape("[]"),
			browserFp, hash, neturl.QueryEscape(answer),
			debugInfo,
		)

		checkResp, err := vkReq("captchaNotRobot.check", checkData)
		if err != nil {
			trap.Note("attempt %d/%d transport err: %v", i+1, maxTries, err)
			trap.Commit("check_transport_err")
			return "", fmt.Errorf("slider check: %w", err)
		}

		respObj, ok := checkResp["response"].(map[string]interface{})
		if !ok {
			trap.Note("attempt %d/%d invalid response: %v", i+1, maxTries, checkResp)
			trap.Commit("check_invalid_response")
			return "", fmt.Errorf("slider check: invalid response")
		}

		status, _ := respObj["status"].(string)
		trap.Note("attempt %d/%d position=%d score=%d → status=%s",
			i+1, maxTries, c.Index, c.Score, status)
		switch status {
		case "OK":
			successToken, _ := respObj["success_token"].(string)
			if successToken == "" {
				trap.Note("OK but success_token missing in: %v", respObj)
				trap.Commit("ok_without_token")
				return "", fmt.Errorf("slider: success_token not found")
			}
			log.Printf("slider: solved! position=%d (attempt %d/%d)", c.Index, i+1, maxTries)
			// Commit solved captchas too so the user can actually see
			// what our solver is processing. The reason field marks
			// them "solved_ok"; unsolved entries use other reasons.
			// Without this commit a healthy run produces an empty trap
			// dir, which looks indistinguishable from "the trap isn't
			// wired correctly".
			trap.Note("SOLVED at attempt %d/%d, position=%d", i+1, maxTries, c.Index)
			trap.Commit("solved_ok")
			return successToken, nil
		case "ERROR_LIMIT":
			// Rate-limited: stop guessing and flag saturation.
			markCaptchaSaturated(isTunnel)
			trap.Commit("error_limit")
			return "", fmt.Errorf("slider: ERROR_LIMIT")
		default:
			// Wrong guess: pause briefly before the next candidate.
			// NOTE(review): the pause also runs after the final guess.
			log.Printf("slider: position=%d rejected (status=%s)", c.Index, status)
			time.Sleep(500 * time.Millisecond)
		}
	}

	trap.Commit("all_guesses_rejected")
	return "", fmt.Errorf("slider: all %d guesses rejected", maxTries)
}

// extractSliderSettings extracts slider captcha_settings from settings API response.
+func extractSliderSettings(settingsResp map[string]interface{}) string { + if settingsResp == nil { + return "" + } + respObj, ok := settingsResp["response"].(map[string]interface{}) + if !ok { + return "" + } + + // Try to find captcha_settings for slider type + raw := respObj["captcha_settings"] + if raw == nil { + return "" + } + + // captcha_settings can be array or map + switch v := raw.(type) { + case []interface{}: + for _, item := range v { + m, ok := item.(map[string]interface{}) + if !ok { + continue + } + t, _ := m["type"].(string) + if t == sliderCaptchaType { + return normalizeSettings(m["settings"]) + } + } + case map[string]interface{}: + if s, ok := v[sliderCaptchaType]; ok { + return normalizeSettings(s) + } + case string: + // Try JSON parse + trimmed := strings.TrimSpace(v) + if trimmed == "" { + return "" + } + var items []interface{} + if err := json.Unmarshal([]byte(trimmed), &items); err == nil { + return extractSliderSettings(map[string]interface{}{ + "response": map[string]interface{}{"captcha_settings": items}, + }) + } + } + return "" +} + +func normalizeSettings(raw interface{}) string { + switch v := raw.(type) { + case nil: + return "" + case string: + return v + default: + data, err := json.Marshal(v) + if err != nil { + return "" + } + return string(data) + } +} + +// parseSliderContent parses the getContent API response. 
+func parseSliderContent(resp map[string]interface{}) (*sliderCaptchaContent, error) { + respObj, ok := resp["response"].(map[string]interface{}) + if !ok { + return nil, fmt.Errorf("invalid response: %v", resp) + } + + status, _ := respObj["status"].(string) + if status != "OK" { + return nil, fmt.Errorf("status: %s", status) + } + + ext, _ := respObj["extension"].(string) + ext = strings.ToLower(ext) + if ext != "jpeg" && ext != "jpg" { + return nil, fmt.Errorf("unsupported image format: %s", ext) + } + + rawImage, _ := respObj["image"].(string) + if rawImage == "" { + return nil, fmt.Errorf("image missing") + } + + rawSteps, ok := respObj["steps"].([]interface{}) + if !ok { + return nil, fmt.Errorf("steps missing") + } + + steps, err := parseIntSlice(rawSteps) + if err != nil { + return nil, err + } + + gridW, gridH, swaps, attempts, err := parseSliderSteps(steps) + if err != nil { + return nil, err + } + + img, err := decodeSliderImage(rawImage) + if err != nil { + return nil, err + } + + return &sliderCaptchaContent{ + Image: img, + GridW: gridW, + GridH: gridH, + Steps: swaps, + Attempts: attempts, + }, nil +} + +func parseIntSlice(raw []interface{}) ([]int, error) { + values := make([]int, 0, len(raw)) + for _, item := range raw { + switch v := item.(type) { + case float64: + values = append(values, int(v)) + case int: + values = append(values, v) + case string: + n, err := strconv.Atoi(strings.TrimSpace(v)) + if err != nil { + return nil, fmt.Errorf("invalid numeric: %v", item) + } + values = append(values, n) + default: + return nil, fmt.Errorf("invalid numeric: %v", item) + } + } + return values, nil +} + +// parseSliderSteps decodes VK's `steps` array. Two formats observed: +// +// square: [size, swap_pairs..., attempts?] // tile grid = size×size +// rect: [width, height, swap_pairs..., attempts?] // tile grid = width×height +// +// VK started serving the rectangular variant (3×7 word-strip layouts: +// ШАПОЧКИ / КОРРУПЦИЯ / СКЕПТИЦИЗМ etc.) 
where the old square parser +// produces tile-counts that don't contain the swap indices and the +// renderer scrambles the image instead of unscrambling. We try +// square first (backward-compatible: pre-existing 3×3, 4×4, etc. +// captchas keep parsing the same way), then rect, then bail with the +// raw payload logged so a third format can be added without +// guesswork. +func parseSliderSteps(steps []int) (gridW int, gridH int, swaps []int, attempts int, err error) { + if len(steps) < 3 { + return 0, 0, nil, 0, fmt.Errorf("steps too short: %d", len(steps)) + } + log.Printf("slider: raw steps payload: %v", steps) + + if w, h, sw, at, ok := decodeSliderStepsSquare(steps); ok { + log.Printf("slider: parsed as %dx%d (square format), %d candidates, %d attempts", + w, h, len(sw)/2, at) + return w, h, sw, at, nil + } + if w, h, sw, at, ok := decodeSliderStepsRect(steps); ok { + log.Printf("slider: parsed as %dx%d (rect format), %d candidates, %d attempts", + w, h, len(sw)/2, at) + return w, h, sw, at, nil + } + return 0, 0, nil, 0, fmt.Errorf("unrecognised steps payload %v", steps) +} + +func decodeSliderStepsSquare(steps []int) (w, h int, swaps []int, attempts int, ok bool) { + size := steps[0] + if size <= 0 { + return 0, 0, nil, 0, false + } + tileCount := size * size + rest := append([]int(nil), steps[1:]...) 
+ attempts = defaultSliderAttempts + if len(rest)%2 != 0 { + attempts = rest[len(rest)-1] + rest = rest[:len(rest)-1] + } + if attempts <= 0 { + attempts = defaultSliderAttempts + } + if len(rest) == 0 || len(rest)%2 != 0 { + return 0, 0, nil, 0, false + } + for _, v := range rest { + if v < 0 || v >= tileCount { + return 0, 0, nil, 0, false + } + } + return size, size, rest, attempts, true +} + +func decodeSliderStepsRect(steps []int) (w, h int, swaps []int, attempts int, ok bool) { + if len(steps) < 4 { + return 0, 0, nil, 0, false + } + width, height := steps[0], steps[1] + if width <= 0 || height <= 0 { + return 0, 0, nil, 0, false + } + tileCount := width * height + rest := append([]int(nil), steps[2:]...) + attempts = defaultSliderAttempts + if len(rest)%2 != 0 { + attempts = rest[len(rest)-1] + rest = rest[:len(rest)-1] + } + if attempts <= 0 { + attempts = defaultSliderAttempts + } + if len(rest) == 0 || len(rest)%2 != 0 { + return 0, 0, nil, 0, false + } + for _, v := range rest { + if v < 0 || v >= tileCount { + return 0, 0, nil, 0, false + } + } + return width, height, rest, attempts, true +} + +func decodeSliderImage(rawImage string) (image.Image, error) { + decoded, err := base64.StdEncoding.DecodeString(rawImage) + if err != nil { + return nil, fmt.Errorf("base64 decode: %w", err) + } + img, _, err := image.Decode(bytes.NewReader(decoded)) + if err != nil { + return nil, fmt.Errorf("image decode: %w", err) + } + return img, nil +} + +func encodeSliderAnswer(activeSteps []int) (string, error) { + payload := struct { + Value []int `json:"value"` + }{Value: activeSteps} + data, err := json.Marshal(payload) + if err != nil { + return "", err + } + return base64.StdEncoding.EncodeToString(data), nil +} + +// rankSliderCandidates analyzes each candidate permutation and ranks by +// pixel border continuity (lower score = better match = more likely correct). 
//
// Candidate idx (1-based) applies the first idx swap pairs of swaps.
// The result is sorted by ascending score, ties broken by ascending
// index. It returns an error when swaps holds no complete pair or a
// swap index falls outside the grid.
func rankSliderCandidates(img image.Image, gridW, gridH int, swaps []int) ([]sliderCandidate, error) {
	candidateCount := len(swaps) / 2
	if candidateCount == 0 {
		return nil, fmt.Errorf("no candidates")
	}

	candidates := make([]sliderCandidate, 0, candidateCount)
	for idx := 1; idx <= candidateCount; idx++ {
		activeSteps := buildSliderActiveSteps(swaps, idx)
		mapping, err := buildSliderTileMapping(gridW, gridH, activeSteps)
		if err != nil {
			return nil, err
		}

		// F5: score directly on the source image without rendering a
		// full RGBA buffer per candidate. The seam-energy metric only
		// needs pixel values at adjacent-tile boundaries, which we
		// can look up via the mapping (destination position d's
		// pixels come from source tile mapping[d]). Drops slider rank
		// peak memory from ~140 MB to a few KB, lets the slot cap be
		// raised back to maxConcurrentCaptchaSolves.
		// NOTE(review): other comments in this file quote ~70 MB for
		// the same saving — reconcile the figures.
		score := scoreSliderMapping(img, gridW, gridH, mapping)
		candidates = append(candidates, sliderCandidate{
			Index:       idx,
			ActiveSteps: activeSteps,
			Score:       score,
		})
	}

	sort.SliceStable(candidates, func(i, j int) bool {
		if candidates[i].Score == candidates[j].Score {
			return candidates[i].Index < candidates[j].Index
		}
		return candidates[i].Score < candidates[j].Score
	})

	return candidates, nil
}

// scoreSliderMapping computes a seam-energy score for a candidate
// tile mapping without materialising the rearranged image. For every
// pair of adjacent destination positions it looks up the source
// tiles via the mapping and sums pixel differences across the shared
// border directly on `img`. The correct (=originally-arranged)
// mapping produces the lowest total energy; rankSliderCandidates
// sorts ascending and picks the top one.
//
// Equivalent to first rendering the rearranged image and then
// scoring across its inter-tile borders, except this skips a
// 600×600×4-byte allocation per candidate and slashes ranking peak
// memory by ~70 MB.
//
// mapping must have gridW*gridH entries (destination position ->
// source tile index).
func scoreSliderMapping(img image.Image, gridW, gridH int, mapping []int) int64 {
	bounds := img.Bounds()
	var score int64

	// Horizontal seams: dest left tile's right edge vs dest right
	// tile's left edge. Source tile rects for each give the pixels.
	for row := 0; row < gridH; row++ {
		for col := 0; col < gridW-1; col++ {
			srcLeft := sliderTileRect(bounds, gridW, gridH, mapping[row*gridW+col])
			srcRight := sliderTileRect(bounds, gridW, gridH, mapping[row*gridW+col+1])
			// Tiles can differ by a pixel when the image size is not a
			// multiple of the grid; compare only the shared extent.
			height := srcLeft.Dy()
			if h := srcRight.Dy(); h < height {
				height = h
			}
			for y := 0; y < height; y++ {
				score += pixelDiff(
					img.At(srcLeft.Max.X-1, srcLeft.Min.Y+y),
					img.At(srcRight.Min.X, srcRight.Min.Y+y),
				)
			}
		}
	}

	// Vertical seams: dest top tile's bottom edge vs dest bottom
	// tile's top edge.
	for row := 0; row < gridH-1; row++ {
		for col := 0; col < gridW; col++ {
			srcTop := sliderTileRect(bounds, gridW, gridH, mapping[row*gridW+col])
			srcBottom := sliderTileRect(bounds, gridW, gridH, mapping[(row+1)*gridW+col])
			width := srcTop.Dx()
			if w := srcBottom.Dx(); w < width {
				width = w
			}
			for x := 0; x < width; x++ {
				score += pixelDiff(
					img.At(srcTop.Min.X+x, srcTop.Max.Y-1),
					img.At(srcBottom.Min.X+x, srcBottom.Min.Y),
				)
			}
		}
	}

	return score
}

// buildSliderActiveSteps returns a copy of the first candidateIndex
// swap pairs of swaps (clamped to the slice length). A non-positive
// index yields an empty, non-nil slice.
func buildSliderActiveSteps(swaps []int, candidateIndex int) []int {
	if candidateIndex <= 0 {
		return []int{}
	}
	end := candidateIndex * 2
	if end > len(swaps) {
		end = len(swaps)
	}
	return append([]int(nil), swaps[:end]...)
}

// buildSliderTileMapping starts from the identity permutation of
// gridW*gridH tiles and applies activeSteps as consecutive (l, r)
// swaps. It returns an error for a non-positive tile count, an odd
// number of steps, or a step index outside the grid.
func buildSliderTileMapping(gridW, gridH int, activeSteps []int) ([]int, error) {
	tileCount := gridW * gridH
	if tileCount <= 0 {
		return nil, fmt.Errorf("invalid tile count")
	}
	if len(activeSteps)%2 != 0 {
		return nil, fmt.Errorf("invalid steps length: %d", len(activeSteps))
	}

	mapping := make([]int, tileCount)
	for i := range mapping {
		mapping[i] = i
	}
	for idx := 0; idx < len(activeSteps); idx += 2 {
		l, r := activeSteps[idx], activeSteps[idx+1]
		if l < 0 || r < 0 || l >= tileCount || r >= tileCount {
			return nil, fmt.Errorf("step out of range: %d,%d", l, r)
		}
		mapping[l], mapping[r] = mapping[r], mapping[l]
	}
	return mapping, nil
}

// sliderTileRect returns the pixel rectangle of tile index (row-major,
// gridW columns) inside bounds. Edges are computed with integer
// division, so adjacent tiles share borders exactly and together cover
// the whole image even when it is not a multiple of the grid size.
func sliderTileRect(bounds image.Rectangle, gridW, gridH, index int) image.Rectangle {
	row := index / gridW
	col := index % gridW
	x0 := bounds.Min.X + col*bounds.Dx()/gridW
	x1 := bounds.Min.X + (col+1)*bounds.Dx()/gridW
	y0 := bounds.Min.Y + row*bounds.Dy()/gridH
	y1 := bounds.Min.Y + (row+1)*bounds.Dy()/gridH
	return image.Rect(x0, y0, x1, y1)
}

// pixelDiff returns the sum of absolute per-channel differences of the
// R, G and B components as reported by Color.RGBA; alpha is ignored.
func pixelDiff(a, b color.Color) int64 {
	ar, ag, ab, _ := a.RGBA()
	br, bg, bb, _ := b.RGBA()
	return absDiff(ar, br) + absDiff(ag, bg) + absDiff(ab, bb)
}

// absDiff returns |a-b| without unsigned underflow.
func absDiff(a, b uint32) int64 {
	if a > b {
		return int64(a - b)
	}
	return int64(b - a)
}

// generateSliderCursor returns a JSON array of 12 {x, y, t} points
// describing a horizontal drag: x runs linearly from 140 to
// 140 + 620*candidateIndex/candidateCount, y wobbles by ±1 around 430,
// and timestamps are 18 ms apart starting 220 ms before now. It
// returns "[]" when candidateCount is not positive.
func generateSliderCursor(candidateIndex, candidateCount int) string {
	if candidateCount <= 0 {
		return "[]"
	}
	type point struct {
		X int   `json:"x"`
		Y int   `json:"y"`
		T int64 `json:"t"`
	}
	startX := 140
	endX := startX + 620*candidateIndex/candidateCount
	startY := 430
	startTime := time.Now().Add(-220 * time.Millisecond).UnixMilli()

	points := make([]point, 12)
	for i := 0; i < 12; i++ {
		points[i] = point{
			X: startX + (endX-startX)*i/11,
			Y: startY + (i%3 - 1),
			T: startTime + int64(i*18),
		}
	}
	// Marshalling a slice of plain int fields cannot fail.
	data, _ := json.Marshal(points)
	return string(data)
}
diff --git a/captcha-service/cluster.go b/captcha-service/cluster.go new file mode 100644 index
0000000..75b2e26 --- /dev/null +++ b/captcha-service/cluster.go @@ -0,0 +1,201 @@
// cluster.go — peer-to-peer fan-out for the captcha-service.
//
// Each binary is symmetric: it can act as the master (the entry
// point the client talks to) AND as a slave (a worker another peer
// forwards to). The peer the client hits acts as master FOR THAT
// REQUEST — there's no global leader. Picking a peer for a /cred
// call is just round-robin over the configured peer list, including
// self.
//
// Why this exists: VK enforces captcha rate-limits per source IP,
// so a single server's per-IP budget caps the unique-identity
// throughput. Running N captcha-services on N distinct VPS IPs
// multiplies the budget. The client only ever talks to one URL;
// the master transparently distributes work behind it.
//
// Saturation tracking: each peer knows its OWN cooldown state
// (directSaturated() — trips on VK ERROR_LIMIT, auto-clears after
// captchaCooldown). When a master forwards to a peer and the peer
// returns 429 or sets X-Captcha-Self-Saturated: 1, the master
// records "peer X cool down until now + 60 s" locally and skips X
// in subsequent rounds. No proactive gossip — saturation is learned
// passively from response.

package main

import (
	"bytes"
	"context"
	"encoding/json"
	"fmt"
	"io"
	"log"
	"net/http"
	"os"
	"strings"
	"sync"
	"sync/atomic"
	"time"
)

// peer is one member of the fleet as seen from this process.
type peer struct {
	URL  string // empty when Self is true and we never need to dial out
	Key  string // bearer token sent to this peer's /internal/cred
	Self bool   // true for the entry that represents this process

	mu             sync.Mutex // guards saturatedUntil
	saturatedUntil time.Time  // peer is skipped by pickPeer until this instant
}

var (
	// peers is the configured peer list, populated once by initPeers.
	peers []*peer
	// rrCursor is the shared round-robin position advanced by pickPeer.
	rrCursor atomic.Uint64
)

// peerHTTPClient is dedicated to inter-peer /internal/cred calls. A
// separate transport from sharedAuthClient (which talks to VK) so a
// hung peer can't starve the captcha pipeline of HTTP connections,
// and vice versa.
+var peerHTTPClient = &http.Client{ + Timeout: 90 * time.Second, + Transport: &http.Transport{ + MaxIdleConns: 20, + MaxIdleConnsPerHost: 20, + IdleConnTimeout: 120 * time.Second, + }, +} + +// initPeers parses PEERS + SELF_URL env vars. PEERS is comma- +// separated URL|KEY pairs; SELF_URL must exactly match one of the +// URLs (so the binary can recognise itself and avoid HTTP-looping +// back to its own listen socket). +// +// PEERS absent → single-node mode; the binary serves /cred locally +// with no forwarding. This is the same behaviour as V1. +func initPeers() { + peersEnv := strings.TrimSpace(os.Getenv("PEERS")) + selfURL := strings.TrimSpace(os.Getenv("SELF_URL")) + + if peersEnv == "" { + peers = []*peer{{Self: true}} + log.Printf("cluster: single-node mode (no PEERS configured)") + return + } + + sawSelf := false + for _, entry := range strings.Split(peersEnv, ",") { + entry = strings.TrimSpace(entry) + if entry == "" { + continue + } + parts := strings.SplitN(entry, "|", 2) + if len(parts) != 2 { + log.Fatalf("cluster: malformed PEERS entry %q (expected URL|API_KEY)", entry) + } + url := strings.TrimRight(strings.TrimSpace(parts[0]), "/") + key := strings.TrimSpace(parts[1]) + isSelf := url == selfURL + if isSelf { + sawSelf = true + } + peers = append(peers, &peer{URL: url, Key: key, Self: isSelf}) + } + + if !sawSelf { + log.Fatalf("cluster: SELF_URL=%q does not match any PEERS entry; refusing to start (would loop)", selfURL) + } + + log.Printf("cluster: %d peer(s) configured, self=%s", len(peers), selfURL) + for _, p := range peers { + role := "remote" + if p.Self { + role = "self" + } + log.Printf("cluster: - %s (%s)", p.URL, role) + } +} + +func (p *peer) isAvailable() bool { + p.mu.Lock() + defer p.mu.Unlock() + return time.Now().After(p.saturatedUntil) +} + +func (p *peer) markSaturated() { + p.mu.Lock() + defer p.mu.Unlock() + until := time.Now().Add(captchaCooldown) + if until.After(p.saturatedUntil) { + p.saturatedUntil = until + } +} + +func 
(p *peer) statusLabel() string { + if p.Self { + return "self" + } + return p.URL +} + +// pickPeer advances the round-robin cursor by one and returns the +// next available (non-saturated) peer. Caller is expected to use +// it inside a loop bounded by len(peers): if every peer is +// saturated, this returns nil after a full sweep. +func pickPeer(triedMask []bool) *peer { + for i := 0; i < len(peers); i++ { + idx := int(rrCursor.Add(1)-1) % len(peers) + if triedMask[idx] { + continue + } + if !peers[idx].isAvailable() { + continue + } + triedMask[idx] = true + return peers[idx] + } + return nil +} + +// forwardToPeer POSTs to peer.URL + /internal/cred with the link. +// On 429 / X-Captcha-Self-Saturated=1 it returns the saturated flag +// so the master can mark the peer for cooldown. On other errors the +// returned saturated=false leaves the peer in rotation (transient +// failures shouldn't ban the peer for 60 s). +func forwardToPeer(ctx context.Context, p *peer, link string) (*credResponse, bool, error) { + body, _ := json.Marshal(map[string]string{"link": link}) + req, err := http.NewRequestWithContext(ctx, "POST", p.URL+"/internal/cred", bytes.NewReader(body)) + if err != nil { + return nil, false, fmt.Errorf("build request: %w", err) + } + req.Header.Set("Content-Type", "application/json") + req.Header.Set("Authorization", "Bearer "+p.Key) + + resp, err := peerHTTPClient.Do(req) + if err != nil { + return nil, false, fmt.Errorf("call peer %s: %w", p.URL, err) + } + defer resp.Body.Close() + + rawBody, _ := io.ReadAll(resp.Body) + + saturated := resp.Header.Get("X-Captcha-Self-Saturated") == "1" || resp.StatusCode == http.StatusTooManyRequests + + if resp.StatusCode == http.StatusTooManyRequests { + return nil, true, fmt.Errorf("peer %s saturated", p.URL) + } + if resp.StatusCode != http.StatusOK { + var errBody errorResponse + _ = json.Unmarshal(rawBody, &errBody) + msg := errBody.Error + if msg == "" { + msg = fmt.Sprintf("HTTP %d", resp.StatusCode) + } + 
return nil, saturated, fmt.Errorf("peer %s: %s", p.URL, msg) + } + + var cr credResponse + if err := json.Unmarshal(rawBody, &cr); err != nil { + return nil, saturated, fmt.Errorf("peer %s decode: %w", p.URL, err) + } + return &cr, saturated, nil +} diff --git a/captcha-service/creds.go b/captcha-service/creds.go new file mode 100644 index 0000000..30e9f56 --- /dev/null +++ b/captcha-service/creds.go @@ -0,0 +1,171 @@ +// creds.go — captcha-and-identity pipeline against VK Calls. +// +// Lifted verbatim from wireguard-apple/Sources/WireGuardKitGo/turn_proxy.go. +// Keep changes in sync when the iOS side gets fixes — eventually the +// shared logic should move into a third Go package both consume, but +// for V1 we copy. + +package main + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "io" + "log" + "net/http" + neturl "net/url" + "strings" + "time" + + "github.com/google/uuid" +) + +type getCredsFunc func(context.Context, string) (string, string, string, error) + +// sharedAuthClient — package-level so the connection pool spans the +// whole server lifetime. See F4 in the iOS-side commit history. +// customDial already carries the WARP control hook (see dns_resolver.go), +// so VK token-acquisition POSTs egress via WARP when WARP_INTERFACE is +// configured. 
+var sharedAuthClient = &http.Client{ + Timeout: 20 * time.Second, + Transport: &http.Transport{ + DialContext: customDial, + MaxIdleConns: 100, + MaxIdleConnsPerHost: 100, + IdleConnTimeout: 90 * time.Second, + }, +} + +func getCreds(ctx context.Context, link string) (resUser string, resPass string, resTurn string, resErr error) { + profile := getRandomProfile() + name := generateName() + escapedName := neturl.QueryEscape(name) + + log.Printf("Connecting - Name: %s | UA: %s", name, profile.UserAgent) + + doRequest := func(data string, url string) (resp map[string]interface{}, err error) { + req, err := http.NewRequestWithContext(ctx, "POST", url, bytes.NewBuffer([]byte(data))) + if err != nil { + return nil, err + } + + req.Header.Add("User-Agent", profile.UserAgent) + req.Header.Add("Content-Type", "application/x-www-form-urlencoded") + + httpResp, err := sharedAuthClient.Do(req) + if err != nil { + return nil, err + } + defer func() { + if closeErr := httpResp.Body.Close(); closeErr != nil { + log.Printf("close response body: %s", closeErr) + } + }() + + body, err := io.ReadAll(httpResp.Body) + if err != nil { + return nil, err + } + + err = json.Unmarshal(body, &resp) + if err != nil { + return nil, err + } + + return resp, nil + } + + var resp map[string]interface{} + defer func() { + if r := recover(); r != nil { + log.Printf("get TURN creds error (bad JSON?): %v\n\n", resp) + resErr = fmt.Errorf("panic in getCreds: %v", r) + } + }() + + data := "client_id=6287487&token_type=messages&client_secret=QbYic1K3lEV5kTGiqlq2&version=1&app_id=6287487" + url := "https://login.vk.com/?act=get_anonym_token" + + resp, err := doRequest(data, url) + if err != nil { + return "", "", "", fmt.Errorf("request error:%s", err) + } + + token1 := resp["data"].(map[string]interface{})["access_token"].(string) + + data = fmt.Sprintf("vk_join_link=https://vk.com/call/join/%s&name=%s&access_token=%s", link, escapedName, token1) + reqURL := 
"https://api.vk.com/method/calls.getAnonymousToken?v=5.274&client_id=6287487" + + var token2 string + const maxCaptchaAttempts = 3 + for attempt := 0; attempt <= maxCaptchaAttempts; attempt++ { + resp, err = doRequest(data, reqURL) + if err != nil { + return "", "", "", fmt.Errorf("request error:%s", err) + } + + if errObj, hasErr := resp["error"].(map[string]interface{}); hasErr { + errCode, _ := errObj["error_code"].(float64) + if errCode == 14 { + if attempt == maxCaptchaAttempts { + return "", "", "", fmt.Errorf("captcha failed after %d attempts", maxCaptchaAttempts) + } + + captchaErr := ParseVkCaptchaError(errObj) + if captchaErr.IsCaptchaError() { + log.Printf("[Captcha] Attempt %d/%d: solving...", attempt+1, maxCaptchaAttempts) + + successToken, solveErr := solveVkCaptcha(ctx, captchaErr) + if solveErr != nil { + return "", "", "", fmt.Errorf("captcha solve error: %v", solveErr) + } + + if captchaErr.CaptchaAttempt == "0" || captchaErr.CaptchaAttempt == "" { + captchaErr.CaptchaAttempt = "1" + } + + data = fmt.Sprintf("vk_join_link=https://vk.com/call/join/%s&name=%s"+ + "&captcha_key=&captcha_sid=%s&is_sound_captcha=0&success_token=%s"+ + "&captcha_ts=%s&captcha_attempt=%s&access_token=%s", + link, escapedName, captchaErr.CaptchaSid, successToken, + captchaErr.CaptchaTs, captchaErr.CaptchaAttempt, token1) + continue + } + } + return "", "", "", fmt.Errorf("VK API error: %v", errObj) + } + + token2 = resp["response"].(map[string]interface{})["token"].(string) + break + } + + data = fmt.Sprintf("%s%s%s", "session_data=%7B%22version%22%3A2%2C%22device_id%22%3A%22", uuid.New(), "%22%2C%22client_version%22%3A1.1%2C%22client_type%22%3A%22SDK_JS%22%7D&method=auth.anonymLogin&format=JSON&application_key=CGMMEJLGDIHBABABA") + url = "https://calls.okcdn.ru/fb.do" + + resp, err = doRequest(data, url) + if err != nil { + return "", "", "", fmt.Errorf("request error:%s", err) + } + + token3 := resp["session_key"].(string) + + data = 
fmt.Sprintf("joinLink=%s&isVideo=false&protocolVersion=5&anonymToken=%s&method=vchat.joinConversationByLink&format=JSON&application_key=CGMMEJLGDIHBABABA&session_key=%s", link, token2, token3) + url = "https://calls.okcdn.ru/fb.do" + + resp, err = doRequest(data, url) + if err != nil { + return "", "", "", fmt.Errorf("request error:%s", err) + } + + user := resp["turn_server"].(map[string]interface{})["username"].(string) + pass := resp["turn_server"].(map[string]interface{})["credential"].(string) + turn := resp["turn_server"].(map[string]interface{})["urls"].([]interface{})[0].(string) + + clean := strings.Split(turn, "?")[0] + address := strings.TrimPrefix(strings.TrimPrefix(clean, "turn:"), "turns:") + + return user, pass, address, nil +} diff --git a/captcha-service/dns_resolver.go b/captcha-service/dns_resolver.go new file mode 100644 index 0000000..5a4c92d --- /dev/null +++ b/captcha-service/dns_resolver.go @@ -0,0 +1,192 @@ +// SPDX-License-Identifier: MIT +// +// Resilient DNS for VK captcha/identity HTTP. The system resolver on +// mobile carriers in censorship-heavy regions sometimes returns +// NXDOMAIN, hijacked IPs, or hangs on api.vk.com / id.vk.com lookups, +// even when the underlying network is otherwise fine. The captcha +// solver then errors out with "no such host" or a timeout before any +// of our retry logic can engage. +// +// customDial is a drop-in replacement for net.Dialer.DialContext that +// layers: +// 1. literal IP addresses — dial immediately, no resolution. +// 2. system resolver — 4 s budget. Works on WiFi where the +// carrier isn't censoring. +// 3. DNS-over-HTTPS (DoH) — Cloudflare's 1.1.1.1 JSON endpoint +// by IP, so the lookup itself needs +// no DNS. Cached for 10 minutes per +// hostname to avoid hammering DoH. +// 4. fallback IP map — last-resort hardcoded A records for +// VK domains, in case DoH is also +// blocked. Stale risk but better than +// a hard failure. 
+// +// The TLS handshake uses the original hostname (Go's http.Transport +// passes the request URL host as SNI/ServerName regardless of what +// DialContext returned), so dialing to a raw IP doesn't break cert +// verification. + +package main + +import ( + "context" + "crypto/tls" + "encoding/json" + "fmt" + "io" + "log" + "net" + "net/http" + "strings" + "sync" + "time" +) + +const ( + dohURL = "https://1.1.1.1/dns-query" + dohCacheTTL = 10 * time.Minute + systemDialBudget = 4 * time.Second + dohDialBudget = 6 * time.Second +) + +// dohClient is used ONLY for the DoH lookup itself. Plain net.Dialer +// (no recursion into customDial), with the WARP control hook so DoH +// queries to 1.1.1.1 also egress via the WARP interface when +// WARP_INTERFACE is set. Cloudflare's 1.1.1.1 is reachable from inside +// WARP just fine. +var dohClient = &http.Client{ + Timeout: 5 * time.Second, + Transport: &http.Transport{ + DialContext: (&net.Dialer{Timeout: 4 * time.Second, Control: warpControl}).DialContext, + TLSClientConfig: &tls.Config{}, + }, +} + +type dohEntry struct { + ips []string + expires time.Time +} + +var dohCache sync.Map // host -> dohEntry + +// Last-resort hardcoded A records. Used only if BOTH system resolver +// and DoH fail. VK's API endpoints have lived on these IPs for a long +// time; refresh manually if VK migrates infrastructure. +var fallbackIPs = map[string][]string{ + "login.vk.com": {"87.240.132.78", "87.240.137.158"}, + "api.vk.com": {"87.240.132.78", "87.240.137.158"}, + "id.vk.com": {"87.240.132.78", "87.240.137.158"}, + "vk.com": {"87.240.132.78", "87.240.137.158"}, + "m.vk.com": {"87.240.132.78"}, + // keep .ru hosts too in case some upstream code path still + // hits them (and they're reachable on the user's network). 
+ "login.vk.ru": {"87.240.137.158", "87.240.190.78"}, + "api.vk.ru": {"87.240.137.158", "87.240.190.78"}, + "id.vk.ru": {"87.240.137.158", "87.240.190.78"}, + "vk.ru": {"87.240.137.158"}, +} + +// customDial is the net.Dialer.DialContext-shaped function plug into +// http.Transport on any HTTP client that needs censorship-tolerant DNS. +func customDial(ctx context.Context, network, address string) (net.Conn, error) { + host, port, err := net.SplitHostPort(address) + if err != nil { + return nil, err + } + + // Fast path: literal IP needs no resolution. + // Control hook pins the socket to WARP_INTERFACE when set; no-op + // otherwise. See warp_dialer.go. + if net.ParseIP(host) != nil { + return (&net.Dialer{Timeout: 8 * time.Second, Control: warpControl}).DialContext(ctx, network, address) + } + + // Layer 1: system resolver. WARP-pinned via Control hook. + d := &net.Dialer{Timeout: dohDialBudget, Control: warpControl} + sysCtx, cancel := context.WithTimeout(ctx, systemDialBudget) + conn, sysErr := d.DialContext(sysCtx, network, address) + cancel() + if sysErr == nil { + return conn, nil + } + log.Printf("dns: system resolve+dial failed for %s: %v — falling back to DoH", host, sysErr) + + // Layer 2: DoH. + if ips, err := resolveViaDoH(ctx, host); err == nil && len(ips) > 0 { + log.Printf("dns: DoH %s → %v", host, ips) + for _, ip := range ips { + c, derr := d.DialContext(ctx, network, net.JoinHostPort(ip, port)) + if derr == nil { + return c, nil + } + log.Printf("dns: dial %s (DoH) failed: %v", ip, derr) + } + } else if err != nil { + log.Printf("dns: DoH lookup failed for %s: %v", host, err) + } + + // Layer 3: hardcoded fallback. 
+ if ips, ok := fallbackIPs[strings.ToLower(host)]; ok { + log.Printf("dns: trying hardcoded fallback IPs for %s: %v", host, ips) + for _, ip := range ips { + c, derr := d.DialContext(ctx, network, net.JoinHostPort(ip, port)) + if derr == nil { + return c, nil + } + log.Printf("dns: dial %s (fallback) failed: %v", ip, derr) + } + } + + return nil, fmt.Errorf("all DNS layers exhausted for %s (sys=%v)", host, sysErr) +} + +func resolveViaDoH(ctx context.Context, host string) ([]string, error) { + host = strings.ToLower(host) + if v, ok := dohCache.Load(host); ok { + if entry, ok := v.(dohEntry); ok && time.Now().Before(entry.expires) { + return entry.ips, nil + } + } + + url := dohURL + "?name=" + host + "&type=A" + req, err := http.NewRequestWithContext(ctx, "GET", url, nil) + if err != nil { + return nil, err + } + req.Header.Set("accept", "application/dns-json") + + resp, err := dohClient.Do(req) + if err != nil { + return nil, err + } + defer resp.Body.Close() + body, err := io.ReadAll(resp.Body) + if err != nil { + return nil, err + } + + var doh struct { + Answer []struct { + Type int `json:"type"` + Data string `json:"data"` + } `json:"Answer"` + } + if err := json.Unmarshal(body, &doh); err != nil { + return nil, err + } + + var ips []string + for _, a := range doh.Answer { + if a.Type == 1 && net.ParseIP(a.Data) != nil { // A record + ips = append(ips, strings.TrimSpace(a.Data)) + } + } + if len(ips) == 0 { + return nil, fmt.Errorf("DoH returned no A records for %s", host) + } + dohCache.Store(host, dohEntry{ + ips: ips, + expires: time.Now().Add(dohCacheTTL), + }) + return ips, nil +} diff --git a/captcha-service/go.mod b/captcha-service/go.mod new file mode 100644 index 0000000..79a2fa6 --- /dev/null +++ b/captcha-service/go.mod @@ -0,0 +1,23 @@ +module github.com/truvvor/turnbridge/captcha-service + +go 1.25.0 + +require github.com/google/uuid v1.6.0 + +require ( + github.com/andybalholm/brotli v1.2.0 // indirect + github.com/bdandy/go-errors v1.2.2 // 
indirect + github.com/bdandy/go-socks4 v1.2.3 // indirect + github.com/bogdanfinn/fhttp v0.6.8 // indirect + github.com/bogdanfinn/quic-go-utls v1.0.9-utls // indirect + github.com/bogdanfinn/tls-client v1.14.0 // indirect + github.com/bogdanfinn/utls v1.7.7-barnius // indirect + github.com/bogdanfinn/websocket v1.5.5-barnius // indirect + github.com/klauspost/compress v1.18.2 // indirect + github.com/quic-go/qpack v0.6.0 // indirect + github.com/tam7t/hpkp v0.0.0-20160821193359-2b70b4024ed5 // indirect + golang.org/x/crypto v0.46.0 // indirect + golang.org/x/net v0.48.0 // indirect + golang.org/x/sys v0.46.0 // indirect + golang.org/x/text v0.32.0 // indirect +) diff --git a/captcha-service/go.sum b/captcha-service/go.sum new file mode 100644 index 0000000..330c434 --- /dev/null +++ b/captcha-service/go.sum @@ -0,0 +1,38 @@ +github.com/andybalholm/brotli v1.2.0 h1:ukwgCxwYrmACq68yiUqwIWnGY0cTPox/M94sVwToPjQ= +github.com/andybalholm/brotli v1.2.0/go.mod h1:rzTDkvFWvIrjDXZHkuS16NPggd91W3kUSvPlQ1pLaKY= +github.com/bdandy/go-errors v1.2.2 h1:WdFv/oukjTJCLa79UfkGmwX7ZxONAihKu4V0mLIs11Q= +github.com/bdandy/go-errors v1.2.2/go.mod h1:NkYHl4Fey9oRRdbB1CoC6e84tuqQHiqrOcZpqFEkBxM= +github.com/bdandy/go-socks4 v1.2.3 h1:Q6Y2heY1GRjCtHbmlKfnwrKVU/k81LS8mRGLRlmDlic= +github.com/bdandy/go-socks4 v1.2.3/go.mod h1:98kiVFgpdogR8aIGLWLvjDVZ8XcKPsSI/ypGrO+bqHI= +github.com/bogdanfinn/fhttp v0.6.8 h1:LiQyHOY3i0QoxxNB7nq27/nGNNbtPj0fuBPozhR7Ws4= +github.com/bogdanfinn/fhttp v0.6.8/go.mod h1:A+EKDzMx2hb4IUbMx4TlkoHnaJEiLl8r/1Ss1Y+5e5M= +github.com/bogdanfinn/quic-go-utls v1.0.9-utls h1:tV6eDEiRbRCcepALSzxR94JUVD3N3ACIiRLgyc2Ep8s= +github.com/bogdanfinn/quic-go-utls v1.0.9-utls/go.mod h1:aHph9B9H9yPOt5xnhWKSOum27DJAqpiHzwX+gjvaXcg= +github.com/bogdanfinn/tls-client v1.14.0 h1:vyk7Cn4BIvLAGVuMfb0tP22OqogfO1lYamquQNEZU1A= +github.com/bogdanfinn/tls-client v1.14.0/go.mod h1:LsU6mXVn8MOFDwTkyRfI7V1BZM1p0wf2ZfZsICW/1fM= +github.com/bogdanfinn/utls v1.7.7-barnius 
h1:OuJ497cc7F3yKNVHRsYPQdGggmk5x6+V5ZlrCR7fOLU= +github.com/bogdanfinn/utls v1.7.7-barnius/go.mod h1:aAK1VZQlpKZClF1WEQeq6kyclbkPq4hz6xTbB5xSlmg= +github.com/bogdanfinn/websocket v1.5.5-barnius h1:bY+qnxpai1qe7Jmjx+Sds/cmOSpuuLoR8x61rWltjOI= +github.com/bogdanfinn/websocket v1.5.5-barnius/go.mod h1:gvvEw6pTKHb7yOiFvIfAFTStQWyrm25BMVCTj5wRSsI= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/klauspost/compress v1.18.2 h1:iiPHWW0YrcFgpBYhsA6D1+fqHssJscY/Tm/y2Uqnapk= +github.com/klauspost/compress v1.18.2/go.mod h1:R0h/fSBs8DE4ENlcrlib3PsXS61voFxhIs2DeRhCvJ4= +github.com/quic-go/qpack v0.6.0 h1:g7W+BMYynC1LbYLSqRt8PBg5Tgwxn214ZZR34VIOjz8= +github.com/quic-go/qpack v0.6.0/go.mod h1:lUpLKChi8njB4ty2bFLX2x4gzDqXwUpaO1DP9qMDZII= +github.com/tam7t/hpkp v0.0.0-20160821193359-2b70b4024ed5 h1:YqAladjX7xpA6BM04leXMWAEjS0mTZ5kUU9KRBriQJc= +github.com/tam7t/hpkp v0.0.0-20160821193359-2b70b4024ed5/go.mod h1:2JjD2zLQYH5HO74y5+aE3remJQvl6q4Sn6aWA2wD1Ng= +golang.org/x/crypto v0.46.0 h1:cKRW/pmt1pKAfetfu+RCEvjvZkA9RimPbh7bhFjGVBU= +golang.org/x/crypto v0.46.0/go.mod h1:Evb/oLKmMraqjZ2iQTwDwvCtJkczlDuTmdJXoZVzqU0= +golang.org/x/net v0.0.0-20211104170005-ce137452f963/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= +golang.org/x/net v0.48.0 h1:zyQRTTrjc33Lhh0fBgT/H3oZq9WuvRR5gPC70xpDiQU= +golang.org/x/net v0.48.0/go.mod h1:+ndRgGjkh8FGtu1w1FGbEC31if4VrNVMuKTgcAAnQRY= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.46.0 h1:noSf2Fq6F8DBgS+LysIkx7rIExoNHJsxOAtPp4rthXw= +golang.org/x/sys v0.46.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= 
+golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.32.0 h1:ZD01bjUt1FQ9WJ0ClOL5vxgxOI/sVCNgX1YtKwcY0mU= +golang.org/x/text v0.32.0/go.mod h1:o/rUWzghvpD5TXrTIBuJU77MTaN0ljMWE47kxGJQ7jY= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= diff --git a/captcha-service/identity.go b/captcha-service/identity.go new file mode 100644 index 0000000..578b188 --- /dev/null +++ b/captcha-service/identity.go @@ -0,0 +1,78 @@ +package main + +import ( + "fmt" + mathrand "math/rand" +) + +type Profile struct { + UserAgent string + SecChUa string + SecChUaMobile string + SecChUaPlatform string +} + + +var firstNames = []string{ + "Александр", "Дмитрий", "Максим", "Сергей", "Андрей", "Алексей", "Артём", "Илья", + "Кирилл", "Михаил", "Никита", "Матвей", "Роман", "Егор", "Арсений", "Иван", + "Денис", "Даниил", "Тимофей", "Владислав", "Игорь", "Павел", "Руслан", "Марк", + "Анна", "Мария", "Елена", "Дарья", "Анастасия", "Екатерина", "Виктория", "Ольга", + "Наталья", "Юлия", "Татьяна", "Светлана", "Ирина", "Ксения", "Алина", "Елизавета", +} + +var lastNames = []string{ + "Иванов", "Смирнов", "Кузнецов", "Попов", "Васильев", "Петров", "Соколов", "Михайлов", + "Новиков", "Федоров", "Морозов", "Волков", "Алексеев", "Лебедев", "Семенов", "Егоров", + "Павлов", "Козлов", "Степанов", "Николаев", "Орлов", "Андреев", "Макаров", "Никитин", + "Захаров", "Зайцев", "Соловьев", "Борисов", "Яковлев", "Григорьев", "Романов", "Воробьев", +} + +var profiles = []Profile{ + // iPhone Safari only. VK's anti-bot pipeline triggers the + // "Confirm you're not a robot" checkbox when it sees a mismatch + // between the connection (Russian cellular IP, iPhone-shaped TLS + // fingerprint from NSURLSession's underlying CFNetwork stack) + // and the User-Agent header. 
Real users clicking a VK call link + // from Safari on iPhone aren't asked for a captcha — and that's + // exactly the request we want to look like. + // + // Safari deliberately doesn't implement Client Hints; vk_captcha + // skips the sec-ch-ua headers entirely when SecChUa is empty, + // matching what mobile Safari actually sends on the wire. + { + UserAgent: "Mozilla/5.0 (iPhone; CPU iPhone OS 18_1_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.1.1 Mobile/15E148 Safari/604.1", + }, + { + UserAgent: "Mozilla/5.0 (iPhone; CPU iPhone OS 18_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.1 Mobile/15E148 Safari/604.1", + }, + { + UserAgent: "Mozilla/5.0 (iPhone; CPU iPhone OS 18_0_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.0 Mobile/15E148 Safari/604.1", + }, + { + UserAgent: "Mozilla/5.0 (iPhone; CPU iPhone OS 17_6_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.6 Mobile/15E148 Safari/604.1", + }, + { + UserAgent: "Mozilla/5.0 (iPhone; CPU iPhone OS 17_5_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.5 Mobile/15E148 Safari/604.1", + }, + { + UserAgent: "Mozilla/5.0 (iPhone; CPU iPhone OS 17_4_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4.1 Mobile/15E148 Safari/604.1", + }, +} + +func getRandomProfile() Profile { + return profiles[mathrand.Intn(len(profiles))] +} + +func generateName() string { + if mathrand.Float32() < 0.3 { + return firstNames[mathrand.Intn(len(firstNames))] + } + fn := firstNames[mathrand.Intn(len(firstNames))] + ln := lastNames[mathrand.Intn(len(lastNames))] + lastChar := fn[len(fn)-2:] + if lastChar == "а" || lastChar == "я" { + return fmt.Sprintf("%s %sа", fn, ln) + } + return fmt.Sprintf("%s %s", fn, ln) +} diff --git a/captcha-service/main.go b/captcha-service/main.go new file mode 100644 index 0000000..c7cf2ce --- /dev/null +++ b/captcha-service/main.go @@ -0,0 +1,342 @@ +// captcha-service — server-side 
companion to the iOS TurnBridge +// extension. Hosts the VK captcha + identity-registration pipeline +// outside the 50-100 MB NetworkExtension sandbox. Clients POST a VK +// call link, the server returns ready-to-use TURN credentials the +// client then uses for its own Allocate. +// +// TURN allocations are bound by RFC 5766 to the 5-tuple that issued +// them, so the server CANNOT hand off a live allocation. What it +// hands off is the username/password/relay-address tuple returned +// from vchat.joinConversationByLink — those are HMAC-signed by VK's +// TURN secret and the client can use them from any source IP within +// the ~50 s rotation window. +// +// V2 adds peer-to-peer fan-out (see cluster.go): every binary can +// act as both master (for the client URL it's behind) and slave +// (for sibling masters). Configure with PEERS + SELF_URL env vars. + +package main + +import ( + "context" + "encoding/json" + "fmt" + "log" + "net/http" + "os" + "strings" + "sync" + "sync/atomic" + "time" +) + +type credResponse struct { + User string `json:"user"` + Pass string `json:"pass"` + Addr string `json:"addr"` + ExpiresAt time.Time `json:"expires_at"` +} + +type errorResponse struct { + Error string `json:"error"` +} + +var ( + apiKey string + solveSlot chan struct{} + credsTotal atomic.Int64 + credsErrs atomic.Int64 +) + +func main() { + addr := os.Getenv("LISTEN_ADDR") + if addr == "" { + addr = ":8080" + } + apiKey = os.Getenv("API_KEY") + if apiKey == "" { + log.Fatal("API_KEY env var is required") + } + solveSlot = make(chan struct{}, maxConcurrentCaptchaSolves) + initPeers() + + mux := http.NewServeMux() + mux.HandleFunc("/cred", handleCred) + mux.HandleFunc("/internal/cred", handleInternalCred) + mux.HandleFunc("/stats", handleStats) + mux.HandleFunc("/healthz", func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + _, _ = w.Write([]byte("ok")) + }) + + srv := &http.Server{ + Addr: addr, + Handler: withLogging(mux), + 
// writeJSON sends v as a JSON document with the given status code and a
// Content-Type of application/json.
//
// v is encoded before anything is written: if encoding fails, the failure is
// logged and the client gets a 500 with a generic JSON error body instead of
// the requested status followed by an empty or truncated body.
func writeJSON(w http.ResponseWriter, code int, v interface{}) {
	payload, err := json.Marshal(v)
	if err != nil {
		log.Printf("writeJSON: encode %T: %v", v, err)
		code = http.StatusInternalServerError
		payload = []byte(`{"error":"internal encoding error"}`)
	}

	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(code)
	// The trailing newline keeps the bytes identical to what
	// json.Encoder.Encode produces. A write error means the client went
	// away; nothing useful can be done about it here.
	_, _ = w.Write(append(payload, '\n'))
}
+func handleCred(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodPost { + writeJSON(w, http.StatusMethodNotAllowed, errorResponse{"POST only"}) + return + } + if !authorized(r) { + writeJSON(w, http.StatusUnauthorized, errorResponse{"invalid api key"}) + return + } + + var req struct { + Link string `json:"link"` + } + if err := json.NewDecoder(r.Body).Decode(&req); err != nil { + writeJSON(w, http.StatusBadRequest, errorResponse{fmt.Sprintf("bad body: %v", err)}) + return + } + if req.Link == "" { + writeJSON(w, http.StatusBadRequest, errorResponse{"link is required"}) + return + } + + ctx, cancel := context.WithTimeout(r.Context(), 100*time.Second) + defer cancel() + + tried := make([]bool, len(peers)) + var lastErr error + + for attempt := 0; attempt < len(peers); attempt++ { + p := pickPeer(tried) + if p == nil { + break + } + + var creds *credResponse + var saturated bool + var err error + + if p.Self { + creds, saturated, err = solveLocally(ctx, req.Link) + } else { + creds, saturated, err = forwardToPeer(ctx, p, req.Link) + } + + if saturated { + p.markSaturated() + log.Printf("cluster: peer %s marked saturated for %v", p.statusLabel(), captchaCooldown) + } + + if err != nil { + lastErr = err + log.Printf("cluster: peer %s attempt failed: %v", p.statusLabel(), err) + continue + } + + credsTotal.Add(1) + w.Header().Set("X-Captcha-Served-By", p.statusLabel()) + writeJSON(w, http.StatusOK, *creds) + return + } + + // Distinguish "all peers in cooldown" (return 429 with Retry-After) + // from "we tried and they all errored on this request" (502). 
+ allSaturated := true + for _, p := range peers { + if p.isAvailable() { + allSaturated = false + break + } + } + credsErrs.Add(1) + if allSaturated { + w.Header().Set("Retry-After", "60") + writeJSON(w, http.StatusTooManyRequests, errorResponse{"all peers saturated, retry after cooldown"}) + return + } + msg := "all peers failed" + if lastErr != nil { + msg = lastErr.Error() + } + writeJSON(w, http.StatusBadGateway, errorResponse{msg}) +} + +// handleInternalCred — peer-only endpoint. Same auth gate as /cred, +// but solves locally and never forwards to other peers. This is what +// other masters call when round-robin lands on this binary; using a +// distinct path prevents accidental HTTP loops where a misconfigured +// PEERS list makes a peer forward to itself via /cred. +// +// Response always sets X-Captcha-Self-Saturated so the caller knows +// our current rate-limit state without making a separate /stats call. +func handleInternalCred(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodPost { + writeJSON(w, http.StatusMethodNotAllowed, errorResponse{"POST only"}) + return + } + if !authorized(r) { + writeJSON(w, http.StatusUnauthorized, errorResponse{"invalid api key"}) + return + } + + var req struct { + Link string `json:"link"` + } + if err := json.NewDecoder(r.Body).Decode(&req); err != nil { + writeJSON(w, http.StatusBadRequest, errorResponse{fmt.Sprintf("bad body: %v", err)}) + return + } + if req.Link == "" { + writeJSON(w, http.StatusBadRequest, errorResponse{"link is required"}) + return + } + + ctx, cancel := context.WithTimeout(r.Context(), 80*time.Second) + defer cancel() + + creds, saturated, err := solveLocally(ctx, req.Link) + + if saturated { + w.Header().Set("X-Captcha-Self-Saturated", "1") + } + + if err != nil { + credsErrs.Add(1) + if saturated { + w.Header().Set("Retry-After", "60") + writeJSON(w, http.StatusTooManyRequests, errorResponse{err.Error()}) + return + } + writeJSON(w, http.StatusBadGateway, 
errorResponse{err.Error()}) + return + } + + credsTotal.Add(1) + writeJSON(w, http.StatusOK, *creds) +} + +// solveLocally runs one full VK captcha+identity-registration cycle +// using THIS instance's egress. Used both by /internal/cred (peer- +// to-peer leaf) and /cred when round-robin picks self. Returns +// saturated=true when the solve hit ERROR_LIMIT so the master can +// take this peer out of rotation for captchaCooldown. +func solveLocally(ctx context.Context, link string) (*credResponse, bool, error) { + if directSaturated() { + return nil, true, fmt.Errorf("egress saturated, retry after cooldown") + } + + select { + case solveSlot <- struct{}{}: + case <-ctx.Done(): + return nil, false, fmt.Errorf("solve queue full or context cancelled") + } + defer func() { <-solveSlot }() + + user, pass, addr, err := getCreds(ctx, link) + if err != nil { + // directSaturated() flips during the captcha pipeline when + // VK returns ERROR_LIMIT — re-check after so the caller can + // mark this peer for cooldown even on the request that + // tripped the limit. + return nil, directSaturated(), err + } + + return &credResponse{ + User: user, + Pass: pass, + Addr: addr, + ExpiresAt: time.Now().Add(45 * time.Second), // VK rotates ~50 s; 45 s is the safe usable window. + }, directSaturated(), nil +} + +// handleStats — GET /stats → snapshot of solve counters and cluster +// peer state. No auth so monitoring can scrape it; only counters, +// no per-cred info. 
+func handleStats(w http.ResponseWriter, r *http.Request) { + type peerStat struct { + URL string `json:"url"` + Self bool `json:"self"` + Available bool `json:"available"` + SaturatedRemaining int64 `json:"saturated_remaining_seconds"` + } + peerStats := make([]peerStat, 0, len(peers)) + now := time.Now() + for _, p := range peers { + p.mu.Lock() + remaining := int64(0) + if p.saturatedUntil.After(now) { + remaining = int64(p.saturatedUntil.Sub(now).Seconds()) + } + peerStats = append(peerStats, peerStat{ + URL: p.statusLabel(), + Self: p.Self, + Available: p.saturatedUntil.Before(now), + SaturatedRemaining: remaining, + }) + p.mu.Unlock() + } + + stats.mu.Lock() + snap := struct { + Attempts int64 `json:"attempts"` + Successes int64 `json:"successes"` + Saturated int64 `json:"saturated"` + InFlight int64 `json:"in_flight"` + CredsTotal int64 `json:"creds_total"` + CredsErrors int64 `json:"creds_errors"` + SaturatedNow bool `json:"saturated_now"` + UptimeSeconds int64 `json:"uptime_seconds"` + WARP string `json:"warp"` + Peers []peerStat `json:"peers"` + }{ + Attempts: stats.attempts, + Successes: stats.successes, + Saturated: stats.saturatedTotal, + InFlight: stats.inFlight, + CredsTotal: credsTotal.Load(), + CredsErrors: credsErrs.Load(), + SaturatedNow: directSaturated(), + UptimeSeconds: int64(time.Since(startedAt).Seconds()), + WARP: warpStatus(), + Peers: peerStats, + } + stats.mu.Unlock() + writeJSON(w, http.StatusOK, snap) +} + +var startedAt = time.Now() + +// Compile-time guard so go-mod-tidy doesn't drop the imports if some +// helpers are inadvertently dead-coded during refactors. +var _ = sync.Mutex{} diff --git a/captcha-service/stubs.go b/captcha-service/stubs.go new file mode 100644 index 0000000..6b058e8 --- /dev/null +++ b/captcha-service/stubs.go @@ -0,0 +1,198 @@ +// stubs.go — server-side replacements for iOS-specific globals that +// vk_captcha.go and captcha_slider.go reach for. 
// captchaCooldown is how long the egress stays flagged as saturated after
// a captcha solve hits ERROR_LIMIT.
const captchaCooldown = 60 * time.Second

// saturatedAt holds the unix-nano timestamp of the last ERROR_LIMIT;
// zero means the limit has never been hit.
var saturatedAt atomic.Int64

// directSaturated reports whether the last recorded ERROR_LIMIT is more
// recent than captchaCooldown.
func directSaturated() bool {
	switch trippedAt := saturatedAt.Load(); trippedAt {
	case 0:
		return false
	default:
		elapsed := time.Since(time.Unix(0, trippedAt))
		return elapsed < captchaCooldown
	}
}
// captchaTrap collects debug artefacts for one captcha attempt. Notes and
// saved blobs are buffered in memory; Commit materialises them under
// trapBaseDir, Discard throws them away. Once either has run the trap is
// finished and further Note/Save calls are ignored. All methods are safe
// on a nil receiver and on the zero value.
type captchaTrap struct {
	mu       sync.Mutex
	prefix   string
	created  time.Time
	notes    []string
	saves    map[string][]byte
	finished bool
}

// trapBaseDir is the directory Commit writes into: $CAPTCHA_TRAP_DIR when
// set, otherwise ./trap. It is read once at package initialisation.
var trapBaseDir = func() string {
	if d := os.Getenv("CAPTCHA_TRAP_DIR"); d != "" {
		return d
	}
	return "./trap"
}()

// newCaptchaTrap returns an empty trap whose output directory name will
// carry prefix and the creation timestamp.
func newCaptchaTrap(prefix string) *captchaTrap {
	return &captchaTrap{
		prefix:  prefix,
		created: time.Now(),
		saves:   make(map[string][]byte),
	}
}

// Note appends one timestamped (UTC, RFC 3339 nano) line to the trap's log.
// It is a no-op on a nil or finished trap.
func (t *captchaTrap) Note(format string, args ...interface{}) {
	if t == nil {
		return
	}
	t.mu.Lock()
	defer t.mu.Unlock()
	if t.finished {
		// Nothing will ever write these lines out; don't accumulate them.
		return
	}
	t.notes = append(t.notes, fmt.Sprintf("[%s] ", time.Now().UTC().Format(time.RFC3339Nano))+fmt.Sprintf(format, args...))
}

// Save stores a private copy of data under name, replacing any earlier
// blob with the same name. It is a no-op on a nil or finished trap.
func (t *captchaTrap) Save(name string, data []byte) {
	if t == nil {
		return
	}
	t.mu.Lock()
	defer t.mu.Unlock()
	if t.finished {
		// Discard/Commit release the map; writing to it here used to
		// panic with "assignment to entry in nil map".
		return
	}
	if t.saves == nil {
		// Zero-value trap that was not built by newCaptchaTrap.
		t.saves = make(map[string][]byte)
	}
	// Copy so the caller can reuse the buffer.
	buf := make([]byte, len(data))
	copy(buf, data)
	t.saves[name] = buf
}

// Commit writes everything collected so far into
// <trapBaseDir>/<created UTC stamp>_<prefix>_<reason>/ — the notes as
// notes.log and each saved blob under its own name — and marks the trap
// finished. Only the first Commit/Discard has any effect. Nothing is
// written when trapBaseDir or the prefix is empty.
func (t *captchaTrap) Commit(reason string) {
	if t == nil {
		return
	}
	t.mu.Lock()
	if t.finished {
		t.mu.Unlock()
		return
	}
	t.finished = true
	prefix, notes, saves := t.prefix, t.notes, t.saves
	// The trap no longer needs the buffers; let them be collected once
	// the writes below are done.
	t.notes, t.saves = nil, nil
	t.mu.Unlock()

	if trapBaseDir == "" || prefix == "" {
		return
	}
	stamp := t.created.UTC().Format("20060102_150405")
	dir := filepath.Join(trapBaseDir, fmt.Sprintf("%s_%s_%s", stamp, prefix, reason))
	if err := os.MkdirAll(dir, 0o755); err != nil {
		// Debug artefacts are best-effort: an unwritable directory must
		// never fail the captcha solve itself.
		return
	}
	if len(notes) > 0 {
		var b []byte
		for _, n := range notes {
			b = append(b, n...)
			b = append(b, '\n')
		}
		_ = os.WriteFile(filepath.Join(dir, "notes.log"), b, 0o644) // best-effort, see above
	}
	for name, data := range saves {
		_ = os.WriteFile(filepath.Join(dir, name), data, 0o644) // best-effort, see above
	}
}

// Discard drops everything collected so far and marks the trap finished.
func (t *captchaTrap) Discard() {
	if t == nil {
		return
	}
	t.mu.Lock()
	t.finished = true
	t.notes = nil
	t.saves = nil
	t.mu.Unlock()
}
// randomHex returns 2*n lowercase hex characters encoding n random bytes.
// The bytes come from crypto/rand; should that read fail, every byte is
// drawn from math/rand instead.
func randomHex(n int) string {
	buf := make([]byte, n)
	_, err := rand.Read(buf)
	if err != nil {
		for idx := 0; idx < len(buf); idx++ {
			buf[idx] = byte(mathrand.Intn(256))
		}
	}
	return hex.EncodeToString(buf)
}
+func newCaptchaClient(_ bool) tlsclient.HttpClient { + c, err := newTLSCaptchaClient() + if err != nil { + panic(fmt.Sprintf("newTLSCaptchaClient: %v", err)) + } + return c +} + +func ParseVkCaptchaError(errData map[string]interface{}) *VkCaptchaError { + codeFloat, _ := errData["error_code"].(float64) + code := int(codeFloat) + + redirectUri, _ := errData["redirect_uri"].(string) + captchaSid, _ := errData["captcha_sid"].(string) + captchaImg, _ := errData["captcha_img"].(string) + errorMsg, _ := errData["error_msg"].(string) + + var sessionToken string + if redirectUri != "" { + if parsed, err := url.Parse(redirectUri); err == nil { + sessionToken = parsed.Query().Get("session_token") + } + } + + isSound, _ := errData["is_sound_captcha_available"].(bool) + + var captchaTs string + if tsFloat, ok := errData["captcha_ts"].(float64); ok { + captchaTs = fmt.Sprintf("%.0f", tsFloat) + } else if tsStr, ok := errData["captcha_ts"].(string); ok { + captchaTs = tsStr + } + + var captchaAttempt string + if attFloat, ok := errData["captcha_attempt"].(float64); ok { + captchaAttempt = fmt.Sprintf("%.0f", attFloat) + } else if attStr, ok := errData["captcha_attempt"].(string); ok { + captchaAttempt = attStr + } + + return &VkCaptchaError{ + ErrorCode: code, + ErrorMsg: errorMsg, + CaptchaSid: captchaSid, + CaptchaImg: captchaImg, + RedirectUri: redirectUri, + IsSoundCaptchaAvailable: isSound, + SessionToken: sessionToken, + CaptchaTs: captchaTs, + CaptchaAttempt: captchaAttempt, + } +} + +func (e *VkCaptchaError) IsCaptchaError() bool { + return e.ErrorCode == 14 && e.RedirectUri != "" && e.SessionToken != "" +} + +func solveVkCaptcha(ctx context.Context, captchaErr *VkCaptchaError) (string, error) { + if manualCaptchaForcedMode() { + log.Printf("[Captcha] Manual mode enabled — handing the challenge to the UI") + return requestManualCaptcha(captchaErr.RedirectUri, 180*time.Second) + } + + // Egress decision. 
The default is whatever captchaTunnelEgress + // dictates (direct pre-handshake, tunnel post-handshake). When + // tunnel is saturated AND direct still has budget, we override + // and pin a physical interface (cellular / WiFi) for this attempt + // so the request bypasses utun — that's the only way to retry + // the direct egress after WG comes up. cellularDial falls back + // to the system route if no usable physical interface is found. + forceDirect := captchaTunnelEgress.Load() && tunnelSaturated() && !directSaturated() + if forceDirect { + log.Printf("[Captcha] tunnel egress saturated — forcing physical-interface egress") + } + + // Bump the in-flight gauge for this egress so the UI sees an + // increase the moment a solve starts. Released on every return + // path via defer. + isTunnel := markCaptchaAttemptStart(forceDirect) + defer markCaptchaAttemptDone(isTunnel) + + // Anti-bot pacing used to live here as a 1.5-2.5 s pre-solve + // sleep, but it was held INSIDE poolCreds' solveSlot semaphore + // which throttles 5 in-flight solves. The slot now covers only + // the real PoW + HTTP work; pacing has been moved to poolCreds' + // pre-slot wait so the same wall-clock delay overlaps the slot + // queue instead of serialising inside it. 
+ + log.Printf("[Captcha] Solving Not Robot Captcha...") + + sessionToken := captchaErr.SessionToken + if sessionToken == "" { + return "", fmt.Errorf("no session_token in redirect_uri") + } + + profile := getRandomProfile() + client := newCaptchaClient(forceDirect) + + powInput, difficulty, htmlSettings, err := fetchPowInput(ctx, client, profile, captchaErr.RedirectUri) + if err != nil { + return "", fmt.Errorf("failed to fetch PoW input: %w", err) + } + + log.Printf("[Captcha] PoW input: %s, difficulty: %d, htmlSettings=%v", powInput, difficulty, htmlSettings != nil) + + hash := solvePoW(powInput, difficulty) + log.Printf("[Captcha] PoW solved: hash=%s", hash) + + successToken, err := callCaptchaNotRobot(ctx, client, profile, sessionToken, hash, htmlSettings, isTunnel) + if err != nil { + return "", fmt.Errorf("captchaNotRobot API failed: %w", err) + } + + log.Printf("[Captcha] Success! Got success_token") + return successToken, nil +} + +func fetchPowInput(ctx context.Context, client tlsclient.HttpClient, profile Profile, redirectUri string) (string, int, map[string]interface{}, error) { + req, err := fhttp.NewRequest("GET", redirectUri, nil) + if err != nil { + return "", 0, nil, err + } + req = withCaptchaCtx(ctx, req) + + req.Header.Set("User-Agent", profile.UserAgent) + req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8") + req.Header.Set("Accept-Language", "en-US,en;q=0.9") + // Safari iOS doesn't implement Client Hints. With Safari_IOS_18_0 + // fingerprint we mirror real Safari at every layer, so drop + // sec-ch-ua* unconditionally. 
+ req.Header.Set("Sec-Fetch-Site", "none") + req.Header.Set("Sec-Fetch-Mode", "navigate") + req.Header.Set("Sec-Fetch-Dest", "document") + applySafariHeaderOrder(req) + + resp, err := client.Do(req) + if err != nil { + return "", 0, nil, err + } + defer resp.Body.Close() + + body, err := io.ReadAll(resp.Body) + if err != nil { + return "", 0, nil, err + } + + html := string(body) + + // Parse PoW input + powInputRe := regexp.MustCompile(`const\s+powInput\s*=\s*"([^"]+)"`) + powInputMatch := powInputRe.FindStringSubmatch(html) + if len(powInputMatch) < 2 { + return "", 0, nil, fmt.Errorf("powInput not found in captcha HTML") + } + powInput := powInputMatch[1] + + // Parse difficulty + diffRe := regexp.MustCompile(`startsWith\('0'\.repeat\((\d+)\)\)`) + diffMatch := diffRe.FindStringSubmatch(html) + difficulty := 2 + if len(diffMatch) >= 2 { + if d, err := strconv.Atoi(diffMatch[1]); err == nil { + difficulty = d + } + } + + // Parse window.init for slider captcha settings + var htmlSettings map[string]interface{} + initRe := regexp.MustCompile(`(?s)window\.init\s*=\s*(\{.*?\})\s*;\s*window\.lang`) + if initMatch := initRe.FindStringSubmatch(html); len(initMatch) >= 2 { + var initPayload map[string]interface{} + if err := json.Unmarshal([]byte(initMatch[1]), &initPayload); err == nil { + if data, ok := initPayload["data"].(map[string]interface{}); ok { + htmlSettings = map[string]interface{}{"response": data} + log.Printf("[Captcha] Parsed window.init htmlSettings") + } + } + } + + // Stash not_robot_captcha.js URL so the caller can fetch debug_info + // dynamically. See captcha_debug_info.go. 
// solvePoW brute-forces the captcha proof-of-work: it looks for the
// smallest nonce in [1, 10_000_000] such that the hex SHA-256 digest of
// powInput followed by the decimal nonce starts with `difficulty` '0'
// characters, and returns that digest as lowercase hex.
//
// It returns "" when no nonce in range qualifies, and also — without
// searching — when difficulty is negative or exceeds the 64 hex digits of
// a SHA-256 digest (such a target can never match).
func solvePoW(powInput string, difficulty int) string {
	const maxNonce = 10_000_000

	// A negative count used to panic inside strings.Repeat; a prefix
	// longer than the digest used to burn all maxNonce hashes for nothing.
	if difficulty < 0 || difficulty > 2*sha256.Size {
		return ""
	}

	// Reuse one buffer: keep powInput as a fixed prefix and rewrite only
	// the nonce digits each round.
	buf := make([]byte, 0, len(powInput)+8)
	buf = append(buf, powInput...)
	prefixLen := len(buf)

	for nonce := 1; nonce <= maxNonce; nonce++ {
		buf = strconv.AppendInt(buf[:prefixLen], int64(nonce), 10)
		sum := sha256.Sum256(buf)
		if hasLeadingZeroNibbles(sum[:], difficulty) {
			return hex.EncodeToString(sum[:])
		}
	}
	return ""
}

// hasLeadingZeroNibbles reports whether the hex encoding of sum would
// start with n '0' characters, i.e. whether its first n 4-bit nibbles are
// zero. The caller guarantees 0 <= n <= 2*len(sum).
func hasLeadingZeroNibbles(sum []byte, n int) bool {
	for i := 0; i < n/2; i++ {
		if sum[i] != 0 {
			return false
		}
	}
	if n%2 == 1 && sum[n/2]>>4 != 0 {
		return false
	}
	return true
}
fmt.Sprintf("session_token=%s&domain=%s&adFp=&access_token=", + url.QueryEscape(sessionToken), url.QueryEscape(domain)) + + // Step 1: settings + log.Printf("[Captcha] Step 1/4: settings") + settingsResp, err := vkReq("captchaNotRobot.settings", baseParams) + if err != nil { + return "", fmt.Errorf("settings failed: %w", err) + } + time.Sleep(time.Duration(100+mathrand.Intn(100)) * time.Millisecond) + + // Step 2: componentDone + log.Printf("[Captcha] Step 2/4: componentDone") + + // crypto/rand-backed 32-hex-char browser fingerprint (v2). + browserFp := randomHex(16) + + // v2 device shape: fixed desktop Chrome 8-core/1080p. See iOS-side + // captcha-vk for full rationale. + const ( + screenW = 1920 + screenH = 1080 + ) + deviceMap := map[string]interface{}{ + "screenWidth": screenW, + "screenHeight": screenH, + "screenAvailWidth": screenW, + "screenAvailHeight": screenH, + "innerWidth": screenW, + "innerHeight": 951, + "devicePixelRatio": 1, + "language": "en-US", + "languages": []string{"en-US", "en"}, + "webdriver": false, + "hardwareConcurrency": 8, + "notificationsPermission": "denied", + } + deviceBytes, _ := json.Marshal(deviceMap) + + componentDoneData := baseParams + fmt.Sprintf("&browser_fp=%s&device=%s", + browserFp, url.QueryEscape(string(deviceBytes))) + + _, err = vkReq("captchaNotRobot.componentDone", componentDoneData) + if err != nil { + return "", fmt.Errorf("componentDone failed: %w", err) + } + time.Sleep(time.Duration(1500+mathrand.Intn(1000)) * time.Millisecond) + + // Step 3: checkbox check + log.Printf("[Captcha] Step 3/4: check (checkbox)") + + type Point struct { + X int `json:"x"` + Y int `json:"y"` + T int64 `json:"t"` + } + var cursor []Point + startX, startY := screenW/2+mathrand.Intn(200)-100, screenH/2+mathrand.Intn(200)-100 + startTime := time.Now().Add(-300 * time.Millisecond).UnixMilli() + + pointsCount := 4 + mathrand.Intn(5) + for i := 0; i < pointsCount; i++ { + cursor = append(cursor, Point{ + X: startX, + Y: startY, + T: 
startTime + int64(i*20+mathrand.Intn(10)), + }) + startX += mathrand.Intn(30) - 15 + startY += mathrand.Intn(30) - 15 + } + cursorBytes, _ := json.Marshal(cursor) + + answer := base64.StdEncoding.EncodeToString([]byte("{}")) + + // Dynamic debug_info from not_robot_captcha.js. See iOS-side + // captcha_debug_info.go for the rationale; fallback to legacy + // constant when fetch fails. + scriptURL, _ := htmlSettings["_scriptURL"].(string) + debugInfo, debugErr := fetchDebugInfo(ctx, client, profile, scriptURL) + if debugErr != nil { + log.Printf("[Captcha] fetchDebugInfo: %v — using legacy constant", debugErr) + debugInfo = "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855" + } + + // v2 wire shape: all motion arrays empty including connectionDownlink. + checkData := baseParams + fmt.Sprintf( + "&accelerometer=%s&gyroscope=%s&motion=%s&cursor=%s&taps=%s&connectionRtt=%s&connectionDownlink=%s"+ + "&browser_fp=%s&hash=%s&answer=%s&debug_info=%s", + url.QueryEscape("[]"), + url.QueryEscape("[]"), + url.QueryEscape("[]"), + url.QueryEscape(string(cursorBytes)), + url.QueryEscape("[]"), + url.QueryEscape("[]"), + url.QueryEscape("[]"), + browserFp, + hash, + answer, + debugInfo, + ) + + checkResp, err := vkReq("captchaNotRobot.check", checkData) + if err != nil { + return "", fmt.Errorf("check failed: %w", err) + } + + respObj, ok := checkResp["response"].(map[string]interface{}) + if !ok { + return "", fmt.Errorf("invalid check response: %v", checkResp) + } + + status, _ := respObj["status"].(string) + showType, _ := respObj["show_captcha_type"].(string) + log.Printf("[Captcha] checkbox status: %s show_type=%q", status, showType) + + if status == "OK" { + successToken, ok := respObj["success_token"].(string) + if ok && successToken != "" { + log.Printf("[Captcha] Step 4/4: endSession") + _, _ = vkReq("captchaNotRobot.endSession", baseParams) + markCaptchaSuccess(isTunnel) + return successToken, nil + } + } + + if status == "ERROR_LIMIT" { + 
markCaptchaSaturated(isTunnel) + return "", fmt.Errorf("captchaNotRobot.check ERROR_LIMIT (no slider fallback under rate-limit)") + } + + // v2 routing: only try slider on explicit BOT status with slider show_type. + sliderEligible := status == "BOT" && (showType == "" || showType == "slider") + if !sliderEligible { + return "", fmt.Errorf("captchaNotRobot.check non-OK status=%q show_type=%q", status, showType) + } + + log.Printf("[Captcha] Checkbox status=BOT show_type=%q, switching to slider", showType) + + // Use htmlSettings from the HTML page if available, otherwise use API settings + mergedSettings := settingsResp + if htmlSettings != nil { + mergedSettings = htmlSettings + } + + sliderToken, sliderErr := solveSliderCaptcha(vkReq, baseParams, browserFp, hash, debugInfo, mergedSettings, isTunnel) + if sliderErr != nil { + // saturation accounting now happens inside solveSliderCaptcha + // at the exact branch (ERROR_LIMIT or unparseable_response), + // so this caller just propagates the error. + return "", fmt.Errorf("slider captcha also failed: %w", sliderErr) + } + + log.Printf("[Captcha] Slider solved! 
endSession...") + _, _ = vkReq("captchaNotRobot.endSession", baseParams) + markCaptchaSuccess(isTunnel) + return sliderToken, nil +} + +func buildCaptchaDeviceJSON(profile Profile) string { + return fmt.Sprintf( + `{"screenWidth":1920,"screenHeight":1080,"screenAvailWidth":1920,"screenAvailHeight":1040,"innerWidth":1920,"innerHeight":969,"devicePixelRatio":1,"language":"en-US","languages":["en-US"],"webdriver":false,"hardwareConcurrency":8,"deviceMemory":8,"connectionEffectiveType":"4g","notificationsPermission":"default","userAgent":"%s","platform":"Win32"}`, + profile.UserAgent, + ) +} diff --git a/captcha-service/warp_dialer.go b/captcha-service/warp_dialer.go new file mode 100644 index 0000000..99214f2 --- /dev/null +++ b/captcha-service/warp_dialer.go @@ -0,0 +1,107 @@ +// warp_dialer.go — bind outbound HTTP sockets to a pre-configured +// WireGuard interface (typically a Cloudflare WARP tunnel set up +// externally via wg-quick) so VK-bound captcha traffic egresses from +// Cloudflare's edge IP space instead of the host's eth0. +// +// Operational model: +// 1. Operator runs wgcf (https://github.com/ViRb3/wgcf) to obtain +// free WARP credentials, gets a WireGuard config file. +// 2. Operator brings the interface up out-of-band: +// sudo wg-quick up /etc/wireguard/wgcf.conf +// with `Table = off` in the [Interface] block so it doesn't +// install default routes — we don't want WARP eating all +// outbound traffic from this host, only the captcha-service's +// VK calls. +// 3. captcha-service runs with WARP_INTERFACE=wgcf in its env. +// Every outbound HTTP socket aimed at VK gets pinned to that +// interface via SO_BINDTODEVICE — kernel routes the packet +// through the WireGuard interface regardless of the host's +// default route. 
+// +// Why this approach and not in-process WireGuard: +// - In-process means importing golang.zx2c4.com/wireguard into a +// service that runs as non-root → CAP_NET_ADMIN required or +// fall back to a userspace TUN which needs root anyway. wg-quick +// handles all that cleanly out of band. +// - Separation of concerns: WARP setup, key rotation, MTU tuning +// stays at the network layer where the operator already has +// tools. captcha-service just consumes an interface name. +// - Falling back is trivial: unset WARP_INTERFACE and outbound +// uses the host default route again. No code changes. +// +// SO_BINDTODEVICE requires CAP_NET_RAW or running as root. Our +// Dockerfile drops to non-root user `app`; the operator either grants +// CAP_NET_RAW (--cap-add=NET_RAW in docker run) or runs the container +// with --network=host and a host-level firewall mark instead. The +// README documents both. + +package main + +import ( + "context" + "net" + "os" + "syscall" + + "golang.org/x/sys/unix" +) + +// warpInterface is the name of the WireGuard interface to bind +// outbound captcha sockets to. Empty = WARP off, use default route. +// Read once at startup from WARP_INTERFACE env var. +var warpInterface = os.Getenv("WARP_INTERFACE") + +// warpControl is a net.Dialer.Control hook that pins the socket to +// warpInterface before connect(). Idempotent and safe to call from +// multiple goroutines — net.Dialer guarantees serial control invoc +// per socket. Returns nil if WARP isn't configured so it's safe to +// always wire in. +func warpControl(network, address string, c syscall.RawConn) error { + if warpInterface == "" { + return nil + } + var serr error + if err := c.Control(func(fd uintptr) { + serr = unix.BindToDevice(int(fd), warpInterface) + }); err != nil { + return err + } + return serr +} + +// warpDialer wraps an arbitrary upstream DialContext so we can layer +// our SO_BINDTODEVICE control on top while preserving custom DNS +// resolution behavior (e.g. 
dns_resolver.customDial). +type warpDialer struct { + upstream func(ctx context.Context, network, address string) (net.Conn, error) +} + +func (d *warpDialer) DialContext(ctx context.Context, network, address string) (net.Conn, error) { + if d.upstream != nil { + // The upstream dialer (e.g. customDial) does DNS resolution + // and may dial-by-IP; we still need to pin to warpInterface + // after it produces a Conn. SetsockoptString on the live + // socket isn't reliable cross-platform (kernel may have + // already started SYN), so instead the upstream dialer must + // itself install the control hook. Document this contract + // in callers — see vk_captcha.go. + return d.upstream(ctx, network, address) + } + dialer := &net.Dialer{Control: warpControl} + return dialer.DialContext(ctx, network, address) +} + +// newWARPNetDialer returns a net.Dialer pre-wired with the WARP +// control hook. Use this where a net.Dialer value (not a DialContext +// function) is required — notably tls-client's WithDialer option. +func newWARPNetDialer() net.Dialer { + return net.Dialer{Control: warpControl} +} + +// warpStatus is for the /stats endpoint and startup log. +func warpStatus() string { + if warpInterface == "" { + return "off" + } + return "on:" + warpInterface +} diff --git a/docs/MANUAL_CAPTCHA.md b/docs/MANUAL_CAPTCHA.md new file mode 100644 index 0000000..ee2bd38 --- /dev/null +++ b/docs/MANUAL_CAPTCHA.md @@ -0,0 +1,80 @@ +# Manual captcha path — hardening for bootstrap sessions + +Under hard network blocking there is **no tunnel and no reachable +captcha-service until the first session is up**, so the first few VK +identities must be earned by hand in the on-device WebView. That path +was being flagged as a bot. This change addresses the likely causes, +in priority order. + +## 1. 
Bootstrap-manual-first (skip the poisoning auto attempt) + +`solveVkCaptcha` previously ran the tls-client auto solver first in +`fallback` mode; only on its failure did the manual sheet appear. But +the auto attempt reliably draws `status:BOT`, and that verdict poisons +the captcha session / source IP that the user is about to solve in a +real WebKit engine moments later. + +New `manualCaptchaBootstrapActive()` (captcha_manual.go): while +`captchaSessionsReady == 0`, a manual handler is registered, the user +opted into prompts (`mode != off`), and quota remains, `solveVkCaptcha` +goes **straight to the manual sheet** and skips the auto chain. After +the first session is up it returns false and normal mode behaviour +resumes. `errDeferToRemote` (quota spent / a session came up while we +queued) falls through to the auto chain instead of failing the solve. + +Net effect: the real-browser solve is the first and only thing VK sees +on a clean session during bootstrap. + +## 2. Cookie / state warm-up (CaptchaWebView.swift) + +A captcha session with zero prior vk.com cookies + localStorage reads +as a freshly-spun-up automation environment. Before navigating to the +captcha the WebView now briefly loads `https://m.vk.com/` so the +persistent data store picks up organic state. The real captcha load is +kicked from the warm-up's `didFinish` or a 3 s hard cap, whichever +fires first, so a blocked/slow warm-up never strands the user. + +## 3. In-session replay logging (CaptchaWebView.swift) + +The in-WebView replay (`window.__capRetry`) POSTs `getAnonymousToken` +inside the solved session so VK sees one coherent actor. That fetch is +**cross-origin** (captcha origin → api.vk.com) and can be silently +blocked by the page's CSP `connect-src` or missing CORS — which demotes +us to the bot-prone Go redemption path without any signal. 
+ +The JS now emits explicit status lines — `replay OK: final_response`, +`replay FALLBACK: fetch threw (… likely CORS/CSP)`, `replay FALLBACK: +empty response body` — and the native side mirrors WebView status into +`SharedLogger.debug`, so a device sysdiagnose shows exactly which branch +ran. If logs show the fallback firing, that — not the solve gesture — +is why redemption looks like a session switch. + +Also added: `navigator.maxTouchPoints => 5` parity (real iPhone Safari). + +## 4. Real-Safari path (EXPERIMENTAL, opt-in, not wired) + +`SafariCaptchaView.swift` adds an `ASWebAuthenticationSession` solver +that runs the page in the real Safari service process — real Safari +fingerprint + shared Safari cookies. **Hard limitation:** no JS +injection and no per-navigation callbacks, so it can only capture +`success_token` if the flow ends by redirecting to a URL whose scheme +matches `callbackURLScheme`. To use it: + +1. Register a `redirect_uri` you control as the captcha redirect target + (https / Universal Link), have it 302 to + `turnbridge://captcha?success_token=...`, set `callbackScheme` + accordingly; **or** point the scheme at whatever target VK reflects + `success_token` into. +2. Add the file to the TurnBridge target and gate it behind a setting. + +Until that redirect plumbing exists, `CaptchaWebView` (WKWebView) +remains the shipping path; this file compiles and presents but will not +complete a solve on its own. + +## Build / validation note + +These edits could not be compiled here (no macOS/Xcode/Go toolchain in +the authoring environment). Go syntax was checked with `gofmt`. Swift +needs a device build + a real blocked-network run to confirm: watch the +log for `bootstrap (sessions_ready=0) — manual-first`, the warm-up +load, and the `replay OK` vs `replay FALLBACK` line. 
diff --git a/docs/SERVER_COMPAT.md b/docs/SERVER_COMPAT.md new file mode 100644 index 0000000..ef878cf --- /dev/null +++ b/docs/SERVER_COMPAT.md @@ -0,0 +1,343 @@ +# Server-side compatibility requirements + +This document is for an engineer who maintains the **WG server-side +proxy** (typically `Moroka8/vk-turn-proxy` running on the host that +WireGuard ultimately terminates at, e.g. `77.90.8.199:56010` in our +current setup). + +The iOS TurnBridge client has evolved past the upstream proxy in a +handful of ways. Most of the changes don't need server cooperation — +the TLS-fingerprint impersonation, the captcha-v2 algorithm changes, +the runtime memory tunings all live in the client. The exceptions are +listed below; without these the corresponding feature either silently +no-ops or breaks the data path entirely. + +Versions referenced below come from +`truvvor/turnbridge@claude/build-project-br5tJ`. The client today is +**1.3.18**. + +--- + +## 1. SRTP/Opus mimicry wrap (**REQUIRED for any client where wrap key is set**) — HIGHEST PRIORITY + +### What it is + +A custom AEAD layer placed **between** our DTLS-over-TURN payload and +the TURN ChannelData frame on the wire. The wrapper re-frames each +DTLS record (or any other payload our client sends through the relay) +into a packet that looks byte-identical to a real WebRTC SRTP/Opus +voice frame. + +VK's TURN relay appears to fast-path SRTP-shaped ChannelData payloads +and rate-limit anything that doesn't match (DTLS application-data +records are the dominant pattern that gets throttled). 
+ +### Where in the data path + +``` + Client side (iOS) Server side (vk-turn-proxy) + ---------------- --------------------------- + WireGuard packets WireGuard packets + | ^ + v | + DTLS encrypt DTLS decrypt + | ^ + v | + ┌─ wrap.wrapInto ─┐ ┌─ wrap.unwrapPacket ─┐ + │ prepend RTP │ │ AEAD-verify │ + │ hdr + nonce, │ ───────────▶ │ strip RTP+nonce, │ + │ AEAD-encrypt │ ChannelData │ return plaintext │ + └─────────────────┘ over TURN └──────────────────────┘ + | ^ + v | + TURN ChannelData TURN ChannelData receive + | ^ + v | + UDP socket ───── wire ────── VK TURN relay (forwards opaque + ChannelData byte-for-byte) +``` + +The wrap is **outside DTLS** (DTLS payloads are the plaintext fed into +wrap), and **inside TURN** (wrapped bytes are the payload of the +ChannelData frame). The TURN relay never sees inside the wrap, never +sees inside DTLS, so neither the WireGuard tunnel nor the wrap key +need to be known to VK. + +### Reference implementation + +Verbatim source: `wireguard-apple/Sources/WireGuardKitGo/wrap.go` in +the truvvor repo, which is itself a verbatim port of +`pkg/clientcore/wrap.go` from `Moroka8/vk-turn-proxy`. If +`Moroka8/vk-turn-proxy` is what you're patching, **the file already +exists in your tree** — the port mirrors it exactly. 
+ +### Wire format + +Per packet, the wrap layer produces / consumes: + +``` +[ 12-byte RTP header | 12-byte explicit nonce | AEAD ciphertext | 16-byte tag ] +``` + +Layout details: + +| Offset | Length | Content | +|---|---|---| +| 0 | 1 | `0x80` — RTP version=2, P=0, X=0, CC=0 | +| 1 | 1 | `0x6F` — Marker=0, payload type 111 (Opus) | +| 2 | 2 | Sequence number, big-endian, monotonic, init random | +| 4 | 4 | Timestamp, big-endian, monotonic, **+960 per packet** (20 ms at 48 kHz Opus framing) | +| 8 | 4 | SSRC (random per `wrapConn`, MSB encodes direction) | +| 12 | 4 | nonce part 1: sessionID (random per `wrapConn`, MSB matches SSRC MSB) | +| 16 | 8 | nonce part 2: counter, big-endian, monotonic, init random uint64 | +| 24 | N | AEAD ciphertext (length == plaintext length) | +| 24+N | 16 | AEAD authentication tag | + +Total overhead per packet: **40 bytes** (12 header + 12 nonce + 16 tag). + +### AEAD + +- **Algorithm:** ChaCha20-Poly1305 (RFC 7539). NOT AES-GCM. Real + WebRTC SRTP usually uses AES-GCM (RFC 7714); the ciphertext+tag + lengths are identical so the wire shape matches regardless. We use + ChaCha20-Poly1305 because it's faster than AES-GCM on mobile CPUs + without AES-NI and the wire fingerprint doesn't expose the cipher. +- **Key:** 32 bytes, shared between client and server out of band. +- **Nonce:** the 12-byte explicit nonce field (offset 12-23). Both + endpoints use the **same key** but **disjoint nonce subspaces** via + the direction bit (below) and per-conn random init, so accidental + nonce reuse is computationally impossible. +- **AAD:** the first 24 bytes of the packet (RTP header || nonce). This + authenticates the SSRC/seq/timestamp so VK can't reorder packets to + smuggle different ciphertext to a different sequence position. + +### Per-`wrapConn` state initialisation + +Each `wrapConn` is created on session start (one per `oneTurnConnection` +on the client side; one per equivalent on the server side). 
State to +initialise: + +| Field | Width | How initialised | +|---|---|---| +| `sessionID[0..4]` | 4 bytes | `crypto/rand`; then byte 0 has its high bit set/cleared by direction | +| `ssrc[0..4]` | 4 bytes | `crypto/rand`; then byte 0 has its high bit set/cleared by direction | +| `seq` | uint32 | `crypto/rand`-derived random uint16 stored into a uint32 | +| `timestamp` | uint32 | random uint32 from `crypto/rand` | +| `counter` | uint64 | random uint64 from `crypto/rand` | + +### Direction bit (CRITICAL — server-side requires `isServer=true`) + +The client builds `wrapConn` with `isServer=false`: +```go +sessionID[0] &^= 0x80 // clear MSB +ssrc[0] &^= 0x80 // clear MSB +``` + +The server **MUST** build `wrapConn` with `isServer=true`: +```go +sessionID[0] |= 0x80 // set MSB +ssrc[0] |= 0x80 // set MSB +``` + +This guarantees that even with the same shared AEAD key the two ends +write into completely disjoint nonce spaces — client packets always +have `nonce[0] & 0x80 == 0`, server packets always have +`nonce[0] & 0x80 == 0x80`. This is a hard correctness requirement, not a +defense-in-depth nicety: if both ends pick the same direction the +counter+sessionID space collides and nonce reuse under the shared +AEAD key is possible. + +### Encrypt path (client → relay → server) + +```go +seq := w.seq.Add(1) - 1 // atomic monotonic +ts := w.timestamp.Add(960) - 960 // atomic monotonic, +960/packet + +dst[0] = 0x80 +dst[1] = 0x6F +binary.BigEndian.PutUint16(dst[2:4], uint16(seq)) +binary.BigEndian.PutUint32(dst[4:8], ts) +copy(dst[8:12], w.ssrc[:]) + +copy(dst[12:16], w.sessionID[:]) +ctr := w.counter.Add(1) - 1 // atomic monotonic +binary.BigEndian.PutUint64(dst[16:24], ctr) + +nonce := dst[12:24] +aad := dst[:24] +copy(dst[24:], plaintext) +aead.Seal(dst[24:24], nonce, dst[24:24+len(plaintext)], aad) + +// dst now contains wireLen = 24 + len(plaintext) + 16 bytes.
+``` + +### Decrypt path (relay → server) + +```go +if len(wire) < 24+16 { return error("short") } + +nonce := wire[12:24] +aad := wire[:24] +ct := wire[24:] + +plain, err := aead.Open(ct[:0], nonce, ct, aad) +if err != nil { return error("AEAD"); /* drop packet, do NOT tear down */ } +// hand `plain` to the DTLS terminator (the next layer up). +``` + +### Server-side error handling + +- **AEAD-open failure on a single packet:** drop the packet, **do + not** tear down the TURN allocation or the WireGuard session. The + most common cause of a one-off AEAD failure is a stray un-wrapped + packet arriving right at session bring-up (the client hasn't fully + configured wrap yet, or the relay re-delivered an old datagram). + Continue reading from the relay. +- **Repeated AEAD failures (e.g., >100 in a row from the same client + 5-tuple):** log + close that allocation. Almost always indicates a + key mismatch. +- **Out-of-order packets (sequence regression):** accept them + silently. Our pipeline downstream of wrap is DTLS, which already + has its own anti-replay (a sliding replay window of at least 32 + packets per RFC 6347 §4.1.2.6), and + WireGuard's anti-replay is downstream of that. The wrap layer + itself is anti-replay-naive on purpose — adding a window here would + break legitimate reordering that the inner-layer crypto tolerates. + +### Config surface on the server + +Exactly one operator-supplied input: + +- **Wrap key** — 64 hex chars (32 bytes after decoding). Same value + that the iOS client has configured. Provided via either: + - a CLI flag (`-wrap -wrap-key=` is what `Moroka8/vk-turn-proxy` + already uses), or + - an env var (`WRAP_KEY=` is cleaner for systemd / Docker). + +If the key is empty / unset, the server should accept un-wrapped +ChannelData payloads (legacy DTLS-over-TURN, what we used before +1.3.18).
Mixing per-allocation (wrap on for some sessions, off for +others) is acceptable; the server can sniff the first datagram per +allocation: + +- if bytes `[0:2] == 0x80 0x6F` → wrap on, run unwrap path +- otherwise → un-wrapped, run legacy path + +This auto-detect lets the operator deploy the server with WRAP +support without coordinating a sharp cutover on the client side. + +### What MUST NOT change + +- **Don't validate the RTP header semantically.** Specifically: + don't reject packets with a regressing sequence number, don't + enforce the +960 timestamp step, don't reject mismatched SSRCs + within a session. The header is purely cover-traffic for VK's DPI + — the AEAD AAD makes any tampering AEAD-fail, so semantic + validation buys no security and costs you legitimate packets. +- **Don't add additional metadata** before or after the wrap envelope. + Anything you prepend or append moves the visible TURN ChannelData + payload away from the SRTP/Opus shape, defeating the whole + point. +- **Don't reuse the wrap key across unrelated client groups.** AEAD + with a shared key requires disjoint nonce spaces; the direction + bit handles client-vs-server, but two independent client devices + sharing the same key would collide. Each (client, server) pair + needs its own key. + +--- + +## 2. Stream Aggregation 17-byte preamble (**existing — confirm support**) + +The client already sends a 17-byte preamble at the start of every +DTLS stream when `streamAggregation=true` (default for our deployment): + +``` +[16-byte sessionID | 1-byte streamID] +``` + +`Moroka8/vk-turn-proxy` already supports this — no change needed — +but flagging it here so the patching engineer knows it's still on the +wire and the wrap layer (item 1) wraps the preamble too. The preamble +is plaintext inside DTLS, so on the server side it's seen AFTER both +the wrap and the DTLS layers are stripped. + +--- + +## 3. 
TURN allocation count (**no protocol change**) + +Client may open up to **100 concurrent TURN allocations** against the +same relay (default cap 60, configurable up to 100). Each is a normal +RFC 5766 Allocate with long-term auth; no protocol extension. The +server only needs to ensure its TURN allocation limit per source IP +isn't lower than the client's `N`. + +--- + +## 4. Minimal TURN client behaviour (**informational**) + +As of 1.3.16 the iOS client uses a hand-rolled minimal TURN client +(`turn_min.go`) instead of `github.com/pion/turn/v5`. From the relay's +perspective this is a **pure RFC 5766 client** — Allocate with +long-term auth (REALM + NONCE challenge), ChannelBind for the single +peer, ChannelData for the data plane, Refresh at half-lifetime. No +extension attributes, no proprietary tweaks. Any RFC-5766-compliant +relay (which is what the VK TURN servers and what `Moroka8/vk-turn- +proxy` already implement) accepts these unchanged. + +If the server is patched at the same time as a wrap rollout, the +server's TURN client (used to dial the upstream VK relay on behalf of +WireGuard if the deployment proxies via TURN — `Moroka8`'s setup +doesn't normally do this, but mention it for completeness) likewise +needs no change. + +--- + +## Checklist for the server engineer + +1. [ ] Pull the latest `Moroka8/vk-turn-proxy` (or your fork). Confirm + `pkg/clientcore/wrap.go` (and `wrap_test.go`) exist and pass + `go test ./...`. +2. [ ] Decide how the server consumes the wrap key — CLI flag or env + var. Both are fine; pick one for consistency with the rest of + the deployment. +3. [ ] On the server's per-allocation receive loop, before passing + the ChannelData payload to the DTLS terminator, branch on + wrap-on (`payload[0:2] == 0x80 0x6F`) vs wrap-off (legacy). On + wrap-on, run `wrapConn.unwrapPacket`. AEAD failure on a single + packet → drop, continue. +4. 
[ ] On the server's per-allocation send loop, after the DTLS + terminator hands you the encrypted bytes and before you stuff + them into a ChannelData frame, run `wrapConn.wrapInto`. The + `wrapConn` was built once with `isServer=true` at allocation + setup. +5. [ ] Test against the iOS client at 1.3.18: + - Set the same key on both ends. + - Connect. Confirm in client logs: `wrap: enabled (key set, + 32 bytes)` at startup and at least one `Established DTLS + connection!` per session. + - Confirm in server logs no `AEAD open` errors after the first + packet. + - Pull data through the tunnel (any HTTP through the WG + interface). Confirm throughput and that VK's relay isn't + shaping (sustained ≥1 Mbps per session over several minutes + is the smoke test). +6. [ ] **Backwards compat smoke**: with wrap key set on the server, + connect a 1.3.17 client (no wrap support) and confirm the + legacy path still works because the server's first-packet + sniff falls through to the un-wrapped branch. + +--- + +## Out-of-scope on the server + +These are client-side only and the server should ignore: + +- **TLS fingerprint impersonation** (`bogdanfinn/tls-client` + `fhttp`, + Safari iOS 18 profile). Only affects the iOS↔VK captcha API path, + which is captcha-service traffic to `api.vk.com`/`id.vk.com`, not + WG-relay traffic. +- **Captcha v2 algorithm** (dynamic `debug_info`, slim device shape, + show-type routing). Same as above — captcha pipeline, not WG. +- **WARP egress** (`WARP_INTERFACE` env var on `captcha-service`). + Server-side only on the **captcha-service**, not the WG server. +- **Bounded packet pipe / readBufPool / minimal TURN goroutine zoo + reduction**. Client-side memory optimisations; the server sees the + same wire bytes as before. 
diff --git a/network-extension/CaptchaBridge.swift b/network-extension/CaptchaBridge.swift new file mode 100644 index 0000000..0780d65 --- /dev/null +++ b/network-extension/CaptchaBridge.swift @@ -0,0 +1,166 @@ +import Foundation +import WireGuardKitGo + +/// Constants shared with the main app for the manual captcha IPC. +enum CaptchaIPC { + static let appGroupID = "group.com.truvvor.turnbridge" + static let requestUserDefaultsKey = "captcha.pendingRequest" + static let requestDarwinNotification = "com.truvvor.turnbridge.captcha.request" + static let cancelDarwinNotification = "com.truvvor.turnbridge.captcha.cancel" + + /// JSON payload the app sends back via NETunnelProviderSession.sendProviderMessage. + struct AppMessage: Codable { + let type: String // "captcha_answer" | "captcha_cancel" + let requestId: String + let successToken: String? + let reason: String? + /// New (1.3.24+): full JSON response when the WebView replayed + /// the failing VK call inside its own session. See the + /// matching field in TurnBridge/CaptchaIPC.swift. + let responseJson: String? + } + + /// Persistent payload the extension writes when it needs the app to solve a captcha. + struct PendingRequest: Codable { + let requestId: String + let redirectUri: String + let createdAt: TimeInterval + /// New (1.3.24+): see TurnBridge/CaptchaIPC.swift. + let retryUrl: String? + let retryBody: String? + } +} + +/// Trampoline from cgo into Swift. Note: this runs on a Go goroutine / +/// arbitrary thread, so anything heavy must be dispatched off it. +private let manualCaptchaCCallback: @convention(c) (UnsafePointer?, UnsafePointer?) -> Void = { reqIDPtr, uriPtr in + guard let reqIDPtr = reqIDPtr, let uriPtr = uriPtr else { return } + let reqID = String(cString: reqIDPtr) + let uri = String(cString: uriPtr) + CaptchaBridge.publishRequest(requestId: reqID, redirectUri: uri) +} + +enum CaptchaBridge { + + /// Registered once at tunnel start. 
+ static func install() { + TurnBridgeSetManualCaptchaCallback(manualCaptchaCCallback) + } + + /// Called from the cgo callback. Persists the request in the shared + /// User Defaults so the app can pick it up, then fires a Darwin + /// notification that wakes the app's observer. + fileprivate static func publishRequest(requestId: String, redirectUri: String) { + SharedLogger.info("Manual captcha requested (reqID=\(requestId))", source: .tunnel) + + // Pull the retry-request template from Go. When set, the + // WebView will POST the body to this URL after extracting + // success_token, INSIDE its own browser session — VK then + // sees a single coherent session for both captcha solve and + // the follow-up API call. Free the C string after parsing. + var retryUrl: String? + var retryBody: String? + requestId.withCString { reqIDC in + if let cStr = TurnBridgeGetManualCaptchaRetryRequest(reqIDC) { + defer { free(UnsafeMutablePointer(mutating: cStr)) } + let json = String(cString: cStr) + if let data = json.data(using: .utf8), + let parsed = try? JSONSerialization.jsonObject(with: data) as? [String: String] { + retryUrl = parsed["url"] + retryBody = parsed["body"] + } + } + } + + if let defaults = UserDefaults(suiteName: CaptchaIPC.appGroupID) { + let payload = CaptchaIPC.PendingRequest( + requestId: requestId, + redirectUri: redirectUri, + createdAt: Date().timeIntervalSince1970, + retryUrl: retryUrl, + retryBody: retryBody + ) + if let data = try? JSONEncoder().encode(payload) { + defaults.set(data, forKey: CaptchaIPC.requestUserDefaultsKey) + } + } + + let name = CFNotificationName(CaptchaIPC.requestDarwinNotification as CFString) + CFNotificationCenterPostNotification( + CFNotificationCenterGetDarwinNotifyCenter(), + name, nil, nil, true + ) + } + + /// Called from NEPacketTunnelProvider.handleAppMessage when the app + /// delivers a result (token or cancel) for an outstanding request. + static func handleAppMessage(_ data: Data) -> Data? { + guard let msg = try? 
JSONDecoder().decode(CaptchaIPC.AppMessage.self, from: data) else { + SharedLogger.warning("CaptchaBridge: ignoring unparseable app message (\(data.count) bytes)", source: .tunnel) + return nil + } + + switch msg.type { + case "captcha_answer": + // Prefer the full JSON response (the WebView did the retry + // itself, so getCreds can skip its own redemption call). + // Fall through to the legacy token-only path when the + // WebView fell back to just extracting success_token + // (network error during the in-WebView fetch, retryUrl + // wasn't provided, etc). + if let resp = msg.responseJson, !resp.isEmpty { + msg.requestId.withCString { reqIDC in + resp.withCString { respC in + TurnBridgeSubmitManualCaptchaResponse(reqIDC, respC) + } + } + SharedLogger.info("CaptchaBridge: delivered full response (\(resp.count) bytes) for reqID=\(msg.requestId)", source: .tunnel) + } else { + let token = msg.successToken ?? "" + msg.requestId.withCString { reqIDC in + token.withCString { tokenC in + TurnBridgeSubmitManualCaptchaToken(reqIDC, tokenC) + } + } + SharedLogger.info("CaptchaBridge: delivered success_token for reqID=\(msg.requestId)", source: .tunnel) + } + + case "captcha_cancel": + let reason = msg.reason ?? "user cancelled" + msg.requestId.withCString { reqIDC in + reason.withCString { reasonC in + TurnBridgeCancelManualCaptcha(reqIDC, reasonC) + } + } + SharedLogger.info("CaptchaBridge: cancelled reqID=\(msg.requestId) (\(reason))", source: .tunnel) + + default: + return nil + } + + // Clear pending request from shared UserDefaults so the app doesn't + // re-prompt on next launch. + if let defaults = UserDefaults(suiteName: CaptchaIPC.appGroupID) { + defaults.removeObject(forKey: CaptchaIPC.requestUserDefaultsKey) + } + return Data("ok".utf8) + } + + /// Called from stopTunnel: the Go side is going away, so any + /// pending captcha prompt can never be answered. 
Clear the + /// published request and tell the app to drop its sheet -- + /// otherwise the user keeps solving captchas into a dead session + /// (1.3.27 field log: tunnel died with stop reason 9 four seconds + /// after the first sheet appeared; the sheet stayed up for 20+ + /// minutes while the user kept solving into the void). + static func teardown() { + if let defaults = UserDefaults(suiteName: CaptchaIPC.appGroupID) { + defaults.removeObject(forKey: CaptchaIPC.requestUserDefaultsKey) + } + let name = CFNotificationName(CaptchaIPC.cancelDarwinNotification as CFString) + CFNotificationCenterPostNotification( + CFNotificationCenterGetDarwinNotifyCenter(), + name, nil, nil, true + ) + } +} diff --git a/network-extension/PacketTunnelProvider.swift b/network-extension/PacketTunnelProvider.swift index 6001bcc..2274de9 100755 --- a/network-extension/PacketTunnelProvider.swift +++ b/network-extension/PacketTunnelProvider.swift @@ -2,12 +2,14 @@ // Created by nullcstring. // +import Darwin import NetworkExtension +import Network import WireGuardKit import WireGuardKitGo import os -let sharedLogger = Logger(subsystem: "com.netlab.TurnBridge.network-extension", category: "wgtunnel") +let sharedLogger = Logger(subsystem: "com.truvvor.turnbridge.network-extension", category: "wgtunnel") enum PacketTunnelProviderError: String, Error { case invalidProtocolConfiguration @@ -18,6 +20,8 @@ private let goProxyCLoggerCallback: @convention(c) (UnsafeMutableRawPointer?, In guard let cStr = messageCStr else { return } let message = String(cString: cStr).trimmingCharacters(in: .newlines) + TransportHealthMonitor.observe(message) + if level == 1 { sharedLogger.error("[TP]: \(message, privacy: .public)") SharedLogger.error(message, source: .tunnel) @@ -36,6 +40,26 @@ class PacketTunnelProvider: NEPacketTunnelProvider { } }() + private var pathMonitor: NWPathMonitor? + private var lastPathStatus: Network.NWPath.Status? + private var lastPathInterfaceLabel: String? 
+ private var lastTransportRestartAt = Date.distantPast + private var captchaStatsTimer: DispatchSourceTimer? + + + /// Tear down the current TURN/DTLS cycle and let the proxy spin up + /// fresh inner connections, reusing cached credentials when possible + /// (so no captcha re-prompt). Debounced to 5s to avoid stampedes when + /// several signals fire at once (wake + network change). + private func restartTransport(reason: String) { + if Date().timeIntervalSince(lastTransportRestartAt) < 5 { + return + } + lastTransportRestartAt = Date() + SharedLogger.info("Transport restart: \(reason)", source: .tunnel) + RestartProxy() + } + override func startTunnel(options: [String : NSObject]?, completionHandler: @escaping (Error?) -> Void) { sharedLogger.log("=== Starting tunnel ===") @@ -76,23 +100,126 @@ class PacketTunnelProvider: NEPacketTunnelProvider { return } let nValue = Int32(nValueInt) + // Default true for backward-compat with profiles saved before this field existed. + let useUDP = (providerConfiguration["useUDP"] as? Bool) ?? true + let udpFlag: Int32 = useUDP ? 1 : 0 + let streamAggregation = (providerConfiguration["streamAggregation"] as? Bool) ?? false + let wrapKey = (providerConfiguration["wrapKey"] as? String) ?? "" - SharedLogger.info("Peer: \(peerAddr), Listen: \(listenAddr), N: \(nValue)", source: .tunnel) + SharedLogger.info("Peer: \(peerAddr), Listen: \(listenAddr), N: \(nValue), UDP: \(useUDP), streamAgg: \(streamAggregation), wrap: \(wrapKey.isEmpty ? "off" : "on")", source: .tunnel) SharedLogger.info("Starting TURN proxy...", source: .tunnel) ProxySetLogger(nil, goProxyCLoggerCallback) + CaptchaBridge.install() + + // Toggle the Stream-Aggregation handshake on the Go side + // BEFORE StartProxy. The Go global is read once when each + // DTLS session completes its handshake, so setting it later + // would race the per-session goroutines. + TurnBridgeSetStreamAggregation(streamAggregation ? 1 : 0) + + // SRTP/Opus wrap key. 
Empty string disables wrap and falls + // back to the legacy direct DTLS-over-TURN path. Set BEFORE + // StartProxy — currentWrapKey() is sampled once per session + // start in oneTurnConnection. + wrapKey.withCString { TurnBridgeSetWrapKey($0) } + + // Captcha trap: every slider captcha buffers its raw VK + // response + decoded image in memory and only flushes to disk + // when the solve ultimately fails. The artefacts land inside + // the App Group container so they show up in the Files app + // and survive across extension restarts. Passing the path + // before StartProxy ensures the very first solve is covered. + if let container = FileManager.default.containerURL(forSecurityApplicationGroupIdentifier: CaptchaIPC.appGroupID) { + let trapDir = container.appendingPathComponent("captcha_trap", isDirectory: true) + try? FileManager.default.createDirectory(at: trapDir, withIntermediateDirectories: true) + trapDir.path.withCString { TurnBridgeSetCaptchaTrapDir($0) } + SharedLogger.info("Captcha trap dir: \(trapDir.path)", source: .tunnel) + } + + // Captcha solve mode: 0=off (auto only), 1=forced (always manual), + // 2=fallback (auto first, manual on failure). Backwards-compat: + // if the new int key isn't set, fall back to the legacy bool + // (true → 1 forced, false → 0 off). + let defaults = UserDefaults(suiteName: CaptchaIPC.appGroupID) + let captchaModeRaw: Int = { + if let raw = defaults?.object(forKey: "manualCaptchaMode") as? Int { + return raw + } + return (defaults?.bool(forKey: "manualCaptcha") ?? false) ? 
1 : 0 + }() + TurnBridgeSetManualCaptchaMode(Int32(captchaModeRaw)) + let captchaModeLabel: String + switch captchaModeRaw { + case 1: captchaModeLabel = "manual (forced — always browser sheet)" + case 2: captchaModeLabel = "manual fallback (browser sheet only when auto fails)" + default: captchaModeLabel = "auto (in-tunnel solver only)" + } + SharedLogger.info("Captcha mode: \(captchaModeLabel)", source: .tunnel) + + // Remote captcha service: if the user configured a backend + // URL + API key in Settings, the Go side will offload + // getCreds to it after the first few local solves succeed — + // letting us pull a second per-IP rate-limit budget from a + // machine that isn't on the user's mobile IP. Empty values + // disable the feature (server's getCreds falls back to local + // every time). + let remoteURL = UserDefaults(suiteName: CaptchaIPC.appGroupID)? + .string(forKey: "remoteCaptchaServiceURL") ?? "" + let remoteKey = UserDefaults(suiteName: CaptchaIPC.appGroupID)? + .string(forKey: "remoteCaptchaServiceAPIKey") ?? "" + remoteURL.withCString { urlPtr in + remoteKey.withCString { keyPtr in + ProxySetRemoteCaptchaService(urlPtr, keyPtr) + } + } + if !remoteURL.isEmpty && !remoteKey.isEmpty { + SharedLogger.info("Remote captcha service configured (\(remoteURL))", source: .tunnel) + } + + // Scale the readiness budget by N: StartProxy on the Go side + // now waits for ALL N TURN allocations to come up before it + // signals proxyReady (otherwise the WG adapter starts after + // session 1 is up, iOS installs AllowedIPs=0.0.0.0/0 into + // utun, and the captcha load for sessions 2..N gets routed + // through the half-built tunnel and never completes — see + // turn_proxy.go's StartProxy comment). + // + // Per-session budget: + // manual: the user is in the loop solving each captcha by + // hand, so plan for ~30 s/session plus a generous floor. 
+ // auto: the in-tunnel solver finishes in ~3–6 s on a + // warm path but burns longer on a slider+retry sequence, + // so budget ~15 s/session. + // + // The old 12 s / 300 s constants assumed N=1 and were the + // direct cause of "DTLS connection timeout (12s)" landing + // mid-Step-2/4 when nValue>1. + // Bump the per-session DTLS budget whenever a user prompt is + // POSSIBLE — forced (every session prompts) or fallback (auto + // first, prompt only on failure). Even in fallback mode we + // need to account for the wall-clock the user can take to + // tap "solve" on the small minority that does prompt. + let userPromptPossible = captchaModeRaw == 1 || captchaModeRaw == 2 + let perSessionMs: Int32 = userPromptPossible ? 30_000 : 15_000 + let floorMs: Int32 = userPromptPossible ? 60_000 : 20_000 + let dtlsReadyTimeoutMs: Int32 = max(floorMs, perSessionMs * nValue) + SharedLogger.info("DTLS ready budget: \(dtlsReadyTimeoutMs / 1000)s for N=\(nValue) (\(captchaModeLabel))", source: .tunnel) DispatchQueue.global(qos: .userInteractive).async { - StartProxy(vkLink, peerAddr, listenAddr, nValue) + StartProxy(vkLink, peerAddr, listenAddr, nValue, udpFlag) } + startCaptchaStatsPublisher() + Self.startMemoryLogger() + DispatchQueue.global(qos: .userInteractive).async { [weak self] in - let ready = ProxyWaitReady(12000) + let ready = ProxyWaitReady(dtlsReadyTimeoutMs) guard let self = self else { return } if ready == 0 { sharedLogger.error("DTLS connection timeout!") - SharedLogger.error("DTLS connection timeout (12s)", source: .tunnel) + SharedLogger.error("DTLS connection timeout (\(dtlsReadyTimeoutMs / 1000)s)", source: .tunnel) completionHandler(PacketTunnelProviderError.invalidProtocolConfiguration) return } @@ -107,18 +234,140 @@ class PacketTunnelProvider: NEPacketTunnelProvider { let interfaceName = self.adapter.interfaceName ?? 
"unknown" sharedLogger.log("Tunnel interface is \(interfaceName)") SharedLogger.info("Tunnel up on interface \(interfaceName)", source: .wireguard) + self.logRouteScope() + self.startNetworkMonitoring() } completionHandler(adapterError) } } } + /// Dump what is actually going into the tunnel. The previous version + /// of this method read `routeLAN`/`manualCaptcha` from + /// `providerConfiguration`, but the app never puts those keys + /// there — it bakes the routing decision into the WG peer's + /// `AllowedIPs` and reads `manualCaptcha` from the App Group's + /// shared UserDefaults. The result: this log was reporting false + /// for everything regardless of the actual UI state. Fixed to read + /// the same sources the rest of the extension uses. + private func logRouteScope() { + // Manual-captcha flag is the app-group setting that the rest of + // PacketTunnelProvider already reads (see startTunnel:104). + let manualCap = UserDefaults(suiteName: CaptchaIPC.appGroupID)? + .bool(forKey: "manualCaptcha") ?? false + + // The peer's AllowedIPs is the source of truth for what the OS + // routes into utun. With AllowedIPs=0.0.0.0/0, ::/0 everything + // goes through; with a narrow LAN list, the user's browser + // traffic exits via the underlying interface and only LAN/peer + // traffic uses the tunnel. + var allowedIPs: [String] = [] + if let settings = self.protocolConfiguration as? NETunnelProviderProtocol, + let cfg = settings.providerConfiguration, + let wgQuick = cfg["wgQuickConfig"] as? String { + for raw in wgQuick.split(separator: "\n") { + let line = raw.trimmingCharacters(in: .whitespaces) + if line.lowercased().hasPrefix("allowedips") { + if let eq = line.firstIndex(of: "=") { + let value = line[line.index(after: eq)...] 
+ .trimmingCharacters(in: .whitespaces) + allowedIPs = value.split(separator: ",") + .map { $0.trimmingCharacters(in: .whitespaces) } + } + } + } + } + + let isFullTunnel = allowedIPs.contains { $0 == "0.0.0.0/0" || $0 == "::/0" } + SharedLogger.info( + "Tunnel routing scope: AllowedIPs=\(allowedIPs.isEmpty ? "?" : allowedIPs.joined(separator: ",")) fullTunnel=\(isFullTunnel) manualCaptcha=\(manualCap)", + source: .tunnel + ) + if !isFullTunnel { + SharedLogger.info( + "Split tunnel: only AllowedIPs subnets go via utun, the user's browser traffic exits via the underlying network", + source: .tunnel + ) + } + } + + private func describe(_ status: Network.NWPath.Status) -> String { + switch status { + case .satisfied: return "satisfied" + case .unsatisfied: return "unsatisfied" + case .requiresConnection: return "requiresConnection" + @unknown default: return "unknown" + } + } + + private func startNetworkMonitoring() { + guard pathMonitor == nil else { return } + let monitor = NWPathMonitor() + monitor.pathUpdateHandler = { [weak self] path in + guard let self = self else { return } + let descriptors: [String] = [ + path.usesInterfaceType(.wifi) ? "wifi" : nil, + path.usesInterfaceType(.cellular) ? "cellular" : nil, + path.usesInterfaceType(.wiredEthernet) ? "ethernet" : nil + ].compactMap { $0 } + let label = descriptors.isEmpty ? "unknown" : descriptors.joined(separator: "+") + let prevStatus = self.lastPathStatus + let prevLabel = self.lastPathInterfaceLabel ?? "?" + self.lastPathStatus = path.status + self.lastPathInterfaceLabel = label + + let curStatusStr = self.describe(path.status) + + // Cellular flaps the path on PDP-context refreshes / tower + // handovers — same interface kind, status stays .satisfied, but + // an event fires every ~20s. Restarting on each one tears down + // a working DTLS for no reason. 
We only restart when something + // observable actually changed: interface kind flipped (wifi + // <-> cellular), or the path was previously unavailable and is + // now satisfied. Pure-noise events are dropped silently; the + // watchdog still catches real DTLS death. + guard let prevStatus = prevStatus else { + SharedLogger.info("NWPath initial: status=\(curStatusStr), via=\(label)", source: .tunnel) + return + } + let prevStatusStr = self.describe(prevStatus) + let interfaceFlipped = prevLabel != label + let recovered = prevStatus != Network.NWPath.Status.satisfied && path.status == Network.NWPath.Status.satisfied + if !interfaceFlipped && !recovered { + return + } + SharedLogger.info("NWPath change: \(prevLabel)/\(prevStatusStr) -> \(label)/\(curStatusStr)", source: .tunnel) + if path.status == Network.NWPath.Status.satisfied { + let reason = interfaceFlipped + ? "interface flip \(prevLabel) -> \(label)" + : "path recovered to \(label)" + self.restartTransport(reason: reason) + } + } + monitor.start(queue: DispatchQueue.global(qos: .utility)) + pathMonitor = monitor + SharedLogger.debug("NWPathMonitor started", source: .tunnel) + } + override func stopTunnel(with reason: NEProviderStopReason, completionHandler: @escaping () -> Void) { sharedLogger.log("Stopping tunnel") SharedLogger.info("Stopping tunnel (reason: \(reason.rawValue))", source: .tunnel) + // Tear down any in-flight manual captcha prompt FIRST: after + // StopProxy the Go waiter is gone and the sheet can never + // resolve. The app observes the cancel Darwin notification and + // dismisses the sheet. 
+ CaptchaBridge.teardown() + + pathMonitor?.cancel() + pathMonitor = nil + lastPathStatus = nil + lastPathInterfaceLabel = nil + + stopCaptchaStatsPublisher() StopProxy() SharedLogger.info("TURN proxy stopped", source: .tunnel) + TransportHealthMonitor.reset() adapter.stop { [weak self] error in guard self != nil else { return } @@ -140,17 +389,164 @@ class PacketTunnelProvider: NEPacketTunnelProvider { override func handleAppMessage(_ messageData: Data, completionHandler: ((Data?) -> Void)?) { - if let handler = completionHandler { - handler(messageData) + let response = CaptchaBridge.handleAppMessage(messageData) ?? messageData + completionHandler?(response) + } + + /// Periodically copy the Go-side captcha counters into the App + /// Group's shared UserDefaults so the main app's UI can render + /// "Direct: X · Tunnel: Y" without an IPC round-trip every tick. + /// Reset to 0/0 happens on disconnect so the previous run's + /// numbers don't ghost into the next connection. + private func startCaptchaStatsPublisher() { + stopCaptchaStatsPublisher() + let timer = DispatchSource.makeTimerSource(queue: DispatchQueue.global(qos: .utility)) + timer.schedule(deadline: .now(), repeating: .seconds(1)) + timer.setEventHandler { + let direct = Int(TurnBridgeGetCaptchaDirectCount()) + let tunnel = Int(TurnBridgeGetCaptchaTunnelCount()) + let remote = Int(TurnBridgeGetCaptchaRemoteCount()) + let directAttempts = Int(TurnBridgeGetCaptchaDirectAttempts()) + let tunnelAttempts = Int(TurnBridgeGetCaptchaTunnelAttempts()) + let remoteAttempts = Int(TurnBridgeGetCaptchaRemoteAttempts()) + let directInFlight = Int(TurnBridgeGetCaptchaDirectInFlight()) + let tunnelInFlight = Int(TurnBridgeGetCaptchaTunnelInFlight()) + let remoteInFlight = Int(TurnBridgeGetCaptchaRemoteInFlight()) + let sessionsReady = Int(TurnBridgeGetSessionsReady()) + let sessionsTarget = Int(TurnBridgeGetSessionsTarget()) + let directSat = TurnBridgeIsCaptchaDirectSaturated() != 0 + let tunnelSat = 
TurnBridgeIsCaptchaTunnelSaturated() != 0 + guard let defaults = UserDefaults(suiteName: CaptchaIPC.appGroupID) else { return } + defaults.set(direct, forKey: "captchaDirectCount") + defaults.set(tunnel, forKey: "captchaTunnelCount") + defaults.set(remote, forKey: "captchaRemoteCount") + defaults.set(directAttempts, forKey: "captchaDirectAttempts") + defaults.set(tunnelAttempts, forKey: "captchaTunnelAttempts") + defaults.set(remoteAttempts, forKey: "captchaRemoteAttempts") + defaults.set(directInFlight, forKey: "captchaDirectInFlight") + defaults.set(tunnelInFlight, forKey: "captchaTunnelInFlight") + defaults.set(remoteInFlight, forKey: "captchaRemoteInFlight") + defaults.set(sessionsReady, forKey: "sessionsReady") + defaults.set(sessionsTarget, forKey: "sessionsTarget") + defaults.set(directSat, forKey: "captchaDirectSaturated") + defaults.set(tunnelSat, forKey: "captchaTunnelSaturated") + } + timer.resume() + captchaStatsTimer = timer + } + + private func stopCaptchaStatsPublisher() { + captchaStatsTimer?.cancel() + captchaStatsTimer = nil + if let defaults = UserDefaults(suiteName: CaptchaIPC.appGroupID) { + defaults.set(0, forKey: "captchaDirectCount") + defaults.set(0, forKey: "captchaTunnelCount") + defaults.set(0, forKey: "captchaRemoteCount") + defaults.set(0, forKey: "captchaDirectAttempts") + defaults.set(0, forKey: "captchaTunnelAttempts") + defaults.set(0, forKey: "captchaRemoteAttempts") + defaults.set(0, forKey: "captchaDirectInFlight") + defaults.set(0, forKey: "captchaTunnelInFlight") + defaults.set(0, forKey: "captchaRemoteInFlight") + defaults.set(0, forKey: "sessionsReady") + defaults.set(0, forKey: "sessionsTarget") + defaults.set(false, forKey: "captchaDirectSaturated") + defaults.set(false, forKey: "captchaTunnelSaturated") } } override func sleep(completionHandler: @escaping () -> Void) { - // Add code here to get ready to sleep. + // iOS is about to suspend us. 
Don't tear anything down (iOS will + // resume us via wake()), but record the moment so wake() can decide + // whether the gap was long enough to need a fresh TURN allocation. + sharedLogger.log("System sleep — flagging proxy for reconnect on wake") + SharedLogger.info("System sleep — flagging proxy for reconnect on wake", source: .tunnel) + Self.lastSleepAt = Date() completionHandler() } override func wake() { - // Add code here to wake up. + // After a sleep iOS thaws our Go runtime. The TURN allocation + // and DTLS session held by the embedded vk-turn-proxy client + // MAY be stale (VK TURN drops idle channels after ~50 s, NAT + // mappings on the cellular side expire, pion/dtls sequence + // numbers can drift out of the replay window), but only after + // a long enough suspension. Short sleeps — screen-off blink, + // brief task switch, lock-unlock cycle — leave every socket + // intact and we just lose a few hundred ms of keepalive RTT. + // + // ProxyForceReconnect cancels ALL live TURN+DTLS sessions + // (test logs showed 96–100 cancellations per wake on N=50, + // and the recovery storm immediately trips VK's per-IP + // ERROR_LIMIT). For short gaps we'd rather keep the work the + // captcha pipeline already invested in. wakeReconnectThreshold + // is set below VK's allocation rotation window so anything + // shorter is presumed survivable. + let gap = Self.lastSleepAt.map { Date().timeIntervalSince($0) } ?? 
0 + let wakeReconnectThreshold: TimeInterval = 30 + if gap < wakeReconnectThreshold { + sharedLogger.log("System wake — short gap=\(String(format: "%.1f", gap))s, keeping live sessions") + SharedLogger.info("System wake — short gap=\(String(format: "%.1f", gap))s, keeping live sessions", source: .tunnel) + } else { + sharedLogger.log("System wake — gap=\(String(format: "%.1f", gap))s ≥ \(Int(wakeReconnectThreshold))s, forcing TURN/DTLS reconnect") + SharedLogger.info("System wake — gap=\(String(format: "%.1f", gap))s ≥ \(Int(wakeReconnectThreshold))s, forcing TURN/DTLS reconnect", source: .tunnel) + ProxyForceReconnect() + } + Self.lastSleepAt = nil + } + + // Records when iOS told us to sleep so wake() can log the suspension gap. + // Static because PacketTunnelProvider instances are owned by the system + // and we want to survive whatever lifecycle iOS chooses. + private static var lastSleepAt: Date? + + // Memory logger — runs every 5 s while the extension is alive. + // Reports (a) the iOS-given remaining memory budget for this + // extension via os_proc_available_memory() — this is the number + // that, once it hits zero, makes iOS terminate us. (b) resident + // set size via mach_task_basic_info, so we can see WHAT our + // memory actually is in OS terms (not just Go heap, which the + // Go-side memstats logger reports separately). The pair tells + // us how much headroom we have for raising N, where N=50 + // currently sits on the memory budget, and whether spikes + // come from Go (captcha pipeline) or non-Go (libdtls, mach + // ports, etc). + private static var memoryTimer: DispatchSourceTimer? + + static func startMemoryLogger() { + // Re-arm on every StartProxy so it survives Stop/Start cycles + // without leaking the previous timer. 
+ memoryTimer?.cancel() + let timer = DispatchSource.makeTimerSource(queue: .global(qos: .utility)) + timer.schedule(deadline: .now(), repeating: .seconds(5)) + timer.setEventHandler { + let avail = os_proc_available_memory() + let rss = currentResidentMemoryBytes() + // Numbers in MB for human-readable logs. + let availMB = Double(avail) / 1024.0 / 1024.0 + let rssMB = Double(rss) / 1024.0 / 1024.0 + let msg = String( + format: "memory: rss=%.1fMB available=%.1fMB", + rssMB, availMB + ) + sharedLogger.log("\(msg, privacy: .public)") + SharedLogger.info(msg, source: .tunnel) + } + timer.resume() + memoryTimer = timer + } + + private static func currentResidentMemoryBytes() -> UInt64 { + var info = mach_task_basic_info() + var count = mach_msg_type_number_t(MemoryLayout.size / MemoryLayout.size) + let kr = withUnsafeMutablePointer(to: &info) { ptr -> kern_return_t in + ptr.withMemoryRebound(to: integer_t.self, capacity: Int(count)) { + task_info(mach_task_self_, task_flavor_t(MACH_TASK_BASIC_INFO), $0, &count) + } + } + if kr != KERN_SUCCESS { + return 0 + } + return info.resident_size } } diff --git a/network-extension/TransportHealthMonitor.swift b/network-extension/TransportHealthMonitor.swift new file mode 100644 index 0000000..d13d391 --- /dev/null +++ b/network-extension/TransportHealthMonitor.swift @@ -0,0 +1,53 @@ +import Foundation + +/// Watches the Go proxy log stream to maintain a "transport-alive" flag in +/// the App Group's UserDefaults. The main app reads this to surface a +/// "Connection unstable" banner when iOS still says NEVPNStatus=.connected +/// but the underlying DTLS/TURN tunnel hasn't seen any traffic in a while. 
+enum TransportHealthMonitor { + static let lastAliveKey = "transport.lastAliveAt" + static let lastDeadKey = "transport.lastDeadAt" + + private static let appGroupID = "group.com.truvvor.turnbridge" + + private static let aliveSignals: [String] = [ + "Established DTLS connection", + "Proxy started on", + "Successfully registered User Identity" + ] + + private static let deadSignals: [String] = [ + "Watchdog:", + "Failed: ", + "Closed DTLS connection", + "DTLS connection timeout", + "Proxy gracefully stopped", + "RestartProxy:" + ] + + /// Inspect a single log line emitted from the Go proxy. + static func observe(_ message: String) { + for s in aliveSignals where message.contains(s) { + markAlive() + return + } + for s in deadSignals where message.contains(s) { + markDead() + return + } + } + + static func markAlive() { + UserDefaults(suiteName: appGroupID)?.set(Date(), forKey: lastAliveKey) + } + + static func markDead() { + UserDefaults(suiteName: appGroupID)?.set(Date(), forKey: lastDeadKey) + } + + static func reset() { + let defaults = UserDefaults(suiteName: appGroupID) + defaults?.removeObject(forKey: lastAliveKey) + defaults?.removeObject(forKey: lastDeadKey) + } +} diff --git a/network-extension/network_extension.entitlements b/network-extension/network_extension.entitlements index afcf562..12515c8 100644 --- a/network-extension/network_extension.entitlements +++ b/network-extension/network_extension.entitlements @@ -8,7 +8,7 @@ com.apple.security.application-groups - group.com.netlab.TurnBridge + group.com.truvvor.turnbridge diff --git a/quick_link.py b/quick_link.py index d40f4cf..166b658 100755 --- a/quick_link.py +++ b/quick_link.py @@ -7,6 +7,9 @@ "peer": "YOUR_SERVER_IP:PORT", "listen": "127.0.0.1:9000", "n": 1, + # Optional. true=UDP transport to TURN (default), false=TCP (more + # reliable on flaky cellular at the cost of head-of-line blocking). 
+ "udp": True, "wg": """[Interface] PrivateKey = YOUR_CLIENT_PRIVATE_KEY Address = 10.100.0.2/32 diff --git a/script/ci_setup_signing.rb b/script/ci_setup_signing.rb new file mode 100644 index 0000000..068f1b6 --- /dev/null +++ b/script/ci_setup_signing.rb @@ -0,0 +1,354 @@ +#!/usr/bin/env ruby +# frozen_string_literal: true +# +# Ensures a Distribution certificate and App Store provisioning profiles +# exist for TurnBridge using nothing but an App Store Connect API key. +# Talks to App Store Connect over plain Net::HTTP + a hand-signed ES256 JWT, +# so no fastlane / spaceship / external gems are required. +# +# Idempotent: reuses cert + profiles across runs and only recreates them +# when they're missing or expiring within 7 days. +# +# Required env: +# ASC_KEY_ID, ASC_ISSUER_ID, ASC_KEY_PATH +# APPLE_TEAM_ID +# APP_BUNDLE_ID, EXT_BUNDLE_ID +# MAC_KEYCHAIN_PASSWORD +# Optional env: +# KEYCHAIN_PATH (default ~/Library/Keychains/login.keychain-db) +# SIGNING_CACHE_DIR (default ~/.turnbridge_signing) +# +# Writes to $GITHUB_ENV when present: +# PROFILE_APP_NAME, PROFILE_EXT_NAME + +require 'openssl' +require 'base64' +require 'json' +require 'net/http' +require 'uri' +require 'fileutils' +require 'time' +require 'cgi' +require 'shellwords' + +KEY_ID = ENV.fetch('ASC_KEY_ID') +ISSUER_ID = ENV.fetch('ASC_ISSUER_ID') +KEY_FILE = ENV.fetch('ASC_KEY_PATH') +TEAM_ID = ENV.fetch('APPLE_TEAM_ID') +APP_BID = ENV.fetch('APP_BUNDLE_ID') +EXT_BID = ENV.fetch('EXT_BUNDLE_ID') +KEYCHAIN = ENV['KEYCHAIN_PATH'] || "#{ENV['HOME']}/Library/Keychains/login.keychain-db" +KEYCHAIN_PASS = ENV.fetch('MAC_KEYCHAIN_PASSWORD') +CACHE_DIR = ENV['SIGNING_CACHE_DIR'] || "#{ENV['HOME']}/.turnbridge_signing" +PROFILES_DIR = "#{ENV['HOME']}/Library/MobileDevice/Provisioning Profiles" + +CERT_KEY_PEM = File.join(CACHE_DIR, 'distribution.key') +CERT_CER_PEM = File.join(CACHE_DIR, 'distribution.cer.pem') +CERT_P12 = File.join(CACHE_DIR, 'distribution.p12') +P12_PASSWORD = 'TurnBridgeCI' + 
+PROFILE_NAMES = { + APP_BID => 'TurnBridge AppStore CI', + EXT_BID => 'TurnBridge Ext AppStore CI' +}.freeze + +FileUtils.mkdir_p(CACHE_DIR) +FileUtils.mkdir_p(PROFILES_DIR) + +# ----------------------------------------------------------------------- +# JWT (ES256) — built by hand so we don't need the `jwt` gem. +# ----------------------------------------------------------------------- + +def base64url(bytes) + Base64.urlsafe_encode64(bytes, padding: false) +end + +# Convert an ECDSA DER signature to the raw r || s JOSE encoding. +def der_to_jose(der) + seq = OpenSSL::ASN1.decode(der) + r = seq.value[0].value.to_s(2) + s = seq.value[1].value.to_s(2) + r = r.rjust(32, "\x00".b) + s = s.rjust(32, "\x00".b) + r + s +end + +def asc_jwt + ec = OpenSSL::PKey.read(File.read(KEY_FILE)) + header = JSON.generate('alg' => 'ES256', 'kid' => KEY_ID, 'typ' => 'JWT') + payload = JSON.generate('iss' => ISSUER_ID, + 'exp' => Time.now.to_i + 1200, + 'aud' => 'appstoreconnect-v1') + signing_input = "#{base64url(header)}.#{base64url(payload)}" + der_sig = ec.sign(OpenSSL::Digest.new('SHA256'), signing_input) + "#{signing_input}.#{base64url(der_to_jose(der_sig))}" +end + +JWT_TOKEN = asc_jwt +HOST = 'api.appstoreconnect.apple.com' + +def asc(method, path, body = nil) + uri = URI("https://#{HOST}#{path}") + req = case method + when :get then Net::HTTP::Get.new(uri) + when :post then Net::HTTP::Post.new(uri) + when :patch then Net::HTTP::Patch.new(uri) + when :delete then Net::HTTP::Delete.new(uri) + end + req['Authorization'] = "Bearer #{JWT_TOKEN}" + req['Accept'] = 'application/json' + if body + req['Content-Type'] = 'application/json' + req.body = JSON.generate(body) + end + res = Net::HTTP.start(uri.host, uri.port, use_ssl: true) { |http| http.request(req) } + parsed = res.body && !res.body.empty? ? 
(JSON.parse(res.body) rescue { 'raw' => res.body }) : nil + [res.code.to_i, parsed] +end + +def asc_ok!(code, body, action) + return if (200..299).include?(code) + msg = (body && body['errors']) ? body['errors'].map { |e| e['detail'] || e['title'] }.join('; ') : body.inspect + abort "#{action} failed: HTTP #{code} #{msg}" +end + +# ----------------------------------------------------------------------- +# Distribution certificate +# ----------------------------------------------------------------------- + +def list_distribution_certificates + page = "/v1/certificates?filter[certificateType]=IOS_DISTRIBUTION&limit=200" + certs = [] + loop do + code, body = asc(:get, page) + asc_ok!(code, body, 'list certificates') + certs.concat(body['data'] || []) + next_link = body.dig('links', 'next') + break unless next_link + page = next_link.sub(/^https:\/\/#{Regexp.escape(HOST)}/, '') + end + certs +end + +def cert_matches_key?(api_cert_b64, priv_pem_path) + return false unless File.exist?(priv_pem_path) + cer_der = Base64.decode64(api_cert_b64) + x509 = OpenSSL::X509::Certificate.new(cer_der) + priv = OpenSSL::PKey::RSA.new(File.read(priv_pem_path)) + x509.public_key.to_pem == priv.public_key.to_pem +rescue StandardError + false +end + +def import_p12!(p12_path) + ok = system('security', 'import', p12_path, + '-k', KEYCHAIN, + '-P', P12_PASSWORD, + '-T', '/usr/bin/codesign', + '-T', '/usr/bin/productbuild', + '-A') + abort 'security import failed' unless ok + system('security', 'set-key-partition-list', + '-S', 'apple-tool:,apple:,codesign:,productbuild:', + '-s', '-k', KEYCHAIN_PASS, KEYCHAIN) +end + +def keychain_distribution_sha1s + out = `security find-identity -v -p codesigning #{Shellwords.escape(KEYCHAIN)} 2>/dev/null` + out.lines.grep(/Apple Distribution/i).map { |l| l[/[0-9A-F]{40}/] }.compact.map(&:upcase) +end + +cert_resource_id = nil +cert_sha1 = nil +api_certs = list_distribution_certificates + +# Strategy 1: prefer a Distribution cert that already exists in 
the login +# keychain (i.e. its private key is locally available) AND is registered in +# App Store Connect. This avoids creating duplicates if the user already had +# a working signing identity from a previous Xcode session. +keychain_sha1s = keychain_distribution_sha1s +unless keychain_sha1s.empty? + best = nil + api_certs.each do |c| + cer_b64 = c.dig('attributes', 'certificateContent') + next unless cer_b64 + sha1 = OpenSSL::Digest::SHA1.hexdigest(Base64.decode64(cer_b64)).upcase + next unless keychain_sha1s.include?(sha1) + exp = Time.parse(c.dig('attributes', 'expirationDate')) rescue Time.now + if best.nil? || exp > best[:exp] + best = { cert: c, sha1: sha1, exp: exp } + end + end + + if best + cert_resource_id = best[:cert]['id'] + cert_sha1 = best[:sha1] + cn = OpenSSL::X509::Certificate.new(Base64.decode64(best[:cert].dig('attributes', 'certificateContent'))) + .subject.to_a.find { |f| f[0] == 'CN' }&.[](1) + puts "Using existing keychain Distribution cert (CN=#{cn}, api_id=#{cert_resource_id}, sha1=#{cert_sha1})" + end +end + +# Strategy 2: an earlier run of this script created and cached a cert; reuse if it's still in API. +if cert_resource_id.nil? + api_certs.each do |c| + cer_content = c.dig('attributes', 'certificateContent') + next unless cer_content && cert_matches_key?(cer_content, CERT_KEY_PEM) + cert_resource_id = c['id'] + cert_sha1 = OpenSSL::Digest::SHA1.hexdigest(Base64.decode64(cer_content)).upcase + puts "Reusing API cert that matches cached key: #{c['id']} (sha1=#{cert_sha1})" + if File.exist?(CERT_P12) + import_p12!(CERT_P12) + end + break + end +end + +if cert_resource_id.nil? 
+ puts 'Creating new Distribution certificate via ASC API' + + priv = OpenSSL::PKey::RSA.new(2048) + csr = OpenSSL::X509::Request.new + csr.subject = OpenSSL::X509::Name.new([['CN', 'TurnBridge Distribution']]) + csr.public_key = priv.public_key + csr.sign(priv, OpenSSL::Digest.new('SHA256')) + csr_b64 = Base64.strict_encode64(csr.to_der) + + body = { + data: { + type: 'certificates', + attributes: { certificateType: 'IOS_DISTRIBUTION', csrContent: csr_b64 } + } + } + code, response = asc(:post, '/v1/certificates', body) + + if code == 409 || (response && response['errors']&.any? { |e| (e['detail'] || '') =~ /maximum number/i }) + puts 'Hit Apple distribution-cert limit; revoking oldest existing one' + victim = api_certs.min_by do |c| + Time.parse(c.dig('attributes', 'expirationDate')) rescue Time.now + 365 * 86_400 + end + if victim + d_code, d_body = asc(:delete, "/v1/certificates/#{victim['id']}") + asc_ok!(d_code, d_body, "delete cert #{victim['id']}") + end + code, response = asc(:post, '/v1/certificates', body) + end + asc_ok!(code, response, 'create distribution certificate') + + data = response['data'] + cert_resource_id = data['id'] + cer_b64 = data.dig('attributes', 'certificateContent') + cer_der = Base64.decode64(cer_b64) + cer_x509 = OpenSSL::X509::Certificate.new(cer_der) + cert_sha1 = OpenSSL::Digest::SHA1.hexdigest(cer_der).upcase + + File.write(CERT_KEY_PEM, priv.to_pem) + File.write(CERT_CER_PEM, cer_x509.to_pem) + p12 = OpenSSL::PKCS12.create(P12_PASSWORD, 'Apple Distribution', priv, cer_x509) + File.binwrite(CERT_P12, p12.to_der) + + import_p12!(CERT_P12) + puts "Distribution cert ready: #{cert_resource_id}" +end + +# ----------------------------------------------------------------------- +# Bundle IDs lookup +# ----------------------------------------------------------------------- + +def find_bundle_id(identifier) + q = CGI.escape(identifier) + code, body = asc(:get, "/v1/bundleIds?filter[identifier]=#{q}&limit=200") + asc_ok!(code, body, 
"list bundle ids for #{identifier}") + (body['data'] || []).find { |b| b.dig('attributes', 'identifier') == identifier } +end + +bundles = {} +PROFILE_NAMES.each_key do |bid| + bundle = find_bundle_id(bid) + abort "Bundle ID #{bid} is not registered" unless bundle + bundles[bid] = bundle +end + +# ----------------------------------------------------------------------- +# App Store profiles +# ----------------------------------------------------------------------- + +def find_profile_by_name(name) + q = CGI.escape(name) + code, body = asc(:get, "/v1/profiles?filter[name]=#{q}&include=certificates&limit=200") + asc_ok!(code, body, "find profile #{name}") + (body['data'] || []).first +end + +results = {} +PROFILE_NAMES.each do |bid, name| + bundle = bundles[bid] + profile = find_profile_by_name(name) + + recreate = profile.nil? + if profile + cert_ids = (profile.dig('relationships', 'certificates', 'data') || []).map { |c| c['id'] } + unless cert_ids.include?(cert_resource_id) + puts "Profile '#{name}' references different cert; recreating" + recreate = true + end + if !recreate + exp = profile.dig('attributes', 'expirationDate') + if exp && Time.parse(exp) < Time.now + 7 * 86_400 + puts "Profile '#{name}' expires within 7 days; recreating" + recreate = true + end + end + end + + if recreate + if profile + code, body = asc(:delete, "/v1/profiles/#{profile['id']}") + asc_ok!(code, body, "delete profile #{profile['id']}") unless code == 404 + end + + create_body = { + data: { + type: 'profiles', + attributes: { name: name, profileType: 'IOS_APP_STORE' }, + relationships: { + bundleId: { data: { type: 'bundleIds', id: bundle['id'] } }, + certificates: { data: [{ type: 'certificates', id: cert_resource_id }] } + } + } + } + code, body = asc(:post, '/v1/profiles', create_body) + asc_ok!(code, body, "create profile #{name}") + profile = body['data'] + puts "Created profile '#{name}' (#{profile['id']})" + else + puts "Reusing profile '#{name}' (#{profile['id']})" + end + + 
prof_b64 = profile.dig('attributes', 'profileContent') + abort "Profile '#{name}' has no content" unless prof_b64 + prof_data = Base64.decode64(prof_b64) + uuid = prof_data[%r{UUID\s*([^<]+)}, 1] + abort "Could not parse UUID from profile #{name}" unless uuid + + dest = File.join(PROFILES_DIR, "#{uuid}.mobileprovision") + File.binwrite(dest, prof_data) + puts "Saved #{dest}" + + results[bid] = { name: name, uuid: uuid, profile_id: profile['id'] } +end + +# ----------------------------------------------------------------------- +# Hand profile names back to the workflow +# ----------------------------------------------------------------------- + +if (gh_env = ENV['GITHUB_ENV']) + File.open(gh_env, 'a') do |f| + f.puts "PROFILE_APP_NAME=#{results[APP_BID][:name]}" + f.puts "PROFILE_EXT_NAME=#{results[EXT_BID][:name]}" + f.puts "SIGNING_CERT_SHA1=#{cert_sha1}" if cert_sha1 + end +end + +puts JSON.pretty_generate(results) +puts 'Signing setup complete' diff --git a/wireguard-apple/Sources/WireGuardKitGo/.gitignore b/wireguard-apple/Sources/WireGuardKitGo/.gitignore index 5d25f8f..1c4d180 100644 --- a/wireguard-apple/Sources/WireGuardKitGo/.gitignore +++ b/wireguard-apple/Sources/WireGuardKitGo/.gitignore @@ -1,3 +1,8 @@ .cache/ .tmp/ out/ + +# `go build .` from this directory produces a binary named after the +# Go module (github.com/amnezia-vpn/amneziawg-apple). Ignore it so +# verifying compilation locally doesn't leave a 50 MB artifact behind. +amneziawg-apple diff --git a/wireguard-apple/Sources/WireGuardKitGo/captcha_client.go b/wireguard-apple/Sources/WireGuardKitGo/captcha_client.go new file mode 100644 index 0000000..74b1fc3 --- /dev/null +++ b/wireguard-apple/Sources/WireGuardKitGo/captcha_client.go @@ -0,0 +1,118 @@ +// captcha_client.go — HTTP client for the VK captcha solver that +// impersonates Safari iOS at the TLS + HTTP/2 layer. 
+// +// Why this exists: in mid-2026 VK upgraded its anti-bot to fingerprint +// every captcha request by JA3/JA4 (TLS ClientHello shape) and HTTP/2 +// SETTINGS + header-order. Go's net/http has a distinctive ClientHello +// and writes HTTP/2 headers in a stable but non-Chrome/Safari order. +// VK's classifier now flags us as a non-browser and either traffic- +// shapes the IP or returns ERROR_LIMIT on captcha solve, even when +// our UA + cookies + behavioral signals are pristine. +// +// bogdanfinn/tls-client (built on utls) lets us send TLS handshakes +// with byte-identical ClientHello to a real Safari iOS 18, plus +// HTTP/2 SETTINGS frame and pseudo-header order. fhttp (a fork of +// net/http) preserves the original header order via the magic +// HeaderOrderKey, so the server sees the exact sequence Safari emits. +// Pure Go, no cgo, runs unchanged inside the iOS NetworkExtension. +// +// Limitation vs the old newCaptchaClient: tls-client's WithDialer +// takes a net.Dialer struct, not a DialContext function, so we can't +// plug our customDial / cellularDial / DoH fallback chain (see +// dns_resolver.go) directly. iOS system resolver handles the captcha +// hosts (api.vk.ru, id.vk.ru) just fine in practice; the DoH chain +// was a paranoia-fallback for Russian carriers that NXDOMAIN +// vk-family hosts, which the captcha-service field log hasn't shown +// in the wild. If that path becomes necessary, the path is +// pre-resolve-via-DoH + dial-by-IP + WithServerNameOverwrite. + +package main + +import ( + "context" + + fhttp "github.com/bogdanfinn/fhttp" + "github.com/bogdanfinn/fhttp/cookiejar" + tlsclient "github.com/bogdanfinn/tls-client" + tlsprofiles "github.com/bogdanfinn/tls-client/profiles" +) + +// safariHeaderOrder is the order Safari iOS 18 writes HTTP/2 request +// headers in. Order matters for VK's classifier even though HTTP/2 +// is semantically order-insensitive — Chrome and Safari diverge here +// and that's one of the cheap bot tells. 
Keep this in sync with +// whatever profile we pass to WithClientProfile below. +var safariHeaderOrder = []string{ + "host", + "accept", + "sec-fetch-site", + "accept-encoding", + "sec-fetch-mode", + "user-agent", + "accept-language", + "sec-fetch-dest", + "referer", + "priority", + "cookie", + "content-type", + "content-length", + "origin", +} + +// safariPHeaderOrder is the order Safari iOS writes HTTP/2 pseudo- +// headers. Almost every HTTP/2 client writes :method, :scheme, +// :path, :authority — but the exact order varies and is yet another +// bot tell. Safari iOS uses the order below. +var safariPHeaderOrder = []string{ + ":method", + ":scheme", + ":path", + ":authority", +} + +// newTLSCaptchaClient returns a tls-client HttpClient with Safari +// iOS 18.0 fingerprint and a fresh cookie jar (each captcha solve +// gets its own jar — VK's classifier checks for prior session state +// and an empty jar simulates a clean browser launch). +// +// forceDirect was previously a hook to route HTTP through a non-utun +// interface when the tunnel egress hit a per-IP rate-limit. tls- +// client doesn't expose a DialContext slot the same way net/http +// does, so for now forceDirect is honored only as a documentation +// signal — the actual dial goes through whatever route iOS picks. +// If the field-log starts showing tunnel-egress rate-limits on +// captcha calls again, revisit by either (a) WithLocalAddr to a +// physical-interface IP we discover via getifaddrs, or (b) fork +// tls-client to add a DialContext option.
+func newTLSCaptchaClient(forceDirect bool) (tlsclient.HttpClient, error) { + jar, err := cookiejar.New(nil) + if err != nil { + return nil, err + } + opts := []tlsclient.HttpClientOption{ + tlsclient.WithTimeoutSeconds(20), + tlsclient.WithClientProfile(tlsprofiles.Safari_IOS_18_0), + tlsclient.WithCookieJar(jar), + // HTTP/3 isn't worth racing for VK's API hosts; HTTP/2 is + // faster to first byte and matches what mobile Safari uses + // for these endpoints anyway. + tlsclient.WithDisableHttp3(), + } + _ = forceDirect // see comment above + return tlsclient.NewHttpClient(tlsclient.NewNoopLogger(), opts...) +} + +// applySafariHeaderOrder stamps the magic fhttp.HeaderOrderKey and +// PHeaderOrderKey on a request so the underlying RoundTripper writes +// headers in Safari's order. Call after setting all real headers. +func applySafariHeaderOrder(req *fhttp.Request) { + req.Header[fhttp.HeaderOrderKey] = safariHeaderOrder + req.Header[fhttp.PHeaderOrderKey] = safariPHeaderOrder +} + +// withCaptchaCtx attaches ctx to req — fhttp.NewRequest doesn't take +// a context the way net/http.NewRequestWithContext does, so we apply +// it post-hoc. Centralised so callers don't forget. +func withCaptchaCtx(ctx context.Context, req *fhttp.Request) *fhttp.Request { + return req.WithContext(ctx) +} diff --git a/wireguard-apple/Sources/WireGuardKitGo/captcha_debug_info.go b/wireguard-apple/Sources/WireGuardKitGo/captcha_debug_info.go new file mode 100644 index 0000000..6af6eb8 --- /dev/null +++ b/wireguard-apple/Sources/WireGuardKitGo/captcha_debug_info.go @@ -0,0 +1,110 @@ +// captcha_debug_info.go — dynamic debug_info hash for VK captcha. +// +// VK's not_robot_captcha.js embeds a per-version 64-char hex string in +// a debug_info constant. The captchaNotRobot.check API call expects +// the same string echoed back as a debug_info query param — VK uses it +// as a "did this client actually load the same script the page +// referenced" signal. 
Sending the wrong value (or a stale one from a +// previous JS version) is one of the easiest bot tells they have. +// +// Our pre-v2 code pasted a hard-coded SHA-256 from one specific build +// of not_robot_captcha.js. VK pushes the script regularly; whenever +// they do, every solve from us starts failing with status=BOT until +// someone notices and updates the constant. The Moroka8 reference +// implementation handles this by fetching the script live, regex- +// extracting the hash, and caching it by script URL — which is what +// this file does. +// +// The script URL itself rotates infrequently (versioned path like +// `/vkid/2.5.7/not_robot_captcha.js`), so the cache hit-rate is high. + +package main + +import ( + "context" + "fmt" + "io" + "regexp" + "sync" + + fhttp "github.com/bogdanfinn/fhttp" + tlsclient "github.com/bogdanfinn/tls-client" +) + +// debugInfoCache maps script URL → 64-char hex debug_info. sync.Map +// because reads dominate writes and we want lock-free hits on the +// common path. Entries never expire — when VK ships a new version the +// URL changes, the cache miss kicks fetchDebugInfo, and the old entry +// stays around harmlessly (tens of bytes). +var debugInfoCache sync.Map + +// scriptURLRe pulls the not_robot_captcha.js URL out of the bootstrap +// HTML. The path shape has stayed stable across the redesigns we've +// observed: `/vkid//not_robot_captcha.js` under one of a few +// CDN hosts. We match the whole src URL so we can fetch it directly. +var scriptURLRe = regexp.MustCompile(`]+src="([^"]+not_robot_captcha\.js[^"]*)"`) + +// debugInfoRe pulls the 64-hex-char debug_info constant out of the +// minified script. VK has used a few syntactic forms (assignment to a +// const, embedded in a larger string with || operators); the regex +// tolerates both. +var debugInfoRe = regexp.MustCompile(`debug_info\s*:\s*(?:[^"]*\|\|\s*)?"([a-fA-F0-9]{64})"`) + +// extractScriptURL finds the not_robot_captcha.js URL in the bootstrap +// HTML. 
Returns "" if not present (typical when VK responds with a +// non-captcha page) — caller should fall back to the legacy hard-coded +// hash in that case. +func extractScriptURL(html string) string { + if m := scriptURLRe.FindStringSubmatch(html); len(m) >= 2 { + return m[1] + } + return "" +} + +// fetchDebugInfo returns the debug_info hash for the given script URL, +// fetching and caching on first miss. ctx-aware so cancellation +// propagates. Returns "" + error if the fetch failed or the regex +// didn't match — caller should NOT fall back to a stale constant when +// this errors; better to fail the solve and retry on the next run +// (where the cache might be warm or VK might be in a healable state). +func fetchDebugInfo(ctx context.Context, client tlsclient.HttpClient, profile Profile, scriptURL string) (string, error) { + if scriptURL == "" { + return "", fmt.Errorf("empty scriptURL") + } + if cached, ok := debugInfoCache.Load(scriptURL); ok { + return cached.(string), nil + } + + req, err := fhttp.NewRequest("GET", scriptURL, nil) + if err != nil { + return "", err + } + req = withCaptchaCtx(ctx, req) + req.Header.Set("User-Agent", profile.UserAgent) + req.Header.Set("Accept", "text/javascript,application/javascript,*/*;q=0.1") + req.Header.Set("Accept-Language", "en-US,en;q=0.9") + req.Header.Set("Referer", "https://id.vk.com/") + req.Header.Set("Sec-Fetch-Site", "same-site") + req.Header.Set("Sec-Fetch-Mode", "no-cors") + req.Header.Set("Sec-Fetch-Dest", "script") + applySafariHeaderOrder(req) + + resp, err := client.Do(req) + if err != nil { + return "", fmt.Errorf("fetch script: %w", err) + } + defer resp.Body.Close() + + body, err := io.ReadAll(resp.Body) + if err != nil { + return "", fmt.Errorf("read script: %w", err) + } + + m := debugInfoRe.FindSubmatch(body) + if len(m) < 2 { + return "", fmt.Errorf("debug_info constant not found in %s", scriptURL) + } + di := string(m[1]) + debugInfoCache.Store(scriptURL, di) + return di, nil +} diff --git 
a/wireguard-apple/Sources/WireGuardKitGo/captcha_manual.go b/wireguard-apple/Sources/WireGuardKitGo/captcha_manual.go new file mode 100644 index 0000000..c734ac3 --- /dev/null +++ b/wireguard-apple/Sources/WireGuardKitGo/captcha_manual.go @@ -0,0 +1,457 @@ +// SPDX-License-Identifier: MIT +// +// Manual captcha bridge. Lets the Swift app/extension show a real browser +// (WKWebView) for the VK NotRobot captcha when the auto solver can't beat +// it. Swift registers a single C callback via TurnBridgeSetManualCaptchaCallback; +// when the auto-solver bails, Go invokes that callback with a redirect_uri +// and blocks until Swift answers via TurnBridgeSubmitManualCaptchaToken or +// TurnBridgeCancelManualCaptcha. + +package main + +/* +#include +#include + +typedef void (*manual_captcha_cb)(const char* request_id, const char* redirect_uri); + +static inline void invoke_manual_captcha_cb(manual_captcha_cb cb, + const char* request_id, + const char* redirect_uri) { + cb(request_id, redirect_uri); +} +*/ +import "C" + +import ( + "encoding/json" + "fmt" + "log" + "sync" + "sync/atomic" + "time" + "unsafe" +) + +type manualCaptchaSlot struct { + tokenCh chan string + // responseCh carries the full JSON response when the WebView did + // the follow-up VK API call itself instead of just extracting the + // success_token. The follow-up runs inside the same browser + // session that solved the captcha — same cookies, same TLS, same + // IP — so VK sees a single coherent actor instead of "token minted + // here, redeemed over there". Empty retryURL = WebView falls back + // to the legacy tokenCh path. + responseCh chan string + errCh chan error + // retryURL + retryBody are the request the WebView should make + // after extracting success_token. retryBody contains the literal + // string "__TOKEN__" which the WebView's injected JS replaces + // with the actual token before sending. 
+ retryURL string + retryBody string +} + +// manualCaptchaQuotaPerSession bounds how many times the iOS UI may +// be prompted within a single StartProxy session. The user's +// expectation, restated 1.3.28: solve at most 3 captchas yourself, +// after that the remote captcha-service cluster picks up. +// +// Once the quota is exhausted, manualCaptchaFallbackAvailable returns +// false for the rest of the session (manualCaptchaForcedMode does not +// consult the quota) and getCreds returns "quota exhausted"; the +// per-session goroutine then either picks up creds from a recycled +// identity OR (with the lowered remoteHandoverThreshold) gets routed +// to the remote service. Quota resets on the next StartProxy via the +// counter being reset there. +const manualCaptchaQuotaPerSession = 3 + +var manualCaptchaInvocations atomic.Int64 + +// manualCaptchaLastSolveUnix records when the last manual solve +// SUCCEEDED. VK rate-limits NotRobot success per source IP: once one +// solve lands on a given IP, the widget short-circuits to an +// already-checked box for ~a minute and refuses to mint a second +// success_token — so a second sheet on the same IP is unsolvable, the +// user can only Cancel (observed in the field). Within the cooldown we +// therefore don't show another sheet: defer to the remote captcha +// service (its own IPs) if configured, else fail fast so the caller +// reuses an identity instead of trapping the user. Reset per StartProxy. +var manualCaptchaLastSolveUnix atomic.Int64 + +const manualCaptchaPerIPCooldown = 60 * time.Second + +// resetManualCaptchaQuota is called from StartProxy at the top so +// each fresh session starts the user out at full quota again.
+func resetManualCaptchaQuota() { + manualCaptchaInvocations.Store(0) + manualCaptchaLastSolveUnix.Store(0) +} + +func manualCaptchaQuotaRemaining() int64 { + used := manualCaptchaInvocations.Load() + rem := int64(manualCaptchaQuotaPerSession) - used + if rem < 0 { + return 0 + } + return rem +} + +// manualCaptchaSerialise enforces that only one captcha sheet is +// shown to the user at a time. Without it — see the 1.3.27 field +// log — the first wave of N=60 session goroutines all called +// requestManualCaptcha within the same millisecond. Each one +// publishRequest'd its own PendingRequest into the App Group +// UserDefaults under the SAME KEY; only the last write survived. +// The Swift Manager only ever saw the LATEST request, the sheet +// kept swapping URLs under the user's fingers, and the visible +// "stuck on green checkmark" was actually the page from one +// request being overwritten by the next request's URL while the +// JS helper from the first request was mid-flight. +// +// Serialising at the Go entry point means goroutines line up in +// arrival order. Each waits for the previous one to fully finish +// (solve, cancel, or timeout) before publishRequest fires the +// callback. The Swift side never sees overlapping requests. +// +// Quota and serialisation interact cleanly: the lock acquisition +// happens first, then the quota counter is bumped (and rolled back +// on rejection). At most manualCaptchaQuotaPerSession (3) callers +// are ever shown a sheet; the 4th — 60th wait in line until the 3rd +// completes, then immediately fall through to the quota-exhausted branch. +// manualCaptchaSerialise is a binary semaphore implemented as a +// 1-buffered channel: send to claim, receive to release. Channel +// pick at send time is FIFO-ish under Go's runtime, which matches +// the "arrival order" intent better than sync.Mutex's unspecified +// wakeup order.
+var manualCaptchaSerialise = make(chan struct{}, 1) + +const ( + manualCaptchaModeOff = 0 + manualCaptchaModeForced = 1 + manualCaptchaModeFallback = 2 +) + +var ( + manualCaptchaMu sync.RWMutex + manualCaptchaCB C.manual_captcha_cb + manualCaptchaMode int + manualCaptchaSlotsMu sync.Mutex + manualCaptchaSlots = make(map[string]*manualCaptchaSlot) +) + +//export TurnBridgeSetManualCaptchaMode +func TurnBridgeSetManualCaptchaMode(mode C.int) { + manualCaptchaMu.Lock() + defer manualCaptchaMu.Unlock() + manualCaptchaMode = int(mode) +} + +// manualCaptchaForcedMode reports whether the user explicitly chose +// forced mode AND a callback is available to display sheets. It does +// NOT consult the per-session quota — caller dispatches to +// requestManualCaptcha which handles quota + defer-to-remote inside. +// That single-source-of-truth is what lets quota exhaustion route to +// the server cluster instead of silently falling to the auto solver +// (which would surprise a forced-mode user with auto behaviour). +func manualCaptchaForcedMode() bool { + manualCaptchaMu.RLock() + defer manualCaptchaMu.RUnlock() + return manualCaptchaMode == manualCaptchaModeForced && manualCaptchaCB != nil +} + +// manualCaptchaFallbackAvailable reports whether the UI prompt can +// be used as a last-resort fallback when both the auto solver and +// the remote /cred path have given up on this captcha. Different +// from forced mode: only consulted by solveVkCaptcha at the end of +// the auto chain, not at the start. Quota is consulted here — if +// exhausted there is no point asking the caller to invoke +// requestManualCaptcha, since fallback mode's only escalation path +// IS the sheet (no further auto-retry after this). 
+func manualCaptchaFallbackAvailable() bool { + manualCaptchaMu.RLock() + defer manualCaptchaMu.RUnlock() + if manualCaptchaMode != manualCaptchaModeFallback || manualCaptchaCB == nil { + return false + } + return manualCaptchaQuotaRemaining() > 0 +} + +// manualCaptchaBootstrapActive reports whether we're still bringing up +// the very first session and should solve by hand BEFORE trying the +// auto tls-client chain. Rationale (see solveVkCaptcha): under hard +// blocking there's no tunnel and no reachable captcha-service until the +// first session exists, and the auto chain's status:BOT verdict poisons +// the session/IP the user is about to solve in real WebKit. So when no +// session is ready yet, a manual handler is registered, the user opted +// into prompts (mode != off), and quota remains, do manual-first. +// +// Unlike manualCaptchaForcedMode / manualCaptchaFallbackAvailable this +// fires in every non-off mode — the whole point is to pre-empt the auto +// attempt during bootstrap. After the first session comes up +// (captchaSessionsReady>0) this returns false and normal mode behaviour resumes.
+func manualCaptchaBootstrapActive() bool { + manualCaptchaMu.RLock() + mode := manualCaptchaMode + cb := manualCaptchaCB + manualCaptchaMu.RUnlock() + if cb == nil || mode == manualCaptchaModeOff { + return false + } + if captchaSessionsReady.Load() > 0 { + return false + } + return manualCaptchaQuotaRemaining() > 0 +} + +//export TurnBridgeSetManualCaptchaCallback +func TurnBridgeSetManualCaptchaCallback(cb C.manual_captcha_cb) { + manualCaptchaMu.Lock() + defer manualCaptchaMu.Unlock() + manualCaptchaCB = cb +} + +//export TurnBridgeSubmitManualCaptchaToken +func TurnBridgeSubmitManualCaptchaToken(cReqID *C.char, cToken *C.char) { + if cReqID == nil { + return + } + reqID := C.GoString(cReqID) + token := "" + if cToken != nil { + token = C.GoString(cToken) + } + + manualCaptchaSlotsMu.Lock() + slot, ok := manualCaptchaSlots[reqID] + manualCaptchaSlotsMu.Unlock() + if !ok { + return + } + select { + case slot.tokenCh <- token: + default: + } +} + +//export TurnBridgeCancelManualCaptcha +func TurnBridgeCancelManualCaptcha(cReqID *C.char, cReason *C.char) { + if cReqID == nil { + return + } + reqID := C.GoString(cReqID) + reason := "user cancelled" + if cReason != nil { + if s := C.GoString(cReason); s != "" { + reason = s + } + } + + manualCaptchaSlotsMu.Lock() + slot, ok := manualCaptchaSlots[reqID] + manualCaptchaSlotsMu.Unlock() + if !ok { + return + } + select { + case slot.errCh <- fmt.Errorf("%s", reason): + default: + } +} + +// requestManualCaptcha asks the registered handler (the iOS app, via +// the extension) to solve the captcha at redirectURI. Blocks until +// the WebView responds. +// +// If retryURL is non-empty, the WebView will use the just-solved +// browser session to POST retryBody (with literal "__TOKEN__" +// replaced by the actual success_token) to retryURL, and return the +// resulting JSON response as `response`. 
The caller can use that +// response directly, skipping its own retry — VK never sees a +// session switch between captcha solve and API redemption. +// +// If retryURL is empty OR if the WebView's in-session retry fails +// (network error, fetch threw, response not extractable), it falls +// back to returning just the success_token via `token`. The caller +// then does the legacy retry from its own HTTP client. +// +// Exactly one of (token, response, err) is non-empty on return. +func requestManualCaptcha(redirectURI, retryURL, retryBody string, timeout time.Duration) (token, response string, err error) { + manualCaptchaMu.RLock() + cb := manualCaptchaCB + manualCaptchaMu.RUnlock() + if cb == nil { + return "", "", fmt.Errorf("manual captcha handler not registered") + } + if redirectURI == "" { + return "", "", fmt.Errorf("manual captcha redirect_uri is empty") + } + + // Serialise: only one captcha sheet is shown at a time. See + // manualCaptchaSerialise comment. Goroutines stack up here and + // release the slot on return (success, cancel, or timeout). + // CRITICAL: the slot is held across the entire user-facing + // solve so PendingRequest in App Group UserDefaults isn't + // overwritten by the next goroutine while the user is still + // looking at the current sheet. + manualCaptchaSerialise <- struct{}{} + defer func() { <-manualCaptchaSerialise }() + + // While we waited in the queue, another goroutine may have + // brought up the first WG session. If so, the remote + // captcha-service is now preferred (see remoteHandoverThreshold) + // and bothering the user with another sheet would defeat the + // "max 3 captchas per StartProxy" promise. Bail out with a + // sentinel error; getCredsRouted catches it and re-routes this + // call to the server cluster. 
+ if shouldDeferToRemoteNow() { + log.Printf("[Captcha] deferring queued manual prompt to remote (sessions_ready=%d)", captchaSessionsReady.Load()) + return "", "", errDeferToRemote + } + + // Per-IP cooldown. If a manual solve already succeeded in the last + // minute, VK won't issue a second success_token on this IP — the + // next sheet would render an already-checked box the user can't + // re-solve. Catches the race where a queued prompt is dequeued in + // the ~1 s before captchaSessionsReady ticks up and shouldDeferTo- + // RemoteNow would have caught it. Defer to remote if available; + // otherwise fail fast so the caller reuses an identity. + if last := manualCaptchaLastSolveUnix.Load(); last > 0 { + if elapsed := time.Since(time.Unix(last, 0)); elapsed < manualCaptchaPerIPCooldown { + log.Printf("[Captcha] per-IP cooldown after manual solve (%s/%s) — suppressing second sheet", + elapsed.Round(time.Second), manualCaptchaPerIPCooldown) + if remoteCaptchaEnabled() { + return "", "", errDeferToRemote + } + return "", "", fmt.Errorf("manual captcha per-IP cooldown (%s remaining), no remote configured", + (manualCaptchaPerIPCooldown - elapsed).Round(time.Second)) + } + } + + // Reserve a slot in the per-session quota AFTER acquiring the + // serialise lock. Without that ordering, all 60 goroutines + // could race past the quota gate at once (Add(1) returns a + // monotonic counter but doesn't block), then only the first 3 + // to acquire the lock actually run. The remaining 57 would + // have already burned a slot. With this ordering, only as + // many slots are spent as sheets are actually shown. + used := manualCaptchaInvocations.Add(1) + if used > int64(manualCaptchaQuotaPerSession) { + manualCaptchaInvocations.Add(-1) + // User's hard ask: never see more than N sheets per + // StartProxy.
If the remote captcha-service is available + // after we've exhausted the quota, route this one there + // instead of returning a hard error (which would cascade + // into auto-solver attempts that fail at slider step). + if shouldDeferToRemoteNow() { + log.Printf("[Captcha] manual quota exhausted (%d/%d), deferring to remote", used-1, manualCaptchaQuotaPerSession) + return "", "", errDeferToRemote + } + return "", "", fmt.Errorf("manual captcha quota exhausted (%d/%d)", used-1, manualCaptchaQuotaPerSession) + } + log.Printf("[Captcha] manual prompt %d/%d this session", used, manualCaptchaQuotaPerSession) + + reqID := randomHex(8) + slot := &manualCaptchaSlot{ + tokenCh: make(chan string, 1), + responseCh: make(chan string, 1), + errCh: make(chan error, 1), + retryURL: retryURL, + retryBody: retryBody, + } + + manualCaptchaSlotsMu.Lock() + manualCaptchaSlots[reqID] = slot + manualCaptchaSlotsMu.Unlock() + defer func() { + manualCaptchaSlotsMu.Lock() + delete(manualCaptchaSlots, reqID) + manualCaptchaSlotsMu.Unlock() + }() + + cReqID := C.CString(reqID) + cURI := C.CString(redirectURI) + defer C.free(unsafe.Pointer(cReqID)) + defer C.free(unsafe.Pointer(cURI)) + + C.invoke_manual_captcha_cb(cb, cReqID, cURI) + + select { + case resp := <-slot.responseCh: + if resp == "" { + return "", "", fmt.Errorf("manual captcha returned empty response") + } + manualCaptchaLastSolveUnix.Store(time.Now().Unix()) + return "", resp, nil + case t := <-slot.tokenCh: + if t == "" { + return "", "", fmt.Errorf("manual captcha returned empty token") + } + manualCaptchaLastSolveUnix.Store(time.Now().Unix()) + return t, "", nil + case e := <-slot.errCh: + return "", "", e + case <-time.After(timeout): + return "", "", fmt.Errorf("manual captcha timeout after %s", timeout) + } +} + +// TurnBridgeGetManualCaptchaRetryRequest lets Swift fetch the retry +// URL + body template for a given request ID right after the +// callback fires. 
Returns a JSON string {"url":..., "body":...} or +// empty string if no retry is configured for this slot. Caller must +// free() the returned pointer. We use this pull-from-Swift pattern +// rather than passing retry params as callback arguments to avoid +// breaking the existing C callback ABI. +// +//export TurnBridgeGetManualCaptchaRetryRequest +func TurnBridgeGetManualCaptchaRetryRequest(cReqID *C.char) *C.char { + if cReqID == nil { + return nil + } + reqID := C.GoString(cReqID) + + manualCaptchaSlotsMu.Lock() + slot, ok := manualCaptchaSlots[reqID] + manualCaptchaSlotsMu.Unlock() + if !ok || slot.retryURL == "" { + return nil + } + payload := struct { + URL string `json:"url"` + Body string `json:"body"` + }{URL: slot.retryURL, Body: slot.retryBody} + b, err := json.Marshal(payload) + if err != nil { + return nil + } + return C.CString(string(b)) +} + +// TurnBridgeSubmitManualCaptchaResponse is the WebView's "I did the +// retry myself, here's the full JSON response" entry. Swift calls +// this instead of TurnBridgeSubmitManualCaptchaToken when the +// in-WebView fetch succeeded. 
+// +//export TurnBridgeSubmitManualCaptchaResponse +func TurnBridgeSubmitManualCaptchaResponse(cReqID *C.char, cResponseJSON *C.char) { + if cReqID == nil { + return + } + reqID := C.GoString(cReqID) + resp := "" + if cResponseJSON != nil { + resp = C.GoString(cResponseJSON) + } + + manualCaptchaSlotsMu.Lock() + slot, ok := manualCaptchaSlots[reqID] + manualCaptchaSlotsMu.Unlock() + if !ok { + return + } + select { + case slot.responseCh <- resp: + default: + } +} diff --git a/wireguard-apple/Sources/WireGuardKitGo/captcha_slider.go b/wireguard-apple/Sources/WireGuardKitGo/captcha_slider.go index 01642dc..beef098 100644 --- a/wireguard-apple/Sources/WireGuardKitGo/captcha_slider.go +++ b/wireguard-apple/Sources/WireGuardKitGo/captcha_slider.go @@ -21,14 +21,26 @@ const ( defaultSliderAttempts = 4 ) +// sliderRankSlot bounds the parallelism of the scoring step in +// rankSliderCandidates. Before F5 each ranking materialised the full +// rearranged image per candidate (49 × 600×600 × 4 = ~70 MB transient +// per solver), so cap-2 was a hard memory ceiling. F5 scores directly +// on the source image without materialising swaps — peak transient +// drops to a few KB — so the slot now exists only to bound CPU when +// many slider captchas land at once. Matches maxConcurrentCaptchaSolves +// so the captcha pipeline isn't artificially throttled below its own +// concurrency cap. +var sliderRankSlot = make(chan struct{}, maxConcurrentCaptchaSolves) + // vkReqFunc is the type for the VK API request helper from callCaptchaNotRobotAPI. 
type vkReqFunc func(method, postData string) (map[string]interface{}, error) type sliderCaptchaContent struct { Image image.Image - Size int // grid NxN - Steps []int // swap pairs - Attempts int // max submit attempts + GridW int // tile columns + GridH int // tile rows + Steps []int // swap pairs + Attempts int // max submit attempts } type sliderCandidate struct { @@ -45,13 +57,24 @@ func solveSliderCaptcha( baseParams string, browserFp string, hash string, + debugInfo string, settingsResp map[string]interface{}, + isTunnel bool, ) (string, error) { // Extract slider settings from the settings response sliderSettings := extractSliderSettings(settingsResp) log.Printf("slider: fetching captcha content (settings=%q)", sliderSettings) + // Open a captcha trap. Every artefact we collect during the solve + // is buffered in memory and either Discarded (on success) or + // Committed (on any failure path). The deferred Discard is the + // safety net — explicit Commit calls in the failure branches run + // first, and Commit/Discard are idempotent. + trap := newCaptchaTrap("slider") + defer trap.Discard() + trap.Note("settings_raw=%q", sliderSettings) + // Get scrambled image and swap instructions getContentData := baseParams if sliderSettings != "" { @@ -60,21 +83,61 @@ func solveSliderCaptcha( resp, err := vkReq("captchaNotRobot.getContent", getContentData) if err != nil { + trap.Note("getContent transport error: %v", err) + trap.Commit("getContent_transport_err") return "", fmt.Errorf("slider getContent: %w", err) } + // Save the raw getContent response and the image bytes as soon as + // we have them, BEFORE parsing — that way a new captcha variant + // that breaks parseSliderContent still leaves us a self-contained + // artefact to inspect. 
+ if rawJSON, jerr := json.MarshalIndent(resp, "", " "); jerr == nil { + trap.Save("getContent_response.json", rawJSON) + } + if respMap, ok := resp["response"].(map[string]interface{}); ok { + if imgStr, ok := respMap["image"].(string); ok && imgStr != "" { + if rawBytes, derr := base64.StdEncoding.DecodeString(imgStr); derr == nil { + ext := "bin" + if e, ok := respMap["extension"].(string); ok && e != "" { + ext = strings.ToLower(e) + } + trap.Save("image."+ext, rawBytes) + } + } + } + content, err := parseSliderContent(resp) if err != nil { + // status:ERROR / status:ERROR_LIMIT from slider getContent + // is VK rate-limiting us at the slider gate — count as a + // saturation hit so a high-N run doesn't keep spawning more + // sessions that will all hit the same wall. The fail streak + // resets on the next success. + markCaptchaSaturated(isTunnel) + trap.Note("parseSliderContent failed: %v", err) + trap.Commit("unparseable_response") return "", fmt.Errorf("slider parse: %w", err) } + trap.Note("parsed grid=%dx%d swaps=%d attempts=%d", + content.GridW, content.GridH, len(content.Steps)/2, content.Attempts) - log.Printf("slider: image=%dx%d grid=%d steps=%d attempts=%d", + log.Printf("slider: image=%dx%d grid=%dx%d steps=%d attempts=%d", content.Image.Bounds().Dx(), content.Image.Bounds().Dy(), - content.Size, len(content.Steps)/2, content.Attempts) - - // Rank candidate positions by pixel border continuity - candidates, err := rankSliderCandidates(content.Image, content.Size, content.Steps) + content.GridW, content.GridH, len(content.Steps)/2, content.Attempts) + + // Rank candidate positions by pixel border continuity. Gate the + // memory-heavy render+score with sliderRankSlot so we don't OOM + // the iOS extension when several captcha solves arrive in + // parallel. Plain blocking send is fine — each ranking finishes + // in ~100 ms, so a stuck sender waits at most that long for a + // slot to free. 
+ sliderRankSlot <- struct{}{} + candidates, err := rankSliderCandidates(content.Image, content.GridW, content.GridH, content.Steps) + <-sliderRankSlot if err != nil { + trap.Note("rank failed: %v", err) + trap.Commit("rank_failed") return "", fmt.Errorf("slider rank: %w", err) } @@ -92,6 +155,8 @@ func solveSliderCaptcha( answer, err := encodeSliderAnswer(c.ActiveSteps) if err != nil { + trap.Note("encodeSliderAnswer failed: %v", err) + trap.Commit("encode_answer_err") return "", err } @@ -105,29 +170,47 @@ func solveSliderCaptcha( neturl.QueryEscape(cursor), neturl.QueryEscape("[]"), neturl.QueryEscape("[]"), neturl.QueryEscape("[]"), browserFp, hash, neturl.QueryEscape(answer), - "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855", + debugInfo, ) checkResp, err := vkReq("captchaNotRobot.check", checkData) if err != nil { + trap.Note("attempt %d/%d transport err: %v", i+1, maxTries, err) + trap.Commit("check_transport_err") return "", fmt.Errorf("slider check: %w", err) } respObj, ok := checkResp["response"].(map[string]interface{}) if !ok { + trap.Note("attempt %d/%d invalid response: %v", i+1, maxTries, checkResp) + trap.Commit("check_invalid_response") return "", fmt.Errorf("slider check: invalid response") } status, _ := respObj["status"].(string) + trap.Note("attempt %d/%d position=%d score=%d → status=%s", + i+1, maxTries, c.Index, c.Score, status) switch status { case "OK": successToken, _ := respObj["success_token"].(string) if successToken == "" { + trap.Note("OK but success_token missing in: %v", respObj) + trap.Commit("ok_without_token") return "", fmt.Errorf("slider: success_token not found") } log.Printf("slider: solved! position=%d (attempt %d/%d)", c.Index, i+1, maxTries) + // Commit solved captchas too so the user can actually see + // what our solver is processing. The reason field marks + // them "solved_ok"; unsolved entries use other reasons. 
+ // Without this commit a healthy run produces an empty trap + // dir, which looks indistinguishable from "the trap isn't + // wired correctly". + trap.Note("SOLVED at attempt %d/%d, position=%d", i+1, maxTries, c.Index) + trap.Commit("solved_ok") return successToken, nil case "ERROR_LIMIT": + markCaptchaSaturated(isTunnel) + trap.Commit("error_limit") return "", fmt.Errorf("slider: ERROR_LIMIT") default: log.Printf("slider: position=%d rejected (status=%s)", c.Index, status) @@ -135,6 +218,7 @@ func solveSliderCaptcha( } } + trap.Commit("all_guesses_rejected") return "", fmt.Errorf("slider: all %d guesses rejected", maxTries) } @@ -235,7 +319,7 @@ func parseSliderContent(resp map[string]interface{}) (*sliderCaptchaContent, err return nil, err } - size, swaps, attempts, err := parseSliderSteps(steps) + gridW, gridH, swaps, attempts, err := parseSliderSteps(steps) if err != nil { return nil, err } @@ -247,7 +331,8 @@ func parseSliderContent(resp map[string]interface{}) (*sliderCaptchaContent, err return &sliderCaptchaContent{ Image: img, - Size: size, + GridW: gridW, + GridH: gridH, Steps: swaps, Attempts: attempts, }, nil @@ -274,30 +359,91 @@ func parseIntSlice(raw []interface{}) ([]int, error) { return values, nil } -func parseSliderSteps(steps []int) (int, []int, int, error) { +// parseSliderSteps decodes VK's `steps` array. Two formats observed: +// +// square: [size, swap_pairs..., attempts?] // tile grid = size×size +// rect: [width, height, swap_pairs..., attempts?] // tile grid = width×height +// +// VK started serving the rectangular variant (3×7 word-strip layouts: +// ШАПОЧКИ / КОРРУПЦИЯ / СКЕПТИЦИЗМ etc.) where the old square parser +// produces tile-counts that don't contain the swap indices and the +// renderer scrambles the image instead of unscrambling. We try +// square first (backward-compatible: pre-existing 3×3, 4×4, etc. 
+// captchas keep parsing the same way), then rect, then bail with the +// raw payload logged so a third format can be added without +// guesswork. +func parseSliderSteps(steps []int) (gridW int, gridH int, swaps []int, attempts int, err error) { if len(steps) < 3 { - return 0, nil, 0, fmt.Errorf("steps too short: %d", len(steps)) + return 0, 0, nil, 0, fmt.Errorf("steps too short: %d", len(steps)) + } + log.Printf("slider: raw steps payload: %v", steps) + + if w, h, sw, at, ok := decodeSliderStepsSquare(steps); ok { + log.Printf("slider: parsed as %dx%d (square format), %d candidates, %d attempts", + w, h, len(sw)/2, at) + return w, h, sw, at, nil } + if w, h, sw, at, ok := decodeSliderStepsRect(steps); ok { + log.Printf("slider: parsed as %dx%d (rect format), %d candidates, %d attempts", + w, h, len(sw)/2, at) + return w, h, sw, at, nil + } + return 0, 0, nil, 0, fmt.Errorf("unrecognised steps payload %v", steps) +} +func decodeSliderStepsSquare(steps []int) (w, h int, swaps []int, attempts int, ok bool) { size := steps[0] if size <= 0 { - return 0, nil, 0, fmt.Errorf("invalid grid size: %d", size) + return 0, 0, nil, 0, false } - - remaining := append([]int(nil), steps[1:]...) - attempts := defaultSliderAttempts - if len(remaining)%2 != 0 { - attempts = remaining[len(remaining)-1] - remaining = remaining[:len(remaining)-1] + tileCount := size * size + rest := append([]int(nil), steps[1:]...) 
+ attempts = defaultSliderAttempts + if len(rest)%2 != 0 { + attempts = rest[len(rest)-1] + rest = rest[:len(rest)-1] } if attempts <= 0 { attempts = defaultSliderAttempts } - if len(remaining) == 0 || len(remaining)%2 != 0 { - return 0, nil, 0, fmt.Errorf("invalid swap payload") + if len(rest) == 0 || len(rest)%2 != 0 { + return 0, 0, nil, 0, false + } + for _, v := range rest { + if v < 0 || v >= tileCount { + return 0, 0, nil, 0, false + } } + return size, size, rest, attempts, true +} - return size, remaining, attempts, nil +func decodeSliderStepsRect(steps []int) (w, h int, swaps []int, attempts int, ok bool) { + if len(steps) < 4 { + return 0, 0, nil, 0, false + } + width, height := steps[0], steps[1] + if width <= 0 || height <= 0 { + return 0, 0, nil, 0, false + } + tileCount := width * height + rest := append([]int(nil), steps[2:]...) + attempts = defaultSliderAttempts + if len(rest)%2 != 0 { + attempts = rest[len(rest)-1] + rest = rest[:len(rest)-1] + } + if attempts <= 0 { + attempts = defaultSliderAttempts + } + if len(rest) == 0 || len(rest)%2 != 0 { + return 0, 0, nil, 0, false + } + for _, v := range rest { + if v < 0 || v >= tileCount { + return 0, 0, nil, 0, false + } + } + return width, height, rest, attempts, true } func decodeSliderImage(rawImage string) (image.Image, error) { @@ -325,7 +471,7 @@ func encodeSliderAnswer(activeSteps []int) (string, error) { // rankSliderCandidates analyzes each candidate permutation and ranks by // pixel border continuity (lower score = better match = more likely correct). 
-func rankSliderCandidates(img image.Image, gridSize int, swaps []int) ([]sliderCandidate, error) { +func rankSliderCandidates(img image.Image, gridW, gridH int, swaps []int) ([]sliderCandidate, error) { candidateCount := len(swaps) / 2 if candidateCount == 0 { return nil, fmt.Errorf("no candidates") @@ -334,17 +480,19 @@ func rankSliderCandidates(img image.Image, gridSize int, swaps []int) ([]sliderC candidates := make([]sliderCandidate, 0, candidateCount) for idx := 1; idx <= candidateCount; idx++ { activeSteps := buildSliderActiveSteps(swaps, idx) - mapping, err := buildSliderTileMapping(gridSize, activeSteps) - if err != nil { - return nil, err - } - - rendered, err := renderSliderCandidate(img, gridSize, mapping) + mapping, err := buildSliderTileMapping(gridW, gridH, activeSteps) if err != nil { return nil, err } - score := scoreRenderedSliderImage(rendered, gridSize) + // F5: score directly on the source image without rendering a + // full RGBA buffer per candidate. The seam-energy metric only + // needs pixel values at adjacent-tile boundaries, which we + // can look up via the mapping (destination position d's + // pixels come from source tile mapping[d]). Drops slider rank + // peak memory from ~140 MB to a few KB, lets the slot cap be + // raised back to maxConcurrentCaptchaSolves. + score := scoreSliderMapping(img, gridW, gridH, mapping) candidates = append(candidates, sliderCandidate{ Index: idx, ActiveSteps: activeSteps, @@ -362,6 +510,63 @@ func rankSliderCandidates(img image.Image, gridSize int, swaps []int) ([]sliderC return candidates, nil } +// scoreSliderMapping computes a seam-energy score for a candidate +// tile mapping without materialising the rearranged image. For every +// pair of adjacent destination positions it looks up the source +// tiles via the mapping and sums pixel differences across the shared +// border directly on `img`. 
The correct (=originally-arranged) +// mapping produces the lowest total energy; rankSliderCandidates +// sorts ascending and picks the top one. +// +// Equivalent to first rendering the rearranged image and then +// scoring across its inter-tile borders, except this skips a +// 600×600×4-byte allocation per candidate and slashes ranking peak +// memory by ~70 MB. +func scoreSliderMapping(img image.Image, gridW, gridH int, mapping []int) int64 { + bounds := img.Bounds() + var score int64 + + // Horizontal seams: dest left tile's right edge vs dest right + // tile's left edge. Source tile rects for each give the pixels. + for row := 0; row < gridH; row++ { + for col := 0; col < gridW-1; col++ { + srcLeft := sliderTileRect(bounds, gridW, gridH, mapping[row*gridW+col]) + srcRight := sliderTileRect(bounds, gridW, gridH, mapping[row*gridW+col+1]) + height := srcLeft.Dy() + if h := srcRight.Dy(); h < height { + height = h + } + for y := 0; y < height; y++ { + score += pixelDiff( + img.At(srcLeft.Max.X-1, srcLeft.Min.Y+y), + img.At(srcRight.Min.X, srcRight.Min.Y+y), + ) + } + } + } + + // Vertical seams: dest top tile's bottom edge vs dest bottom + // tile's top edge. + for row := 0; row < gridH-1; row++ { + for col := 0; col < gridW; col++ { + srcTop := sliderTileRect(bounds, gridW, gridH, mapping[row*gridW+col]) + srcBottom := sliderTileRect(bounds, gridW, gridH, mapping[(row+1)*gridW+col]) + width := srcTop.Dx() + if w := srcBottom.Dx(); w < width { + width = w + } + for x := 0; x < width; x++ { + score += pixelDiff( + img.At(srcTop.Min.X+x, srcTop.Max.Y-1), + img.At(srcBottom.Min.X+x, srcBottom.Min.Y), + ) + } + } + } + + return score +} + func buildSliderActiveSteps(swaps []int, candidateIndex int) []int { if candidateIndex <= 0 { return []int{} @@ -373,8 +578,8 @@ func buildSliderActiveSteps(swaps []int, candidateIndex int) []int { return append([]int(nil), swaps[:end]...) 
} -func buildSliderTileMapping(gridSize int, activeSteps []int) ([]int, error) { - tileCount := gridSize * gridSize +func buildSliderTileMapping(gridW, gridH int, activeSteps []int) ([]int, error) { + tileCount := gridW * gridH if tileCount <= 0 { return nil, fmt.Errorf("invalid tile count") } @@ -396,87 +601,16 @@ func buildSliderTileMapping(gridSize int, activeSteps []int) ([]int, error) { return mapping, nil } -func renderSliderCandidate(img image.Image, gridSize int, mapping []int) (*image.RGBA, error) { - tileCount := gridSize * gridSize - if len(mapping) != tileCount { - return nil, fmt.Errorf("mapping length %d != %d", len(mapping), tileCount) - } - - bounds := img.Bounds() - rendered := image.NewRGBA(bounds) - for dstIdx, srcIdx := range mapping { - srcRect := sliderTileRect(bounds, gridSize, srcIdx) - dstRect := sliderTileRect(bounds, gridSize, dstIdx) - copyTile(rendered, dstRect, img, srcRect) - } - return rendered, nil -} - -func scoreRenderedSliderImage(img image.Image, gridSize int) int64 { - bounds := img.Bounds() - var score int64 - - // Horizontal borders (left tile right edge vs right tile left edge) - for row := 0; row < gridSize; row++ { - for col := 0; col < gridSize-1; col++ { - leftRect := sliderTileRect(bounds, gridSize, row*gridSize+col) - rightRect := sliderTileRect(bounds, gridSize, row*gridSize+col+1) - height := leftRect.Dy() - if h := rightRect.Dy(); h < height { - height = h - } - for y := 0; y < height; y++ { - score += pixelDiff( - img.At(leftRect.Max.X-1, leftRect.Min.Y+y), - img.At(rightRect.Min.X, rightRect.Min.Y+y), - ) - } - } - } - - // Vertical borders (top tile bottom edge vs bottom tile top edge) - for row := 0; row < gridSize-1; row++ { - for col := 0; col < gridSize; col++ { - topRect := sliderTileRect(bounds, gridSize, row*gridSize+col) - bottomRect := sliderTileRect(bounds, gridSize, (row+1)*gridSize+col) - width := topRect.Dx() - if w := bottomRect.Dx(); w < width { - width = w - } - for x := 0; x < width; x++ { - 
score += pixelDiff( - img.At(topRect.Min.X+x, topRect.Max.Y-1), - img.At(bottomRect.Min.X+x, bottomRect.Min.Y), - ) - } - } - } - - return score -} - -func sliderTileRect(bounds image.Rectangle, gridSize, index int) image.Rectangle { - row := index / gridSize - col := index % gridSize - x0 := bounds.Min.X + col*bounds.Dx()/gridSize - x1 := bounds.Min.X + (col+1)*bounds.Dx()/gridSize - y0 := bounds.Min.Y + row*bounds.Dy()/gridSize - y1 := bounds.Min.Y + (row+1)*bounds.Dy()/gridSize +func sliderTileRect(bounds image.Rectangle, gridW, gridH, index int) image.Rectangle { + row := index / gridW + col := index % gridW + x0 := bounds.Min.X + col*bounds.Dx()/gridW + x1 := bounds.Min.X + (col+1)*bounds.Dx()/gridW + y0 := bounds.Min.Y + row*bounds.Dy()/gridH + y1 := bounds.Min.Y + (row+1)*bounds.Dy()/gridH return image.Rect(x0, y0, x1, y1) } -func copyTile(dst *image.RGBA, dstRect image.Rectangle, src image.Image, srcRect image.Rectangle) { - dw, dh := dstRect.Dx(), dstRect.Dy() - sw, sh := srcRect.Dx(), srcRect.Dy() - for y := 0; y < dh; y++ { - sy := srcRect.Min.Y + y*sh/dh - for x := 0; x < dw; x++ { - sx := srcRect.Min.X + x*sw/dw - dst.Set(dstRect.Min.X+x, dstRect.Min.Y+y, src.At(sx, sy)) - } - } -} - func pixelDiff(a, b color.Color) int64 { ar, ag, ab, _ := a.RGBA() br, bg, bb, _ := b.RGBA() diff --git a/wireguard-apple/Sources/WireGuardKitGo/captcha_stats.go b/wireguard-apple/Sources/WireGuardKitGo/captcha_stats.go new file mode 100644 index 0000000..bdc29fc --- /dev/null +++ b/wireguard-apple/Sources/WireGuardKitGo/captcha_stats.go @@ -0,0 +1,243 @@ +// SPDX-License-Identifier: MIT +// +// Per-connect captcha solve counters and saturation flags. +// +// Two egress IPs feed our captcha solves once phased bring-up is on: +// +// * "direct" — the user's mobile IP, used by the bootstrap session +// before WG comes up. 
+// * "tunnel" — the WG server's egress IP, used by every session +// spawned AFTER WG handshake completes (the extension's +// own net/http auto-routes through utun under +// includeAllNetworks=true). +// +// VK enforces captcha.isNotRobot rate-limits per source IP, so the two +// pools have independent budgets. The UI surfaces both counts so the +// user can see how many sessions came up via each route. The two +// saturation flags (`direct` / `tunnel`) flip after satThreshold +// consecutive ERROR_LIMITs in that mode (auto-clearing after captchaCooldown) and let StartProxy stop spawning new +// sessions once the tunneled egress is also exhausted. + +package main + +/* +#include +*/ +import "C" + +import ( + "sync/atomic" + "time" +) + +var ( + captchaDirectOK atomic.Int64 + captchaTunnelOK atomic.Int64 + captchaDirectAttempts atomic.Int64 // total captcha solve attempts started on direct egress + captchaTunnelAttempts atomic.Int64 // ditto for tunnel egress + captchaDirectInFlight atomic.Int64 // currently mid-solve on direct + captchaTunnelInFlight atomic.Int64 // ditto for tunnel + captchaDirectFailStreak atomic.Int64 // consecutive ERROR_LIMITs on direct egress without a success + captchaTunnelFailStreak atomic.Int64 // ditto for tunnel + captchaDirectSatAt atomic.Int64 // unix-nano timestamp of last ERROR_LIMIT on direct + captchaTunnelSatAt atomic.Int64 // unix-nano timestamp of last ERROR_LIMIT on tunnel + captchaTunnelEgress atomic.Bool // true once we believe HTTP from this extension routes through utun + captchaSessionsReady atomic.Int64 // DTLS sessions that have reached sessionOk + captchaSessionsTarget atomic.Int64 // requested N + + // Remote-server captcha pool stats. Server-cluster solves are + // completely opaque to the markCaptcha{Attempt,Success,Saturated} + // helpers below because the captcha never touches this phone's + // HTTP stack — getCredsRemote just receives a finished cred.
The + // counters below are bumped exclusively from remote_creds.go on + // every /cred call so the UI can surface the cluster's + // contribution alongside Direct/Tunnel. + captchaRemoteOK atomic.Int64 + captchaRemoteAttempts atomic.Int64 + captchaRemoteInFlight atomic.Int64 +) + +// satThreshold is the number of consecutive ERROR_LIMITs that count as +// "egress saturated, stop spawning more sessions there". One failure is +// noise; three in a row is a genuine rate-limit pattern. +const satThreshold = 3 + +// captchaCooldown is how long the saturated flag stays sticky after the +// last ERROR_LIMIT. VK's per-IP captcha rate-limit windows are short +// (~60 s in practice), so once a minute has elapsed without a fresh +// failure we let the spawn paths try again. Without this the system +// gives up forever after one rate-limit burst, even if the network +// would have recovered. +const captchaCooldown = 60 * time.Second + +func resetCaptchaStats() { + captchaDirectOK.Store(0) + captchaTunnelOK.Store(0) + captchaDirectAttempts.Store(0) + captchaTunnelAttempts.Store(0) + captchaDirectInFlight.Store(0) + captchaTunnelInFlight.Store(0) + captchaDirectFailStreak.Store(0) + captchaTunnelFailStreak.Store(0) + captchaDirectSatAt.Store(0) + captchaTunnelSatAt.Store(0) + captchaTunnelEgress.Store(false) + captchaSessionsReady.Store(0) + captchaSessionsTarget.Store(0) + captchaRemoteOK.Store(0) + captchaRemoteAttempts.Store(0) + captchaRemoteInFlight.Store(0) +} + +//export TurnBridgeGetCaptchaRemoteCount +func TurnBridgeGetCaptchaRemoteCount() C.int { + return C.int(captchaRemoteOK.Load()) +} + +//export TurnBridgeGetCaptchaRemoteAttempts +func TurnBridgeGetCaptchaRemoteAttempts() C.int { + return C.int(captchaRemoteAttempts.Load()) +} + +//export TurnBridgeGetCaptchaRemoteInFlight +func TurnBridgeGetCaptchaRemoteInFlight() C.int { + return C.int(captchaRemoteInFlight.Load()) +} + +// markCaptchaAttemptStart bumps the in-flight gauge for the egress +// this attempt will 
use. forceDirect=true means the caller is pinning +// a physical interface to bypass utun (see cellularDial), so the +// attempt should be counted against the direct bucket even though +// captchaTunnelEgress is true. +func markCaptchaAttemptStart(forceDirect bool) (isTunnel bool) { + isTunnel = captchaTunnelEgress.Load() && !forceDirect + if isTunnel { + captchaTunnelAttempts.Add(1) + captchaTunnelInFlight.Add(1) + return true + } + captchaDirectAttempts.Add(1) + captchaDirectInFlight.Add(1) + return false +} + +func markCaptchaAttemptDone(isTunnel bool) { + if isTunnel { + captchaTunnelInFlight.Add(-1) + } else { + captchaDirectInFlight.Add(-1) + } +} + +// markCaptchaSuccess clears the streak for the egress that just got a +// success_token. The isTunnel flag is the one returned from +// markCaptchaAttemptStart so a force-direct retry credits the right +// pool even when captchaTunnelEgress is globally true. +func markCaptchaSuccess(isTunnel bool) { + if isTunnel { + captchaTunnelOK.Add(1) + captchaTunnelFailStreak.Store(0) + captchaTunnelSatAt.Store(0) + } else { + captchaDirectOK.Add(1) + captchaDirectFailStreak.Store(0) + captchaDirectSatAt.Store(0) + } +} + +// markCaptchaSaturated records an ERROR_LIMIT against the egress this +// attempt actually used. Stamps the timestamp so the cooldown in +// directSaturated/tunnelSaturated can auto-clear after captchaCooldown. +func markCaptchaSaturated(isTunnel bool) { + now := time.Now().UnixNano() + if isTunnel { + captchaTunnelFailStreak.Add(1) + captchaTunnelSatAt.Store(now) + } else { + captchaDirectFailStreak.Add(1) + captchaDirectSatAt.Store(now) + } +} + +// saturatedWithCooldown is the shared check + auto-decay for both +// egresses. If the streak is at threshold but captchaCooldown has +// elapsed since the last ERROR_LIMIT, clear the streak and report +// not-saturated so the spawn paths can probe again. 
+func saturatedWithCooldown(streak *atomic.Int64, satAt *atomic.Int64) bool { + if streak.Load() < satThreshold { + return false + } + last := satAt.Load() + if last == 0 { + return true + } + if time.Now().UnixNano()-last > captchaCooldown.Nanoseconds() { + streak.Store(0) + satAt.Store(0) + return false + } + return true +} + +func directSaturated() bool { + return saturatedWithCooldown(&captchaDirectFailStreak, &captchaDirectSatAt) +} +func tunnelSaturated() bool { + return saturatedWithCooldown(&captchaTunnelFailStreak, &captchaTunnelSatAt) +} + +//export TurnBridgeGetCaptchaDirectCount +func TurnBridgeGetCaptchaDirectCount() C.int { + return C.int(captchaDirectOK.Load()) +} + +//export TurnBridgeGetCaptchaTunnelCount +func TurnBridgeGetCaptchaTunnelCount() C.int { + return C.int(captchaTunnelOK.Load()) +} + +//export TurnBridgeGetCaptchaDirectAttempts +func TurnBridgeGetCaptchaDirectAttempts() C.int { + return C.int(captchaDirectAttempts.Load()) +} + +//export TurnBridgeGetCaptchaTunnelAttempts +func TurnBridgeGetCaptchaTunnelAttempts() C.int { + return C.int(captchaTunnelAttempts.Load()) +} + +//export TurnBridgeGetCaptchaDirectInFlight +func TurnBridgeGetCaptchaDirectInFlight() C.int { + return C.int(captchaDirectInFlight.Load()) +} + +//export TurnBridgeGetCaptchaTunnelInFlight +func TurnBridgeGetCaptchaTunnelInFlight() C.int { + return C.int(captchaTunnelInFlight.Load()) +} + +//export TurnBridgeIsCaptchaDirectSaturated +func TurnBridgeIsCaptchaDirectSaturated() C.int { + if directSaturated() { + return 1 + } + return 0 +} + +//export TurnBridgeIsCaptchaTunnelSaturated +func TurnBridgeIsCaptchaTunnelSaturated() C.int { + if tunnelSaturated() { + return 1 + } + return 0 +} + +//export TurnBridgeGetSessionsReady +func TurnBridgeGetSessionsReady() C.int { + return C.int(captchaSessionsReady.Load()) +} + +//export TurnBridgeGetSessionsTarget +func TurnBridgeGetSessionsTarget() C.int { + return C.int(captchaSessionsTarget.Load()) +} diff --git 
a/wireguard-apple/Sources/WireGuardKitGo/captcha_trap.go b/wireguard-apple/Sources/WireGuardKitGo/captcha_trap.go new file mode 100644 index 0000000..d97e4d6 --- /dev/null +++ b/wireguard-apple/Sources/WireGuardKitGo/captcha_trap.go @@ -0,0 +1,230 @@ +// SPDX-License-Identifier: MIT +// +// Captcha trap ("мухоловка"): buffers every captcha challenge in memory +// while the solver runs and flushes the buffer to disk ONLY if the solve +// ultimately fails. Successful solves leave nothing behind. Failed +// solves drop a self-contained folder (raw VK response JSON, the image +// bytes, a notes log) into the App Group container so we can inspect +// captcha variants we don't yet handle. +// +// Wiring: Swift creates the trap directory under the App Group container +// and pushes the absolute path here via TurnBridgeSetCaptchaTrapDir +// before StartProxy. If the path is empty, every trap call is a no-op +// (the feature simply doesn't engage). + +package main + +/* +#include +*/ +import "C" + +import ( + "crypto/rand" + "encoding/hex" + "fmt" + "log" + "os" + "path/filepath" + "strings" + "sync" + "sync/atomic" + "time" +) + +var ( + captchaTrapDir atomic.Value // string +) + +//export TurnBridgeSetCaptchaTrapDir +func TurnBridgeSetCaptchaTrapDir(cPath *C.char) { + if cPath == nil { + captchaTrapDir.Store("") + return + } + path := C.GoString(cPath) + captchaTrapDir.Store(path) + if path == "" { + log.Printf("captcha-trap: disabled (empty path)") + return + } + if err := os.MkdirAll(path, 0o755); err != nil { + log.Printf("captcha-trap: mkdir %q failed: %v — feature off", path, err) + captchaTrapDir.Store("") + return + } + // Write a probe file so the path is verifiable end-to-end without + // waiting for a captcha to actually fail. If the user opens + // "Captured Captchas" and sees nothing AND no probe, the issue is + // path/permission. If they see the probe but no failure folders, + // the slider solver simply hasn't tripped this session. 
+ probePath := filepath.Join(path, "_probe.txt") + probeContent := fmt.Sprintf("trap dir: %s\nwritten: %s\npid: %d\n", + path, time.Now().Format(time.RFC3339), os.Getpid()) + if err := os.WriteFile(probePath, []byte(probeContent), 0o644); err != nil { + log.Printf("captcha-trap: probe write failed: %v", err) + } else { + log.Printf("captcha-trap: artifacts → %s (probe written)", path) + } +} + +func captchaTrapRoot() string { + v, _ := captchaTrapDir.Load().(string) + return v +} + +type captchaTrap struct { + label string + started time.Time + + mu sync.Mutex + files map[string][]byte + notes []string + flushed bool + hasImage bool // set true the moment a non-empty image.* artefact lands +} + +// newCaptchaTrap opens an in-memory artifact buffer. Safe to call even +// when the trap is disabled (returns a no-op handle). +func newCaptchaTrap(label string) *captchaTrap { + return &captchaTrap{ + label: label, + started: time.Now(), + files: map[string][]byte{}, + } +} + +// Save records an artifact (a file that will be written to disk if the +// trap commits). The data is copied so callers can reuse the slice. +func (t *captchaTrap) Save(name string, data []byte) { + if t == nil || captchaTrapRoot() == "" { + return + } + t.mu.Lock() + defer t.mu.Unlock() + if t.flushed { + return + } + cp := make([]byte, len(data)) + copy(cp, data) + clean := sanitizeArtifactName(name) + t.files[clean] = cp + if len(cp) > 0 && strings.HasPrefix(clean, "image.") { + t.hasImage = true + } +} + +// Note appends a human-readable line that lands in notes.log on commit. +func (t *captchaTrap) Note(format string, args ...any) { + if t == nil || captchaTrapRoot() == "" { + return + } + t.mu.Lock() + defer t.mu.Unlock() + if t.flushed { + return + } + t.notes = append(t.notes, fmt.Sprintf("[%s] %s", + time.Now().Format("15:04:05.000"), + fmt.Sprintf(format, args...))) +} + +// Commit flushes the buffer to disk under a fresh subdirectory. 
Safe to +// call multiple times; only the first call writes. Commits without +// an image artefact are skipped — text-only "VK said ERROR" entries +// have no diagnostic value and just clutter the trap directory. +func (t *captchaTrap) Commit(reason string) { + if t == nil { + return + } + root := captchaTrapRoot() + if root == "" { + return + } + t.mu.Lock() + defer t.mu.Unlock() + if t.flushed { + return + } + t.flushed = true + + if !t.hasImage { + log.Printf("captcha-trap: skip commit, no image to capture (reason=%s)", reason) + t.files = nil + t.notes = nil + return + } + + subdir := filepath.Join(root, fmt.Sprintf("%s_%s_%s", + t.started.Format("20060102_150405"), + t.label, + shortRandHex(3))) + if err := os.MkdirAll(subdir, 0o755); err != nil { + log.Printf("captcha-trap: commit mkdir %q failed: %v", subdir, err) + return + } + + for name, data := range t.files { + if err := os.WriteFile(filepath.Join(subdir, name), data, 0o644); err != nil { + log.Printf("captcha-trap: write %s/%s failed: %v", subdir, name, err) + } + } + + notesBlob := strings.Builder{} + fmt.Fprintf(&notesBlob, "label: %s\n", t.label) + fmt.Fprintf(&notesBlob, "reason: %s\n", reason) + fmt.Fprintf(&notesBlob, "started: %s\n", t.started.Format(time.RFC3339Nano)) + fmt.Fprintf(&notesBlob, "duration: %s\n", time.Since(t.started)) + notesBlob.WriteString("---\n") + for _, n := range t.notes { + notesBlob.WriteString(n) + notesBlob.WriteByte('\n') + } + _ = os.WriteFile(filepath.Join(subdir, "notes.log"), []byte(notesBlob.String()), 0o644) + + log.Printf("captcha-trap: saved %d artefacts to %s (reason=%s)", + len(t.files), subdir, reason) +} + +// Discard drops the in-memory buffer without touching disk. The deferred +// safety net for the happy path: if the solve returns a success token, +// Discard frees the buffer and nothing is persisted.
+func (t *captchaTrap) Discard() { + if t == nil { + return + } + t.mu.Lock() + defer t.mu.Unlock() + if t.flushed { + return + } + t.flushed = true + t.files = nil + t.notes = nil +} + +func sanitizeArtifactName(name string) string { + // Keep filenames flat and predictable — anything iOS' file browsers + // can choke on (slashes, leading dots) gets normalised away. + cleaned := strings.Map(func(r rune) rune { + switch { + case r == '/' || r == '\\' || r == 0: + return '_' + default: + return r + } + }, name) + cleaned = strings.TrimLeft(cleaned, ".") + if cleaned == "" { + cleaned = "artifact" + } + return cleaned +} + +func shortRandHex(n int) string { + b := make([]byte, n) + if _, err := rand.Read(b); err != nil { + return "noid" + } + return hex.EncodeToString(b) +} diff --git a/wireguard-apple/Sources/WireGuardKitGo/dns_resolver.go b/wireguard-apple/Sources/WireGuardKitGo/dns_resolver.go new file mode 100644 index 0000000..1c92067 --- /dev/null +++ b/wireguard-apple/Sources/WireGuardKitGo/dns_resolver.go @@ -0,0 +1,187 @@ +// SPDX-License-Identifier: MIT +// +// Resilient DNS for VK captcha/identity HTTP. The system resolver on +// mobile carriers in censorship-heavy regions sometimes returns +// NXDOMAIN, hijacked IPs, or hangs on api.vk.com / id.vk.com lookups, +// even when the underlying network is otherwise fine. The captcha +// solver then errors out with "no such host" or a timeout before any +// of our retry logic can engage. +// +// customDial is a drop-in replacement for net.Dialer.DialContext that +// layers: +// 1. literal IP addresses — dial immediately, no resolution. +// 2. system resolver — 4 s budget. Works on WiFi where the +// carrier isn't censoring. +// 3. DNS-over-HTTPS (DoH) — Cloudflare's 1.1.1.1 JSON endpoint +// by IP, so the lookup itself needs +// no DNS. Cached for 10 minutes per +// hostname to avoid hammering DoH. +// 4. fallback IP map — last-resort hardcoded A records for +// VK domains, in case DoH is also +// blocked. 
Stale risk but better than +// a hard failure. +// +// The TLS handshake uses the original hostname (Go's http.Transport +// passes the request URL host as SNI/ServerName regardless of what +// DialContext returned), so dialing to a raw IP doesn't break cert +// verification. + +package main + +import ( + "context" + "crypto/tls" + "encoding/json" + "fmt" + "io" + "log" + "net" + "net/http" + "strings" + "sync" + "time" +) + +const ( + dohURL = "https://1.1.1.1/dns-query" + dohCacheTTL = 10 * time.Minute + systemDialBudget = 4 * time.Second + dohDialBudget = 6 * time.Second +) + +// dohClient is used ONLY for the DoH lookup itself. Plain net.Dialer +// so we don't recurse into customDial. +var dohClient = &http.Client{ + Timeout: 5 * time.Second, + Transport: &http.Transport{ + DialContext: (&net.Dialer{Timeout: 4 * time.Second}).DialContext, + TLSClientConfig: &tls.Config{}, + }, +} + +type dohEntry struct { + ips []string + expires time.Time +} + +var dohCache sync.Map // host -> dohEntry + +// Last-resort hardcoded A records. Used only if BOTH system resolver +// and DoH fail. VK's API endpoints have lived on these IPs for a long +// time; refresh manually if VK migrates infrastructure. +var fallbackIPs = map[string][]string{ + "login.vk.com": {"87.240.132.78", "87.240.137.158"}, + "api.vk.com": {"87.240.132.78", "87.240.137.158"}, + "id.vk.com": {"87.240.132.78", "87.240.137.158"}, + "vk.com": {"87.240.132.78", "87.240.137.158"}, + "m.vk.com": {"87.240.132.78"}, + // keep .ru hosts too in case some upstream code path still + // hits them (and they're reachable on the user's network). + "login.vk.ru": {"87.240.137.158", "87.240.190.78"}, + "api.vk.ru": {"87.240.137.158", "87.240.190.78"}, + "id.vk.ru": {"87.240.137.158", "87.240.190.78"}, + "vk.ru": {"87.240.137.158"}, +} + +// customDial is the net.Dialer.DialContext-shaped function plug into +// http.Transport on any HTTP client that needs censorship-tolerant DNS. 
+func customDial(ctx context.Context, network, address string) (net.Conn, error) { + host, port, err := net.SplitHostPort(address) + if err != nil { + return nil, err + } + + // Fast path: literal IP needs no resolution. + if net.ParseIP(host) != nil { + return (&net.Dialer{Timeout: 8 * time.Second}).DialContext(ctx, network, address) + } + + // Layer 1: system resolver. + d := &net.Dialer{Timeout: dohDialBudget} + sysCtx, cancel := context.WithTimeout(ctx, systemDialBudget) + conn, sysErr := d.DialContext(sysCtx, network, address) + cancel() + if sysErr == nil { + return conn, nil + } + log.Printf("dns: system resolve+dial failed for %s: %v — falling back to DoH", host, sysErr) + + // Layer 2: DoH. + if ips, err := resolveViaDoH(ctx, host); err == nil && len(ips) > 0 { + log.Printf("dns: DoH %s → %v", host, ips) + for _, ip := range ips { + c, derr := d.DialContext(ctx, network, net.JoinHostPort(ip, port)) + if derr == nil { + return c, nil + } + log.Printf("dns: dial %s (DoH) failed: %v", ip, derr) + } + } else if err != nil { + log.Printf("dns: DoH lookup failed for %s: %v", host, err) + } + + // Layer 3: hardcoded fallback. 
+ if ips, ok := fallbackIPs[strings.ToLower(host)]; ok { + log.Printf("dns: trying hardcoded fallback IPs for %s: %v", host, ips) + for _, ip := range ips { + c, derr := d.DialContext(ctx, network, net.JoinHostPort(ip, port)) + if derr == nil { + return c, nil + } + log.Printf("dns: dial %s (fallback) failed: %v", ip, derr) + } + } + + return nil, fmt.Errorf("all DNS layers exhausted for %s (sys=%v)", host, sysErr) +} + +func resolveViaDoH(ctx context.Context, host string) ([]string, error) { + host = strings.ToLower(host) + if v, ok := dohCache.Load(host); ok { + if entry, ok := v.(dohEntry); ok && time.Now().Before(entry.expires) { + return entry.ips, nil + } + } + + url := dohURL + "?name=" + host + "&type=A" + req, err := http.NewRequestWithContext(ctx, "GET", url, nil) + if err != nil { + return nil, err + } + req.Header.Set("accept", "application/dns-json") + + resp, err := dohClient.Do(req) + if err != nil { + return nil, err + } + defer resp.Body.Close() + body, err := io.ReadAll(resp.Body) + if err != nil { + return nil, err + } + + var doh struct { + Answer []struct { + Type int `json:"type"` + Data string `json:"data"` + } `json:"Answer"` + } + if err := json.Unmarshal(body, &doh); err != nil { + return nil, err + } + + var ips []string + for _, a := range doh.Answer { + if a.Type == 1 && net.ParseIP(a.Data) != nil { // A record + ips = append(ips, strings.TrimSpace(a.Data)) + } + } + if len(ips) == 0 { + return nil, fmt.Errorf("DoH returned no A records for %s", host) + } + dohCache.Store(host, dohEntry{ + ips: ips, + expires: time.Now().Add(dohCacheTTL), + }) + return ips, nil +} diff --git a/wireguard-apple/Sources/WireGuardKitGo/go.mod b/wireguard-apple/Sources/WireGuardKitGo/go.mod index 4c02db8..a14f122 100644 --- a/wireguard-apple/Sources/WireGuardKitGo/go.mod +++ b/wireguard-apple/Sources/WireGuardKitGo/go.mod @@ -3,7 +3,6 @@ module github.com/amnezia-vpn/amneziawg-apple go 1.26 require ( - github.com/cbeuw/connutil v1.0.1 github.com/google/uuid 
v1.6.0 github.com/pion/dtls/v3 v3.1.2 github.com/pion/logging v0.2.4 @@ -19,14 +18,21 @@ require ( ) require ( - github.com/andybalholm/brotli v1.1.0 // indirect + github.com/andybalholm/brotli v1.2.0 // indirect github.com/apernet/quic-go v0.57.2-0.20260111184307-eec823306178 // indirect + github.com/bdandy/go-errors v1.2.2 // indirect + github.com/bdandy/go-socks4 v1.2.3 // indirect + github.com/bogdanfinn/fhttp v0.6.8 // indirect + github.com/bogdanfinn/quic-go-utls v1.0.9-utls // indirect + github.com/bogdanfinn/tls-client v1.14.0 // indirect + github.com/bogdanfinn/utls v1.7.7-barnius // indirect + github.com/bogdanfinn/websocket v1.5.5-barnius // indirect github.com/cloudflare/circl v1.6.3 // indirect github.com/ghodss/yaml v1.0.1-0.20220118164431-d8423dcdf344 // indirect github.com/google/btree v1.1.3 // indirect github.com/gorilla/websocket v1.5.3 // indirect github.com/juju/ratelimit v1.0.2 // indirect - github.com/klauspost/compress v1.17.7 // indirect + github.com/klauspost/compress v1.18.2 // indirect github.com/klauspost/cpuid/v2 v2.2.7 // indirect github.com/miekg/dns v1.1.72 // indirect github.com/pelletier/go-toml v1.9.5 // indirect @@ -39,6 +45,7 @@ require ( github.com/rogpeppe/go-internal v1.14.1 // indirect github.com/sagernet/sing v0.5.1 // indirect github.com/sagernet/sing-shadowsocks v0.2.7 // indirect + github.com/tam7t/hpkp v0.0.0-20160821193359-2b70b4024ed5 // indirect github.com/vishvananda/netlink v1.3.1 // indirect github.com/vishvananda/netns v0.0.5 // indirect github.com/wlynxg/anet v0.0.5 // indirect diff --git a/wireguard-apple/Sources/WireGuardKitGo/go.sum b/wireguard-apple/Sources/WireGuardKitGo/go.sum index a021e77..9668572 100644 --- a/wireguard-apple/Sources/WireGuardKitGo/go.sum +++ b/wireguard-apple/Sources/WireGuardKitGo/go.sum @@ -6,10 +6,24 @@ github.com/amnezia-vpn/amneziawg-go v0.2.16 h1:XY6HOq/xtqH8ZXMncRWkjFs85EKdN10NL github.com/amnezia-vpn/amneziawg-go v0.2.16/go.mod h1:nRkPpIzjCxMW8pZKXTRkpqAQVlmFJdVOGkeQSC7wbms= 
github.com/andybalholm/brotli v1.1.0 h1:eLKJA0d02Lf0mVpIDgYnqXcUn0GqVmEFny3VuID1U3M= github.com/andybalholm/brotli v1.1.0/go.mod h1:sms7XGricyQI9K10gOSf56VKKWS4oLer58Q+mhRPtnY= +github.com/andybalholm/brotli v1.2.0 h1:ukwgCxwYrmACq68yiUqwIWnGY0cTPox/M94sVwToPjQ= +github.com/andybalholm/brotli v1.2.0/go.mod h1:rzTDkvFWvIrjDXZHkuS16NPggd91W3kUSvPlQ1pLaKY= github.com/apernet/quic-go v0.57.2-0.20260111184307-eec823306178 h1:bSq8n+gX4oO/qnM3MKf4kroW75n+phO9Qp6nigJKZ1E= github.com/apernet/quic-go v0.57.2-0.20260111184307-eec823306178/go.mod h1:N1WIjPphkqs4efXWuyDNQ6OjjIK04vM3h+bEgwV+eVU= -github.com/cbeuw/connutil v1.0.1 h1:LWuNYjwm7JEDYG/ISAO1TfU4G+q2dA5NhR97eq2roCA= -github.com/cbeuw/connutil v1.0.1/go.mod h1:lKofNtrW7Atmosgp1eNnTt2j2NjA2IkifapgLVI1QtA= +github.com/bdandy/go-errors v1.2.2 h1:WdFv/oukjTJCLa79UfkGmwX7ZxONAihKu4V0mLIs11Q= +github.com/bdandy/go-errors v1.2.2/go.mod h1:NkYHl4Fey9oRRdbB1CoC6e84tuqQHiqrOcZpqFEkBxM= +github.com/bdandy/go-socks4 v1.2.3 h1:Q6Y2heY1GRjCtHbmlKfnwrKVU/k81LS8mRGLRlmDlic= +github.com/bdandy/go-socks4 v1.2.3/go.mod h1:98kiVFgpdogR8aIGLWLvjDVZ8XcKPsSI/ypGrO+bqHI= +github.com/bogdanfinn/fhttp v0.6.8 h1:LiQyHOY3i0QoxxNB7nq27/nGNNbtPj0fuBPozhR7Ws4= +github.com/bogdanfinn/fhttp v0.6.8/go.mod h1:A+EKDzMx2hb4IUbMx4TlkoHnaJEiLl8r/1Ss1Y+5e5M= +github.com/bogdanfinn/quic-go-utls v1.0.9-utls h1:tV6eDEiRbRCcepALSzxR94JUVD3N3ACIiRLgyc2Ep8s= +github.com/bogdanfinn/quic-go-utls v1.0.9-utls/go.mod h1:aHph9B9H9yPOt5xnhWKSOum27DJAqpiHzwX+gjvaXcg= +github.com/bogdanfinn/tls-client v1.14.0 h1:vyk7Cn4BIvLAGVuMfb0tP22OqogfO1lYamquQNEZU1A= +github.com/bogdanfinn/tls-client v1.14.0/go.mod h1:LsU6mXVn8MOFDwTkyRfI7V1BZM1p0wf2ZfZsICW/1fM= +github.com/bogdanfinn/utls v1.7.7-barnius h1:OuJ497cc7F3yKNVHRsYPQdGggmk5x6+V5ZlrCR7fOLU= +github.com/bogdanfinn/utls v1.7.7-barnius/go.mod h1:aAK1VZQlpKZClF1WEQeq6kyclbkPq4hz6xTbB5xSlmg= +github.com/bogdanfinn/websocket v1.5.5-barnius h1:bY+qnxpai1qe7Jmjx+Sds/cmOSpuuLoR8x61rWltjOI= +github.com/bogdanfinn/websocket 
v1.5.5-barnius/go.mod h1:gvvEw6pTKHb7yOiFvIfAFTStQWyrm25BMVCTj5wRSsI= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/cloudflare/circl v1.6.3 h1:9GPOhQGF9MCYUeXyMYlqTR6a5gTrgR/fBLXvUgtVcg8= @@ -38,6 +52,8 @@ github.com/juju/ratelimit v1.0.2 h1:sRxmtRiajbvrcLQT7S+JbqU0ntsb9W2yhSdNN8tWfaI= github.com/juju/ratelimit v1.0.2/go.mod h1:qapgC/Gy+xNh9UxzV13HGGl/6UXNN+ct+vwSgWNm/qk= github.com/klauspost/compress v1.17.7 h1:ehO88t2UGzQK66LMdE8tibEd1ErmzZjNEqWkjLAKQQg= github.com/klauspost/compress v1.17.7/go.mod h1:Di0epgTjJY877eYKx5yC51cX2A2Vl2ibi7bDH9ttBbw= +github.com/klauspost/compress v1.18.2 h1:iiPHWW0YrcFgpBYhsA6D1+fqHssJscY/Tm/y2Uqnapk= +github.com/klauspost/compress v1.18.2/go.mod h1:R0h/fSBs8DE4ENlcrlib3PsXS61voFxhIs2DeRhCvJ4= github.com/klauspost/cpuid/v2 v2.2.7 h1:ZWSB3igEs+d0qvnxR/ZBzXVmxkgt8DdzP6m9pfuVLDM= github.com/klauspost/cpuid/v2 v2.2.7/go.mod h1:Lcz8mBdAVJIBVzewtcLocK12l3Y+JytZYpaMropDUws= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= @@ -76,6 +92,8 @@ github.com/sagernet/sing-shadowsocks v0.2.7 h1:zaopR1tbHEw5Nk6FAkM05wCslV6ahVegE github.com/sagernet/sing-shadowsocks v0.2.7/go.mod h1:0rIKJZBR65Qi0zwdKezt4s57y/Tl1ofkaq6NlkzVuyE= github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= +github.com/tam7t/hpkp v0.0.0-20160821193359-2b70b4024ed5 h1:YqAladjX7xpA6BM04leXMWAEjS0mTZ5kUU9KRBriQJc= +github.com/tam7t/hpkp v0.0.0-20160821193359-2b70b4024ed5/go.mod h1:2JjD2zLQYH5HO74y5+aE3remJQvl6q4Sn6aWA2wD1Ng= github.com/vishvananda/netlink v1.3.1 h1:3AEMt62VKqz90r0tmNhog0r/PpWKmrEShJU0wJW6bV0= github.com/vishvananda/netlink v1.3.1/go.mod h1:ARtKouGSTGchR8aMwmkzC0qiNPrrWO5JS/XMVl45+b4= github.com/vishvananda/netns v0.0.5 h1:DfiHV+j8bA32MFM7bfEunvT8IAqQ/NzSJHtcmW5zdEY= @@ -108,19 
+126,25 @@ golang.org/x/exp v0.0.0-20260212183809-81e46e3db34a h1:ovFr6Z0MNmU7nH8VaX5xqw+05 golang.org/x/exp v0.0.0-20260212183809-81e46e3db34a/go.mod h1:K79w1Vqn7PoiZn+TkNpx3BUWUQksGO3JcVX6qIjytmA= golang.org/x/mod v0.33.0 h1:tHFzIWbBifEmbwtGz65eaWyGiGZatSrT9prnU8DbVL8= golang.org/x/mod v0.33.0/go.mod h1:swjeQEj+6r7fODbD2cqrnje9PnziFuw4bmLbBZFrQ5w= +golang.org/x/net v0.0.0-20211104170005-ce137452f963/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/net v0.50.0 h1:ucWh9eiCGyDR3vtzso0WMQinm2Dnt8cFMuQa9K33J60= golang.org/x/net v0.50.0/go.mod h1:UgoSli3F/pBgdJBHCTc+tp3gmrU4XswgGRgtnwWTfyM= golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4= golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.2.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.10.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.41.0 h1:Ivj+2Cp/ylzLiEU89QhWblYnOE9zerudt9Ftecq2C6k= golang.org/x/sys v0.41.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.34.0 h1:oL/Qq0Kdaqxa1KbNeMKwQq0reLCCaFtqu2eNuSeNHbk= golang.org/x/text v0.34.0/go.mod h1:homfLqTYRFyVYemLBFl5GgL/DWEiH5wcsQ5gSh1yziA= golang.org/x/time v0.14.0 h1:MRx4UaLrDotUKUdCIqzPC48t1Y9hANFKIRpNx+Te8PI= golang.org/x/time v0.14.0/go.mod h1:eL/Oa2bBBK0TkX57Fyni+NgnyQQN4LitPmob2Hjnqw4= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 
golang.org/x/tools v0.42.0 h1:uNgphsn75Tdz5Ji2q36v/nsFSfR/9BRFvqhGBaJGd5k= golang.org/x/tools v0.42.0/go.mod h1:Ma6lCIwGZvHK6XtgbswSoWroEkhugApmsXyrUmBhfr0= golang.zx2c4.com/wintun v0.0.0-20230126152724-0fa3db229ce2 h1:B82qJJgjvYKsXS9jeunTOisW56dUokqW/FOteYJJ/yg= diff --git a/wireguard-apple/Sources/WireGuardKitGo/identity.go b/wireguard-apple/Sources/WireGuardKitGo/identity.go index 52e77c6..578b188 100644 --- a/wireguard-apple/Sources/WireGuardKitGo/identity.go +++ b/wireguard-apple/Sources/WireGuardKitGo/identity.go @@ -29,66 +29,34 @@ var lastNames = []string{ } var profiles = []Profile{ - // Windows Chrome + // iPhone Safari only. VK's anti-bot pipeline triggers the + // "Confirm you're not a robot" checkbox when it sees a mismatch + // between the connection (Russian cellular IP, iPhone-shaped TLS + // fingerprint from NSURLSession's underlying CFNetwork stack) + // and the User-Agent header. Real users clicking a VK call link + // from Safari on iPhone aren't asked for a captcha — and that's + // exactly the request we want to look like. + // + // Safari deliberately doesn't implement Client Hints; vk_captcha + // skips the sec-ch-ua headers entirely when SecChUa is empty, + // matching what mobile Safari actually sends on the wire. 
{ - UserAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/146.0.0.0 Safari/537.36", - SecChUa: `"Chromium";v="146", "Not-A.Brand";v="24", "Google Chrome";v="146"`, - SecChUaMobile: "?0", - SecChUaPlatform: `"Windows"`, + UserAgent: "Mozilla/5.0 (iPhone; CPU iPhone OS 18_1_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.1.1 Mobile/15E148 Safari/604.1", }, { - UserAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/145.0.0.0 Safari/537.36", - SecChUa: `"Chromium";v="145", "Not-A.Brand";v="99", "Google Chrome";v="145"`, - SecChUaMobile: "?0", - SecChUaPlatform: `"Windows"`, + UserAgent: "Mozilla/5.0 (iPhone; CPU iPhone OS 18_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.1 Mobile/15E148 Safari/604.1", }, { - UserAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/144.0.0.0 Safari/537.36", - SecChUa: `"Chromium";v="144", "Not-A.Brand";v="8", "Google Chrome";v="144"`, - SecChUaMobile: "?0", - SecChUaPlatform: `"Windows"`, - }, - - // Windows Edge - { - UserAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/146.0.0.0 Safari/537.36 Edg/146.0.0.0", - SecChUa: `"Chromium";v="146", "Not-A.Brand";v="24", "Microsoft Edge";v="146"`, - SecChUaMobile: "?0", - SecChUaPlatform: `"Windows"`, + UserAgent: "Mozilla/5.0 (iPhone; CPU iPhone OS 18_0_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.0 Mobile/15E148 Safari/604.1", }, { - UserAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/145.0.0.0 Safari/537.36 Edg/145.0.0.0", - SecChUa: `"Chromium";v="145", "Not-A.Brand";v="99", "Microsoft Edge";v="145"`, - SecChUaMobile: "?0", - SecChUaPlatform: `"Windows"`, + UserAgent: "Mozilla/5.0 (iPhone; CPU iPhone OS 17_6_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.6 Mobile/15E148 
Safari/604.1", }, - - // macOS Chrome - { - UserAgent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/146.0.0.0 Safari/537.36", - SecChUa: `"Chromium";v="146", "Not-A.Brand";v="24", "Google Chrome";v="146"`, - SecChUaMobile: "?0", - SecChUaPlatform: `"macOS"`, - }, - { - UserAgent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/145.0.0.0 Safari/537.36", - SecChUa: `"Chromium";v="145", "Not-A.Brand";v="99", "Google Chrome";v="145"`, - SecChUaMobile: "?0", - SecChUaPlatform: `"macOS"`, - }, - - // Linux Chrome { - UserAgent: "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/146.0.0.0 Safari/537.36", - SecChUa: `"Chromium";v="146", "Not-A.Brand";v="24", "Google Chrome";v="146"`, - SecChUaMobile: "?0", - SecChUaPlatform: `"Linux"`, + UserAgent: "Mozilla/5.0 (iPhone; CPU iPhone OS 17_5_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.5 Mobile/15E148 Safari/604.1", }, { - UserAgent: "Mozilla/5.0 (X11; Ubuntu; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/144.0.0.0 Safari/537.36", - SecChUa: `"Chromium";v="144", "Not-A.Brand";v="8", "Google Chrome";v="144"`, - SecChUaMobile: "?0", - SecChUaPlatform: `"Linux"`, + UserAgent: "Mozilla/5.0 (iPhone; CPU iPhone OS 17_4_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4.1 Mobile/15E148 Safari/604.1", }, } diff --git a/wireguard-apple/Sources/WireGuardKitGo/memstats.go b/wireguard-apple/Sources/WireGuardKitGo/memstats.go new file mode 100644 index 0000000..ef48a1b --- /dev/null +++ b/wireguard-apple/Sources/WireGuardKitGo/memstats.go @@ -0,0 +1,140 @@ +// memstats.go — periodic Go heap stats while StartProxy is running. 
+// +// Companion to the Swift-side memory logger in PacketTunnelProvider: +// Swift logs the OS-level numbers (resident set size + iOS's view of +// remaining memory budget for the extension), Go logs what its own +// runtime is holding so we can attribute spikes to captcha pipeline / +// DTLS state / channel buffers / etc. +// +// Lifetime is tied to the proxy context — stops the moment +// StartProxy's ctx fires Done, so Disconnect doesn't leave a stray +// goroutine logging into nothing. + +package main + +import ( + "context" + "log" + "runtime" + "runtime/debug" + "time" +) + +const memstatsInterval = 5 * time.Second + +// goSoftMemoryLimit caps Go's heap at ~75 MB. iOS extensions get +// ~100 MB total; the rest is C heap (cgo allocations from pion-dtls +// crypto, the WG core, the kernel-side socket buffers we tuned to +// 4 MB each). When Go's heap approaches this cap, the runtime fires +// GC much more aggressively — the cost is CPU time spent collecting +// but the alternative is iOS SIGKILL on the whole extension, which +// is strictly worse. See Go runtime/debug.SetMemoryLimit. +const goSoftMemoryLimit = 75 * 1024 * 1024 + +// goGCPercent halves Go's default GOGC of 100. At 100 a collection +// is triggered once the heap has grown to roughly 2x the live heap +// left by the previous cycle; at 50 it is triggered at roughly 1.5x. +// Pairs with the memory limit: SetMemoryLimit only makes the GC more +// aggressive as total usage nears the cap, while GCPercent bounds +// heap growth between cycles all the time, so garbage from transient +// spikes (captcha solve storm, DTLS handshake burst) is collected +// before the footprint drifts upward over minutes. +const goGCPercent = 50 + +// freeOSMemoryInterval is how often we force returning idle pages +// to the OS. Go normally hands memory back to the OS lazily (it +// keeps reclaimed heap mapped to amortise re-allocation). On iOS +// what matters is RSS, not Go's view — releasing eagerly makes the +// OS see lower RSS, which keeps us further from the SIGKILL line.
+const freeOSMemoryInterval = 15 * time.Second + +// tuneGoRuntime applies the static-config tunings once at proxy +// startup. SetMemoryLimit and SetGCPercent are process-global — +// calling them repeatedly is harmless but redundant, so rather than +// guarding with a sync.Once we simply call this from StartProxy. +func tuneGoRuntime() { + debug.SetMemoryLimit(goSoftMemoryLimit) + debug.SetGCPercent(goGCPercent) + log.Printf("memstats: tuned runtime soft_limit=%s gc_percent=%d", + humanBytes(goSoftMemoryLimit), goGCPercent) +} + +func startMemstatsLogger(ctx context.Context) { + go func() { + ticker := time.NewTicker(memstatsInterval) + defer ticker.Stop() + freeTicker := time.NewTicker(freeOSMemoryInterval) + defer freeTicker.Stop() + logMemstats("startup") + for { + select { + case <-ctx.Done(): + logMemstats("shutdown") + return + case <-ticker.C: + logMemstats("tick") + case <-freeTicker.C: + // FreeOSMemory does a STW GC and returns idle pages + // to the OS. The STW pause is short (~ms at this + // heap size) and only fires every 15s — well below + // the threshold where it would be visible as a + // data-plane stall, but enough to keep RSS from + // ratcheting up between captcha storms.
+ debug.FreeOSMemory() + } + } + }() +} + +func logMemstats(label string) { + var m runtime.MemStats + runtime.ReadMemStats(&m) + log.Printf("memstats(%s): heap_alloc=%s heap_sys=%s sys=%s goroutines=%d gc=%d", + label, + humanBytes(m.HeapAlloc), + humanBytes(m.HeapSys), + humanBytes(m.Sys), + runtime.NumGoroutine(), + m.NumGC, + ) +} + +func humanBytes(b uint64) string { + const ( + KB = 1024 + MB = 1024 * KB + ) + switch { + case b >= MB: + return fmtDecimal(b, MB) + "MB" + case b >= KB: + return fmtDecimal(b, KB) + "KB" + default: + return fmtDecimal(b, 1) + "B" + } +} + +func fmtDecimal(value, unit uint64) string { + if unit == 1 { + return formatUint(value) + } + whole := value / unit + frac := (value * 10 / unit) % 10 + if frac == 0 { + return formatUint(whole) + } + return formatUint(whole) + "." + formatUint(frac) +} + +func formatUint(v uint64) string { + if v == 0 { + return "0" + } + var buf [20]byte + i := len(buf) + for v > 0 { + i-- + buf[i] = byte('0' + v%10) + v /= 10 + } + return string(buf[i:]) +} diff --git a/wireguard-apple/Sources/WireGuardKitGo/packet_pipe.go b/wireguard-apple/Sources/WireGuardKitGo/packet_pipe.go new file mode 100644 index 0000000..d38352e --- /dev/null +++ b/wireguard-apple/Sources/WireGuardKitGo/packet_pipe.go @@ -0,0 +1,221 @@ +// packet_pipe.go — bounded in-memory net.PacketConn pair. +// +// Replaces github.com/pion/transport/v4/connutil.AsyncPacketPipe in +// turn_proxy.go. The pion version is backed by an unbounded +// bytes.Buffer whose capacity ratchets up to the high-water mark on +// any burst and never shrinks — that's how steady-state RSS grew +// past the iOS budget at N=40 under reconnect storms. The bounded +// pipe below caps in-flight queue depth at boundedPipeDepth packets +// per direction and drops on overflow (UDP semantics, no +// backpressure), so worst-case memory per session is exactly +// 2 × depth × ~MTU bytes. 
+// +// Also defines readBufPool, a sync.Pool of *[]byte (each a +// 1600-byte slice) used by the four read-loop scratches in +// oneDtlsConnection / oneTurnConnection. Each loop borrows on +// entry, returns on exit. +// Under reconnect churn (sessions dying and respawning at high +// rate) this trims the GC pressure of repeatedly allocating ~1.6 KB +// per new goroutine. + +package main + +import ( + "net" + "os" + "sync" + "sync/atomic" + "time" +) + +// boundedPipeDepth caps how many packets can sit in either direction +// of the pipe at one time. DTLS handshake bursts are ~5-10 packets; +// post-handshake traffic drains fast, so 16 leaves comfortable +// headroom without committing many MB per session. +const boundedPipeDepth = 16 + +type pipePacket struct { + data []byte + addr net.Addr +} + +// pipePair owns the two channels that connect a pair of pipeConns, +// plus the shared close state. Closing either pipeConn closes the +// pair — there's no way to half-close a UDP-like connection, and +// pion's connutil behaved the same way. +type pipePair struct { + a2b chan pipePacket + b2a chan pipePacket + closeOnce sync.Once + closed chan struct{} + dropped atomic.Uint64 +} + +type pipeConn struct { + pair *pipePair + rx chan pipePacket // packets coming IN (= peer's tx) + tx chan pipePacket // packets going OUT (= peer's rx) + + deadlineMu sync.Mutex + wakeup chan struct{} + deadlineTimer *time.Timer +} + +// boundedPacketPipe returns a connected pair of net.PacketConn. +// Each direction has its own channel of depth=boundedPipeDepth. +// WriteTo to one appears on the other's ReadFrom. Bounded — overflow +// drops the new packet with a counter increment rather than blocking.
+func boundedPacketPipe() (net.PacketConn, net.PacketConn) { + pair := &pipePair{ + a2b: make(chan pipePacket, boundedPipeDepth), + b2a: make(chan pipePacket, boundedPipeDepth), + closed: make(chan struct{}), + } + a := &pipeConn{ + pair: pair, + rx: pair.b2a, + tx: pair.a2b, + wakeup: make(chan struct{}), + } + b := &pipeConn{ + pair: pair, + rx: pair.a2b, + tx: pair.b2a, + wakeup: make(chan struct{}), + } + return a, b +} + +func (p *pipeConn) ReadFrom(buf []byte) (int, net.Addr, error) { + select { + case pkt, ok := <-p.rx: + if !ok { + return 0, nil, net.ErrClosed + } + n := copy(buf, pkt.data) + return n, pkt.addr, nil + case <-p.pair.closed: + return 0, nil, net.ErrClosed + case <-p.wakeup: + return 0, nil, os.ErrDeadlineExceeded + } +} + +func (p *pipeConn) WriteTo(buf []byte, addr net.Addr) (int, error) { + select { + case <-p.pair.closed: + return 0, net.ErrClosed + default: + } + // Copy because the caller is allowed to reuse buf after WriteTo + // returns (pion does this with its own scratch buffer). + data := append([]byte(nil), buf...) + select { + case p.tx <- pipePacket{data: data, addr: addr}: + return len(buf), nil + case <-p.pair.closed: + return 0, net.ErrClosed + default: + // Drop on overflow. Mirrors udp_fanout.go's dispatcher + // behaviour and matches UDP's "no flow control" semantics. + // Lying about success (returning len(buf), nil) is the + // standard idiom — net.PacketConn callers don't have a + // way to react to "your packet was buffered, not sent" + // anyway. + p.pair.dropped.Add(1) + return len(buf), nil + } +} + +func (p *pipeConn) Close() error { + p.pair.closeOnce.Do(func() { close(p.pair.closed) }) + return nil +} + +// LocalAddr returns a sentinel because pion-dtls reads it just to log +// it; the value doesn't have to be meaningful for the pipe to work. 
+func (p *pipeConn) LocalAddr() net.Addr { + return &net.UDPAddr{IP: net.IPv4zero, Port: 0} +} + +func (p *pipeConn) SetDeadline(t time.Time) error { + if err := p.SetReadDeadline(t); err != nil { + return err + } + return p.SetWriteDeadline(t) +} + +// SetReadDeadline mirrors fanoutPacketConn: the pion idiom is +// "Set to a past time" = "interrupt the in-flight read with +// os.ErrDeadlineExceeded". A past (or current) deadline signals +// wakeup immediately; a future deadline arms a one-shot timer that +// signals it later; a zero time only cancels any armed timer. The +// signal is a non-blocking send on an unbuffered channel, so it +// interrupts only a ReadFrom that is already blocked — an expired +// deadline is not remembered for later reads. That is sufficient +// here because oneTurnConnection / oneDtlsConnection only ever +// call this with time.Now() from context.AfterFunc when ctx is +// being cancelled. +func (p *pipeConn) SetReadDeadline(t time.Time) error { + p.deadlineMu.Lock() + defer p.deadlineMu.Unlock() + + if p.deadlineTimer != nil { + p.deadlineTimer.Stop() + p.deadlineTimer = nil + } + + if t.IsZero() { + return nil + } + d := time.Until(t) + if d <= 0 { + select { + case p.wakeup <- struct{}{}: + default: + } + return nil + } + p.deadlineTimer = time.AfterFunc(d, func() { + select { + case p.wakeup <- struct{}{}: + default: + } + }) + return nil +} + +// SetWriteDeadline is a no-op: WriteTo never blocks (drops on +// overflow via the select-default branch), so a deadline can't +// be missed. +func (p *pipeConn) SetWriteDeadline(t time.Time) error { + return nil +} + +// readBufPool amortises the 1600-byte read scratches used by every +// per-session read-loop goroutine in oneDtlsConnection and +// oneTurnConnection. Without it, each goroutine startup allocates +// a fresh slice — under a reconnect storm (N=40 sessions cycling +// every ~30 s on cred-rotation) that's ~160 allocations per cycle +// of 1.6 KB each = 256 KB of churn per cycle (~512 KB per minute) +// just on scratches. +// With the pool, freshly-spawned goroutines reuse a recently-freed +// scratch instead. +var readBufPool = sync.Pool{ + New: func() any { + b := make([]byte, 1600) + return &b + }, +} + +// borrowReadBuf returns a 1600-byte slice from the pool.
The caller +// must return it via returnReadBuf when done; the buf MUST NOT be +// retained or shared after that. +func borrowReadBuf() []byte { + return *readBufPool.Get().(*[]byte) +} + +func returnReadBuf(buf []byte) { + // Only return slices that haven't been re-sliced down; this keeps + // the pool entries at the expected 1600-byte capacity. The check + // also rejects nil and non-pool-sourced buffers that callers + // might accidentally pass in. + if cap(buf) != 1600 { + return + } + buf = buf[:cap(buf)] + readBufPool.Put(&buf) +} diff --git a/wireguard-apple/Sources/WireGuardKitGo/physical_dialer.go b/wireguard-apple/Sources/WireGuardKitGo/physical_dialer.go new file mode 100644 index 0000000..65d2ab0 --- /dev/null +++ b/wireguard-apple/Sources/WireGuardKitGo/physical_dialer.go @@ -0,0 +1,229 @@ +// SPDX-License-Identifier: MIT +// +// physical_dialer.go — escape hatch from the utun default route. +// +// Once WireGuard comes up inside the NE extension, utun becomes the +// default route in the extension's process. From that moment, every +// HTTP request the captcha solver issues exits through utun → WG +// server → VK. That's the "tunnel egress" — VK sees a single shared +// source IP from all our sessions, and VK's per-IP captcha rate-limit +// chokes it within a minute. Worse: there is no way to flip the +// default route back to cellular dynamically; iOS exposes no API for +// that inside an NE extension. +// +// Workaround: pin individual sockets to a specific physical interface +// using Darwin's IP_BOUND_IF / IPV6_BOUND_IF setsockopts. The kernel +// then routes those sockets through the named NIC (en0 / pdp_ip0) +// regardless of what the default route says. utun is bypassed. +// +// cellularDial is a drop-in replacement for customDial that does +// exactly this. It enumerates non-loopback non-tunnel interfaces, +// prefers Wi-Fi (en0), falls back to cellular (pdp_ip0), and falls +// back to customDial if no usable physical interface is found. 
+ +package main + +import ( + "context" + "fmt" + "log" + "net" + "strings" + "sync" + "sync/atomic" + "syscall" + "time" +) + +// Darwin socket-option constants. The Go standard library doesn't +// re-export these, but they're stable in <netinet/in.h> and +// <netinet6/in6.h>: +// IP_BOUND_IF = 25 (binds an IPv4 socket to an interface index) +// IPV6_BOUND_IF = 125 (same for IPv6) +const ( + darwinIPBoundIf = 25 + darwinIPv6BoundIf = 125 +) + +// Cached physical-interface index, refreshed on a TTL. Enumerating +// interfaces is cheap but not free, and cellularDial may be called +// many times per second under heavy retry. 30 s is generous since +// iOS interface indices don't change without a Network Path change +// event (which the watchdog already handles separately). +var ( + physIfaceMu sync.Mutex + physIfaceIndex atomic.Int32 // 0 means "no usable interface" + physIfaceCachedAt time.Time +) + +const physIfaceTTL = 30 * time.Second + +func physicalInterfaceIndex() int { + physIfaceMu.Lock() + defer physIfaceMu.Unlock() + + if !physIfaceCachedAt.IsZero() && time.Since(physIfaceCachedAt) < physIfaceTTL { + return int(physIfaceIndex.Load()) + } + + idx := lookupPhysicalInterface() + physIfaceIndex.Store(int32(idx)) + physIfaceCachedAt = time.Now() + return idx +} + +// lookupPhysicalInterface returns the index of a non-loopback, non- +// tunnel, non-bridge interface that has at least one routable IPv4 +// address. Wi-Fi (en0..) is preferred; cellular (pdp_ipN) is the +// fallback. Returns 0 if nothing usable is up.
+func lookupPhysicalInterface() int { + ifaces, err := net.Interfaces() + if err != nil { + log.Printf("physical_dialer: net.Interfaces failed: %v", err) + return 0 + } + + var wifi, cellular int + for _, iface := range ifaces { + if iface.Flags&net.FlagUp == 0 { + continue + } + if iface.Flags&net.FlagLoopback != 0 { + continue + } + name := iface.Name + if strings.HasPrefix(name, "utun") || strings.HasPrefix(name, "ipsec") { + continue + } + addrs, err := iface.Addrs() + if err != nil || len(addrs) == 0 { + continue + } + hasUsableIP := false + for _, a := range addrs { + ipnet, ok := a.(*net.IPNet) + if !ok { + continue + } + ip4 := ipnet.IP.To4() + if ip4 == nil { + continue + } + if ip4.IsLinkLocalUnicast() || ip4.IsLoopback() || ip4.IsUnspecified() { + continue + } + hasUsableIP = true + break + } + if !hasUsableIP { + continue + } + switch { + case strings.HasPrefix(name, "en"): + if wifi == 0 { + wifi = iface.Index + } + case strings.HasPrefix(name, "pdp_ip"): + if cellular == 0 { + cellular = iface.Index + } + } + } + if wifi != 0 { + return wifi + } + return cellular +} + +// pinnedDialer returns a net.Dialer whose sockets are bound to the +// given interface index via IP_BOUND_IF / IPV6_BOUND_IF. Each Dial +// call sets the option in the socket Control hook before connect. +func pinnedDialer(ifIndex int, timeout time.Duration) *net.Dialer { + return &net.Dialer{ + Timeout: timeout, + Control: func(network, address string, c syscall.RawConn) error { + var bindErr error + ctrlErr := c.Control(func(fd uintptr) { + if strings.HasSuffix(network, "6") { + bindErr = syscall.SetsockoptInt(int(fd), syscall.IPPROTO_IPV6, darwinIPv6BoundIf, ifIndex) + } else { + bindErr = syscall.SetsockoptInt(int(fd), syscall.IPPROTO_IP, darwinIPBoundIf, ifIndex) + } + }) + if ctrlErr != nil { + return ctrlErr + } + return bindErr + }, + } +} + +// cellularDial is the interface-pinned counterpart of customDial. 
+// Same DNS resilience (system → DoH → hardcoded fallback IPs), but +// every connect() is issued from a socket bound to a physical +// interface, so the kernel routes through cellular / Wi-Fi instead +// of utun. If no physical interface is up we transparently fall back +// to customDial; the caller (solveVkCaptcha) has already gated on +// physicalInterfaceIndex() > 0 anyway, but the fallback is cheap +// safety. +func cellularDial(ctx context.Context, network, address string) (net.Conn, error) { + ifIndex := physicalInterfaceIndex() + if ifIndex == 0 { + log.Printf("cellularDial: no usable physical interface, falling back to default route") + return customDial(ctx, network, address) + } + + host, port, err := net.SplitHostPort(address) + if err != nil { + return nil, err + } + + d := pinnedDialer(ifIndex, dohDialBudget) + + // Fast path: literal IP needs no resolution. + if net.ParseIP(host) != nil { + return d.DialContext(ctx, network, address) + } + + // Layer 1: system resolver via the pinned dialer. + sysCtx, cancel := context.WithTimeout(ctx, systemDialBudget) + conn, sysErr := d.DialContext(sysCtx, network, address) + cancel() + if sysErr == nil { + return conn, nil + } + log.Printf("cellularDial: system resolve+dial failed for %s via iface=%d: %v — falling back to DoH", + host, ifIndex, sysErr) + + // Layer 2: DoH lookup, then dial the returned IPs via the pinned + // dialer. DoH itself uses dohClient (default route) — once the + // tunnel is up, DoH responses get cached for 10 minutes so the + // per-host RTT is amortised. 
+ if ips, err := resolveViaDoH(ctx, host); err == nil && len(ips) > 0 { + log.Printf("cellularDial: DoH %s → %v (iface=%d)", host, ips, ifIndex) + for _, ip := range ips { + c, derr := d.DialContext(ctx, network, net.JoinHostPort(ip, port)) + if derr == nil { + return c, nil + } + log.Printf("cellularDial: dial %s via iface=%d (DoH) failed: %v", ip, ifIndex, derr) + } + } else if err != nil { + log.Printf("cellularDial: DoH lookup failed for %s: %v", host, err) + } + + // Layer 3: hardcoded VK fallback IPs. + if ips, ok := fallbackIPs[strings.ToLower(host)]; ok { + log.Printf("cellularDial: trying hardcoded fallback IPs for %s via iface=%d: %v", + host, ifIndex, ips) + for _, ip := range ips { + c, derr := d.DialContext(ctx, network, net.JoinHostPort(ip, port)) + if derr == nil { + return c, nil + } + log.Printf("cellularDial: dial %s via iface=%d (fallback) failed: %v", ip, ifIndex, derr) + } + } + + return nil, fmt.Errorf("cellularDial: all DNS layers exhausted for %s via iface=%d (sys=%v)", + host, ifIndex, sysErr) +} diff --git a/wireguard-apple/Sources/WireGuardKitGo/remote_creds.go b/wireguard-apple/Sources/WireGuardKitGo/remote_creds.go new file mode 100644 index 0000000..df38d6d --- /dev/null +++ b/wireguard-apple/Sources/WireGuardKitGo/remote_creds.go @@ -0,0 +1,278 @@ +// remote_creds.go — optional offload of getCreds to an external +// captcha-service. Configured at runtime by Swift via +// ProxySetRemoteCaptchaService. When configured AND the local +// captcha pipeline has already produced `remoteHandoverThreshold` +// unique TURN identities (i.e. WG is comfortably up and we just +// need more sessions for stream-aggregation throughput), subsequent +// getCreds calls route to the server instead of consuming the local +// per-IP rate-limit budget. 
+// +// The server (see captcha-service/) does the captcha solving from +// ITS own IP — that's the whole point: it doesn't share VK's +// per-IP ERROR_LIMIT bucket with the user's mobile IP, and the +// 70 MB slider rendering happens on a real machine with proper +// memory. + +package main + +/* +#include +*/ +import "C" + +import ( + "bytes" + "context" + "encoding/json" + "errors" + "fmt" + "io" + "log" + "net/http" + "strconv" + "strings" + "sync/atomic" + "time" + "unsafe" +) + +// remoteHandoverThreshold — number of LOCAL successful captcha +// solves before subsequent solves are offloaded to the remote +// service. ONE is the minimum: as soon as the user has paid for +// the bootstrap captcha and the WG tunnel has actual data flowing, +// every subsequent getCreds call routes to the server cluster. +// The user's explicit ask (1.3.29): never see more than 3 captcha +// sheets per StartProxy. The combination of threshold=1 here + +// manualCaptchaQuotaPerSession=3 in captcha_manual.go gives us +// that bound even in the worst case where the remote /cred path +// is briefly degraded — local fallback can still produce up to +// 3 prompts before the quota kicks in. +const remoteHandoverThreshold = 1 + +type remoteCaptchaConfig struct { + url atomic.Value // string + apiKey atomic.Value // string +} + +var remoteCaptcha remoteCaptchaConfig + +// errDeferToRemote is returned from the local captcha path +// (requestManualCaptcha specifically) when, by the time the goroutine +// is about to actually inconvenience the user with a sheet, the +// remote captcha-service is preferred (≥1 session ready, no +// cooldown). The call bubbles back up to getCredsRouted which then +// re-routes to the server cluster. The user only sees the sheet for +// the captchas that genuinely have to happen on the device. 
+var errDeferToRemote = errors.New("defer to remote captcha service") + +// shouldDeferToRemoteNow reports whether the remote captcha-service +// is configured, has at least one local solve under its belt (so the +// remote path's auth isn't going to fight ERROR_LIMIT alongside the +// mobile IP for the bootstrap window), and isn't currently in a +// 429-cooldown. Goroutines queued behind the serialise lock check +// this AFTER they acquire the lock — see captcha_manual.go. +func shouldDeferToRemoteNow() bool { + if !remoteCaptchaEnabled() { + return false + } + if captchaSessionsReady.Load() < int64(remoteHandoverThreshold) { + return false + } + return !remoteInCooldown() +} + +func remoteCaptchaURL() string { + v, _ := remoteCaptcha.url.Load().(string) + return v +} + +func remoteCaptchaAPIKey() string { + v, _ := remoteCaptcha.apiKey.Load().(string) + return v +} + +func remoteCaptchaEnabled() bool { + return remoteCaptchaURL() != "" && remoteCaptchaAPIKey() != "" +} + +//export ProxySetRemoteCaptchaService +func ProxySetRemoteCaptchaService(cURL *C.char, cAPIKey *C.char) { + url := strings.TrimSpace(C.GoString(cURL)) + apiKey := strings.TrimSpace(C.GoString(cAPIKey)) + remoteCaptcha.url.Store(url) + remoteCaptcha.apiKey.Store(apiKey) + if url == "" || apiKey == "" { + log.Printf("remote-captcha: disabled") + return + } + log.Printf("remote-captcha: configured (url=%s, handover-after=%d local solves)", url, remoteHandoverThreshold) +} + +// remoteCooldownDefault — how long the client treats the remote +// service as unavailable when the master returns 429 without a +// usable Retry-After header. Matches the server's own ERROR_LIMIT +// cooldown so the two ends recover in lockstep. +const remoteCooldownDefault = 60 * time.Second + +// remoteCooldownUntilNano is a UnixNano timestamp; calls to +// getCredsRemote skip the round trip entirely while now() < this +// value. 
Lets `getCredsRouted` fall through to local immediately +// during a saturation window instead of paying 90 s of HTTP timeout +// per session waiting for the server to refuse again. +var remoteCooldownUntilNano atomic.Int64 + +func remoteInCooldown() bool { + until := remoteCooldownUntilNano.Load() + if until == 0 { + return false + } + return time.Now().UnixNano() < until +} + +func setRemoteCooldown(d time.Duration) { + if d <= 0 { + return + } + until := time.Now().Add(d).UnixNano() + for { + cur := remoteCooldownUntilNano.Load() + // Only extend; never shorten. Two concurrent 429s with + // different Retry-After values shouldn't clobber the + // longer one. + if until <= cur { + return + } + if remoteCooldownUntilNano.CompareAndSwap(cur, until) { + return + } + } +} + +// remoteCredsClient is dedicated to /cred calls. Its DialContext is +// customDial so it benefits from DoH + fallback IPs when api.vk.com +// is censored, but the actual target host is the user's own server. +var remoteCredsClient = &http.Client{ + Timeout: 90 * time.Second, // server-side solve can take up to 80 s; add slack. 
+ Transport: &http.Transport{ + DialContext: customDial, + MaxIdleConns: 20, + MaxIdleConnsPerHost: 20, + IdleConnTimeout: 120 * time.Second, + }, +} + +type remoteCredResponse struct { + User string `json:"user"` + Pass string `json:"pass"` + Addr string `json:"addr"` + ExpiresAt time.Time `json:"expires_at"` + Error string `json:"error,omitempty"` +} + +func getCredsRemote(ctx context.Context, link string) (string, string, string, error) { + url := remoteCaptchaURL() + apiKey := remoteCaptchaAPIKey() + if url == "" || apiKey == "" { + return "", "", "", errors.New("remote captcha not configured") + } + + captchaRemoteAttempts.Add(1) + captchaRemoteInFlight.Add(1) + defer captchaRemoteInFlight.Add(-1) + + body, _ := json.Marshal(map[string]string{"link": link}) + req, err := http.NewRequestWithContext(ctx, "POST", strings.TrimRight(url, "/")+"/cred", bytes.NewReader(body)) + if err != nil { + return "", "", "", fmt.Errorf("build request: %w", err) + } + req.Header.Set("Content-Type", "application/json") + req.Header.Set("Authorization", "Bearer "+apiKey) + + httpResp, err := remoteCredsClient.Do(req) + if err != nil { + return "", "", "", fmt.Errorf("call server: %w", err) + } + defer httpResp.Body.Close() + + rawBody, _ := io.ReadAll(httpResp.Body) + var resp remoteCredResponse + if jsonErr := json.Unmarshal(rawBody, &resp); jsonErr != nil { + return "", "", "", fmt.Errorf("decode server response (status=%d): %w", httpResp.StatusCode, jsonErr) + } + if httpResp.StatusCode == http.StatusTooManyRequests { + // Master is reporting that every peer it knows about is + // saturated. Trip our local cooldown so we don't pile on + // during the recovery window — the next ~60 s of getCreds + // calls will skip the HTTP round trip and go straight to + // the local solver (which on a single-IP deployment is + // often also saturated, but at least skipping spares us + // the 90 s remote timeout per session). 
+ cooldown := remoteCooldownDefault + if ra := httpResp.Header.Get("Retry-After"); ra != "" { + if secs, err := strconv.Atoi(strings.TrimSpace(ra)); err == nil && secs > 0 { + cooldown = time.Duration(secs) * time.Second + } + } + setRemoteCooldown(cooldown) + log.Printf("remote-captcha: master saturated, cooling down for %v", cooldown) + msg := resp.Error + if msg == "" { + msg = "all peers saturated" + } + return "", "", "", fmt.Errorf("server: %s", msg) + } + if httpResp.StatusCode != http.StatusOK { + msg := resp.Error + if msg == "" { + msg = fmt.Sprintf("HTTP %d", httpResp.StatusCode) + } + return "", "", "", fmt.Errorf("server: %s", msg) + } + if resp.User == "" || resp.Pass == "" || resp.Addr == "" { + return "", "", "", fmt.Errorf("server returned incomplete creds") + } + captchaRemoteOK.Add(1) + return resp.User, resp.Pass, resp.Addr, nil +} + +// getCredsRouted picks local vs remote at call time. The first +// `remoteHandoverThreshold` cred acquisitions stay local (regardless +// of how many recycle-fallbacks happen) so the WG tunnel can come up +// on the user's own mobile IP; after that, calls prefer the remote +// service. Remote failures cleanly fall through to local — same +// recycle-pool behaviour as before, no client-side regression. +// +// When the master is in 429-cooldown (see setRemoteCooldown), skip +// the HTTP attempt entirely. The cooldown is established by a real +// 429 response; once the window passes the next call will try remote +// again. This keeps the recovery-window log clean and stops us from +// burning 90 s timeouts on each session-spawn while the cluster +// recovers. 
+func getCredsRouted(ctx context.Context, link string) (string, string, string, error) { + useRemote := remoteCaptchaEnabled() && captchaSessionsReady.Load() >= int64(remoteHandoverThreshold) + if useRemote && !remoteInCooldown() { + u, p, a, err := getCredsRemote(ctx, link) + if err == nil { + log.Printf("remote-captcha: cred from server (sessions_ready=%d)", captchaSessionsReady.Load()) + return u, p, a, nil + } + log.Printf("remote-captcha: server call failed (%v) — falling back to local", err) + } + u, p, a, err := getCreds(ctx, link) + // Local path can defer to remote when the goroutine was queued + // behind manualCaptchaSerialise long enough for the first session + // to come up. Honour the deferral and try the server cluster. + if errors.Is(err, errDeferToRemote) && remoteCaptchaEnabled() && !remoteInCooldown() { + log.Printf("remote-captcha: local deferred to remote, retrying via server") + ru, rp, ra, rerr := getCredsRemote(ctx, link) + if rerr == nil { + return ru, rp, ra, nil + } + log.Printf("remote-captcha: deferred retry failed (%v) — returning original local error", rerr) + } + return u, p, a, err +} + +// Compile-time sanity: keep "unsafe" import referenced if cgo tooling +// ever decides it's "unused". +var _ unsafe.Pointer diff --git a/wireguard-apple/Sources/WireGuardKitGo/stream_aggregation.go b/wireguard-apple/Sources/WireGuardKitGo/stream_aggregation.go new file mode 100644 index 0000000..7e376b9 --- /dev/null +++ b/wireguard-apple/Sources/WireGuardKitGo/stream_aggregation.go @@ -0,0 +1,90 @@ +// SPDX-License-Identifier: MIT +// +// Stream-Aggregation handshake compatible with the kiper292/vk-turn-proxy +// server fork. 
When enabled, every DTLS session this client establishes +// writes a 17-byte preamble immediately after the DTLS handshake +// completes: +// +// bytes 0..15: Session ID (UUID v4 binary, shared across all N streams) +// byte 16: Stream ID (0..N-1) +// +// The receiver-side aggregator reads this preamble, groups every stream +// that carries the same Session ID under one logical session, and +// presents them to the upstream WireGuard server as a SINGLE endpoint. +// That stops the WG server from endpoint-thrashing when N parallel TURN +// allocations deliver packets from N distinct VK relay ports. +// +// Without a compatible server-side aggregator the preamble would be +// fed directly into WireGuard as the first bytes of "WG data", garbling +// the very first handshake. The flag therefore defaults to off and is +// only toggled on by Swift at StartProxy time when the active profile +// has streamAggregation=true. + +package main + +/* +#include +*/ +import "C" + +import ( + "crypto/rand" + "sync" + "sync/atomic" +) + +var ( + streamAggEnabled atomic.Bool // true ⇔ write the 17-byte preamble on each session + + streamAggSessionMu sync.Mutex + streamAggSessionID [16]byte // re-rolled once per StartProxy when the flag is on + streamAggHasID bool +) + +//export TurnBridgeSetStreamAggregation +func TurnBridgeSetStreamAggregation(enabled C.int) { + streamAggEnabled.Store(enabled != 0) + if enabled == 0 { + // Clear the cached session ID so the next "on" re-rolls a fresh one. + streamAggSessionMu.Lock() + streamAggHasID = false + streamAggSessionMu.Unlock() + } +} + +func streamAggIsEnabled() bool { + return streamAggEnabled.Load() +} + +// freshStreamAggSession re-rolls the shared Session ID. Called from +// StartProxy at the moment all N sessions are about to be spawned, so +// every set of TURN allocations from a single connect attempt shares +// one ID and the server-side aggregator can fuse them. 
+func freshStreamAggSession() [16]byte { + streamAggSessionMu.Lock() + defer streamAggSessionMu.Unlock() + if _, err := rand.Read(streamAggSessionID[:]); err != nil { + // crypto/rand failing on iOS is practically impossible, but if it + // does we'd rather have a fixed-zero ID than crash StartProxy; + // the aggregator will at least bucket all streams together. + for i := range streamAggSessionID { + streamAggSessionID[i] = 0 + } + } + // Set the UUID v4 marker bits (RFC 4122 §4.4) so the bytes look like + // a valid v4 UUID on the wire — matches what the reference Go + // implementation produces via uuid.New().MarshalBinary(). + streamAggSessionID[6] = (streamAggSessionID[6] & 0x0f) | 0x40 + streamAggSessionID[8] = (streamAggSessionID[8] & 0x3f) | 0x80 + streamAggHasID = true + return streamAggSessionID +} + +func currentStreamAggSession() ([16]byte, bool) { + streamAggSessionMu.Lock() + defer streamAggSessionMu.Unlock() + if !streamAggHasID { + return [16]byte{}, false + } + return streamAggSessionID, true +} diff --git a/wireguard-apple/Sources/WireGuardKitGo/turn_min.go b/wireguard-apple/Sources/WireGuardKitGo/turn_min.go new file mode 100644 index 0000000..99741ea --- /dev/null +++ b/wireguard-apple/Sources/WireGuardKitGo/turn_min.go @@ -0,0 +1,670 @@ +// turn_min.go — hand-rolled minimal TURN client (RFC 5766). +// +// Replaces github.com/pion/turn/v5 in the data path. pion's client +// maintains rich permission/channel state, multi-peer dispatch, RFC +// 6062 TCP-allocation support, and a handful of background goroutines +// per Client. For our use-case — one peer per allocation (the WG +// client), UDP-or-STUN-over-TCP transport, one bound channel for that +// one peer — that's a lot of dead weight to carry per session and the +// per-session struct + maps were a meaningful slice of the ~2 MB/ +// session steady-state we measured in the memory audit. 
+// +// What's here: +// - Allocate with long-term auth (two-pass nonce challenge per +// RFC 5389 §10.2 + RFC 5766 §6.2) +// - ChannelBind for the single peer +// - Refresh on a half-lifetime ticker, cancel via context +// - ChannelData frame encode/decode in the hot path +// - net.PacketConn surface matching pion's allocation.Conn +// +// What's NOT here: +// - CreatePermission — superseded by ChannelBind which also installs +// the permission (RFC 5766 §11.2) +// - Send/Data indications — we always use the bound channel +// - Multi-peer allocations (we have exactly one peer) +// - Fingerprint / short-term auth +// - TURN-TCP (RFC 6062) + +package main + +import ( + "context" + "encoding/binary" + "errors" + "fmt" + "log" + "net" + "os" + "sync" + "sync/atomic" + "time" + + "github.com/pion/stun/v3" +) + +// TURN attribute types beyond what pion/stun exports as named +// constants. Values from RFC 5766 §14. +const ( + attrChannelNumber stun.AttrType = 0x000C + attrLifetime stun.AttrType = 0x000D + attrXORPeerAddress stun.AttrType = 0x0012 + attrXORRelayedAddress stun.AttrType = 0x0016 + attrRequestedAddressFamily stun.AttrType = 0x0017 + attrRequestedTransport stun.AttrType = 0x0019 +) + +// Channel numbers must fall in 0x4000-0x4FFE (RFC 5766 §11). We only +// bind one channel per allocation, so a fixed value is fine. +const fixedChannelNumber uint16 = 0x4000 + +// RFC 5766 §6.2 — lifetime SHOULD default to 600s. We ask for that +// and refresh at half-life. +const allocLifetimeSec uint32 = 600 + +// RFC 5766 §14.7 — REQUESTED-TRANSPORT for UDP relay. +const transportUDP byte = 17 + +// Address-family attribute values (RFC 6156 §4.1.1). +const ( + addrFamilyIPv4 byte = 0x01 + addrFamilyIPv6 byte = 0x02 +) + +// txRetryDelays is the RFC 5389 §7.2.1 retransmit schedule, slightly +// shortened: 500ms, 1s, 2s, 4s. 
// We don't need the full 7-retry 39.5s ladder because failed
// allocations get retried at a higher layer (oneTurnConnectionLoop)
// and we'd rather fail fast and let the session recycle on a fresh
// socket. do() sends the request once per entry and waits that long
// for a response before retransmitting.
var txRetryDelays = []time.Duration{
	500 * time.Millisecond,
	1 * time.Second,
	2 * time.Second,
	4 * time.Second,
}

// minimalTURNAlloc represents a single live TURN allocation. It
// satisfies net.PacketConn so it can drop into the same slot as
// pion's relayConn in oneTurnConnection.
type minimalTURNAlloc struct {
	conn   net.PacketConn // transport (connectedUDPConn or *turn.STUNConn equivalent)
	server net.Addr       // destination for WriteTo on UDP; ignored by STUNConn

	// Long-term credentials. realm and nonce are filled in by
	// learnAuth from the server's error responses; addAuth refuses to
	// sign a request until both are present.
	user, pass string
	realm      []byte
	nonce      atomic.Value // []byte; stored by learnAuth (401 challenge, 438 stale nonce)

	peer            *net.UDPAddr  // the one peer bound to fixedChannelNumber
	relayedAddr     *net.UDPAddr  // XOR-RELAYED-ADDRESS, set by parseAllocSuccess
	currentLifetime time.Duration // LIFETIME from the allocate response (allocLifetimeSec if absent)

	// Transactions are serialised: only Allocate / ChannelBind /
	// Refresh run, and only one at a time (Refresh waits for the
	// previous to finish before firing). One pending slot is enough.
	//
	// NOTE(review): Close also issues a Refresh(lifetime=0) from the
	// caller's goroutine, which can overlap refreshLoop's periodic
	// Refresh — confirm a single slot is sufficient for that case.
	pendingMu sync.Mutex
	pendingTx [stun.TransactionIDSize]byte // transaction ID readLoop matches responses against
	pendingCh chan *stun.Message           // nil when no transaction is in flight

	// Data path. The read loop demuxes inbound frames: ChannelData
	// goes onto inboundData; everything else (STUN responses /
	// indications) is steered to pendingCh by transaction-ID match.
	inboundData chan []byte

	// closed is closed exactly once (guarded by closeOnce), either by
	// Close or by readLoop when the transport read fails.
	closed    chan struct{}
	closeOnce sync.Once

	// SetReadDeadline: past-time wakeup mirrors the idiom in
	// fanoutPacketConn / pipeConn — set deadline = now to interrupt
	// the in-flight ReadFrom with os.ErrDeadlineExceeded.
	deadlineMu sync.Mutex
	wakeup     chan struct{} // capacity 1; a token makes ReadFrom return os.ErrDeadlineExceeded
	timer      *time.Timer   // pending future deadline, nil when none is armed
}

// LocalAddr returns the relayed transport address — the address peers
// dial to reach us through the TURN server. oneTurnConnection only
// reads this for the "relayed-address=" log line.
+func (a *minimalTURNAlloc) LocalAddr() net.Addr { + return a.relayedAddr +} + +// ReadFrom blocks until the next data frame arrives from the bound +// peer. The returned addr is always the bound peer — we never accept +// data from any other source because we never CreatePermission'd / +// ChannelBind'd any other peer. +func (a *minimalTURNAlloc) ReadFrom(buf []byte) (int, net.Addr, error) { + select { + case data, ok := <-a.inboundData: + if !ok { + return 0, nil, net.ErrClosed + } + n := copy(buf, data) + return n, a.peer, nil + case <-a.closed: + return 0, nil, net.ErrClosed + case <-a.wakeup: + return 0, nil, os.ErrDeadlineExceeded + } +} + +// WriteTo wraps buf in a ChannelData frame and sends it through the +// transport. The addr argument is ignored — we always send to the +// bound peer via the bound channel. This matches the contract that +// oneTurnConnection relies on (it calls WriteTo with the same peer +// every time). +func (a *minimalTURNAlloc) WriteTo(buf []byte, _ net.Addr) (int, error) { + select { + case <-a.closed: + return 0, net.ErrClosed + default: + } + + frame := encodeChannelData(fixedChannelNumber, buf) + _, err := a.conn.WriteTo(frame, a.server) + if err != nil { + return 0, err + } + return len(buf), nil +} + +func (a *minimalTURNAlloc) Close() error { + a.closeOnce.Do(func() { + // Best-effort delete on the server side: Refresh with + // lifetime=0 explicitly tears down the allocation per + // RFC 5766 §7. If it fails (network gone, server already + // expired us) it doesn't matter — the allocation would + // time out within ~10 min anyway. 
+ ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + _ = a.refresh(ctx, 0) + cancel() + close(a.closed) + }) + return nil +} + +func (a *minimalTURNAlloc) SetDeadline(t time.Time) error { + return a.SetReadDeadline(t) +} + +func (a *minimalTURNAlloc) SetReadDeadline(t time.Time) error { + a.deadlineMu.Lock() + defer a.deadlineMu.Unlock() + + if a.timer != nil { + a.timer.Stop() + a.timer = nil + } + if t.IsZero() { + return nil + } + d := time.Until(t) + if d <= 0 { + select { + case a.wakeup <- struct{}{}: + default: + } + return nil + } + a.timer = time.AfterFunc(d, func() { + select { + case a.wakeup <- struct{}{}: + default: + } + }) + return nil +} + +// SetWriteDeadline is a no-op. Our WriteTo doesn't block (channel-data +// goes straight to the underlying conn, which is either UDP — drops on +// overflow — or TCP via STUNConn, where the kernel buffers). +func (a *minimalTURNAlloc) SetWriteDeadline(t time.Time) error { + return nil +} + +// minimalTURNAllocate dials a fresh TURN allocation on conn, binds a +// channel for peer, and starts the read+refresh goroutines. The +// caller's ctx is used only for the allocate handshake; subsequent +// lifetime is bounded by Close. +func minimalTURNAllocate( + ctx context.Context, + conn net.PacketConn, + server net.Addr, + user, pass string, + peer *net.UDPAddr, +) (*minimalTURNAlloc, error) { + alloc := &minimalTURNAlloc{ + conn: conn, + server: server, + user: user, + pass: pass, + peer: peer, + inboundData: make(chan []byte, 64), + closed: make(chan struct{}), + wakeup: make(chan struct{}, 1), + } + alloc.nonce.Store([]byte(nil)) + + // Spawn the read demultiplexer before any handshake so allocate + // responses can reach pendingCh. + go alloc.readLoop() + + // On error, leave conn lifecycle to the caller — they hold the + // defer that closes it. Just signal our internal closed channel so + // the just-spawned readLoop unwinds. 
+ if err := alloc.allocate(ctx); err != nil { + alloc.closeOnce.Do(func() { close(alloc.closed) }) + return nil, fmt.Errorf("allocate: %w", err) + } + if err := alloc.channelBind(ctx); err != nil { + alloc.closeOnce.Do(func() { close(alloc.closed) }) + return nil, fmt.Errorf("channelBind: %w", err) + } + go alloc.refreshLoop() + return alloc, nil +} + +// allocate runs the full RFC 5389 §10.2 long-term-auth two-pass +// handshake: first request is anonymous and expected to get back a +// 401 with REALM and NONCE; second pass adds USERNAME/REALM/NONCE/ +// MESSAGE-INTEGRITY and gets back XOR-RELAYED-ADDRESS. +func (a *minimalTURNAlloc) allocate(ctx context.Context) error { + family := addrFamilyIPv4 + if a.peer.IP.To4() == nil { + family = addrFamilyIPv6 + } + + build := func(withAuth bool) (*stun.Message, error) { + m := stun.New() + m.SetType(stun.NewType(stun.MethodAllocate, stun.ClassRequest)) + if err := m.NewTransactionID(); err != nil { + return nil, err + } + // WriteHeader stamps the STUN magic cookie 0x2112A442 into + // m.Raw[4:8]. Without this, MessageIntegrity.AddTo (called + // later by addAuth) computes the HMAC over m.Raw with + // cookie=0 — but the wire bytes go out with the real cookie + // (Encode writes it), so the server's recomputed HMAC over + // the received bytes doesn't match → 401. SetType wrote + // [0:2] and NewTransactionID wrote [8:20], but nothing else + // touches [4:8] until Encode, which runs too late. This was + // the 1.3.9 ship-blocker. + m.WriteHeader() + m.Add(attrRequestedTransport, []byte{transportUDP, 0, 0, 0}) + m.Add(attrRequestedAddressFamily, []byte{family, 0, 0, 0}) + if withAuth { + if err := a.addAuth(m); err != nil { + return nil, err + } + } + m.Encode() + return m, nil + } + + // First attempt — anonymous. RFC 5389 spells out that the server + // MUST 401 this with REALM/NONCE for long-term-credential mode. 
+ first, err := build(false) + if err != nil { + return err + } + resp, err := a.do(ctx, first) + if err != nil { + return err + } + if resp.Type.Class == stun.ClassSuccessResponse { + // Server happens to accept unauthenticated allocate (rare). + return a.parseAllocSuccess(resp) + } + if err := a.learnAuth(resp); err != nil { + return fmt.Errorf("learn auth: %w", err) + } + + // Second attempt — authenticated. + second, err := build(true) + if err != nil { + return err + } + resp, err = a.do(ctx, second) + if err != nil { + return err + } + if resp.Type.Class != stun.ClassSuccessResponse { + // On 438 Stale Nonce, server may have rotated the nonce + // between the 401 and our second request. Pick up the new + // nonce and retry once. + if isStaleNonce(resp) { + if err := a.learnAuth(resp); err != nil { + return err + } + retry, err := build(true) + if err != nil { + return err + } + resp, err = a.do(ctx, retry) + if err != nil { + return err + } + if resp.Type.Class != stun.ClassSuccessResponse { + return errorFrom(resp) + } + } else { + return errorFrom(resp) + } + } + return a.parseAllocSuccess(resp) +} + +func (a *minimalTURNAlloc) parseAllocSuccess(m *stun.Message) error { + var relayed stun.XORMappedAddress + if err := relayed.GetFromAs(m, attrXORRelayedAddress); err != nil { + return fmt.Errorf("XOR-RELAYED-ADDRESS: %w", err) + } + a.relayedAddr = &net.UDPAddr{IP: relayed.IP, Port: relayed.Port} + + lifetime := allocLifetimeSec + if raw, err := m.Get(attrLifetime); err == nil && len(raw) == 4 { + lifetime = binary.BigEndian.Uint32(raw) + } + a.currentLifetime = time.Duration(lifetime) * time.Second + return nil +} + +// channelBind binds fixedChannelNumber to a.peer. Per RFC 5766 §11.2, +// this also installs a permission for the peer's IP, so we don't need +// a separate CreatePermission. 
+func (a *minimalTURNAlloc) channelBind(ctx context.Context) error { + build := func() (*stun.Message, error) { + m := stun.New() + m.SetType(stun.NewType(stun.MethodChannelBind, stun.ClassRequest)) + if err := m.NewTransactionID(); err != nil { + return nil, err + } + m.WriteHeader() // stamp magic cookie — see allocate() + var chBuf [4]byte + binary.BigEndian.PutUint16(chBuf[0:2], fixedChannelNumber) + m.Add(attrChannelNumber, chBuf[:]) + xor := stun.XORMappedAddress{IP: a.peer.IP, Port: a.peer.Port} + if err := xor.AddToAs(m, attrXORPeerAddress); err != nil { + return nil, err + } + if err := a.addAuth(m); err != nil { + return nil, err + } + m.Encode() + return m, nil + } + for attempt := 0; attempt < 2; attempt++ { + req, err := build() + if err != nil { + return err + } + resp, err := a.do(ctx, req) + if err != nil { + return err + } + if resp.Type.Class == stun.ClassSuccessResponse { + return nil + } + if isStaleNonce(resp) && attempt == 0 { + if err := a.learnAuth(resp); err != nil { + return err + } + continue + } + return errorFrom(resp) + } + return errors.New("channel bind: out of retries") +} + +// refresh sends a Refresh request with the given lifetime (or 0 to +// destroy the allocation). On 438 Stale Nonce it re-learns and +// retries once. 
+func (a *minimalTURNAlloc) refresh(ctx context.Context, lifetimeSec uint32) error { + build := func() (*stun.Message, error) { + m := stun.New() + m.SetType(stun.NewType(stun.MethodRefresh, stun.ClassRequest)) + if err := m.NewTransactionID(); err != nil { + return nil, err + } + m.WriteHeader() // stamp magic cookie — see allocate() + var lifeBuf [4]byte + binary.BigEndian.PutUint32(lifeBuf[:], lifetimeSec) + m.Add(attrLifetime, lifeBuf[:]) + if err := a.addAuth(m); err != nil { + return nil, err + } + m.Encode() + return m, nil + } + for attempt := 0; attempt < 2; attempt++ { + req, err := build() + if err != nil { + return err + } + resp, err := a.do(ctx, req) + if err != nil { + return err + } + if resp.Type.Class == stun.ClassSuccessResponse { + return nil + } + if isStaleNonce(resp) && attempt == 0 { + if err := a.learnAuth(resp); err != nil { + return err + } + continue + } + return errorFrom(resp) + } + return errors.New("refresh: out of retries") +} + +func (a *minimalTURNAlloc) refreshLoop() { + half := a.currentLifetime / 2 + if half < 30*time.Second { + half = 30 * time.Second + } + t := time.NewTicker(half) + defer t.Stop() + for { + select { + case <-a.closed: + return + case <-t.C: + ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second) + err := a.refresh(ctx, allocLifetimeSec) + cancel() + if err != nil { + log.Printf("turn-min: refresh failed: %s", err) + return + } + } + } +} + +// addAuth stamps USERNAME, REALM, NONCE and MESSAGE-INTEGRITY onto m. +// All three text attrs MUST be present for the integrity check to +// validate per RFC 5389 §15.4. 
+func (a *minimalTURNAlloc) addAuth(m *stun.Message) error { + nonce, _ := a.nonce.Load().([]byte) + if nonce == nil || a.realm == nil { + return errors.New("auth not yet learned") + } + if err := stun.NewUsername(a.user).AddTo(m); err != nil { + return err + } + if err := stun.Realm(a.realm).AddTo(m); err != nil { + return err + } + if err := stun.Nonce(nonce).AddTo(m); err != nil { + return err + } + return stun.NewLongTermIntegrity(a.user, string(a.realm), a.pass).AddTo(m) +} + +// learnAuth pulls REALM and NONCE from a 401 / 438 error response. +func (a *minimalTURNAlloc) learnAuth(m *stun.Message) error { + var realm stun.Realm + if err := realm.GetFrom(m); err != nil { + return fmt.Errorf("REALM: %w", err) + } + a.realm = []byte(realm) + var nonce stun.Nonce + if err := nonce.GetFrom(m); err != nil { + return fmt.Errorf("NONCE: %w", err) + } + a.nonce.Store([]byte(nonce)) + return nil +} + +// do sends m and returns the matching response. It serialises on +// pendingMu so we never have more than one in-flight transaction — +// the existing call sites are sequential (allocate → channelBind → +// periodic refresh) so this is the natural shape. +func (a *minimalTURNAlloc) do(ctx context.Context, m *stun.Message) (*stun.Message, error) { + a.pendingMu.Lock() + a.pendingTx = m.TransactionID + ch := make(chan *stun.Message, 1) + a.pendingCh = ch + a.pendingMu.Unlock() + + defer func() { + a.pendingMu.Lock() + a.pendingCh = nil + a.pendingMu.Unlock() + }() + + // Retransmit ladder. UDP transport may lose the request or its + // response; TCP transport doesn't need the retries but they're + // harmless because the server demuxes by transaction ID. 
+ for _, delay := range txRetryDelays { + if _, err := a.conn.WriteTo(m.Raw, a.server); err != nil { + return nil, err + } + select { + case resp := <-ch: + return resp, nil + case <-time.After(delay): + continue + case <-ctx.Done(): + return nil, ctx.Err() + case <-a.closed: + return nil, net.ErrClosed + } + } + return nil, errors.New("transaction timeout") +} + +// readLoop is the one and only goroutine that pulls from the +// underlying transport. It demuxes between STUN frames (which feed +// pendingCh) and ChannelData (which feeds inboundData). +func (a *minimalTURNAlloc) readLoop() { + buf := borrowReadBuf() + defer returnReadBuf(buf) + for { + n, _, err := a.conn.ReadFrom(buf) + if err != nil { + // Surface the close downstream so a blocked ReadFrom + // returns net.ErrClosed instead of hanging. + a.closeOnce.Do(func() { close(a.closed) }) + return + } + if n < 4 { + continue + } + if isChannelData(buf[:n]) { + payload, ok := decodeChannelData(buf[:n]) + if !ok { + continue + } + // Copy because buf gets reused on the next loop iteration. + cp := make([]byte, len(payload)) + copy(cp, payload) + select { + case a.inboundData <- cp: + case <-a.closed: + return + default: + // Drop on consumer back-pressure. Matches udp_fanout + // and packet_pipe behavior under UDP semantics — + // better than blocking the read loop, which would + // freeze the allocation entirely. + } + continue + } + // STUN frame — could be a response (matches pending tx ID) + // or an indication (Data indication if peer-sent us data via + // a path that didn't use channel binding). Indications are + // ignored: the peer ChannelBound, so all real traffic comes + // in via ChannelData. 
+ msg := &stun.Message{Raw: append([]byte(nil), buf[:n]...)} + if err := msg.Decode(); err != nil { + continue + } + a.pendingMu.Lock() + ch := a.pendingCh + expected := a.pendingTx + a.pendingMu.Unlock() + if ch != nil && msg.TransactionID == expected { + select { + case ch <- msg: + default: + // Slot already filled — duplicate retransmit reply, + // safe to drop. + } + } + } +} + +// encodeChannelData wraps payload in a ChannelData frame per +// RFC 5766 §11.5. The 4-byte header is followed by payload and (for +// TCP/TLS transports) zero-padded to a 4-byte boundary. UDP doesn't +// require padding but accepts it, so we always pad — the few bytes +// of waste aren't worth a branch. +func encodeChannelData(channel uint16, payload []byte) []byte { + padLen := (4 - (len(payload) & 3)) & 3 + frame := make([]byte, 4+len(payload)+padLen) + binary.BigEndian.PutUint16(frame[0:2], channel) + binary.BigEndian.PutUint16(frame[2:4], uint16(len(payload))) + copy(frame[4:], payload) + return frame +} + +// decodeChannelData returns the unframed payload (no padding) and +// true if the frame is well-formed. We assume the caller has already +// run isChannelData to disambiguate from STUN. +func decodeChannelData(frame []byte) ([]byte, bool) { + if len(frame) < 4 { + return nil, false + } + dataLen := int(binary.BigEndian.Uint16(frame[2:4])) + if 4+dataLen > len(frame) { + return nil, false + } + return frame[4 : 4+dataLen], true +} + +// isChannelData distinguishes ChannelData from STUN. STUN's first two +// bits are zero (so first byte < 0x40); valid channel numbers start +// at 0x4000. 
+func isChannelData(b []byte) bool { + if len(b) < 4 { + return false + } + ch := binary.BigEndian.Uint16(b[0:2]) + return ch >= 0x4000 && ch <= 0x4FFE +} + +func isStaleNonce(m *stun.Message) bool { + var ec stun.ErrorCodeAttribute + if err := ec.GetFrom(m); err != nil { + return false + } + return ec.Code == 438 +} + +func errorFrom(m *stun.Message) error { + var ec stun.ErrorCodeAttribute + if err := ec.GetFrom(m); err != nil { + return fmt.Errorf("turn server returned non-success without ERROR-CODE") + } + return fmt.Errorf("turn server error %d: %s", ec.Code, ec.Reason) +} diff --git a/wireguard-apple/Sources/WireGuardKitGo/turn_proxy.go b/wireguard-apple/Sources/WireGuardKitGo/turn_proxy.go index 0b9bc88..8f6be07 100755 --- a/wireguard-apple/Sources/WireGuardKitGo/turn_proxy.go +++ b/wireguard-apple/Sources/WireGuardKitGo/turn_proxy.go @@ -21,16 +21,17 @@ import ( "fmt" "io" "log" + "math/rand" "net" "net/http" neturl "net/url" + "strconv" "sync" "sync/atomic" "time" "unsafe" "strings" - "github.com/cbeuw/connutil" "github.com/google/uuid" "github.com/pion/dtls/v3" "github.com/pion/dtls/v3/pkg/crypto/selfsign" @@ -42,6 +43,69 @@ var proxyLoggerFunc C.proxy_logger_fn_t var proxyLoggerCtx unsafe.Pointer var proxyCancel context.CancelFunc +// Session registry — every live DTLS/TURN session registers its +// cancel func so ProxyForceReconnect() can tear them all down at once +// (e.g. when iOS wakes the device after sleep and we want fresh +// allocations before WireGuard resumes pumping packets). 
+var ( + sessionMu sync.Mutex + sessionCancels = map[uint64]context.CancelFunc{} + sessionIDSource uint64 +) + +func registerSession(cancel context.CancelFunc) func() { + id := atomic.AddUint64(&sessionIDSource, 1) + sessionMu.Lock() + sessionCancels[id] = cancel + sessionMu.Unlock() + return func() { + sessionMu.Lock() + delete(sessionCancels, id) + sessionMu.Unlock() + } +} + +//export ProxyForceReconnect +func ProxyForceReconnect() { + sessionMu.Lock() + cancels := make([]context.CancelFunc, 0, len(sessionCancels)) + for _, c := range sessionCancels { + cancels = append(cancels, c) + } + sessionMu.Unlock() + for _, c := range cancels { + c() + } + // Network-path changes (which is what triggers a force-reconnect + // 90% of the time) are exactly when half-dead persistConns + // accumulate in the HTTP idle pool — IdleConnTimeout=90s won't + // catch them because the socket isn't naturally idle, it's + // silently broken. Drop them all so the next captcha solve + // dials fresh sockets instead of reusing zombies. + flushHTTPIdleConns() + log.Printf("ProxyForceReconnect: cancelled %d live session(s)", len(cancels)) +} + +// sleepCtx blocks for d, returning early (with ctx.Err()) if ctx +// fires first. Unlike `select { case <-ctx.Done(): case <-time.After(d): }`, +// this releases the underlying Timer immediately when ctx wins — +// so it doesn't leak a Timer object + runtime goroutine on every +// abandoned wait. In a DTLS reconnect storm this accumulates fast. 
+func sleepCtx(ctx context.Context, d time.Duration) error { + if d <= 0 { + return ctx.Err() + } + timer := time.NewTimer(d) + defer timer.Stop() + select { + case <-ctx.Done(): + return ctx.Err() + case <-timer.C: + return nil + } +} + + //export ProxySetLogger func ProxySetLogger(context unsafe.Pointer, loggerFn C.proxy_logger_fn_t) { proxyLoggerCtx = context @@ -81,9 +145,34 @@ func init() { log.SetOutput(ProxyLogger(0)) } -type getCredsFunc func(string) (string, string, string, error) +type getCredsFunc func(context.Context, string) (string, string, string, error) + +// sharedAuthClient is the package-level HTTP client used by getCreds +// for the 8-RT VK auth + identity-registration pipeline. Sharing one +// client across every getCreds invocation amortises TLS handshakes +// (~300-500 ms each) over the connection pool — previously each +// getCreds built a fresh http.Client whose defer CloseIdleConnections +// destroyed the idle pool the moment the function returned, so every +// one of 4×N=200 round trips paid full handshake cost. The captcha +// solver uses its own client (newCaptchaClient) because it needs a +// per-attempt cookie jar. +var sharedAuthClient = &http.Client{ + Timeout: 20 * time.Second, + Transport: &http.Transport{ + // customDial layers system DNS → DoH (1.1.1.1) → hardcoded + // VK fallback IPs. Russian mobile carriers regularly + // NXDOMAIN login.vk.com / api.vk.com, so without this + // fallback the very first get_anonym_token POST dies on + // lookup before any captcha logic engages. See + // dns_resolver.go. 
+ DialContext: customDial, + MaxIdleConns: 100, + MaxIdleConnsPerHost: 100, + IdleConnTimeout: 90 * time.Second, + }, +} -func getCreds(link string) (resUser string, resPass string, resTurn string, resErr error) { +func getCreds(ctx context.Context, link string) (resUser string, resPass string, resTurn string, resErr error) { profile := getRandomProfile() name := generateName() escapedName := neturl.QueryEscape(name) @@ -91,17 +180,7 @@ func getCreds(link string) (resUser string, resPass string, resTurn string, resE log.Printf("Connecting - Name: %s | UA: %s", name, profile.UserAgent) doRequest := func(data string, url string) (resp map[string]interface{}, err error) { - - client := &http.Client{ - Timeout: 20 * time.Second, - Transport: &http.Transport{ - MaxIdleConns: 100, - MaxIdleConnsPerHost: 100, - IdleConnTimeout: 90 * time.Second, - }, - } - defer client.CloseIdleConnections() - req, err := http.NewRequest("POST", url, bytes.NewBuffer([]byte(data))) + req, err := http.NewRequestWithContext(ctx, "POST", url, bytes.NewBuffer([]byte(data))) if err != nil { return nil, err } @@ -109,7 +188,7 @@ func getCreds(link string) (resUser string, resPass string, resTurn string, resE req.Header.Add("User-Agent", profile.UserAgent) req.Header.Add("Content-Type", "application/x-www-form-urlencoded") - httpResp, err := client.Do(req) + httpResp, err := sharedAuthClient.Do(req) if err != nil { return nil, err } @@ -141,7 +220,7 @@ func getCreds(link string) (resUser string, resPass string, resTurn string, resE }() data := "client_id=6287487&token_type=messages&client_secret=QbYic1K3lEV5kTGiqlq2&version=1&app_id=6287487" - url := "https://login.vk.ru/?act=get_anonym_token" + url := "https://login.vk.com/?act=get_anonym_token" resp, err := doRequest(data, url) if err != nil { @@ -151,7 +230,7 @@ func getCreds(link string) (resUser string, resPass string, resTurn string, resE token1 := resp["data"].(map[string]interface{})["access_token"].(string) data = 
fmt.Sprintf("vk_join_link=https://vk.com/call/join/%s&name=%s&access_token=%s", link, escapedName, token1) - reqURL := "https://api.vk.ru/method/calls.getAnonymousToken?v=5.274&client_id=6287487" + reqURL := "https://api.vk.com/method/calls.getAnonymousToken?v=5.274&client_id=6287487" var token2 string const maxCaptchaAttempts = 3 @@ -172,15 +251,70 @@ func getCreds(link string) (resUser string, resPass string, resTurn string, resE if captchaErr.IsCaptchaError() { log.Printf("[Captcha] Attempt %d/%d: solving...", attempt+1, maxCaptchaAttempts) - successToken, solveErr := solveVkCaptcha(context.Background(), captchaErr) + if captchaErr.CaptchaAttempt == "0" || captchaErr.CaptchaAttempt == "" { + captchaErr.CaptchaAttempt = "1" + } + + // Build the retry body template up front so the + // WebView can replay it inside the same browser + // session that solved the captcha — VK then sees + // one coherent actor (cookies / fingerprint / IP) + // instead of "token minted here, redeemed + // somewhere else". Literal "__TOKEN__" gets swapped + // for the actual success_token inside the WebView's + // injected JS. WebView passes the JSON response + // back via TurnBridgeSubmitManualCaptchaResponse; + // see solveVkCaptcha's return shape. + retryURL := reqURL + retryBody := fmt.Sprintf("vk_join_link=https://vk.com/call/join/%s&name=%s"+ + "&captcha_key=&captcha_sid=%s&is_sound_captcha=0&success_token=__TOKEN__"+ + "&captcha_ts=%s&captcha_attempt=%s&access_token=%s", + link, escapedName, captchaErr.CaptchaSid, + captchaErr.CaptchaTs, captchaErr.CaptchaAttempt, token1) + + successToken, manualResp, solveErr := solveVkCaptcha(ctx, captchaErr, retryURL, retryBody) if solveErr != nil { - return "", "", "", fmt.Errorf("captcha solve error: %v", solveErr) + // %w not %v: lets getCredsRouted unwrap and + // check for errDeferToRemote sentinel. 
+ return "", "", "", fmt.Errorf("captcha solve error: %w", solveErr) } - if captchaErr.CaptchaAttempt == "0" || captchaErr.CaptchaAttempt == "" { - captchaErr.CaptchaAttempt = "1" + if manualResp != "" { + // WebView did the retry itself and gave us back + // the final JSON response. Splice it into the + // outer for-loop's resp variable so the next + // iteration of the loop (which checks resp for + // error vs success) sees what we'd have gotten + // from our own doRequest. Then `continue` — + // but with a special marker: we set data to + // empty so the next doRequest call would be a + // no-op; instead we short-circuit by parsing + // here. + var parsed map[string]interface{} + if jerr := json.Unmarshal([]byte(manualResp), &parsed); jerr != nil { + return "", "", "", fmt.Errorf("manual captcha response not JSON: %v (raw=%s)", jerr, manualResp) + } + resp = parsed + if errObj2, hasErr := resp["error"].(map[string]interface{}); hasErr { + return "", "", "", fmt.Errorf("VK API error in manual-retry response: %v", errObj2) + } + rspObj, ok := resp["response"].(map[string]interface{}) + if !ok { + return "", "", "", fmt.Errorf("manual-retry response missing 'response' field: %s", manualResp) + } + tok, ok := rspObj["token"].(string) + if !ok || tok == "" { + return "", "", "", fmt.Errorf("manual-retry response missing token: %s", manualResp) + } + token2 = tok + log.Printf("[Captcha] Used in-WebView retry response, token2 acquired") + break // exit the for-attempt loop with token2 set } + // Legacy path: WebView gave us just the token, do + // the retry ourselves from Go's HTTP client. VK + // may reject because of session mismatch — that's + // the failure mode the response_path above fixes. 
data = fmt.Sprintf("vk_join_link=https://vk.com/call/join/%s&name=%s"+ "&captcha_key=&captcha_sid=%s&is_sound_captcha=0&success_token=%s"+ "&captcha_ts=%s&captcha_attempt=%s&access_token=%s", @@ -249,13 +383,38 @@ func dtlsFunc(ctx context.Context, conn net.PacketConn, peer *net.UDPAddr) (net. return dtlsConn, nil } -func oneDtlsConnection(ctx context.Context, peer *net.UDPAddr, listenConn net.PacketConn, connchan chan<- net.PacketConn, okchan chan<- struct{}, c1 chan<- error) { +func oneDtlsConnection(ctx context.Context, peer *net.UDPAddr, listenConn net.PacketConn, connchan chan<- net.PacketConn, okchan chan<- struct{}, c1 chan<- error, streamID int) { var err error = nil defer func() { c1 <- err }() + sessionStart := time.Now() + + // Data-plane byte counters for this DTLS session. The two directions: + // wgToDtls: bytes read from listenConn (WG ciphertext at :9000) + // and written into dtlsConn (towards the TURN relay). + // dtlsToWg: bytes read from dtlsConn (decrypted DTLS payload + // coming back from the relay) and written into + // listenConn (towards the WG client). + // A periodic logger below prints both totals and 10s deltas so we + // can tell whether user traffic is actually flowing through the + // tunnel or whether it's just WG control-plane keepalives. + var wgToDtls, dtlsToWg atomic.Uint64 + + defer func() { + log.Printf("DTLS session lifetime=%s wg→dtls=%dB dtls→wg=%dB exit=%v", + time.Since(sessionStart).Round(time.Millisecond), + wgToDtls.Load(), dtlsToWg.Load(), err) + }() dtlsctx, dtlscancel := context.WithCancel(ctx) defer dtlscancel() + unregister := registerSession(dtlscancel) + defer unregister() var conn1, conn2 net.PacketConn - conn1, conn2 = connutil.AsyncPacketPipe() + conn1, conn2 = boundedPacketPipe() + // Bounded pipe caps in-flight queue per direction at + // boundedPipeDepth packets; overflow drops with UDP semantics. 
+ // Closing conn1 tears down both ends — pipePair shares one + // closed-channel that both pipeConns select on. + defer conn1.Close() go func() { for { select { @@ -278,7 +437,42 @@ func oneDtlsConnection(ctx context.Context, peer *net.UDPAddr, listenConn net.Pa log.Printf("Closed DTLS connection\n") }() log.Printf("Established DTLS connection!\n") - select { case proxyReady <- struct{}{}: default: } + + // Stream-Aggregation preamble: if enabled, write the 17-byte + // [sessionID, streamID] header BEFORE WireGuard packets start + // flowing through dtlsConn. The receiver-side aggregator + // (kiper292/vk-turn-proxy fork on the WG server's box) reads + // this once per stream and fuses every stream sharing the same + // session ID into a single endpoint for WG, stopping the WG + // server from endpoint-thrashing when N parallel TURN + // allocations deliver packets from N distinct VK relay ports. + // Without the flag set (default), nothing is written and the + // stream looks exactly like our pre-aggregation transport. + if streamAggIsEnabled() { + sid, ok := currentStreamAggSession() + if ok { + preamble := make([]byte, 17) + copy(preamble[:16], sid[:]) + preamble[16] = byte(streamID) + if _, werr := dtlsConn.Write(preamble); werr != nil { + log.Printf("stream-agg: preamble write failed on stream %d: %s", streamID, werr) + err = fmt.Errorf("stream-agg preamble: %s", werr) + return + } + log.Printf("stream-agg: stream %d preamble sent (sessionID=%x)", streamID, sid[:4]) + } + } + + // NOTE: do NOT signal proxyReady here. Signalling it the moment + // the FIRST DTLS session establishes causes Swift to call + // adapter.start() and iOS to bring up utun with the WG config's + // AllowedIPs=0.0.0.0/0 routing. 
If the user has nValue>1, the + // remaining N-1 sessions still need to fetch fresh VK creds — + // and that means the manual-captcha WebView in the app tries to + // load id.vk.ru AFTER utun is up, so the captcha sheet ends up + // routed through the half-built tunnel and never loads. The + // proxyReady signal is now sent from StartProxy once all N + // sessions have established their DTLS+TURN allocations. go func() { for { select { @@ -289,17 +483,103 @@ func oneDtlsConnection(ctx context.Context, peer *net.UDPAddr, listenConn net.Pa } }() + // Application-level keepalive over DTLS. + // + // WireGuard's PersistentKeepalive=25 only fires when WG itself is + // running. When iOS throttles or briefly suspends the Network + // Extension, WG's goroutine can miss its tick and the DTLS path + // goes silent — the VK TURN relay then drops the channel binding + // as 'idle' and the next real packet finds a dead path. + // + // We send a tiny sentinel packet over the DTLS conn every 5s so + // the TURN ChannelData is refreshed regardless of WG state. + // + // Sentinel: 0xFF 0xFF 0xFF 0xFF — invalid first byte for any + // WireGuard message type (valid: 0x01-0x04) and below WG's 32-byte + // minimum, so server-side vk-turn-proxy can drop it cheaply before + // forwarding to wg-quick@wg0. See companion patch in + // truvvor/vk-turn-proxy server/. + go func() { + keepalive := []byte{0xFF, 0xFF, 0xFF, 0xFF} + ticker := time.NewTicker(5 * time.Second) + defer ticker.Stop() + for { + select { + case <-dtlsctx.Done(): + return + case <-ticker.C: + if _, werr := dtlsConn.Write(keepalive); werr != nil { + log.Printf("keepalive write failed: %s", werr) + return + } + } + } + }() + wg := sync.WaitGroup{} wg.Add(2) context.AfterFunc(dtlsctx, func() { listenConn.SetDeadline(time.Now()) dtlsConn.SetDeadline(time.Now()) }) + + // Watchdog: catch sessions that were healthy then went silent. 
The + // narrow case it must catch: TURN allocation gets quietly killed + // (relay timeout, server restart, NAT rebinding) while DTLS stays + // up — wg→dtls writes keep "succeeding" into the void. + // + // The case it must NOT trigger on: DTLS handshake succeeded, WG + // came up, but the fanout dispatcher never round-robined a packet + // into this session yet. At N=60 only ~1/N sessions sees the WG + // keepalive every 25 s, so most sessions sit idle for minutes + // before being useful. Killing them on a 60 s timer just because + // they're idle-but-healthy created a reconnect storm: every cull + // burns a captcha solve, VK rate-limits, the replacement also + // gets culled, repeat. Memory was fine (1.3.12), throughput + // wasn't. + // + // Distinguishing the two: lastRxNanos starts at 0 (not now()), + // bumped to time.Now() on the first dtlsConn.Read in the read + // loop below. Watchdog only fires after lastRxNanos has actually + // been bumped — i.e., we've proven this session can carry + // traffic. Sessions that never get data sit and let minimal + // TURN's half-lifetime Refresh keep the allocation alive. + var lastRxNanos atomic.Int64 + go func() { + ticker := time.NewTicker(15 * time.Second) + defer ticker.Stop() + for { + select { + case <-dtlsctx.Done(): + return + case now := <-ticker.C: + lastNanos := lastRxNanos.Load() + if lastNanos == 0 { + continue // session has never received data yet, give it room + } + last := time.Unix(0, lastNanos) + if now.Sub(last) > 60*time.Second { + log.Printf("Watchdog: no inbound DTLS traffic for %s — forcing restart", now.Sub(last).Round(time.Second)) + dtlscancel() + return + } + } + } + }() + var addr atomic.Value + + // Note: byte counters keep accumulating into wgToDtls / dtlsToWg + // and surface in the per-session lifetime log on exit. The + // periodic 10s dump was useful while we were proving that user + // traffic actually flows through the tunnel, but now it's just + // line noise. 
+ go func() { defer wg.Done() defer dtlscancel() - buf := make([]byte, 1600) + buf := borrowReadBuf() + defer returnReadBuf(buf) for { select { case <-dtlsctx.Done(): @@ -319,13 +599,15 @@ func oneDtlsConnection(ctx context.Context, peer *net.UDPAddr, listenConn net.Pa log.Printf("Failed: %s", err1) return } + wgToDtls.Add(uint64(n)) } }() go func() { defer wg.Done() defer dtlscancel() - buf := make([]byte, 1600) + buf := borrowReadBuf() + defer returnReadBuf(buf) for { select { case <-dtlsctx.Done(): @@ -337,6 +619,7 @@ func oneDtlsConnection(ctx context.Context, peer *net.UDPAddr, listenConn net.Pa log.Printf("Failed: %s", err1) return } + lastRxNanos.Store(time.Now().UnixNano()) addr1, ok := addr.Load().(net.Addr) if !ok { log.Printf("Failed: no listener ip") @@ -348,6 +631,7 @@ func oneDtlsConnection(ctx context.Context, peer *net.UDPAddr, listenConn net.Pa log.Printf("Failed: %s", err1) return } + dtlsToWg.Add(uint64(n)) } }() @@ -365,17 +649,55 @@ func (c *connectedUDPConn) WriteTo(p []byte, _ net.Addr) (int, error) { } type turnParams struct { - host string - port string - link string + host string + port string + // links is a non-empty list of VK call-join links. The first + // call to nextLink returns links[0], then [1], rolling over + // after the last entry. With N>1 we hypothesise VK keys its + // per-IP captcha rate-limit on (source-IP, link) so spreading + // solves across multiple call IDs multiplies the effective + // budget proportionally. Also gives a chance of landing on + // different turn_server.urls[0] relays, each with its own + // voice-grade shaper. linkCursor advances atomically so + // concurrent oneTurnConnections don't all hit the same link + // simultaneously. 
+ links []string + linkCursor atomic.Uint64 + udp bool getCreds getCredsFunc } +func (p *turnParams) nextLink() string { + if len(p.links) == 0 { + return "" + } + idx := p.linkCursor.Add(1) - 1 + return p.links[int(idx)%len(p.links)] +} + func oneTurnConnection(ctx context.Context, turnParams *turnParams, peer *net.UDPAddr, conn2 net.PacketConn, c chan<- error) { var err error = nil defer func() { c <- err }() - user, pass, url, err1 := turnParams.getCreds(turnParams.link) + sessionStart := time.Now() + + // Data-plane byte counters on the TURN side. The two directions: + // conn2ToRelay: bytes read from conn2 (decrypted DTLS output that + // represents the WG packet) and written into + // relayConn (towards the WG server via the TURN + // server's relay). + // relayToConn2: bytes coming back from the relay and pushed into + // conn2 (which DTLS will re-encrypt for the client). + // The deferred lifetime log below mirrors the DTLS-side counters so a missing + // data path can be pinpointed to either the DTLS or TURN layer. + var conn2ToRelay, relayToConn2 atomic.Uint64 + + defer func() { + log.Printf("TURN session lifetime=%s conn2→relay=%dB relay→conn2=%dB exit=%v", + time.Since(sessionStart).Round(time.Millisecond), + conn2ToRelay.Load(), relayToConn2.Load(), err) + }() + user, pass, url, err1 := turnParams.getCreds(ctx, turnParams.nextLink()) if err1 != nil { err = fmt.Errorf("failed to get TURN credentials: %s", err1) return @@ -418,6 +740,11 @@ func oneTurnConnection(ctx context.Context, turnParams *turnParams, peer *net.UD return } }() + // Same buffer concern as listenConn, but on the wire side: a + // page-load burst arrives at the device from the relay over a + // 30–100 ms RTT path, and any backlog the kernel can't queue + // gets dropped silently — TCP then retransmits and stalls.
+ tuneUDPBuffers("turnConn", conn) turnConn = &connectedUDPConn{conn} } else { conn, err2 := d.DialContext(ctx1, "tcp", turnServerAddr) // nolint: noctx @@ -433,51 +760,64 @@ func oneTurnConnection(ctx context.Context, turnParams *turnParams, peer *net.UD }() turnConn = turn.NewSTUNConn(conn) } - var addrFamily turn.RequestedAddressFamily - if peer.IP.To4() != nil { - addrFamily = turn.RequestedAddressFamilyIPv4 + // useMinimalTURN swaps pion/turn for the in-tree minimal client + // (turn_min.go). Re-enabled in 1.3.11 after fixing the 1.3.9 bug: + // MessageIntegrity.AddTo was computing HMAC over m.Raw with the + // STUN magic cookie still zero (Encode wrote it later), so the + // server's recomputed HMAC over the wire bytes never matched → + // 401 on every authenticated allocate. Fixed by calling + // m.WriteHeader() right after NewTransactionID() in all three + // builders (allocate, channelBind, refresh). + const useMinimalTURN = true + + var relayConn net.PacketConn + if useMinimalTURN { + allocCtx, allocCancel := context.WithTimeout(ctx, 15*time.Second) + alloc, err1 := minimalTURNAllocate(allocCtx, turnConn, turnServerUdpAddr, user, pass, peer) + allocCancel() + if err1 != nil { + err = fmt.Errorf("failed to allocate (minimal): %s", err1) + return + } + relayConn = alloc + defer alloc.Close() } else { - addrFamily = turn.RequestedAddressFamilyIPv6 - } - // Start a new TURN Client and wrap our net.Conn in a STUNConn - // This allows us to simulate datagram based communication over a net.Conn - cfg = &turn.ClientConfig{ - STUNServerAddr: turnServerAddr, - TURNServerAddr: turnServerAddr, - Conn: turnConn, - Username: user, - Password: pass, - RequestedAddressFamily: addrFamily, - LoggerFactory: logging.NewDefaultLoggerFactory(), - } - - client, err1 := turn.NewClient(cfg) - if err1 != nil { - err = fmt.Errorf("failed to create TURN client: %s", err1) - return - } - defer client.Close() - - // Start listening on the conn provided. 
- err1 = client.Listen() - if err1 != nil { - err = fmt.Errorf("failed to listen: %s", err1) - return - } - - // Allocate a relay socket on the TURN server. On success, it - // will return a net.PacketConn which represents the remote - // socket. - relayConn, err1 := client.Allocate() - if err1 != nil { - err = fmt.Errorf("failed to allocate: %s", err1) - return - } - defer func() { - if err1 := relayConn.Close(); err1 != nil { - err = fmt.Errorf("failed to close TURN allocated connection: %s", err1) + var addrFamily turn.RequestedAddressFamily + if peer.IP.To4() != nil { + addrFamily = turn.RequestedAddressFamilyIPv4 + } else { + addrFamily = turn.RequestedAddressFamilyIPv6 } - }() + cfg = &turn.ClientConfig{ + STUNServerAddr: turnServerAddr, + TURNServerAddr: turnServerAddr, + Conn: turnConn, + Username: user, + Password: pass, + RequestedAddressFamily: addrFamily, + LoggerFactory: logging.NewDefaultLoggerFactory(), + } + client, err1 := turn.NewClient(cfg) + if err1 != nil { + err = fmt.Errorf("failed to create TURN client: %s", err1) + return + } + defer client.Close() + if err1 = client.Listen(); err1 != nil { + err = fmt.Errorf("failed to listen: %s", err1) + return + } + relayConn, err1 = client.Allocate() + if err1 != nil { + err = fmt.Errorf("failed to allocate: %s", err1) + return + } + defer func() { + if err1 := relayConn.Close(); err1 != nil { + err = fmt.Errorf("failed to close TURN allocated connection: %s", err1) + } + }() + } // The relayConn's local address is actually the transport // address assigned on the TURN server. @@ -485,7 +825,19 @@ func oneTurnConnection(ctx context.Context, turnParams *turnParams, peer *net.UD wg := sync.WaitGroup{} wg.Add(2) - turnctx, turncancel := context.WithCancel(context.Background()) + // Derive turnctx from the parent ctx (StartProxy's, via + // oneTurnConnectionLoop). 
Previously this was rooted at + // context.Background(), so StopProxy / Disconnect only cancelled + // proxy-ctx — the two read-loop goroutines below kept blocking + // on conn2.ReadFrom forever (conn2 is an AsyncPacketPipe with no + // natural close trigger). ProxyForceReconnect happened to work + // because it iterates the session registry and calls turncancel + // directly, but StopProxy doesn't. Result: every session + // abandoned via StopProxy left one wedged goroutine behind, plus + // its conn2 pipe in memory. + turnctx, turncancel := context.WithCancel(ctx) + unregister := registerSession(turncancel) + defer unregister() context.AfterFunc(turnctx, func() { if err := relayConn.SetDeadline(time.Now()); err != nil { log.Printf("Failed to set relay deadline: %s", err) @@ -494,12 +846,39 @@ func oneTurnConnection(ctx context.Context, turnParams *turnParams, peer *net.UD log.Printf("Failed to set upstream deadline: %s", err) } }) + // SRTP/Opus mimicry layer (see wrap.go). Built per session + // because each wrapConn has its own random SSRC/sessionID/counter + // init — if all sessions shared a wrapConn, the SSRC + monotonic + // seq would tag every TURN allocation as "the same call leg", + // which is a coarser fingerprint than what real VK call traffic + // looks like. nil wrap = bypass and use the legacy direct + // conn2↔relayConn path; the key takes effect on the NEXT + // allocation after the user changes it, not on already-live + // ones (mid-stream AEAD key rotation isn't worth the complexity). 
+ var wrap *wrapConn + if key := currentWrapKey(); key != nil { + if w, werr := newWrapConn(key, false); werr != nil { + log.Printf("wrap: session disabled — newWrapConn: %v", werr) + } else { + wrap = w + } + } + var addr atomic.Value // Start read-loop on conn2 (output of DTLS) go func() { defer wg.Done() defer turncancel() - buf := make([]byte, 1600) + buf := borrowReadBuf() + defer returnReadBuf(buf) + // wireBuf carries the wrapped (SRTP-shaped) bytes when wrap + // is on. +wrapOverhead headroom over the DTLS payload max. + // Allocated once per goroutine — borrowing from readBufPool + // won't help because the pool's slices are exactly 1600. + var wireBuf []byte + if wrap != nil { + wireBuf = make([]byte, len(buf)+wrapOverhead) + } for { select { case <-turnctx.Done(): @@ -514,11 +893,22 @@ func oneTurnConnection(ctx context.Context, turnParams *turnParams, peer *net.UD addr.Store(addr1) // store peer - _, err1 = relayConn.WriteTo(buf[:n], peer) + out := buf[:n] + if wrap != nil { + wn, werr := wrap.wrapInto(wireBuf, buf[:n]) + if werr != nil { + log.Printf("wrap: wrapInto failed: %v", werr) + return + } + out = wireBuf[:wn] + } + + _, err1 = relayConn.WriteTo(out, peer) if err1 != nil { log.Printf("Failed: %s", err1) return } + conn2ToRelay.Add(uint64(n)) } }() @@ -526,7 +916,14 @@ func oneTurnConnection(ctx context.Context, turnParams *turnParams, peer *net.UD go func() { defer wg.Done() defer turncancel() - buf := make([]byte, 1600) + buf := borrowReadBuf() + defer returnReadBuf(buf) + // plainBuf carries the unwrapped DTLS bytes when wrap is on. + // Unwrapped is always smaller than wrapped, so 1600 is plenty. 
+ var plainBuf []byte + if wrap != nil { + plainBuf = make([]byte, len(buf)) + } for { select { case <-turnctx.Done(): @@ -544,14 +941,35 @@ func oneTurnConnection(ctx context.Context, turnParams *turnParams, peer *net.UD return } - _, err1 = conn2.WriteTo(buf[:n], addr1) + out := buf[:n] + if wrap != nil { + un, werr := wrap.unwrapPacket(buf[:n], plainBuf) + if werr != nil { + // AEAD failure here means the peer end didn't + // wrap (server-side wrap not configured / wrong + // key) — drop the packet rather than tear down + // the whole session, since a stray non-wrapped + // packet on a wrap-enabled session is a real + // possibility right at session bring-up. + log.Printf("wrap: unwrap dropped: %v", werr) + continue + } + out = plainBuf[:un] + } + + _, err1 = conn2.WriteTo(out, addr1) if err1 != nil { log.Printf("Failed: %s", err1) return } + relayToConn2.Add(uint64(n)) } }() + // Byte counters are folded into the per-session lifetime log on + // exit; the periodic 10s dump that proved data was flowing + // during the throughput investigation is no longer interesting. + wg.Wait() if err := relayConn.SetDeadline(time.Time{}); err != nil { log.Printf("Failed to clear relay deadline: %s", err) @@ -561,22 +979,57 @@ func oneTurnConnection(ctx context.Context, turnParams *turnParams, peer *net.UD } } -func oneDtlsConnectionLoop(ctx context.Context, peer *net.UDPAddr, listenConnChan <-chan net.PacketConn, connchan chan<- net.PacketConn, okchan chan<- struct{}) { +// reconnectBackoff produces a capped exponential backoff with jitter. +// Caller uses it like: +// wait := reconnectBackoff(prev, success) +// time.Sleep(wait) +// On success it returns 0 (caller resets state and continues immediately). 
+func reconnectBackoff(prev time.Duration, success bool) time.Duration { + if success { + return 0 + } + if prev <= 0 { + prev = 500 * time.Millisecond + } else { + prev *= 2 + } + const maxBackoff = 30 * time.Second + if prev > maxBackoff { + prev = maxBackoff + } + // Add jitter +/- 25% so reconnects don't synchronise across N parallel streams. + jitter := time.Duration(rand.Int63n(int64(prev / 2))) - prev/4 + return prev + jitter +} + +func oneDtlsConnectionLoop(ctx context.Context, peer *net.UDPAddr, listenConnChan <-chan net.PacketConn, connchan chan<- net.PacketConn, okchan chan<- struct{}, streamID int) { + var backoff time.Duration for { select { case <-ctx.Done(): return case listenConn := <-listenConnChan: c := make(chan error) - go oneDtlsConnection(ctx, peer, listenConn, connchan, okchan, c) - if err := <-c; err != nil { + go oneDtlsConnection(ctx, peer, listenConn, connchan, okchan, c, streamID) + err := <-c + if err != nil { log.Printf("%s", err) + backoff = reconnectBackoff(backoff, false) + if backoff > 0 { + log.Printf("DTLS reconnect in %s", backoff.Round(time.Millisecond)) + if err := sleepCtx(ctx, backoff); err != nil { + return + } + } + } else { + backoff = reconnectBackoff(backoff, true) } } } } func oneTurnConnectionLoop(ctx context.Context, turnParams *turnParams, peer *net.UDPAddr, connchan <-chan net.PacketConn, t <-chan time.Time) { + var backoff time.Duration for { select { case <-ctx.Done(): @@ -586,8 +1039,18 @@ func oneTurnConnectionLoop(ctx context.Context, turnParams *turnParams, peer *ne case <-t: c := make(chan error) go oneTurnConnection(ctx, turnParams, peer, conn2, c) - if err := <-c; err != nil { + err := <-c + if err != nil { log.Printf("%s", err) + backoff = reconnectBackoff(backoff, false) + if backoff > 0 { + log.Printf("TURN reconnect in %s", backoff.Round(time.Millisecond)) + if err := sleepCtx(ctx, backoff); err != nil { + return + } + } + } else { + backoff = reconnectBackoff(backoff, true) } default: } @@ -597,92 
+1060,311 @@ func oneTurnConnectionLoop(ctx context.Context, turnParams *turnParams, peer *ne type turnCred struct { user, pass, addr string + acquiredAt time.Time } +// credMaxAge is how long a TURN cred stays usable in the pool. VK +// rotates TURN allocations roughly every minute, after which Allocate +// returns 437 (allocation mismatch). Recycling a 90 s-old cred during +// a reconnect storm just kicks off a brand-new dead TURN session — +// pion fails fast, the loop reconnects, getCreds returns the same +// stale cred, and round we go. Capping at 45 s gives a comfortable +// margin under VK's actual rotation window while still letting the +// burst-recycle path (fresh creds added in the last ~5 s) work. +const credMaxAge = 45 * time.Second + +// Max concurrent captcha solves against VK. Fully-parallel solves at +// N=30 trigger VK's anti-bot rate-limit (`ERROR_LIMIT` on +// captcha.isNotRobot, `status: ERROR` on slider getContent) and the +// per-IP TURN allocation cap (error 486). +// +// Lowered 5 → 3 in 1.3.10: each solve transiently holds an HTTP/TLS +// client + JSON state + image decode buffer + a handful of stdlib +// http.Transport goroutines (~1.5-2 MB worth). Under a reconnect +// storm where sessions die past T+30s, 5 simultaneous solves was +// adding ~10 MB transient + ~50 net/http goroutines on top of the +// already-loaded steady-state. 3 keeps almost all the throughput +// (the binding constraint is VK's per-IP rate-limit, not our +// concurrency) for ~6 MB lower peak. +const maxConcurrentCaptchaSolves = 3 + func poolCreds(f getCredsFunc, poolSize int) getCredsFunc { var mu sync.Mutex var pool []turnCred var cTime time.Time var idx int - return func(link string) (string, string, string, error) { + // Bounded-concurrency gate for captcha solves. Buffered channel + // acts as a semaphore: at most cap(solveSlot) goroutines hold a + // slot at a time, the rest block on send until a slot is released. 
+ solveSlot := make(chan struct{}, maxConcurrentCaptchaSolves) + + return func(ctx context.Context, link string) (string, string, string, error) { mu.Lock() - defer mu.Unlock() if !cTime.IsZero() && time.Since(cTime) > 10*time.Minute { pool = nil cTime = time.Time{} } - if len(pool) < poolSize { - u, p, a, err := f(link) - if err == nil { - pool = append(pool, turnCred{u, p, a}) - cTime = time.Now() - log.Printf("Successfully registered User Identity %d/%d", len(pool), poolSize) - - // Space out requests by 1000ms to avoid API limits - if len(pool) < poolSize { - time.Sleep(1000 * time.Millisecond) + // Prune creds older than credMaxAge. Without this both the + // cache-hit fast path and the saturation short-circuit would + // keep handing out dead identities to oneTurnConnection, + // which then 437s on Allocate, dies, reconnects, and burns + // another solve attempt. The 45 s budget lines up with VK's + // TURN rotation window so any cred in the pool was either + // just acquired or is in its useful lifetime. + if len(pool) > 0 { + fresh := pool[:0] + for _, c := range pool { + if time.Since(c.acquiredAt) <= credMaxAge { + fresh = append(fresh, c) } - - c := pool[len(pool)-1] - idx++ - return c.user, c.pass, c.addr, nil } + pool = fresh + } + + // Cache-hit fast path: pool already at capacity, hand out a + // rotating cached cred and bail. This path never touches the + // solve semaphore — only cold solves are throttled. + if len(pool) >= poolSize { + c := pool[idx%len(pool)] + idx++ + cTime = time.Now() + mu.Unlock() + return c.user, c.pass, c.addr, nil + } - log.Printf("Failed to get unique TURN identity: %v", err) - if len(pool) > 0 { - log.Printf("Falling back to reusing a previous identity...") + // Saturation short-circuit. Reconnect loops (oneDtls/oneTurn + // ConnectionLoop) call getCreds on every retry, and while the + // pool is below poolSize each call would otherwise spin up + // another solveVkCaptcha → ERROR_LIMIT → recycle cycle. 
With + // N=50 sessions all hitting VK's rate-limit simultaneously + // this snowballs into 100+ doomed captcha attempts per minute + // and a fresh TURN allocation per attempt — each of which VK + // closes within ~50 s. Detect the burning state and short- + // circuit straight to a recycled cred. The cooldown in + // directSaturated/tunnelSaturated will auto-clear the streak + // after captchaCooldown so this is not permanent — once VK's + // rate-limit window expires, real solves resume. + if len(pool) > 0 { + egressIsTunnel := captchaTunnelEgress.Load() + // "currentSat" is the egress this attempt would use by + // default. "otherSat" is the egress solveVkCaptcha can + // escape to via the force-direct path (cellularDial). + // That escape only exists for tunnel → direct, not the + // other way around, so when we're already on direct the + // short-circuit just looks at directSaturated. + currentSat := directSaturated() + otherSat := tunnelSaturated() + if egressIsTunnel { + currentSat, otherSat = tunnelSaturated(), directSaturated() + } + if currentSat && (otherSat || !egressIsTunnel) { c := pool[idx%len(pool)] idx++ + cTime = time.Now() + mu.Unlock() return c.user, c.pass, c.addr, nil } + } + + // Cache-miss slow path: release the mutex, jitter, take a + // solve slot, then call f(ctx, link). The mutex is dropped + // first so we don't serialise on it while waiting for a slot. + // The jitter runs BEFORE slot acquisition so it overlaps the + // queue wait instead of holding a slot — previously a 5-slot + // pipeline burned 0.75-3 s per slot on jitter alone, halving + // effective throughput. Now the slot only covers the actual + // PoW + HTTP work. ctx-aware at every step so a Disconnect + // during the wait bails fast. + mu.Unlock() + + // 1.5-2.5 s pre-slot wait: combined anti-bot pacing (used to + // live inside solveVkCaptcha as a fixed 1.5-2.5 s sleep while + // the slot was held) and entry desync (used to be a 0-750 ms + // post-slot jitter). 
Both purposes preserved, the slot is + // freed earlier. + if err := sleepCtx(ctx, time.Duration(1500+rand.Intn(1000))*time.Millisecond); err != nil { return "", "", "", err } - c := pool[idx%len(pool)] - idx++ - return c.user, c.pass, c.addr, nil + select { + case solveSlot <- struct{}{}: + case <-ctx.Done(): + return "", "", "", ctx.Err() + } + u, p, a, err := f(ctx, link) + <-solveSlot + + mu.Lock() + defer mu.Unlock() + + if err == nil { + pool = append(pool, turnCred{u, p, a, time.Now()}) + cTime = time.Now() + log.Printf("Successfully registered User Identity %d/%d", len(pool), poolSize) + idx++ + return u, p, a, nil + } + + log.Printf("Failed to get unique TURN identity: %v", err) + if len(pool) > 0 { + log.Printf("Falling back to reusing a previous identity...") + c := pool[idx%len(pool)] + idx++ + cTime = time.Now() + return c.user, c.pass, c.addr, nil + } + return "", "", "", err } } +// parseLiteralUDPAddr parses "ip:port" without touching the system +// resolver. The address comes from the iOS profile and is always a +// literal numeric IP — going through net.ResolveUDPAddr (which dives +// through getaddrinfo via cgo) trips a transient sandbox-init race +// where Go's resolver reports "unknown port" for a perfectly valid +// numeric port. Manual parsing sidesteps the whole resolver path. +// +// Also strips Unicode whitespace before parsing: field log showed +// the iOS side passing "56010 " (port followed by U+2009 THIN +// SPACE), which strconv.Atoi rejects. Thin spaces tend to sneak in +// via copy-paste from web UIs where the address is formatted with +// a narrow non-breaking space for readability — strings.TrimSpace +// drops every Unicode whitespace including U+2009. 
+func parseLiteralUDPAddr(s string) (*net.UDPAddr, error) { + s = strings.TrimSpace(s) + host, portStr, err := net.SplitHostPort(s) + if err != nil { + return nil, fmt.Errorf("split host:port %q: %w", s, err) + } + host = strings.TrimSpace(host) + portStr = strings.TrimSpace(portStr) + ip := net.ParseIP(host) + if ip == nil { + return nil, fmt.Errorf("host %q is not a literal IP", host) + } + port, err := strconv.Atoi(portStr) + if err != nil || port <= 0 || port > 65535 { + return nil, fmt.Errorf("port %q invalid", portStr) + } + return &net.UDPAddr{IP: ip, Port: port}, nil +} + //export StartProxy -func StartProxy(cLink *C.char, cPeerAddr *C.char, cLocalAddr *C.char, cN C.int) { +func StartProxy(cLink *C.char, cPeerAddr *C.char, cLocalAddr *C.char, cN C.int, cUDP C.int) { select { case <-proxyReady: default: } - link := C.GoString(cLink) + rawLink := C.GoString(cLink) peerAddrStr := C.GoString(cPeerAddr) localAddrStr := C.GoString(cLocalAddr) + + // Parse the link parameter as a list — accept comma OR newline + // OR semicolon as separators so the Swift side can stuff + // multiple URLs into the existing single-string profile field + // without an API break. Empty entries are dropped. Hypothesis + // we're testing: VK's per-IP captcha rate-limit might be keyed + // on (source-IP, vk_join_link) rather than just source-IP, in + // which case M distinct links multiply our effective budget by + // roughly M. + var links []string + for _, sep := range []string{"\n", ",", ";"} { + rawLink = strings.ReplaceAll(rawLink, sep, "\n") + } + for _, l := range strings.Split(rawLink, "\n") { + if l = strings.TrimSpace(l); l != "" { + links = append(links, l) + } + } + if len(links) == 0 { + log.Printf("StartProxy: no usable link in %q, aborting", C.GoString(cLink)) + return + } + log.Printf("StartProxy: %d link(s) configured for round-robin: %v", len(links), links) + // host/port: empty by default so we use what VK API returned in + // turn_server.urls[0]. 
Override only if you know the TURN endpoint + // shouldn't track what VK responds with (e.g. pinning a stable IP). host := "" - port := "19302" + port := "" n := int(cN) - udp := true + // Hard cap on N. 1.3.11 re-enables the minimal TURN client (the + // auth bug from 1.3.9 is fixed), adds debug.SetMemoryLimit(75MB) + // + SetGCPercent(50) + periodic FreeOSMemory, tightens the + // captcha solver's HTTP idle pool, and lowers solve concurrency + // 5 → 3. The combined memory wins should comfortably support + // N=100, but we cap at 60 conservatively until field-tested — + // raising it once the new floor is empirically known. + const maxN = 60 + if n > maxN { + log.Printf("StartProxy: N=%d capped to %d (iOS memory budget)", n, maxN) + n = maxN + } + if n < 1 { + n = 1 + } + // udp transport to TURN. true=plain UDP (faster, fragile under loss), + // false=TCP STUNConn (survives short cellular blips at the cost of HoL). + udp := cUDP != 0 + log.Printf("StartProxy: peer=%s n=%d udp=%v", peerAddrStr, n, udp) ctx, cancel := context.WithCancel(context.Background()) proxyCancel = cancel defer cancel() - peer, err := net.ResolveUDPAddr("udp", peerAddrStr) + // Apply Go runtime memory tunings BEFORE any per-session work + // spawns goroutines — SetMemoryLimit applies retroactively but + // GCPercent is sampled at the next GC cycle, so earlier is + // better. See memstats.go for what these do. + tuneGoRuntime() + + // Fresh session = fresh manual-captcha quota. See + // manualCaptchaQuotaPerSession. + resetManualCaptchaQuota() + + // Periodic Go runtime memstats + periodic FreeOSMemory. Pair + // with the Swift-side os_proc_available_memory logger to + // understand when the extension is approaching iOS's kill + // threshold. + startMemstatsLogger(ctx) + + // The address is a literal "ip:port" from the iOS profile — + // never a hostname. 
Going through net.ResolveUDPAddr means + // routing through getaddrinfo via cgo, which on iOS NE + // extensions can transiently fail in the first hundred-or-so + // milliseconds of startup because the sandbox networking + // subsystem isn't fully wired yet. The failure looks like + // "lookup udp/: unknown port" — Go's resolver got a weird + // answer from getservbyname() for the port string, even though + // the port is numeric and shouldn't need a service lookup at all. + // Parsing host+port ourselves bypasses the resolver entirely. + peer, err := parseLiteralUDPAddr(peerAddrStr) if err != nil { log.Printf("Resolve UDP error: %v", err) return } - parts := strings.Split(link, "join/") - link = parts[len(parts)-1] - - if idx := strings.IndexAny(link, "/?#"); idx != -1 { - link = link[:idx] + // Normalise each link to the bare "joinID" used in the VK API + // body: strip the "vk.com/call/join/" prefix and any trailing + // path/query/fragment. Applied per-link so a mixed paste of + // full URLs and bare IDs both work. + for i, l := range links { + if parts := strings.Split(l, "join/"); len(parts) > 1 { + l = parts[len(parts)-1] + } + if idx := strings.IndexAny(l, "/?#"); idx != -1 { + l = l[:idx] + } + links[i] = l } params := &turnParams{ host: host, port: port, - link: link, + links: links, udp: udp, - getCreds: poolCreds(getCreds, n), + getCreds: poolCreds(getCredsRouted, n), } listenConnChan := make(chan net.PacketConn) @@ -691,52 +1373,282 @@ func StartProxy(cLink *C.char, cPeerAddr *C.char, cLocalAddr *C.char, cN C.int) log.Printf("Failed to listen: %s", err) return } - + // Bump the WG↔proxy UDP socket buffers. Default iOS UDP recv buffer + // is ~196 KB; a single page load can burst 50–100 1.2 KB packets at + // once, overflowing the kernel queue before our read goroutine + // drains it. The kernel may cap the request below 4 MB depending on + // kern.ipc.maxsockbuf — log what we actually got. 
+ tuneUDPBuffers("listenConn", listenConn) + context.AfterFunc(ctx, func() { if closeErr := listenConn.Close(); closeErr != nil { log.Printf("Failed to close local connection: %s", closeErr) } }) - go func() { - for { - select { - case <-ctx.Done(): - return - case listenConnChan <- listenConn: + // Per-session fan-out of the shared listenConn. Without this, all + // N oneDtlsConnection goroutines call ReadFrom on the same UDP + // socket, the kernel wakes only one of them, and the other N-1 + // sessions sit idle — silently defeating nValue>1. The dispatcher + // reads once and round-robins each WG packet to one of N + // fanoutPacketConn channels; each session reads from its own. + // Writes still go straight back to the real listenConn so replies + // from any session reach the WG client. + fanouts := make([]*fanoutPacketConn, n) + for i := range fanouts { + fanouts[i] = newFanoutPacketConn(i, listenConn) + } + startFanoutDispatcher(ctx, listenConn, fanouts) + log.Printf("fanout: dispatcher up with %d virtual conn(s)", n) + + // Each oneDtlsConnectionLoop wants a chan that endlessly redelivers + // its private listen-side conn. Spawn one such chan per fanout. + makeFanoutChan := func(f net.PacketConn) chan net.PacketConn { + ch := make(chan net.PacketConn) + go func() { + for { + select { + case <-ctx.Done(): + return + case ch <- f: + } } - } - }() - - wg1 := sync.WaitGroup{} - t := time.Tick(200 * time.Millisecond) - - okchan := make(chan struct{}) - connchan := make(chan net.PacketConn) + }() + return ch + } - wg1.Go(func() { - oneDtlsConnectionLoop(ctx, peer, listenConnChan, connchan, okchan) - }) - wg1.Go(func() { - oneTurnConnectionLoop(ctx, params, peer, connchan, t) - }) + // listenConnChan kept for the type signature only — the original + // goroutine that fed the shared listenConn is replaced by the + // per-fanout chans below. 
+ _ = listenConnChan - select { - case <-okchan: - case <-ctx.Done(): + wg1 := sync.WaitGroup{} + // time.Tick (no Stop hook) leaks one ticker goroutine + heap + // object per StartProxy invocation — across iOS suspend/wake + // cycles and Disconnect/Reconnect this accumulates fast. Use + // NewTicker + Stop bound to ctx cleanup. + tDispatcher := time.NewTicker(200 * time.Millisecond) + defer tDispatcher.Stop() + t := tDispatcher.C + + // Re-roll the Stream-Aggregation session ID once per StartProxy. + // Each of the N DTLS sessions below will then prepend the same + // session ID + its own stream index after handshake, letting the + // receiver-side aggregator fuse them. No-op when the feature is + // off (default). + if streamAggIsEnabled() { + sid := freshStreamAggSession() + log.Printf("stream-agg: enabled, sessionID=%x (N=%d)", sid[:4], n) } - for i := 0; i < n-1; i++ { + // Phased bring-up driven by adaptive captcha-egress budget. + // + // VK rate-limits captcha.isNotRobot per source IP. We have two + // budgets available: + // + // "direct" — the user's mobile IP. Used until ERROR_LIMIT lands + // on a captcha solve. + // "tunnel" — the WG server's egress IP. Once WG handshake + // completes, this extension's outbound HTTP routes + // through utun automatically (includeAllNetworks=true). + // + // Sequence: + // + // Phase A (direct): + // Spawn sessions one at a time with a small stagger, keeping + // them all on the user's mobile IP. Stop as soon as either + // (a) we've spawned N, or + // (b) a captcha solve returns ERROR_LIMIT (captchaDirectSat + // trips). + // This drains the direct egress's rate-limit budget — exactly + // what the user asked for ("столько тоннелей сколько можно + // поднять со своего родного айпи"). + // + // Bridge: + // Wait for any one of the spawned sessions to reach DTLS + // ready, fire proxyReady so Swift starts the WG adapter, + // then wait ~2 s for WG handshake to complete through that + // session. 
Flip captchaTunnelEgress so subsequent solves + // are attributed to the tunnel pool. + // + // Phase B (tunnel) — only if Phase A stopped early on direct + // saturation AND we still have sessions to spawn: + // Continue spawning sessions, also one at a time. Their + // captcha HTTP now goes through utun → WG server → api.vk.ru, + // so VK sees the WG server's egress IP — a fresh per-IP + // rate-limit budget. Stop when N reached or + // captchaTunnelSat trips. + // + // Manual-captcha mode keeps the single-phase "all N before WG" + // barrier: each WebView is presented one-at-a-time anyway, and + // the UI flow assumes the captcha sheet can still reach id.vk.ru + // outside the tunnel (includeAllNetworks=false in that mode). + resetCaptchaStats() + captchaSessionsTarget.Store(int64(n)) + + sessionReady := make(chan int, n) + spawnSession := func(i int) { + fanoutChan := makeFanoutChan(fanouts[i]) cChan := make(chan net.PacketConn) + sessionOk := make(chan struct{}) + wg1.Go(func() { - oneDtlsConnectionLoop(ctx, peer, listenConnChan, cChan, nil) + oneDtlsConnectionLoop(ctx, peer, fanoutChan, cChan, sessionOk, i) }) wg1.Go(func() { oneTurnConnectionLoop(ctx, params, peer, cChan, t) }) + go func() { + select { + case <-sessionOk: + // Make this lane visible to the fanout dispatcher. + // Until now the dispatcher was skipping it because + // nothing was draining its incoming channel. + fanouts[i].active.Store(true) + captchaSessionsReady.Add(1) + sessionReady <- i + case <-ctx.Done(): + } + }() } - log.Printf("Proxy started on %s", localAddrStr) + if manualCaptchaForcedMode() { + // Manual mode: spawn all N upfront and wait for every one + // before bringing up WG (legacy behaviour the WebView UI + // flow depends on). 
+ for i := 0; i < n; i++ { + spawnSession(i) + } + for k := 0; k < n; k++ { + select { + case idx := <-sessionReady: + log.Printf("StartProxy: session %d ready (%d/%d, manual)", idx+1, k+1, n) + case <-ctx.Done(): + wg1.Wait() + return + } + } + select { + case proxyReady <- struct{}{}: + default: + } + log.Printf("Proxy started on %s with %d parallel TURN session(s) (manual mode)", localAddrStr, n) + wg1.Wait() + return + } + + // Start the sessionReady drain + proxyReady-signaller BEFORE the + // Phase A spawn loop. iOS' startTunnel completion handler has to + // fire within ~15-20 s or the OS gives up and tears the tunnel + // down. Phase A's 400 ms stagger × N=50 = 20 s of spawning, so if + // we wait for "phase A done" before consuming the first + // sessionReady, iOS pulls the plug before WG ever starts. With + // this goroutine reading concurrently, the very first DTLS-ready + // session (≈5 s in) triggers proxyReady immediately and Swift's + // adapter.start fires without waiting on the rest of the fleet. + go func() { + firstSignalled := false + for { + select { + case idx := <-sessionReady: + log.Printf("StartProxy: session %d ready", idx+1) + if !firstSignalled { + firstSignalled = true + log.Printf("StartProxy: first session ready, signaling proxyReady") + select { + case proxyReady <- struct{}{}: + default: + } + // Flip tunnel egress 2 s after the first DTLS + // session is up — WG handshake completes in that + // window and from then on the extension's HTTP + // auto-routes through utun. + go func() { + select { + case <-time.After(2 * time.Second): + captchaTunnelEgress.Store(true) + log.Printf("StartProxy: tunnel egress engaged") + case <-ctx.Done(): + } + }() + } + case <-ctx.Done(): + return + } + } + }() + + // Phase A: spawn direct sessions until N reached or direct egress + // hits ERROR_LIMIT. 
The 1.5-2.5 s pre-slot jitter inside poolCreds + // (see F3) now does the anti-bot pacing that the stagger used to + // do; the stagger only exists to give the saturation check inside + // this loop enough granularity to fire BEFORE the whole fleet has + // kicked off solveVkCaptcha. 100 ms × 50 = 5 s for all N to enter + // the slot queue, vs the old 20 s — saves ~15 s of bring-up time + // when direct doesn't saturate, while still letting Phase A→B + // transition fire within 100 ms of an ERROR_LIMIT landing. + phaseAStagger := 100 * time.Millisecond + phaseACount := 0 + for phaseACount < n { + if directSaturated() { + log.Printf("StartProxy: direct egress saturated after %d sessions, transitioning to tunnel egress", + phaseACount) + break + } + spawnSession(phaseACount) + phaseACount++ + select { + case <-time.After(phaseAStagger): + case <-ctx.Done(): + wg1.Wait() + return + } + } + log.Printf("StartProxy: phase A done, spawned=%d/%d direct, saturated=%v", + phaseACount, n, directSaturated()) + + if phaseACount >= n { + // Phase A spawned all N — no Phase B needed. proxyReady was + // already fired by the drain goroutine above; nothing else + // to do here. + log.Printf("Proxy started on %s with %d parallel TURN session(s) (all direct)", localAddrStr, n) + wg1.Wait() + return + } + + // Phase B: still need sessions, direct saturated. Spawn the rest + // through the tunnel egress. captchaTunnelEgress has either + // already flipped (if first session was ready before saturation) + // or will flip via the drain goroutine after the next ready. + wg1.Go(func() { + log.Printf("StartProxy: spawning phase B (target=%d, already=%d)", n, phaseACount) + + // Per-session stagger 200 ms — twice Phase A because the WG + // server's egress is the only IP for everyone else's traffic + // too, so saturating it has wider blast radius. 
The 1.5-2.5 s + // pre-slot jitter (F3) handles anti-bot pacing; the stagger + // only governs how quickly the loop notices tunnel + // saturation. 200 ms × 40 ≈ 8 s phase B warm-up vs old 32 s. + phaseBStagger := 200 * time.Millisecond + for i := phaseACount; i < n; i++ { + if ctx.Err() != nil { + return + } + if tunnelSaturated() { + log.Printf("StartProxy: tunnel egress also rate-limited; stopping at %d/%d sessions", + i, n) + return + } + spawnSession(i) + select { + case <-time.After(phaseBStagger): + case <-ctx.Done(): + return + } + } + }) + + log.Printf("Proxy started on %s with %d parallel TURN session(s) requested (phased)", localAddrStr, n) wg1.Wait() } @@ -747,4 +1659,21 @@ func StopProxy() { proxyCancel = nil log.Println("Proxy gracefully stopped") } + // Drop accumulated idle HTTP conns. sharedAuthClient, + // remoteCredsClient and dohClient are package-level so their + // pools survive StartProxy/StopProxy cycles — without an + // explicit flush, every Disconnect carries forward a + // potentially-stale persistConn (each with a readLoop + + // writeLoop goroutine pair) to the next Connect. + flushHTTPIdleConns() +} + +// flushHTTPIdleConns closes idle conns on every package-level +// http.Client in the bridge. Called from StopProxy and +// ProxyForceReconnect — both cases where outbound HTTP path may +// have changed under us. +func flushHTTPIdleConns() { + sharedAuthClient.CloseIdleConnections() + remoteCredsClient.CloseIdleConnections() + dohClient.CloseIdleConnections() } diff --git a/wireguard-apple/Sources/WireGuardKitGo/turn_restart.go b/wireguard-apple/Sources/WireGuardKitGo/turn_restart.go new file mode 100644 index 0000000..8b6e863 --- /dev/null +++ b/wireguard-apple/Sources/WireGuardKitGo/turn_restart.go @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: MIT +// +// `RestartProxy` keeps existing iOS callers (PacketTunnelProvider's +// debounced wake/path-change path) wired up after the in-tunnel session +// registry was renamed. 
It delegates to ProxyForceReconnect, which owns +// the per-session cancel map. The log line is preserved verbatim so the +// extension's TransportHealthMonitor pattern-match still flips the +// "transport unhealthy" flag in App Group UserDefaults. + +package main + +import "C" +import "log" + +//export RestartProxy +func RestartProxy() { + sessionMu.Lock() + n := len(sessionCancels) + sessionMu.Unlock() + if n == 0 { + log.Printf("RestartProxy: nothing to restart") + return + } + ProxyForceReconnect() + log.Printf("RestartProxy: cancelled %d in-flight DTLS connection(s)", n) +} diff --git a/wireguard-apple/Sources/WireGuardKitGo/udp_buffers.go b/wireguard-apple/Sources/WireGuardKitGo/udp_buffers.go new file mode 100644 index 0000000..0420ee1 --- /dev/null +++ b/wireguard-apple/Sources/WireGuardKitGo/udp_buffers.go @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: MIT +// +// UDP socket-buffer tuning helper for the two real wire sockets in +// turn_proxy.go (listenConn on 127.0.0.1:9000 and turnConn to VK's +// relay). Default iOS UDP RCVBUF/SNDBUF is in the ~196 KB ballpark, +// which is fine for the audio/video calling that VK's TURN servers +// were originally built for but too small for the bursty packet +// pattern of a web page load tunneled over WG: 50–100 1.2 KB packets +// arrive within a few ms and the kernel drops anything that can't fit +// the queue before the read goroutine drains it. +// +// We try to raise both buffers to 4 MB. The kernel may cap the actual +// size below the request (iOS uses `kern.ipc.maxsockbuf`, typically +// 8 MB), so we log what we actually got via SO_RCVBUF / SO_SNDBUF +// readback so a future "still losing packets" report can be diagnosed. + +package main + +import ( + "log" + "net" + "syscall" +) + +const udpBufferTarget = 4 * 1024 * 1024 // 4 MB + +// udpBufferTuner is the smallest interface that both +// `net.PacketConn` (listenConn) and `*net.UDPConn` (turnConn) satisfy +// for setting socket buffer sizes. 
+type udpBufferTuner interface { + SetReadBuffer(bytes int) error + SetWriteBuffer(bytes int) error +} + +// tuneUDPBuffers requests larger socket buffers and logs the result. +// On Darwin the actual buffer size is 2x the value you ask for (the +// kernel accounts for control overhead), so the SO_RCVBUF/SO_SNDBUF +// readback can look bigger than `udpBufferTarget` — that's fine. +func tuneUDPBuffers(label string, conn interface{}) { + t, ok := conn.(udpBufferTuner) + if !ok { + log.Printf("%s: cannot tune UDP buffers (unsupported type %T)", label, conn) + return + } + if err := t.SetReadBuffer(udpBufferTarget); err != nil { + log.Printf("%s: SetReadBuffer(%d) failed: %v", label, udpBufferTarget, err) + } + if err := t.SetWriteBuffer(udpBufferTarget); err != nil { + log.Printf("%s: SetWriteBuffer(%d) failed: %v", label, udpBufferTarget, err) + } + + // Read back the kernel-accepted values via SyscallConn so we know + // whether the request was honoured or silently capped. + rcv, snd := readbackBuffers(conn) + log.Printf("%s: UDP buffers tuned: SO_RCVBUF=%d SO_SNDBUF=%d (target=%d)", + label, rcv, snd, udpBufferTarget) +} + +func readbackBuffers(conn interface{}) (rcv, snd int) { + type syscallable interface { + SyscallConn() (syscall.RawConn, error) + } + sc, ok := conn.(syscallable) + if !ok { + return 0, 0 + } + raw, err := sc.SyscallConn() + if err != nil { + return 0, 0 + } + _ = raw.Control(func(fd uintptr) { + if v, err := syscall.GetsockoptInt(int(fd), syscall.SOL_SOCKET, syscall.SO_RCVBUF); err == nil { + rcv = v + } + if v, err := syscall.GetsockoptInt(int(fd), syscall.SOL_SOCKET, syscall.SO_SNDBUF); err == nil { + snd = v + } + }) + return rcv, snd +} + +// Type-assertion guard: net.PacketConn returned by net.ListenPacket +// for the "udp" network is concretely *net.UDPConn, which satisfies +// both udpBufferTuner and the SyscallConn interface. Compile-time +// sanity check so we don't drift. 
+var ( + _ udpBufferTuner = (*net.UDPConn)(nil) +) diff --git a/wireguard-apple/Sources/WireGuardKitGo/udp_fanout.go b/wireguard-apple/Sources/WireGuardKitGo/udp_fanout.go new file mode 100644 index 0000000..79e21b8 --- /dev/null +++ b/wireguard-apple/Sources/WireGuardKitGo/udp_fanout.go @@ -0,0 +1,300 @@ +// SPDX-License-Identifier: MIT +// +// UDP fan-out so N parallel TURN allocations actually share the WG +// upstream traffic instead of all sleeping on the same listenConn. +// +// Background: StartProxy creates ONE net.PacketConn for 127.0.0.1:9000 +// (the WG client's UDP endpoint) and previously handed the same +// PacketConn to every oneDtlsConnection goroutine. When WG sends a +// packet, the kernel wakes ONE waiting goroutine — usually the same +// one consistently due to scheduling — so the other N-1 sessions sit +// idle. Setting nValue=3 in the profile then doesn't actually +// triple throughput, which silently defeats the whole reason to +// run multiple TURN allocations. +// +// Fix: a dispatcher goroutine reads from the real listenConn and +// round-robins each packet into one of N fanoutPacketConn channels. +// Each fanoutPacketConn satisfies net.PacketConn, so it drops in +// where listenConn used to be passed without changing the +// oneDtlsConnection signature. WriteTo delegates straight back to +// the real socket — replies from all N sessions go out the same +// shared port to the same WG client address. +// +// WG itself is robust to per-packet reordering up to a 32-packet +// replay window (RFC 7539 + WireGuard whitepaper §5.3), so a 3-way +// round-robin is safe. Round-robin is per-packet rather than per-flow +// because there's only ever one flow on this socket (one WG client +// instance). + +package main + +import ( + "context" + "errors" + "log" + "net" + "os" + "sync" + "sync/atomic" + "time" +) + +// fanoutQueueDepth is the per-virtual-conn buffer size. 
// fanoutQueueDepth is the capacity of each virtual conn's inbound queue.
// Every slot holds one fanoutPacket (slice header plus up to ~1500 B of
// payload), so memory held by the dispatcher is bounded by
// len(fanouts) × fanoutQueueDepth × ~1.5 KB. 64 absorbs a page-load burst
// of roughly 50-100 packets while keeping that bound small; when a consumer
// is slower than that, packets are dropped and counted rather than queued
// deeper.
const fanoutQueueDepth = 64

// fanoutPacket is one datagram read from the shared listen socket together
// with the address it came from.
type fanoutPacket struct {
	data []byte
	addr net.Addr
}

// fanoutPacketConn is the per-DTLS-session view of the shared listenConn.
// Reads come from a private channel filled by the dispatcher; writes go
// straight to the underlying socket. It implements net.PacketConn.
type fanoutPacketConn struct {
	id       int
	real     net.PacketConn
	incoming chan fanoutPacket

	closeOnce sync.Once
	closed    chan struct{}

	// Read-deadline state. wakeup is closed when the current deadline has
	// expired; ReadFrom selects on it. All three fields are guarded by
	// deadlineMu. Invariant: wakeup is only ever replaced after it has been
	// closed, so a reader blocked on an open wakeup never misses its expiry.
	deadlineMu    sync.Mutex
	wakeup        chan struct{}
	deadlineTimer *time.Timer

	// dropped counts packets the dispatcher could not enqueue because
	// incoming was full.
	dropped atomic.Uint64

	// active=false makes the dispatcher skip this lane. Lanes are created
	// before their DTLS session is up; without this gate packets would be
	// queued where nobody drains them and then dropped. Set to true by the
	// owner once the session is ready.
	active atomic.Bool
}

// newFanoutPacketConn builds an inactive lane with the given id on top of
// the shared socket real.
func newFanoutPacketConn(id int, real net.PacketConn) *fanoutPacketConn {
	return &fanoutPacketConn{
		id:       id,
		real:     real,
		incoming: make(chan fanoutPacket, fanoutQueueDepth),
		closed:   make(chan struct{}),
		wakeup:   make(chan struct{}),
	}
}

// ReadFrom blocks until a packet is dispatched to this lane, the lane is
// closed (net.ErrClosed) or the read deadline expires
// (os.ErrDeadlineExceeded). A packet larger than p is truncated to len(p).
func (f *fanoutPacketConn) ReadFrom(p []byte) (int, net.Addr, error) {
	// Snapshot under the lock: SetReadDeadline may swap the channel.
	f.deadlineMu.Lock()
	wakeup := f.wakeup
	f.deadlineMu.Unlock()

	select {
	case pkt, ok := <-f.incoming:
		if !ok {
			return 0, nil, net.ErrClosed
		}
		n := copy(p, pkt.data)
		return n, pkt.addr, nil
	case <-f.closed:
		return 0, nil, net.ErrClosed
	case <-wakeup:
		return 0, nil, os.ErrDeadlineExceeded
	}
}

// WriteTo sends p to addr through the shared socket.
func (f *fanoutPacketConn) WriteTo(p []byte, addr net.Addr) (int, error) {
	return f.real.WriteTo(p, addr)
}

// Close marks this lane closed and unblocks pending reads. It never closes
// the shared socket and is safe to call more than once.
func (f *fanoutPacketConn) Close() error {
	f.closeOnce.Do(func() { close(f.closed) })
	return nil
}

// LocalAddr reports the shared socket's local address.
func (f *fanoutPacketConn) LocalAddr() net.Addr { return f.real.LocalAddr() }

// SetDeadline applies t as the read deadline; the write half is a no-op
// (see SetWriteDeadline).
func (f *fanoutPacketConn) SetDeadline(t time.Time) error {
	if err := f.SetReadDeadline(t); err != nil {
		return err
	}
	return f.SetWriteDeadline(t)
}

// SetReadDeadline sets the deadline for current and future ReadFrom calls.
// A zero t clears the deadline, a t in the past interrupts reads
// immediately, and a future t arms a timer. Setting a zero or future
// deadline after a previous one expired makes reads block again.
func (f *fanoutPacketConn) SetReadDeadline(t time.Time) error {
	f.deadlineMu.Lock()
	defer f.deadlineMu.Unlock()

	if f.deadlineTimer != nil {
		f.deadlineTimer.Stop()
		f.deadlineTimer = nil
	}

	if t.IsZero() {
		f.rearmLocked()
		return nil
	}

	wait := time.Until(t)
	if wait <= 0 {
		// Already past: interrupt any in-flight ReadFrom right away.
		f.expireLocked()
		return nil
	}

	f.rearmLocked()
	var timer *time.Timer
	timer = time.AfterFunc(wait, func() {
		f.deadlineMu.Lock()
		defer f.deadlineMu.Unlock()
		// Stop() cannot recall a callback that has already started. If a
		// newer SetReadDeadline replaced this timer while we waited for
		// the lock, this expiry is stale and must be ignored.
		if f.deadlineTimer != timer {
			return
		}
		f.deadlineTimer = nil
		f.expireLocked()
	})
	f.deadlineTimer = timer
	return nil
}

// expireLocked closes wakeup unless it is already closed. The caller must
// hold deadlineMu.
func (f *fanoutPacketConn) expireLocked() {
	select {
	case <-f.wakeup:
	default:
		close(f.wakeup)
	}
}

// rearmLocked replaces an already-closed wakeup with a fresh one so reads
// block again. The caller must hold deadlineMu.
func (f *fanoutPacketConn) rearmLocked() {
	select {
	case <-f.wakeup:
		f.wakeup = make(chan struct{})
	default:
	}
}

// SetWriteDeadline is a no-op. The real socket's deadline is shared by all
// lanes, so applying it here would affect the other sessions.
func (f *fanoutPacketConn) SetWriteDeadline(t time.Time) error {
	return nil
}

// pickFanout returns the active lane with the shortest queue, scanning from
// index start (mod len) so ties rotate between lanes. It returns nil when
// there are no lanes or none is active.
func pickFanout(fanouts []*fanoutPacketConn, start uint64) *fanoutPacketConn {
	total := uint64(len(fanouts))
	if total == 0 {
		return nil
	}
	start %= total

	var best *fanoutPacketConn
	bestLen := fanoutQueueDepth + 1 // any real queue length beats this
	for k := uint64(0); k < total; k++ {
		f := fanouts[(start+k)%total]
		if !f.active.Load() {
			continue
		}
		if l := len(f.incoming); l < bestLen {
			best, bestLen = f, l
			if l == 0 {
				break // an empty queue is unbeatable
			}
		}
	}
	return best
}

// logFanoutDrops emits a line every 10 s, but only when total grew since
// the previous tick. It exits when ctx is cancelled or done is closed.
func logFanoutDrops(ctx context.Context, done <-chan struct{}, total *atomic.Uint64, fanouts []*fanoutPacketConn) {
	ticker := time.NewTicker(10 * time.Second)
	defer ticker.Stop()

	var prev uint64
	for {
		select {
		case <-ctx.Done():
			return
		case <-done:
			return
		case <-ticker.C:
			cur := total.Load()
			if cur == prev {
				continue
			}
			perFanout := make([]uint64, 0, len(fanouts))
			for _, f := range fanouts {
				perFanout = append(perFanout, f.dropped.Load())
			}
			log.Printf("fanout: dropped Δ+%d (total=%d) per-session=%v",
				cur-prev, cur, perFanout)
			prev = cur
		}
	}
}

// startFanoutDispatcher spawns one goroutine that drains the shared
// listenConn and hands each packet to one of the fanouts. When reading
// fails (including listenConn being closed) the goroutine exits and closes
// every fanout.
//
// Lane selection is shortest-queue-first with a rotating start index. Lanes
// differ in throughput, so pure round-robin would keep feeding a slow lane
// until its fanoutQueueDepth-deep queue fills and drops while faster lanes
// idle. Packets may therefore leave through different sessions out of
// order.
func startFanoutDispatcher(ctx context.Context, listenConn net.PacketConn, fanouts []*fanoutPacketConn) {
	go func() {
		done := make(chan struct{})
		defer func() {
			close(done) // stops the drop logger with the dispatcher
			for _, f := range fanouts {
				f.Close()
			}
		}()

		var dropped atomic.Uint64
		go logFanoutDrops(ctx, done, &dropped, fanouts)

		buf := make([]byte, 1600)
		var rrIdx uint64

		for {
			n, addr, err := listenConn.ReadFrom(buf)
			if err != nil {
				if errors.Is(err, net.ErrClosed) || errors.Is(err, os.ErrDeadlineExceeded) {
					log.Printf("fanout: dispatcher exiting: %s", err)
					return
				}
				log.Printf("fanout: dispatcher read error: %s", err)
				return
			}

			f := pickFanout(fanouts, rrIdx)
			rrIdx++
			if f == nil {
				// No active lanes: sessions are not up yet, or all are
				// gone. Drop and account.
				dropped.Add(1)
				continue
			}

			// Copy because buf is reused on the next iteration.
			data := make([]byte, n)
			copy(data, buf[:n])

			select {
			case f.incoming <- fanoutPacket{data: data, addr: addr}:
			case <-ctx.Done():
				return
			default:
				// Consumer too slow: drop instead of stalling the other
				// lanes or growing the queue without bound.
				f.dropped.Add(1)
				dropped.Add(1)
			}
		}
	}()
}
- TLSClientConfig: &tls.Config{ - InsecureSkipVerify: false, - }, - }, +// newCaptchaClient is kept for backwards compat with call sites and +// just defers to the TLS-fingerprinted client. See captcha_client.go +// for the full rationale and the forceDirect caveat. +func newCaptchaClient(forceDirect bool) tlsclient.HttpClient { + c, err := newTLSCaptchaClient(forceDirect) + if err != nil { + // tls-client.NewHttpClient can only fail on misconfigured + // options. Our options are static and tested, so a panic + // here means we built bogus options at compile time. + panic(fmt.Sprintf("newTLSCaptchaClient: %v", err)) } + return c } func ParseVkCaptchaError(errData map[string]interface{}) *VkCaptchaError { @@ -110,22 +107,88 @@ func (e *VkCaptchaError) IsCaptchaError() bool { return e.ErrorCode == 14 && e.RedirectUri != "" && e.SessionToken != "" } -func solveVkCaptcha(ctx context.Context, captchaErr *VkCaptchaError) (string, error) { - time.Sleep(time.Duration(1500+mathrand.Intn(1000)) * time.Millisecond) +// solveVkCaptcha returns either a success_token (legacy path: caller +// retries the failing VK API call themselves) OR a full JSON response +// (new path: the WebView did the retry inside its own browser +// session, so the caller skips its own retry and uses the response +// directly). +// +// retryURL + retryBody describe the request the WebView should make +// after extracting success_token. retryBody contains the literal +// "__TOKEN__" placeholder. Pass empty strings to fall back to the +// token-only flow — that's still wired and works for backwards +// compat with older Swift bridges that don't know about the new +// response path. 
+func solveVkCaptcha(ctx context.Context, captchaErr *VkCaptchaError, retryURL, retryBody string) (string, string, error) { + if manualCaptchaForcedMode() { + log.Printf("[Captcha] Manual mode enabled — handing the challenge to the UI") + return requestManualCaptcha(captchaErr.RedirectUri, retryURL, retryBody, 180*time.Second) + } + + // Bootstrap-manual-first. Under hard network blocking there is no + // tunnel and no reachable captcha-service until the very first + // session is up, so the first few identities MUST be earned by + // hand. Running the tls-client auto-solver first in that window is + // actively harmful: it reliably draws status:BOT, and that BOT + // verdict poisons the captcha session / source IP that the user is + // about to solve in a real WebKit engine moments later. So while no + // session is ready (and the user opted into prompts at all), skip + // the auto chain and go straight to the manual sheet. Gated on + // mode != off so pure-auto users never see a surprise prompt, and + // bounded by the per-session prompt quota inside requestManualCaptcha. + if manualCaptchaBootstrapActive() { + log.Printf("[Captcha] bootstrap (sessions_ready=0) — manual-first, skipping auto solve to avoid poisoning the session") + tok, resp, err := requestManualCaptcha(captchaErr.RedirectUri, retryURL, retryBody, 180*time.Second) + if err == nil { + return tok, resp, nil + } + // errDeferToRemote (quota exhausted / a session came up while we + // queued) means "let the normal routing take over" — fall + // through to the auto chain rather than failing the solve. + if !errors.Is(err, errDeferToRemote) { + return "", "", err + } + log.Printf("[Captcha] bootstrap manual deferred (%v) — falling through to auto chain", err) + } + + // Egress decision. The default is whatever captchaTunnelEgress + // dictates (direct pre-handshake, tunnel post-handshake). 
When + // tunnel is saturated AND direct still has budget, we override + // and pin a physical interface (cellular / WiFi) for this attempt + // so the request bypasses utun — that's the only way to retry + // the direct egress after WG comes up. cellularDial falls back + // to the system route if no usable physical interface is found. + forceDirect := captchaTunnelEgress.Load() && tunnelSaturated() && !directSaturated() + if forceDirect { + log.Printf("[Captcha] tunnel egress saturated — forcing physical-interface egress") + } + + // Bump the in-flight gauge for this egress so the UI sees an + // increase the moment a solve starts. Released on every return + // path via defer. + isTunnel := markCaptchaAttemptStart(forceDirect) + defer markCaptchaAttemptDone(isTunnel) + + // Anti-bot pacing used to live here as a 1.5-2.5 s pre-solve + // sleep, but it was held INSIDE poolCreds' solveSlot semaphore + // which throttles 5 in-flight solves. The slot now covers only + // the real PoW + HTTP work; pacing has been moved to poolCreds' + // pre-slot wait so the same wall-clock delay overlaps the slot + // queue instead of serialising inside it. 
log.Printf("[Captcha] Solving Not Robot Captcha...") sessionToken := captchaErr.SessionToken if sessionToken == "" { - return "", fmt.Errorf("no session_token in redirect_uri") + return "", "", fmt.Errorf("no session_token in redirect_uri") } profile := getRandomProfile() - client := newCaptchaClient() + client := newCaptchaClient(forceDirect) powInput, difficulty, htmlSettings, err := fetchPowInput(ctx, client, profile, captchaErr.RedirectUri) if err != nil { - return "", fmt.Errorf("failed to fetch PoW input: %w", err) + return "", "", fmt.Errorf("failed to fetch PoW input: %w", err) } log.Printf("[Captcha] PoW input: %s, difficulty: %d, htmlSettings=%v", powInput, difficulty, htmlSettings != nil) @@ -133,32 +196,49 @@ func solveVkCaptcha(ctx context.Context, captchaErr *VkCaptchaError) (string, er hash := solvePoW(powInput, difficulty) log.Printf("[Captcha] PoW solved: hash=%s", hash) - successToken, err := callCaptchaNotRobot(ctx, client, profile, sessionToken, hash, htmlSettings) + successToken, err := callCaptchaNotRobot(ctx, client, profile, sessionToken, hash, htmlSettings, isTunnel) if err != nil { - return "", fmt.Errorf("captchaNotRobot API failed: %w", err) + // Manual-fallback mode: hand the redirect_uri to the iOS UI + // and let the user solve in SFSafariViewController instead of + // returning failure to the caller (which would recycle a + // stale identity). Only consulted when the auto chain has + // actually run AND failed, so user only sees prompts for the + // 15-20% of identities the solver couldn't earn on its own. 
+ if manualCaptchaFallbackAvailable() { + log.Printf("[Captcha] auto failed (%v) — escalating to manual prompt", err) + tok, resp, mErr := requestManualCaptcha(captchaErr.RedirectUri, retryURL, retryBody, 180*time.Second) + if mErr == nil { + log.Printf("[Captcha] Success via manual fallback (response_path=%v)", resp != "") + markCaptchaSuccess(isTunnel) + return tok, resp, nil + } + return "", "", fmt.Errorf("captchaNotRobot API failed: %w (manual fallback also failed: %v)", err, mErr) + } + return "", "", fmt.Errorf("captchaNotRobot API failed: %w", err) } log.Printf("[Captcha] Success! Got success_token") - return successToken, nil + return successToken, "", nil } -func fetchPowInput(ctx context.Context, client *http.Client, profile Profile, redirectUri string) (string, int, map[string]interface{}, error) { - req, err := http.NewRequestWithContext(ctx, "GET", redirectUri, nil) +func fetchPowInput(ctx context.Context, client tlsclient.HttpClient, profile Profile, redirectUri string) (string, int, map[string]interface{}, error) { + req, err := fhttp.NewRequest("GET", redirectUri, nil) if err != nil { return "", 0, nil, err } + req = withCaptchaCtx(ctx, req) req.Header.Set("User-Agent", profile.UserAgent) req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8") req.Header.Set("Accept-Language", "en-US,en;q=0.9") - req.Header.Set("sec-ch-ua", profile.SecChUa) - req.Header.Set("sec-ch-ua-mobile", profile.SecChUaMobile) - req.Header.Set("sec-ch-ua-platform", profile.SecChUaPlatform) + // Safari iOS deliberately doesn't implement Client Hints — sending + // sec-ch-ua* from a Safari UA was itself a bot tell on the old + // net/http path. With Safari_IOS_18_0 fingerprint we mirror real + // Safari at every layer, so we drop these unconditionally. 
req.Header.Set("Sec-Fetch-Site", "none") req.Header.Set("Sec-Fetch-Mode", "navigate") req.Header.Set("Sec-Fetch-Dest", "document") - req.Header.Set("Sec-GPC", "1") - req.Header.Set("DNT", "1") + applySafariHeaderOrder(req) resp, err := client.Do(req) if err != nil { @@ -204,6 +284,20 @@ func fetchPowInput(ctx context.Context, client *http.Client, profile Profile, re } } + // Locate not_robot_captcha.js so callCaptchaNotRobot can fetch + // the live debug_info hash from it (see captcha_debug_info.go). + // Empty string is OK — the caller handles the absent-script path. + scriptURL := extractScriptURL(html) + if scriptURL != "" { + // Stash on htmlSettings so we don't need to grow the function + // signature. The map is opaque downstream apart from + // captchaNotRobot.check. + if htmlSettings == nil { + htmlSettings = map[string]interface{}{} + } + htmlSettings["_scriptURL"] = scriptURL + } + return powInput, difficulty, htmlSettings, nil } @@ -222,30 +316,29 @@ func solvePoW(powInput string, difficulty int) string { return "" } -func callCaptchaNotRobot(ctx context.Context, client *http.Client, profile Profile, sessionToken, hash string, htmlSettings map[string]interface{}) (string, error) { +func callCaptchaNotRobot(ctx context.Context, client tlsclient.HttpClient, profile Profile, sessionToken, hash string, htmlSettings map[string]interface{}, isTunnel bool) (string, error) { vkReq := func(method string, postData string) (map[string]interface{}, error) { - requestURL := "https://api.vk.ru/method/" + method + "?v=5.131" + requestURL := "https://api.vk.com/method/" + method + "?v=5.131" - req, err := http.NewRequestWithContext(ctx, "POST", requestURL, strings.NewReader(postData)) + req, err := fhttp.NewRequest("POST", requestURL, strings.NewReader(postData)) if err != nil { return nil, err } + req = withCaptchaCtx(ctx, req) req.Header.Set("User-Agent", profile.UserAgent) req.Header.Set("Content-Type", "application/x-www-form-urlencoded") req.Header.Set("Accept", 
"*/*") req.Header.Set("Accept-Language", "en-US,en;q=0.9") - req.Header.Set("Origin", "https://id.vk.ru") - req.Header.Set("Referer", "https://id.vk.ru/") - req.Header.Set("sec-ch-ua", profile.SecChUa) - req.Header.Set("sec-ch-ua-mobile", profile.SecChUaMobile) - req.Header.Set("sec-ch-ua-platform", profile.SecChUaPlatform) + req.Header.Set("Origin", "https://id.vk.com") + req.Header.Set("Referer", "https://id.vk.com/") + // No sec-ch-ua* — Safari doesn't send them; sending from a + // Safari fingerprint is itself a classifier tell. req.Header.Set("Sec-Fetch-Site", "same-site") req.Header.Set("Sec-Fetch-Mode", "cors") req.Header.Set("Sec-Fetch-Dest", "empty") - req.Header.Set("Sec-GPC", "1") - req.Header.Set("DNT", "1") req.Header.Set("Priority", "u=1, i") + applySafariHeaderOrder(req) httpResp, err := client.Do(req) if err != nil { @@ -281,34 +374,34 @@ func callCaptchaNotRobot(ctx context.Context, client *http.Client, profile Profi // Step 2: componentDone log.Printf("[Captcha] Step 2/4: componentDone") - browserFp := fmt.Sprintf("%016x%016x", mathrand.Int63(), mathrand.Int63()) - - resolutions := [][]int{{1920, 1080}, {1366, 768}, {1440, 900}, {1536, 864}, {2560, 1440}} - res := resolutions[mathrand.Intn(len(resolutions))] - screenW, screenH := res[0], res[1] - - cores := []int{4, 8, 12, 16}[mathrand.Intn(4)] - ram := []int{4, 8, 16, 32}[mathrand.Intn(4)] - - baseDownlink := 8.0 + mathrand.Float64()*4.0 - downlinkStr := fmt.Sprintf("%.1f", baseDownlink) - + // crypto/rand-backed 32-hex-char browser fingerprint. The pre-v2 + // version used math/rand which is seeded weakly and predictably. + browserFp := randomHex(16) + + // v2 device shape: 11 fixed fields matching what a real desktop + // browser reports through navigator.* probes. The pre-v2 version + // randomised resolutions and CPU counts per call, which created + // a per-solve fingerprint churn that VK's classifier could + // correlate against the stable TLS fingerprint and flag as bot + // behaviour. 
v2 ships the same desktop Chrome 8-core/1080p shape + // every time; combined with random browser_fp + cursor jitter + // it's noisy enough on the variable signals that matter. + const ( + screenW = 1920 + screenH = 1080 + ) deviceMap := map[string]interface{}{ "screenWidth": screenW, "screenHeight": screenH, "screenAvailWidth": screenW, - "screenAvailHeight": screenH - 40, - "innerWidth": screenW - mathrand.Intn(100), - "innerHeight": screenH - 100 - mathrand.Intn(50), - "devicePixelRatio": []float64{1, 1.25, 1.5, 2}[mathrand.Intn(4)], + "screenAvailHeight": screenH, + "innerWidth": screenW, + "innerHeight": 951, + "devicePixelRatio": 1, "language": "en-US", "languages": []string{"en-US", "en"}, "webdriver": false, - "hardwareConcurrency": cores, - "deviceMemory": ram, - "connectionEffectiveType": "4g", - "connectionRtt": []int{50, 100, 150}[mathrand.Intn(3)], - "connectionDownlink": baseDownlink, + "hardwareConcurrency": 8, "notificationsPermission": "denied", } deviceBytes, _ := json.Marshal(deviceMap) @@ -346,11 +439,24 @@ func callCaptchaNotRobot(ctx context.Context, client *http.Client, profile Profi } cursorBytes, _ := json.Marshal(cursor) - connectionDownlink := "[" + downlinkStr + "," + downlinkStr + "," + downlinkStr + "," + downlinkStr + "," + downlinkStr + "," + downlinkStr + "," + downlinkStr + "]" - answer := base64.StdEncoding.EncodeToString([]byte("{}")) - debugInfo := "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855" + // Fetch debug_info from not_robot_captcha.js (cached). If we can't + // (no scriptURL in HTML, or fetch failed), fall back to the legacy + // constant — better than skipping check entirely; on a healthy + // build the constant happens to match and we degrade gracefully. 
+ scriptURL, _ := htmlSettings["_scriptURL"].(string) + debugInfo, debugErr := fetchDebugInfo(ctx, client, profile, scriptURL) + if debugErr != nil { + log.Printf("[Captcha] fetchDebugInfo: %v — using legacy constant", debugErr) + debugInfo = "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855" + } + + // All motion arrays empty per v2 wire shape: VK's classifier looks + // for "client emits zero device events" as a sign of an honest + // desktop browser (not a touch-event-streaming mobile). The pre-v2 + // populated connectionDownlink array was a low-signal noise + // generator that didn't fool anything. checkData := baseParams + fmt.Sprintf( "&accelerometer=%s&gyroscope=%s&motion=%s&cursor=%s&taps=%s&connectionRtt=%s&connectionDownlink=%s"+ "&browser_fp=%s&hash=%s&answer=%s&debug_info=%s", @@ -360,7 +466,7 @@ func callCaptchaNotRobot(ctx context.Context, client *http.Client, profile Profi url.QueryEscape(string(cursorBytes)), url.QueryEscape("[]"), url.QueryEscape("[]"), - url.QueryEscape(connectionDownlink), + url.QueryEscape("[]"), browserFp, hash, answer, @@ -378,19 +484,40 @@ func callCaptchaNotRobot(ctx context.Context, client *http.Client, profile Profi } status, _ := respObj["status"].(string) - log.Printf("[Captcha] checkbox status: %s", status) + showType, _ := respObj["show_captcha_type"].(string) + log.Printf("[Captcha] checkbox status: %s show_type=%q", status, showType) if status == "OK" { successToken, ok := respObj["success_token"].(string) if ok && successToken != "" { log.Printf("[Captcha] Step 4/4: endSession") _, _ = vkReq("captchaNotRobot.endSession", baseParams) + markCaptchaSuccess(isTunnel) return successToken, nil } } - // Checkbox failed — try slider captcha - log.Printf("[Captcha] Checkbox failed, trying slider captcha...") + if status == "ERROR_LIMIT" { + // VK rate-limited the source IP. 
The slider path uses the + // same egress and the same rate-limit bucket, so trying slider + // would just burn another doomed request and (worse) saturate + // the next iteration earlier. Surface the error and let the + // outer retry storm controller decide. + markCaptchaSaturated(isTunnel) + return "", fmt.Errorf("captchaNotRobot.check ERROR_LIMIT (no slider fallback under rate-limit)") + } + + // v2 routing: only attempt slider when VK explicitly says BOT + // AND we have slider settings to feed it. Other non-OK statuses + // (server errors, unknown) shouldn't auto-fall-through to slider + // because slider is a separate, heavier request that VK can also + // 4xx independently. + sliderEligible := status == "BOT" && (showType == "" || showType == "slider") + if !sliderEligible { + return "", fmt.Errorf("captchaNotRobot.check non-OK status=%q show_type=%q", status, showType) + } + + log.Printf("[Captcha] Checkbox status=BOT show_type=%q, switching to slider", showType) // Use htmlSettings from the HTML page if available, otherwise use API settings mergedSettings := settingsResp @@ -398,13 +525,17 @@ func callCaptchaNotRobot(ctx context.Context, client *http.Client, profile Profi mergedSettings = htmlSettings } - sliderToken, sliderErr := solveSliderCaptcha(vkReq, baseParams, browserFp, hash, mergedSettings) + sliderToken, sliderErr := solveSliderCaptcha(vkReq, baseParams, browserFp, hash, debugInfo, mergedSettings, isTunnel) if sliderErr != nil { + // saturation accounting now happens inside solveSliderCaptcha + // at the exact branch (ERROR_LIMIT or unparseable_response), + // so this caller just propagates the error. return "", fmt.Errorf("slider captcha also failed: %w", sliderErr) } log.Printf("[Captcha] Slider solved! 
endSession...") _, _ = vkReq("captchaNotRobot.endSession", baseParams) + markCaptchaSuccess(isTunnel) return sliderToken, nil } diff --git a/wireguard-apple/Sources/WireGuardKitGo/wireguard.h b/wireguard-apple/Sources/WireGuardKitGo/wireguard.h index 222deeb..e021972 100644 --- a/wireguard-apple/Sources/WireGuardKitGo/wireguard.h +++ b/wireguard-apple/Sources/WireGuardKitGo/wireguard.h @@ -32,9 +32,56 @@ extern char *LibXrayStopXray(); extern char *LibXrayXrayVersion(); extern char* LibXraySetSockCallback(libxray_sockcallback cb, void* ctx); -extern void StartProxy(const char *link, const char *peerAddrStr, const char *localAddrStr, int n); +extern void StartProxy(const char *link, const char *peerAddrStr, const char *localAddrStr, int n, int udp); extern void StopProxy(void); +extern void RestartProxy(void); +extern void ProxyForceReconnect(void); extern void ProxySetLogger(void *context, logger_fn_t logger_fn); extern int ProxyWaitReady(int timeoutMs); +extern void ProxySetRemoteCaptchaService(const char *url, const char *apiKey); + +typedef void (*manual_captcha_cb_t)(const char *request_id, const char *redirect_uri); +extern void TurnBridgeSetManualCaptchaCallback(manual_captcha_cb_t cb); +extern void TurnBridgeSubmitManualCaptchaToken(const char *request_id, const char *token); +extern void TurnBridgeCancelManualCaptcha(const char *request_id, const char *reason); +extern void TurnBridgeSetManualCaptchaMode(int enabled); + +/* Returns a JSON {"url":..., "body":...} describing the request the + * WebView should make after extracting success_token, inside its own + * browser session. Caller must free() the returned string. NULL when + * no retry is configured for this request_id (legacy token-only flow). + * Used by the network-extension's CaptchaBridge to populate the + * PendingRequest with retry params for the app. 
*/ +extern char *TurnBridgeGetManualCaptchaRetryRequest(const char *request_id); + +/* Delivers the full JSON response from the WebView's in-session API + * replay. getCreds then skips its own redemption call. Pass a non- + * empty token via TurnBridgeSubmitManualCaptchaToken when the WebView + * couldn't do the replay (fetch failed) so the legacy path still + * runs. */ +extern void TurnBridgeSubmitManualCaptchaResponse(const char *request_id, const char *response_json); +extern void TurnBridgeSetStreamAggregation(int enabled); +extern void TurnBridgeSetCaptchaTrapDir(const char *path); +extern int TurnBridgeGetCaptchaDirectCount(void); +extern int TurnBridgeGetCaptchaTunnelCount(void); +extern int TurnBridgeGetCaptchaDirectAttempts(void); +extern int TurnBridgeGetCaptchaTunnelAttempts(void); +extern int TurnBridgeGetCaptchaDirectInFlight(void); +extern int TurnBridgeGetCaptchaTunnelInFlight(void); +extern int TurnBridgeGetCaptchaRemoteCount(void); +extern int TurnBridgeGetCaptchaRemoteAttempts(void); +extern int TurnBridgeGetCaptchaRemoteInFlight(void); +extern int TurnBridgeIsCaptchaDirectSaturated(void); +extern int TurnBridgeIsCaptchaTunnelSaturated(void); +extern int TurnBridgeGetSessionsReady(void); +extern int TurnBridgeGetSessionsTarget(void); + +/* SRTP/Opus mimicry layer (see wrap.go). Empty / NULL key disables the + * wrap and falls back to the legacy direct DTLS-over-TURN path. Set + * BEFORE StartProxy — already-live sessions don't pick up key changes. + * The matching server must be running vk-turn-proxy with the same key + * configured (-wrap -wrap-key=<hex>); without that, AEAD will fail on + * every packet and no traffic flows. 
*/ +extern void TurnBridgeSetWrapKey(const char *hexKey); #endif diff --git a/wireguard-apple/Sources/WireGuardKitGo/wrap.go b/wireguard-apple/Sources/WireGuardKitGo/wrap.go new file mode 100644 index 0000000..b155807 --- /dev/null +++ b/wireguard-apple/Sources/WireGuardKitGo/wrap.go @@ -0,0 +1,198 @@ +// wrap.go — SRTP/Opus mimicry layer between our DTLS-encrypted WG +// payload and the TURN ChannelData frame on the wire. +// +// What VK's DPI sees on real call traffic between two clients via TURN: +// - DTLS handshake records (type 0x16) at the start +// - then SRTP frames carrying Opus voice — RTP header version=2, +// payload type 111 (Opus), monotonic seq+timestamp+SSRC, followed +// by an AEAD ciphertext. +// +// What VK sees on OUR traffic without wrap: +// - DTLS handshake records (fine) +// - then DTLS application-data records (type 0x17) forever +// +// The two diverge sharply after the handshake completes. VK appears to +// run a fast-path classifier on TURN ChannelData payloads: SRTP-shaped +// gets forwarded freely, anything else (incl. plain DTLS application- +// data) gets the rate-limit treatment we've been observing. wrap.go +// re-frames our DTLS records so the wire bytes match the shape of a +// real Opus voice stream — VK can't DPI past the AEAD ciphertext so +// they can't tell our "Opus" is actually WireGuard inside DTLS. +// +// Wire format (per packet): +// +// [12B RTP header | 12B explicit nonce | AEAD ciphertext | 16B tag] +// +// RTP header (RFC 3550): +// byte 0: 0x80 V=2, P=0, X=0, CC=0 +// byte 1: 0x6F M=0, PT=111 (Opus) +// byte 2-3: seq16 BE monotonic, init random +// byte 4-7: ts32 BE monotonic, init random, +960 per packet +// (20ms at 48kHz, the standard Opus framing) +// byte 8-11: SSRC random per conn, MSB encodes direction +// +// 12B explicit nonce = 4B sessionID || 8B counter (BE). sessionID MSB +// matches SSRC MSB (direction bit so client and server pick disjoint +// nonce subspaces despite sharing the same key). 
counter starts at a +// random uint64. +// +// AAD = first 24 bytes (RTP header || nonce). Authenticating these +// means the seq/timestamp/SSRC are spoof-proof — VK can't reorder +// or replay one packet's bytes into another's slot without AEAD +// failure. +// +// AEAD is ChaCha20-Poly1305 (RFC 7539). The shared 32-byte key is +// configured out of band; both ends must have the same key. Real SRTP +// uses AES-GCM (RFC 7714); we use ChaCha20-Poly1305 because the wire +// ciphertext/tag length is the same and ChaCha20 is faster on mobile +// CPUs without AES-NI. VK's DPI can't distinguish — it's looking at +// the RTP framing, not the cipher choice. +// +// Verbatim port from Moroka8/vk-turn-proxy/pkg/clientcore/wrap.go. +// Server-side counterpart lives in that same project and must be +// configured with the matching key. + +package main + +import ( + "crypto/cipher" + "crypto/rand" + "encoding/binary" + "encoding/hex" + "errors" + "fmt" + "sync/atomic" + + "golang.org/x/crypto/chacha20poly1305" +) + +const ( + wrapKeyLen = 32 + wrapRTPHdrLen = 12 + wrapNonceLen = 12 + wrapTagLen = 16 + wrapHeaderLen = wrapRTPHdrLen + wrapNonceLen + wrapOverhead = wrapHeaderLen + wrapTagLen + wrapRTPVersion = 0x80 + wrapRTPPT = 0x6F + wrapTSStep = 960 +) + +type wrapConn struct { + aead cipher.AEAD + sessionID [4]byte + ssrc [4]byte + counter atomic.Uint64 + seq atomic.Uint32 + timestamp atomic.Uint32 +} + +func newWrapConn(key []byte, isServer bool) (*wrapConn, error) { + if len(key) != wrapKeyLen { + return nil, fmt.Errorf("wrap: key must be %d bytes (got %d)", wrapKeyLen, len(key)) + } + aead, err := chacha20poly1305.New(key) + if err != nil { + return nil, fmt.Errorf("wrap: aead init: %w", err) + } + w := &wrapConn{aead: aead} + + var rnd [16]byte + if _, err := rand.Read(rnd[:]); err != nil { + return nil, fmt.Errorf("wrap: rand init: %w", err) + } + copy(w.sessionID[:], rnd[0:4]) + copy(w.ssrc[:], rnd[4:8]) + if isServer { + w.sessionID[0] |= 0x80 + w.ssrc[0] |= 0x80 + } 
else { + w.sessionID[0] &^= 0x80 + w.ssrc[0] &^= 0x80 + } + w.seq.Store(uint32(binary.BigEndian.Uint16(rnd[8:10]))) + w.timestamp.Store(binary.BigEndian.Uint32(rnd[10:14])) + + var cb [8]byte + if _, err := rand.Read(cb[:]); err != nil { + return nil, fmt.Errorf("wrap: counter rand: %w", err) + } + w.counter.Store(binary.BigEndian.Uint64(cb[:])) + return w, nil +} + +func wrapMaxWire(payloadLen int) int { + return wrapOverhead + payloadLen +} + +func (w *wrapConn) wrapInto(dst, payload []byte) (int, error) { + wireLen := wrapOverhead + len(payload) + if len(dst) < wireLen { + return 0, errors.New("wrap: dst buffer too small") + } + + dst[0] = wrapRTPVersion + dst[1] = wrapRTPPT + seq := uint16(w.seq.Add(1) - 1) + binary.BigEndian.PutUint16(dst[2:4], seq) + ts := w.timestamp.Add(wrapTSStep) - wrapTSStep + binary.BigEndian.PutUint32(dst[4:8], ts) + copy(dst[8:12], w.ssrc[:]) + + noncePos := wrapRTPHdrLen + copy(dst[noncePos:noncePos+4], w.sessionID[:]) + ctr := w.counter.Add(1) - 1 + binary.BigEndian.PutUint64(dst[noncePos+4:noncePos+wrapNonceLen], ctr) + + nonce := dst[noncePos : noncePos+wrapNonceLen] + aad := dst[:wrapHeaderLen] + ctPos := wrapHeaderLen + copy(dst[ctPos:], payload) + w.aead.Seal(dst[ctPos:ctPos], nonce, dst[ctPos:ctPos+len(payload)], aad) + + return wireLen, nil +} + +func (w *wrapConn) unwrapPacket(wire, dst []byte) (int, error) { + if len(wire) < wrapOverhead { + return 0, errors.New("wrap: packet too short") + } + nonce := wire[wrapRTPHdrLen : wrapRTPHdrLen+wrapNonceLen] + aad := wire[:wrapHeaderLen] + ct := wire[wrapHeaderLen:] + + plain, err := w.aead.Open(ct[:0], nonce, ct, aad) + if err != nil { + return 0, fmt.Errorf("wrap: AEAD open: %w", err) + } + if len(plain) > len(dst) { + return 0, errors.New("wrap: dst buffer too small") + } + copy(dst[:len(plain)], plain) + return len(plain), nil +} + +func genWrapKeyHex() (string, error) { + key := make([]byte, wrapKeyLen) + if _, err := rand.Read(key); err != nil { + return "", fmt.Errorf("wrap: 
key gen: %w", err) + } + return hex.EncodeToString(key), nil +} + +func decodeWrapKey(enabled bool, raw string) ([]byte, error) { + if !enabled { + return nil, nil + } + if raw == "" { + return nil, errors.New("wrap enabled but key is empty") + } + key, err := hex.DecodeString(raw) + if err != nil { + return nil, fmt.Errorf("wrap-key invalid hex: %w", err) + } + if len(key) != wrapKeyLen { + return nil, fmt.Errorf("wrap-key must decode to %d bytes (got %d)", wrapKeyLen, len(key)) + } + return key, nil +} diff --git a/wireguard-apple/Sources/WireGuardKitGo/wrap_config.go b/wireguard-apple/Sources/WireGuardKitGo/wrap_config.go new file mode 100644 index 0000000..ad4c096 --- /dev/null +++ b/wireguard-apple/Sources/WireGuardKitGo/wrap_config.go @@ -0,0 +1,78 @@ +// wrap_config.go — Swift→Go bridge for the wrap-key configuration. +// +// Swift sets the wrap key (64 hex chars = 32 bytes) before calling +// StartProxy. Empty key disables wrap entirely; oneTurnConnection +// then takes the legacy code path. Non-empty key must decode +// successfully or StartProxy logs and aborts before spawning sessions +// — there's no point opening 60 TURN allocations that VK will then +// drop on the wire-format mismatch. + +package main + +/* +#include +*/ +import "C" + +import ( + "log" + "sync/atomic" +) + +// wrapKeyBytes holds the decoded 32-byte key. nil = wrap disabled. +// Updated atomically by TurnBridgeSetWrapKey before StartProxy reads +// it; oneTurnConnection re-reads on every session start so the value +// in effect always matches what Swift last published. 
+var wrapKeyBytes atomic.Pointer[[]byte] + +//export TurnBridgeSetWrapKey +func TurnBridgeSetWrapKey(cKey *C.char) { + if cKey == nil { + wrapKeyBytes.Store(nil) + log.Printf("wrap: disabled (nil key)") + return + } + raw := C.GoString(cKey) + if raw == "" { + wrapKeyBytes.Store(nil) + log.Printf("wrap: disabled (empty key)") + return + } + key, err := decodeWrapKey(true, raw) + if err != nil { + // Don't half-enable: leave the previous value intact so a + // bad input doesn't suddenly turn a working session off. + log.Printf("wrap: TurnBridgeSetWrapKey rejected: %v", err) + return + } + wrapKeyBytes.Store(&key) + log.Printf("wrap: enabled (key set, %d bytes)", len(key)) +} + +// currentWrapKey returns the active wrap key or nil if disabled. +// Each oneTurnConnection calls this once on session start so a key +// change mid-flight takes effect on the next reconnect, not on +// already-live sessions (changing it mid-stream would just AEAD-fail +// every subsequent packet on the existing allocation). +func currentWrapKey() []byte { + p := wrapKeyBytes.Load() + if p == nil { + return nil + } + return *p +} + +//export TurnBridgeGenerateWrapKey +func TurnBridgeGenerateWrapKey() *C.char { + // Convenience for the iOS Settings UI: generates a fresh 32-byte + // random key and returns it as 64 hex chars. Swift takes the + // returned string, displays it for the user to copy to the + // server config, and persists it locally. Caller MUST free the + // returned string via free() — see SwiftBridge.swift. + hex, err := genWrapKeyHex() + if err != nil { + log.Printf("wrap: genWrapKeyHex failed: %v", err) + return nil + } + return C.CString(hex) +}