diff --git a/.vscode/cspell.misc.yaml b/.vscode/cspell.misc.yaml index 66a501e4eb8..9b6242d0f58 100644 --- a/.vscode/cspell.misc.yaml +++ b/.vscode/cspell.misc.yaml @@ -20,6 +20,22 @@ overrides: words: - MSRC - msrc + - filename: ./docs/specs/metrics-audit/** + words: + - vsrpc + - Buildpacks + - devdeviceid + - appinit + - oneauth + - dashboarding + - Pseudonymized + - pseudonymized + - unhashed + - countif + - Angelos + - Entra + - CODEOWNERS + - weikanglim - filename: ./README.md words: - VSIX diff --git a/cli/azd/cmd/auth_login.go b/cli/azd/cmd/auth_login.go index b0687f80be4..66459ea7012 100644 --- a/cli/azd/cmd/auth_login.go +++ b/cli/azd/cmd/auth_login.go @@ -20,6 +20,8 @@ import ( "github.com/azure/azure-dev/cli/azd/cmd/actions" "github.com/azure/azure-dev/cli/azd/internal" "github.com/azure/azure-dev/cli/azd/internal/runcontext" + "github.com/azure/azure-dev/cli/azd/internal/tracing" + "github.com/azure/azure-dev/cli/azd/internal/tracing/fields" "github.com/azure/azure-dev/cli/azd/pkg/account" "github.com/azure/azure-dev/cli/azd/pkg/auth" "github.com/azure/azure-dev/cli/azd/pkg/contracts" @@ -307,6 +309,7 @@ func (la *loginAction) Run(ctx context.Context) (*actions.ActionResult, error) { } if la.flags.onlyCheckStatus { + tracing.SetUsageAttributes(fields.AuthMethodKey.String("check-status")) // In check status mode, we always print the final status to stdout. // We print any non-setup related errors to stderr. // We always return a zero exit code. 
@@ -452,6 +455,11 @@ func runningOnCodespacesBrowser(ctx context.Context, commandRunner exec.CommandR } func (la *loginAction) login(ctx context.Context) error { + // Track tenant ID if provided (before resolving from env vars) + if la.flags.tenantID != "" { + tracing.SetUsageAttributes(fields.TenantIdKey.String(la.flags.tenantID)) + } + if la.flags.federatedTokenProvider == azurePipelinesProvider { if la.flags.clientID == "" { log.Printf("setting client id from environment variable %s", azurePipelinesClientIDEnvVarName) @@ -465,6 +473,7 @@ func (la *loginAction) login(ctx context.Context) error { } if la.flags.managedIdentity { + tracing.SetUsageAttributes(fields.AuthMethodKey.String("managed-identity")) if _, err := la.authManager.LoginWithManagedIdentity( ctx, la.flags.clientID, ); err != nil { @@ -494,6 +503,7 @@ func (la *loginAction) login(ctx context.Context) error { switch { case la.flags.clientSecret.ptr != nil: + tracing.SetUsageAttributes(fields.AuthMethodKey.String("service-principal-secret")) if *la.flags.clientSecret.ptr == "" { v, err := la.console.Prompt(ctx, input.ConsoleOptions{ Message: "Enter your client secret", @@ -510,6 +520,7 @@ func (la *loginAction) login(ctx context.Context) error { return fmt.Errorf("logging in: %w", err) } case la.flags.clientCertificate != "": + tracing.SetUsageAttributes(fields.AuthMethodKey.String("service-principal-certificate")) certFile, err := os.Open(la.flags.clientCertificate) if err != nil { return fmt.Errorf("reading certificate: %w", err) @@ -527,12 +538,14 @@ func (la *loginAction) login(ctx context.Context) error { return fmt.Errorf("logging in: %w", err) } case la.flags.federatedTokenProvider == "github": + tracing.SetUsageAttributes(fields.AuthMethodKey.String("federated-github")) if _, err := la.authManager.LoginWithGitHubFederatedTokenProvider( ctx, la.flags.tenantID, la.flags.clientID, ); err != nil { return fmt.Errorf("logging in: %w", err) } case la.flags.federatedTokenProvider == 
azurePipelinesProvider: + tracing.SetUsageAttributes(fields.AuthMethodKey.String("federated-azure-pipelines")) serviceConnectionID := os.Getenv(azurePipelinesServiceConnectionIDEnvVarName) if serviceConnectionID == "" { @@ -546,6 +559,7 @@ func (la *loginAction) login(ctx context.Context) error { return fmt.Errorf("logging in: %w", err) } case la.flags.federatedTokenProvider == "oidc": // generic oidc provider + tracing.SetUsageAttributes(fields.AuthMethodKey.String("federated-oidc")) if _, err := la.authManager.LoginWithOidcFederatedTokenProvider( ctx, la.flags.tenantID, la.flags.clientID, ); err != nil { @@ -557,6 +571,7 @@ func (la *loginAction) login(ctx context.Context) error { } if la.authManager.UseExternalAuth() { + tracing.SetUsageAttributes(fields.AuthMethodKey.String("external")) // Request a token and assume the external auth system will prompt the user to log in. // // TODO(ellismg): We may want instead to call some explicit `/login` endpoint on the external auth system instead @@ -581,6 +596,7 @@ func (la *loginAction) login(ctx context.Context) error { } if useDevCode { + tracing.SetUsageAttributes(fields.AuthMethodKey.String("device-code")) _, err = la.authManager.LoginWithDeviceCode(ctx, la.flags.tenantID, la.flags.scopes, claims, func(url string) error { if !la.flags.global.NoPrompt { @@ -598,8 +614,10 @@ func (la *loginAction) login(ctx context.Context) error { } if oneauth.Supported && !la.flags.browser { + tracing.SetUsageAttributes(fields.AuthMethodKey.String("oneauth")) err = la.authManager.LoginWithOneAuth(ctx, la.flags.tenantID, la.flags.scopes) } else { + tracing.SetUsageAttributes(fields.AuthMethodKey.String("browser")) _, err = la.authManager.LoginInteractive(ctx, la.flags.scopes, claims, &auth.LoginInteractiveOptions{ TenantID: la.flags.tenantID, diff --git a/cli/azd/cmd/env.go b/cli/azd/cmd/env.go index fb237a0293f..5f299e8bb91 100644 --- a/cli/azd/cmd/env.go +++ b/cli/azd/cmd/env.go @@ -17,6 +17,8 @@ import ( 
"github.com/Azure/azure-sdk-for-go/sdk/azcore/arm" "github.com/azure/azure-dev/cli/azd/cmd/actions" "github.com/azure/azure-dev/cli/azd/internal" + "github.com/azure/azure-dev/cli/azd/internal/tracing" + "github.com/azure/azure-dev/cli/azd/internal/tracing/fields" "github.com/azure/azure-dev/cli/azd/pkg/account" "github.com/azure/azure-dev/cli/azd/pkg/alpha" "github.com/azure/azure-dev/cli/azd/pkg/azapi" @@ -874,6 +876,8 @@ func (e *envListAction) Run(ctx context.Context) (*actions.ActionResult, error) return nil, fmt.Errorf("listing environments: %w", err) } + tracing.SetUsageAttributes(fields.EnvCountKey.Int(len(envs))) + if e.formatter.Kind() == output.TableFormat { columns := []output.Column{ { diff --git a/cli/azd/cmd/hooks.go b/cli/azd/cmd/hooks.go index e8f9731ca47..1cc582a05f1 100644 --- a/cli/azd/cmd/hooks.go +++ b/cli/azd/cmd/hooks.go @@ -9,6 +9,8 @@ import ( "github.com/azure/azure-dev/cli/azd/cmd/actions" "github.com/azure/azure-dev/cli/azd/internal" + "github.com/azure/azure-dev/cli/azd/internal/tracing" + "github.com/azure/azure-dev/cli/azd/internal/tracing/fields" "github.com/azure/azure-dev/cli/azd/pkg/environment" "github.com/azure/azure-dev/cli/azd/pkg/exec" "github.com/azure/azure-dev/cli/azd/pkg/ext" @@ -115,9 +117,46 @@ const ( hookContextService hookContextType = "service" ) +// knownHookNames is the set of built-in azd hook names. +// Extension-defined hooks are not included here; they are hashed in telemetry. +// See https://github.com/Azure/azure-dev/issues/7348 for tracking. 
+var knownHookNames = map[string]bool{ + "prebuild": true, + "postbuild": true, + "predeploy": true, + "postdeploy": true, + "predown": true, + "postdown": true, + "prepackage": true, + "postpackage": true, + "preprovision": true, + "postprovision": true, + "prepublish": true, + "postpublish": true, + "prerestore": true, + "postrestore": true, + "preup": true, + "postup": true, +} + func (hra *hooksRunAction) Run(ctx context.Context) (*actions.ActionResult, error) { hookName := hra.args[0] + hookType := "project" + if hra.flags.service != "" { + hookType = "service" + } + + // Log known hook names raw; hash unknown names to avoid logging arbitrary user input. + hookNameAttr := fields.StringHashed(fields.HooksNameKey, hookName) + if knownHookNames[hookName] { + hookNameAttr = fields.HooksNameKey.String(hookName) + } + tracing.SetUsageAttributes( + hookNameAttr, + fields.HooksTypeKey.String(hookType), + ) + // Command title hra.console.MessageUxItem(ctx, &ux.MessageTitle{ Title: "Running hooks (azd hooks run)", diff --git a/cli/azd/cmd/infra_generate.go b/cli/azd/cmd/infra_generate.go index 697e85915c5..9a8bdb7778f 100644 --- a/cli/azd/cmd/infra_generate.go +++ b/cli/azd/cmd/infra_generate.go @@ -13,6 +13,8 @@ import ( "github.com/azure/azure-dev/cli/azd/cmd/actions" "github.com/azure/azure-dev/cli/azd/internal" + "github.com/azure/azure-dev/cli/azd/internal/tracing" + "github.com/azure/azure-dev/cli/azd/internal/tracing/fields" "github.com/azure/azure-dev/cli/azd/pkg/alpha" "github.com/azure/azure-dev/cli/azd/pkg/environment/azdcontext" "github.com/azure/azure-dev/cli/azd/pkg/input" @@ -85,6 +87,16 @@ func newInfraGenerateAction( } func (a *infraGenerateAction) Run(ctx context.Context) (*actions.ActionResult, error) { + // Track infra provider from project configuration + // Emit "auto" when provider is empty, so we know auto-detection was used. 
+ if a.projectConfig != nil { + provider := string(a.projectConfig.Infra.Provider) + if provider == "" { + provider = "auto" + } + tracing.SetUsageAttributes(fields.InfraProviderKey.String(provider)) + } + if a.calledAs == "synth" { fmt.Fprintln( a.console.Handles().Stderr, diff --git a/cli/azd/cmd/pipeline.go b/cli/azd/cmd/pipeline.go index 1bd591f2c13..5c2560ef30f 100644 --- a/cli/azd/cmd/pipeline.go +++ b/cli/azd/cmd/pipeline.go @@ -10,6 +10,8 @@ import ( "github.com/MakeNowJust/heredoc/v2" "github.com/azure/azure-dev/cli/azd/cmd/actions" "github.com/azure/azure-dev/cli/azd/internal" + "github.com/azure/azure-dev/cli/azd/internal/tracing" + "github.com/azure/azure-dev/cli/azd/internal/tracing/fields" "github.com/azure/azure-dev/cli/azd/pkg/alpha" "github.com/azure/azure-dev/cli/azd/pkg/environment" "github.com/azure/azure-dev/cli/azd/pkg/infra/provisioning" @@ -170,6 +172,13 @@ func (p *pipelineConfigAction) Run(ctx context.Context) (*actions.ActionResult, // Command title pipelineProviderName := p.manager.CiProviderName() + + // Track the resolved pipeline provider (after CiProviderName resolves auto-detection). + // cmd.flags already indicates whether --provider was explicitly set by the user. + tracing.SetUsageAttributes(fields.PipelineProviderKey.String(pipelineProviderName)) + if p.flags.PipelineAuthTypeName != "" { + tracing.SetUsageAttributes(fields.PipelineAuthKey.String(p.flags.PipelineAuthTypeName)) + } p.console.MessageUxItem(ctx, &ux.MessageTitle{ Title: fmt.Sprintf("Configure your %s pipeline", pipelineProviderName), }) diff --git a/cli/azd/cmd/telemetry_coverage_test.go b/cli/azd/cmd/telemetry_coverage_test.go new file mode 100644 index 00000000000..3aaef855bcf --- /dev/null +++ b/cli/azd/cmd/telemetry_coverage_test.go @@ -0,0 +1,197 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +package cmd + +import ( + "testing" + + "github.com/azure/azure-dev/cli/azd/internal/tracing/fields" + "github.com/stretchr/testify/require" +) + +// TestTelemetryFieldConstants verifies that all telemetry field constants added for +// command-specific instrumentation are properly defined and produce valid attribute +// key-value pairs. This is a contract test: if a field constant is removed or renamed, +// this test will fail, catching regressions in the telemetry schema. +// +// NOTE: This test validates field definitions, not command-level instrumentation. +// Command-level coverage is enforced via the documented allowlist in +// TestCommandTelemetryCoverageAllowlist (below) and the feature-telemetry-matrix.md. +// Full AST-based scanning of SetUsageAttributes calls is a future enhancement. +func TestTelemetryFieldConstants(t *testing.T) { + // Auth command telemetry fields + t.Run("AuthFields", func(t *testing.T) { + kv := fields.AuthMethodKey.String("browser") + require.Equal(t, "auth.method", string(kv.Key)) + require.Equal(t, "browser", kv.Value.AsString()) + + // Verify all auth method values are valid strings + authMethods := []string{ + "browser", "device-code", "service-principal-secret", + "service-principal-certificate", "federated-github", + "federated-azure-pipelines", "federated-oidc", + "managed-identity", "external", "oneauth", + } + for _, method := range authMethods { + kv := fields.AuthMethodKey.String(method) + require.NotEmpty(t, kv.Value.AsString()) + } + }) + + // Env command telemetry fields + t.Run("EnvFields", func(t *testing.T) { + // Env count is a measurement + kvCount := fields.EnvCountKey.Int(3) + require.Equal(t, "env.count", string(kvCount.Key)) + require.Equal(t, int64(3), kvCount.Value.AsInt64()) + }) + + // Hooks command telemetry fields + t.Run("HooksFields", func(t *testing.T) { + kv := fields.HooksNameKey.String("predeploy") + require.Equal(t, "hooks.name", string(kv.Key)) + + kvType := 
fields.HooksTypeKey.String("project") + require.Equal(t, "hooks.type", string(kvType.Key)) + }) + + // Pipeline command telemetry fields + t.Run("PipelineFields", func(t *testing.T) { + kv := fields.PipelineProviderKey.String("github") + require.Equal(t, "pipeline.provider", string(kv.Key)) + + kvAuth := fields.PipelineAuthKey.String("federated") + require.Equal(t, "pipeline.auth", string(kvAuth.Key)) + }) + + // Infra command telemetry fields + t.Run("InfraFields", func(t *testing.T) { + providers := []string{"bicep", "terraform"} + for _, provider := range providers { + kv := fields.InfraProviderKey.String(provider) + require.Equal(t, "infra.provider", string(kv.Key)) + require.Equal(t, provider, kv.Value.AsString()) + } + }) +} + +// TestCommandTelemetryCoverage ensures every user-facing command is explicitly categorized +// for telemetry coverage. When a new command is added to the CLI, it must be added to one +// of the lists below. This forces developers to consciously decide whether the command needs +// command-specific telemetry attributes or whether global middleware telemetry is sufficient. +// +// NOTE: Building the full command tree via NewRootCmd requires the DI container, which makes +// it impractical for a unit test. Instead, we maintain an explicit manifest of all known +// user-facing commands and their telemetry classification. This test fails if: +// - A command appears in both lists (contradictory classification) +// - A command appears in neither list (unclassified — forces developer action) +// - The lists are not sorted (maintainability) +func TestCommandTelemetryCoverage(t *testing.T) { + // Commands that have command-specific telemetry attributes emitted via + // tracing.SetUsageAttributes (beyond the global middleware that tracks + // command name, flags, duration, and errors for all commands). 
+ // + // When adding a command here, ensure the command's action sets at least one + // command-specific attribute (e.g., auth.method, config.operation, env.operation). + commandsWithSpecificTelemetry := []string{ + "auth login", // auth.method + "build", // (via hooks middleware) + "deploy", // infra.provider, service attributes (via hooks middleware) + "down", // infra.provider (via hooks middleware) + "env list", // env.count + "hooks run", // hooks.name, hooks.type + "infra generate", // infra.provider + "init", // init.method, appinit.* fields + "package", // (via hooks middleware) + "pipeline config", // pipeline.provider, pipeline.auth + "provision", // infra.provider (via hooks middleware) + "restore", // (via hooks middleware) + "up", // infra.provider (via hooks middleware, composes provision+deploy) + "update", // update.* fields + } + + // Commands that rely ONLY on global middleware telemetry (command name, flags, + // duration, errors) and do NOT emit command-specific attributes. Each entry + // includes a justification for why command-specific telemetry is not needed. 
+ commandsWithOnlyGlobalTelemetry := []string{ + "auth logout", // No command-specific telemetry — logout is a simple operation + "auth status", // Global telemetry sufficient — auth check is simple pass/fail + "completion", // Shell completion script generation — no meaningful usage signal + "config get", // Global telemetry sufficient — low cardinality + "config list", // Global telemetry sufficient — low cardinality + "config list-alpha", // Simple list of alpha features — no operational variance + "config reset", // Global telemetry sufficient — low cardinality + "config set", // Global telemetry sufficient — low cardinality + "config show", // Global telemetry sufficient — low cardinality + "config unset", // Global telemetry sufficient — low cardinality + "copilot", // Copilot session telemetry handled by copilot.* fields at session level + "env config get", // Thin wrapper — low cardinality, global telemetry sufficient + "env config set", // Thin wrapper — low cardinality, global telemetry sufficient + "env config unset", // Thin wrapper — low cardinality, global telemetry sufficient + "env get-value", // Global telemetry sufficient — command name captures operation + "env get-values", // Global telemetry sufficient — command name captures operation + "env new", // Global telemetry sufficient — command name captures operation + "env refresh", // Global telemetry sufficient — command name captures operation + "env remove", // Destructive but simple — global telemetry captures usage + "env select", // Global telemetry sufficient — command name captures operation + "env set", // Global telemetry sufficient — command name captures operation + "env set-secret", // Global telemetry sufficient — command name captures operation + "mcp", // MCP tool telemetry handled by mcp.* fields at invocation level + "monitor", // Global telemetry sufficient — command name captures usage + "show", // Global telemetry sufficient — output format not analytically useful + 
"telemetry", // Meta-command for telemetry itself — avoid recursion + "template list", // Global telemetry sufficient — command name captures operation + "template show", // Global telemetry sufficient — command name captures operation + "template source add", // Global telemetry sufficient — command name captures operation + "template source list", // Global telemetry sufficient — command name captures operation + "template source remove", // Global telemetry sufficient — command name captures operation + "version", // Telemetry explicitly disabled (DisableTelemetry: true) + "vs-server", // JSON-RPC server — telemetry handled by rpc.* fields per call + } + + // Build lookup maps + specificMap := make(map[string]bool, len(commandsWithSpecificTelemetry)) + for _, cmd := range commandsWithSpecificTelemetry { + specificMap[cmd] = true + } + + globalOnlyMap := make(map[string]bool, len(commandsWithOnlyGlobalTelemetry)) + for _, cmd := range commandsWithOnlyGlobalTelemetry { + globalOnlyMap[cmd] = true + } + + // Verify no command appears in both lists + for _, cmd := range commandsWithSpecificTelemetry { + require.False(t, globalOnlyMap[cmd], + "command %q appears in BOTH specific and global-only telemetry lists — pick one", cmd) + } + + // Verify lists are sorted (for maintainability and merge conflict avoidance) + for i := 1; i < len(commandsWithSpecificTelemetry); i++ { + require.Less(t, commandsWithSpecificTelemetry[i-1], commandsWithSpecificTelemetry[i], + "commandsWithSpecificTelemetry is not sorted: %q should come before %q", + commandsWithSpecificTelemetry[i-1], commandsWithSpecificTelemetry[i]) + } + for i := 1; i < len(commandsWithOnlyGlobalTelemetry); i++ { + require.Less(t, commandsWithOnlyGlobalTelemetry[i-1], commandsWithOnlyGlobalTelemetry[i], + "commandsWithOnlyGlobalTelemetry is not sorted: %q should come before %q", + commandsWithOnlyGlobalTelemetry[i-1], commandsWithOnlyGlobalTelemetry[i]) + } + + // Verify combined coverage is non-empty and 
reasonable + totalClassified := len(commandsWithSpecificTelemetry) + len(commandsWithOnlyGlobalTelemetry) + require.Greater(t, totalClassified, 0, "no commands classified — lists are empty") + + // Verify no duplicates within each list + seen := make(map[string]bool) + for _, cmd := range commandsWithSpecificTelemetry { + require.False(t, seen[cmd], "duplicate command in commandsWithSpecificTelemetry: %q", cmd) + seen[cmd] = true + } + seen = make(map[string]bool) + for _, cmd := range commandsWithOnlyGlobalTelemetry { + require.False(t, seen[cmd], "duplicate command in commandsWithOnlyGlobalTelemetry: %q", cmd) + seen[cmd] = true + } +} diff --git a/cli/azd/internal/tracing/fields/fields.go b/cli/azd/internal/tracing/fields/fields.go index f1638a7a6e2..5a73962ccf8 100644 --- a/cli/azd/internal/tracing/fields/fields.go +++ b/cli/azd/internal/tracing/fields/fields.go @@ -314,6 +314,75 @@ const ( AccountTypeServicePrincipal = "Service Principal" ) +// Auth command related fields +var ( + // The authentication method used for login. + // + // Example: "browser", "device-code", "service-principal-secret", "managed-identity" + AuthMethodKey = AttributeKey{ + Key: attribute.Key("auth.method"), + Classification: SystemMetadata, + Purpose: FeatureInsight, + } +) + +// Environment command related fields +var ( + // The number of environments that exist for the current project. + EnvCountKey = AttributeKey{ + Key: attribute.Key("env.count"), + Classification: SystemMetadata, + Purpose: FeatureInsight, + IsMeasurement: true, + } +) + +// Hooks command related fields +var ( + // The name of the hook being run. + HooksNameKey = AttributeKey{ + Key: attribute.Key("hooks.name"), + Classification: SystemMetadata, + Purpose: FeatureInsight, + } + // The type of the hook (project or service). 
+ HooksTypeKey = AttributeKey{ + Key: attribute.Key("hooks.type"), + Classification: SystemMetadata, + Purpose: FeatureInsight, + } +) + +// Pipeline command related fields +var ( + // The pipeline provider being configured. + // + // Example: "github", "azdo" + PipelineProviderKey = AttributeKey{ + Key: attribute.Key("pipeline.provider"), + Classification: SystemMetadata, + Purpose: FeatureInsight, + } + // The authentication type used for pipeline configuration. + PipelineAuthKey = AttributeKey{ + Key: attribute.Key("pipeline.auth"), + Classification: SystemMetadata, + Purpose: FeatureInsight, + } +) + +// Infrastructure command related fields +var ( + // The IaC provider used for infrastructure generation. + // + // Example: "bicep", "terraform" + InfraProviderKey = AttributeKey{ + Key: attribute.Key("infra.provider"), + Classification: SystemMetadata, + Purpose: FeatureInsight, + } +) + // The value used for ServiceNameKey const ServiceNameAzd = "azd" diff --git a/cli/azd/pkg/state/state_cache_test.go b/cli/azd/pkg/state/state_cache_test.go index 6a4e969e9e5..0a6fc4f631b 100644 --- a/cli/azd/pkg/state/state_cache_test.go +++ b/cli/azd/pkg/state/state_cache_test.go @@ -5,6 +5,7 @@ package state import ( "context" + "encoding/json" "os" "path/filepath" "testing" @@ -81,7 +82,7 @@ func TestStateCacheManager_Invalidate(t *testing.T) { func TestStateCacheManager_TTL(t *testing.T) { tempDir := t.TempDir() manager := NewStateCacheManager(tempDir) - manager.SetTTL(500 * time.Millisecond) // Short TTL for testing (not too short to be flaky) + manager.SetTTL(1 * time.Hour) // Use a large TTL — we test expiration by backdating, not sleeping ctx := context.Background() cache := &StateCache{ @@ -93,15 +94,20 @@ func TestStateCacheManager_TTL(t *testing.T) { err := manager.Save(ctx, "test-env", cache) require.NoError(t, err) - // Load immediately should work + // Load immediately should work (cache just created, TTL is 1 hour) loaded, err := manager.Load(ctx, "test-env") 
require.NoError(t, err) require.NotNil(t, loaded) - // Wait for TTL to expire - time.Sleep(600 * time.Millisecond) + // Backdate the cache's UpdatedAt to simulate TTL expiration deterministically + // (avoids flaky time.Sleep-based expiration that depends on wall clock behavior) + loaded.UpdatedAt = time.Now().Add(-2 * time.Hour) + data, err := json.MarshalIndent(loaded, "", " ") + require.NoError(t, err) + err = os.WriteFile(manager.GetCachePath("test-env"), data, 0600) + require.NoError(t, err) - // Load after TTL should return nil + // Load after backdating should return nil (TTL expired) loaded, err = manager.Load(ctx, "test-env") require.NoError(t, err) require.Nil(t, loaded) diff --git a/docs/specs/metrics-audit/audit-process.md b/docs/specs/metrics-audit/audit-process.md new file mode 100644 index 00000000000..1e604ef2ac4 --- /dev/null +++ b/docs/specs/metrics-audit/audit-process.md @@ -0,0 +1,288 @@ +# Telemetry Audit Process + +This document defines the recurring audit process for `azd` telemetry, including cadence, +ownership, checklists, downstream validation, and automation. + +## Quarterly Review Cadence + +Telemetry audits run on a quarterly cycle aligned with fiscal quarters. + +| Quarter | Audit Window | Report Due | +|---------|-------------|------------| +| Q1 | Weeks 1–2 of quarter | End of Week 3 | +| Q2 | Weeks 1–2 of quarter | End of Week 3 | +| Q3 | Weeks 1–2 of quarter | End of Week 3 | +| Q4 | Weeks 1–2 of quarter | End of Week 3 | + +### Audit Phases + +1. **Discovery** (Week 1) — Automated scan identifies new commands, changed telemetry fields, + and coverage gaps. +2. **Review** (Week 2) — Manual review of scan results, privacy classification check, and + downstream validation. +3. **Report** (Week 3) — Publish audit report, file issues for gaps, update documentation. 
+ +## Ownership + +| Role | Responsibility | +|------|---------------| +| **Telemetry Lead** | Owns the audit process, runs scans, publishes reports | +| **Feature Developers** | Respond to gap issues, implement telemetry for new commands | +| **Privacy Contact** | Reviews new classifications, approves changes to hashing | +| **Data Engineering** | Validates downstream Kusto functions and cooked tables | +| **PM / Analytics** | Reviews audit report, prioritizes gap closures | + +## Audit Checklist + +### 1. Command Coverage Scan + +- [ ] Run the command inventory scan against the current `main` branch +- [ ] Compare results with the [Feature-Telemetry Matrix](feature-telemetry-matrix.md) +- [ ] Identify new commands added since last audit +- [ ] Identify commands that had telemetry added since last audit +- [ ] Flag commands still missing command-specific telemetry + +### 2. Field Inventory + +- [ ] Diff `fields/fields.go` against the previous audit snapshot +- [ ] Identify new fields added without documentation +- [ ] Verify all fields have correct classification and purpose +- [ ] Verify hashing is applied to all user-provided values +- [ ] Cross-reference with the [Telemetry Schema](telemetry-schema.md) + +### 3. Event Inventory + +- [ ] Diff `events/events.go` against the previous audit snapshot +- [ ] Identify new events added without documentation +- [ ] Verify event naming follows conventions (`prefix.noun.verb`) + +### 4. Privacy Review + +- [ ] Review all new fields against the [Privacy Review Checklist](privacy-review-checklist.md) +- [ ] Confirm no `CustomerContent` is emitted +- [ ] Confirm no unhashed user-provided values +- [ ] Spot-check 5 random existing fields for classification accuracy + +### 5. Disabled Telemetry Check + +- [ ] Verify `version` still has `DisableTelemetry: true` +- [ ] Verify `telemetry upload` still has `DisableTelemetry: true` +- [ ] Check for any new commands with `DisableTelemetry: true` — confirm intent + +### 6. 
Opt-Out Rate Estimation + +When `AZURE_DEV_COLLECT_TELEMETRY=no`, the entire telemetry pipeline is disabled — no +spans are created and no data is sent. This means **opted-out users are invisible** in +telemetry data and we cannot directly measure the opt-out rate. + +**Estimation approach** (indirect): + +- [ ] Compare total install/download counts (from package manager stats, GitHub releases, + winget/brew/apt download logs) against distinct active telemetry users in the same period +- [ ] Estimate: `opt-out rate ≈ 1 − (active telemetry users / total installs)` +- [ ] Track this ratio over time to detect trends + +> **⚠️ Open question for @AngelosP / Privacy team**: Should azd send a single anonymous +> opt-out counter signal (containing zero identifying information — no machine ID, no IP, +> just an increment) when the user has `AZURE_DEV_COLLECT_TELEMETRY=no`? This is a gray +> area: GDPR Article 7(3) requires stopping processing on consent withdrawal, but a +> zero-identifier counter may not constitute "personal data." The .NET SDK installer does +> send a telemetry entry on successful installation even before the user sets the opt-out +> variable. A decision from the privacy team would clarify whether this approach is +> acceptable for azd. + +### 7. Data Pipeline Health + +- [ ] Verify telemetry upload process is functioning (check error rates) +- [ ] Confirm data arrives in Azure Monitor within expected latency +- [ ] Validate sample spans contain expected attributes + +## Downstream Validation + +### LENS Jobs + +LENS jobs consume raw telemetry and produce aggregated metrics. Each audit must verify: + +- [ ] All active LENS jobs are running without errors +- [ ] New fields referenced by LENS jobs exist in the telemetry stream +- [ ] Deprecated fields referenced by LENS jobs have been migrated or removed +- [ ] LENS job output matches expected schema + +### Kusto Functions + +Kusto functions parse and transform raw telemetry into queryable tables. 
+- [ ] All Kusto functions compile without errors +- [ ] New fields are extracted correctly (spot-check with sample data) +- [ ] Renamed or removed fields have been updated in function definitions +- [ ] Function output types match downstream dashboard expectations + +### Cooked Tables + +Cooked tables are pre-aggregated views used by dashboards and reports. + +- [ ] Cooked table materialization is running on schedule +- [ ] New columns from new fields are populated correctly +- [ ] Historical data migration is complete (if field was renamed) +- [ ] Dashboard queries return expected results + +## Automation Suggestions + +### CI Scan: Telemetry Coverage Gate + +Add a CI check that fails the build when a new command is added without telemetry instrumentation. + +**Implementation approach:** + +1. Write a Go analysis pass (or script) that: + - Walks all `ActionDescriptor` registrations in `internal/cmd/` + - Checks each for either `DisableTelemetry: true` or a `SetUsageAttributes` call + - Reports commands that have neither + +2. Add the check to the existing CI pipeline: + ```yaml + - name: Telemetry Coverage Check + run: go run ./eng/scripts/telemetry-coverage-check/main.go + ``` + +3. Allow exemptions via a `// telemetry:exempt <reason>` comment on the `ActionDescriptor`. + +### GitHub Action: Quarterly Audit Issue + +Automate the creation of a quarterly audit issue with the full checklist. 
+**Implementation approach:** + +```yaml +name: Quarterly Telemetry Audit +on: + schedule: + # Intended: first Monday of Jan/Apr/Jul/Oct. NOTE: cron ORs a restricted day-of-month with a restricted day-of-week, so '1-7' + 'Mon' fires on days 1-7 AND on every Monday of these months; add a job-level date guard for a strict first-Monday run. + - cron: '0 9 1-7 1,4,7,10 1' + +jobs: + create-audit-issue: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Create Audit Issue + uses: actions/github-script@v7 + with: + script: | + const fs = require('fs'); + const checklist = fs.readFileSync( + 'docs/specs/metrics-audit/audit-process.md', 'utf8' + ); + + // Extract the checklist sections + const quarter = Math.ceil((new Date().getMonth() + 1) / 3); + const year = new Date().getFullYear(); + + await github.rest.issues.create({ + owner: context.repo.owner, + repo: context.repo.repo, + title: `Telemetry Audit Q${quarter} ${year}`, + body: `## Quarterly Telemetry Audit — Q${quarter} ${year}\n\n` + + `Audit window: Weeks 1–2\n` + + `Report due: End of Week 3\n\n` + + `### Checklist\n\n` + + `See [audit-process.md](docs/specs/metrics-audit/audit-process.md) for full details.\n\n` + + `- [ ] Command coverage scan\n` + + `- [ ] Field inventory\n` + + `- [ ] Event inventory\n` + + `- [ ] Privacy review\n` + + `- [ ] Disabled telemetry check\n` + + `- [ ] Data pipeline health\n` + + `- [ ] LENS job validation\n` + + `- [ ] Kusto function validation\n` + + `- [ ] Cooked table validation\n` + + `- [ ] Audit report published\n`, + labels: ['telemetry', 'audit'] + }); +``` + +### PR Label Automation + +Automatically label PRs that modify telemetry files for review. 
+
+**Trigger files:**
+- `cli/azd/internal/tracing/fields/fields.go`
+- `cli/azd/internal/tracing/events/events.go`
+- `cli/azd/internal/tracing/fields/key.go`
+- `cli/azd/internal/tracing/resource/resource.go`
+- Any file containing `SetUsageAttributes`
+
+**Implementation:** Use a CODEOWNERS entry to require telemetry team review:
+
+```
+# .github/CODEOWNERS (telemetry-related files)
+cli/azd/internal/tracing/ @AzureDevCLI/telemetry-reviewers
+```
+
+This is preferred over a separate GitHub Actions workflow because it integrates directly
+with the existing PR review flow and requires no additional CI configuration.
+
+## Telemetry Validation Pipeline
+
+### 1. Local Validation
+
+Use `--trace-log-file <file>` to dump all telemetry spans to a JSON file, then inspect for
+expected attributes.
+
+```bash
+azd pipeline config --trace-log-file telemetry-dump.json
+# Then inspect telemetry-dump.json for pipeline.provider, pipeline.auth fields
+```
+
+```bash
+azd infra synth --trace-log-file telemetry-dump.json
+# Inspect for infra.provider field
+```
+
+This flag is available on all azd commands and writes the full span tree (with all attributes)
+to the specified file. Use `jq` or similar tools to filter for specific keys.
+
+### 2. Functional Tests
+
+The repo has existing functional telemetry tests at
+`cli/azd/test/functional/telemetry_test.go` that run real commands and validate trace
+attributes. New telemetry fields should be covered here.
+
+When adding a new field, add a test case that:
+1. Runs the command that emits the field.
+2. Reads the trace output.
+3. Asserts the expected attribute key and value are present.
+
+### 3. PR Builds
+
+Azure Pipelines publishes PR-specific builds via `eng/pipelines/release-cli.yml`. Install a
+PR build with:
+
+```bash
+azd version install pr/<pr-number>
+```
+
+Then manually test commands and inspect `--trace-log-file` output to verify the new telemetry
+attributes are present with expected values.
+
+### 4. 
Pre-Production Checklist + +Before merging telemetry changes: + +- [ ] Unit tests pass (`go test ./cmd/... ./internal/tracing/...`) +- [ ] Functional telemetry tests pass +- [ ] Local `--trace-log-file` validation for each new field +- [ ] PR build smoke test with real Azure subscription +- [ ] Dev telemetry endpoint receives expected attributes (non-prod builds auto-target dev App Insights) + +### Telemetry Diff Report + +Generate a diff report on every PR that modifies telemetry, showing: +- New fields added (with classification) +- Fields removed +- Classification changes +- New events + +This can be implemented as a Go script that parses `fields.go` and `events.go` ASTs and +compares against the base branch. diff --git a/docs/specs/metrics-audit/feature-telemetry-matrix.md b/docs/specs/metrics-audit/feature-telemetry-matrix.md new file mode 100644 index 00000000000..c6e1eb1e927 --- /dev/null +++ b/docs/specs/metrics-audit/feature-telemetry-matrix.md @@ -0,0 +1,124 @@ +# Feature-Telemetry Inventory Matrix + +This document provides a comprehensive inventory of every `azd` command and its telemetry coverage. +It identifies gaps where commands rely solely on the global middleware span and recommends +specific telemetry additions. + +## Telemetry Coverage Legend + +| Symbol | Meaning | +|--------|---------| +| ✅ | Covered — command-specific attributes or events are emitted | +| ⚠️ | Global span only — no command-specific telemetry | +| ❌ | Gap identified — needs instrumentation | +| 🚫 | Telemetry intentionally disabled | + +## Commands with Telemetry Disabled + +These commands have `DisableTelemetry: true` set on their `ActionDescriptor`. + +| Command | Reason | +|---------|--------| +| `version` | Trivial local-only command; no value in tracking | +| `telemetry upload` | Disabled to prevent recursive telemetry-about-telemetry | + +## Commands with Command-Specific Telemetry + +These commands emit attributes or events beyond the global middleware span. 
+ +| Command | Attributes / Events | Notes | +|---------|---------------------|-------| +| `init` | `init.method` (template / app / project / environment / copilot), `appinit.detected.databases`, `appinit.detected.services`, `appinit.confirmed.databases`, `appinit.confirmed.services`, `appinit.modify_add.count`, `appinit.modify_remove.count`, `appinit.lastStep` | Comprehensive coverage via `SetUsageAttributes` and `repository/app_init.go` | +| `update` | `update.installMethod`, `update.channel`, `update.fromVersion`, `update.toVersion`, `update.result` | Result codes cover success, failure, and skip reasons | +| Extensions (dynamic) | `extension.id`, `extension.version` + trace-context propagation to child process | Covers `ext.run` and `ext.install` events | +| `mcp start` | Per-tool spans via `tracing.Start` with `mcp.client.name`, `mcp.client.version` | MCP event prefix `mcp.*` | + +## Full Inventory Matrix + +| Command | Subcommands | Global Span | Command-Specific Attrs | Feature Events | Notes | +|---------|-------------|:-----------:|:----------------------:|:--------------:|-------| +| **Auth** | | | | | | +| `auth login` | — | ✅ | ✅ | ❌ | `auth.method` (browser, device-code, service-principal-secret, etc.) 
| +| `auth logout` | — | ✅ | ✅ | ❌ | `auth.method` (logout) | +| `auth status` | — | ✅ | ❌ | ❌ | Global telemetry sufficient — simple pass/fail check | +| `auth token` | — | ✅ | ❌ | ❌ | Global telemetry sufficient | +| **Config** | | | | | | +| `config` | `show`, `list`, `get`, `set`, `unset`, `reset`, `list-alpha`, `options` | ✅ | ❌ | ❌ | Redundant — command name in global span captures operation | +| **Environment** | | | | | | +| `env` | `set`, `set-secret`, `select`, `new`, `remove`, `refresh`, `get-values`, `get-value` | ✅ | ❌ | ❌ | Redundant — command name in global span captures operation | +| `env list` | — | ✅ | ✅ | ❌ | `env.count` (measurement — number of environments) | +| `env config` | `get`, `set`, `unset` | ✅ | ❌ | ❌ | Thin wrappers — global telemetry sufficient | +| **Hooks** | | | | | | +| `hooks run` | — | ✅ | ✅ | ❌ | `hooks.name`, `hooks.type` (project/service) | +| **Templates** | | | | | | +| `template` | `list`, `show` | ✅ | ❌ | ❌ | Redundant — command name in global span captures operation | +| `template source` | `list`, `add`, `remove` | ✅ | ❌ | ❌ | Redundant — command name in global span captures operation | +| **Pipeline** | | | | | | +| `pipeline config` | — | ✅ | ✅ | ❌ | `pipeline.provider` (github/azdo), `pipeline.auth` (federated/client-credentials) | +| **Monitor** | | | | | | +| `monitor` | — | ✅ | ❌ | ❌ | Redundant — command name in global span is sufficient | +| **Show** | | | | | | +| `show` | — | ✅ | ❌ | ❌ | Redundant — output format not analytically useful | +| **Infrastructure** | | | | | | +| `infra generate` | — | ✅ | ✅ | ❌ | `infra.provider` (bicep/terraform) | +| `infra synth` | — | ✅ | ✅ | ❌ | `infra.provider` (bicep/terraform) | +| `infra create` | — (hidden, deprecated) | ✅ | ❌ | ❌ | Wraps `provision`; inherits its telemetry | +| `infra delete` | — (hidden, deprecated) | ✅ | ❌ | ❌ | Wraps `down`; inherits its telemetry | +| **Core Lifecycle** | | | | | | +| `restore` | — | ✅ | ❌ | ❌ | Via hooks middleware | +| `build` | 
— | ✅ | ❌ | ❌ | Via hooks middleware | +| `provision` | — | ✅ | ❌ | ❌ | `infra.provider` set via hooks middleware | +| `package` | — | ✅ | ❌ | ❌ | Via hooks middleware | +| `deploy` | — | ✅ | ❌ | ❌ | `infra.provider`, service attributes via hooks middleware | +| `publish` | — | ✅ | ❌ | ❌ | Same as `deploy` (alias behavior) | +| `up` | — | ✅ | ❌ | ❌ | `infra.provider` via hooks middleware (composes provision+deploy) | +| `down` | — | ✅ | ❌ | ❌ | `infra.provider` via hooks middleware | +| **Add** | | | | | | +| `add` | — | ✅ | ❌ | ❌ | Low priority | +| **Completion** | | | | | | +| `completion` | `bash`, `zsh`, `fish`, `powershell`, `fig` | ✅ | ❌ | ❌ | Low priority — minimal analytical value | +| **VS Server** | | | | | | +| `vs-server` | — | ✅ | ❌ | ❌ | Long-running RPC; covered by `vsrpc.*` events | +| **Copilot Consent** | | | | | | +| `copilot consent` | `list`, `revoke`, `grant` | ✅ | ❌ | ❌ | Low priority | +| **Extension Management** | | | | | | +| `extension` | `list`, `show`, `install`, `uninstall`, `upgrade` | ✅ | ❌ | ❌ | Covered by `extension.*` fields | +| `extension source` | `list`, `add`, `remove`, `validate` | ✅ | ❌ | ❌ | Low priority | +| **Init** | | | | | | +| `init` | — | ✅ | ✅ | ✅ | Comprehensive coverage via `appinit.*` fields | +| **Update** | | | | | | +| `update` | — | ✅ | ✅ | ✅ | Covered by `update.*` fields | +| **MCP** | | | | | | +| `mcp start` | — | ✅ | ✅ | ✅ | Per-tool spans via `mcp.*` | +| **Disabled** | | | | | | +| `version` | — | 🚫 | — | — | Intentionally disabled | +| `telemetry upload` | — | 🚫 | — | — | Intentionally disabled | + +## Retained Fields Summary + +After the redundancy audit (per PR review feedback from @weikanglim), the following +command-specific telemetry fields provide analytical value beyond the command name: + +| Field | OTel Key | Commands | Justification | +|-------|----------|----------|---------------| +| Auth method | `auth.method` | `auth login`, `auth logout` | Distinguishes authentication flow type 
(browser, device-code, SP, federated, etc.) | +| Env count | `env.count` | `env list` | Measurement — number of environments is a quantitative metric | +| Hooks name | `hooks.name` | `hooks run` | Identifies which hook script ran | +| Hooks type | `hooks.type` | `hooks run` | Distinguishes project vs service hooks | +| Pipeline provider | `pipeline.provider` | `pipeline config` | Distinguishes GitHub vs Azure DevOps | +| Pipeline auth | `pipeline.auth` | `pipeline config` | Distinguishes federated vs client-credentials | +| Infra provider | `infra.provider` | `infra generate`, `infra synth` | Distinguishes Bicep vs Terraform | + +### Removed Fields (Redundant with Command Name) + +The following fields were removed because the command name in the global span already +captures the operation type, making the attribute redundant: + +| Removed Field | Reason | +|---------------|--------| +| `auth.result` | Success/failure already captured by span status | +| `config.operation` | Each config subcommand has its own command name | +| `env.operation` | Each env subcommand has its own command name | +| `template.operation` | Each template subcommand has its own command name | +| `monitor.type` | Single command — no distinguishing value | +| `show.output.format` | Output format not analytically useful | diff --git a/docs/specs/metrics-audit/privacy-review-checklist.md b/docs/specs/metrics-audit/privacy-review-checklist.md new file mode 100644 index 00000000000..469f4bf908b --- /dev/null +++ b/docs/specs/metrics-audit/privacy-review-checklist.md @@ -0,0 +1,199 @@ +# Privacy Review Checklist + +This document defines when a privacy review is required for telemetry changes in `azd`, +the data classification framework, hashing requirements, and a PR checklist template. + +## When to Trigger a Privacy Review + +A privacy review **must** be triggered when any of the following conditions are met: + +1. 
**New telemetry field** — Any new attribute key added to `cli/azd/internal/tracing/fields/fields.go` or emitted + via `SetUsageAttributes` / `tracing.SetSpanAttributes`. + +2. **New event** — Any new event constant added to `cli/azd/internal/tracing/events/events.go` or new span name + introduced via `tracing.Start`. + +3. **Classification change** — Any change to an existing field's `Classification` or `Purpose`. + +4. **New data source** — Telemetry that captures data from a source not previously instrumented + (e.g., a new Azure service response, user input, file system content). + +5. **Hashing removal or weakening** — Any change that removes `StringHashed` / `StringSliceHashed` + from a field that was previously hashed. + +6. **Cross-boundary data flow** — Telemetry that propagates trace context to external processes + (e.g., extension child processes) or receives context from external sources. + +7. **Measurement → String conversion** — Changing a field from a numeric measurement to a + string value (strings have higher re-identification risk). + +A privacy review is **not** required for: + +- Bug fixes to existing telemetry (e.g., fixing a typo in an attribute name). +- Removing telemetry fields entirely. +- Adding new values to an existing enum field (e.g., a new `auth.method` value) — unless + the new value captures data from a new source. + +## Raw Telemetry Data Shape Changes + +> "Any time any of the incoming raw data changes, your team needs to review and understand +> what needs to change to keep calculating things correctly" — AngelosP + +When the shape of raw telemetry data changes, ALL downstream consumers must be reviewed. +This is a **standalone mandatory checklist item** that applies whenever any of the following +occur: + +- [ ] **Field renamed** — A telemetry attribute key is renamed (e.g., `auth.type` → `auth.method`). + Review all Kusto functions, cooked table queries, LENS jobs, and dashboards that reference + the old key name. 
+- [ ] **Field type changed** — A field changes from string to int, or from single-value to array, + etc. Review all downstream parsers, `extend`/`project` statements in KQL, and any schema + validations. +- [ ] **Allowed values changed** — An enum field gains, removes, or renames values (e.g., + `auth.method` adding `"logout"`). Review all `case`/`iff`/`countif` expressions in Kusto + that filter or bucket by the old value set. +- [ ] **Field removed or deprecated** — A field is no longer emitted. Review all queries that + reference it and add null-handling or migration logic. +- [ ] **Measurement semantics changed** — Units change (seconds → milliseconds), counting + methodology changes, or aggregation expectations change. Review all KPI calculations, + percentile computations, and alerting thresholds. +- [ ] **Hashing algorithm changed** — Hash function or salt changes break join-ability with + historical data. Review all queries that correlate hashed fields across time ranges. + +**Action required**: Before merging any PR that changes raw telemetry data shape, the author +must verify that all downstream Kusto functions and KPI calculations still compute correctly +with the new shape. This includes cooked tables, LENS jobs, dashboards, and alerts. + +## Data Classifications + +All telemetry fields must be assigned exactly one classification from the table below. +Classifications are defined in `cli/azd/internal/tracing/fields/fields.go`. 
+ +| Classification | Description | Examples | Retention | +|----------------|-------------|----------|-----------| +| **PublicPersonalData** | Data the user has intentionally made public | GitHub username | Standard | +| **SystemMetadata** | Non-personal metadata about the system or environment | OS type, Go version, feature flags | Standard | +| **CallstackOrException** | Stack traces, panic details, error frames | `error.frame` | Reduced | +| **CustomerContent** | Content created by the user | File contents, messages | Highest restriction — avoid in telemetry | +| **EndUserPseudonymizedInformation** | User identifiers that have been pseudonymized | Hashed MAC address (`machine.id`), SQM User ID | Standard | +| **OrganizationalIdentifiableInformation** | Identifiers scoped to an organization | Azure subscription ID, tenant ID | Standard | + +### Classification Decision Tree + +``` +Is the data created by the user (file content, messages)? + └─ Yes → CustomerContent (do NOT emit in telemetry) + └─ No → + Can the data identify a specific person? + └─ Yes → + Is it already public? + └─ Yes → PublicPersonalData + └─ No → + Can it be hashed? + └─ Yes → Hash it → EndUserPseudonymizedInformation + └─ No → Do NOT emit — escalate to privacy team + └─ No → + Can it identify an organization? + └─ Yes → OrganizationalIdentifiableInformation + └─ No → + Is it a stack trace or exception detail? + └─ Yes → CallstackOrException + └─ No → SystemMetadata +``` + +## Hashing Requirements + +Any field that could identify a user, project, or environment **must** be hashed before emission. + +### Hash Functions + +All hashing functions are in `cli/azd/internal/tracing/fields/key.go`. + +| Function | Signature | Behavior | +|----------|-----------|----------| +| `CaseInsensitiveHash` | `func CaseInsensitiveHash(value string) string` | Lowercases the input, then computes SHA-256. Returns hex-encoded digest. 
|
+| `StringHashed` | `func StringHashed(key, value string) attribute.KeyValue` | Creates an OTel `attribute.KeyValue` with the value replaced by its case-insensitive SHA-256 hash. |
+| `StringSliceHashed` | `func StringSliceHashed(key string, values []string) attribute.KeyValue` | Creates an OTel `attribute.KeyValue` where each element in the slice is independently hashed. |
+
+### Fields That Must Be Hashed
+
+| Field | Hash Function | Reason |
+|-------|---------------|--------|
+| `project.template.id` | `StringHashed` | Template IDs may contain repo URLs or user-chosen names |
+| `project.template.version` | `StringHashed` | Version strings may be user-defined |
+| `project.name` | `StringHashed` | Project names are user-chosen |
+| `env.name` | `StringHashed` | Environment names may contain identifying information |
+
+### When to Hash New Fields
+
+A new field **must** be hashed if any of the following are true:
+
+- The value is user-provided (typed by the user or read from a user-authored file).
+- The value could contain a project name, repository URL, or path.
+- The value could be used to correlate across users or organizations.
+
+A new field should **not** be hashed if:
+
+- The value is from a fixed enum (e.g., `auth.method` = `"browser"`).
+- The value is a count or duration (measurements).
+- The value is system-generated metadata (e.g., OS type).
+
+## Data Catalog Classification Process
+
+When adding a new telemetry field:
+
+1. **Define the field** in `cli/azd/internal/tracing/fields/fields.go` using the `NewKey` pattern.
+2. **Assign classification** — use the decision tree above to determine the correct classification.
+3. **Assign purpose** — select one or more from: `FeatureInsight`, `BusinessInsight`, `PerformanceAndHealth`.
+4. **Determine hashing** — apply hashing rules above.
+5. 
**Register in Data Catalog** — update the [Telemetry Schema](telemetry-schema.md) with: + - OTel key name + - Classification + - Purpose + - Whether it is hashed + - Whether it is a measurement + - Allowed values (if enum) +6. **Update LENS/Kusto** — if the field will be queried downstream, coordinate with the + data engineering team to update Kusto functions and cooked tables. + +## PR Checklist Template for Telemetry Changes + +Copy this checklist into your PR description when making telemetry changes. + +```markdown +## Telemetry Change Checklist + +### New Fields +- [ ] Field defined in `fields/fields.go` with correct classification and purpose +- [ ] Field documented in `docs/specs/metrics-audit/telemetry-schema.md` +- [ ] Hashing applied where required (user-provided values, names, paths) +- [ ] Measurement fields use correct OTel type (Int64, Float64) +- [ ] Enum values documented with allowed value set + +### New Events +- [ ] Event constant defined in `events/events.go` +- [ ] Event documented in `docs/specs/metrics-audit/telemetry-schema.md` +- [ ] Event follows naming convention (`prefix.noun.verb`) + +### Privacy +- [ ] Classification assigned using decision tree +- [ ] No `CustomerContent` emitted in telemetry +- [ ] No unhashed user-provided values +- [ ] No PII in string attributes (names, emails, paths) +- [ ] Privacy review triggered (if required per triggers above) + +### Testing +- [ ] Unit test verifies attributes are set on the span +- [ ] Integration test confirms end-to-end emission (if applicable) +- [ ] Verified field appears correctly in local telemetry output + +### Downstream +- [ ] LENS job updated (if field is queried in dashboards) +- [ ] Kusto function updated (if field is used in cooked tables) +- [ ] Dashboard updated (if field powers a new metric) + +### Documentation +- [ ] Feature-telemetry matrix updated (if gap is being closed) +- [ ] Telemetry schema updated with new field/event +- [ ] This checklist is complete +``` diff 
--git a/docs/specs/metrics-audit/telemetry-schema.md b/docs/specs/metrics-audit/telemetry-schema.md new file mode 100644 index 00000000000..2d0e22e6a63 --- /dev/null +++ b/docs/specs/metrics-audit/telemetry-schema.md @@ -0,0 +1,298 @@ +# Telemetry Schema Reference + +This document is the authoritative reference for all telemetry events, fields, classifications, +and data pipeline details in the Azure Developer CLI (`azd`). + +## Events + +Events are defined in `cli/azd/internal/tracing/events/events.go`. Each event is emitted as an +OpenTelemetry span name or event name. + +| Constant | Value | Description | +|----------|-------|-------------| +| `CommandEventPrefix` | `cmd.` | Prefix for all command events (via `GetCommandEventName`) | +| `VsRpcEventPrefix` | `vsrpc.` | Prefix for VS Code JSON-RPC events | +| `McpEventPrefix` | `mcp.` | Prefix for MCP tool invocation events | +| `PackBuildEvent` | `tools.pack.build` | Cloud Native Buildpacks build event | +| `AgentTroubleshootEvent` | `agent.troubleshoot` | Agent troubleshooting event | +| `ExtensionRunEvent` | `ext.run` | Extension execution event | +| `ExtensionInstallEvent` | `ext.install` | Extension install/upgrade event | +| `CopilotInitializeEvent` | `copilot.initialize` | Copilot initialization event | +| `CopilotSessionEvent` | `copilot.session` | Copilot session lifecycle event | + +## Fields + +Fields are defined in `cli/azd/internal/tracing/fields/fields.go`. Each field has a classification +and purpose that governs how it may be stored, queried, and retained. + +### Application-Level (Resource Attributes) + +These are set once at process startup via `resource.New()` and attached to every span. + +| Field | OTel Key | Classification | Purpose | Notes | +|-------|----------|----------------|---------|-------| +| Service name | `service.name` | — | — | Always `"azd"` | +| Service version | `service.version` | — | — | Build version string | +| OS type | `os.type` | — | — | e.g. 
`linux`, `windows`, `darwin` | +| OS version | `os.version` | SystemMetadata | PerformanceAndHealth | Kernel / build version | +| Host architecture | `host.arch` | SystemMetadata | PerformanceAndHealth | e.g. `amd64`, `arm64` | +| Runtime version | `process.runtime.version` | SystemMetadata | PerformanceAndHealth | Go version | +| Machine ID | `machine.id` | EndUserPseudonymizedInformation | BusinessInsight | MAC address hash | +| Dev Device ID | `machine.devdeviceid` | EndUserPseudonymizedInformation | BusinessInsight | SQM User ID | +| Execution environment | `execution.environment` | SystemMetadata | BusinessInsight | CI system detection | +| Installer | `service.installer` | SystemMetadata | FeatureInsight | How azd was installed | + +### Experimentation + +| Field | OTel Key | Classification | Purpose | +|-------|----------|----------------|---------| +| Assignment context | `exp.assignmentContext` | SystemMetadata | FeatureInsight | + +### Identity and Account Context + +| Field | OTel Key | Classification | Purpose | Notes | +|-------|----------|----------------|---------|-------| +| Object ID | `user_AuthenticatedId` | — | — | From Application Insights contracts | +| Tenant ID | `ad.tenant.id` | SystemMetadata | BusinessInsight | Entra ID tenant | +| Account type | `ad.account.type` | SystemMetadata | BusinessInsight | `"User"` or `"Service Principal"` | +| Subscription ID | `ad.subscription.id` | OrganizationalIdentifiableInformation | PerformanceAndHealth | Azure subscription | + +### Project Context (azure.yaml) + +| Field | OTel Key | Classification | Purpose | Notes | +|-------|----------|----------------|---------|-------| +| Template ID | `project.template.id` | SystemMetadata | FeatureInsight | **Hashed** | +| Template version | `project.template.version` | SystemMetadata | FeatureInsight | **Hashed** | +| Project name | `project.name` | SystemMetadata | FeatureInsight | **Hashed** | +| Service hosts | `project.service.hosts` | SystemMetadata | 
FeatureInsight | List of host types | +| Service targets | `project.service.targets` | SystemMetadata | FeatureInsight | List of deploy targets | +| Service languages | `project.service.languages` | SystemMetadata | FeatureInsight | List of languages | +| Service language | `project.service.language` | SystemMetadata | PerformanceAndHealth | Single service language | +| Platform type | `platform.type` | SystemMetadata | FeatureInsight | e.g. `aca`, `aks` | + +### Config and Environment + +| Field | OTel Key | Classification | Purpose | Notes | +|-------|----------|----------------|---------|-------| +| Feature flags | `config.features` | SystemMetadata | FeatureInsight | Active feature flags | +| Environment name | `env.name` | SystemMetadata | FeatureInsight | **Hashed** | + +### Command Entry-Point + +| Field | OTel Key | Classification | Purpose | Notes | +|-------|----------|----------------|---------|-------| +| Flags | `cmd.flags` | SystemMetadata | FeatureInsight | Which flags were passed | +| Argument count | `cmd.args.count` | SystemMetadata | FeatureInsight | **Measurement** | +| Entry point | `cmd.entry` | SystemMetadata | FeatureInsight | How the command was invoked | + +### Error Attributes + +| Field | OTel Key | Classification | Purpose | +|-------|----------|----------------|---------| +| Error category | `error.category` | SystemMetadata | PerformanceAndHealth | +| Error code | `error.code` | SystemMetadata | PerformanceAndHealth | +| Error type | `error.type` | SystemMetadata | PerformanceAndHealth | +| Inner error | `error.inner` | SystemMetadata | PerformanceAndHealth | +| Error frame | `error.frame` | SystemMetadata | PerformanceAndHealth | + +Error classification is handled by `MapError` in `internal/cmd/errors.go`, which categorizes +errors into: update errors, auth errors, service (Azure) errors, deployment errors, extension +errors, tool errors, sentinel errors, and network errors. 
Each receives an `error.code`, +`error.category`, and contextual attributes. + +### Service Attributes + +| Field | OTel Key | Classification | Purpose | Notes | +|-------|----------|----------------|---------|-------| +| Service host | `service.host` | SystemMetadata | PerformanceAndHealth | | +| Service name | `service.name` | SystemMetadata | PerformanceAndHealth | | +| Status code | `service.statusCode` | SystemMetadata | PerformanceAndHealth | **Measurement** | +| Method | `service.method` | SystemMetadata | PerformanceAndHealth | | +| Error code | `service.errorCode` | SystemMetadata | PerformanceAndHealth | **Measurement** | +| Correlation ID | `service.correlationId` | SystemMetadata | PerformanceAndHealth | | + +### Tool Attributes + +| Field | OTel Key | Classification | Purpose | +|-------|----------|----------------|---------| +| Tool name | `tool.name` | SystemMetadata | FeatureInsight | +| Tool exit code | `tool.exitCode` | SystemMetadata | PerformanceAndHealth | + +### Performance + +| Field | OTel Key | Classification | Purpose | Notes | +|-------|----------|----------------|---------|-------| +| Interaction time | `perf.interact_time` | SystemMetadata | PerformanceAndHealth | **Measurement** — time to first user prompt | + +### Pack (Buildpacks) + +| Field | OTel Key | Classification | Purpose | +|-------|----------|----------------|---------| +| Builder image | `pack.builder.image` | SystemMetadata | FeatureInsight | +| Builder tag | `pack.builder.tag` | SystemMetadata | FeatureInsight | + +### MCP + +| Field | OTel Key | Classification | Purpose | +|-------|----------|----------------|---------| +| Client name | `mcp.client.name` | SystemMetadata | FeatureInsight | +| Client version | `mcp.client.version` | SystemMetadata | FeatureInsight | + +### Init + +| Field | OTel Key | Classification | Purpose | Notes | +|-------|----------|----------------|---------|-------| +| Init method | `init.method` | SystemMetadata | FeatureInsight | 
template/app/project/environment/copilot | +| Detected databases | `appinit.detected.databases` | SystemMetadata | FeatureInsight | | +| Detected services | `appinit.detected.services` | SystemMetadata | FeatureInsight | | +| Confirmed databases | `appinit.confirmed.databases` | SystemMetadata | FeatureInsight | | +| Confirmed services | `appinit.confirmed.services` | SystemMetadata | FeatureInsight | | +| Modify add count | `appinit.modify_add.count` | SystemMetadata | FeatureInsight | **Measurement** | +| Modify remove count | `appinit.modify_remove.count` | SystemMetadata | FeatureInsight | **Measurement** | +| Last step | `appinit.lastStep` | SystemMetadata | FeatureInsight | | + +### Remote Build + +| Field | OTel Key | Classification | Purpose | Notes | +|-------|----------|----------------|---------|-------| +| Remote build count | `container.remoteBuild.count` | SystemMetadata | FeatureInsight | **Measurement** | + +### JSON-RPC + +| Field | OTel Key | Classification | Purpose | Notes | +|-------|----------|----------------|---------|-------| +| RPC method | `rpc.method` | SystemMetadata | FeatureInsight | | +| Request ID | `rpc.jsonrpc.request_id` | SystemMetadata | PerformanceAndHealth | | +| Error code | `rpc.jsonrpc.error_code` | SystemMetadata | PerformanceAndHealth | **Measurement** | + +### Agent + +| Field | OTel Key | Classification | Purpose | +|-------|----------|----------------|---------| +| Fix attempts | `agent.fix.attempts` | SystemMetadata | PerformanceAndHealth | + +### Extensions + +| Field | OTel Key | Classification | Purpose | +|-------|----------|----------------|---------| +| Extension ID | `extension.id` | SystemMetadata | FeatureInsight | +| Extension version | `extension.version` | SystemMetadata | FeatureInsight | +| Extension installed | `extension.installed` | SystemMetadata | FeatureInsight | + +### Update + +| Field | OTel Key | Classification | Purpose | +|-------|----------|----------------|---------| +| Update channel | 
`update.channel` | SystemMetadata | FeatureInsight | +| Install method | `update.installMethod` | SystemMetadata | FeatureInsight | +| From version | `update.fromVersion` | SystemMetadata | FeatureInsight | +| To version | `update.toVersion` | SystemMetadata | FeatureInsight | +| Update result | `update.result` | SystemMetadata | FeatureInsight | + +### Copilot Session + +| Field | OTel Key | Classification | Purpose | Notes | +|-------|----------|----------------|---------|-------| +| Session ID | `copilot.session.id` | SystemMetadata | FeatureInsight | | +| Is new session | `copilot.session.isNew` | SystemMetadata | FeatureInsight | | +| Message count | `copilot.session.messageCount` | SystemMetadata | FeatureInsight | **Measurement** | + +### Copilot Init + +| Field | OTel Key | Classification | Purpose | +|-------|----------|----------------|---------| +| Is first run | `copilot.init.isFirstRun` | SystemMetadata | FeatureInsight | +| Reasoning effort | `copilot.init.reasoningEffort` | SystemMetadata | FeatureInsight | +| Model | `copilot.init.model` | SystemMetadata | FeatureInsight | +| Consent scope | `copilot.init.consentScope` | SystemMetadata | FeatureInsight | + +### Copilot Message + +| Field | OTel Key | Classification | Purpose | Notes | +|-------|----------|----------------|---------|-------| +| Mode | `copilot.mode` | SystemMetadata | FeatureInsight | | +| Model | `copilot.message.model` | SystemMetadata | FeatureInsight | | +| Input tokens | `copilot.message.inputTokens` | SystemMetadata | PerformanceAndHealth | **Measurement** | +| Output tokens | `copilot.message.outputTokens` | SystemMetadata | PerformanceAndHealth | **Measurement** | +| Billing rate | `copilot.message.billingRate` | SystemMetadata | BusinessInsight | **Measurement** | +| Premium requests | `copilot.message.premiumRequests` | SystemMetadata | BusinessInsight | **Measurement** | +| Duration (ms) | `copilot.message.durationMs` | SystemMetadata | PerformanceAndHealth | 
**Measurement** | + +### Copilot Consent + +| Field | OTel Key | Classification | Purpose | Notes | +|-------|----------|----------------|---------|-------| +| Approved count | `copilot.consent.approvedCount` | SystemMetadata | FeatureInsight | **Measurement** | +| Denied count | `copilot.consent.deniedCount` | SystemMetadata | FeatureInsight | **Measurement** | + +## New Fields (Added by This Audit) + +The following fields are being introduced to close telemetry gaps identified in the +[Feature-Telemetry Matrix](feature-telemetry-matrix.md). + +| Field | OTel Key | Classification | Purpose | Values | +|-------|----------|----------------|---------|--------| +| Auth method | `auth.method` | SystemMetadata | FeatureInsight | `browser`, `device-code`, `service-principal-secret`, `service-principal-certificate`, `federated-github`, `federated-azure-pipelines`, `federated-oidc`, `managed-identity`, `external`, `oneauth`, `check-status` | +| Env count | `env.count` | SystemMetadata | FeatureInsight | **Measurement** — number of environments | +| Hooks name | `hooks.name` | SystemMetadata | FeatureInsight | Built-in hook name (raw) or SHA-256 hash for extension/custom hooks. 
Known values: `prebuild`, `postbuild`, `predeploy`, `postdeploy`, `predown`, `postdown`, `prepackage`, `postpackage`, `preprovision`, `postprovision`, `prepublish`, `postpublish`, `prerestore`, `postrestore`, `preup`, `postup` | +| Hooks type | `hooks.type` | SystemMetadata | FeatureInsight | `project`, `service` | +| Pipeline provider | `pipeline.provider` | SystemMetadata | FeatureInsight | `github`, `azdo`, `auto` (auto-detected) | +| Pipeline auth | `pipeline.auth` | SystemMetadata | FeatureInsight | `federated`, `client-credentials`, `auto` (auto-detected) | +| Infra provider | `infra.provider` | SystemMetadata | FeatureInsight | `bicep`, `terraform`, `auto` (auto-detected from files) | + +## Data Classifications + +Classifications are defined in `internal/telemetry/fields/fields.go` and control how data +is stored, retained, and who may access it. + +| Classification | Description | +|----------------|-------------| +| `PublicPersonalData` | Data the user has made public (e.g. GitHub username) | +| `SystemMetadata` | Non-personal system/environment metadata | +| `CallstackOrException` | Stack traces and exception details | +| `CustomerContent` | User-created content (files, messages) — highest sensitivity | +| `EndUserPseudonymizedInformation` | Pseudonymized user identifiers (hashed MACs, device IDs) | +| `OrganizationalIdentifiableInformation` | Organization-level identifiers (subscription IDs, tenant IDs) | + +## Purposes + +Each field is tagged with one or more purposes that govern its permitted use. + +| Purpose | Description | +|---------|-------------| +| `FeatureInsight` | Understanding feature adoption and usage patterns | +| `BusinessInsight` | Business metrics (active users, organizations, growth) | +| `PerformanceAndHealth` | Performance monitoring, error rates, reliability | + +## Hashing + +Sensitive values are hashed before emission using functions in `cli/azd/internal/tracing/fields/key.go`. 
+ +| Function | Behavior | +|----------|----------| +| `CaseInsensitiveHash(value)` | Lowercases, then SHA-256 hashes | +| `StringHashed(key, value)` | Creates an OTel attribute with a case-insensitive SHA-256 hash | +| `StringSliceHashed(key, values)` | Hashes each element in a string slice independently | + +Fields that are hashed: `project.template.id`, `project.template.version`, `project.name`, `env.name`. + +## Data Pipeline + +``` +┌──────────────┐ ┌─────────────────────┐ ┌──────────────┐ ┌──────────────────┐ +│ OTel Spans │───▶│ App Insights │───▶│ Disk Queue │───▶│ Azure Monitor / │ +│ (in-process)│ │ Exporter (custom) │ │ (~/.azd/) │ │ Kusto │ +└──────────────┘ └─────────────────────┘ └──────────────┘ └──────────────────┘ + │ + ▼ + ┌──────────────┐ + │ telemetry │ + │ upload cmd │ + └──────────────┘ +``` + +1. **Instrumentation**: Commands create OTel spans with attributes via `tracing.Start` and `SetUsageAttributes`. +2. **Export**: A custom Application Insights exporter converts spans to App Insights envelopes. +3. **Queue**: Envelopes are written to disk under `~/.azd/telemetry/`. +4. **Upload**: The `azd telemetry upload` command (run as a background process) reads the queue and sends data to Azure Monitor. +5. **Analysis**: Data flows into Kusto tables for dashboarding and analysis via LENS jobs and cooked tables.