Commit a33486a

feat(php+node): Enable flag evaluation metrics tests; fix reason=static
- Enable tests/ffe/test_flag_eval_metrics.py for PHP (>=1.16.0) and Node.js (express4 v6.0.0-pre)
- Fix reason assertion: the UFC engine returns AssignmentReason::Static for a 100% catch-all allocation (rules: [], splits: [{shards: []}]), not TargetingMatch
- Add type annotations to test helpers (mypy compliance)
1 parent a2fd84f commit a33486a
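
For context on the second bullet, here is a minimal sketch of the flag configuration shape the fix refers to: one catch-all allocation serving a single variant to 100% of subjects. Field names mirror the UFC-style fixture used in the test below and should be read as assumptions about the schema, not a definitive spec.

# Hypothetical UFC flag config with a 100% catch-all allocation.
# Key names mirror the test fixture's shape; treat them as assumptions.
catch_all_flag = {
    "key": "my-flag",
    "enabled": True,
    "variationType": "STRING",
    "variations": {"on": {"key": "on", "value": "on-value"}},
    "allocations": [
        {
            "key": "default-allocation",
            "rules": [],  # no targeting rules to match
            "splits": [{"variationKey": "on", "shards": []}],  # no sharding
        }
    ],
}
# With rules == [] and shards == [], every subject receives "on"
# unconditionally, so the engine reports the assignment reason as
# Static ("static") rather than TargetingMatch: no targeting rule
# was actually evaluated and matched.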

3 files changed

Lines changed: 16 additions & 35 deletions

manifests/nodejs.yml

Lines changed: 4 additions & 2 deletions
@@ -1579,7 +1579,10 @@ manifest:
       "*": incomplete_test_app
       express4: *ref_5_77_0
   tests/ffe/test_exposures.py::Test_FFE_EXP_5_Missing_Targeting_Key: bug (FFL-1730)
-  tests/ffe/test_flag_eval_metrics.py: missing_feature
+  tests/ffe/test_flag_eval_metrics.py:
+    - weblog_declaration:
+        "*": incomplete_test_app
+        express4: v6.0.0-pre
   tests/integration_frameworks/llm/anthropic/test_anthropic_apm.py::TestAnthropicApmMessages: *ref_5_71_0
   tests/integration_frameworks/llm/anthropic/test_anthropic_llmobs.py::TestAnthropicLlmObsMessages: *ref_5_71_0
   tests/integration_frameworks/llm/anthropic/test_anthropic_llmobs.py::TestAnthropicLlmObsMessages::test_create_error: bug (MLOB-1234)
@@ -2260,7 +2263,6 @@ manifest:
   - weblog_declaration:
       nextjs: missing_feature
   tests/test_rum_injection.py: irrelevant (RUM injection only supported for Java)
-  tests/test_sampling_rate_capping.py::Test_SamplingRateCappedIncrease: missing_feature
   tests/test_sampling_rates.py::Test_SampleRateFunction: *ref_5_54_0
   tests/test_sampling_rates.py::Test_SamplingDecisionAdded: *ref_5_17_0
   tests/test_sampling_rates.py::Test_SamplingDecisions: *ref_5_54_0
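
Rough illustration of how the new declaration is meant to resolve per weblog variant ("*" as the fallback, a version string meaning "enabled from that version on"). The helper below is hypothetical; the actual system-tests manifest loader is more involved.

# Hypothetical resolver for a weblog_declaration mapping (not the real loader).
declaration = {"*": "incomplete_test_app", "express4": "v6.0.0-pre"}

def resolve(weblog: str) -> str:
    # A weblog-specific entry wins; otherwise fall back to "*".
    return declaration.get(weblog, declaration["*"])

assert resolve("express4") == "v6.0.0-pre"          # enabled from v6.0.0-pre
assert resolve("nextjs") == "incomplete_test_app"   # still skipped elsewhere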

manifests/php.yml

Lines changed: 1 addition & 1 deletion
@@ -551,7 +551,7 @@ manifest:
   tests/ffe/test_dynamic_evaluation.py: missing_feature
   tests/ffe/test_exposures.py: missing_feature
   tests/integration_frameworks/llm/anthropic/test_anthropic_llmobs.py::TestAnthropicLlmObsMessages::test_create_error: bug (MLOB-1234)
-  tests/ffe/test_flag_eval_metrics.py: missing_feature
+  tests/ffe/test_flag_eval_metrics.py: '>=1.16.0'
   tests/integrations/crossed_integrations/test_kafka.py::Test_Kafka: missing_feature
   tests/integrations/crossed_integrations/test_kinesis.py::Test_Kinesis_PROPAGATION_VIA_MESSAGE_ATTRIBUTES: missing_feature
   tests/integrations/crossed_integrations/test_rabbitmq.py::Test_RabbitMQ_Trace_Context_Propagation: missing_feature
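
Assuming the '>=1.16.0' string is read as a version specifier against the PHP tracer version (the usual convention for these manifests), the gating logic boils down to a PEP 440-style check; a quick sketch:

from packaging.specifiers import SpecifierSet
from packaging.version import Version

# Sketch of the assumed gating: run the test only when the tracer
# version satisfies the declared specifier.
spec = SpecifierSet(">=1.16.0")
print(Version("1.16.0") in spec)  # True  -> test enabled
print(Version("1.15.2") in spec)  # False -> test still skipped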

tests/ffe/test_flag_eval_metrics.py

Lines changed: 11 additions & 32 deletions
@@ -13,7 +13,7 @@
 RC_PATH = f"datadog/2/{RC_PRODUCT}"
 
 
-def make_ufc_fixture(flag_key, variant_key="on", variation_type="STRING", enabled=True):
+def make_ufc_fixture(flag_key: str, variant_key: str = "on", variation_type: str = "STRING", *, enabled: bool = True):
     """Create a UFC fixture with the given flag configuration."""
     values: dict[str, dict[str, str | bool]] = {
         "STRING": {"on": "on-value", "off": "off-value"},
@@ -47,7 +47,7 @@ def make_ufc_fixture(flag_key, variant_key="on", variation_type="STRING", enable
     }
 
 
-def find_eval_metrics(flag_key=None):
+def find_eval_metrics(flag_key: str | None = None):
     """Find feature_flag.evaluations metrics in agent data.
 
     Returns a list of metric points matching the metric name, optionally filtered by flag key tag.
@@ -67,7 +67,7 @@ def find_eval_metrics(flag_key=None):
     return results
 
 
-def get_tag_value(tags, key):
+def get_tag_value(tags: list[str], key: str):
     """Extract a tag value from a list of 'key:value' strings."""
     prefix = f"{key}:"
     for tag in tags:
@@ -99,8 +99,6 @@ def setup_ffe_eval_metric_basic(self):
             },
         )
 
-
-
     def test_ffe_eval_metric_basic(self):
         """Test that flag evaluation produces a metric with correct tags."""
         assert self.r.status_code == 200, f"Flag evaluation failed: {self.r.text}"
@@ -121,8 +119,8 @@ def test_ffe_eval_metric_basic(self):
         assert get_tag_value(tags, "feature_flag.result.variant") == "on", (
             f"Expected tag feature_flag.result.variant:on, got tags: {tags}"
         )
-        assert get_tag_value(tags, "feature_flag.result.reason") == "targeting_match", (
-            f"Expected tag feature_flag.result.reason:targeting_match, got tags: {tags}"
+        assert get_tag_value(tags, "feature_flag.result.reason") == "static", (
+            f"Expected tag feature_flag.result.reason:static, got tags: {tags}"
         )
         assert get_tag_value(tags, "feature_flag.result.allocation_key") == "default-allocation", (
             f"Expected tag feature_flag.result.allocation_key:default-allocation, got tags: {tags}"
@@ -156,17 +154,14 @@ def setup_ffe_eval_metric_count(self):
             )
             self.responses.append(r)
 
-
-
     def test_ffe_eval_metric_count(self):
         """Test that N evaluations produce metric count = N."""
         for i, r in enumerate(self.responses):
             assert r.status_code == 200, f"Request {i + 1} failed: {r.text}"
 
         metrics = find_eval_metrics(self.flag_key)
         assert len(metrics) > 0, (
-            f"Expected at least one feature_flag.evaluations metric for flag '{self.flag_key}', "
-            f"but found none."
+            f"Expected at least one feature_flag.evaluations metric for flag '{self.flag_key}', but found none."
         )
 
         # Sum all data points for this flag (agent may split across multiple series entries)
@@ -180,9 +175,7 @@ def test_ffe_eval_metric_count(self):
             elif isinstance(p, list) and len(p) >= 2:
                 total_count += p[1]
 
-        assert total_count >= self.eval_count, (
-            f"Expected metric count >= {self.eval_count}, got {total_count}"
-        )
+        assert total_count >= self.eval_count, f"Expected metric count >= {self.eval_count}, got {total_count}"
 
 
 @scenarios.feature_flagging_and_experimentation
@@ -262,8 +255,6 @@ def setup_ffe_eval_metric_different_flags(self):
             },
         )
 
-
-
     def test_ffe_eval_metric_different_flags(self):
         """Test that each flag key gets its own metric series."""
         assert self.r_a.status_code == 200, f"Flag A evaluation failed: {self.r_a.text}"
@@ -272,12 +263,8 @@ def test_ffe_eval_metric_different_flags(self):
         metrics_a = find_eval_metrics(self.flag_a)
         metrics_b = find_eval_metrics(self.flag_b)
 
-        assert len(metrics_a) > 0, (
-            f"Expected metric for flag '{self.flag_a}', found none. All: {find_eval_metrics()}"
-        )
-        assert len(metrics_b) > 0, (
-            f"Expected metric for flag '{self.flag_b}', found none. All: {find_eval_metrics()}"
-        )
+        assert len(metrics_a) > 0, f"Expected metric for flag '{self.flag_a}', found none. All: {find_eval_metrics()}"
+        assert len(metrics_b) > 0, f"Expected metric for flag '{self.flag_b}', found none. All: {find_eval_metrics()}"
 
 
 @scenarios.feature_flagging_and_experimentation
@@ -290,9 +277,7 @@ def setup_ffe_eval_metric_error(self):
 
         # Set up config with a different flag than what we'll request
         config_id = "ffe-eval-metric-error"
-        rc.tracer_rc_state.set_config(
-            f"{RC_PATH}/{config_id}/config", make_ufc_fixture("some-other-flag")
-        ).apply()
+        rc.tracer_rc_state.set_config(f"{RC_PATH}/{config_id}/config", make_ufc_fixture("some-other-flag")).apply()
 
         self.flag_key = "non-existent-eval-metric-flag"
         self.r = weblog.post(
@@ -306,8 +291,6 @@ def setup_ffe_eval_metric_error(self):
             },
         )
 
-
-
     def test_ffe_eval_metric_error(self):
         """Test that error evaluations produce metric with error.type tag."""
         assert self.r.status_code == 200, f"Flag evaluation request failed: {self.r.text}"
@@ -362,16 +345,12 @@ def setup_ffe_eval_metric_type_mismatch(self):
             },
         )
 
-
-
     def test_ffe_eval_metric_type_mismatch(self):
         """Test that type conversion errors produce metric with error.type:type_mismatch."""
        assert self.r.status_code == 200, f"Flag evaluation request failed: {self.r.text}"
 
         metrics = find_eval_metrics(self.flag_key)
-        assert len(metrics) > 0, (
-            f"Expected metric for flag '{self.flag_key}', found none. All: {find_eval_metrics()}"
-        )
+        assert len(metrics) > 0, f"Expected metric for flag '{self.flag_key}', found none. All: {find_eval_metrics()}"
 
         point = metrics[0]
         tags = point.get("tags", [])
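
Taken together, a call-site sketch of the updated helpers, based only on the signatures shown in this diff:

# Usage sketch for the annotated helpers above.
fixture = make_ufc_fixture("my-flag", variation_type="BOOLEAN", enabled=False)
# enabled is now keyword-only, so the old positional form
# make_ufc_fixture("my-flag", "on", "STRING", False) raises TypeError.

points = find_eval_metrics("my-flag")  # optionally filtered by flag key tag
tags = points[0].get("tags", []) if points else []
reason = get_tag_value(tags, "feature_flag.result.reason")  # e.g. "static" for the catch-all fixture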
