@@ -1,4 +1,4 @@
// Copyright 2025 Google LLC
// Copyright 2026 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -108,6 +108,33 @@ message Content {
// A `Part` must have a fixed IANA MIME type identifying the type and subtype
// of the media if `inline_data` or `file_data` field is filled with raw bytes.
message Part {
// Per-part media resolution for the input media.
message MediaResolution {
// The media resolution level.
enum Level {
// Media resolution has not been set.
MEDIA_RESOLUTION_UNSPECIFIED = 0;

// Media resolution set to low.
MEDIA_RESOLUTION_LOW = 1;

// Media resolution set to medium.
MEDIA_RESOLUTION_MEDIUM = 2;

// Media resolution set to high.
MEDIA_RESOLUTION_HIGH = 3;

// Media resolution set to ultra high. This applies to images only.
MEDIA_RESOLUTION_ULTRA_HIGH = 4;
}

oneof value {
// The tokenization quality used for the given media.
Level level = 1;
}
}

oneof data {
// Optional. Text part (can be code).
string text = 1 [(google.api.field_behavior) = OPTIONAL];
@@ -150,6 +177,10 @@ message Part {
// video data is presented in inline_data or file_data.
VideoMetadata video_metadata = 4 [(google.api.field_behavior) = OPTIONAL];
}

// Per-part media resolution for the input media.
MediaResolution media_resolution = 12;
}
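
For illustration, a Part carrying inline image bytes could request a specific tokenization resolution through the new field. A minimal textproto sketch of such a Part; the MIME type, payload, and level are placeholder values:

  inline_data {
    mime_type: "image/png"        # placeholder media type
    data: "<raw image bytes>"     # placeholder payload
  }
  media_resolution {
    level: MEDIA_RESOLUTION_HIGH  # per-part tokenization resolution
  }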

// Content blob.
@@ -182,6 +213,10 @@ message VideoMetadata {
// Optional. The end offset of the video.
google.protobuf.Duration end_offset = 2
[(google.api.field_behavior) = OPTIONAL];

// Optional. The frame rate of the video sent to the model. If not specified,
// the default value is 1.0. The valid range is (0.0, 24.0].
double fps = 3 [(google.api.field_behavior) = OPTIONAL];
}
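
For example, the new fps field lets a request sample a clip at a non-default frame rate. A minimal textproto sketch of the video_metadata on a Part; the offset and rate are illustrative:

  video_metadata {
    end_offset { seconds: 60 }  # only the first minute is considered
    fps: 5.0                    # must lie in (0.0, 24.0]; default is 1.0
  }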

// Configuration for a prebuilt voice.
@@ -202,7 +237,6 @@ message ReplicatedVoiceConfig {
bytes voice_sample_audio = 2 [(google.api.field_behavior) = OPTIONAL];
}


// Configuration for a voice.
message VoiceConfig {
// The configuration for the speaker to use.
@@ -250,6 +284,37 @@ message SpeechConfig {

// Config for image generation features.
message ImageConfig {
// The image output format for generated images.
message ImageOutputOptions {
// Optional. The image format that the output should be saved as.
optional string mime_type = 1 [(google.api.field_behavior) = OPTIONAL];

// Optional. The compression quality of the output image.
optional int32 compression_quality = 2
[(google.api.field_behavior) = OPTIONAL];
}

// Enum for controlling the generation of people in images.
enum PersonGeneration {
// The default behavior is unspecified. The model will decide whether to
// generate images of people.
PERSON_GENERATION_UNSPECIFIED = 0;

// Allows the model to generate images of people, including adults and
// children.
ALLOW_ALL = 1;

// Allows the model to generate images of adults, but not children.
ALLOW_ADULT = 2;

// Prevents the model from generating images of people.
ALLOW_NONE = 3;
}

// Optional. The image output format for generated images.
optional ImageOutputOptions image_output_options = 1
[(google.api.field_behavior) = OPTIONAL];

// Optional. The desired aspect ratio for the generated images. The following
// aspect ratios are supported:
//
@@ -260,6 +325,14 @@ message ImageConfig {
// "9:16", "16:9"
// "21:9"
optional string aspect_ratio = 2 [(google.api.field_behavior) = OPTIONAL];

// Optional. Controls whether the model can generate people.
optional PersonGeneration person_generation = 3
[(google.api.field_behavior) = OPTIONAL];

// Optional. Specifies the size of generated images. Supported values are
// `1K`, `2K`, `4K`. If not specified, the model uses the default value `1K`.
optional string image_size = 4 [(google.api.field_behavior) = OPTIONAL];
}
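
Putting the new fields together, an ImageConfig asking for 16:9, 2K JPEG output with no people could be populated as in this textproto sketch; the MIME type string and quality value are assumptions, the other values come from the documented options:

  image_output_options {
    mime_type: "image/jpeg"        # assumed output format string
    compression_quality: 85        # assumed quality scale
  }
  aspect_ratio: "16:9"
  person_generation: ALLOW_NONE    # no people in generated images
  image_size: "2K"                 # one of 1K, 2K, 4K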

// Generation config.
@@ -308,13 +381,65 @@ message GenerationConfig {

// Config for thinking features.
message ThinkingConfig {
// The thinking level for the model.
enum ThinkingLevel {
// Unspecified thinking level.
THINKING_LEVEL_UNSPECIFIED = 0;

// Low thinking level.
LOW = 1;

// Medium thinking level.
MEDIUM = 2;

// High thinking level.
HIGH = 3;

// Minimal thinking level.
MINIMAL = 4;
}

// Indicates whether to include thoughts in the response.
// If true, thoughts are returned only when available.
optional bool include_thoughts = 1 [(google.api.field_behavior) = OPTIONAL];

// Optional. Indicates the thinking budget in tokens.
// This is only applied when enable_thinking is true.
optional int32 thinking_budget = 3 [(google.api.field_behavior) = OPTIONAL];

// Optional. The thinking level, which controls how many thought tokens the
// model should generate.
optional ThinkingLevel thinking_level = 4
[(google.api.field_behavior) = OPTIONAL];
}
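
For instance, a ThinkingConfig that surfaces thoughts and selects a coarse level rather than an explicit token budget might be populated as in this textproto sketch; the values are illustrative, and how thinking_level interacts with thinking_budget is not specified here:

  include_thoughts: true  # return thoughts when available
  thinking_level: HIGH    # level chosen instead of an explicit budget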

// The modalities of the response.
enum Modality {
// Unspecified modality. Will be processed as text.
MODALITY_UNSPECIFIED = 0;

// Text modality.
TEXT = 1;

// Image modality.
IMAGE = 2;

// Audio modality.
AUDIO = 3;
}

// Media resolution for the input media.
enum MediaResolution {
// Media resolution has not been set.
MEDIA_RESOLUTION_UNSPECIFIED = 0;

// Media resolution set to low (64 tokens).
MEDIA_RESOLUTION_LOW = 1;

// Media resolution set to medium (256 tokens).
MEDIA_RESOLUTION_MEDIUM = 2;

// Media resolution set to high (zoomed reframing with 256 tokens).
MEDIA_RESOLUTION_HIGH = 3;
}

// Optional. Controls the randomness of predictions.
@@ -411,6 +536,27 @@ message GenerationConfig {
optional RoutingConfig routing_config = 17
[(google.api.field_behavior) = OPTIONAL];

// Optional. If enabled, audio timestamps will be included in the request to
// the model. This can be useful for synchronizing audio with other modalities
// in the response.
optional bool audio_timestamp = 20 [(google.api.field_behavior) = OPTIONAL];

// Optional. The modalities of the response. The model will generate a
// response that includes all the specified modalities. For example, if this
// is set to `[TEXT, IMAGE]`, the response will include both text and an
// image.
repeated Modality response_modalities = 21
[(google.api.field_behavior) = OPTIONAL];

// Optional. The token resolution at which input media content is sampled.
// This is used to control the trade-off between the quality of the response
// and the number of tokens used to represent the media. A higher resolution
// allows the model to perceive more detail, which can lead to a more nuanced
// response, but it will also use more tokens. This does not affect the
// image dimensions sent to the model.
optional MediaResolution media_resolution = 22
[(google.api.field_behavior) = OPTIONAL];

// Optional. The speech generation config.
optional SpeechConfig speech_config = 23
[(google.api.field_behavior) = OPTIONAL];
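
Taken together, the new GenerationConfig fields compose as in this textproto sketch of a request that wants audio timestamps, mixed text-and-image output, and medium-resolution media tokenization; the values are illustrative only:

  audio_timestamp: true                      # include audio timestamps in the request
  response_modalities: TEXT                  # response should contain text...
  response_modalities: IMAGE                 # ...and an image
  media_resolution: MEDIA_RESOLUTION_MEDIUM  # 256 tokens per media item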