@@ -1,4 +1,4 @@
// Copyright 2025 Google LLC
// Copyright 2026 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -108,6 +108,33 @@ message Content {
// A `Part` must have a fixed IANA MIME type identifying the type and subtype
// of the media if `inline_data` or `file_data` field is filled with raw bytes.
message Part {
// Per-part media resolution for the input media.
message MediaResolution {
// The media resolution level.
enum Level {
// Media resolution has not been set.
MEDIA_RESOLUTION_UNSPECIFIED = 0;

// Media resolution set to low.
MEDIA_RESOLUTION_LOW = 1;

// Media resolution set to medium.
MEDIA_RESOLUTION_MEDIUM = 2;

// Media resolution set to high.
MEDIA_RESOLUTION_HIGH = 3;

// Media resolution set to ultra high. This applies to images only.
MEDIA_RESOLUTION_ULTRA_HIGH = 4;
}

oneof value {
// The tokenization quality used for the given media.
Level level = 1;
}
}

oneof data {
// Optional. Text part (can be code).
string text = 1 [(google.api.field_behavior) = OPTIONAL];
@@ -150,6 +177,10 @@ message Part {
// video data is presented in inline_data or file_data.
VideoMetadata video_metadata = 4 [(google.api.field_behavior) = OPTIONAL];
}

// Per-part media resolution for the input media.
MediaResolution media_resolution = 12;
}
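
For illustration, a Part carrying inline image bytes could request a specific tokenization resolution through the new field. A minimal textproto sketch of such a Part; the MIME type, payload, and level are placeholder values:

  inline_data {
    mime_type: "image/png"        # placeholder media type
    data: "<raw image bytes>"     # placeholder payload
  }
  media_resolution {
    level: MEDIA_RESOLUTION_HIGH  # per-part tokenization resolution
  }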

// Content blob.
@@ -182,6 +213,10 @@ message VideoMetadata {
// Optional. The end offset of the video.
google.protobuf.Duration end_offset = 2
[(google.api.field_behavior) = OPTIONAL];

// Optional. The frame rate of the video sent to the model. If not specified,
// the default value is 1.0. The valid range is (0.0, 24.0].
double fps = 3 [(google.api.field_behavior) = OPTIONAL];
}
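
For example, the new fps field lets a request sample a clip at a non-default frame rate. A minimal textproto sketch of the video_metadata on a Part; the offset and rate are illustrative:

  video_metadata {
    end_offset { seconds: 60 }  # only the first minute is considered
    fps: 5.0                    # must lie in (0.0, 24.0]; default is 1.0
  }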

// Configuration for a prebuilt voice.
@@ -202,7 +237,6 @@ message ReplicatedVoiceConfig {
bytes voice_sample_audio = 2 [(google.api.field_behavior) = OPTIONAL];
}


// Configuration for a voice.
message VoiceConfig {
// The configuration for the speaker to use.
@@ -250,6 +284,37 @@ message SpeechConfig {

// Config for image generation features.
message ImageConfig {
// The image output format for generated images.
message ImageOutputOptions {
// Optional. The image format that the output should be saved as.
optional string mime_type = 1 [(google.api.field_behavior) = OPTIONAL];

// Optional. The compression quality of the output image.
optional int32 compression_quality = 2
[(google.api.field_behavior) = OPTIONAL];
}

// Enum for controlling the generation of people in images.
enum PersonGeneration {
// The default behavior is unspecified. The model will decide whether to
// generate images of people.
PERSON_GENERATION_UNSPECIFIED = 0;

// Allows the model to generate images of people, including adults and
// children.
ALLOW_ALL = 1;

// Allows the model to generate images of adults, but not children.
ALLOW_ADULT = 2;

// Prevents the model from generating images of people.
ALLOW_NONE = 3;
}

// Optional. The image output format for generated images.
optional ImageOutputOptions image_output_options = 1
[(google.api.field_behavior) = OPTIONAL];

// Optional. The desired aspect ratio for the generated images. The following
// aspect ratios are supported:
//
@@ -260,6 +325,14 @@ message ImageConfig {
// "9:16", "16:9"
// "21:9"
optional string aspect_ratio = 2 [(google.api.field_behavior) = OPTIONAL];

// Optional. Controls whether the model can generate people.
optional PersonGeneration person_generation = 3
[(google.api.field_behavior) = OPTIONAL];

// Optional. Specifies the size of generated images. Supported values are
// `1K`, `2K`, `4K`. If not specified, the model uses the default value `1K`.
optional string image_size = 4 [(google.api.field_behavior) = OPTIONAL];
}
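
Putting the new fields together, an ImageConfig asking for 16:9, 2K JPEG output with no people could be populated as in this textproto sketch; the MIME type string and quality value are assumptions, the other values come from the documented options:

  image_output_options {
    mime_type: "image/jpeg"        # assumed output format string
    compression_quality: 85        # assumed quality scale
  }
  aspect_ratio: "16:9"
  person_generation: ALLOW_NONE    # no people in generated images
  image_size: "2K"                 # one of 1K, 2K, 4K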

// Generation config.
@@ -308,13 +381,65 @@ message GenerationConfig {

// Config for thinking features.
message ThinkingConfig {
// The thinking level for the model.
enum ThinkingLevel {
// Unspecified thinking level.
THINKING_LEVEL_UNSPECIFIED = 0;

// Low thinking level.
LOW = 1;

// Medium thinking level.
MEDIUM = 2;

// High thinking level.
HIGH = 3;

// Minimal thinking level.
MINIMAL = 4;
}

// Indicates whether to include thoughts in the response.
// If true, thoughts are returned only when available.
optional bool include_thoughts = 1 [(google.api.field_behavior) = OPTIONAL];

// Optional. Indicates the thinking budget in tokens.
// This is only applied when enable_thinking is true.
optional int32 thinking_budget = 3 [(google.api.field_behavior) = OPTIONAL];

// Optional. The thinking level, which controls how many thought tokens the
// model should generate.
optional ThinkingLevel thinking_level = 4
[(google.api.field_behavior) = OPTIONAL];
}
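
For instance, a ThinkingConfig that surfaces thoughts and selects a coarse level rather than an explicit token budget might be populated as in this textproto sketch; the values are illustrative, and how thinking_level interacts with thinking_budget is not specified here:

  include_thoughts: true  # return thoughts when available
  thinking_level: HIGH    # level chosen instead of an explicit budget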

// The modalities of the response.
enum Modality {
// Unspecified modality. Will be processed as text.
MODALITY_UNSPECIFIED = 0;

// Text modality.
TEXT = 1;

// Image modality.
IMAGE = 2;

// Audio modality.
AUDIO = 3;
}

// Media resolution for the input media.
enum MediaResolution {
// Media resolution has not been set.
MEDIA_RESOLUTION_UNSPECIFIED = 0;

// Media resolution set to low (64 tokens).
MEDIA_RESOLUTION_LOW = 1;

// Media resolution set to medium (256 tokens).
MEDIA_RESOLUTION_MEDIUM = 2;

// Media resolution set to high (zoomed reframing with 256 tokens).
MEDIA_RESOLUTION_HIGH = 3;
}

// Optional. Controls the randomness of predictions.
@@ -411,6 +536,27 @@ message GenerationConfig {
optional RoutingConfig routing_config = 17
[(google.api.field_behavior) = OPTIONAL];

// Optional. If enabled, audio timestamps will be included in the request to
// the model. This can be useful for synchronizing audio with other modalities
// in the response.
optional bool audio_timestamp = 20 [(google.api.field_behavior) = OPTIONAL];

// Optional. The modalities of the response. The model will generate a
// response that includes all the specified modalities. For example, if this
// is set to `[TEXT, IMAGE]`, the response will include both text and an
// image.
repeated Modality response_modalities = 21
[(google.api.field_behavior) = OPTIONAL];

// Optional. The token resolution at which input media content is sampled.
// This is used to control the trade-off between the quality of the response
// and the number of tokens used to represent the media. A higher resolution
// allows the model to perceive more detail, which can lead to a more nuanced
// response, but it will also use more tokens. This does not affect the
// image dimensions sent to the model.
optional MediaResolution media_resolution = 22
[(google.api.field_behavior) = OPTIONAL];

// Optional. The speech generation config.
optional SpeechConfig speech_config = 23
[(google.api.field_behavior) = OPTIONAL];
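
Taken together, the new GenerationConfig fields compose as in this textproto sketch of a request that wants audio timestamps, mixed text-and-image output, and medium-resolution media tokenization; the values are illustrative only:

  audio_timestamp: true                      # include audio timestamps in the request
  response_modalities: TEXT                  # response should contain text...
  response_modalities: IMAGE                 # ...and an image
  media_resolution: MEDIA_RESOLUTION_MEDIUM  # 256 tokens per media item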