python-agents-examples/realtime/gemini_live_vision.py at d2cd375b5f2f5860e064d69058bf8cf11af47406 · livekit-examples/python-agents-examples · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
"""
---
title: Gemini Realtime Agent with Live Vision
category: realtime
tags: [gemini_realtime, live_vision]
difficulty: beginner
description: Minimal Gemini Realtime model agent setup with live vision capabilities
demonstrates:
  - Gemini Realtime model basic usage
  - Live vision capabilities
  - Session-based generation
  - VAD with Silero
---
"""

from dotenv import load_dotenv
from pathlib import Path
from livekit import agents
from livekit.agents import RoomInputOptions
from livekit.agents.voice import AgentSession, Agent
from livekit.plugins import (
    silero,
    google
)

# Read environment variables from the `.env` file that sits two directory
# levels above this script (i.e. next to the folder that contains it).
_ENV_FILE = Path(__file__).parent.parent / '.env'
load_dotenv(dotenv_path=_ENV_FILE)

class Assistant(Agent):
    """Agent whose only customization is its fixed instruction prompt."""

    def __init__(self) -> None:
        """Initialize the base ``Agent`` with the assistant's instructions."""
        persona = "You are a helpful voice AI assistant that can see the world around you."
        super().__init__(instructions=persona)

async def entrypoint(ctx: agents.JobContext):
    """Run the live-vision voice agent for a single job.

    Builds an ``AgentSession`` around the Gemini realtime model and a Silero
    VAD, starts it in the job's room with video input enabled, and then asks
    the session to generate an opening reply.

    Args:
        ctx: The job context; only ``ctx.room`` is read here.
    """
    # Model and VAD are created up front so the session construction reads
    # as a simple wiring step.
    realtime_llm = google.realtime.RealtimeModel(
        model="gemini-2.5-flash-native-audio-preview-09-2025",
        voice="Puck",
        temperature=0.8,
    )
    voice_detector = silero.VAD.load()
    session = AgentSession(llm=realtime_llm, vad=voice_detector)

    # Gather the start arguments in the same order the call consumes them.
    room = ctx.room
    assistant = Assistant()
    input_options = RoomInputOptions(video_enabled=True)

    await session.start(
        room=room,
        agent=assistant,
        room_input_options=input_options,
    )

    # Have the agent speak first instead of waiting for user input.
    await session.generate_reply(instructions="Start by offering assistance")

if __name__ == "__main__":
    # Script entry: hand the worker configuration (which points at
    # `entrypoint`) to the LiveKit agents CLI runner.
    worker_options = agents.WorkerOptions(entrypoint_fnc=entrypoint)
    agents.cli.run_app(worker_options)