Your agent can reason about text. Now give it the ability to perceive: screen, microphone, camera, and video files.

Get Your API Key

  1. Go to VideoDB Console
  2. Copy your API key (free tier: 50 uploads, no credit card)
  3. Set it in your environment:
export VIDEODB_API_KEY="your-api-key"

Install the SDK

pip install videodb
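
With the key exported and the SDK installed, connect() picks the key up from the environment automatically. You can also pass it explicitly; a minimal sketch (the api_key parameter follows the SDK README, verify against your installed version):

import os
import videodb

# connect() reads VIDEODB_API_KEY from the environment by default
conn = videodb.connect()

# Or pass the key explicitly
conn = videodb.connect(api_key=os.environ["VIDEODB_API_KEY"])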

Real-time Perception (Desktop Capture)

Stream what your agent sees and hears. Get structured context back in real-time.
import asyncio
import signal
import videodb
from dotenv import load_dotenv

load_dotenv()

AUDIO_URL = "rtsp://matrix.videodb.io:8554/audio"
SCREEN_URL = "rtsp://matrix.videodb.io:8554/screen"


async def main():
    conn = videodb.connect()
    coll = conn.get_collection()
    print(f"connected to collection: {coll.id}")

    ws = conn.connect_websocket()
    ws = await ws.connect()

    # Connect streams
    audio = coll.connect_rtstream(url=AUDIO_URL, name="Audio", media_types=["audio"])
    screen = coll.connect_rtstream(url=SCREEN_URL, name="Screen", media_types=["video"])
    print(f"audio stream:  {audio.id} ({audio.status})")
    print(f"screen stream: {screen.id} ({screen.status})")

    # Start pipelines
    audio.start_transcript(ws_connection_id=ws.connection_id)
    print("transcript started")

    audio.index_audio(
        prompt="Summarize what is being said or heard.",
        batch_config={"type": "time", "value": 30},
        ws_connection_id=ws.connection_id,
    )
    print("audio indexing started (30s window)")

    screen.index_visuals(
        prompt="In one sentence, describe the active application and what the agent is doing on screen. Note the current time if a clock is visible.",
        batch_config={"type": "time", "value": 30, "frame_count": 5},
        ws_connection_id=ws.connection_id,
    )
    print("visual indexing started (30s window, 5 frames)")

    # Listen for events — Ctrl+C to stop
    print("\nlistening for events...\n")
    stop = asyncio.Event()
    # Register Ctrl+C / SIGTERM handlers (POSIX; add_signal_handler is not
    # available on Windows event loops)
    loop = asyncio.get_running_loop()
    for sig in (signal.SIGINT, signal.SIGTERM):
        loop.add_signal_handler(sig, stop.set)

    async def listen():
        async for msg in ws.receive():
            ch = msg.get("channel", "?")
            if ch == "capture_session":
                continue
            data = msg.get("data", msg)
            if ch == "transcript" and not data.get("is_final", False):
                continue
            text = data.get("text", "") if isinstance(data, dict) else ""
            print(f"  [{ch}] {text}")

    task = asyncio.create_task(listen())
    await asyncio.wait(
        [task, asyncio.create_task(stop.wait())],
        return_when=asyncio.FIRST_COMPLETED,
    )
    task.cancel()

    # Cleanup
    print("\nstopping streams...")
    audio.stop()
    screen.stop()
    await ws.close()
    print("done.")


if __name__ == "__main__":
    asyncio.run(main())
Try the interactive quickstart: Real-time Perception Quickstart on GitHub

Full Capture Guide

Deep dive: channels, permissions, client code, and event handling

Working with Video Files

Upload, index, and search existing recordings.

Upload a video

import videodb

conn = videodb.connect()
coll = conn.get_collection()
video = coll.upload(url="https://www.youtube.com/watch?v=WDv4AWk0J3U")

# Get an embeddable stream URL
stream_url = video.generate_stream()
print(stream_url)  # HLS link you can embed anywhere
Upload from YouTube, S3, any public URL, or local files.
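
For instance, a local file; a short sketch (file_path is the upload parameter per the SDK README, and the path here is a placeholder):

import videodb

conn = videodb.connect()
coll = conn.get_collection()

# Upload a local recording instead of a URL
video = coll.upload(file_path="./meeting.mp4")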

Index spoken words

Create a searchable transcript:
video.index_audio(prompt="Extract key topics, decisions, and action items")
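
Indexing runs server-side; once it finishes you can pull the transcript back directly. A minimal sketch, assuming get_transcript_text() is available on indexed videos (present in recent SDK versions, but verify against yours):

# Fetch the full transcript after indexing completes
text = video.get_transcript_text()
print(text[:500])  # preview the first 500 characters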

Search with natural language

results = video.search("What are the key benefits?")

for shot in results.shots:
    print(f"{shot.start}s - {shot.end}s: {shot.text}")

# Play the matching moments
results.play()
Search returns timestamps and playable links: verifiable evidence your agent can use.
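
To hand your agent a single clip of just the evidence, you can stitch the matched spans into one stream. A sketch, assuming generate_stream() accepts a timeline of (start, end) pairs as shown in the SDK README:

# Compile only the matching segments into one playable stream
timeline = [(shot.start, shot.end) for shot in results.shots]
stream_url = video.generate_stream(timeline=timeline)
print(stream_url)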

Index Visual Scenes

For video where visuals matter (security footage, tutorials, presentations):
# Index with a prompt describing what to look for
video.index_visuals(prompt="Identify key moments and activities")

# Search visual content
results = video.search("person entering the room", index_type="scene")
results.play()

Search Across Collections

Scale to thousands of videos:
# Get your collection
coll = conn.get_collection()

# Upload multiple videos
coll.upload(url="https://youtube.com/watch?v=video1")
coll.upload(url="https://youtube.com/watch?v=video2")
coll.upload(url="https://youtube.com/watch?v=video3")

# Index all
for video in coll.get_videos():
    video.index_audio(prompt="Extract key topics and decisions")

# Search across everything
results = coll.search("quarterly revenue discussion")
results.play()  # Plays matching moments from any video
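
Collection-wide results still point back to their source videos, so the agent can attribute each match. A sketch, assuming each shot carries video_title (an attribute in current SDK versions; treat it as an assumption):

from collections import defaultdict

# Group matched moments by the video they came from
by_video = defaultdict(list)
for shot in results.shots:
    by_video[shot.video_title].append((shot.start, shot.end))

for title, spans in by_video.items():
    print(f"{title}: {spans}")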

What’s Next