Dudoxx Kokoro TTS

High-Quality Neural Text-to-Speech Engine

Checking server...
Token is stored locally in your browser

Server Information

-
Device
-
Languages
-
Voices
-
Version

Developer Documentation

Server Configuration

Base URL: https://kokoro.dudoxx.com
Authentication: Bearer Token (Header)
Content-Type: application/json

Supported Languages

Code | Language          | Engine     | Sample Rate
a    | American English  | Kokoro-82M | 24kHz
b    | British English   | Kokoro-82M | 24kHz
d    | German            | Piper TTS  | 22kHz
e    | Spanish           | Kokoro-82M | 24kHz
f    | French            | Kokoro-82M | 24kHz
h    | Hindi             | Kokoro-82M | 24kHz
i    | Italian           | Kokoro-82M | 24kHz
p    | Portuguese        | Kokoro-82M | 24kHz
j    | Japanese          | Kokoro-82M | 24kHz
z    | Mandarin Chinese  | Kokoro-82M | 24kHz

Available Voices

Voice ID    | Description                  | Gender
af_heart    | Heart - Warm, expressive     | Female
af_bella    | Bella - Professional         | Female
af_nicole   | Nicole - Friendly            | Female
af_sarah    | Sarah - Clear, articulate    | Female
am_adam     | Adam - Deep, authoritative   | Male
am_michael  | Michael - Conversational     | Male
bf_emma     | Emma - British accent        | Female
bf_isabella | Isabella - British accent    | Female
bm_george   | George - British accent      | Male

Quick cURL Example

# Generate speech and save to file
curl -X POST "https://kokoro.dudoxx.com/tts/stream" \
  -H "Authorization: Bearer YOUR_API_TOKEN" \
  -H "Content-Type: application/json" \
  -d '{
    "text": "Hello, welcome to Dudoxx Kokoro TTS!",
    "language": "a",
    "voice": "af_heart",
    "format": "wav"
  }' \
  -o output.wav

Next.js 15+ Integration

Complete guide for integrating Dudoxx Kokoro TTS into your Next.js application.

1. Environment Setup
# .env.local
DUDOXX_TTS_URL=https://kokoro.dudoxx.com
DUDOXX_TTS_TOKEN=your-api-token-here
2. TTS Service (lib/tts.ts)
// lib/tts.ts
interface TTSOptions {
  text: string;
  language?: 'a' | 'b' | 'd' | 'e' | 'f' | 'h' | 'i' | 'p' | 'j' | 'z';
  voice?: string;
  speed?: number;
  format?: 'wav' | 'mp3' | 'ogg';
}

interface TTSResponse {
  success: boolean;
  audioUrl?: string;
  error?: string;
  duration?: number;
}

/**
 * Generate speech via the Dudoxx Kokoro TTS streaming endpoint and return
 * a browser object URL for the resulting audio blob.
 *
 * NOTE(review): this reads `process.env.DUDOXX_TTS_*` (server-only in
 * Next.js unless prefixed NEXT_PUBLIC_) but also calls
 * `URL.createObjectURL` (browser-only). From client components, prefer the
 * /api/tts route below so the bearer token never ships to the browser —
 * confirm intended usage.
 *
 * @param options - text plus optional language/voice/speed/format overrides
 * @returns `{ success: true, audioUrl }` or `{ success: false, error }`;
 *          this function never throws.
 */
export async function generateSpeech(options: TTSOptions): Promise<TTSResponse> {
  const {
    text,
    language = 'a',
    voice = 'af_heart',
    speed = 1.0,
    format = 'wav'
  } = options;

  try {
    const response = await fetch(`${process.env.DUDOXX_TTS_URL}/tts/stream`, {
      method: 'POST',
      headers: {
        'Authorization': `Bearer ${process.env.DUDOXX_TTS_TOKEN}`,
        'Content-Type': 'application/json',
      },
      body: JSON.stringify({ text, language, voice, speed, format }),
    });

    if (!response.ok) {
      // The error body is not guaranteed to be JSON (a proxy/gateway can
      // return HTML or plain text) — guard the parse so we surface the
      // HTTP status instead of a secondary JSON parse error.
      let detail = `TTS generation failed (HTTP ${response.status})`;
      try {
        const error = await response.json();
        detail = error.detail || detail;
      } catch {
        // keep the status-based message
      }
      throw new Error(detail);
    }

    const audioBlob = await response.blob();
    // Caller owns this URL and should revoke it when done to free the blob.
    const audioUrl = URL.createObjectURL(audioBlob);

    return { success: true, audioUrl };
  } catch (error) {
    return {
      success: false,
      error: error instanceof Error ? error.message : 'Unknown error'
    };
  }
}

// Server-side TTS (returns buffer)
/**
 * Server-side synthesis: POSTs the options object as-is to /tts/stream and
 * returns the raw audio bytes as a Node Buffer.
 *
 * @throws Error carrying the HTTP status (plus the server's `detail` field
 *         when the error body is JSON) — the previous generic message made
 *         upstream failures undiagnosable.
 */
export async function generateSpeechBuffer(options: TTSOptions): Promise<Buffer> {
  const response = await fetch(`${process.env.DUDOXX_TTS_URL}/tts/stream`, {
    method: 'POST',
    headers: {
      'Authorization': `Bearer ${process.env.DUDOXX_TTS_TOKEN}`,
      'Content-Type': 'application/json',
    },
    body: JSON.stringify(options),
  });

  if (!response.ok) {
    // Error body may not be JSON — fall back to the bare status.
    let detail = '';
    try {
      detail = (await response.json()).detail ?? '';
    } catch {
      // non-JSON error body
    }
    throw new Error(
      `TTS generation failed (HTTP ${response.status})${detail ? `: ${detail}` : ''}`
    );
  }

  const arrayBuffer = await response.arrayBuffer();
  return Buffer.from(arrayBuffer);
}
3. API Route (app/api/tts/route.ts)
// app/api/tts/route.ts
import { NextRequest, NextResponse } from 'next/server';

/**
 * POST /api/tts — server-side proxy to the Dudoxx TTS server so the bearer
 * token never reaches the browser.
 *
 * Body: { text: string; language?: string; voice?: string; format?: string }
 * Returns: audio binary with a correct Content-Type, or a JSON error.
 */
export async function POST(request: NextRequest) {
  try {
    const body = await request.json();
    const { text, language = 'a', voice = 'af_heart', format = 'wav' } = body;

    if (!text || text.length === 0) {
      return NextResponse.json(
        { error: 'Text is required' },
        { status: 400 }
      );
    }

    const ttsResponse = await fetch(
      `${process.env.DUDOXX_TTS_URL}/tts/stream`,
      {
        method: 'POST',
        headers: {
          'Authorization': `Bearer ${process.env.DUDOXX_TTS_TOKEN}`,
          'Content-Type': 'application/json',
        },
        body: JSON.stringify({ text, language, voice, format }),
      }
    );

    if (!ttsResponse.ok) {
      // Upstream errors are usually JSON ({ detail: ... }) but a gateway
      // error page is not — guard the parse so this route degrades cleanly.
      let detail = 'TTS failed';
      try {
        detail = (await ttsResponse.json()).detail ?? detail;
      } catch {
        // non-JSON error body; keep the generic message
      }
      return NextResponse.json(
        { error: detail },
        { status: ttsResponse.status }
      );
    }

    const audioBuffer = await ttsResponse.arrayBuffer();

    // MP3 must be served as audio/mpeg — `audio/mp3` is not a registered
    // MIME type. Map the known formats; fall back to the old behavior for
    // anything unexpected.
    const mimeTypes: Record<string, string> = {
      wav: 'audio/wav',
      mp3: 'audio/mpeg',
      ogg: 'audio/ogg',
    };

    return new NextResponse(audioBuffer, {
      headers: {
        'Content-Type': mimeTypes[format] ?? `audio/${format}`,
        'Content-Disposition': `attachment; filename="speech.${format}"`,
      },
    });
  } catch (error) {
    return NextResponse.json(
      { error: 'Internal server error' },
      { status: 500 }
    );
  }
}
4. React Hook (hooks/useTTS.ts)
// hooks/useTTS.ts
'use client';

import { useState, useCallback, useRef } from 'react';

interface UseTTSOptions {
  language?: string;
  voice?: string;
  autoPlay?: boolean;
}

/**
 * React hook wrapping the /api/tts route.
 *
 * Fixes over the previous version: each `speak()` call used to create a new
 * object URL without ever revoking the previous one, leaking a Blob per call
 * for the lifetime of the page; it also let a still-playing clip overlap the
 * new one. Both are handled here.
 *
 * @returns speak(text), stop(), isLoading, error, audioUrl (last clip)
 */
export function useTTS(options: UseTTSOptions = {}) {
  const { language = 'a', voice = 'af_heart', autoPlay = true } = options;

  const [isLoading, setIsLoading] = useState(false);
  const [error, setError] = useState<string | null>(null);
  const [audioUrl, setAudioUrl] = useState<string | null>(null);
  const audioRef = useRef<HTMLAudioElement | null>(null);
  // Last object URL created, so it can be revoked before the next one.
  const urlRef = useRef<string | null>(null);

  const speak = useCallback(async (text: string) => {
    setIsLoading(true);
    setError(null);

    try {
      const response = await fetch('/api/tts', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ text, language, voice }),
      });

      if (!response.ok) {
        throw new Error('TTS generation failed');
      }

      const blob = await response.blob();
      const url = URL.createObjectURL(blob);

      // Stop any clip still playing and release the previous blob URL.
      audioRef.current?.pause();
      if (urlRef.current) {
        URL.revokeObjectURL(urlRef.current);
      }
      urlRef.current = url;
      setAudioUrl(url);

      if (autoPlay) {
        const audio = new Audio(url);
        audioRef.current = audio;
        await audio.play();
      }
    } catch (err) {
      setError(err instanceof Error ? err.message : 'Unknown error');
    } finally {
      setIsLoading(false);
    }
  }, [language, voice, autoPlay]);

  const stop = useCallback(() => {
    if (audioRef.current) {
      audioRef.current.pause();
      audioRef.current.currentTime = 0;
    }
  }, []);

  return { speak, stop, isLoading, error, audioUrl };
}
5. Component Example
// components/TextToSpeech.tsx
'use client';

import { useState } from 'react';
import { useTTS } from '@/hooks/useTTS';

/**
 * Minimal TTS demo widget: a textarea with Speak/Stop controls wired to the
 * useTTS hook (American English, af_heart voice, auto-play on).
 */
export function TextToSpeech() {
  const [text, setText] = useState('');
  const { speak, stop, isLoading, error } = useTTS({
    language: 'a',
    voice: 'af_heart',
    autoPlay: true,
  });

  // Speak is unavailable while a request is in flight or the input is empty.
  const speakDisabled = isLoading || !text;
  const speakLabel = isLoading ? 'Generating...' : 'Speak';

  const handleSpeak = () => speak(text);

  return (
    <div className="flex flex-col gap-4">
      <textarea
        value={text}
        onChange={(event) => setText(event.target.value)}
        placeholder="Enter text to speak..."
        className="p-3 border rounded-lg"
        rows={4}
      />

      <div className="flex gap-2">
        <button
          onClick={handleSpeak}
          disabled={speakDisabled}
          className="px-4 py-2 bg-primary text-white rounded-lg disabled:opacity-50"
        >
          {speakLabel}
        </button>

        <button
          onClick={stop}
          className="px-4 py-2 bg-gray-500 text-white rounded-lg"
        >
          Stop
        </button>
      </div>

      {error && <p className="text-red-500">{error}</p>}
    </div>
  );
}
6. Server Component with Streaming
// app/speak/[text]/route.ts
import { NextRequest } from 'next/server';

/**
 * GET /speak/[text] — synthesizes the URL-encoded path segment and streams
 * the MP3 straight through to the client.
 *
 * Fixes: the previous version set `Transfer-Encoding: chunked` by hand —
 * that is a hop-by-hop header the runtime manages itself and it is
 * forbidden in HTTP/2. It also forwarded error bodies as if they were
 * audio; upstream failures now return an error status.
 */
export async function GET(
  request: NextRequest,
  { params }: { params: Promise<{ text: string }> }
) {
  const { text } = await params;
  const decodedText = decodeURIComponent(text);

  const response = await fetch(
    `${process.env.DUDOXX_TTS_URL}/tts/stream`,
    {
      method: 'POST',
      headers: {
        'Authorization': `Bearer ${process.env.DUDOXX_TTS_TOKEN}`,
        'Content-Type': 'application/json',
      },
      body: JSON.stringify({
        text: decodedText,
        language: 'a',
        voice: 'af_heart',
        format: 'mp3',
      }),
    }
  );

  if (!response.ok || !response.body) {
    return new Response('TTS generation failed', {
      status: response.ok ? 502 : response.status,
    });
  }

  // Pipe the upstream body through without buffering the whole file.
  return new Response(response.body, {
    headers: {
      'Content-Type': 'audio/mpeg',
    },
  });
}

LiveKit Voice Agent Integration

Configure Dudoxx Kokoro TTS as a custom TTS provider for LiveKit Agents.

LiveKit Agent Configuration
# Python LiveKit Agent with Dudoxx Kokoro TTS
# pip install livekit-agents livekit-plugins-openai httpx

from livekit.agents import tts
from livekit.agents.tts import SynthesizedAudio
import httpx
import io
import wave
import numpy as np
from dataclasses import dataclass
from typing import AsyncGenerator

@dataclass
class DudoxxTTSOptions:
    """Dudoxx Kokoro TTS configuration"""
    # Server root used to build the /tts/stream URL (no trailing slash).
    base_url: str = "https://kokoro.dudoxx.com"
    # Bearer token for the Authorization header; the empty default must be
    # overridden by the caller or every request will be rejected (401).
    api_token: str = ""
    language: str = "a"  # a=American, b=British, d=German, etc.
    voice: str = "af_heart"
    # Playback rate multiplier; the API reference documents a 0.5-2.0 range.
    speed: float = 1.0
    sample_rate: int = 24000  # 24kHz for Kokoro, 22kHz for German/Piper


class DudoxxTTS(tts.TTS):
    """
    Dudoxx Kokoro TTS Plugin for LiveKit Agents

    Server: https://kokoro.dudoxx.com

    Supported Languages:
    - 'a': American English (Kokoro-82M, 24kHz)
    - 'b': British English (Kokoro-82M, 24kHz)
    - 'd': German (Piper TTS, 22kHz)
    - 'e': Spanish (Kokoro-82M, 24kHz)
    - 'f': French (Kokoro-82M, 24kHz)
    - 'h': Hindi (Kokoro-82M, 24kHz)
    - 'i': Italian (Kokoro-82M, 24kHz)
    - 'p': Portuguese (Kokoro-82M, 24kHz)
    - 'j': Japanese (Kokoro-82M, 24kHz)
    - 'z': Mandarin Chinese (Kokoro-82M, 24kHz)

    Voices:
    - af_heart, af_bella, af_nicole, af_sarah (American Female)
    - am_adam, am_michael (American Male)
    - bf_emma, bf_isabella (British Female)
    - bm_george (British Male)
    """

    def __init__(self, options: DudoxxTTSOptions):
        super().__init__(
            capabilities=tts.TTSCapabilities(streaming=False),
            sample_rate=options.sample_rate,
            num_channels=1,
        )
        self.options = options
        # Single shared client reused across synthesize() calls; 60s covers
        # long texts. Must be released via close().
        self._client = httpx.AsyncClient(timeout=60.0)

    async def synthesize(self, text: str) -> SynthesizedAudio:
        """Synthesize speech from text via POST /tts/stream (one-shot WAV).

        Raises httpx.HTTPStatusError on any non-2xx response.
        """
        response = await self._client.post(
            f"{self.options.base_url}/tts/stream",
            headers={
                "Authorization": f"Bearer {self.options.api_token}",
                "Content-Type": "application/json",
            },
            json={
                "text": text,
                "language": self.options.language,
                "voice": self.options.voice,
                "speed": self.options.speed,
                "format": "wav",
            },
        )
        response.raise_for_status()

        # Parse WAV data. Trust the header's frame rate over the configured
        # one: German/Piper serves 22050 Hz while Kokoro serves 24000 Hz, so
        # a misconfigured options.sample_rate would mislabel the audio and
        # cause pitch-shifted playback.
        wav_data = io.BytesIO(response.content)
        with wave.open(wav_data, 'rb') as wav_file:
            actual_rate = wav_file.getframerate() or self.options.sample_rate
            frames = wav_file.readframes(wav_file.getnframes())
            audio_data = np.frombuffer(frames, dtype=np.int16)

        return SynthesizedAudio(
            text=text,
            data=audio_data.tobytes(),
            sample_rate=actual_rate,
            num_channels=1,
        )

    async def close(self):
        """Release the underlying HTTP connection pool."""
        await self._client.aclose()


# Streaming version using WebSocket
class DudoxxStreamingTTS(tts.TTS):
    """
    Streaming TTS using WebSocket for lower latency.

    Protocol (per the /tts/ws API reference): send one JSON request, then
    receive binary 16-bit mono PCM chunks interleaved with JSON status
    messages; a JSON message with {"done": true} terminates the stream.
    """

    def __init__(self, options: DudoxxTTSOptions):
        super().__init__(
            capabilities=tts.TTSCapabilities(streaming=True),
            sample_rate=options.sample_rate,
            num_channels=1,
        )
        self.options = options

    async def stream(self, text: str) -> AsyncGenerator[SynthesizedAudio, None]:
        """Stream synthesized audio chunks"""
        # BUG FIX: `json` was used below but never imported anywhere in this
        # module, raising NameError on first use. Import it locally together
        # with the optional websockets dependency.
        import json
        import websockets

        ws_url = (
            f"wss://kokoro.dudoxx.com/tts/ws"
            f"?token={self.options.api_token}"
        )

        async with websockets.connect(ws_url) as ws:
            # Send TTS request
            await ws.send(json.dumps({
                "text": text,
                "language": self.options.language,
                "voice": self.options.voice,
                "speed": self.options.speed,
            }))

            # Receive audio chunks until the server signals completion
            while True:
                message = await ws.recv()

                if isinstance(message, bytes):
                    # PCM audio chunk (16-bit, mono)
                    yield SynthesizedAudio(
                        text=text,
                        data=message,
                        sample_rate=self.options.sample_rate,
                        num_channels=1,
                    )
                else:
                    # JSON control message; {"done": true} ends the stream
                    data = json.loads(message)
                    if data.get("done"):
                        break
LiveKit Agent Entry Point
# agent.py - LiveKit Agent with Dudoxx TTS
import asyncio
from livekit.agents import (
    AutoSubscribe,
    JobContext,
    WorkerOptions,
    cli,
    llm,
)
from livekit.agents.voice_assistant import VoiceAssistant
from livekit.plugins import openai, silero

# Import our Dudoxx TTS
from dudoxx_tts import DudoxxTTS, DudoxxTTSOptions

async def entrypoint(ctx: JobContext):
    """LiveKit job entry point: wires Dudoxx Kokoro TTS into a voice assistant.

    Credentials are read from the environment (DUDOXX_TTS_URL /
    DUDOXX_TTS_TOKEN, matching the .env section) instead of being
    hard-coded, with the old placeholder kept as a fallback.
    """
    import os

    # Connect to the room (audio only)
    await ctx.connect(auto_subscribe=AutoSubscribe.AUDIO_ONLY)

    # Initialize Dudoxx TTS
    dudoxx_tts = DudoxxTTS(
        DudoxxTTSOptions(
            base_url=os.environ.get("DUDOXX_TTS_URL", "https://kokoro.dudoxx.com"),
            api_token=os.environ.get("DUDOXX_TTS_TOKEN", "YOUR_DUDOXX_API_TOKEN"),
            language="a",  # American English
            voice="af_heart",  # Female voice
            speed=1.0,
            sample_rate=24000,
        )
    )

    # Create voice assistant
    assistant = VoiceAssistant(
        vad=silero.VAD.load(),
        stt=openai.STT(),
        llm=openai.LLM(),
        tts=dudoxx_tts,  # Use Dudoxx Kokoro TTS
    )

    # Start the assistant
    assistant.start(ctx.room)

    # Initial greeting
    await assistant.say(
        "Hello! I'm your AI assistant powered by Dudoxx Kokoro TTS. "
        "How can I help you today?"
    )

    # Keep the job alive until the worker cancels it
    await asyncio.sleep(float('inf'))


if __name__ == "__main__":
    cli.run_app(WorkerOptions(entrypoint_fnc=entrypoint))
Environment Variables
# .env for LiveKit Agent
LIVEKIT_URL=wss://your-livekit-server.com
LIVEKIT_API_KEY=your-api-key
LIVEKIT_API_SECRET=your-api-secret
OPENAI_API_KEY=your-openai-key

# Dudoxx Kokoro TTS
DUDOXX_TTS_URL=https://kokoro.dudoxx.com
DUDOXX_TTS_TOKEN=your-dudoxx-token
Multi-Language Voice Agent
# Multi-language support with automatic language detection
def _make_config(language: str, voice: str, sample_rate: int) -> DudoxxTTSOptions:
    """Build options for one language against the shared server and token."""
    return DudoxxTTSOptions(
        base_url="https://kokoro.dudoxx.com",
        api_token="YOUR_TOKEN",
        language=language,
        voice=voice,
        sample_rate=sample_rate,
    )


# Per-language TTS configurations, keyed by ISO-639-1 code.
LANGUAGE_CONFIGS = {
    "en": _make_config("a", "af_heart", 24000),
    "de": _make_config("d", "thorsten", 22050),  # German uses Piper at 22.05 kHz
    "es": _make_config("e", "af_heart", 24000),
    "fr": _make_config("f", "af_bella", 24000),
}


def get_tts_for_language(lang_code: str) -> DudoxxTTS:
    """Return a TTS engine for lang_code, falling back to English ("en")."""
    config = LANGUAGE_CONFIGS.get(lang_code)
    if config is None:
        config = LANGUAGE_CONFIGS["en"]
    return DudoxxTTS(config)
Performance Considerations
Latency:
  • HTTP streaming: ~500-1500ms for first audio
  • WebSocket streaming: ~150-300ms for first chunk
  • German (Piper): ~100-200ms (fastest)
Recommended for LiveKit:
  • Use WebSocket streaming for conversational AI
  • Pre-warm connection for faster response
  • Consider German (Piper) for ultra-low latency

API Reference

Authentication
Authorization: Bearer YOUR_API_TOKEN
POST /tts/stream

Generate speech and return audio file.

Request:
{
  "text": "Text to synthesize",      // Required, max 10000 chars
  "language": "a",                    // Optional, default: "a"
  "voice": "af_heart",                // Optional, default: "af_heart"
  "speed": 1.0,                       // Optional, 0.5-2.0, default: 1.0
  "format": "wav"                     // Optional: wav, mp3, ogg
}

Response: Audio binary (Content-Type: audio/wav)
POST /tts/generate

Generate speech and return metadata only.

Request: Same as /tts/stream

Response:
{
  "success": true,
  "message": "Speech generated successfully",
  "metadata": {
    "text_length": 50,
    "language": "American English",
    "voice": "af_heart",
    "speed": 1.0,
    "format": "wav",
    "engine": "Kokoro"
  },
  "audio_length": 3.25,
  "sample_rate": 24000
}
POST /tts/batch

Process multiple texts in a single request.

Request:
{
  "texts": ["Text 1", "Text 2", "Text 3"],  // Max 10 texts
  "language": "a",
  "voice": "af_heart"
}

Response:
{
  "results": [
    {"success": true, "audio_length": 2.1},
    {"success": true, "audio_length": 1.8},
    {"success": true, "audio_length": 2.5}
  ],
  "total_duration": 6.4
}
WebSocket /tts/ws

Real-time streaming via WebSocket.

Connect: wss://kokoro.dudoxx.com/tts/ws?token=YOUR_TOKEN

Send (JSON):
{
  "text": "Text to synthesize",
  "language": "a",
  "voice": "af_heart",
  "speed": 1.0
}

Receive:
1. JSON: {"status": "generating", "language": "...", "voice": "..."}
2. Binary: PCM audio chunks (16-bit, mono)
3. JSON: {"done": true, "duration": 3.25, "sample_rate": 24000}
GET /health

Server health check (no auth required).

Response:
{
  "status": "healthy",
  "version": "1.0.0",
  "languages": ["a", "b", "d", "e", "f", "h", "i", "p", "j", "z"],
  "voices": ["af_heart", "af_bella", ...],
  "device": "cuda:0"
}
Error Responses
401 Unauthorized:
{"detail": "Invalid or missing authentication token"}

400 Bad Request:
{"detail": "Text is required"}

429 Too Many Requests:
{"detail": "Rate limit exceeded"}

500 Internal Server Error:
{"detail": "TTS generation failed: ..."}
Rate Limits
  • Standard: 100 requests/minute per token
  • Batch: 20 requests/minute per token
  • WebSocket: Unlimited concurrent connections per token
  • Max text length: 10,000 characters per request