High-Quality Neural Text-to-Speech Engine
Base URL: https://kokoro.dudoxx.com
Authentication: Bearer Token (Header)
Content-Type: application/json
| Code | Language | Engine | Sample Rate |
|---|---|---|---|
| a | American English | Kokoro-82M | 24kHz |
| b | British English | Kokoro-82M | 24kHz |
| d | German | Piper TTS | 22kHz |
| e | Spanish | Kokoro-82M | 24kHz |
| f | French | Kokoro-82M | 24kHz |
| h | Hindi | Kokoro-82M | 24kHz |
| i | Italian | Kokoro-82M | 24kHz |
| p | Portuguese | Kokoro-82M | 24kHz |
| j | Japanese | Kokoro-82M | 24kHz |
| z | Mandarin Chinese | Kokoro-82M | 24kHz |
| Voice ID | Description | Gender |
|---|---|---|
| af_heart | Heart - Warm, expressive | Female |
| af_bella | Bella - Professional | Female |
| af_nicole | Nicole - Friendly | Female |
| af_sarah | Sarah - Clear, articulate | Female |
| am_adam | Adam - Deep, authoritative | Male |
| am_michael | Michael - Conversational | Male |
| bf_emma | Emma - British accent | Female |
| bf_isabella | Isabella - British accent | Female |
| bm_george | George - British accent | Male |
# Generate speech and save to file
curl -X POST "https://kokoro.dudoxx.com/tts/stream" \
  -H "Authorization: Bearer YOUR_API_TOKEN" \
  -H "Content-Type: application/json" \
  -d '{
    "text": "Hello, welcome to Dudoxx Kokoro TTS!",
    "language": "a",
    "voice": "af_heart",
    "format": "wav"
  }' \
  -o output.wav
Complete guide for integrating Dudoxx Kokoro TTS into your Next.js application.
# .env.local
DUDOXX_TTS_URL=https://kokoro.dudoxx.com
DUDOXX_TTS_TOKEN=your-api-token-here
// lib/tts.ts
interface TTSOptions {
  text: string;
  language?: 'a' | 'b' | 'd' | 'e' | 'f' | 'h' | 'i' | 'p' | 'j' | 'z';
  voice?: string;
  speed?: number;
  format?: 'wav' | 'mp3' | 'ogg';
}

interface TTSResponse {
  success: boolean;
  audioUrl?: string;
  error?: string;
  duration?: number;
}

export async function generateSpeech(options: TTSOptions): Promise<TTSResponse> {
  const {
    text,
    language = 'a',
    voice = 'af_heart',
    speed = 1.0,
    format = 'wav'
  } = options;

  try {
    const response = await fetch(`${process.env.DUDOXX_TTS_URL}/tts/stream`, {
      method: 'POST',
      headers: {
        'Authorization': `Bearer ${process.env.DUDOXX_TTS_TOKEN}`,
        'Content-Type': 'application/json',
      },
      body: JSON.stringify({ text, language, voice, speed, format }),
    });

    if (!response.ok) {
      const error = await response.json();
      throw new Error(error.detail || 'TTS generation failed');
    }

    const audioBlob = await response.blob();
    const audioUrl = URL.createObjectURL(audioBlob);

    return { success: true, audioUrl };
  } catch (error) {
    return {
      success: false,
      error: error instanceof Error ? error.message : 'Unknown error'
    };
  }
}

// Server-side TTS (returns buffer)
export async function generateSpeechBuffer(options: TTSOptions): Promise<Buffer> {
  const response = await fetch(`${process.env.DUDOXX_TTS_URL}/tts/stream`, {
    method: 'POST',
    headers: {
      'Authorization': `Bearer ${process.env.DUDOXX_TTS_TOKEN}`,
      'Content-Type': 'application/json',
    },
    body: JSON.stringify(options),
  });

  if (!response.ok) {
    throw new Error('TTS generation failed');
  }

  const arrayBuffer = await response.arrayBuffer();
  return Buffer.from(arrayBuffer);
}
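`generateSpeechBuffer` can be exercised from any server-side context. A minimal sketch, assuming the `@/lib/tts` path alias and a TypeScript runner such as `tsx` (both assumptions, not part of the API), with `DUDOXX_TTS_URL` and `DUDOXX_TTS_TOKEN` set in the environment:

// scripts/tts-smoke-test.ts (illustrative) — run server-side, e.g. with `npx tsx`.
import { writeFile } from 'node:fs/promises';
import { generateSpeechBuffer } from '@/lib/tts';

async function main() {
  // Generate a short WAV clip and write it to disk.
  const audio = await generateSpeechBuffer({
    text: 'Hello from the server!',
    language: 'a',
    voice: 'af_heart',
    format: 'wav',
  });
  await writeFile('speech.wav', audio);
  console.log(`Wrote ${audio.length} bytes to speech.wav`);
}

main().catch(console.error);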
// app/api/tts/route.ts
import { NextRequest, NextResponse } from 'next/server';
export async function POST(request: NextRequest) {
  try {
    const body = await request.json();
    const { text, language = 'a', voice = 'af_heart', format = 'wav' } = body;

    if (!text || text.length === 0) {
      return NextResponse.json(
        { error: 'Text is required' },
        { status: 400 }
      );
    }

    const ttsResponse = await fetch(
      `${process.env.DUDOXX_TTS_URL}/tts/stream`,
      {
        method: 'POST',
        headers: {
          'Authorization': `Bearer ${process.env.DUDOXX_TTS_TOKEN}`,
          'Content-Type': 'application/json',
        },
        body: JSON.stringify({ text, language, voice, format }),
      }
    );

    if (!ttsResponse.ok) {
      const error = await ttsResponse.json();
      return NextResponse.json(
        { error: error.detail || 'TTS failed' },
        { status: ttsResponse.status }
      );
    }

    const audioBuffer = await ttsResponse.arrayBuffer();

    // mp3 is served as audio/mpeg; wav and ogg map directly to audio/<format>.
    const contentType = format === 'mp3' ? 'audio/mpeg' : `audio/${format}`;

    return new NextResponse(audioBuffer, {
      headers: {
        'Content-Type': contentType,
        'Content-Disposition': `attachment; filename="speech.${format}"`,
      },
    });
  } catch (error) {
    return NextResponse.json(
      { error: 'Internal server error' },
      { status: 500 }
    );
  }
}
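Because the route returns the audio with a `Content-Disposition: attachment` header, a client can also offer the result as a file download. A hedged sketch of a small client-side helper (the function name is illustrative):

// Client-side helper: fetch audio from the /api/tts route and trigger a download.
export async function downloadSpeech(text: string, format: 'wav' | 'mp3' | 'ogg' = 'wav') {
  const response = await fetch('/api/tts', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ text, format }),
  });
  if (!response.ok) throw new Error('TTS request failed');

  const blob = await response.blob();
  const url = URL.createObjectURL(blob);
  const link = document.createElement('a');
  link.href = url;
  link.download = `speech.${format}`;
  link.click();
  URL.revokeObjectURL(url);
}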
// hooks/useTTS.ts
'use client';
import { useState, useCallback, useRef } from 'react';
interface UseTTSOptions {
  language?: string;
  voice?: string;
  autoPlay?: boolean;
}

export function useTTS(options: UseTTSOptions = {}) {
  const { language = 'a', voice = 'af_heart', autoPlay = true } = options;
  const [isLoading, setIsLoading] = useState(false);
  const [error, setError] = useState<string | null>(null);
  const [audioUrl, setAudioUrl] = useState<string | null>(null);
  const audioRef = useRef<HTMLAudioElement | null>(null);

  const speak = useCallback(async (text: string) => {
    setIsLoading(true);
    setError(null);

    try {
      const response = await fetch('/api/tts', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ text, language, voice }),
      });

      if (!response.ok) {
        throw new Error('TTS generation failed');
      }

      const blob = await response.blob();
      const url = URL.createObjectURL(blob);
      setAudioUrl(url);

      if (autoPlay) {
        const audio = new Audio(url);
        audioRef.current = audio;
        await audio.play();
      }
    } catch (err) {
      setError(err instanceof Error ? err.message : 'Unknown error');
    } finally {
      setIsLoading(false);
    }
  }, [language, voice, autoPlay]);

  const stop = useCallback(() => {
    if (audioRef.current) {
      audioRef.current.pause();
      audioRef.current.currentTime = 0;
    }
  }, []);

  return { speak, stop, isLoading, error, audioUrl };
}
// components/TextToSpeech.tsx
'use client';
import { useState } from 'react';
import { useTTS } from '@/hooks/useTTS';
export function TextToSpeech() {
  const [text, setText] = useState('');
  const { speak, stop, isLoading, error } = useTTS({
    language: 'a',
    voice: 'af_heart',
    autoPlay: true,
  });

  return (
    <div className="flex flex-col gap-4">
      <textarea
        value={text}
        onChange={(e) => setText(e.target.value)}
        placeholder="Enter text to speak..."
        className="p-3 border rounded-lg"
        rows={4}
      />
      <div className="flex gap-2">
        <button
          onClick={() => speak(text)}
          disabled={isLoading || !text}
          className="px-4 py-2 bg-primary text-white rounded-lg disabled:opacity-50"
        >
          {isLoading ? 'Generating...' : 'Speak'}
        </button>
        <button
          onClick={stop}
          className="px-4 py-2 bg-gray-500 text-white rounded-lg"
        >
          Stop
        </button>
      </div>
      {error && <p className="text-red-500">{error}</p>}
    </div>
  );
}
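The component can then be rendered from any page. A minimal sketch, assuming the `@/components/TextToSpeech` path alias (parallel to the `@/hooks/useTTS` alias used above):

// app/page.tsx — render the TextToSpeech component on a page.
import { TextToSpeech } from '@/components/TextToSpeech';

export default function Home() {
  return (
    <main className="p-8">
      <TextToSpeech />
    </main>
  );
}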
// app/speak/[text]/route.ts
import { NextRequest } from 'next/server';
export async function GET(
  request: NextRequest,
  { params }: { params: Promise<{ text: string }> }
) {
  const { text } = await params;
  const decodedText = decodeURIComponent(text);

  const response = await fetch(
    `${process.env.DUDOXX_TTS_URL}/tts/stream`,
    {
      method: 'POST',
      headers: {
        'Authorization': `Bearer ${process.env.DUDOXX_TTS_TOKEN}`,
        'Content-Type': 'application/json',
      },
      body: JSON.stringify({
        text: decodedText,
        language: 'a',
        voice: 'af_heart',
        format: 'mp3',
      }),
    }
  );

  // Stream the response
  return new Response(response.body, {
    headers: {
      'Content-Type': 'audio/mpeg',
      'Transfer-Encoding': 'chunked',
    },
  });
}
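One way to consume this route is to point an `<audio>` element at it, so the browser streams the MP3 as it arrives. A minimal sketch (the component name is illustrative):

// components/SpeakLink.tsx — plays the streamed MP3 from the /speak/[text] route.
'use client';

export function SpeakLink({ text }: { text: string }) {
  return (
    <audio controls preload="none" src={`/speak/${encodeURIComponent(text)}`} />
  );
}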
Configure Dudoxx Kokoro TTS as a custom TTS provider for LiveKit Agents.
# Python LiveKit Agent with Dudoxx Kokoro TTS
# pip install livekit-agents livekit-plugins-openai httpx
from livekit.agents import tts
from livekit.agents.tts import SynthesizedAudio
import httpx
import io
import json
import wave
import numpy as np
from dataclasses import dataclass
from typing import AsyncGenerator


@dataclass
class DudoxxTTSOptions:
    """Dudoxx Kokoro TTS configuration"""
    base_url: str = "https://kokoro.dudoxx.com"
    api_token: str = ""
    language: str = "a"  # a=American, b=British, d=German, etc.
    voice: str = "af_heart"
    speed: float = 1.0
    sample_rate: int = 24000  # 24000 Hz for Kokoro, 22050 Hz for German/Piper
class DudoxxTTS(tts.TTS):
    """
    Dudoxx Kokoro TTS Plugin for LiveKit Agents
    Server: https://kokoro.dudoxx.com

    Supported Languages:
    - 'a': American English (Kokoro-82M, 24kHz)
    - 'b': British English (Kokoro-82M, 24kHz)
    - 'd': German (Piper TTS, 22kHz)
    - 'e': Spanish (Kokoro-82M, 24kHz)
    - 'f': French (Kokoro-82M, 24kHz)
    - 'h': Hindi (Kokoro-82M, 24kHz)
    - 'i': Italian (Kokoro-82M, 24kHz)
    - 'p': Portuguese (Kokoro-82M, 24kHz)
    - 'j': Japanese (Kokoro-82M, 24kHz)
    - 'z': Mandarin Chinese (Kokoro-82M, 24kHz)

    Voices:
    - af_heart, af_bella, af_nicole, af_sarah (American Female)
    - am_adam, am_michael (American Male)
    - bf_emma, bf_isabella (British Female)
    - bm_george (British Male)
    """

    def __init__(self, options: DudoxxTTSOptions):
        super().__init__(
            capabilities=tts.TTSCapabilities(streaming=False),
            sample_rate=options.sample_rate,
            num_channels=1,
        )
        self.options = options
        self._client = httpx.AsyncClient(timeout=60.0)

    async def synthesize(self, text: str) -> SynthesizedAudio:
        """Synthesize speech from text"""
        response = await self._client.post(
            f"{self.options.base_url}/tts/stream",
            headers={
                "Authorization": f"Bearer {self.options.api_token}",
                "Content-Type": "application/json",
            },
            json={
                "text": text,
                "language": self.options.language,
                "voice": self.options.voice,
                "speed": self.options.speed,
                "format": "wav",
            },
        )
        response.raise_for_status()

        # Parse WAV data
        wav_data = io.BytesIO(response.content)
        with wave.open(wav_data, 'rb') as wav_file:
            frames = wav_file.readframes(wav_file.getnframes())
            audio_data = np.frombuffer(frames, dtype=np.int16)

        return SynthesizedAudio(
            text=text,
            data=audio_data.tobytes(),
            sample_rate=self.options.sample_rate,
            num_channels=1,
        )

    async def close(self):
        await self._client.aclose()
# Streaming version using WebSocket
class DudoxxStreamingTTS(tts.TTS):
    """
    Streaming TTS using WebSocket for lower latency
    """

    def __init__(self, options: DudoxxTTSOptions):
        super().__init__(
            capabilities=tts.TTSCapabilities(streaming=True),
            sample_rate=options.sample_rate,
            num_channels=1,
        )
        self.options = options

    async def stream(self, text: str) -> AsyncGenerator[SynthesizedAudio, None]:
        """Stream synthesized audio chunks"""
        import websockets

        ws_url = (
            f"wss://kokoro.dudoxx.com/tts/ws"
            f"?token={self.options.api_token}"
        )

        async with websockets.connect(ws_url) as ws:
            # Send TTS request
            await ws.send(json.dumps({
                "text": text,
                "language": self.options.language,
                "voice": self.options.voice,
                "speed": self.options.speed,
            }))

            # Receive audio chunks
            while True:
                message = await ws.recv()
                if isinstance(message, bytes):
                    # PCM audio chunk
                    yield SynthesizedAudio(
                        text=text,
                        data=message,
                        sample_rate=self.options.sample_rate,
                        num_channels=1,
                    )
                else:
                    # JSON message
                    data = json.loads(message)
                    if data.get("done"):
                        break
# agent.py - LiveKit Agent with Dudoxx TTS
import asyncio

from livekit.agents import (
    AutoSubscribe,
    JobContext,
    WorkerOptions,
    cli,
    llm,
)
from livekit.agents.voice_assistant import VoiceAssistant
from livekit.plugins import openai, silero

# Import our Dudoxx TTS
from dudoxx_tts import DudoxxTTS, DudoxxTTSOptions


async def entrypoint(ctx: JobContext):
    # Connect to the room
    await ctx.connect(auto_subscribe=AutoSubscribe.AUDIO_ONLY)

    # Initialize Dudoxx TTS
    dudoxx_tts = DudoxxTTS(
        DudoxxTTSOptions(
            base_url="https://kokoro.dudoxx.com",
            api_token="YOUR_DUDOXX_API_TOKEN",
            language="a",  # American English
            voice="af_heart",  # Female voice
            speed=1.0,
            sample_rate=24000,
        )
    )

    # Create voice assistant
    assistant = VoiceAssistant(
        vad=silero.VAD.load(),
        stt=openai.STT(),
        llm=openai.LLM(),
        tts=dudoxx_tts,  # Use Dudoxx Kokoro TTS
    )

    # Start the assistant
    assistant.start(ctx.room)

    # Initial greeting
    await assistant.say(
        "Hello! I'm your AI assistant powered by Dudoxx Kokoro TTS. "
        "How can I help you today?"
    )

    # Keep running
    await asyncio.sleep(float('inf'))


if __name__ == "__main__":
    cli.run_app(WorkerOptions(entrypoint_fnc=entrypoint))
# .env for LiveKit Agent
LIVEKIT_URL=wss://your-livekit-server.com
LIVEKIT_API_KEY=your-api-key
LIVEKIT_API_SECRET=your-api-secret
OPENAI_API_KEY=your-openai-key
# Dudoxx Kokoro TTS
DUDOXX_TTS_URL=https://kokoro.dudoxx.com
DUDOXX_TTS_TOKEN=your-dudoxx-token
# Multi-language support with automatic language detection
LANGUAGE_CONFIGS = {
    "en": DudoxxTTSOptions(
        base_url="https://kokoro.dudoxx.com",
        api_token="YOUR_TOKEN",
        language="a",
        voice="af_heart",
        sample_rate=24000,
    ),
    "de": DudoxxTTSOptions(
        base_url="https://kokoro.dudoxx.com",
        api_token="YOUR_TOKEN",
        language="d",
        voice="thorsten",  # German uses Piper
        sample_rate=22050,  # Different sample rate for Piper
    ),
    "es": DudoxxTTSOptions(
        base_url="https://kokoro.dudoxx.com",
        api_token="YOUR_TOKEN",
        language="e",
        voice="af_heart",
        sample_rate=24000,
    ),
    "fr": DudoxxTTSOptions(
        base_url="https://kokoro.dudoxx.com",
        api_token="YOUR_TOKEN",
        language="f",
        voice="af_bella",
        sample_rate=24000,
    ),
}


def get_tts_for_language(lang_code: str) -> DudoxxTTS:
    config = LANGUAGE_CONFIGS.get(lang_code, LANGUAGE_CONFIGS["en"])
    return DudoxxTTS(config)
All endpoints except the health check require a bearer token:

Authorization: Bearer YOUR_API_TOKEN

POST /tts/stream: generate speech and return the audio file.
Request:
{
  "text": "Text to synthesize",   // Required, max 10000 chars
  "language": "a",                // Optional, default: "a"
  "voice": "af_heart",            // Optional, default: "af_heart"
  "speed": 1.0,                   // Optional, 0.5-2.0, default: 1.0
  "format": "wav"                 // Optional: wav, mp3, ogg
}
Response: Audio binary (Content-Type: audio/wav)
Generate speech and return metadata only.
Request: Same as /tts/stream
Response:
{
  "success": true,
  "message": "Speech generated successfully",
  "metadata": {
    "text_length": 50,
    "language": "American English",
    "voice": "af_heart",
    "speed": 1.0,
    "format": "wav",
    "engine": "Kokoro"
  },
  "audio_length": 3.25,
  "sample_rate": 24000
}
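For TypeScript clients, the response can be typed directly from the example above (the interface name is illustrative):

// Typed view of the metadata response shown above.
interface TTSMetadataResponse {
  success: boolean;
  message: string;
  metadata: {
    text_length: number;
    language: string;   // human-readable name, e.g. "American English"
    voice: string;
    speed: number;
    format: string;
    engine: string;     // e.g. "Kokoro"
  };
  audio_length: number; // seconds
  sample_rate: number;  // Hz
}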
Process multiple texts in a single request.
Request:
{
  "texts": ["Text 1", "Text 2", "Text 3"],  // Max 10 texts
  "language": "a",
  "voice": "af_heart"
}
Response:
{
  "results": [
    {"success": true, "audio_length": 2.1},
    {"success": true, "audio_length": 1.8},
    {"success": true, "audio_length": 2.5}
  ],
  "total_duration": 6.4
}
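Likewise, an illustrative typed view of the batch request and response:

// Typed view of the batch request/response shown above.
interface BatchTTSRequest {
  texts: string[];        // max 10 texts per request
  language?: string;
  voice?: string;
}

interface BatchTTSResponse {
  results: { success: boolean; audio_length: number }[];
  total_duration: number; // seconds
}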
Real-time streaming via WebSocket.
Connect: wss://kokoro.dudoxx.com/tts/ws?token=YOUR_TOKEN
Send (JSON):
{
  "text": "Text to synthesize",
  "language": "a",
  "voice": "af_heart",
  "speed": 1.0
}
Receive:
1. JSON: {"status": "generating", "language": "...", "voice": "..."}
2. Binary: PCM audio chunks (16-bit, mono)
3. JSON: {"done": true, "duration": 3.25, "sample_rate": 24000}
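The exchange above (one JSON request in, interleaved binary PCM chunks and a final JSON "done" message out) can be consumed with the standard browser WebSocket API. A minimal sketch that only collects the PCM chunks; decoding and playback (e.g. via the Web Audio API) are left out:

// Minimal WebSocket client for /tts/ws: sends one request, collects PCM chunks.
function streamSpeech(token: string, text: string): Promise<ArrayBuffer[]> {
  return new Promise((resolve, reject) => {
    const chunks: ArrayBuffer[] = [];
    const ws = new WebSocket(`wss://kokoro.dudoxx.com/tts/ws?token=${token}`);
    ws.binaryType = 'arraybuffer';

    ws.onopen = () => {
      ws.send(JSON.stringify({ text, language: 'a', voice: 'af_heart', speed: 1.0 }));
    };

    ws.onmessage = (event) => {
      if (event.data instanceof ArrayBuffer) {
        chunks.push(event.data); // 16-bit mono PCM chunk
      } else {
        const message = JSON.parse(event.data as string);
        if (message.done) {
          ws.close();
          resolve(chunks);
        }
        // The initial {"status": "generating", ...} message needs no handling here.
      }
    };

    ws.onerror = () => reject(new Error('WebSocket error'));
  });
}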
Server health check (no auth required).
Response:
{
  "status": "healthy",
  "version": "1.0.0",
  "languages": ["a", "b", "d", "e", "f", "h", "i", "p", "j", "z"],
  "voices": ["af_heart", "af_bella", ...],
  "device": "cuda:0"
}
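An illustrative typed view of the health payload, mirroring the example fields:

// Typed view of the health-check response shown above.
interface HealthResponse {
  status: string;      // e.g. "healthy"
  version: string;
  languages: string[]; // supported language codes
  voices: string[];    // available voice IDs
  device: string;      // e.g. "cuda:0"
}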
401 Unauthorized:
{"detail": "Invalid or missing authentication token"}
400 Bad Request:
{"detail": "Text is required"}
429 Too Many Requests:
{"detail": "Rate limit exceeded"}
500 Internal Server Error:
{"detail": "TTS generation failed: ..."}