Source code for dpo_reader.tts.openai

"""OpenAI TTS backend - high quality cloud voices."""

from __future__ import annotations

import os

import numpy as np

from .base import TTSBackend

# Voice IDs offered by the OpenAI TTS backend. OpenAIBackend.get_voices()
# exposes this list, and OpenAIBackend.synthesize() maps any voice name that
# is not in it onto one of these entries.
OPENAI_VOICES = ["alloy", "echo", "fable", "onyx", "nova", "shimmer"]



[docs]
class OpenAIBackend(TTSBackend):
    """TTS backend that synthesizes speech through the OpenAI HTTP API.

    Requires the OPENAI_API_KEY environment variable and the ``httpx``
    package; both are checked when an instance is constructed.
    """

    name = "openai"  # Identifier for this backend
    sample_rate = 24000  # Hz; matches the raw 24kHz PCM requested from the API
    narrator_voice = "onyx"  # Deep, neutral voice for narration


[docs]
    def __init__(self, model: str = "tts-1"):
        """Initialize OpenAI TTS backend.

        Args:
            model: Model to use - "tts-1" (faster) or "tts-1-hd" (higher quality)
        """
        self.model = model
        self.api_key = os.environ.get("OPENAI_API_KEY")
        if not self.api_key:
            raise ValueError(
                "OPENAI_API_KEY environment variable required for OpenAI TTS. "
                "Get one at https://platform.openai.com/api-keys"
            )

        # Lazy import
        try:
            import httpx

            self._client = httpx.Client(
                base_url="https://api.openai.com/v1",
                headers={"Authorization": f"Bearer {self.api_key}"},
                timeout=60.0,
            )
        except ImportError as e:
            raise ImportError("httpx required for OpenAI backend") from e



[docs]
    def get_voices(self) -> list[str]:
        """Return list of available voice IDs.

        Returns:
            A new list on every call, so callers may sort, filter or extend
            the result without altering the module-level OPENAI_VOICES pool
            that synthesize() relies on.
        """
        return list(OPENAI_VOICES)



[docs]
    def synthesize(self, text: str, voice: str) -> np.ndarray:
        """Synthesize text to audio using OpenAI TTS API.

        Args:
            text: Text to synthesize
            voice: Voice ID to use (alloy, echo, fable, onyx, nova, shimmer).
                Any other value is mapped deterministically onto one of the
                known voices, so the same name yields the same voice in
                every run.

        Returns:
            Audio as float32 numpy array (samples scaled by 1/32768)

        Raises:
            httpx.HTTPStatusError: On a non-429 error response, or when the
                API is still rate limiting on the final attempt.
        """
        import time
        import zlib

        import httpx

        if voice not in OPENAI_VOICES:
            # Builtin hash() of a str is salted per process, which would give
            # the same speaker a different voice on every run. CRC32 is stable.
            digest = zlib.crc32(str(voice).encode("utf-8"))
            voice = OPENAI_VOICES[digest % len(OPENAI_VOICES)]

        # Retry with exponential backoff for rate limits
        max_retries = 8
        base_delay = 2.0

        payload = {
            "model": self.model,
            "input": text,
            "voice": voice,
            "response_format": "pcm",  # Raw 24kHz 16-bit mono PCM
        }

        for attempt in range(max_retries):
            try:
                response = self._client.post("/audio/speech", json=payload)
                response.raise_for_status()
            except Exception as e:
                if isinstance(e, httpx.HTTPStatusError):
                    rate_limited = e.response.status_code == 429
                else:
                    # Best-effort: some failures only mention the status in
                    # their message.
                    rate_limited = "429" in str(e)

                if rate_limited and attempt < max_retries - 1:
                    # Rate limited - wait and retry with exponential backoff
                    delay = base_delay * (2**attempt)
                    print(f"Rate limited, waiting {delay:.0f}s (attempt {attempt + 1}/{max_retries})...")
                    time.sleep(delay)
                    continue
                raise

            # The API emits little-endian signed 16-bit samples; decode with an
            # explicit byte order so the result is correct on any host.
            audio_int16 = np.frombuffer(response.content, dtype="<i2")
            return audio_int16.astype(np.float32) / 32768.0

        # Defensive: every loop iteration returns, retries, or raises.
        raise RuntimeError(f"Failed to synthesize after {max_retries} retries")


    def __del__(self):
        """Cleanup HTTP client."""
        if hasattr(self, "_client"):
            self._client.close()