From ea1ddd8c63fda9b7fc56d6db25098de680cff0ac Mon Sep 17 00:00:00 2001
From: Hermes Agent
Date: Mon, 27 Apr 2026 10:51:56 +0000
Subject: [PATCH] Add TTS/STT service with Coqui TTS and faster-whisper

- Added tts-stt service definition to ai/compose.yml
- Created tts-stt directory with:
  - Dockerfile: Based on debian:bookworm-slim (Python 3.11) with a virtual
    environment; the ROCm runtime comes from the PyTorch ROCm wheels
  - requirements.txt: Python packages including TTS, faster-whisper, FastAPI
  - app.py: FastAPI service with /tts, /stt and /health endpoints
- Service includes GPU device mapping (/dev/kfd, /dev/dri) for ROCm acceleration
- Uses YourTTS multilingual model for TTS and large-v3 for Whisper
- Whisper runs on the CPU (WHISPER_DEVICE=cpu); TTS runs on the GPU
- Models and caches are stored on the persistent volumes through TTS_HOME
  and HF_HOME
---
Note: the blob ids on the "index" lines below predate this revision of the
patch and no longer match its content.

 ai/compose.yml              |  26 ++++++
 ai/tts-stt/Dockerfile       |  58 ++++++++++++++
 ai/tts-stt/app.py           | 150 ++++++++++++++++++++++++++++++++++++
 ai/tts-stt/requirements.txt |  11 +++
 4 files changed, 245 insertions(+)
 create mode 100644 ai/tts-stt/Dockerfile
 create mode 100644 ai/tts-stt/app.py
 create mode 100644 ai/tts-stt/requirements.txt

diff --git a/ai/compose.yml b/ai/compose.yml
index 460d44d..948c868 100644
--- a/ai/compose.yml
+++ b/ai/compose.yml
@@ -81,6 +81,32 @@ services:
       - "303"
       - "26"
 
+
+  tts-stt:
+    build:
+      context: ./tts-stt
+      dockerfile: Dockerfile
+    container_name: tts-stt
+    restart: always
+    volumes:
+      - /mnt/HoardingCow_docker_data/TTS-Stt/models:/app/models
+      - /mnt/HoardingCow_docker_data/TTS-Stt/cache:/app/cache
+    environment:
+      - TTS_MODEL=tts_models/multilingual/multi-dataset/your_tts
+      - WHISPER_MODEL=large-v3
+      # PyTorch's ROCm build exposes AMD GPUs through the "cuda" device name
+      - DEVICE=cuda
+      # NOTE(review): CTranslate2 wheels on PyPI target CUDA, not ROCm - confirm
+      - WHISPER_DEVICE=cpu
+    devices:
+      - /dev/kfd:/dev/kfd
+      - /dev/dri:/dev/dri
+    group_add:
+      - "303"
+      - "26"
+    networks:
+      - ai_backend
+
 networks:
   ai_net:
     external: true
diff --git a/ai/tts-stt/Dockerfile b/ai/tts-stt/Dockerfile
new file mode 100644
index 0000000..a9083dd
--- /dev/null
+++ b/ai/tts-stt/Dockerfile
@@ -0,0 +1,58 @@
+# Debian bookworm provides Python 3.11. The pinned TTS==0.22.0 and torch==2.3.0
+# do not support the Python 3.13 shipped by trixie.
+FROM debian:bookworm-slim
+
+# Install system dependencies (curl is needed by the HEALTHCHECK below)
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    curl \
+    ffmpeg \
+    git \
+    libsndfile1 \
+    python3-dev \
+    python3-pip \
+    python3-venv \
+    wget \
+    && rm -rf /var/lib/apt/lists/*
+
+# No ROCm apt packages are installed: the PyTorch ROCm wheels listed in
+# requirements.txt bundle their own ROCm runtime libraries, so the container
+# only needs the /dev/kfd and /dev/dri devices mapped in compose.yml.
+
+# Debian marks the system Python as externally managed (PEP 668), so install
+# packages into a virtual environment instead.
+ENV VIRTUAL_ENV=/opt/venv
+RUN python3 -m venv "$VIRTUAL_ENV"
+ENV PATH="$VIRTUAL_ENV/bin:$PATH"
+
+# Store downloaded models in the directories that compose.yml mounts as
+# volumes: Coqui TTS honours TTS_HOME, and faster-whisper downloads through the
+# Hugging Face hub, which honours HF_HOME.
+ENV TTS_HOME=/app/models \
+    HF_HOME=/app/cache
+
+WORKDIR /app
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Create non-root user and the directories it downloads models into
+RUN useradd -m -u 1000 appuser \
+    && mkdir -p /app/models /app/cache \
+    && chown appuser:appuser /app/models /app/cache
+
+# Copy application code
+COPY app.py .
+
+# Expose port
+EXPOSE 8000
+
+# Switch to non-root user
+USER appuser
+
+# Health check; the long start period covers the model download and load that
+# happen at startup
+HEALTHCHECK --interval=30s --timeout=10s --start-period=300s --retries=3 \
+    CMD curl -f http://localhost:8000/health || exit 1
+
+# Command to run the application
+CMD ["python", "app.py"]
diff --git a/ai/tts-stt/app.py b/ai/tts-stt/app.py
new file mode 100644
index 0000000..4e46801
--- /dev/null
+++ b/ai/tts-stt/app.py
@@ -0,0 +1,150 @@
+import logging
+import os
+import tempfile
+import uuid
+from pathlib import Path
+from typing import Optional
+
+import numpy as np
+import torch
+from TTS.api import TTS
+from faster_whisper import WhisperModel
+from fastapi import FastAPI, File, UploadFile, Form, HTTPException
+from fastapi.responses import Response
+from pydantic import BaseModel
+
+logger = logging.getLogger(__name__)
+
+app = FastAPI(title="TTS-STT Service", description="Coqui TTS and faster-whisper service")
+
+# Models are loaded once at startup and shared by every request
+tts_model = None
+whisper_model = None
+
+
+@app.on_event("startup")
+async def startup_event():
+    """Load the TTS and Whisper models selected through the environment."""
+    global tts_model, whisper_model
+
+    # Get model names from environment or use defaults
+    tts_model_name = os.getenv("TTS_MODEL", "tts_models/multilingual/multi-dataset/your_tts")
+    whisper_model_name = os.getenv("WHISPER_MODEL", "large-v3")
+    device = os.getenv("DEVICE", "cuda" if torch.cuda.is_available() else "cpu")
+    # faster-whisper runs on CTranslate2 rather than PyTorch, so it may need a
+    # different device than the TTS model; it defaults to DEVICE.
+    whisper_device = os.getenv("WHISPER_DEVICE", device)
+
+    print(f"Loading TTS model: {tts_model_name} on {device}")
+    tts_model = TTS(model_name=tts_model_name, progress_bar=False, gpu=device == "cuda")
+
+    print(f"Loading Whisper model: {whisper_model_name} on {whisper_device}")
+    whisper_model = WhisperModel(
+        whisper_model_name,
+        device=whisper_device,
+        compute_type="float16" if whisper_device == "cuda" else "int8",
+    )
+
+    print("Models loaded successfully")
+
+
+class TTSRequest(BaseModel):
+    """Request body for the /tts endpoint."""
+
+    text: str
+    language: Optional[str] = None
+    # Path, on the server's filesystem, of a reference recording used for
+    # voice cloning with YourTTS.
+    speaker_wav: Optional[str] = None
+
+
+@app.post("/tts")
+async def text_to_speech(request: TTSRequest):
+    """Synthesize `request.text` and return it as WAV audio."""
+    if tts_model is None:
+        raise HTTPException(status_code=503, detail="TTS model not loaded")
+
+    synth_kwargs = {"text": request.text}
+    # Coqui requires `language` for multilingual models and rejects it for
+    # single-language ones.
+    if tts_model.is_multi_lingual:
+        synth_kwargs["language"] = request.language or "en"
+    # NOTE(review): speaker_wav is a client-supplied server-side path; expose
+    # this endpoint to trusted callers only.
+    if request.speaker_wav and os.path.exists(request.speaker_wav):
+        # Voice cloning
+        synth_kwargs["speaker_wav"] = request.speaker_wav
+    elif tts_model.is_multi_speaker and tts_model.speakers:
+        # Multi-speaker models such as YourTTS refuse to synthesize without a
+        # speaker, so fall back to the first built-in voice.
+        synth_kwargs["speaker"] = tts_model.speakers[0]
+
+    fd, filepath = tempfile.mkstemp(suffix=".wav")
+    os.close(fd)
+    try:
+        tts_model.tts_to_file(file_path=filepath, **synth_kwargs)
+        with open(filepath, "rb") as f:
+            audio_data = f.read()
+    except Exception as e:
+        logger.exception("TTS synthesis failed")
+        raise HTTPException(status_code=500, detail=str(e)) from e
+    finally:
+        # Remove the temporary file whether or not synthesis succeeded
+        Path(filepath).unlink(missing_ok=True)
+
+    return Response(content=audio_data, media_type="audio/wav")
+
+
+@app.post("/stt")
+async def speech_to_text(file: UploadFile = File(...), language: Optional[str] = None):
+    """Transcribe the uploaded audio file and report the detected language."""
+    if whisper_model is None:
+        raise HTTPException(status_code=503, detail="Whisper model not loaded")
+
+    # Keep only the extension of the client-supplied filename: embedding the
+    # whole name in a path would let "/" or ".." segments leave the temp dir.
+    suffix = Path(file.filename or "").suffix
+    fd, filepath = tempfile.mkstemp(suffix=suffix)
+    try:
+        with os.fdopen(fd, "wb") as buffer:
+            buffer.write(await file.read())
+
+        segments, info = whisper_model.transcribe(
+            filepath,
+            language=language,
+            beam_size=5,
+            vad_filter=True,
+            vad_parameters=dict(min_silence_duration_ms=500),
+        )
+
+        # `segments` is a lazy generator: decoding runs while it is consumed,
+        # so the file must still exist here.
+        transcription = " ".join(segment.text for segment in segments)
+    except Exception as e:
+        logger.exception("Transcription failed")
+        raise HTTPException(status_code=500, detail=str(e)) from e
+    finally:
+        # Remove the upload whether or not transcription succeeded
+        Path(filepath).unlink(missing_ok=True)
+
+    return {
+        "text": transcription.strip(),
+        "language": info.language,
+        "language_probability": info.language_probability,
+    }
+
+
+@app.get("/health")
+async def health_check():
+    """Report whether both models have been loaded."""
+    return {
+        "status": "healthy",
+        "tts_loaded": tts_model is not None,
+        "whisper_loaded": whisper_model is not None,
+    }
+
+
+if __name__ == "__main__":
+    import uvicorn
+
+    uvicorn.run(app, host="0.0.0.0", port=8000)
diff --git a/ai/tts-stt/requirements.txt b/ai/tts-stt/requirements.txt
new file mode 100644
index 0000000..6b0a7e3
--- /dev/null
+++ b/ai/tts-stt/requirements.txt
@@ -0,0 +1,11 @@
+# ROCm builds of torch and torchaudio are published on the PyTorch wheel index
+--extra-index-url https://download.pytorch.org/whl/rocm6.0
+fastapi==0.104.1
+uvicorn[standard]==0.24.0
+TTS==0.22.0
+faster-whisper==1.0.3
+python-multipart==0.0.6
+# torch 2.3.0 was built against NumPy 1.x
+numpy<2
+torch==2.3.0+rocm6.0
+torchaudio==2.3.0+rocm6.0