Add TTS/STT service with Coqui TTS and faster-whisper

- Added the tts-stt service definition to compose.yml.
- Created the tts-stt directory with:
  - Dockerfile: based on debian:trixie-slim with ROCm dependencies.
  - requirements.txt: Python packages including TTS, faster-whisper and FastAPI.
  - app.py: FastAPI service with /tts and /stt endpoints.
- The service includes GPU device mappings (/dev/kfd, /dev/dri) for ROCm acceleration.
- Uses the YourTTS multilingual model for TTS and large-v3 for Whisper.
- Configured to use persistent storage for models and cache.
2026-04-27 10:51:56 +00:00
parent fb0f2cbe84
commit ea1ddd8c63
4 changed files with 208 additions and 0 deletions
--- a/ai/compose.yml
+++ b/ai/compose.yml
@@ -81,6 +81,29 @@ services:
      - "303"
      - "26"
  # Text-to-speech / speech-to-text API, built locally from ./tts-stt.
  # No `ports:` entry is declared here, so the service is only addressed
  # through the network(s) listed at the bottom of this definition.
  tts-stt:
    build:
      context: ./tts-stt
      dockerfile: Dockerfile
    container_name: tts-stt
    restart: always
    # Host directories bind-mounted into the container for models and caches.
    # NOTE(review): nothing in this service definition tells the libraries to
    # download into /app/models or /app/cache - confirm the image or app sets
    # the corresponding paths, otherwise these mounts stay empty.
    volumes:
      - /mnt/HoardingCow_docker_data/TTS-Stt/models:/app/models
      - /mnt/HoardingCow_docker_data/TTS-Stt/cache:/app/cache
    # Read by app.py at startup to pick the models and the compute device.
    environment:
      - TTS_MODEL=tts_models/multilingual/multi-dataset/your_tts
      - WHISPER_MODEL=large-v3
      - DEVICE=cuda
    # GPU device nodes passed through for ROCm acceleration.
    devices:
      - /dev/kfd:/dev/kfd
      - /dev/dri:/dev/dri
    # Supplementary numeric group IDs for the container process.
    # NOTE(review): presumably the host's render/video groups that own the
    # device nodes above - verify the IDs against the host.
    group_add:
      - "303"
      - "26"
    # NOTE(review): the top-level `networks:` section visible in this file
    # declares `ai_net`; confirm `ai_backend` is declared as well.
    networks:
      - ai_backend
 networks:
  ai_net:
    external: true
--- a/ai/tts-stt/Dockerfile
+++ b/ai/tts-stt/Dockerfile
@@ -0,0 +1,50 @@
 # Image for the FastAPI TTS/STT service (Coqui TTS + faster-whisper).
 # NOTE(review): confirm that the versions pinned in requirements.txt publish
 # wheels for the Python version shipped by this base image.
 FROM debian:trixie-slim
 # System dependencies.
 #  - python3-venv: Debian marks the system interpreter as externally managed
 #    (PEP 668), so Python packages are installed into a virtual environment.
 #  - curl: required by the HEALTHCHECK at the bottom of this file.
 RUN apt-get update && apt-get install -y \
    python3-pip \
    python3-venv \
    python3-dev \
    git \
    wget \
    curl \
    ffmpeg \
    libsndfile1 \
    && rm -rf /var/lib/apt/lists/*
 # Install ROCm runtime libraries (matching host setup)
 # NOTE(review): no extra apt source is configured in this file and apt package
 # names are normally lowercase - confirm these packages resolve as written.
 RUN apt-get update && apt-get install -y \
    rocm-Devkits=6.3.0 \
    rocm-dev=6.3.0 \
    && rm -rf /var/lib/apt/lists/*
 # Create non-root user
 RUN useradd -m -u 1000 appuser
 WORKDIR /app
 # Virtual environment for all Python packages. Putting it first on PATH makes
 # `python` and `pip` resolve to the venv for every later instruction and for
 # the final CMD.
 ENV VIRTUAL_ENV=/opt/venv
 ENV PATH="/opt/venv/bin:${PATH}"
 RUN python3 -m venv /opt/venv
 # TTS and faster-whisper are pinned in requirements.txt, so they are installed
 # exactly once here rather than again, unpinned, in later layers.
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
 # Send model downloads to the directories intended for persistent volumes:
 # Coqui TTS honours TTS_HOME and the Hugging Face hub (used by faster-whisper)
 # honours HF_HOME. Models are fetched on first start; anything baked into
 # these paths at build time would be hidden by the volume mounts anyway.
 ENV TTS_HOME=/app/models \
    HF_HOME=/app/cache
 RUN mkdir -p /app/models /app/cache \
    && chown appuser:appuser /app/models /app/cache
 # Expose port
 EXPOSE 8000
 # Copy application code
 COPY app.py .
 # Drop privileges for runtime
 USER appuser
 # Health check. The server only starts answering once both models are loaded,
 # so allow a generous start period before failures count.
 HEALTHCHECK --interval=30s --timeout=10s --start-period=300s --retries=3 \
    CMD curl -f http://localhost:8000/health || exit 1
 # Command to run the application (`python` is the venv interpreter)
 CMD ["python", "app.py"]
--- a/ai/tts-stt/app.py
+++ b/ai/tts-stt/app.py
@@ -0,0 +1,128 @@
 import os
 import torch
 import numpy as np
 from TTS.api import TTS
 from faster_whisper import WhisperModel
 from fastapi import FastAPI, File, UploadFile, Form, HTTPException
 from fastapi.responses import Response
 import tempfile
 import uuid
 from pydantic import BaseModel
 from typing import Optional
 # FastAPI application object that the route decorators in this module attach to.
 app = FastAPI(
     title="TTS-STT Service",
     description="Coqui TTS and faster-whisper service",
 )
 # Model handles; both stay None until the startup hook has loaded them.
 tts_model = whisper_model = None
@app.on_event("startup")
async def startup_event():
    """Load the TTS and Whisper models once when the server starts.

    Configuration comes from the environment:

    * ``TTS_MODEL`` - Coqui TTS model name.
    * ``WHISPER_MODEL`` - faster-whisper model name.
    * ``DEVICE`` - ``cuda`` or ``cpu``; auto-detected via torch when unset
      or empty.

    If ``cuda`` is requested but torch reports no usable GPU, the models are
    loaded on the CPU instead of crashing at startup.
    """
    global tts_model, whisper_model
    tts_model_name = os.getenv("TTS_MODEL", "tts_models/multilingual/multi-dataset/your_tts")
    whisper_model_name = os.getenv("WHISPER_MODEL", "large-v3")

    gpu_available = torch.cuda.is_available()
    # `or` (rather than a getenv default) also covers DEVICE being set but empty.
    device = os.getenv("DEVICE") or ("cuda" if gpu_available else "cpu")
    device = device.strip().lower()
    if device == "cuda" and not gpu_available:
        # A forced DEVICE=cuda would otherwise make model loading raise and,
        # with a restart policy, put the container into a restart loop.
        print("DEVICE=cuda requested but torch sees no GPU; falling back to cpu")
        device = "cpu"
    use_gpu = device == "cuda"

    print(f"Loading TTS model: {tts_model_name} on {device}")
    tts_model = TTS(model_name=tts_model_name, progress_bar=False, gpu=use_gpu)
    print(f"Loading Whisper model: {whisper_model_name} on {device}")
    whisper_model = WhisperModel(
        whisper_model_name,
        device=device,
        # Half precision on GPU, 8-bit integers on CPU.
        compute_type="float16" if use_gpu else "int8",
    )
    print("Models loaded successfully")
 # JSON request body accepted by the POST /tts endpoint.
 class TTSRequest(BaseModel):
    # Text to synthesize (required).
    text: str
    # Language code passed to the TTS model; the /tts handler substitutes
    # "en" when this is omitted.
    language: Optional[str] = None
    # Path to a reference recording. The /tts handler checks it with
    # os.path.exists, so it must name a file on the server's filesystem.
    speaker_wav: Optional[str] = None  # For voice cloning with YourTTS
@app.post("/tts")
async def text_to_speech(request: TTSRequest):
    """Synthesize ``request.text`` and return the audio as a WAV response.

    Args:
        request: Text to speak, plus an optional language code and an
            optional server-side path to a reference recording for voice
            cloning.

    Returns:
        A ``Response`` with media type ``audio/wav``.

    Raises:
        HTTPException: 503 if the TTS model is not loaded yet, 500 if
            synthesis fails.
    """
    if tts_model is None:
        raise HTTPException(status_code=503, detail="TTS model not loaded")

    synth_kwargs = {"text": request.text}

    # Coqui rejects `language` for single-language models and requires it for
    # multilingual ones. Default to passing it when the attribute is missing.
    if getattr(tts_model, "is_multi_lingual", True):
        synth_kwargs["language"] = request.language or "en"

    # NOTE(review): speaker_wav is a server-side path chosen by the client;
    # consider restricting it to a dedicated directory.
    if request.speaker_wav and os.path.exists(request.speaker_wav):
        # Voice cloning from the reference recording.
        synth_kwargs["speaker_wav"] = request.speaker_wav
    elif getattr(tts_model, "is_multi_speaker", False):
        # Multi-speaker models (such as YourTTS) refuse to synthesize without
        # a speaker or a reference recording, so use the first built-in voice.
        speakers = getattr(tts_model, "speakers", None) or []
        if speakers:
            synth_kwargs["speaker"] = speakers[0]

    try:
        # The temporary directory is removed on every exit path, so a failed
        # synthesis no longer leaves a stray WAV file behind.
        with tempfile.TemporaryDirectory() as workdir:
            filepath = os.path.join(workdir, f"{uuid.uuid4()}.wav")
            tts_model.tts_to_file(file_path=filepath, **synth_kwargs)
            with open(filepath, "rb") as f:
                audio_data = f.read()
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e

    return Response(content=audio_data, media_type="audio/wav")
@app.post("/stt")
async def speech_to_text(file: UploadFile = File(...), language: Optional[str] = None):
    """Transcribe an uploaded audio file with the Whisper model.

    Args:
        file: Uploaded audio file (multipart form field ``file``).
        language: Optional language code handed to the transcriber. It is
            declared as a plain parameter, so it is read from the query
            string rather than from the form body.

    Returns:
        A dict with the transcribed ``text``, the ``language`` reported by
        the model and its ``language_probability``.

    Raises:
        HTTPException: 503 if the Whisper model is not loaded yet, 500 if
            saving or transcribing the upload fails.
    """
    if whisper_model is None:
        raise HTTPException(status_code=503, detail="Whisper model not loaded")

    # The client controls `file.filename` (it may be missing or contain path
    # separators), so only its extension is reused for the temporary file.
    suffix = os.path.splitext(os.path.basename(file.filename or ""))[1]

    try:
        # The temporary directory is removed on every exit path, so a failed
        # transcription no longer leaves the upload behind.
        with tempfile.TemporaryDirectory() as workdir:
            filepath = os.path.join(workdir, f"{uuid.uuid4()}{suffix}")
            with open(filepath, "wb") as buffer:
                buffer.write(await file.read())

            segments, info = whisper_model.transcribe(
                filepath,
                language=language,
                beam_size=5,
                vad_filter=True,
                vad_parameters=dict(min_silence_duration_ms=500),
            )
            # `segments` is produced lazily, so it has to be consumed while
            # the audio file still exists.
            transcription = " ".join(segment.text for segment in segments)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e

    return {
        "text": transcription.strip(),
        "language": info.language,
        "language_probability": info.language_probability,
    }
@app.get("/health")
async def health_check():
    # Always reports "healthy"; the two flags show whether each model handle
    # has been populated yet.
    models_ready = {
        "tts_loaded": tts_model is not None,
        "whisper_loaded": whisper_model is not None,
    }
    return {"status": "healthy", **models_ready}
 if __name__ == "__main__":
     # Script entry point: serve the app on all interfaces, port 8000.
     import uvicorn

     uvicorn.run(
         app=app,
         host="0.0.0.0",
         port=8000,
     )
--- a/ai/tts-stt/requirements.txt
+++ b/ai/tts-stt/requirements.txt
@@ -0,0 +1,7 @@
 # Web framework and ASGI server
 fastapi==0.104.1
 uvicorn[standard]==0.24.0
 # Speech models: Coqui TTS (text-to-speech) and faster-whisper (speech-to-text)
 TTS==0.22.0
 faster-whisper==1.0.3
 # NOTE(review): presumably needed by FastAPI for the multipart upload on /stt - confirm
 python-multipart==0.0.6
 # NOTE(review): the Dockerfile targets ROCm - confirm that these pins resolve
 # to a ROCm-enabled build from the package index in use.
 torch==2.3.0
 torchaudio==2.3.0