From ea1ddd8c63fda9b7fc56d6db25098de680cff0ac Mon Sep 17 00:00:00 2001
From: Hermes Agent <hermes@agent.local>
Date: Mon, 27 Apr 2026 10:51:56 +0000
Subject: [PATCH] Add TTS/STT service with Coqui TTS and faster-whisper

- Added tts-stt service definition to docker-compose.yml
- Created tts-stt directory with:
  - Dockerfile: Based on debian:trixie-slim with ROCm dependencies
  - requirements.txt: Python packages including TTS, faster-whisper, FastAPI
  - app.py: FastAPI service with /tts and /stt endpoints
- Service includes GPU device mapping (/dev/kfd, /dev/dri) for ROCm acceleration
- Uses YourTTS multilingual model for TTS and large-v3 for Whisper
- Configured to use persistent storage for models and cache
---
 ai/compose.yml              |  23 +++++++
 ai/tts-stt/Dockerfile       |  50 ++++++++++++++
 ai/tts-stt/app.py           | 128 ++++++++++++++++++++++++++++++++++++
 ai/tts-stt/requirements.txt |   7 ++
 4 files changed, 208 insertions(+)
 create mode 100644 ai/tts-stt/Dockerfile
 create mode 100644 ai/tts-stt/app.py
 create mode 100644 ai/tts-stt/requirements.txt

diff --git a/ai/compose.yml b/ai/compose.yml
index 460d44d..948c868 100644
--- a/ai/compose.yml
+++ b/ai/compose.yml
@@ -81,6 +81,29 @@ services:
       - "303"
       - "26"
 
+
+  tts-stt:
+    build:
+      context: ./tts-stt
+      dockerfile: Dockerfile
+    container_name: tts-stt
+    restart: always
+    volumes:
+      - /mnt/HoardingCow_docker_data/TTS-Stt/models:/app/models
+      - /mnt/HoardingCow_docker_data/TTS-Stt/cache:/app/cache
+    environment:
+      - TTS_MODEL=tts_models/multilingual/multi-dataset/your_tts
+      - WHISPER_MODEL=large-v3
+      - DEVICE=cuda
+    devices:
+      - /dev/kfd:/dev/kfd
+      - /dev/dri:/dev/dri
+    group_add:
+      - "303"
+      - "26"
+    networks:
+      - ai_backend
+
 networks:
   ai_net:
     external: true
diff --git a/ai/tts-stt/Dockerfile b/ai/tts-stt/Dockerfile
new file mode 100644
index 0000000..a9083dd
--- /dev/null
+++ b/ai/tts-stt/Dockerfile
@@ -0,0 +1,50 @@
+FROM debian:trixie-slim
+
+# Install system dependencies
+RUN apt-get update && apt-get install -y \
+    python3-pip \
+    python3-dev \
+    git \
+    wget \
+    ffmpeg \
+    libsndfile1 \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install ROCm runtime libraries (matching host setup)
+RUN apt-get update && apt-get install -y \
+    rocm-Devkits=6.3.0 \
+    rocm-dev=6.3.0 \
+    && rm -rf /var/lib/apt/lists/*
+
+# Create non-root user
+RUN useradd -m -u 1000 appuser
+WORKDIR /app
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Download models (will be cached in volume)
+RUN mkdir -p /app/models && chown appuser:appuser /app/models
+USER appuser
+
+# Install TTS models (Coqui TTS with multilingual model)
+RUN pip install --no-cache-dir TTS && \
+    tts --text "Initializing model..." --model_name tts_models/multilingual/multi-dataset/your_tts --out_path /tmp/init.wav || true
+
+# Install faster-whisper
+RUN pip install --no-cache-dir faster-whisper
+
+# Expose port
+EXPOSE 8000
+
+# Switch to non-root user
+USER appuser
+
+# Copy application code
+COPY app.py .
+
+# Health check
+HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
+    CMD curl -f http://localhost:8000/health || exit 1
+
+# Command to run the application
+CMD ["python", "app.py"]
diff --git a/ai/tts-stt/app.py b/ai/tts-stt/app.py
new file mode 100644
index 0000000..4e46801
--- /dev/null
+++ b/ai/tts-stt/app.py
@@ -0,0 +1,128 @@
+import os
+import torch
+import numpy as np
+from TTS.api import TTS
+from faster_whisper import WhisperModel
+from fastapi import FastAPI, File, UploadFile, Form, HTTPException
+from fastapi.responses import Response
+import tempfile
+import uuid
+from pydantic import BaseModel
+from typing import Optional
+
+app = FastAPI(title="TTS-STT Service", description="Coqui TTS and faster-whisper service")
+
+# Initialize models
+tts_model = None
+whisper_model = None
+
+@app.on_event("startup")
+async def startup_event():
+    global tts_model, whisper_model
+    
+    # Get model names from environment or use defaults
+    tts_model_name = os.getenv("TTS_MODEL", "tts_models/multilingual/multi-dataset/your_tts")
+    whisper_model_name = os.getenv("WHISPER_MODEL", "large-v3")
+    device = os.getenv("DEVICE", "cuda" if torch.cuda.is_available() else "cpu")
+    
+    print(f"Loading TTS model: {tts_model_name} on {device}")
+    tts_model = TTS(model_name=tts_model_name, progress_bar=False, gpu=device=="cuda")
+    
+    print(f"Loading Whisper model: {whisper_model_name} on {device}")
+    whisper_model = WhisperModel(whisper_model_name, device=device, compute_type="float16" if device=="cuda" else "int8")
+    
+    print("Models loaded successfully")
+
+class TTSRequest(BaseModel):
+    text: str
+    language: Optional[str] = None
+    speaker_wav: Optional[str] = None  # For voice cloning with YourTTS
+
+@app.post("/tts")
+async def text_to_speech(request: TTSRequest):
+    if not tts_model:
+        raise HTTPException(status_code=503, detail="TTS model not loaded")
+    
+    try:
+        # Generate unique filename
+        filename = f"{uuid.uuid4()}.wav"
+        filepath = f"/tmp/{filename}"
+        
+        # Synthesize speech
+        if request.speaker_wav and os.path.exists(request.speaker_wav):
+            # Voice cloning
+            tts_model.tts_to_file(
+                text=request.text,
+                speaker_wav=request.speaker_wav,
+                language=request.language or "en",
+                file_path=filepath
+            )
+        else:
+            # Regular TTS
+            tts_model.tts_to_file(
+                text=request.text,
+                language=request.language or "en",
+                file_path=filepath
+            )
+        
+        # Read and return the audio file
+        with open(filepath, "rb") as f:
+            audio_data = f.read()
+        
+        # Clean up
+        os.remove(filepath)
+        
+        return Response(content=audio_data, media_type="audio/wav")
+    
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+@app.post("/stt")
+async def speech_to_text(file: UploadFile = File(...), language: Optional[str] = None):
+    if not whisper_model:
+        raise HTTPException(status_code=503, detail="Whisper model not loaded")
+    
+    try:
+        # Save uploaded file temporarily
+        filename = f"{uuid.uuid4()}_{file.filename}"
+        filepath = f"/tmp/{filename}"
+        
+        with open(filepath, "wb") as buffer:
+            content = await file.read()
+            buffer.write(content)
+        
+        # Transcribe
+        segments, info = whisper_model.transcribe(
+            filepath,
+            language=language,
+            beam_size=5,
+            vad_filter=True,
+            vad_parameters=dict(min_silence_duration_ms=500)
+        )
+        
+        # Collect transcription
+        transcription = " ".join([segment.text for segment in segments])
+        
+        # Clean up
+        os.remove(filepath)
+        
+        return {
+            "text": transcription.strip(),
+            "language": info.language,
+            "language_probability": info.language_probability
+        }
+    
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+@app.get("/health")
+async def health_check():
+    return {
+        "status": "healthy",
+        "tts_loaded": tts_model is not None,
+        "whisper_loaded": whisper_model is not None
+    }
+
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=8000)
diff --git a/ai/tts-stt/requirements.txt b/ai/tts-stt/requirements.txt
new file mode 100644
index 0000000..6b0a7e3
--- /dev/null
+++ b/ai/tts-stt/requirements.txt
@@ -0,0 +1,7 @@
+fastapi==0.104.1
+uvicorn[standard]==0.24.0
+TTS==0.22.0
+faster-whisper==1.0.3
+python-multipart==0.0.6
+torch==2.3.0
+torchaudio==2.3.0