From 96e77c5ef2def7eac6525a92ff97a2262f2828d1 Mon Sep 17 00:00:00 2001 From: Hermes Date: Sat, 9 May 2026 20:19:26 +0000 Subject: [PATCH] Revert "feat: add ai-optimizer benchmark plan and state tracking for ollama GPU benchmarking" This reverts commit ff7303cf6ae8ccba95635342254fd3982fe8f03d. --- assets/ai-optimizer/README.md | 194 -------------------------------- assets/ai-optimizer/results.csv | 1 - assets/ai-optimizer/state.json | 21 ---- 3 files changed, 216 deletions(-) delete mode 100644 assets/ai-optimizer/README.md delete mode 100644 assets/ai-optimizer/results.csv delete mode 100644 assets/ai-optimizer/state.json diff --git a/assets/ai-optimizer/README.md b/assets/ai-optimizer/README.md deleted file mode 100644 index cde9392..0000000 --- a/assets/ai-optimizer/README.md +++ /dev/null @@ -1,194 +0,0 @@ -# AI Model Optimizer - Ollama GPU Benchmark Plan - -**Purpose:** Find optimal ollama configurations for maximum context size and GPU utilization on AMD MI50 GPUs. - -**Hardware:** -- 2x AMD MI50 GPUs (32GB VRAM each, 64GB total) -- 128GB system RAM -- ROCm: `HSA_OVERRIDE_GFX_VERSION=9.0.6`, `HIP_VISIBLE_DEVICES=0,1` - ---- - -## File Locations - -``` -STATE: /opt/data/infra/assets/ai-optimizer/state.json -RESULTS: /opt/data/infra/assets/ai-optimizer/results.csv -REPO: /opt/data/infra (persistent clone) -``` - ---- - -## Model Queues - -### GPU Track (Coding - prioritize speed + context on GPU) -1. `deepseek-coder-v2:16b` - Best coding model, fits on GPU -2. `qwen2.5-coder:32b` - Alternative coding model -3. `codellama:34b-instruct` - Legacy option - -### RAM Track (Knowledge - prioritize max context) -1. `qwen2.5:72b` - Large knowledge model -2. `nemotron-3-nano:30b` - Efficient large model -3. 
`mixtral:8x7b-instruct` - MoE architecture - ---- - -## Context Steps (in order) - -``` -[32768, 65536, 98304, 131072, 163840, 200704, 262144, 327680] -``` - ---- - -## Optimization Strategy - -### GPU Track (Coding) -- Start: `num_ctx=32768`, `num_gpu=99`, `flash_attn=true` -- Increase context until OOM or tokens/sec < 5 -- Record best config before hitting wall -- Target: >10 tokens/sec with max context - -### RAM Track (Knowledge) -- Start: `num_ctx=65536`, `num_gpu=50`, `flash_attn=true` -- Allow heavy RAM offload (up to 100GB system RAM) -- Increase context until OOM -- Speed secondary to context size - ---- - -## Prerequisites - -This PR adds the `ai-worker` user with docker group access. After merge: - -```bash -# SSH from Hermes container to run benchmarks on the host -ssh -i /path/to/key ai-worker@host docker exec ollama ollama list - -# Or if running directly on host -docker exec ollama ollama list -``` - ---- - -## Manual Testing Workflow - -### 1. Quick Model Test - -```bash -docker exec ollama ollama run <model>:<tag> "Your prompt here" -``` - -### 2. Check Current State - -```bash -cd /opt/data/infra -cat assets/ai-optimizer/state.json -``` - -### 3. Pull Model (if needed) - -```bash -docker exec ollama ollama pull <model>:<tag> -``` - -### 4. Create Test Modelfile - -```bash -docker exec ollama bash -c "cat <<EOF > /root/.ollama/test_${model}.modelfile -FROM ${model} -PARAMETER num_ctx ${num_ctx} -PARAMETER num_gpu ${num_gpu} -PARAMETER flash_attn true -PARAMETER num_predict 4096 -PARAMETER num_keep 1024 -PARAMETER repeat_penalty 1.1 -EOF" - -docker exec ollama ollama create test-model -f /root/.ollama/test_${model}.modelfile -``` - -### 5. Run Benchmark - -```bash -# Warm up -docker exec ollama ollama run test-model "Hello" > /dev/null - -# Coding prompt -docker exec ollama ollama run test-model "Write a Python async context manager that retries a function with exponential backoff, max 5 retries, and logs each attempt using structlog. Include type hints." 
- -# Knowledge prompt -docker exec ollama ollama run test-model "Explain the complete memory hierarchy in modern GPUs, from registers through L1/L2 caches to VRAM, and how data moves between them during matrix multiplication." -``` - -### 6. Measure VRAM - -```bash -# Try host first -rocm-smi --showmeminfo vram 2>/dev/null || \ -# Try via docker -docker exec --privileged ollama rocm-smi --showmeminfo vram 2>/dev/null || \ -echo "VRAM unavailable" -``` - -### 7. Record Results - -Update `state.json` and append to `results.csv`: -- tokens/sec from ollama output -- VRAM/RAM usage -- Whether this config is the new best - -### 8. Commit Changes - -```bash -cd /opt/data/infra -git add assets/ai-optimizer/ -git commit -m "ai-optimizer: tested ${model} at ${num_ctx} ctx - ${status}" -git push -``` - ---- - -## State File Structure - -```json -{ - "track": "gpu", - "current_model": "deepseek-coder-v2:16b", - "model_index": 0, - "phase": "context_scaling", - "backend": "ollama", - "current_config": { - "num_ctx": 32768, - "num_gpu": 99, - "flash_attn": true - }, - "best_configs": { - "gpu": {}, - "ram": {} - }, - "completed_models": [], - "gpu_queue": ["deepseek-coder-v2:16b", "qwen2.5-coder:32b", "codellama:34b-instruct"], - "ram_queue": ["qwen2.5:72b", "nemotron-3-nano:30b", "mixtral:8x7b-instruct"], - "context_steps": [32768, 65536, 98304, 131072, 163840, 200704, 262144, 327680], - "last_updated": "2026-04-30T00:00:00Z" -} -``` - ---- - -## Results CSV Format - -```csv -timestamp,track,model,backend,phase,num_ctx,num_gpu,flash_attn,tokens_per_sec,vram_gb,ram_gb,status,is_best -``` - ---- - -## Notes - -- **Manual execution** - Run benchmarks when needed, no automated cron job -- **Two tracks**: Complete GPU track first (coding models), then RAM track -- **Backend**: ollama (llama.cpp optional for advanced users) -- **Host access**: Use docker exec (or SSH via ai-worker) for rocm-smi -- **Commit results**: Push best configs to repo for reference diff --git 
a/assets/ai-optimizer/results.csv b/assets/ai-optimizer/results.csv deleted file mode 100644 index 7e25194..0000000 --- a/assets/ai-optimizer/results.csv +++ /dev/null @@ -1 +0,0 @@ -timestamp,track,model,backend,phase,num_ctx,num_gpu,flash_attn,tokens_per_sec,vram_gb,ram_gb,status,is_best diff --git a/assets/ai-optimizer/state.json b/assets/ai-optimizer/state.json deleted file mode 100644 index 08dac90..0000000 --- a/assets/ai-optimizer/state.json +++ /dev/null @@ -1,21 +0,0 @@ -{ - "track": "gpu", - "current_model": "deepseek-coder-v2:16b", - "model_index": 0, - "phase": "context_scaling", - "backend": "ollama", - "current_config": { - "num_ctx": 32768, - "num_gpu": 99, - "flash_attn": true - }, - "best_configs": { - "gpu": {}, - "ram": {} - }, - "completed_models": [], - "gpu_queue": ["deepseek-coder-v2:16b", "qwen2.5-coder:32b", "codellama:34b-instruct"], - "ram_queue": ["qwen2.5:72b", "nemotron-3-nano:30b", "mixtral:8x7b-instruct"], - "context_steps": [32768, 65536, 98304, 131072, 163840, 200704, 262144, 327680], - "last_updated": "2026-05-09T00:00:00Z" -}