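"""Diagnostic routes for the Ollama integration: connection ping, configuration
inspection, model listing, GPU status, and a simple performance test."""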
from flask import Blueprint, jsonify
from config import Config
from services.ollama_service import call_ollama, list_ollama_models
import os

ollama_bp = Blueprint('ollama', __name__)

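# Example usage for the ping check below (assumes the Flask app serving this
# blueprint listens on http://localhost:5000; adjust host/port to your setup):
#   curl http://localhost:5000/api/ollama/ping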
@ollama_bp.route('/api/ollama/ping', methods=['GET', 'POST'])
def ping_ollama():
    """Test connection to Ollama server"""
    try:
        # Check if Ollama is enabled
        if not Config.OLLAMA_ENABLED:
            return jsonify({
                'status': 'disabled',
                'message': 'Ollama is not enabled. Set OLLAMA_ENABLED=true in your .env file.',
                'ollama_config': {
                    'base_url': Config.OLLAMA_BASE_URL,
                    'model': Config.OLLAMA_MODEL,
                    'enabled': False
                }
            }), 200

        # Send a simple test prompt
        test_prompt = "Say 'Hello! I am connected and working.' in one sentence."
        system_prompt = "You are a helpful assistant. Respond briefly and concisely."

        response_text, error_message = call_ollama(test_prompt, system_prompt)

        if response_text:
            return jsonify({
                'status': 'success',
                'message': 'Successfully connected to Ollama',
                'response': response_text,
                'ollama_config': {
                    'base_url': Config.OLLAMA_BASE_URL,
                    'model': Config.OLLAMA_MODEL,
                    'enabled': True
                }
            }), 200
        else:
            # Try to get available models for better error message
            available_models, _ = list_ollama_models()

            troubleshooting = {
                'check_server': f'Verify Ollama is running at {Config.OLLAMA_BASE_URL}',
                'check_model': f'Verify model "{Config.OLLAMA_MODEL}" is available (run: ollama list)',
                'test_connection': f'Test manually: curl {Config.OLLAMA_BASE_URL}/api/generate -d \'{{"model":"{Config.OLLAMA_MODEL}","prompt":"test"}}\''
            }

            if available_models:
                troubleshooting['available_models'] = available_models
                troubleshooting['suggestion'] = f'Try setting OLLAMA_MODEL to one of: {", ".join(available_models[:5])}'

            return jsonify({
                'status': 'error',
                'message': error_message or 'Failed to get response from Ollama',
                'error_details': error_message,
                'ollama_config': {
                    'base_url': Config.OLLAMA_BASE_URL,
                    'model': Config.OLLAMA_MODEL,
                    'enabled': True
                },
                'troubleshooting': troubleshooting
            }), 500

    except Exception as e:
        return jsonify({
            'status': 'error',
            'message': f'Error connecting to Ollama: {str(e)}',
            'ollama_config': {
                'base_url': Config.OLLAMA_BASE_URL,
                'model': Config.OLLAMA_MODEL,
                'enabled': Config.OLLAMA_ENABLED
            }
        }), 500

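# Example usage for the config inspection route below (same localhost:5000
# assumption); the response reports the resolved .env path, whether it exists,
# and the current working directory:
#   curl http://localhost:5000/api/ollama/config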
@ollama_bp.route('/api/ollama/config', methods=['GET'])
def get_ollama_config():
    """Get current Ollama configuration (for debugging)"""
    try:
        from pathlib import Path
        backend_dir = Path(__file__).parent.parent
        env_path = backend_dir / '.env'

        return jsonify({
            'ollama_config': {
                'base_url': Config.OLLAMA_BASE_URL,
                'model': Config.OLLAMA_MODEL,
                'enabled': Config.OLLAMA_ENABLED,
                'has_api_key': bool(Config.OLLAMA_API_KEY)
            },
            'env_file_path': str(env_path),
            'env_file_exists': env_path.exists(),
            'current_working_directory': os.getcwd()
        }), 200
    except Exception as e:
        return jsonify({
            'error': str(e),
            'ollama_config': {
                'base_url': Config.OLLAMA_BASE_URL,
                'model': Config.OLLAMA_MODEL,
                'enabled': Config.OLLAMA_ENABLED
            }
        }), 500

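# Example usage for the model listing route below (same localhost:5000
# assumption); compare the returned names against `ollama list` on the server:
#   curl http://localhost:5000/api/ollama/models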
@ollama_bp.route('/api/ollama/models', methods=['GET'])
def get_ollama_models():
    """List available models on Ollama server"""
    try:
        if not Config.OLLAMA_ENABLED:
            return jsonify({
                'status': 'disabled',
                'message': 'Ollama is not enabled. Set OLLAMA_ENABLED=true in your .env file.',
                'ollama_config': {
                    'base_url': Config.OLLAMA_BASE_URL,
                    'model': Config.OLLAMA_MODEL,
                    'enabled': False
                }
            }), 200

        models, error_message = list_ollama_models()

        if models is not None:
            return jsonify({
                'status': 'success',
                'models': models,
                'current_model': Config.OLLAMA_MODEL,
                'ollama_config': {
                    'base_url': Config.OLLAMA_BASE_URL,
                    'model': Config.OLLAMA_MODEL,
                    'enabled': True
                }
            }), 200
        else:
            return jsonify({
                'status': 'error',
                'message': error_message or 'Failed to list models',
                'ollama_config': {
                    'base_url': Config.OLLAMA_BASE_URL,
                    'model': Config.OLLAMA_MODEL,
                    'enabled': True
                }
            }), 500

    except Exception as e:
        return jsonify({
            'status': 'error',
            'message': f'Error listing models: {str(e)}',
            'ollama_config': {
                'base_url': Config.OLLAMA_BASE_URL,
                'model': Config.OLLAMA_MODEL,
                'enabled': Config.OLLAMA_ENABLED
            }
        }), 500

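# Example usage for the GPU status route below (same localhost:5000 assumption).
# Note that the Docker fallback in this route shells out to `docker exec` and
# `docker logs` against a container named munich-news-ollama, so the backend
# process needs access to the Docker CLI for that path to work:
#   curl http://localhost:5000/api/ollama/gpu-status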
@ollama_bp.route('/api/ollama/gpu-status', methods=['GET'])
def get_gpu_status():
    """Check if Ollama is using GPU acceleration"""
    import requests

    try:
        if not Config.OLLAMA_ENABLED:
            return jsonify({
                'status': 'disabled',
                'message': 'Ollama is not enabled',
                'gpu_available': False,
                'gpu_in_use': False
            }), 200

        # Get Ollama process info
        try:
            response = requests.get(
                f"{Config.OLLAMA_BASE_URL}/api/ps",
                timeout=5
            )

            if response.status_code == 200:
                ps_data = response.json()

                # Check if any models are loaded
                models_loaded = ps_data.get('models', [])

                gpu_info = {
                    'status': 'success',
                    'ollama_running': True,
                    'models_loaded': len(models_loaded),
                    'gpu_available': False,
                    'gpu_in_use': False,
                    'gpu_details': None
                }

                # Check for GPU usage in loaded models
                for model in models_loaded:
                    # Check various GPU indicators
                    gpu_layers = model.get('gpu_layers', 0)
                    details = model.get('details', {})

                    # Check if GPU is mentioned in any field
                    if (gpu_layers > 0 or
                            'gpu' in str(model).lower() or
                            'cuda' in str(model).lower() or
                            (details.get('families', []) and
                             'gpu' in str(details.get('families', [])).lower())):
                        gpu_info['gpu_in_use'] = True
                        gpu_info['gpu_available'] = True
                        gpu_info['gpu_details'] = {
                            'model': model.get('name', 'unknown'),
                            'gpu_layers': gpu_layers,
                            'size': model.get('size', 0),
                            'size_vram': model.get('size_vram', 0)
                        }
                        break

                # If no models loaded, check Docker container for GPU
                if not gpu_info['gpu_in_use']:
                    try:
                        import subprocess

                        # Check if nvidia-smi works in ollama container
                        result = subprocess.run(
                            ['docker', 'exec', 'munich-news-ollama', 'nvidia-smi',
                             '--query-gpu=name,memory.used,utilization.gpu', '--format=csv,noheader'],
                            capture_output=True,
                            text=True,
                            timeout=5
                        )
                        if result.returncode == 0 and result.stdout.strip():
                            gpu_data = result.stdout.strip().split(',')
                            gpu_info['gpu_available'] = True
                            gpu_info['gpu_details'] = {
                                'gpu_name': gpu_data[0].strip() if len(gpu_data) > 0 else 'Unknown',
                                'memory_used': gpu_data[1].strip() if len(gpu_data) > 1 else 'N/A',
                                'utilization': gpu_data[2].strip() if len(gpu_data) > 2 else 'N/A',
                                'note': 'GPU available but no model currently loaded'
                            }

                            # Check Ollama logs for GPU usage evidence
                            log_result = subprocess.run(
                                ['docker', 'logs', '--tail', '50', 'munich-news-ollama'],
                                capture_output=True,
                                text=True,
                                timeout=5
                            )
                            if log_result.returncode == 0:
                                logs = log_result.stdout + log_result.stderr
                                # Look for GPU offloading messages
                                if 'offloaded' in logs.lower() and 'gpu' in logs.lower():
                                    gpu_info['gpu_in_use'] = True
                                    gpu_info['gpu_details']['note'] = 'GPU acceleration active (detected in logs)'
                                    # Extract layer info if available
                                    import re
                                    match = re.search(r'offloaded (\d+)/(\d+) layers', logs, re.IGNORECASE)
                                    if match:
                                        gpu_info['gpu_details']['layers_offloaded'] = f"{match.group(1)}/{match.group(2)}"
                    except Exception as e:
                        gpu_info['debug_error'] = str(e)

                # Try to get system info
                try:
                    tags_response = requests.get(
                        f"{Config.OLLAMA_BASE_URL}/api/tags",
                        timeout=5
                    )
                    if tags_response.status_code == 200:
                        tags_data = tags_response.json()
                        gpu_info['available_models'] = [m.get('name') for m in tags_data.get('models', [])]
                except Exception:
                    pass

                # Add recommendation
                if not gpu_info['gpu_in_use']:
                    gpu_info['recommendation'] = (
                        "GPU not detected. To enable GPU acceleration:\n"
                        "1. Ensure NVIDIA GPU is available\n"
                        "2. Install nvidia-docker2\n"
                        "3. Use: docker-compose -f docker-compose.yml -f docker-compose.gpu.yml up -d\n"
                        "4. See docs/GPU_SETUP.md for details"
                    )
                else:
                    gpu_info['recommendation'] = "✓ GPU acceleration is active!"

                return jsonify(gpu_info), 200
            else:
                return jsonify({
                    'status': 'error',
                    'message': f'Ollama API returned status {response.status_code}',
                    'ollama_running': False,
                    'gpu_available': False,
                    'gpu_in_use': False
                }), 500

        except requests.exceptions.ConnectionError:
            return jsonify({
                'status': 'error',
                'message': f'Cannot connect to Ollama at {Config.OLLAMA_BASE_URL}',
                'ollama_running': False,
                'gpu_available': False,
                'gpu_in_use': False,
                'troubleshooting': {
                    'check_container': 'docker-compose ps ollama',
                    'check_logs': 'docker-compose logs ollama',
                    'restart': 'docker-compose restart ollama'
                }
            }), 500

    except Exception as e:
        return jsonify({
            'status': 'error',
            'message': f'Error checking GPU status: {str(e)}',
            'gpu_available': False,
            'gpu_in_use': False
        }), 500

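# Example usage for the performance test below (same localhost:5000 assumption);
# the reported thresholds are rough wall-clock heuristics for one short completion:
#   curl http://localhost:5000/api/ollama/test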
@ollama_bp.route('/api/ollama/test', methods=['GET'])
def test_ollama_performance():
    """Test Ollama performance and measure response time"""
    import time

    try:
        if not Config.OLLAMA_ENABLED:
            return jsonify({
                'status': 'disabled',
                'message': 'Ollama is not enabled'
            }), 200

        # Test prompt
        test_prompt = "Summarize this in 20 words: Munich is the capital of Bavaria, Germany. It is known for Oktoberfest, BMW, and beautiful architecture."

        start_time = time.time()
        response_text, error_message = call_ollama(test_prompt, "You are a helpful assistant.")
        duration = time.time() - start_time

        if response_text:
            # Estimate performance
            if duration < 5:
                performance = "Excellent (GPU likely active)"
            elif duration < 15:
                performance = "Good (GPU may be active)"
            elif duration < 30:
                performance = "Fair (CPU mode)"
            else:
                performance = "Slow (CPU mode, consider GPU)"

            return jsonify({
                'status': 'success',
                'response': response_text,
                'duration_seconds': round(duration, 2),
                'performance': performance,
                'model': Config.OLLAMA_MODEL,
                'recommendation': (
                    "GPU acceleration recommended" if duration > 15
                    else "Performance is good"
                )
            }), 200
        else:
            return jsonify({
                'status': 'error',
                'message': error_message or 'Failed to get response',
                'duration_seconds': round(duration, 2)
            }), 500

    except Exception as e:
        return jsonify({
            'status': 'error',
            'message': f'Error testing Ollama: {str(e)}'
        }), 500