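"""Diagnostic routes for the Ollama integration: connection ping, configuration
inspection, model listing, GPU status, and a simple performance test."""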
from flask import Blueprint, jsonify
from config import Config
from services.ollama_service import call_ollama, list_ollama_models
import os

ollama_bp = Blueprint('ollama', __name__)

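# Example usage for the ping check below (assumes the Flask app serving this
# blueprint listens on http://localhost:5000; adjust host/port to your setup):
#   curl http://localhost:5000/api/ollama/ping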
@ollama_bp.route('/api/ollama/ping', methods=['GET', 'POST'])
def ping_ollama():
    """Test connection to Ollama server"""
    try:
        # Check if Ollama is enabled
        if not Config.OLLAMA_ENABLED:
            return jsonify({
                'status': 'disabled',
                'message': 'Ollama is not enabled. Set OLLAMA_ENABLED=true in your .env file.',
                'ollama_config': {
                    'base_url': Config.OLLAMA_BASE_URL,
                    'model': Config.OLLAMA_MODEL,
                    'enabled': False
                }
            }), 200

        # Send a simple test prompt
        test_prompt = "Say 'Hello! I am connected and working.' in one sentence."
        system_prompt = "You are a helpful assistant. Respond briefly and concisely."

        response_text, error_message = call_ollama(test_prompt, system_prompt)

        if response_text:
            return jsonify({
                'status': 'success',
                'message': 'Successfully connected to Ollama',
                'response': response_text,
                'ollama_config': {
                    'base_url': Config.OLLAMA_BASE_URL,
                    'model': Config.OLLAMA_MODEL,
                    'enabled': True
                }
            }), 200
        else:
            # Try to get available models for better error message
            available_models, _ = list_ollama_models()

            troubleshooting = {
                'check_server': f'Verify Ollama is running at {Config.OLLAMA_BASE_URL}',
                'check_model': f'Verify model "{Config.OLLAMA_MODEL}" is available (run: ollama list)',
                'test_connection': f'Test manually: curl {Config.OLLAMA_BASE_URL}/api/generate -d \'{{"model":"{Config.OLLAMA_MODEL}","prompt":"test"}}\''
            }

            if available_models:
                troubleshooting['available_models'] = available_models
                troubleshooting['suggestion'] = f'Try setting OLLAMA_MODEL to one of: {", ".join(available_models[:5])}'

            return jsonify({
                'status': 'error',
                'message': error_message or 'Failed to get response from Ollama',
                'error_details': error_message,
                'ollama_config': {
                    'base_url': Config.OLLAMA_BASE_URL,
                    'model': Config.OLLAMA_MODEL,
                    'enabled': True
                },
                'troubleshooting': troubleshooting
            }), 500

    except Exception as e:
        return jsonify({
            'status': 'error',
            'message': f'Error connecting to Ollama: {str(e)}',
            'ollama_config': {
                'base_url': Config.OLLAMA_BASE_URL,
                'model': Config.OLLAMA_MODEL,
                'enabled': Config.OLLAMA_ENABLED
            }
        }), 500

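# Example usage for the config inspection route below (same localhost:5000
# assumption); the response reports the resolved .env path, whether it exists,
# and the current working directory:
#   curl http://localhost:5000/api/ollama/config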
@ollama_bp.route('/api/ollama/config', methods=['GET'])
def get_ollama_config():
    """Get current Ollama configuration (for debugging)"""
    try:
        from pathlib import Path
        backend_dir = Path(__file__).parent.parent
        env_path = backend_dir / '.env'

        return jsonify({
            'ollama_config': {
                'base_url': Config.OLLAMA_BASE_URL,
                'model': Config.OLLAMA_MODEL,
                'enabled': Config.OLLAMA_ENABLED,
                'has_api_key': bool(Config.OLLAMA_API_KEY)
            },
            'env_file_path': str(env_path),
            'env_file_exists': env_path.exists(),
            'current_working_directory': os.getcwd()
        }), 200
    except Exception as e:
        return jsonify({
            'error': str(e),
            'ollama_config': {
                'base_url': Config.OLLAMA_BASE_URL,
                'model': Config.OLLAMA_MODEL,
                'enabled': Config.OLLAMA_ENABLED
            }
        }), 500

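# Example usage for the model listing route below (same localhost:5000
# assumption); compare the returned names against `ollama list` on the server:
#   curl http://localhost:5000/api/ollama/models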
@ollama_bp.route('/api/ollama/models', methods=['GET'])
def get_ollama_models():
    """List available models on Ollama server"""
    try:
        if not Config.OLLAMA_ENABLED:
            return jsonify({
                'status': 'disabled',
                'message': 'Ollama is not enabled. Set OLLAMA_ENABLED=true in your .env file.',
                'ollama_config': {
                    'base_url': Config.OLLAMA_BASE_URL,
                    'model': Config.OLLAMA_MODEL,
                    'enabled': False
                }
            }), 200

        models, error_message = list_ollama_models()

        if models is not None:
            return jsonify({
                'status': 'success',
                'models': models,
                'current_model': Config.OLLAMA_MODEL,
                'ollama_config': {
                    'base_url': Config.OLLAMA_BASE_URL,
                    'model': Config.OLLAMA_MODEL,
                    'enabled': True
                }
            }), 200
        else:
            return jsonify({
                'status': 'error',
                'message': error_message or 'Failed to list models',
                'ollama_config': {
                    'base_url': Config.OLLAMA_BASE_URL,
                    'model': Config.OLLAMA_MODEL,
                    'enabled': True
                }
            }), 500

    except Exception as e:
        return jsonify({
            'status': 'error',
            'message': f'Error listing models: {str(e)}',
            'ollama_config': {
                'base_url': Config.OLLAMA_BASE_URL,
                'model': Config.OLLAMA_MODEL,
                'enabled': Config.OLLAMA_ENABLED
            }
        }), 500

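# Example usage for the GPU status route below (same localhost:5000 assumption).
# Note that the Docker fallback in this route shells out to `docker exec` and
# `docker logs` against a container named munich-news-ollama, so the backend
# process needs access to the Docker CLI for that path to work:
#   curl http://localhost:5000/api/ollama/gpu-status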
@ollama_bp.route('/api/ollama/gpu-status', methods=['GET'])
def get_gpu_status():
    """Check if Ollama is using GPU acceleration"""
    import requests

    try:
        if not Config.OLLAMA_ENABLED:
            return jsonify({
                'status': 'disabled',
                'message': 'Ollama is not enabled',
                'gpu_available': False,
                'gpu_in_use': False
            }), 200

        # Get Ollama process info
        try:
            response = requests.get(
                f"{Config.OLLAMA_BASE_URL}/api/ps",
                timeout=5
            )

            if response.status_code == 200:
                ps_data = response.json()

                # Check if any models are loaded
                models_loaded = ps_data.get('models', [])

                gpu_info = {
                    'status': 'success',
                    'ollama_running': True,
                    'models_loaded': len(models_loaded),
                    'gpu_available': False,
                    'gpu_in_use': False,
                    'gpu_details': None
                }

                # Check for GPU usage in loaded models
                for model in models_loaded:
                    # Check various GPU indicators
                    gpu_layers = model.get('gpu_layers', 0)
                    details = model.get('details', {})

                    # Check if GPU is mentioned in any field
                    if (gpu_layers > 0 or
                            'gpu' in str(model).lower() or
                            'cuda' in str(model).lower() or
                            (details.get('families', []) and
                             'gpu' in str(details.get('families', [])).lower())):
                        gpu_info['gpu_in_use'] = True
                        gpu_info['gpu_available'] = True
                        gpu_info['gpu_details'] = {
                            'model': model.get('name', 'unknown'),
                            'gpu_layers': gpu_layers,
                            'size': model.get('size', 0),
                            'size_vram': model.get('size_vram', 0)
                        }
                        break

                # If no models loaded, check Docker container for GPU
                if not gpu_info['gpu_in_use']:
                    try:
                        import subprocess

                        # Check if nvidia-smi works in ollama container
                        result = subprocess.run(
                            ['docker', 'exec', 'munich-news-ollama', 'nvidia-smi',
                             '--query-gpu=name,memory.used,utilization.gpu', '--format=csv,noheader'],
                            capture_output=True,
                            text=True,
                            timeout=5
                        )
                        if result.returncode == 0 and result.stdout.strip():
                            gpu_data = result.stdout.strip().split(',')
                            gpu_info['gpu_available'] = True
                            gpu_info['gpu_details'] = {
                                'gpu_name': gpu_data[0].strip() if len(gpu_data) > 0 else 'Unknown',
                                'memory_used': gpu_data[1].strip() if len(gpu_data) > 1 else 'N/A',
                                'utilization': gpu_data[2].strip() if len(gpu_data) > 2 else 'N/A',
                                'note': 'GPU available but no model currently loaded'
                            }

                            # Check Ollama logs for GPU usage evidence
                            log_result = subprocess.run(
                                ['docker', 'logs', '--tail', '50', 'munich-news-ollama'],
                                capture_output=True,
                                text=True,
                                timeout=5
                            )
                            if log_result.returncode == 0:
                                logs = log_result.stdout + log_result.stderr
                                # Look for GPU offloading messages
                                if 'offloaded' in logs.lower() and 'gpu' in logs.lower():
                                    gpu_info['gpu_in_use'] = True
                                    gpu_info['gpu_details']['note'] = 'GPU acceleration active (detected in logs)'
                                    # Extract layer info if available
                                    import re
                                    match = re.search(r'offloaded (\d+)/(\d+) layers', logs, re.IGNORECASE)
                                    if match:
                                        gpu_info['gpu_details']['layers_offloaded'] = f"{match.group(1)}/{match.group(2)}"
                    except Exception as e:
                        gpu_info['debug_error'] = str(e)

                # Try to get system info
                try:
                    tags_response = requests.get(
                        f"{Config.OLLAMA_BASE_URL}/api/tags",
                        timeout=5
                    )
                    if tags_response.status_code == 200:
                        tags_data = tags_response.json()
                        gpu_info['available_models'] = [m.get('name') for m in tags_data.get('models', [])]
                except Exception:
                    pass

                # Add recommendation
                if not gpu_info['gpu_in_use']:
                    gpu_info['recommendation'] = (
                        "GPU not detected. To enable GPU acceleration:\n"
                        "1. Ensure NVIDIA GPU is available\n"
                        "2. Install nvidia-docker2\n"
                        "3. Use: docker-compose -f docker-compose.yml -f docker-compose.gpu.yml up -d\n"
                        "4. See docs/GPU_SETUP.md for details"
                    )
                else:
                    gpu_info['recommendation'] = "✓ GPU acceleration is active!"

                return jsonify(gpu_info), 200
            else:
                return jsonify({
                    'status': 'error',
                    'message': f'Ollama API returned status {response.status_code}',
                    'ollama_running': False,
                    'gpu_available': False,
                    'gpu_in_use': False
                }), 500

        except requests.exceptions.ConnectionError:
            return jsonify({
                'status': 'error',
                'message': f'Cannot connect to Ollama at {Config.OLLAMA_BASE_URL}',
                'ollama_running': False,
                'gpu_available': False,
                'gpu_in_use': False,
                'troubleshooting': {
                    'check_container': 'docker-compose ps ollama',
                    'check_logs': 'docker-compose logs ollama',
                    'restart': 'docker-compose restart ollama'
                }
            }), 500

    except Exception as e:
        return jsonify({
            'status': 'error',
            'message': f'Error checking GPU status: {str(e)}',
            'gpu_available': False,
            'gpu_in_use': False
        }), 500

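# Example usage for the performance test below (same localhost:5000 assumption);
# the reported thresholds are rough wall-clock heuristics for one short completion:
#   curl http://localhost:5000/api/ollama/test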
@ollama_bp.route('/api/ollama/test', methods=['GET'])
def test_ollama_performance():
    """Test Ollama performance and measure response time"""
    import time

    try:
        if not Config.OLLAMA_ENABLED:
            return jsonify({
                'status': 'disabled',
                'message': 'Ollama is not enabled'
            }), 200

        # Test prompt
        test_prompt = "Summarize this in 20 words: Munich is the capital of Bavaria, Germany. It is known for Oktoberfest, BMW, and beautiful architecture."

        start_time = time.time()
        response_text, error_message = call_ollama(test_prompt, "You are a helpful assistant.")
        duration = time.time() - start_time

        if response_text:
            # Estimate performance
            if duration < 5:
                performance = "Excellent (GPU likely active)"
            elif duration < 15:
                performance = "Good (GPU may be active)"
            elif duration < 30:
                performance = "Fair (CPU mode)"
            else:
                performance = "Slow (CPU mode, consider GPU)"

            return jsonify({
                'status': 'success',
                'response': response_text,
                'duration_seconds': round(duration, 2),
                'performance': performance,
                'model': Config.OLLAMA_MODEL,
                'recommendation': (
                    "GPU acceleration recommended" if duration > 15
                    else "Performance is good"
                )
            }), 200
        else:
            return jsonify({
                'status': 'error',
                'message': error_message or 'Failed to get response',
                'duration_seconds': round(duration, 2)
            }), 500

    except Exception as e:
        return jsonify({
            'status': 'error',
            'message': f'Error testing Ollama: {str(e)}'
        }), 500