# malabar/step5_assign_colors.py

#!/usr/bin/env python3
"""
Step 5: Assign colors to speakers based on their characteristics.

Input: Speaker files in "_speakers/" folder
Output: _colors.json with speaker-color mappings

Output format (values are CSS hex color codes):
{
  "Malabar": "#000000",
  "Earth": "#228B22",
  "Moon": "#A9A9A9",
  ...
}

Usage:
    uv run step5_assign_colors.py

Environment Variables:
    OPENAI_API_KEY - Required
    OPENAI_BASE_URL - Optional (for Kimi/GLM APIs)
    LLM_MODEL - Optional (e.g., "glm-4.5-air")
"""

import os
import re
import sys
import json
from pathlib import Path
from typing import List, Dict, Tuple, Optional, Set
from openai import OpenAI

# ============== Configuration ==============

# Folder that holds the "*_speakers.txt" inputs, and the JSON file this step writes.
INPUT_DIR = Path("_speakers")
OUTPUT_FILE = Path("_colors.json")

# Speakers whose colors are pinned and never requested from the LLM.
# Insertion order is kept because it becomes the leading key order of the output.
FIXED_COLORS = dict(
    Malabar="#000000",  # Black
    Mars="#FF0000",     # Red
    Venus="#FFD700",    # Gold
    Sun="#FFA500",      # Bright gold/orange
    Earth="#228B22",    # Forest green
)

# Per-provider defaults; the "model" entry is used when LLM_MODEL is not set.
DEFAULT_CONFIGS = dict(
    openai=dict(base_url=None, model="gpt-4o-mini"),
    moonshot=dict(base_url="https://api.moonshot.cn/v1", model="kimi-latest"),
    # Zhipu AI (GLM)
    bigmodel=dict(base_url="https://open.bigmodel.cn/api/paas/v4", model="glm-4.5-air"),
)


def get_llm_config() -> Tuple[Optional[str], str]:
    """Resolve the LLM endpoint and model name from environment variables.

    Reads OPENAI_API_KEY (must be set), OPENAI_BASE_URL and LLM_MODEL.
    An explicit LLM_MODEL always wins. Otherwise the model is chosen from
    DEFAULT_CONFIGS by looking for a provider hint in the base URL, falling
    back to the OpenAI default.

    Returns:
        A ``(base_url, model)`` pair. ``base_url`` is ``None`` when
        OPENAI_BASE_URL is unset or empty.

    Raises:
        ValueError: If OPENAI_API_KEY is unset or empty.
    """
    if not os.getenv("OPENAI_API_KEY"):
        raise ValueError("OPENAI_API_KEY environment variable is required")

    # Normalise an empty OPENAI_BASE_URL to None.
    base_url = os.getenv("OPENAI_BASE_URL") or None
    model = os.getenv("LLM_MODEL")

    if model:
        return base_url, model

    provider = "openai"
    if base_url:
        # Match case-insensitively so an upper-case host is still detected.
        lowered = base_url.lower()
        if "bigmodel" in lowered:
            provider = "bigmodel"
        elif "moonshot" in lowered or "kimi" in lowered:
            provider = "moonshot"

    return base_url, DEFAULT_CONFIGS[provider]["model"]


def collect_speakers(input_dir: Path) -> Set[str]:
    """Gather the distinct speaker names found in ``*_speakers.txt`` files.

    Each line is stripped, and only lines starting with
    ``[MM:SS](Speaker)`` contribute a name; everything else is ignored.

    Args:
        input_dir: Directory whose direct children are scanned.

    Returns:
        The set of speaker names (the text between the parentheses).
    """
    # Line shape: [timestamp](Speaker) text
    line_pattern = re.compile(r'^\[\d{2}:\d{2}\]\(([^)]+)\)')
    found: Set[str] = set()

    for transcript in input_dir.glob("*_speakers.txt"):
        with transcript.open('r', encoding='utf-8') as handle:
            stripped = (raw.strip() for raw in handle)
            hits = (line_pattern.match(text) for text in stripped if text)
            found.update(hit.group(1) for hit in hits if hit)

    return found


# Fallback palette, indexed by get_unique_fallback_color().
_FALLBACK_PALETTE = (
    "#8B4513",  # Saddle Brown
    "#556B2F",  # Dark Olive Green
    "#483D8B",  # Dark Slate Blue
    "#2F4F4F",  # Dark Slate Gray
    "#8B008B",  # Dark Magenta
    "#4B0082",  # Indigo
    "#191970",  # Midnight Blue
    "#006400",  # Dark Green
    "#8B0000",  # Dark Red
    "#B8860B",  # Dark Goldenrod
    "#5F9EA0",  # Cadet Blue
    "#708090",  # Slate Gray
    "#CD853F",  # Peru
    "#BC8F8F",  # Rosy Brown
    "#4682B4",  # Steel Blue
    "#6B8E23",  # Olive Drab
    "#9370DB",  # Medium Purple
    "#8FBC8F",  # Dark Sea Green
    "#CD5C5C",  # Indian Red
    "#4169E1",  # Royal Blue
)


def get_unique_fallback_color(index: int) -> str:
    """Return the fallback palette color for position ``index``.

    The palette has 20 entries and the index wraps around, so results are
    only distinct for 20 consecutive indices.
    """
    slot = index % len(_FALLBACK_PALETTE)
    return _FALLBACK_PALETTE[slot]


# A flat JSON object (no nested braces), which is the shape the prompt asks for.
_JSON_OBJECT_RE = re.compile(r'\{[^}]+\}')
# CSS hex colors in #RGB or #RRGGBB form.
_HEX_COLOR_RE = re.compile(r'#(?:[0-9A-Fa-f]{3}|[0-9A-Fa-f]{6})')


def _filter_color_mapping(parsed, wanted) -> Dict[str, str]:
    """Keep only requested speakers whose value is a well-formed hex color string."""
    if not isinstance(parsed, dict):
        return {}
    accepted = {}
    for name, color in parsed.items():
        if name not in wanted or not isinstance(color, str):
            continue
        color = color.strip()
        if _HEX_COLOR_RE.fullmatch(color):
            accepted[name] = color
    return accepted


def call_llm_for_colors(speakers: List[str], client: OpenAI, model: str,
                        existing_mapping: Dict[str, str], attempt: int = 1) -> Dict[str, str]:
    """Ask the LLM for hex colors for ``speakers`` and return the usable part of its answer.

    Args:
        speakers: Speaker names that still need a color.
        client: OpenAI-compatible client used for the chat completion.
        model: Model name passed to the API.
        existing_mapping: Colors already assigned; listed in the prompt so
            the model can avoid them.
        attempt: 1-based attempt number, used only in warning messages.

    Returns:
        A mapping of speaker name to hex color, restricted to names in
        ``speakers`` and to values that are ``#RGB``/``#RRGGBB`` strings.
        Empty when the call fails or no usable JSON object is found; errors
        are printed rather than raised.
    """
    speakers_list = ", ".join(speakers)

    existing_info = ""
    if existing_mapping:
        existing_colors = [f"  - {k} → {v}" for k, v in existing_mapping.items()]
        existing_info = "\nAlready assigned:\n" + "\n".join(existing_colors)

    prompt = f"""Assign CSS hex color codes to each speaker from "Little Malabar" based on their characteristics.

Speakers to assign colors:
{speakers_list}{existing_info}

Color assignment guidelines (use hex codes like #FF0000):
- Moon → #A9A9A9 (dark gray) - avoid light colors
- Jupiter → #D2691E (chocolate/orange)
- Galaxy → #9370DB (medium purple) or #FF69B4 (hot pink)
- Star → #DAA520 (goldenrod) or #B8860B (dark goldenrod) - avoid white/light colors
- Volcano → #8B0000 (dark red) or #FF4500 (orange red)
- Kangaroo/Giraffe → #D2691E (chocolate) or #8B4513 (saddle brown)
- Song → #4682B4 (steel blue) or #9370DB (medium purple) - avoid light colors
- Asteroids → #696969 (dim gray) or #A9A9A9 (dark gray)
- Atoms → #20B2AA (light sea green) or #008B8B (dark cyan)
- Comet → #FFD700 (gold) or #DAA520 (goldenrod)
- Narrator → #708090 (slate gray) or #778899 (light slate gray)

IMPORTANT:
- Do NOT use light colors like #FFFFFF (white), #FFFACD, #87CEEB, #C0C0C0
- All colors must be dark enough to read on white backgrounds
- Each speaker should have a UNIQUE color (no duplicates!)

Fixed assignments (DO NOT change these):
- Malabar → #000000 (black)
- Mars → #FF0000 (red)
- Venus → #FFD700 (gold)
- Sun → #FFA500 (bright gold)
- Earth → #228B22 (green)

Reply with ONLY a JSON object mapping the remaining speaker names to hex color codes:
{{"SpeakerName": "#RRGGBB", ...}}

JSON:"""

    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You assign colors to characters. Reply with ONLY valid JSON."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.3,
            max_tokens=500,
            # NOTE(review): "thinking" is a provider-specific extension; confirm
            # every targeted endpoint accepts (or ignores) this extra field.
            extra_body={"thinking": {"type": "disabled"}}  # Disable thinking
        )

        message = response.choices[0].message
        result = message.content or ""

        # GLM models may put response in reasoning_content
        if not result and hasattr(message, 'reasoning_content') and message.reasoning_content:
            result = message.reasoning_content
    except Exception as e:
        # Best-effort: the caller retries and finally falls back to a palette.
        print(f"    Error calling LLM on attempt {attempt}: {e}")
        return {}

    wanted = set(speakers)
    parsed_any = False
    # Try every {...} span, not just the first: the reply may echo the
    # '{"SpeakerName": "#RRGGBB", ...}' template before the real answer.
    for candidate in _JSON_OBJECT_RE.finditer(result):
        try:
            parsed = json.loads(candidate.group())
        except json.JSONDecodeError:
            continue
        parsed_any = True
        accepted = _filter_color_mapping(parsed, wanted)
        if accepted:
            return accepted

    if not parsed_any:
        print(f"    Warning: Could not parse JSON response on attempt {attempt}")
    return {}


def _pick_fallback_color(start_index: int, used: Set[str]) -> Tuple[str, int]:
    """Return the next palette color not in ``used`` and the index to resume from."""
    tried: Set[str] = set()
    index = start_index
    while True:
        candidate = get_unique_fallback_color(index)
        index += 1
        key = candidate.upper()
        # A repeat within this search means the palette wrapped with every
        # entry taken; a duplicate is then unavoidable, so accept it.
        if key not in used or key in tried:
            return candidate, index
        tried.add(key)


def assign_colors(speakers: Set[str], client: OpenAI, model: str) -> Dict[str, str]:
    """Build the speaker-to-color mapping using fixed colors, the LLM, then a fallback palette.

    Starts from a copy of FIXED_COLORS, asks the LLM (up to three attempts)
    for the speakers not covered there, and gives any speaker still without
    a color one from the fallback palette.

    Colors are kept distinct: an LLM answer that repeats a color already in
    use is ignored so the speaker is asked again, and fallback colors skip
    colors already taken while the palette has free entries.

    Args:
        speakers: All speaker names found in the input.
        client: OpenAI-compatible client forwarded to call_llm_for_colors.
        model: Model name forwarded to call_llm_for_colors.

    Returns:
        Mapping of speaker name to color, containing every FIXED_COLORS
        entry plus every name in ``speakers``.
    """
    # Start with fixed colors
    color_mapping = FIXED_COLORS.copy()

    # Sort so the prompt and fallback assignment do not depend on set
    # iteration order, which varies between interpreter runs.
    remaining_speakers = sorted(s for s in speakers if s not in color_mapping)

    if not remaining_speakers:
        return color_mapping

    # Upper-cased colors already taken, for case-insensitive comparison.
    used_colors = {
        value.strip().upper()
        for value in color_mapping.values()
        if isinstance(value, str)
    }
    max_retries = 3

    for attempt in range(1, max_retries + 1):
        # Get speakers that still need colors
        still_need_colors = [s for s in remaining_speakers if s not in color_mapping]

        if not still_need_colors:
            break  # All speakers have colors

        if attempt > 1:
            print(f"    Retry {attempt-1}: {len(still_need_colors)} speakers still need colors...")

        # Call LLM to get colors
        llm_result = call_llm_for_colors(still_need_colors, client, model, color_mapping, attempt)

        # Merge results, leaving unusable or duplicate answers for a retry
        pending = set(still_need_colors)
        for speaker, color in llm_result.items():
            if speaker not in pending or not isinstance(color, str):
                continue
            color = color.strip()
            key = color.upper()
            if not key or key in used_colors:
                continue
            color_mapping[speaker] = color
            used_colors.add(key)

    # Check for any remaining speakers without colors
    still_need_colors = [s for s in remaining_speakers if s not in color_mapping]

    if still_need_colors:
        print(f"    Using fallback colors for {len(still_need_colors)} speakers...")
        next_index = 0
        for speaker in still_need_colors:
            fallback, next_index = _pick_fallback_color(next_index, used_colors)
            color_mapping[speaker] = fallback
            used_colors.add(fallback.upper())

    return color_mapping


def main():
    """Run step 5: read speakers, assign colors, and write the JSON mapping.

    Exits with status 1 when the input directory is missing or no speakers
    are found.
    """
    # Resolve the endpoint/model and build the API client
    base_url, model = get_llm_config()
    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"), base_url=base_url)

    endpoint_label = base_url or 'OpenAI default'
    print(f"Using model: {model}")
    print(f"Endpoint: {endpoint_label}")

    # The speaker folder must exist before anything else happens
    if not INPUT_DIR.exists():
        print(f"Error: Input directory {INPUT_DIR}/ not found")
        sys.exit(1)

    # Gather speaker names from the input files
    print(f"\nCollecting speakers from {INPUT_DIR}/...")
    speakers = collect_speakers(INPUT_DIR)
    if not speakers:
        print("Error: No speakers found")
        sys.exit(1)

    print(f"Found {len(speakers)} unique speakers:")
    for name in sorted(speakers):
        detail = f": {FIXED_COLORS[name]} (fixed)" if name in FIXED_COLORS else ""
        print(f"  - {name}{detail}")

    # Produce the full mapping
    print("\nAssigning colors...")
    color_mapping = assign_colors(speakers, client, model)

    print("\nFinal color assignments:")
    for name, color in sorted(color_mapping.items()):
        marker = " (fixed)" if name in FIXED_COLORS else ""
        print(f"  - {name}: {color}{marker}")

    # Write the mapping as UTF-8 JSON
    with OUTPUT_FILE.open('w', encoding='utf-8') as out:
        json.dump(color_mapping, out, ensure_ascii=False, indent=2)

    print(f"\nSaved to: {OUTPUT_FILE}")
    print("\nStep 5 Complete!")


if __name__ == "__main__":
    main()