#!/usr/bin/env python3
"""
Step 3: Use LLM to infer speaker names from transcript context.

Input: Line-formatted files in "_lines/" folder
Output: Files with inferred speaker names in "_speakers/" folder

This version uses a multi-step approach:
1. First identify Malabar (he's always present)
2. Then identify each remaining speaker one by one
3. Each step saves debug info to _speakers_debug/

Usage:
    uv run step3_infer_speakers.py [--force | -f]

    --force / -f reprocesses files that are already marked as completed.

Environment Variables:
    OPENAI_API_KEY - Required (can be OpenAI, Kimi, or GLM key)
    OPENAI_BASE_URL - Optional (for Kimi/GLM APIs)
    LLM_MODEL - Optional (e.g., "glm-4.5-air", "kimi-latest")
"""

import os
import re
import sys
import json
import traceback
from pathlib import Path
from typing import List, Dict, Tuple, Optional

try:
    from openai import OpenAI
except ImportError:
    # Imported defensively so the pure parsing/formatting helpers stay
    # importable without the SDK installed; main() refuses to run without it.
    OpenAI = None

# ============== Configuration ==============
INPUT_DIR = Path("_lines")
OUTPUT_DIR = Path("_speakers")
DEBUG_DIR = Path("_speakers_debug")
PROGRESS_FILE = Path(".step3_progress.json")

# Examples of good speaker names (for reference, not a restricted list)
NAME_EXAMPLES = ["Malabar", "Sun", "Jupiter", "Kangaroo", "Mole"]

# Default configurations for different providers
DEFAULT_CONFIGS = {
    "openai": {
        "base_url": None,
        "model": "gpt-4o-mini"
    },
    "moonshot": {
        "base_url": "https://api.moonshot.cn/v1",
        "model": "kimi-latest"
    },
    "bigmodel": {  # Zhipu AI (GLM)
        "base_url": "https://open.bigmodel.cn/api/paas/v4",
        "model": "glm-4.5-air"
    }
}

# Names the LLM is allowed to answer with when identifying a speaker.
_BASE_VALID_NAMES = [
    'Malabar', 'Moon', 'Earth', 'Mars', 'Sun', 'Jupiter', 'Saturn', 'Venus',
    'Mercury', 'Neptune', 'Uranus', 'Pluto', 'Galaxy', 'Star', 'Kangaroo',
    'Giraffe', 'Volcano', 'Volcanoes', 'Sea', 'Ocean', 'Wave', 'Comet',
    'Asteroid', 'Meteor', 'Nebula', 'Black Hole', 'Alien', 'Robot',
    'Scientist',
]
# Every documented example name must also pass validation (e.g. "Mole").
VALID_NAMES = _BASE_VALID_NAMES + [
    name for name in NAME_EXAMPLES if name not in _BASE_VALID_NAMES
]

# Labels assigned in earlier steps that never need identification.
_FIXED_SPEAKERS = ("Song", "Narrator")

# One formatted line: "[MM:SS](Speaker X) text", "[MM:SS](Song) text", ...
_LINE_PATTERN = re.compile(r'^(\[\d{2}:\d{2}\])\(([^)]+)\) (.+)$')

# Characters models commonly wrap around a short answer (quotes, markdown
# emphasis, trailing punctuation); stripped from both ends before validation.
_REPLY_NOISE = " \t\r\n\"'`*.,:;!()[]"


def ensure_dirs():
    """Ensure output directories exist."""
    OUTPUT_DIR.mkdir(exist_ok=True)
    DEBUG_DIR.mkdir(exist_ok=True)


def load_progress() -> dict:
    """Load progress tracking.

    Returns:
        The dict stored in PROGRESS_FILE, or an empty dict when the file is
        missing, is not valid JSON, or does not contain a JSON object.
    """
    if not PROGRESS_FILE.exists():
        return {}
    try:
        with open(PROGRESS_FILE, 'r', encoding='utf-8') as f:
            progress = json.load(f)
    except json.JSONDecodeError as e:
        # A damaged progress file must not abort the whole run; the worst
        # case of starting fresh is that files get reprocessed.
        print(f"Warning: ignoring unreadable {PROGRESS_FILE}: {e}")
        return {}
    if not isinstance(progress, dict):
        print(f"Warning: ignoring {PROGRESS_FILE} (expected a JSON object)")
        return {}
    return progress


def save_progress(progress: dict):
    """Save progress tracking to PROGRESS_FILE as indented JSON."""
    with open(PROGRESS_FILE, 'w', encoding='utf-8') as f:
        json.dump(progress, f, indent=2)


def get_llm_config() -> Tuple[Optional[str], str]:
    """Get LLM configuration from environment.

    Returns:
        (base_url, model). base_url is None when OPENAI_BASE_URL is unset.
        When LLM_MODEL is unset the model defaults by provider, detected from
        substrings of the base URL ("bigmodel", "moonshot"/"kimi"), falling
        back to the OpenAI default model.

    Raises:
        ValueError: if OPENAI_API_KEY is not set.
    """
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise ValueError("OPENAI_API_KEY environment variable is required")

    base_url = os.getenv("OPENAI_BASE_URL")
    model = os.getenv("LLM_MODEL")

    if not base_url:
        return None, model or DEFAULT_CONFIGS["openai"]["model"]
    if model:
        return base_url, model

    if "bigmodel" in base_url:
        provider = "bigmodel"
    elif "moonshot" in base_url or "kimi" in base_url:
        provider = "moonshot"
    else:
        provider = "openai"
    return base_url, DEFAULT_CONFIGS[provider]["model"]


def parse_lines(lines_text: str) -> List[Tuple[str, str, str]]:
    """Parse formatted lines. Returns list of (timestamp, speaker_label, text).

    Blank lines and lines that do not match "[MM:SS](Label) text" are skipped.
    "Speaker X" labels are shortened to "X"; other labels ("Song",
    "Narrator", "?", ...) are kept unchanged.
    """
    result = []
    for line in lines_text.strip().split('\n'):
        line = line.strip()
        if not line:
            continue
        match = _LINE_PATTERN.match(line)
        if not match:
            continue
        timestamp, speaker_raw, text = match.groups()
        # Normalize: "Speaker X" -> "X"; a no-op for "Song" / "Narrator".
        speaker = speaker_raw.replace("Speaker ", "")
        result.append((timestamp, speaker, text))
    return result


def save_debug(filename: str, request: str, response: str, step: int,
               model: str = "", endpoint: str = ""):
    """Save debug info to _speakers_debug folder.

    Writes "<filename>_step<step>.txt" containing the model/endpoint (when
    given), the request prompt and the response text, overwriting any
    previous file for the same step.
    """
    # Create the folder here too so the function works when it is called
    # without main() having run ensure_dirs() first.
    DEBUG_DIR.mkdir(parents=True, exist_ok=True)
    debug_file = DEBUG_DIR / f"{filename}_step{step}.txt"
    rule = "=" * 60 + "\n"

    parts = [rule, "DEBUG INFO:\n", rule]
    if model:
        parts.append(f"Model: {model}\n")
    if endpoint:
        parts.append(f"Endpoint: {endpoint}\n")
    parts += [
        "\n",
        rule, "REQUEST:\n", rule, "\n",
        request, "\n\n",
        rule, "RESPONSE:\n", rule, "\n",
        response,
    ]
    with open(debug_file, 'w', encoding='utf-8') as f:
        f.write("".join(parts))


def _clean_llm_reply(raw: str) -> str:
    """Strip whitespace, quotes, markdown emphasis and edge punctuation."""
    return raw.strip(_REPLY_NOISE)


def _endpoint_label(client, base_url) -> str:
    """Return a printable endpoint for debug files."""
    if base_url:
        return str(base_url)
    if hasattr(client, 'base_url'):
        return str(client.base_url)
    return "OpenAI default"


def _query_llm(client, model: str, prompt: str, max_tokens: int) -> str:
    """Send a single-turn prompt and return the stripped reply ("" if none)."""
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "user", "content": prompt}
        ],
        temperature=0.0,
        max_tokens=max_tokens,  # Short response expected
        # NOTE(review): provider-specific switch to disable "thinking";
        # confirm every configured endpoint tolerates this extra field.
        extra_body={"thinking": {"type": "disabled"}}
    )
    content = response.choices[0].message.content
    return content.strip() if content else ""


def ask_llm_for_name(prompt: str, client: OpenAI, model: str,
                     debug_filename: str, step: int,
                     exclude_names: Optional[List[str]] = None,
                     base_url: str = "") -> str:
    """Ask LLM for a single name. Returns the name or raises exception if invalid.

    Args:
        prompt: Full prompt sent as the only user message.
        client: OpenAI-compatible client.
        model: Model identifier passed to the API.
        debug_filename: Stem of the debug file written for this step.
        step: Step number, used in the debug file name.
        exclude_names: Names already assigned to other speakers; these are
            removed from the set of acceptable answers.
        base_url: Endpoint recorded in the debug file.

    Returns:
        The matching entry of VALID_NAMES in its canonical spelling.

    Raises:
        ValueError: if the reply is not an acceptable name.
        Exception: any error raised by the API call is re-raised after being
            recorded in the debug file.
    """
    excluded = set(exclude_names or ())
    valid_names = [n for n in VALID_NAMES if n not in excluded]
    endpoint = str(base_url) if base_url else "OpenAI default"

    try:
        raw_result = _query_llm(client, model, prompt, max_tokens=20)
    except Exception as e:
        save_debug(debug_filename, prompt, f"ERROR: {e}", step,
                   model=model, endpoint=endpoint)
        raise  # Re-raise the exception

    # Accept answers such as '"Moon".' or 'moon' in addition to 'Moon'.
    candidate = _clean_llm_reply(raw_result).lower()
    for name in valid_names:
        if candidate == name.lower():
            save_debug(debug_filename, prompt, f"RAW: {raw_result}", step,
                       model=model, endpoint=endpoint)
            return name

    error = ValueError(
        f"Invalid response from LLM: expected one of {valid_names}, "
        f"got '{raw_result}'"
    )
    save_debug(debug_filename, prompt, f"ERROR: {error}", step,
               model=model, endpoint=endpoint)
    raise error


def identify_malabar(dialogue_lines: List[Tuple[str, str, str]],
                     client: OpenAI, model: str, debug_filename: str,
                     base_url: str = "") -> Optional[str]:
    """Identify which speaker is Malabar.

    Only single-letter alphabetic speaker labels are candidates; "?", "Song",
    "Narrator" and other special markers are ignored.

    Returns:
        The candidate label the LLM picked, or None when the dialogue has no
        candidate speakers (the LLM is not called in that case).

    Raises:
        ValueError: if the reply is not one of the candidate letters.
        Exception: any error raised by the API call is re-raised after being
            recorded in the step-1 debug file.
    """
    speakers = sorted(set(
        speaker for _, speaker, _ in dialogue_lines
        if len(speaker) == 1 and speaker.isalpha()
    ))
    if not speakers:
        return None

    # Output ALL candidate lines in chronological order.
    candidates = set(speakers)
    sample_text = '\n'.join(
        f'{spk}: "{text}"'
        for _, spk, text in dialogue_lines
        if spk in candidates
    )

    # List the letters that actually occur instead of assuming A/B/C.
    if len(speakers) > 1:
        letters = ", ".join(speakers[:-1]) + f", or {speakers[-1]}"
    else:
        letters = speakers[0]

    prompt = f"""Little Malabar dialogue. Malabar is a boy who talks to stars, planets and animals.

{sample_text}

Which speaker letter is Malabar? Reply with ONLY the letter {letters}."""

    endpoint = _endpoint_label(client, base_url)
    try:
        raw_result = _query_llm(client, model, prompt, max_tokens=10)
    except Exception as e:
        save_debug(debug_filename, prompt, f"ERROR: {e}", 1,
                   model=model, endpoint=endpoint)
        raise  # Re-raise the exception

    # Tolerate "A.", '"A"' and "Speaker A" as well as a bare letter.
    answer = _clean_llm_reply(raw_result)
    if answer.lower().startswith("speaker "):
        answer = _clean_llm_reply(answer[len("speaker "):])
    if len(answer) == 1:
        for spk in speakers:
            if spk.upper() == answer.upper():
                save_debug(debug_filename, prompt, f"RAW: {raw_result}", 1,
                           model=model, endpoint=endpoint)
                return spk

    error = ValueError(
        f"Invalid response from LLM: expected single letter "
        f"{'/'.join(speakers)}, got '{raw_result}'"
    )
    save_debug(debug_filename, prompt, f"ERROR: {error}", 1,
               model=model, endpoint=endpoint)
    raise error


def identify_speaker(speaker: str, dialogue_lines: List[Tuple[str, str, str]],
                     known_names: Dict[str, str], client: OpenAI, model: str,
                     debug_filename: str, step: int) -> str:
    """Identify a single speaker's name.

    The prompt contains the full dialogue with already-identified speakers
    shown under their real names, "Song"/"Narrator" shown as-is, and the
    target plus any still-unknown speakers shown as "Speaker X".

    Returns:
        The name chosen by the LLM (never one already in known_names).

    Raises:
        Whatever ask_llm_for_name raises (invalid reply or API failure).
    """
    full_dialogue_lines = []
    for _, spk, text in dialogue_lines:
        if spk != speaker and spk in known_names:
            # Known speaker - use real name
            label = known_names[spk]
        elif spk != speaker and spk in _FIXED_SPEAKERS:
            label = spk
        else:
            # Target or still-unknown speaker - keep as Speaker X
            label = f'Speaker {spk}'
        full_dialogue_lines.append(f'{label}: "{text}"')
    full_dialogue = '\n'.join(full_dialogue_lines)

    # Build list of who we already know
    known_info = ""
    if known_names:
        known_info = "Known: " + ", ".join(
            f"Speaker {s} = {n}" for s, n in known_names.items()
        )

    prompt = f"""Little Malabar dialogue.
{known_info}

CONTEXT:
- Malabar is a boy who talks to stars, planets and animals
- Other speakers are usually celestial bodies (Moon, Earth, Mars, Sun, etc.)
- BUT speakers can also be other entities: volcanoes, the sea, a comet, a star, etc.
- Look at what the speaker talks about AND what others say to them to identify them

IDENTIFICATION GUIDELINES:
- Speaker mentions "my surface" + warm/shaking → likely Earth
- Speaker mentions being "up here" with no ocean → likely Moon
- Speaker says "us volcanoes on Mars" → this is Volcanoes (not Mars!)
- Speaker is spoken TO about Mars/volcanoes → could be Mars
- Speaker mentions the sea/ocean/waves → could be Sea/Ocean
- Speaker suggests going TO a place → likely describing that place from outside
- Use your judgment based on context and content

FULL DIALOGUE:
{full_dialogue}

Who is Speaker {speaker}? Reply with ONLY the name, nothing else.
Examples: Moon, Earth, Mars, Volcanoes, Sea, Sun, Jupiter:"""

    # Names already taken cannot be assigned to a second speaker.
    known_names_list = list(known_names.values()) if known_names else []

    # client.base_url is not a plain str; convert it for the debug file.
    base_url = str(client.base_url) if hasattr(client, 'base_url') else ""

    return ask_llm_for_name(prompt, client, model, debug_filename, step,
                            exclude_names=known_names_list, base_url=base_url)


def process_lines_file(input_path: Path, client: OpenAI, model: str,
                       force: bool = False,
                       base_url: str = "") -> Optional[Path]:
    """Process a single lines file using multi-step approach.

    Args:
        input_path: Line-formatted transcript to process.
        client: OpenAI-compatible client.
        model: Model identifier passed to the API.
        force: Reprocess even if the progress file marks it completed.
        base_url: Endpoint recorded in the debug files.

    Returns:
        Path of the written (or previously written) output file, or None
        when the input contains no parseable lines.
    """
    progress = load_progress()
    filename = input_path.name

    # Check if already processed
    previous = progress.get(filename)
    if not force and previous and previous.get("status") == "completed":
        output_path = Path(previous["output_file"])
        if output_path.exists():
            print(f"Skipping {filename} (already processed)")
            return output_path

    print(f"\n{'='*50}")
    print(f"Processing: {input_path.name}")
    print(f"{'='*50}")

    debug_filename = input_path.stem

    # Read lines file
    with open(input_path, 'r', encoding='utf-8') as f:
        lines_text = f.read()

    # Parse lines
    lines = parse_lines(lines_text)
    print(f" Parsed {len(lines)} lines")

    if not lines:
        print(" No valid lines found!")
        return None

    # Unique speakers, excluding "Song" and "Narrator" (already known).
    # Sorted so step numbers, debug files and prompt context are reproducible.
    speakers_to_identify = sorted(
        {speaker for _, speaker, _ in lines} - set(_FIXED_SPEAKERS)
    )
    print(f" Speakers to identify: {', '.join(speakers_to_identify)}")

    # Build mapping
    final_mapping = {}

    if not speakers_to_identify:
        print(" No speakers to identify (only Song present)")
    else:
        # Separate regular speakers from unknown/merged speakers (like "?")
        regular_speakers = [s for s in speakers_to_identify if s.isalpha()]
        unknown_speakers = [s for s in speakers_to_identify if not s.isalpha()]

        # Step 1: Identify Malabar (from regular speakers only)
        print(" Step 1: Identifying Malabar...")
        try:
            malabar_speaker = identify_malabar(lines, client, model,
                                               debug_filename, base_url)
        except Exception as e:
            print(f" Error: {e}")
            if regular_speakers:
                # Fallback: assume first regular speaker alphabetically is Malabar
                malabar_speaker = regular_speakers[0]
                final_mapping[malabar_speaker] = "Malabar"
                print(f" Fallback: Speaker {malabar_speaker} = Malabar")
        else:
            if malabar_speaker is None:
                # No candidate letters: never store a None key in the mapping.
                print(" No single-letter speakers found; Malabar not assigned")
            else:
                final_mapping[malabar_speaker] = "Malabar"
                print(f" Identified Speaker {malabar_speaker} = Malabar")

        # Step 2+: Identify remaining regular speakers one by one
        remaining = [s for s in regular_speakers if s not in final_mapping]
        step = 2
        for speaker in remaining:
            print(f" Step {step}: Identifying Speaker {speaker}...")
            try:
                name = identify_speaker(speaker, lines, final_mapping, client,
                                        model, debug_filename, step)
                final_mapping[speaker] = name
                print(f" Identified Speaker {speaker} = {name}")
            except Exception as e:
                print(f" Error: {e}")
                final_mapping[speaker] = f"Speaker_{speaker}"
                print(f" Fallback: Speaker {speaker} = Speaker_{speaker}")
            step += 1

        # Handle unknown speakers (like "?")
        for speaker in unknown_speakers:
            print(f" Step {step}: Identifying unknown Speaker {speaker}...")
            try:
                name = identify_speaker(speaker, lines, final_mapping, client,
                                        model, debug_filename, step)
                final_mapping[speaker] = name
                print(f" Identified Speaker {speaker} = {name}")
            except Exception as e:
                print(f" Error: {e}")
                final_mapping[speaker] = "Unknown"
                print(f" Marked Speaker {speaker} = Unknown")
            step += 1

    print(f" Final mapping: {final_mapping}")

    # Apply speaker names to output
    output_text = apply_speaker_names(lines, final_mapping)

    # Save output
    output_filename = input_path.stem.replace("_lines", "") + "_speakers.txt"
    output_path = OUTPUT_DIR / output_filename
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(output_text)

    # Update progress
    progress[filename] = {
        "status": "completed",
        "output_file": str(output_path),
        "speaker_mapping": final_mapping
    }
    save_progress(progress)

    print(f" Saved to: {output_path}")
    return output_path


def apply_speaker_names(lines: List[Tuple[str, str, str]],
                        mapping: Dict[str, str]) -> str:
    """Apply speaker names to lines.

    SPECIAL: "Song" and "Narrator" speakers are passed through unchanged
    (already labeled in Step 2). Speakers missing from the mapping are
    written as "Speaker_<label>".

    Returns:
        The lines formatted as "[MM:SS](Name) text", joined by newlines.
    """
    result_lines = []
    for timestamp, speaker, text in lines:
        if speaker in _FIXED_SPEAKERS:
            speaker_name = speaker
        else:
            speaker_name = mapping.get(speaker, f"Speaker_{speaker}")
        result_lines.append(f"{timestamp}({speaker_name}) {text}")
    return '\n'.join(result_lines)


def get_input_files() -> list[Path]:
    """Discover all text files in _lines/ folder, sorted by path."""
    if not INPUT_DIR.exists():
        return []
    return sorted(
        f for f in INPUT_DIR.iterdir() if f.is_file() and f.suffix == '.txt'
    )


def main():
    """Run step 3 over every transcript in INPUT_DIR.

    Exits with status 1 when the OpenAI SDK is missing, no input files are
    found, or at least one file failed to process.
    """
    if OpenAI is None:
        print("The 'openai' package is required to run this step.")
        sys.exit(1)

    ensure_dirs()

    # Check for force flag
    force = "--force" in sys.argv or "-f" in sys.argv

    # Get LLM config
    base_url, model = get_llm_config()
    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"), base_url=base_url)

    print(f"Using model: {model}")
    print(f"Endpoint: {base_url or 'OpenAI default'}")

    # Discover input files
    lines_files = get_input_files()
    if not lines_files:
        print(f"No .txt files found in {INPUT_DIR}/")
        sys.exit(1)

    print(f"Found {len(lines_files)} transcript(s) in {INPUT_DIR}/")
    if force:
        print("Force mode: ON (reprocessing all files)")
    print(f"Debug info will be saved to {DEBUG_DIR}/")
    print("")

    # Process all files
    success_count = 0
    fail_count = 0

    for input_path in lines_files:
        try:
            output_path = process_lines_file(input_path, client, model,
                                             force=force,
                                             base_url=base_url or "")
            if output_path:
                success_count += 1
        except Exception as e:
            # Top-level boundary: record the failure and continue with the
            # remaining files.
            progress = load_progress()
            progress[input_path.name] = {"status": "error", "error": str(e)}
            save_progress(progress)
            print(f"\n❌ Failed to process {input_path.name}: {e}")
            traceback.print_exc()
            fail_count += 1

    # Summary
    print("\n" + "="*50)
    print(f"Step 3 Complete: {success_count} succeeded, {fail_count} failed")
    print(f"Debug files saved to: {DEBUG_DIR}/")
    print("="*50)

    if fail_count > 0:
        sys.exit(1)


if __name__ == "__main__":
    main()