fix speaker recognition

This commit is contained in:
2026-03-04 17:02:59 +08:00
parent e085eeddbc
commit 2395c048ff
18 changed files with 652 additions and 5794 deletions

View File

@@ -124,10 +124,18 @@ def parse_lines(lines_text: str) -> List[Tuple[str, str, str]]:
return result
def save_debug(filename: str, request: str, response: str, step: int):
def save_debug(filename: str, request: str, response: str, step: int, model: str = "", endpoint: str = ""):
"""Save debug info to _speakers_debug folder."""
debug_file = DEBUG_DIR / f"{filename}_step{step}.txt"
with open(debug_file, 'w', encoding='utf-8') as f:
f.write("=" * 60 + "\n")
f.write("DEBUG INFO:\n")
f.write("=" * 60 + "\n")
if model:
f.write(f"Model: {model}\n")
if endpoint:
f.write(f"Endpoint: {endpoint}\n")
f.write("\n")
f.write("=" * 60 + "\n")
f.write("REQUEST:\n")
f.write("=" * 60 + "\n\n")
@@ -139,86 +147,53 @@ def save_debug(filename: str, request: str, response: str, step: int):
f.write(response)
def extract_name_from_response(text: str) -> str:
"""Extract a single name from LLM response text."""
text = text.strip()
# Expanded list of valid names - includes celestial bodies and other entities
def ask_llm_for_name(prompt: str, client: OpenAI, model: str, debug_filename: str, step: int, exclude_names: list = None, base_url: str = "") -> str:
"""Ask LLM for a single name. Returns the name or raises exception if invalid."""
# Valid speaker names
valid_names = ['Malabar', 'Moon', 'Earth', 'Mars', 'Sun', 'Jupiter', 'Saturn', 'Venus',
'Mercury', 'Neptune', 'Uranus', 'Pluto', 'Galaxy', 'Star', 'Kangaroo',
'Giraffe', 'Volcano', 'Volcanoes', 'Sea', 'Ocean', 'Wave', 'Comet',
'Asteroid', 'Meteor', 'Nebula', 'Black Hole', 'Alien', 'Robot', 'Scientist']
# Check if the response is just a single word (the name)
if ' ' not in text and len(text) > 1:
return text.strip('"\'')
# Filter out excluded names
if exclude_names:
valid_names = [n for n in valid_names if n not in exclude_names]
# Look for explicit "Answer: X" or "Name: X" patterns
answer_match = re.search(r'(?:answer|name|is)[:\s]+["\']?([A-Z][a-z]+)', text, re.IGNORECASE)
if answer_match:
return answer_match.group(1)
# Check last few lines for a valid name
lines = text.split('\n')
for line in reversed(lines[-5:]): # Check last 5 lines
line = line.strip().strip('"\'')
for name in valid_names:
if line.lower() == name.lower():
return name
if re.search(rf'\b{name}\b', line, re.IGNORECASE):
return name
# Default: return first valid name found
for name in valid_names:
if re.search(rf'\b{name}\b', text, re.IGNORECASE):
return name
# If no known name found, extract any capitalized word as potential name
for line in text.split('\n'):
line = line.strip()
match = re.search(r'\b([A-Z][a-z]{2,})\b', line)
if match:
word = match.group(1)
if word.lower() not in ['the', 'and', 'but', 'for', 'are', 'was', 'were', 'been', 'this', 'that']:
return word
return ""
def ask_llm_for_name(prompt: str, client: OpenAI, model: str, debug_filename: str, step: int) -> str:
"""Ask LLM for a single name. Returns the name or empty string if failed."""
try:
response = client.chat.completions.create(
model=model,
messages=[
{"role": "system", "content": "Reply with ONLY a single word - the name. No explanation."},
{"role": "user", "content": prompt}
],
temperature=0.0,
max_tokens=1000
max_tokens=20, # Short response expected
extra_body={"thinking": {"type": "disabled"}} # Disable thinking
)
message = response.choices[0].message
raw_result = message.content or ""
# If content is empty but reasoning_content exists, use that
if not raw_result and hasattr(message, 'reasoning_content') and message.reasoning_content:
raw_result = message.reasoning_content
# Extract name from the response
result = extract_name_from_response(raw_result)
raw_result = message.content.strip() if message.content else ""
# Save debug info
save_debug(debug_filename, prompt, f"RAW: {raw_result[:800]}\n\nEXTRACTED: {result}", step)
save_debug(debug_filename, prompt, f"RAW: {raw_result}", step, model=model, endpoint=base_url or "OpenAI default")
# Simple validation: result should be one of the valid names
if raw_result in valid_names:
return raw_result
# Check case-insensitive match
for name in valid_names:
if raw_result.lower() == name.lower():
return name
raise ValueError(f"Invalid response from LLM: expected one of {valid_names}, got '{raw_result}'")
return result
except Exception as e:
save_debug(debug_filename, prompt, f"ERROR: {e}", step)
return ""
save_debug(debug_filename, prompt, f"ERROR: {e}", step, model=model, endpoint=base_url or "OpenAI default")
raise # Re-raise the exception
def identify_malabar(dialogue_lines: List[Tuple[str, str, str]],
client: OpenAI, model: str, debug_filename: str) -> Optional[str]:
client: OpenAI, model: str, debug_filename: str, base_url: str = "") -> Optional[str]:
"""Identify which speaker is Malabar."""
# Only consider single-letter speakers (exclude "?", "Song", and other special markers)
speakers = sorted(set(speaker for _, speaker, _ in dialogue_lines
@@ -227,30 +202,54 @@ def identify_malabar(dialogue_lines: List[Tuple[str, str, str]],
if not speakers:
return None
# Get sample lines from each speaker
# Output ALL lines in chronological order (preserving original order)
samples = []
for speaker in speakers:
lines = [(ts, text) for ts, spk, text in dialogue_lines
if spk == speaker][:3]
for ts, text in lines:
samples.append(f'{speaker}: "{text}"')
for ts, spk, text in dialogue_lines:
# Skip Song speaker for Malabar identification
if spk == "Song":
continue
# Only include speakers we're trying to identify
if spk in speakers:
samples.append(f'{spk}: "{text}"')
sample_text = '\n'.join(samples)
prompt = f"""Little Malabar dialogue. Malabar is the boy who addresses Kangaroo/Giraffe.
prompt = f"""Little Malabar dialogue. Malabar is a boy who talks to stars, planets and animals.
{sample_text}
Which speaker letter is Malabar? Reply with ONLY A, B, or C:"""
Which speaker letter is Malabar? Reply with ONLY the letter A, B, or C."""
result = ask_llm_for_name(prompt, client, model, debug_filename, 1)
# Extract the letter
match = re.search(r'\b([A-Z])\b', result.upper())
if match and match.group(1) in speakers:
return match.group(1)
return None
try:
response = client.chat.completions.create(
model=model,
messages=[
{"role": "user", "content": prompt}
],
temperature=0.0,
max_tokens=10, # Short response expected
extra_body={"thinking": {"type": "disabled"}} # Disable thinking
)
message = response.choices[0].message
raw_result = message.content.strip() if message.content else ""
# Get endpoint for debug info
endpoint = base_url or (str(client.base_url) if hasattr(client, 'base_url') else "OpenAI default")
# Save debug info
save_debug(debug_filename, prompt, f"RAW: {raw_result}", 1, model=model, endpoint=endpoint)
# Simple validation: result should be a single letter in speakers list
if raw_result and len(raw_result) == 1 and raw_result.upper() in speakers:
return raw_result.upper()
raise ValueError(f"Invalid response from LLM: expected single letter A/B/C, got '{raw_result}'")
except Exception as e:
endpoint = base_url or (str(client.base_url) if hasattr(client, 'base_url') else "OpenAI default")
save_debug(debug_filename, prompt, f"ERROR: {e}", 1, model=model, endpoint=endpoint)
raise # Re-raise the exception
def identify_speaker(speaker: str,
@@ -258,19 +257,25 @@ def identify_speaker(speaker: str,
known_names: Dict[str, str],
client: OpenAI, model: str, debug_filename: str, step: int) -> str:
"""Identify a single speaker's name."""
# Get this speaker's lines
speaker_lines = [(ts, text) for ts, spk, text in dialogue_lines
if spk == speaker]
# Build the full dialogue with proper speaker names
# For known speakers, use their real name
# For the target speaker, keep as "Speaker X"
# For unknown speakers, keep as "Speaker X"
full_dialogue_lines = []
for ts, spk, text in dialogue_lines:
if spk == speaker:
# Target speaker - keep as Speaker X (we're trying to identify them)
full_dialogue_lines.append(f'Speaker {spk}: "{text}"')
elif spk in known_names:
# Known speaker - use real name
full_dialogue_lines.append(f'{known_names[spk]}: "{text}"')
elif spk == "Song":
full_dialogue_lines.append(f'Song: "{text}"')
else:
# Unknown speaker - keep as Speaker X
full_dialogue_lines.append(f'Speaker {spk}: "{text}"')
# Prioritize lines with identifying keywords - Mars mentions first
mars_lines = [l for l in speaker_lines if 'mars' in l[1].lower()]
other_priority = [l for l in speaker_lines if 'mars' not in l[1].lower() and
any(k in l[1].lower() for k in ['surface', 'volcanoes', 'craters', 'my surface', 'up here', 'labyrinth'])]
other_lines = [l for l in speaker_lines if l not in mars_lines and l not in other_priority]
# Combine: Mars lines first, then other priority, then others, max 8 lines
selected_lines = (mars_lines + other_priority + other_lines)[:8]
speaker_sample = '\n'.join([f'{ts} "{text}"' for ts, text in selected_lines])
full_dialogue = '\n'.join(full_dialogue_lines)
# Build list of who we already know
known_info = "Known: " + ", ".join([f"Speaker {s} = {n}" for s, n in known_names.items()]) if known_names else ""
@@ -278,28 +283,33 @@ def identify_speaker(speaker: str,
prompt = f"""Little Malabar dialogue. {known_info}
CONTEXT:
- Malabar is the main character (a boy) who explores space
- Malabar is a boy who talks to stars, planets and animals
- Other speakers are usually celestial bodies (Moon, Earth, Mars, Sun, etc.)
- BUT speakers can also be other entities: volcanoes, the sea, a comet, a star, etc.
- Look at what the speaker talks about to identify them
- Look at what the speaker talks about AND what others say to them to identify them
IDENTIFICATION GUIDELINES:
- Speaker mentions "my surface" + warm/shaking → likely Earth
- Speaker mentions being "up here" with no ocean → likely Moon
- Speaker mentions "us volcanoes on Mars" → could be Mars OR Volcanoes
- Speaker says "us volcanoes on Mars" → this is Volcanoes (not Mars!)
- Speaker is spoken TO about Mars/volcanoes → could be Mars
- Speaker mentions the sea/ocean/waves → could be Sea/Ocean
- Speaker suggests going TO a place → likely describing that place from outside
- Use your judgment based on context and content
Speaker {speaker}'s lines:
{speaker_sample}
FULL DIALOGUE:
{full_dialogue}
Who is Speaker {speaker}? Reply with a single descriptive name (e.g., "Moon", "Earth", "Mars", "Volcanoes", "Sea", "Sun", "Comet", "Star"):"""
Who is Speaker {speaker}? Reply with ONLY the name, nothing else. Examples: Moon, Earth, Mars, Volcanoes, Sea, Sun, Jupiter:"""
return ask_llm_for_name(prompt, client, model, debug_filename, step)
# Get list of already known names to exclude from extraction
known_names_list = list(known_names.values()) if known_names else []
# Get base_url from client for debug info
base_url = client.base_url if hasattr(client, 'base_url') else ""
return ask_llm_for_name(prompt, client, model, debug_filename, step, exclude_names=known_names_list, base_url=base_url)
def process_lines_file(input_path: Path, client: OpenAI, model: str, force: bool = False) -> Path:
def process_lines_file(input_path: Path, client: OpenAI, model: str, force: bool = False, base_url: str = "") -> Path:
"""Process a single lines file using multi-step approach."""
progress = load_progress()
filename = input_path.name
@@ -347,16 +357,17 @@ def process_lines_file(input_path: Path, client: OpenAI, model: str, force: bool
# Step 1: Identify Malabar (from regular speakers only)
print(f" Step 1: Identifying Malabar...")
malabar_speaker = identify_malabar(lines, client, model, debug_filename)
if malabar_speaker:
try:
malabar_speaker = identify_malabar(lines, client, model, debug_filename, base_url)
final_mapping[malabar_speaker] = "Malabar"
print(f" Identified Speaker {malabar_speaker} = Malabar")
elif regular_speakers:
# Fallback: assume first regular speaker alphabetically is Malabar
malabar_speaker = sorted(regular_speakers)[0]
final_mapping[malabar_speaker] = "Malabar"
print(f" Fallback: Speaker {malabar_speaker} = Malabar")
except Exception as e:
print(f" Error: {e}")
if regular_speakers:
# Fallback: assume first regular speaker alphabetically is Malabar
malabar_speaker = sorted(regular_speakers)[0]
final_mapping[malabar_speaker] = "Malabar"
print(f" Fallback: Speaker {malabar_speaker} = Malabar")
# Step 2+: Identify remaining regular speakers one by one
remaining = [s for s in regular_speakers if s not in final_mapping]
@@ -364,26 +375,25 @@ def process_lines_file(input_path: Path, client: OpenAI, model: str, force: bool
for speaker in remaining:
print(f" Step {step}: Identifying Speaker {speaker}...")
name = identify_speaker(speaker, lines, final_mapping, client, model, debug_filename, step)
if name and len(name) > 1:
try:
name = identify_speaker(speaker, lines, final_mapping, client, model, debug_filename, step)
final_mapping[speaker] = name
print(f" Identified Speaker {speaker} = {name}")
else:
except Exception as e:
print(f" Error: {e}")
final_mapping[speaker] = f"Speaker_{speaker}"
print(f" Fallback: Speaker {speaker} = Speaker_{speaker}")
step += 1
# Handle unknown speakers (like "?")
for speaker in unknown_speakers:
print(f" Step {step}: Identifying unknown Speaker {speaker}...")
# Try to identify based on content
name = identify_speaker(speaker, lines, final_mapping, client, model, debug_filename, step)
if name and len(name) > 1 and name.lower() not in ['unknown', 'speaker', 'name']:
try:
name = identify_speaker(speaker, lines, final_mapping, client, model, debug_filename, step)
final_mapping[speaker] = name
print(f" Identified Speaker {speaker} = {name}")
else:
except Exception as e:
print(f" Error: {e}")
final_mapping[speaker] = "Unknown"
print(f" Marked Speaker {speaker} = Unknown")
step += 1
@@ -449,6 +459,9 @@ def main():
base_url, model = get_llm_config()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"), base_url=base_url)
print(f"Using model: {model}")
print(f"Endpoint: {base_url or 'OpenAI default'}")
# Discover input files
lines_files = get_input_files()
@@ -468,7 +481,7 @@ def main():
for input_path in lines_files:
try:
output_path = process_lines_file(input_path, client, model, force=force)
output_path = process_lines_file(input_path, client, model, force=force, base_url=base_url or "")
if output_path:
success_count += 1
except Exception as e: