fix speaker recognition

This commit is contained in:
2026-03-04 17:02:59 +08:00
parent e085eeddbc
commit 2395c048ff
18 changed files with 652 additions and 5794 deletions

View File

@@ -124,10 +124,18 @@ def parse_lines(lines_text: str) -> List[Tuple[str, str, str]]:
return result
def save_debug(filename: str, request: str, response: str, step: int):
def save_debug(filename: str, request: str, response: str, step: int, model: str = "", endpoint: str = ""):
"""Save debug info to _speakers_debug folder."""
debug_file = DEBUG_DIR / f"{filename}_step{step}.txt"
with open(debug_file, 'w', encoding='utf-8') as f:
f.write("=" * 60 + "\n")
f.write("DEBUG INFO:\n")
f.write("=" * 60 + "\n")
if model:
f.write(f"Model: {model}\n")
if endpoint:
f.write(f"Endpoint: {endpoint}\n")
f.write("\n")
f.write("=" * 60 + "\n")
f.write("REQUEST:\n")
f.write("=" * 60 + "\n\n")
@@ -139,86 +147,53 @@ def save_debug(filename: str, request: str, response: str, step: int):
f.write(response)
def extract_name_from_response(text: str) -> str:
"""Extract a single name from LLM response text."""
text = text.strip()
# Expanded list of valid names - includes celestial bodies and other entities
def ask_llm_for_name(prompt: str, client: OpenAI, model: str, debug_filename: str, step: int, exclude_names: list = None, base_url: str = "") -> str:
"""Ask LLM for a single name. Returns the name or raises exception if invalid."""
# Valid speaker names
valid_names = ['Malabar', 'Moon', 'Earth', 'Mars', 'Sun', 'Jupiter', 'Saturn', 'Venus',
'Mercury', 'Neptune', 'Uranus', 'Pluto', 'Galaxy', 'Star', 'Kangaroo',
'Giraffe', 'Volcano', 'Volcanoes', 'Sea', 'Ocean', 'Wave', 'Comet',
'Asteroid', 'Meteor', 'Nebula', 'Black Hole', 'Alien', 'Robot', 'Scientist']
# Check if the response is just a single word (the name)
if ' ' not in text and len(text) > 1:
return text.strip('"\'')
# Filter out excluded names
if exclude_names:
valid_names = [n for n in valid_names if n not in exclude_names]
# Look for explicit "Answer: X" or "Name: X" patterns
answer_match = re.search(r'(?:answer|name|is)[:\s]+["\']?([A-Z][a-z]+)', text, re.IGNORECASE)
if answer_match:
return answer_match.group(1)
# Check last few lines for a valid name
lines = text.split('\n')
for line in reversed(lines[-5:]): # Check last 5 lines
line = line.strip().strip('"\'')
for name in valid_names:
if line.lower() == name.lower():
return name
if re.search(rf'\b{name}\b', line, re.IGNORECASE):
return name
# Default: return first valid name found
for name in valid_names:
if re.search(rf'\b{name}\b', text, re.IGNORECASE):
return name
# If no known name found, extract any capitalized word as potential name
for line in text.split('\n'):
line = line.strip()
match = re.search(r'\b([A-Z][a-z]{2,})\b', line)
if match:
word = match.group(1)
if word.lower() not in ['the', 'and', 'but', 'for', 'are', 'was', 'were', 'been', 'this', 'that']:
return word
return ""
def ask_llm_for_name(prompt: str, client: OpenAI, model: str, debug_filename: str, step: int) -> str:
"""Ask LLM for a single name. Returns the name or empty string if failed."""
try:
response = client.chat.completions.create(
model=model,
messages=[
{"role": "system", "content": "Reply with ONLY a single word - the name. No explanation."},
{"role": "user", "content": prompt}
],
temperature=0.0,
max_tokens=1000
max_tokens=20, # Short response expected
extra_body={"thinking": {"type": "disabled"}} # Disable thinking
)
message = response.choices[0].message
raw_result = message.content or ""
# If content is empty but reasoning_content exists, use that
if not raw_result and hasattr(message, 'reasoning_content') and message.reasoning_content:
raw_result = message.reasoning_content
# Extract name from the response
result = extract_name_from_response(raw_result)
raw_result = message.content.strip() if message.content else ""
# Save debug info
save_debug(debug_filename, prompt, f"RAW: {raw_result[:800]}\n\nEXTRACTED: {result}", step)
save_debug(debug_filename, prompt, f"RAW: {raw_result}", step, model=model, endpoint=base_url or "OpenAI default")
# Simple validation: result should be one of the valid names
if raw_result in valid_names:
return raw_result
# Check case-insensitive match
for name in valid_names:
if raw_result.lower() == name.lower():
return name
raise ValueError(f"Invalid response from LLM: expected one of {valid_names}, got '{raw_result}'")
return result
except Exception as e:
save_debug(debug_filename, prompt, f"ERROR: {e}", step)
return ""
save_debug(debug_filename, prompt, f"ERROR: {e}", step, model=model, endpoint=base_url or "OpenAI default")
raise # Re-raise the exception
def identify_malabar(dialogue_lines: List[Tuple[str, str, str]],
client: OpenAI, model: str, debug_filename: str) -> Optional[str]:
client: OpenAI, model: str, debug_filename: str, base_url: str = "") -> Optional[str]:
"""Identify which speaker is Malabar."""
# Only consider single-letter speakers (exclude "?", "Song", and other special markers)
speakers = sorted(set(speaker for _, speaker, _ in dialogue_lines
@@ -227,30 +202,54 @@ def identify_malabar(dialogue_lines: List[Tuple[str, str, str]],
if not speakers:
return None
# Get sample lines from each speaker
# Output ALL lines in chronological order (preserving original order)
samples = []
for speaker in speakers:
lines = [(ts, text) for ts, spk, text in dialogue_lines
if spk == speaker][:3]
for ts, text in lines:
samples.append(f'{speaker}: "{text}"')
for ts, spk, text in dialogue_lines:
# Skip Song speaker for Malabar identification
if spk == "Song":
continue
# Only include speakers we're trying to identify
if spk in speakers:
samples.append(f'{spk}: "{text}"')
sample_text = '\n'.join(samples)
prompt = f"""Little Malabar dialogue. Malabar is the boy who addresses Kangaroo/Giraffe.
prompt = f"""Little Malabar dialogue. Malabar is a boy who talks to stars, planets and animals.
{sample_text}
Which speaker letter is Malabar? Reply with ONLY A, B, or C:"""
Which speaker letter is Malabar? Reply with ONLY the letter A, B, or C."""
result = ask_llm_for_name(prompt, client, model, debug_filename, 1)
# Extract the letter
match = re.search(r'\b([A-Z])\b', result.upper())
if match and match.group(1) in speakers:
return match.group(1)
return None
try:
response = client.chat.completions.create(
model=model,
messages=[
{"role": "user", "content": prompt}
],
temperature=0.0,
max_tokens=10, # Short response expected
extra_body={"thinking": {"type": "disabled"}} # Disable thinking
)
message = response.choices[0].message
raw_result = message.content.strip() if message.content else ""
# Get endpoint for debug info
endpoint = base_url or (str(client.base_url) if hasattr(client, 'base_url') else "OpenAI default")
# Save debug info
save_debug(debug_filename, prompt, f"RAW: {raw_result}", 1, model=model, endpoint=endpoint)
# Simple validation: result should be a single letter in speakers list
if raw_result and len(raw_result) == 1 and raw_result.upper() in speakers:
return raw_result.upper()
raise ValueError(f"Invalid response from LLM: expected single letter A/B/C, got '{raw_result}'")
except Exception as e:
endpoint = base_url or (str(client.base_url) if hasattr(client, 'base_url') else "OpenAI default")
save_debug(debug_filename, prompt, f"ERROR: {e}", 1, model=model, endpoint=endpoint)
raise # Re-raise the exception
def identify_speaker(speaker: str,
@@ -258,19 +257,25 @@ def identify_speaker(speaker: str,
known_names: Dict[str, str],
client: OpenAI, model: str, debug_filename: str, step: int) -> str:
"""Identify a single speaker's name."""
# Get this speaker's lines
speaker_lines = [(ts, text) for ts, spk, text in dialogue_lines
if spk == speaker]
# Build the full dialogue with proper speaker names
# For known speakers, use their real name
# For the target speaker, keep as "Speaker X"
# For unknown speakers, keep as "Speaker X"
full_dialogue_lines = []
for ts, spk, text in dialogue_lines:
if spk == speaker:
# Target speaker - keep as Speaker X (we're trying to identify them)
full_dialogue_lines.append(f'Speaker {spk}: "{text}"')
elif spk in known_names:
# Known speaker - use real name
full_dialogue_lines.append(f'{known_names[spk]}: "{text}"')
elif spk == "Song":
full_dialogue_lines.append(f'Song: "{text}"')
else:
# Unknown speaker - keep as Speaker X
full_dialogue_lines.append(f'Speaker {spk}: "{text}"')
# Prioritize lines with identifying keywords - Mars mentions first
mars_lines = [l for l in speaker_lines if 'mars' in l[1].lower()]
other_priority = [l for l in speaker_lines if 'mars' not in l[1].lower() and
any(k in l[1].lower() for k in ['surface', 'volcanoes', 'craters', 'my surface', 'up here', 'labyrinth'])]
other_lines = [l for l in speaker_lines if l not in mars_lines and l not in other_priority]
# Combine: Mars lines first, then other priority, then others, max 8 lines
selected_lines = (mars_lines + other_priority + other_lines)[:8]
speaker_sample = '\n'.join([f'{ts} "{text}"' for ts, text in selected_lines])
full_dialogue = '\n'.join(full_dialogue_lines)
# Build list of who we already know
known_info = "Known: " + ", ".join([f"Speaker {s} = {n}" for s, n in known_names.items()]) if known_names else ""
@@ -278,28 +283,33 @@ def identify_speaker(speaker: str,
prompt = f"""Little Malabar dialogue. {known_info}
CONTEXT:
- Malabar is the main character (a boy) who explores space
- Malabar is a boy who talks to stars, planets and animals
- Other speakers are usually celestial bodies (Moon, Earth, Mars, Sun, etc.)
- BUT speakers can also be other entities: volcanoes, the sea, a comet, a star, etc.
- Look at what the speaker talks about to identify them
- Look at what the speaker talks about AND what others say to them to identify them
IDENTIFICATION GUIDELINES:
- Speaker mentions "my surface" + warm/shaking → likely Earth
- Speaker mentions being "up here" with no ocean → likely Moon
- Speaker mentions "us volcanoes on Mars" → could be Mars OR Volcanoes
- Speaker says "us volcanoes on Mars" → this is Volcanoes (not Mars!)
- Speaker is spoken TO about Mars/volcanoes → could be Mars
- Speaker mentions the sea/ocean/waves → could be Sea/Ocean
- Speaker suggests going TO a place → likely describing that place from outside
- Use your judgment based on context and content
Speaker {speaker}'s lines:
{speaker_sample}
FULL DIALOGUE:
{full_dialogue}
Who is Speaker {speaker}? Reply with a single descriptive name (e.g., "Moon", "Earth", "Mars", "Volcanoes", "Sea", "Sun", "Comet", "Star"):"""
Who is Speaker {speaker}? Reply with ONLY the name, nothing else. Examples: Moon, Earth, Mars, Volcanoes, Sea, Sun, Jupiter:"""
return ask_llm_for_name(prompt, client, model, debug_filename, step)
# Get list of already known names to exclude from extraction
known_names_list = list(known_names.values()) if known_names else []
# Get base_url from client for debug info
base_url = client.base_url if hasattr(client, 'base_url') else ""
return ask_llm_for_name(prompt, client, model, debug_filename, step, exclude_names=known_names_list, base_url=base_url)
def process_lines_file(input_path: Path, client: OpenAI, model: str, force: bool = False) -> Path:
def process_lines_file(input_path: Path, client: OpenAI, model: str, force: bool = False, base_url: str = "") -> Path:
"""Process a single lines file using multi-step approach."""
progress = load_progress()
filename = input_path.name
@@ -347,16 +357,17 @@ def process_lines_file(input_path: Path, client: OpenAI, model: str, force: bool
# Step 1: Identify Malabar (from regular speakers only)
print(f" Step 1: Identifying Malabar...")
malabar_speaker = identify_malabar(lines, client, model, debug_filename)
if malabar_speaker:
try:
malabar_speaker = identify_malabar(lines, client, model, debug_filename, base_url)
final_mapping[malabar_speaker] = "Malabar"
print(f" Identified Speaker {malabar_speaker} = Malabar")
elif regular_speakers:
# Fallback: assume first regular speaker alphabetically is Malabar
malabar_speaker = sorted(regular_speakers)[0]
final_mapping[malabar_speaker] = "Malabar"
print(f" Fallback: Speaker {malabar_speaker} = Malabar")
except Exception as e:
print(f" Error: {e}")
if regular_speakers:
# Fallback: assume first regular speaker alphabetically is Malabar
malabar_speaker = sorted(regular_speakers)[0]
final_mapping[malabar_speaker] = "Malabar"
print(f" Fallback: Speaker {malabar_speaker} = Malabar")
# Step 2+: Identify remaining regular speakers one by one
remaining = [s for s in regular_speakers if s not in final_mapping]
@@ -364,26 +375,25 @@ def process_lines_file(input_path: Path, client: OpenAI, model: str, force: bool
for speaker in remaining:
print(f" Step {step}: Identifying Speaker {speaker}...")
name = identify_speaker(speaker, lines, final_mapping, client, model, debug_filename, step)
if name and len(name) > 1:
try:
name = identify_speaker(speaker, lines, final_mapping, client, model, debug_filename, step)
final_mapping[speaker] = name
print(f" Identified Speaker {speaker} = {name}")
else:
except Exception as e:
print(f" Error: {e}")
final_mapping[speaker] = f"Speaker_{speaker}"
print(f" Fallback: Speaker {speaker} = Speaker_{speaker}")
step += 1
# Handle unknown speakers (like "?")
for speaker in unknown_speakers:
print(f" Step {step}: Identifying unknown Speaker {speaker}...")
# Try to identify based on content
name = identify_speaker(speaker, lines, final_mapping, client, model, debug_filename, step)
if name and len(name) > 1 and name.lower() not in ['unknown', 'speaker', 'name']:
try:
name = identify_speaker(speaker, lines, final_mapping, client, model, debug_filename, step)
final_mapping[speaker] = name
print(f" Identified Speaker {speaker} = {name}")
else:
except Exception as e:
print(f" Error: {e}")
final_mapping[speaker] = "Unknown"
print(f" Marked Speaker {speaker} = Unknown")
step += 1
@@ -449,6 +459,9 @@ def main():
base_url, model = get_llm_config()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"), base_url=base_url)
print(f"Using model: {model}")
print(f"Endpoint: {base_url or 'OpenAI default'}")
# Discover input files
lines_files = get_input_files()
@@ -468,7 +481,7 @@ def main():
for input_path in lines_files:
try:
output_path = process_lines_file(input_path, client, model, force=force)
output_path = process_lines_file(input_path, client, model, force=force, base_url=base_url or "")
if output_path:
success_count += 1
except Exception as e: