Simplify the pipeline, merge the opening song

This commit is contained in:
2026-03-04 13:14:53 +08:00
parent 6153d386e7
commit 18eae970ad
3 changed files with 274 additions and 264 deletions

View File

@@ -11,7 +11,6 @@ Usage:
""" """
import os import os
import re
import sys import sys
import json import json
from pathlib import Path from pathlib import Path
@@ -30,162 +29,6 @@ def ensure_dirs():
OUTPUT_DIR.mkdir(exist_ok=True) OUTPUT_DIR.mkdir(exist_ok=True)
def split_words_by_sentences(words: list) -> list:
    """
    Split a list of words into sentence segments based on punctuation.

    A segment is closed after every word whose text ends with '.', '!' or '?'
    (optionally followed by closing quotes/brackets). Words left over after
    the last sentence end form a final segment of their own.

    Args:
        words: List of word dictionaries with 'text' key

    Returns:
        List of word segments, each representing a sentence
    """
    if not words:
        return []

    # Sentence-ending punctuation, optionally followed by closing quotes/brackets
    sentence_end_pattern = re.compile(r'[.!?]+["\')\]]*$')

    segments = []
    current_segment = []
    for word in words:
        current_segment.append(word)
        # A missing or None text counts as "no punctuation"; surrounding
        # whitespace is ignored so that "end. " still closes the sentence.
        text = (word.get("text") or "").strip()
        if sentence_end_pattern.search(text):
            segments.append(current_segment)
            current_segment = []

    # Keep any trailing words that never reached a sentence end
    if current_segment:
        segments.append(current_segment)
    return segments
def ends_with_sentence_punctuation(text: str) -> bool:
    """Return True when the stripped text finishes with '.', '!' or '?'.

    Closing quotes, parentheses or brackets after the punctuation are allowed.
    """
    trimmed = text.strip()
    found = re.search(r'[.!?]["\'\)\]]*$', trimmed)
    return found is not None
def merge_incomplete_sentences(utterances: list) -> list:
    """
    Merge consecutive utterances where the first doesn't end with sentence punctuation.

    This handles cases where AssemblyAI splits mid-sentence between speakers.
    A merged utterance keeps every field of its first utterance (including the
    speaker label); only 'words', 'text' and 'end' are extended.

    Args:
        utterances: List of utterance dictionaries ('text', 'words', 'end', ...)

    Returns:
        New list of shallow-copied utterances; the input list is returned
        unchanged when it is empty.
    """
    if not utterances:
        return utterances

    merged = []
    current = utterances[0].copy()

    for next_utt in utterances[1:]:
        current_text = current.get("text", "")

        if ends_with_sentence_punctuation(current_text):
            # Current is a complete sentence: emit it and start a new one
            merged.append(current)
            current = next_utt.copy()
            continue

        # Current is incomplete: absorb the next utterance into it.
        # List concatenation builds a new list, so the inputs stay untouched.
        current["words"] = current.get("words", []) + next_utt.get("words", [])
        current["text"] = current_text + " " + next_utt.get("text", "")
        # Only overwrite 'end' when the next utterance provides one; this
        # avoids a KeyError when the current utterance has no 'end' itself.
        if "end" in next_utt:
            current["end"] = next_utt["end"]

    # The utterance still being accumulated is always part of the result
    merged.append(current)
    return merged
def split_utterances_by_pauses(utterances: list, pause_threshold_ms: int = 1500) -> list:
    """
    Break long utterances apart at pauses and at sentence boundaries.

    Incomplete sentences are merged first, then every utterance with
    word-level data is cut wherever the gap between two words reaches the
    threshold, and each resulting run is cut again at sentence ends.

    Args:
        utterances: List of utterance dictionaries from AssemblyAI
        pause_threshold_ms: Minimum gap (in milliseconds) to create a new utterance

    Returns:
        List of split utterances
    """
    # Stitch together utterances that stop mid-sentence before splitting
    utterances = merge_incomplete_sentences(utterances)

    output = []
    for utterance in utterances:
        word_list = utterance.get("words", [])
        if not word_list:
            # Nothing to split on without word timings: pass through as-is
            output.append(utterance)
            continue

        speaker = utterance.get("speaker", "?")

        # Group the words into runs separated by long-enough pauses
        runs = [[word_list[0]]]
        for word in word_list[1:]:
            previous = runs[-1][-1]
            pause = word.get("start", 0) - previous.get("end", 0)
            if pause >= pause_threshold_ms:
                runs.append([word])
            else:
                runs[-1].append(word)

        # Cut every run at sentence ends and collect the pieces
        pieces = []
        for run in runs:
            for sentence_words in split_words_by_sentences(run):
                pieces.append({
                    "speaker": speaker,
                    "words": sentence_words,
                    "start": sentence_words[0]["start"],
                    "end": sentence_words[-1]["end"]
                })

        # Emit utterance-shaped dictionaries, dropping pieces with no text
        for piece in pieces:
            joined = " ".join(w.get("text", "") for w in piece["words"]).strip()
            if not joined:
                continue
            output.append({
                "speaker": piece["speaker"],
                "text": joined,
                "start": piece["start"],
                "end": piece["end"],
                "words": piece["words"]
            })

    return output
def load_progress() -> dict: def load_progress() -> dict:
"""Load progress tracking.""" """Load progress tracking."""
if PROGRESS_FILE.exists(): if PROGRESS_FILE.exists():
@@ -214,12 +57,11 @@ def transcribe_video(video_path: Path) -> dict:
print(f" Uploading {video_path.name}...") print(f" Uploading {video_path.name}...")
# Speaker diarization config # Speaker diarization config
# By default, AssemblyAI detects 1-10 speakers # Lower speaker_sensitivity = more aggressive speaker detection (more speakers)
# If you know the expected number, you can set speakers_expected
# Or set speaker_options for a range
speaker_options = aai.SpeakerOptions( speaker_options = aai.SpeakerOptions(
min_speakers=2, min_speakers=2,
max_speakers=10 # Allow up to 10 speakers max_speakers=10, # Allow up to 10 speakers
speaker_sensitivity=0.2 # Low value = more sensitive to speaker changes
) )
config = aai.TranscriptionConfig( config = aai.TranscriptionConfig(
@@ -237,35 +79,8 @@ def transcribe_video(video_path: Path) -> dict:
print(f" Transcription complete!") print(f" Transcription complete!")
# Convert utterances to dictionaries first # Convert transcript to serializable dictionary - NO POSTPROCESSING
raw_utterances = [] # Raw AssemblyAI output
for utt in transcript.utterances:
raw_utterances.append({
"speaker": utt.speaker,
"text": utt.text.strip(),
"start": utt.start,
"end": utt.end,
"confidence": utt.confidence if hasattr(utt, 'confidence') else None,
"words": [
{
"text": w.text,
"start": w.start,
"end": w.end,
"speaker": w.speaker if hasattr(w, 'speaker') else None
}
for w in (utt.words if hasattr(utt, 'words') else [])
]
})
# Split long utterances based on pauses
original_count = len(raw_utterances)
split_utterances = split_utterances_by_pauses(raw_utterances, pause_threshold_ms=1500)
new_count = len(split_utterances)
if new_count > original_count:
print(f" Split {original_count} utterances into {new_count} (based on 1.5s pauses)")
# Convert transcript to serializable dictionary
result = { result = {
"id": transcript.id, "id": transcript.id,
"status": str(transcript.status), "status": str(transcript.status),
@@ -274,7 +89,25 @@ def transcribe_video(video_path: Path) -> dict:
"confidence": transcript.confidence, "confidence": transcript.confidence,
"audio_duration": transcript.audio_duration, "audio_duration": transcript.audio_duration,
"language_code": transcript.json_response.get("language_code", "unknown"), "language_code": transcript.json_response.get("language_code", "unknown"),
"utterances": split_utterances "utterances": [
{
"speaker": utt.speaker,
"text": utt.text.strip(),
"start": utt.start,
"end": utt.end,
"confidence": utt.confidence if hasattr(utt, 'confidence') else None,
"words": [
{
"text": w.text,
"start": w.start,
"end": w.end,
"speaker": w.speaker if hasattr(w, 'speaker') else None
}
for w in (utt.words if hasattr(utt, 'words') else [])
]
}
for utt in transcript.utterances
]
} }
return result return result

View File

@@ -16,7 +16,12 @@ import sys
import json import json
import re import re
from pathlib import Path from pathlib import Path
from typing import List, Dict, Any from typing import List, Dict, Any, Tuple
# ============== Configuration ==============
# Split utterances on pauses longer than this (milliseconds)
PAUSE_THRESHOLD_MS = 1500
# ============== Configuration ============== # ============== Configuration ==============
@@ -51,6 +56,89 @@ def ensure_dirs():
OUTPUT_DIR.mkdir(exist_ok=True) OUTPUT_DIR.mkdir(exist_ok=True)
def split_words_by_sentences(words: list) -> list:
    """Split words into sentence segments based on punctuation.

    A segment is closed after every word whose text ends with '.', '!' or '?'
    (optionally followed by closing quotes/brackets). Words left over after
    the last sentence end form a final segment of their own.

    Args:
        words: List of word dictionaries with 'text' key

    Returns:
        List of word segments, each representing a sentence
    """
    if not words:
        return []

    sentence_end_pattern = re.compile(r'[.!?]+["\')\]]*$')

    segments = []
    current_segment = []
    for word in words:
        current_segment.append(word)
        # A missing or None text counts as "no punctuation"; surrounding
        # whitespace is ignored so that "end. " still closes the sentence.
        text = (word.get("text") or "").strip()
        if sentence_end_pattern.search(text):
            segments.append(current_segment)
            current_segment = []

    # Keep any trailing words that never reached a sentence end
    if current_segment:
        segments.append(current_segment)
    return segments
def split_utterances_by_pauses(utterances: list, pause_threshold_ms: int = 1500) -> list:
    """Break long utterances apart at pauses and at sentence boundaries.

    Every utterance with word-level data is cut wherever the gap between two
    consecutive words reaches ``pause_threshold_ms``; each resulting run is
    then cut again at sentence ends. Utterances without words pass through
    unchanged.
    """
    output = []
    for utterance in utterances:
        word_list = utterance.get("words", [])
        if not word_list:
            output.append(utterance)
            continue

        speaker = utterance.get("speaker", "?")

        # Group the words into runs separated by long-enough pauses
        runs = [[word_list[0]]]
        for word in word_list[1:]:
            previous = runs[-1][-1]
            pause = word.get("start", 0) - previous.get("end", 0)
            if pause >= pause_threshold_ms:
                runs.append([word])
            else:
                runs[-1].append(word)

        # Cut every run at sentence ends and collect the pieces
        pieces = []
        for run in runs:
            for sentence_words in split_words_by_sentences(run):
                pieces.append({
                    "speaker": speaker,
                    "words": sentence_words,
                    "start": sentence_words[0]["start"],
                    "end": sentence_words[-1]["end"]
                })

        # Emit utterance-shaped dictionaries, dropping pieces with no text
        for piece in pieces:
            joined = " ".join(w.get("text", "") for w in piece["words"]).strip()
            if not joined:
                continue
            output.append({
                "speaker": piece["speaker"],
                "text": joined,
                "start": piece["start"],
                "end": piece["end"],
                "words": piece["words"]
            })

    return output
def format_timestamp(ms: int) -> str: def format_timestamp(ms: int) -> str:
"""Format milliseconds as [mm:ss].""" """Format milliseconds as [mm:ss]."""
seconds = ms // 1000 seconds = ms // 1000
@@ -108,6 +196,73 @@ def merge_utterances(utterances: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
return merged return merged
def extract_opening_song_title(utterances: List[Dict[str, Any]]) -> Tuple[str, str, str, List[Dict[str, Any]]]:
    """
    Extract title from opening song (lines within first 15 seconds).

    The title is the text after 'Malabar' in the first opening-song utterance
    that mentions it; when nothing follows 'Malabar' there, the next non-empty
    opening-song utterance is taken as the title instead. All opening song
    lyrics except the title are joined into one string. The input utterance
    dictionaries are never modified.

    Args:
        utterances: Utterance dictionaries with 'start' (ms), 'text', 'speaker'

    Returns:
        (title, song_speaker, joined_song_lyrics, remaining_utterances).
        When no utterance starts within the first 15 seconds the result is
        ("", "", "", utterances).
    """
    OPENING_SONG_THRESHOLD_MS = 15000  # 15 seconds

    # Separate opening song utterances (within first 15s) from the rest
    opening_song = []
    remaining = []
    for utt in utterances:
        if utt.get("start", 0) < OPENING_SONG_THRESHOLD_MS:
            opening_song.append(utt)
        else:
            remaining.append(utt)

    if not opening_song:
        return "", "", "", utterances

    # Work on a private copy of the texts so the caller's data stays intact
    lyric_texts = [utt.get("text", "") for utt in opening_song]

    title = ""
    song_speaker = opening_song[0].get("speaker", "A")
    malabar_idx = -1
    title_utterance_idx = -1  # Utterance that is entirely the title (excluded from song)

    # Find the first utterance mentioning "Malabar" (case-insensitive)
    for i, text in enumerate(lyric_texts):
        if "malabar" not in text.lower():
            continue
        malabar_idx = i
        song_speaker = opening_song[i].get("speaker", song_speaker)

        # Title: whatever follows "Malabar" (and any whitespace/commas)
        match = re.search(r'Malabar[\s,]*(.+)', text, re.IGNORECASE)
        if match:
            title = re.sub(r'[.!?]+$', '', match.group(1).strip()).strip()
            # Keep only the lyric part (up to "Malabar") for the joined song
            lyric_texts[i] = re.sub(r'Malabar[\s,]*.+$', 'Malabar', text, flags=re.IGNORECASE).strip()
        break

    # If title not in same utterance as Malabar, check next utterance(s)
    if not title and malabar_idx >= 0:
        for j in range(malabar_idx + 1, len(lyric_texts)):
            next_text = lyric_texts[j].strip()
            if next_text:
                title = re.sub(r'[.!?]+$', '', next_text).strip()
                title_utterance_idx = j
                break

    # Join all opening song lyrics except the utterance used wholly as title
    song_lines = []
    for i, text in enumerate(lyric_texts):
        if i == title_utterance_idx:
            continue
        text = text.strip()
        if text:
            song_lines.append(text)

    joined_song = " ".join(song_lines)

    return title, song_speaker, joined_song, remaining
def format_lines(transcript_data: Dict[str, Any]) -> str: def format_lines(transcript_data: Dict[str, Any]) -> str:
""" """
Format transcript utterances into lines. Format transcript utterances into lines.
@@ -118,12 +273,32 @@ def format_lines(transcript_data: Dict[str, Any]) -> str:
if not utterances: if not utterances:
return "" return ""
# Split long utterances based on pauses and sentence boundaries
utterances = split_utterances_by_pauses(utterances, PAUSE_THRESHOLD_MS)
# Extract title from opening song (first 15 seconds) and get joined song lyrics
title, song_speaker, joined_song, utterances = extract_opening_song_title(utterances)
# Merge non-word utterances # Merge non-word utterances
merged = merge_utterances(utterances) merged = merge_utterances(utterances)
# Format lines # Format lines
lines = [] lines = []
# Add title as first line if found (use "Song" as speaker)
if title:
lines.append(f"[00:00](Song) {title}")
# Add joined opening song as second line if exists (use "Song" as speaker)
if joined_song:
lines.append(f"[00:01](Song) {joined_song}")
# Format remaining lines (skip those within first 15s as they're in the joined song)
for utt in merged: for utt in merged:
# Skip utterances within opening song window (they're already included in joined_song)
if utt.get("start", 0) < 15000:
continue
text = utt.get("text", "").strip() text = utt.get("text", "").strip()
# Skip standalone non-words unless they're at the end # Skip standalone non-words unless they're at the end
@@ -155,10 +330,10 @@ def process_transcript(input_path: Path) -> Path:
with open(input_path, 'r', encoding='utf-8') as f: with open(input_path, 'r', encoding='utf-8') as f:
transcript_data = json.load(f) transcript_data = json.load(f)
utterance_count = len(transcript_data.get("utterances", [])) raw_count = len(transcript_data.get("utterances", []))
print(f" Loaded {utterance_count} utterances") print(f" Loaded {raw_count} raw utterances")
# Format lines # Format lines (includes splitting by pauses)
formatted_text = format_lines(transcript_data) formatted_text = format_lines(transcript_data)
# Save output # Save output

View File

@@ -32,6 +32,7 @@ from openai import OpenAI
INPUT_DIR = Path("_lines") INPUT_DIR = Path("_lines")
OUTPUT_DIR = Path("_speakers") OUTPUT_DIR = Path("_speakers")
DEBUG_DIR = Path("_speakers_debug") DEBUG_DIR = Path("_speakers_debug")
PROGRESS_FILE = Path(".step3_progress.json")
# Examples of good speaker names (for reference, not a restricted list) # Examples of good speaker names (for reference, not a restricted list)
NAME_EXAMPLES = ["Malabar", "Sun", "Jupiter", "Kangaroo", "Mole"] NAME_EXAMPLES = ["Malabar", "Sun", "Jupiter", "Kangaroo", "Mole"]
@@ -59,6 +60,20 @@ def ensure_dirs():
DEBUG_DIR.mkdir(exist_ok=True) DEBUG_DIR.mkdir(exist_ok=True)
def load_progress() -> dict:
    """Return the progress dictionary stored in PROGRESS_FILE.

    An empty dictionary is returned when the progress file does not exist.
    """
    if not PROGRESS_FILE.exists():
        return {}
    with open(PROGRESS_FILE, 'r', encoding='utf-8') as handle:
        stored = json.load(handle)
    return stored
def save_progress(progress: dict):
    """Write the progress dictionary to PROGRESS_FILE as indented JSON.

    Any existing file content is overwritten.
    """
    with open(PROGRESS_FILE, 'w', encoding='utf-8') as handle:
        json.dump(progress, handle, indent=2)
def get_llm_config() -> Tuple[str, str]: def get_llm_config() -> Tuple[str, str]:
"""Get LLM configuration from environment.""" """Get LLM configuration from environment."""
api_key = os.getenv("OPENAI_API_KEY") api_key = os.getenv("OPENAI_API_KEY")
@@ -83,7 +98,9 @@ def get_llm_config() -> Tuple[str, str]:
def parse_lines(lines_text: str) -> List[Tuple[str, str, str]]: def parse_lines(lines_text: str) -> List[Tuple[str, str, str]]:
"""Parse formatted lines. Returns list of (timestamp, speaker_label, text).""" """Parse formatted lines. Returns list of (timestamp, speaker_label, text)."""
pattern = r'^(\[\d{2}:\d{2}\])\(Speaker ([A-Z?])\) (.+)$' # Pattern to match both (Speaker X) and (Song) formats
# Speaker "Song" is reserved for the opening song
pattern = r'^(\[\d{2}:\d{2}\])\((Speaker [A-Z?]|Song)\) (.+)$'
result = [] result = []
for line in lines_text.strip().split('\n'): for line in lines_text.strip().split('\n'):
@@ -94,60 +111,19 @@ def parse_lines(lines_text: str) -> List[Tuple[str, str, str]]:
match = re.match(pattern, line) match = re.match(pattern, line)
if match: if match:
timestamp = match.group(1) timestamp = match.group(1)
speaker = match.group(2) speaker_raw = match.group(2)
text = match.group(3) text = match.group(3)
# Normalize: "Speaker X" -> "X", "Song" -> "Song"
if speaker_raw == "Song":
speaker = "Song"
else:
# Extract letter from "Speaker X"
speaker = speaker_raw.replace("Speaker ", "")
result.append((timestamp, speaker, text)) result.append((timestamp, speaker, text))
return result return result
def parse_timestamp(ts: str) -> int:
    """Convert a leading [mm:ss] timestamp into a number of seconds.

    Returns 0 when the string does not start with a two-digit [mm:ss] stamp.
    """
    found = re.match(r'\[(\d{2}):(\d{2})\]', ts)
    if found is None:
        return 0
    mins, secs = (int(part) for part in found.groups())
    return mins * 60 + secs
def classify_speakers_by_time(lines: List[Tuple[str, str, str]]) -> Tuple[set, set]:
    """Split speakers into song-only speakers and dialogue speakers.

    A speaker with at least one line after the 15-second mark is a dialogue
    speaker. A speaker whose lines all fall at or before 15 seconds is a song
    speaker.

    Returns:
        (song_speakers, dialogue_speakers)
    """
    # Anyone heard after the 15 second mark takes part in the dialogue
    dialogue_speakers = {
        spk for ts, spk, _ in lines
        if parse_timestamp(ts) > 15
    }
    # Everyone else who has a line within the first 15 seconds is song-only
    song_speakers = {
        spk for ts, spk, _ in lines
        if spk not in dialogue_speakers and parse_timestamp(ts) <= 15
    }
    return song_speakers, dialogue_speakers
def format_dialogue_with_names(lines: List[Tuple[str, str, str]], speaker_names: Dict[str, str]) -> str:
    """Render dialogue lines as 'timestamp(name) text', one per output line.

    Lines at or before the 15-second mark (the opening song) are left out.
    Speakers missing from ``speaker_names`` are shown as 'Speaker_<label>'.
    """
    named = [
        (timestamp, speaker_names.get(speaker, f"Speaker_{speaker}"), text)
        for timestamp, speaker, text in lines
        if parse_timestamp(timestamp) > 15
    ]
    return '\n'.join(f'{timestamp}({name}) {text}' for timestamp, name, text in named)
def save_debug(filename: str, request: str, response: str, step: int): def save_debug(filename: str, request: str, response: str, step: int):
"""Save debug info to _speakers_debug folder.""" """Save debug info to _speakers_debug folder."""
debug_file = DEBUG_DIR / f"{filename}_step{step}.txt" debug_file = DEBUG_DIR / f"{filename}_step{step}.txt"
@@ -244,9 +220,9 @@ def ask_llm_for_name(prompt: str, client: OpenAI, model: str, debug_filename: st
def identify_malabar(dialogue_lines: List[Tuple[str, str, str]], def identify_malabar(dialogue_lines: List[Tuple[str, str, str]],
client: OpenAI, model: str, debug_filename: str) -> Optional[str]: client: OpenAI, model: str, debug_filename: str) -> Optional[str]:
"""Identify which speaker is Malabar.""" """Identify which speaker is Malabar."""
# Only consider single-letter speakers (exclude "?" and other special markers) # Only consider single-letter speakers (exclude "?", "Song", and other special markers)
speakers = sorted(set(speaker for _, speaker, _ in dialogue_lines speakers = sorted(set(speaker for _, speaker, _ in dialogue_lines
if parse_timestamp(_) > 15 and len(speaker) == 1 and speaker.isalpha())) if len(speaker) == 1 and speaker.isalpha()))
if not speakers: if not speakers:
return None return None
@@ -255,7 +231,7 @@ def identify_malabar(dialogue_lines: List[Tuple[str, str, str]],
samples = [] samples = []
for speaker in speakers: for speaker in speakers:
lines = [(ts, text) for ts, spk, text in dialogue_lines lines = [(ts, text) for ts, spk, text in dialogue_lines
if spk == speaker and parse_timestamp(ts) > 15][:3] if spk == speaker][:3]
for ts, text in lines: for ts, text in lines:
samples.append(f'{speaker}: "{text}"') samples.append(f'{speaker}: "{text}"')
@@ -282,9 +258,9 @@ def identify_speaker(speaker: str,
known_names: Dict[str, str], known_names: Dict[str, str],
client: OpenAI, model: str, debug_filename: str, step: int) -> str: client: OpenAI, model: str, debug_filename: str, step: int) -> str:
"""Identify a single speaker's name.""" """Identify a single speaker's name."""
# Get this speaker's lines (after 15s) # Get this speaker's lines
speaker_lines = [(ts, text) for ts, spk, text in dialogue_lines speaker_lines = [(ts, text) for ts, spk, text in dialogue_lines
if spk == speaker and parse_timestamp(ts) > 15] if spk == speaker]
# Prioritize lines with identifying keywords - Mars mentions first # Prioritize lines with identifying keywords - Mars mentions first
mars_lines = [l for l in speaker_lines if 'mars' in l[1].lower()] mars_lines = [l for l in speaker_lines if 'mars' in l[1].lower()]
@@ -323,8 +299,18 @@ Who is Speaker {speaker}? Reply with a single descriptive name (e.g., "Moon", "E
return ask_llm_for_name(prompt, client, model, debug_filename, step) return ask_llm_for_name(prompt, client, model, debug_filename, step)
def process_lines_file(input_path: Path, client: OpenAI, model: str) -> Path: def process_lines_file(input_path: Path, client: OpenAI, model: str, force: bool = False) -> Path:
"""Process a single lines file using multi-step approach.""" """Process a single lines file using multi-step approach."""
progress = load_progress()
filename = input_path.name
# Check if already processed
if not force and filename in progress and progress[filename].get("status") == "completed":
output_path = Path(progress[filename]["output_file"])
if output_path.exists():
print(f"Skipping {filename} (already processed)")
return output_path
print(f"\n{'='*50}") print(f"\n{'='*50}")
print(f"Processing: {input_path.name}") print(f"Processing: {input_path.name}")
print(f"{'='*50}") print(f"{'='*50}")
@@ -343,21 +329,21 @@ def process_lines_file(input_path: Path, client: OpenAI, model: str) -> Path:
print(" No valid lines found!") print(" No valid lines found!")
return None return None
# Classify speakers # Get unique speakers (excluding "Song" - already known)
song_speakers, dialogue_speakers = classify_speakers_by_time(lines) all_speakers = set(speaker for _, speaker, _ in lines)
print(f" Dialogue speakers: {', '.join(sorted(dialogue_speakers))}") speakers_to_identify = [s for s in all_speakers if s != "Song"]
# Build mapping starting with song speakers print(f" Speakers to identify: {', '.join(sorted(speakers_to_identify))}")
# Build mapping
final_mapping = {} final_mapping = {}
for speaker in song_speakers:
final_mapping[speaker] = "Song"
if not dialogue_speakers: if not speakers_to_identify:
print(f" All lines are within first 15 seconds (opening song)") print(f" No speakers to identify (only Song present)")
else: else:
# Separate regular speakers from unknown/merged speakers (like "?") # Separate regular speakers from unknown/merged speakers (like "?")
regular_speakers = [s for s in dialogue_speakers if s.isalpha()] regular_speakers = [s for s in speakers_to_identify if s.isalpha()]
unknown_speakers = [s for s in dialogue_speakers if not s.isalpha()] unknown_speakers = [s for s in speakers_to_identify if not s.isalpha()]
# Step 1: Identify Malabar (from regular speakers only) # Step 1: Identify Malabar (from regular speakers only)
print(f" Step 1: Identifying Malabar...") print(f" Step 1: Identifying Malabar...")
@@ -414,6 +400,14 @@ def process_lines_file(input_path: Path, client: OpenAI, model: str) -> Path:
with open(output_path, 'w', encoding='utf-8') as f: with open(output_path, 'w', encoding='utf-8') as f:
f.write(output_text) f.write(output_text)
# Update progress
progress[filename] = {
"status": "completed",
"output_file": str(output_path),
"speaker_mapping": final_mapping
}
save_progress(progress)
print(f" Saved to: {output_path}") print(f" Saved to: {output_path}")
return output_path return output_path
@@ -422,13 +416,13 @@ def process_lines_file(input_path: Path, client: OpenAI, model: str) -> Path:
def apply_speaker_names(lines: List[Tuple[str, str, str]], mapping: Dict[str, str]) -> str: def apply_speaker_names(lines: List[Tuple[str, str, str]], mapping: Dict[str, str]) -> str:
"""Apply speaker names to lines. """Apply speaker names to lines.
SPECIAL: Lines in first 15 seconds are labeled as "Song" (opening theme). SPECIAL: "Song" speaker is passed through unchanged (already labeled in Step 2).
""" """
result_lines = [] result_lines = []
for timestamp, speaker, text in lines: for timestamp, speaker, text in lines:
# Check if this line is in the first 15 seconds # "Song" speaker is already correctly labeled - pass through unchanged
if parse_timestamp(timestamp) <= 15: if speaker == "Song":
speaker_name = "Song" speaker_name = "Song"
else: else:
speaker_name = mapping.get(speaker, f"Speaker_{speaker}") speaker_name = mapping.get(speaker, f"Speaker_{speaker}")
@@ -448,6 +442,9 @@ def get_input_files() -> list[Path]:
def main(): def main():
ensure_dirs() ensure_dirs()
# Check for force flag
force = "--force" in sys.argv or "-f" in sys.argv
# Get LLM config # Get LLM config
base_url, model = get_llm_config() base_url, model = get_llm_config()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"), base_url=base_url) client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"), base_url=base_url)
@@ -460,6 +457,8 @@ def main():
sys.exit(1) sys.exit(1)
print(f"Found {len(lines_files)} transcript(s) in {INPUT_DIR}/") print(f"Found {len(lines_files)} transcript(s) in {INPUT_DIR}/")
if force:
print("Force mode: ON (reprocessing all files)")
print(f"Debug info will be saved to {DEBUG_DIR}/") print(f"Debug info will be saved to {DEBUG_DIR}/")
print("") print("")
@@ -469,10 +468,13 @@ def main():
for input_path in lines_files: for input_path in lines_files:
try: try:
output_path = process_lines_file(input_path, client, model) output_path = process_lines_file(input_path, client, model, force=force)
if output_path: if output_path:
success_count += 1 success_count += 1
except Exception as e: except Exception as e:
progress = load_progress()
progress[input_path.name] = {"status": "error", "error": str(e)}
save_progress(progress)
print(f"\n❌ Failed to process {input_path.name}: {e}") print(f"\n❌ Failed to process {input_path.name}: {e}")
import traceback import traceback
traceback.print_exc() traceback.print_exc()