Simplify the pipeline, merge the opening song

2026-03-04 13:14:53 +08:00
parent 6153d386e7
commit 18eae970ad
3 changed files with 274 additions and 264 deletions
--- a/step2_format.py
+++ b/step2_format.py
@@ -16,7 +16,12 @@ import sys
 import json
 import re
 from pathlib import Path
-from typing import List, Dict, Any
+from typing import List, Dict, Any, Tuple
+
+# ============== Configuration ==============
+
+# Split utterances on pauses of at least this length (milliseconds)
+PAUSE_THRESHOLD_MS = 1500

 # ============== Configuration ==============

@@ -51,6 +56,89 @@ def ensure_dirs():
    OUTPUT_DIR.mkdir(exist_ok=True)


+def split_words_by_sentences(words: list) -> list:
+    """Group consecutive word dicts into sentences.
+
+    A sentence closes on a word whose "text" ends in '.', '!' or '?',
+    optionally followed by closing quotes or brackets. Words left over after
+    the last such word form a final group. Empty input yields [].
+    """
+    ends_sentence = re.compile(r'[.!?]+["\')\]]*$').search
+    sentences = []
+    pending = []
+
+    for entry in words or []:
+        pending.append(entry)
+        if ends_sentence(entry.get("text", "")):
+            sentences.append(pending)
+            pending = []
+
+    if pending:
+        sentences.append(pending)
+    return sentences
+
+
+def split_utterances_by_pauses(utterances: list, pause_threshold_ms: int = 1500) -> list:
+    """Break utterances into shorter ones at long pauses and sentence ends.
+
+    The words of an utterance are first cut wherever the gap between one
+    word's "end" and the next word's "start" is at least
+    ``pause_threshold_ms``. Every resulting run of words is then cut again at
+    sentence boundaries by split_words_by_sentences().
+
+    Args:
+        utterances: Utterance dicts. Those with a missing or empty "words"
+            entry are passed through unchanged.
+        pause_threshold_ms: Smallest gap between two consecutive words that
+            forces a split.
+
+    Returns:
+        A new list of utterances. Each split piece is a dict holding
+        "speaker" ("?" when the source utterance has none), "text" (the
+        words' texts joined by spaces and stripped), "start", "end" and
+        "words". Pieces whose text is empty are left out.
+    """
+    output = []
+
+    def flush(run, who):
+        # Cut one pause-delimited run into sentences and emit the non-empty ones.
+        for piece in split_words_by_sentences(run):
+            # Timing comes from the first and last word of the piece.
+            begin, finish = piece[0]["start"], piece[-1]["end"]
+            sentence = " ".join(w.get("text", "") for w in piece).strip()
+            if not sentence:
+                continue
+            output.append({
+                "speaker": who,
+                "text": sentence,
+                "start": begin,
+                "end": finish,
+                "words": piece,
+            })
+
+    for utterance in utterances:
+        word_list = utterance.get("words", [])
+        if not word_list:
+            # Nothing to split on: keep the utterance exactly as it came in.
+            output.append(utterance)
+            continue
+
+        who = utterance.get("speaker", "?")
+        run = []
+        for word in word_list:
+            # The first word has no predecessor, so it can never close a run.
+            # Missing timestamps count as 0 when measuring the gap.
+            if run and word.get("start", 0) - run[-1].get("end", 0) >= pause_threshold_ms:
+                flush(run, who)
+                run = []
+            run.append(word)
+
+        # Whatever follows the last pause is the final run.
+        flush(run, who)
+
+    return output
+
+
 def format_timestamp(ms: int) -> str:
    """Format milliseconds as [mm:ss]."""
    seconds = ms // 1000
@@ -108,6 +196,73 @@ def merge_utterances(utterances: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    return merged


+def extract_opening_song_title(utterances: List[Dict[str, Any]]) -> Tuple[str, str, str, List[Dict[str, Any]]]:
+    """
+    Separate the opening song (utterances starting within the first 15 seconds)
+    from the rest and pull the title out of it.
+
+    The title is the text after 'Malabar' (matched case-insensitively) in the
+    first opening utterance that mentions it; the words up to and including
+    'Malabar' stay in the lyrics. If no title text follows 'Malabar' there, the
+    next non-empty opening utterance becomes the title and is left out of the
+    lyrics. Trailing '.', '!' and '?' are stripped from the title. The input
+    dicts are not modified.
+
+    Returns (title, song_speaker, joined_song_lyrics, remaining_utterances).
+    joined_song_lyrics is the opening lyrics (minus the title) joined by spaces.
+    Without any opening utterance the result is ("", "", "", utterances).
+    """
+    OPENING_SONG_THRESHOLD_MS = 15000  # 15 seconds
+
+    # Separate opening song utterances (within first 15s) from the rest
+    opening_song = []
+    remaining = []
+    for utt in utterances:
+        if utt.get("start", 0) < OPENING_SONG_THRESHOLD_MS:
+            opening_song.append(utt)
+        else:
+            remaining.append(utt)
+
+    if not opening_song:
+        return "", "", "", utterances
+
+    # Work on copies of the texts so the caller's utterance dicts stay untouched.
+    lyrics = [utt.get("text") or "" for utt in opening_song]
+    title = ""
+    song_speaker = opening_song[0].get("speaker", "A")
+    malabar_idx = -1
+    title_utterance_idx = -1  # Utterance consisting only of the title (excluded from lyrics)
+
+    # Find the first utterance containing "Malabar"
+    for i, text in enumerate(lyrics):
+        if "malabar" not in text.lower():
+            continue
+        malabar_idx = i
+        song_speaker = opening_song[i].get("speaker", song_speaker)
+        match = re.search(r'Malabar[\s,]*(.+)', text, re.IGNORECASE)
+        if match:
+            title = re.sub(r'[.!?]+$', '', match.group(1).strip()).strip()
+            # Only the title part is cut away: this utterance is NOT marked as the
+            # title utterance, otherwise its lyrics before the title would be lost.
+            lyrics[i] = re.sub(r'Malabar[\s,]*.+$', 'Malabar', text, flags=re.IGNORECASE).strip()
+        break
+
+    # If title not in same utterance as Malabar, use the next non-empty utterance
+    if not title and malabar_idx >= 0:
+        for j in range(malabar_idx + 1, len(lyrics)):
+            candidate = lyrics[j].strip()
+            if candidate:
+                title = re.sub(r'[.!?]+$', '', candidate).strip()
+                title_utterance_idx = j
+                break
+
+    # Join all opening song lyrics except the title utterance
+    stripped = (text.strip() for i, text in enumerate(lyrics) if i != title_utterance_idx)
+    joined_song = " ".join(line for line in stripped if line)
+
+    return title, song_speaker, joined_song, remaining
+
+
 def format_lines(transcript_data: Dict[str, Any]) -> str:
    """
    Format transcript utterances into lines.
@@ -118,12 +273,32 @@ def format_lines(transcript_data: Dict[str, Any]) -> str:
    if not utterances:
        return ""
    
+    # Split long utterances based on pauses and sentence boundaries
+    utterances = split_utterances_by_pauses(utterances, PAUSE_THRESHOLD_MS)
+    
+    # Extract title from opening song (first 15 seconds) and get joined song lyrics
+    title, song_speaker, joined_song, utterances = extract_opening_song_title(utterances)
+    
    # Merge non-word utterances
    merged = merge_utterances(utterances)
    
    # Format lines
    lines = []
+    
+    # Add title as first line if found (use "Song" as speaker)
+    if title:
+        lines.append(f"[00:00](Song) {title}")
+    
+    # Add joined opening song as second line if exists (use "Song" as speaker)
+    if joined_song:
+        lines.append(f"[00:01](Song) {joined_song}")
+    
+    # Format remaining lines (skip those within first 15s as they're in the joined song)
    for utt in merged:
+        # Skip utterances within opening song window (they're already included in joined_song)
+        if utt.get("start", 0) < 15000:
+            continue
+        
        text = utt.get("text", "").strip()
        
        # Skip standalone non-words unless they're at the end
@@ -155,10 +330,10 @@ def process_transcript(input_path: Path) -> Path:
    with open(input_path, 'r', encoding='utf-8') as f:
        transcript_data = json.load(f)
    
-    utterance_count = len(transcript_data.get("utterances", []))
-    print(f"  Loaded {utterance_count} utterances")
+    raw_count = len(transcript_data.get("utterances", []))
+    print(f"  Loaded {raw_count} raw utterances")
    
-    # Format lines
+    # Format lines (includes splitting by pauses)
    formatted_text = format_lines(transcript_data)
    
    # Save output