Simplify the pipeline, merge the opening song

This commit is contained in:
2026-03-04 13:14:53 +08:00
parent 6153d386e7
commit 18eae970ad
3 changed files with 274 additions and 264 deletions

View File

@@ -11,7 +11,6 @@ Usage:
"""
import os
import re
import sys
import json
from pathlib import Path
@@ -30,162 +29,6 @@ def ensure_dirs():
OUTPUT_DIR.mkdir(exist_ok=True)
def split_words_by_sentences(words: list) -> list:
    """
    Break a flat word list into per-sentence chunks.

    A sentence boundary is any word whose text ends with ./!/? optionally
    followed by closing quotes, parentheses, or brackets.

    Args:
        words: List of word dictionaries carrying a 'text' key.

    Returns:
        List of word lists, one per detected sentence; trailing words with
        no terminator form a final chunk.
    """
    if not words:
        return []

    # Terminal punctuation, possibly wrapped in closing quote/bracket chars.
    terminator = re.compile(r'[.!?]+["\')\]]*$')

    # Indices one past each sentence-ending word.
    cut_points = [i + 1 for i, w in enumerate(words)
                  if terminator.search(w.get("text", ""))]
    # Ensure leftover words after the last terminator are kept as a chunk.
    if not cut_points or cut_points[-1] != len(words):
        cut_points.append(len(words))

    starts = [0] + cut_points[:-1]
    return [words[a:b] for a, b in zip(starts, cut_points)]
def ends_with_sentence_punctuation(text: str) -> bool:
    """Return True when the stripped text terminates a sentence (./!/?,
    optionally followed by closing quotes/brackets)."""
    stripped = text.strip()
    match = re.search(r'[.!?]["\'\)\]]*$', stripped)
    return match is not None
def merge_incomplete_sentences(utterances: list) -> list:
    """
    Fold together consecutive utterances when the earlier one stops mid-sentence.

    AssemblyAI can switch speaker labels in the middle of a sentence; any
    utterance whose text lacks terminal punctuation absorbs the following
    utterance (words, text, and end time). The first speaker's label is kept.

    Args:
        utterances: Utterance dicts with 'text', 'words', 'start', 'end'.

    Returns:
        A new list of merged utterance dicts.
    """
    if not utterances:
        return utterances

    merged = []
    pending = utterances[0].copy()
    for nxt in utterances[1:]:
        if ends_with_sentence_punctuation(pending.get("text", "")):
            # Sentence complete: emit it and start accumulating the next one.
            merged.append(pending)
            pending = nxt.copy()
        else:
            # Mid-sentence split: absorb the next utterance into the pending one.
            pending["words"] = pending.get("words", []) + nxt.get("words", [])
            pending["text"] = pending.get("text", "") + " " + nxt.get("text", "")
            pending["end"] = nxt.get("end", pending["end"])
            # Speaker label deliberately stays as the first utterance's.
    # Emit whatever is still pending at the end.
    merged.append(pending)
    return merged
def split_utterances_by_pauses(utterances: list, pause_threshold_ms: int = 1500) -> list:
    """
    Break long utterances apart at silent gaps and sentence boundaries.

    Incomplete consecutive utterances are merged first; then each utterance's
    word timeline is scanned, ending the current run whenever the silence
    between adjacent words reaches `pause_threshold_ms`, and every run is
    further divided at sentence-ending punctuation.

    Args:
        utterances: Utterance dicts from AssemblyAI (with word-level timings).
        pause_threshold_ms: Minimum inter-word silence (ms) that forces a split.

    Returns:
        A flat list of (possibly smaller) utterance dicts.
    """
    # Repair mid-sentence speaker splits before segmenting.
    utterances = merge_incomplete_sentences(utterances)

    output = []
    for utterance in utterances:
        word_list = utterance.get("words", [])
        if not word_list:
            # Without word timings there is nothing to split on.
            output.append(utterance)
            continue

        label = utterance.get("speaker", "?")
        chunks = []

        def flush(run):
            # Sub-divide a pause-delimited run at sentence boundaries.
            for piece in split_words_by_sentences(run):
                chunks.append({
                    "speaker": label,
                    "words": piece,
                    "start": piece[0]["start"],
                    "end": piece[-1]["end"],
                })

        run = []
        for token in word_list:
            if run:
                silence = token.get("start", 0) - run[-1].get("end", 0)
                if silence >= pause_threshold_ms:
                    # Long pause: close out the current run and start fresh.
                    flush(run)
                    run = [token]
                    continue
            run.append(token)
        if run:
            flush(run)

        # Re-assemble each chunk into utterance format.
        for chunk in chunks:
            joined = " ".join(w.get("text", "") for w in chunk["words"]).strip()
            if joined:  # drop chunks with no visible text
                output.append({
                    "speaker": chunk["speaker"],
                    "text": joined,
                    "start": chunk["start"],
                    "end": chunk["end"],
                    "words": chunk["words"],
                })
    return output
def load_progress() -> dict:
"""Load progress tracking."""
if PROGRESS_FILE.exists():
@@ -214,12 +57,11 @@ def transcribe_video(video_path: Path) -> dict:
print(f" Uploading {video_path.name}...")
# Speaker diarization config
# By default, AssemblyAI detects 1-10 speakers
# If you know the expected number, you can set speakers_expected
# Or set speaker_options for a range
# Lower speaker_sensitivity = more aggressive speaker detection (more speakers)
speaker_options = aai.SpeakerOptions(
min_speakers=2,
max_speakers=10 # Allow up to 10 speakers
max_speakers=10, # Allow up to 10 speakers
speaker_sensitivity=0.2 # Low value = more sensitive to speaker changes
)
config = aai.TranscriptionConfig(
@@ -237,35 +79,8 @@ def transcribe_video(video_path: Path) -> dict:
print(f" Transcription complete!")
# Convert utterances to dictionaries first
raw_utterances = []
for utt in transcript.utterances:
raw_utterances.append({
"speaker": utt.speaker,
"text": utt.text.strip(),
"start": utt.start,
"end": utt.end,
"confidence": utt.confidence if hasattr(utt, 'confidence') else None,
"words": [
{
"text": w.text,
"start": w.start,
"end": w.end,
"speaker": w.speaker if hasattr(w, 'speaker') else None
}
for w in (utt.words if hasattr(utt, 'words') else [])
]
})
# Split long utterances based on pauses
original_count = len(raw_utterances)
split_utterances = split_utterances_by_pauses(raw_utterances, pause_threshold_ms=1500)
new_count = len(split_utterances)
if new_count > original_count:
print(f" Split {original_count} utterances into {new_count} (based on 1.5s pauses)")
# Convert transcript to serializable dictionary
# Convert transcript to serializable dictionary - NO POSTPROCESSING
# Raw AssemblyAI output
result = {
"id": transcript.id,
"status": str(transcript.status),
@@ -274,7 +89,25 @@ def transcribe_video(video_path: Path) -> dict:
"confidence": transcript.confidence,
"audio_duration": transcript.audio_duration,
"language_code": transcript.json_response.get("language_code", "unknown"),
"utterances": split_utterances
"utterances": [
{
"speaker": utt.speaker,
"text": utt.text.strip(),
"start": utt.start,
"end": utt.end,
"confidence": utt.confidence if hasattr(utt, 'confidence') else None,
"words": [
{
"text": w.text,
"start": w.start,
"end": w.end,
"speaker": w.speaker if hasattr(w, 'speaker') else None
}
for w in (utt.words if hasattr(utt, 'words') else [])
]
}
for utt in transcript.utterances
]
}
return result