Simplify the pipeline, merge the opening song

This commit is contained in:
2026-03-04 13:14:53 +08:00
parent 6153d386e7
commit 18eae970ad
3 changed files with 274 additions and 264 deletions

View File

@@ -11,7 +11,6 @@ Usage:
"""
import os
import re
import sys
import json
from pathlib import Path
@@ -30,162 +29,6 @@ def ensure_dirs():
OUTPUT_DIR.mkdir(exist_ok=True)
def split_words_by_sentences(words: list) -> list:
    """
    Break a flat word list into per-sentence chunks.

    A sentence boundary is any word whose text ends with ./!/? optionally
    followed by closing quotes, parentheses, or brackets.

    Args:
        words: List of word dictionaries carrying a 'text' key.

    Returns:
        List of word lists, one per detected sentence; trailing words with
        no terminator form a final chunk.
    """
    if not words:
        return []

    # Terminal punctuation, possibly wrapped in closing quote/bracket chars.
    terminator = re.compile(r'[.!?]+["\')\]]*$')

    # Indices one past each sentence-ending word.
    cut_points = [i + 1 for i, w in enumerate(words)
                  if terminator.search(w.get("text", ""))]
    # Ensure leftover words after the last terminator are kept as a chunk.
    if not cut_points or cut_points[-1] != len(words):
        cut_points.append(len(words))

    starts = [0] + cut_points[:-1]
    return [words[a:b] for a, b in zip(starts, cut_points)]
def ends_with_sentence_punctuation(text: str) -> bool:
    """Return True when the stripped text terminates a sentence (./!/?,
    optionally followed by closing quotes/brackets)."""
    stripped = text.strip()
    match = re.search(r'[.!?]["\'\)\]]*$', stripped)
    return match is not None
def merge_incomplete_sentences(utterances: list) -> list:
    """
    Fold together consecutive utterances when the earlier one stops mid-sentence.

    AssemblyAI can switch speaker labels in the middle of a sentence; any
    utterance whose text lacks terminal punctuation absorbs the following
    utterance (words, text, and end time). The first speaker's label is kept.

    Args:
        utterances: Utterance dicts with 'text', 'words', 'start', 'end'.

    Returns:
        A new list of merged utterance dicts.
    """
    if not utterances:
        return utterances

    merged = []
    pending = utterances[0].copy()
    for nxt in utterances[1:]:
        if ends_with_sentence_punctuation(pending.get("text", "")):
            # Sentence complete: emit it and start accumulating the next one.
            merged.append(pending)
            pending = nxt.copy()
        else:
            # Mid-sentence split: absorb the next utterance into the pending one.
            pending["words"] = pending.get("words", []) + nxt.get("words", [])
            pending["text"] = pending.get("text", "") + " " + nxt.get("text", "")
            pending["end"] = nxt.get("end", pending["end"])
            # Speaker label deliberately stays as the first utterance's.
    # Emit whatever is still pending at the end.
    merged.append(pending)
    return merged
def split_utterances_by_pauses(utterances: list, pause_threshold_ms: int = 1500) -> list:
    """
    Break long utterances apart at silent gaps and sentence boundaries.

    Incomplete consecutive utterances are merged first; then each utterance's
    word timeline is scanned, ending the current run whenever the silence
    between adjacent words reaches `pause_threshold_ms`, and every run is
    further divided at sentence-ending punctuation.

    Args:
        utterances: Utterance dicts from AssemblyAI (with word-level timings).
        pause_threshold_ms: Minimum inter-word silence (ms) that forces a split.

    Returns:
        A flat list of (possibly smaller) utterance dicts.
    """
    # Repair mid-sentence speaker splits before segmenting.
    utterances = merge_incomplete_sentences(utterances)

    output = []
    for utterance in utterances:
        word_list = utterance.get("words", [])
        if not word_list:
            # Without word timings there is nothing to split on.
            output.append(utterance)
            continue

        label = utterance.get("speaker", "?")
        chunks = []

        def flush(run):
            # Sub-divide a pause-delimited run at sentence boundaries.
            for piece in split_words_by_sentences(run):
                chunks.append({
                    "speaker": label,
                    "words": piece,
                    "start": piece[0]["start"],
                    "end": piece[-1]["end"],
                })

        run = []
        for token in word_list:
            if run:
                silence = token.get("start", 0) - run[-1].get("end", 0)
                if silence >= pause_threshold_ms:
                    # Long pause: close out the current run and start fresh.
                    flush(run)
                    run = [token]
                    continue
            run.append(token)
        if run:
            flush(run)

        # Re-assemble each chunk into utterance format.
        for chunk in chunks:
            joined = " ".join(w.get("text", "") for w in chunk["words"]).strip()
            if joined:  # drop chunks with no visible text
                output.append({
                    "speaker": chunk["speaker"],
                    "text": joined,
                    "start": chunk["start"],
                    "end": chunk["end"],
                    "words": chunk["words"],
                })
    return output
def load_progress() -> dict:
"""Load progress tracking."""
if PROGRESS_FILE.exists():
@@ -214,12 +57,11 @@ def transcribe_video(video_path: Path) -> dict:
print(f" Uploading {video_path.name}...")
# Speaker diarization config
# By default, AssemblyAI detects 1-10 speakers
# If you know the expected number, you can set speakers_expected
# Or set speaker_options for a range
# Lower speaker_sensitivity = more aggressive speaker detection (more speakers)
speaker_options = aai.SpeakerOptions(
min_speakers=2,
max_speakers=10 # Allow up to 10 speakers
max_speakers=10, # Allow up to 10 speakers
speaker_sensitivity=0.2 # Low value = more sensitive to speaker changes
)
config = aai.TranscriptionConfig(
@@ -237,35 +79,8 @@ def transcribe_video(video_path: Path) -> dict:
print(f" Transcription complete!")
# Convert utterances to dictionaries first
raw_utterances = []
for utt in transcript.utterances:
raw_utterances.append({
"speaker": utt.speaker,
"text": utt.text.strip(),
"start": utt.start,
"end": utt.end,
"confidence": utt.confidence if hasattr(utt, 'confidence') else None,
"words": [
{
"text": w.text,
"start": w.start,
"end": w.end,
"speaker": w.speaker if hasattr(w, 'speaker') else None
}
for w in (utt.words if hasattr(utt, 'words') else [])
]
})
# Split long utterances based on pauses
original_count = len(raw_utterances)
split_utterances = split_utterances_by_pauses(raw_utterances, pause_threshold_ms=1500)
new_count = len(split_utterances)
if new_count > original_count:
print(f" Split {original_count} utterances into {new_count} (based on 1.5s pauses)")
# Convert transcript to serializable dictionary
# Convert transcript to serializable dictionary - NO POSTPROCESSING
# Raw AssemblyAI output
result = {
"id": transcript.id,
"status": str(transcript.status),
@@ -274,7 +89,25 @@ def transcribe_video(video_path: Path) -> dict:
"confidence": transcript.confidence,
"audio_duration": transcript.audio_duration,
"language_code": transcript.json_response.get("language_code", "unknown"),
"utterances": split_utterances
"utterances": [
{
"speaker": utt.speaker,
"text": utt.text.strip(),
"start": utt.start,
"end": utt.end,
"confidence": utt.confidence if hasattr(utt, 'confidence') else None,
"words": [
{
"text": w.text,
"start": w.start,
"end": w.end,
"speaker": w.speaker if hasattr(w, 'speaker') else None
}
for w in (utt.words if hasattr(utt, 'words') else [])
]
}
for utt in transcript.utterances
]
}
return result