Simplify the pipeline, merge the opening song
This commit is contained in:
@@ -11,7 +11,6 @@ Usage:
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import json
|
||||
from pathlib import Path
|
||||
@@ -30,162 +29,6 @@ def ensure_dirs():
|
||||
OUTPUT_DIR.mkdir(exist_ok=True)
|
||||
|
||||
|
||||
def split_words_by_sentences(words: list) -> list:
    """
    Group a flat list of word dicts into sentence-sized chunks.

    A sentence ends at any word whose text terminates in '.', '!' or '?',
    optionally followed by closing quotes/brackets.

    Args:
        words: List of word dictionaries, each carrying a 'text' key.

    Returns:
        A list of lists of word dicts, one inner list per sentence; any
        trailing words with no terminator form a final chunk.
    """
    if not words:
        return []

    # Terminal punctuation, possibly wrapped in closing quotes/brackets.
    terminator = re.compile(r'[.!?]+["\')\]]*$')

    sentences, chunk = [], []
    for word in words:
        chunk.append(word)
        if terminator.search(word.get("text", "")):
            # Sentence boundary reached - emit the accumulated chunk.
            sentences.append(chunk)
            chunk = []

    # Any words left over after the last terminator form the final chunk.
    if chunk:
        sentences.append(chunk)

    return sentences
|
||||
|
||||
|
||||
def ends_with_sentence_punctuation(text: str) -> bool:
    """Return True when *text* (ignoring surrounding whitespace) ends with
    '.', '!' or '?', optionally followed by closing quotes/brackets."""
    return re.search(r'[.!?]["\'\)\]]*$', text.strip()) is not None
|
||||
|
||||
|
||||
def merge_incomplete_sentences(utterances: list) -> list:
    """
    Merge consecutive utterances where the earlier one does not end with
    sentence-ending punctuation.

    This handles cases where AssemblyAI splits mid-sentence between
    speakers; the first speaker's label is kept for the merged utterance.

    Args:
        utterances: List of utterance dicts with "text", "words", "start",
            "end" and "speaker" keys (all accessed defensively).

    Returns:
        A new list of utterance dicts; input dicts are not mutated
        (shallow copies are taken before modification).
    """
    if not utterances:
        return utterances

    result = []
    current = utterances[0].copy()

    # Fold each following utterance into `current` until `current` ends a
    # sentence, then emit it and start accumulating the next one.
    for next_utt in utterances[1:]:
        current_text = current.get("text", "")

        if not ends_with_sentence_punctuation(current_text):
            # Mid-sentence split: concatenate words and text, extend the
            # end time, and keep the first speaker's label unchanged.
            current["words"] = current.get("words", []) + next_utt.get("words", [])
            current["text"] = current_text + " " + next_utt.get("text", "")
            # .get() on both sides so a missing "end" key degrades
            # gracefully; the previous code evaluated current["end"]
            # eagerly inside the default and could raise KeyError.
            current["end"] = next_utt.get("end", current.get("end"))
        else:
            # Current utterance is complete - emit it and move on.
            result.append(current)
            current = next_utt.copy()

    # Emit the final accumulated utterance.
    result.append(current)

    return result
|
||||
|
||||
|
||||
def split_utterances_by_pauses(utterances: list, pause_threshold_ms: int = 1500) -> list:
    """
    Split long utterances based on pauses between words and sentence boundaries.

    Consecutive utterances that do not end with sentence punctuation are
    merged first (see merge_incomplete_sentences); then each utterance is
    cut wherever the silence between two adjacent words reaches
    ``pause_threshold_ms``, and every cut piece is further split at
    sentence boundaries.

    Args:
        utterances: List of utterance dictionaries from AssemblyAI, each
            with "speaker", "text", "start", "end" and "words" keys.
        pause_threshold_ms: Minimum gap (in milliseconds) between two
            words that forces a new utterance.

    Returns:
        List of split utterance dictionaries in the same format.
    """

    def _emit_sentences(segments: list, speaker: str, seg_words: list) -> None:
        # Split seg_words at sentence boundaries and append one segment
        # dict per sentence (shared by the pause-split and final flush).
        for sentence_words in split_words_by_sentences(seg_words):
            segments.append({
                "speaker": speaker,
                "words": sentence_words,
                "start": sentence_words[0]["start"],
                "end": sentence_words[-1]["end"],
            })

    # Merge fragments that AssemblyAI split mid-sentence before re-splitting.
    utterances = merge_incomplete_sentences(utterances)

    result = []

    for utt in utterances:
        words = utt.get("words", [])
        if not words:
            # No word-level timing data - keep the utterance untouched.
            result.append(utt)
            continue

        speaker = utt.get("speaker", "?")
        segments = []
        current_words = []

        for word in words:
            if current_words:
                gap = word.get("start", 0) - current_words[-1].get("end", 0)
                if gap >= pause_threshold_ms:
                    # Long silence: close the running segment (split at
                    # sentence boundaries) and start a new one here.
                    _emit_sentences(segments, speaker, current_words)
                    current_words = [word]
                    continue
            current_words.append(word)

        # Flush the trailing segment, also split by sentences.
        if current_words:
            _emit_sentences(segments, speaker, current_words)

        # Convert segments to utterance format, dropping empty-text pieces.
        for seg in segments:
            text = " ".join(w.get("text", "") for w in seg["words"]).strip()
            if text:
                result.append({
                    "speaker": seg["speaker"],
                    "text": text,
                    "start": seg["start"],
                    "end": seg["end"],
                    "words": seg["words"],
                })

    return result
|
||||
|
||||
|
||||
def load_progress() -> dict:
|
||||
"""Load progress tracking."""
|
||||
if PROGRESS_FILE.exists():
|
||||
@@ -214,12 +57,11 @@ def transcribe_video(video_path: Path) -> dict:
|
||||
print(f" Uploading {video_path.name}...")
|
||||
|
||||
# Speaker diarization config
|
||||
# By default, AssemblyAI detects 1-10 speakers
|
||||
# If you know the expected number, you can set speakers_expected
|
||||
# Or set speaker_options for a range
|
||||
# Lower speaker_sensitivity = more aggressive speaker detection (more speakers)
|
||||
speaker_options = aai.SpeakerOptions(
|
||||
min_speakers=2,
|
||||
max_speakers=10 # Allow up to 10 speakers
|
||||
max_speakers=10, # Allow up to 10 speakers
|
||||
speaker_sensitivity=0.2 # Low value = more sensitive to speaker changes
|
||||
)
|
||||
|
||||
config = aai.TranscriptionConfig(
|
||||
@@ -237,35 +79,8 @@ def transcribe_video(video_path: Path) -> dict:
|
||||
|
||||
print(f" Transcription complete!")
|
||||
|
||||
# Convert utterances to dictionaries first
|
||||
raw_utterances = []
|
||||
for utt in transcript.utterances:
|
||||
raw_utterances.append({
|
||||
"speaker": utt.speaker,
|
||||
"text": utt.text.strip(),
|
||||
"start": utt.start,
|
||||
"end": utt.end,
|
||||
"confidence": utt.confidence if hasattr(utt, 'confidence') else None,
|
||||
"words": [
|
||||
{
|
||||
"text": w.text,
|
||||
"start": w.start,
|
||||
"end": w.end,
|
||||
"speaker": w.speaker if hasattr(w, 'speaker') else None
|
||||
}
|
||||
for w in (utt.words if hasattr(utt, 'words') else [])
|
||||
]
|
||||
})
|
||||
|
||||
# Split long utterances based on pauses
|
||||
original_count = len(raw_utterances)
|
||||
split_utterances = split_utterances_by_pauses(raw_utterances, pause_threshold_ms=1500)
|
||||
new_count = len(split_utterances)
|
||||
|
||||
if new_count > original_count:
|
||||
print(f" Split {original_count} utterances into {new_count} (based on 1.5s pauses)")
|
||||
|
||||
# Convert transcript to serializable dictionary
|
||||
# Convert transcript to serializable dictionary - NO POSTPROCESSING
|
||||
# Raw AssemblyAI output
|
||||
result = {
|
||||
"id": transcript.id,
|
||||
"status": str(transcript.status),
|
||||
@@ -274,7 +89,25 @@ def transcribe_video(video_path: Path) -> dict:
|
||||
"confidence": transcript.confidence,
|
||||
"audio_duration": transcript.audio_duration,
|
||||
"language_code": transcript.json_response.get("language_code", "unknown"),
|
||||
"utterances": split_utterances
|
||||
"utterances": [
|
||||
{
|
||||
"speaker": utt.speaker,
|
||||
"text": utt.text.strip(),
|
||||
"start": utt.start,
|
||||
"end": utt.end,
|
||||
"confidence": utt.confidence if hasattr(utt, 'confidence') else None,
|
||||
"words": [
|
||||
{
|
||||
"text": w.text,
|
||||
"start": w.start,
|
||||
"end": w.end,
|
||||
"speaker": w.speaker if hasattr(w, 'speaker') else None
|
||||
}
|
||||
for w in (utt.words if hasattr(utt, 'words') else [])
|
||||
]
|
||||
}
|
||||
for utt in transcript.utterances
|
||||
]
|
||||
}
|
||||
|
||||
return result
|
||||
|
||||
Reference in New Issue
Block a user