Simplify the pipeline, merge the opening song
This commit is contained in:
183
step2_format.py
183
step2_format.py
@@ -16,7 +16,12 @@ import sys
|
||||
import json
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Any
|
||||
from typing import List, Dict, Any, Tuple
|
||||
|
||||
# ============== Configuration ==============
|
||||
|
||||
# Split utterances on pauses longer than this (milliseconds)
|
||||
PAUSE_THRESHOLD_MS = 1500
|
||||
|
||||
# ============== Configuration ==============
|
||||
|
||||
@@ -51,6 +56,89 @@ def ensure_dirs():
|
||||
OUTPUT_DIR.mkdir(exist_ok=True)
|
||||
|
||||
|
||||
# Matches sentence-ending punctuation at the end of a word, optionally
# followed by closing quotes/brackets (e.g. `done.`, `why?"`, `end!)`).
# Compiled once at import time because the splitter runs once per pause group.
_SENTENCE_END_RE = re.compile(r'[.!?]+["\')\]]*$')


def split_words_by_sentences(words: list) -> list:
    """Split a flat list of word dicts into sentence-sized sub-lists.

    A sentence ends at any word whose ``"text"`` ends in ``.``, ``!`` or ``?``
    (optionally followed by closing quotes or brackets). The terminating word
    stays in the sentence it closes. Words after the last terminator form a
    final, unterminated segment.

    Args:
        words: Word dicts; only the ``"text"`` key is read here.

    Returns:
        A list of non-empty lists of the original word dicts, in input order.
        An empty (or ``None``) input yields ``[]``.
    """
    segments = []
    current = []

    for word in words or []:
        current.append(word)
        # `"text"` may be missing or explicitly null; treat both as empty
        # text so the regex never receives None.
        if _SENTENCE_END_RE.search(word.get("text") or ""):
            segments.append(current)
            current = []

    # Trailing words that never hit sentence-ending punctuation.
    if current:
        segments.append(current)

    return segments
|
||||
|
||||
|
||||
def split_utterances_by_pauses(utterances: list, pause_threshold_ms: int = 1500) -> list:
    """Split utterances at long pauses and at sentence boundaries.

    For every utterance that carries a non-empty ``"words"`` list, the words
    are first grouped wherever the gap between one word's ``"end"`` and the
    next word's ``"start"`` is at least ``pause_threshold_ms``; each group is
    then split into sentences with ``split_words_by_sentences``. Every
    resulting segment whose joined text is non-empty becomes a new utterance
    dict with keys ``speaker``, ``text``, ``start``, ``end`` and ``words``.

    Utterances without words are passed through unchanged (same dict object).

    Args:
        utterances: Utterance dicts; ``"words"`` and ``"speaker"`` are read.
        pause_threshold_ms: Minimum inter-word gap that forces a split.

    Returns:
        A new list of utterance dicts in the original order.

    Raises:
        KeyError: If the first/last word of a segment lacks ``"start"`` /
            ``"end"`` (segment boundaries are read strictly).
    """
    result = []

    for utt in utterances:
        words = utt.get("words", [])
        if not words:
            # Nothing to split on; keep the utterance as-is.
            result.append(utt)
            continue

        speaker = utt.get("speaker", "?")

        for group in _group_words_by_pause(words, pause_threshold_ms):
            for seg_words in split_words_by_sentences(group):
                # Missing or null word text counts as empty.
                text = " ".join(w.get("text") or "" for w in seg_words).strip()
                if not text:
                    continue
                result.append({
                    "speaker": speaker,
                    "text": text,
                    "start": seg_words[0]["start"],
                    "end": seg_words[-1]["end"],
                    "words": seg_words,
                })

    return result


def _group_words_by_pause(words: list, pause_threshold_ms: int) -> list:
    """Group consecutive words, starting a new group at each long pause."""
    groups = []
    current = []

    for word in words:
        # Missing timestamps default to 0, so such words never trigger a
        # split on their own.
        if current and word.get("start", 0) - current[-1].get("end", 0) >= pause_threshold_ms:
            groups.append(current)
            current = []
        current.append(word)

    if current:
        groups.append(current)

    return groups
|
||||
|
||||
|
||||
def format_timestamp(ms: int) -> str:
|
||||
"""Format milliseconds as [mm:ss]."""
|
||||
seconds = ms // 1000
|
||||
@@ -108,6 +196,73 @@ def merge_utterances(utterances: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
||||
return merged
|
||||
|
||||
|
||||
def extract_opening_song_title(utterances: List[Dict[str, Any]]) -> Tuple[str, str, str, List[Dict[str, Any]]]:
    """Extract the episode title from the opening song and join its lyrics.

    Utterances whose ``"start"`` is within the first 15 seconds are treated
    as the opening song. The title is the text following the word "Malabar"
    (case-insensitive) with trailing ``.``/``!``/``?`` removed. If nothing
    follows "Malabar" in its own utterance, the next non-empty opening-song
    utterance is taken as the title.

    The joined lyrics contain every opening-song line except the title:
    when the title shares an utterance with "Malabar", only the title part
    is cut and the lyric up to and including "Malabar" is kept; when the
    title is a separate utterance, that utterance is left out entirely.

    The input utterance dicts are never modified.

    Args:
        utterances: Utterance dicts; ``"start"``, ``"text"`` and
            ``"speaker"`` are read.

    Returns:
        ``(title, song_speaker, joined_song_lyrics, remaining_utterances)``.
        If no utterance starts within the window, returns
        ``("", "", "", utterances)``. If "Malabar" is never sung, ``title``
        is ``""`` and ``song_speaker`` is the first opening utterance's
        speaker (default ``"A"``).
    """
    OPENING_SONG_THRESHOLD_MS = 15000  # 15 seconds

    # Separate opening song utterances (within first 15s) from the rest.
    opening_song = []
    remaining = []
    for utt in utterances:
        if utt.get("start", 0) < OPENING_SONG_THRESHOLD_MS:
            opening_song.append(utt)
        else:
            remaining.append(utt)

    if not opening_song:
        return "", "", "", utterances

    # Work on a private copy of the lyric texts so the caller's utterance
    # dicts stay untouched. Missing or null text counts as empty.
    lyrics = [(utt.get("text") or "").strip() for utt in opening_song]

    title = ""
    song_speaker = opening_song[0].get("speaker", "A")
    title_utterance_idx = -1  # Utterance that is *only* the title, if any.

    malabar_idx = next(
        (i for i, text in enumerate(lyrics) if "malabar" in text.lower()),
        -1,
    )

    if malabar_idx >= 0:
        song_speaker = opening_song[malabar_idx].get("speaker", song_speaker)
        text = lyrics[malabar_idx]

        # Title: whatever follows "Malabar" (and any commas/whitespace).
        match = re.search(r'Malabar[\s,]*(.+)', text, re.IGNORECASE)
        if match:
            title = re.sub(r'[.!?]+$', '', match.group(1).strip()).strip()

        if title:
            # Keep the lyric up to and including "Malabar"; drop only the
            # title part so the line still appears in the joined song.
            lyrics[malabar_idx] = (text[:match.start()] + "Malabar").strip()
        else:
            # Title not in the same utterance: use the next non-empty line
            # and exclude that whole line from the lyrics.
            for j in range(malabar_idx + 1, len(lyrics)):
                if lyrics[j]:
                    title = re.sub(r'[.!?]+$', '', lyrics[j]).strip()
                    title_utterance_idx = j
                    break

    joined_song = " ".join(
        text
        for i, text in enumerate(lyrics)
        if text and i != title_utterance_idx
    )

    return title, song_speaker, joined_song, remaining
|
||||
|
||||
|
||||
def format_lines(transcript_data: Dict[str, Any]) -> str:
|
||||
"""
|
||||
Format transcript utterances into lines.
|
||||
@@ -118,12 +273,32 @@ def format_lines(transcript_data: Dict[str, Any]) -> str:
|
||||
if not utterances:
|
||||
return ""
|
||||
|
||||
# Split long utterances based on pauses and sentence boundaries
|
||||
utterances = split_utterances_by_pauses(utterances, PAUSE_THRESHOLD_MS)
|
||||
|
||||
# Extract title from opening song (first 15 seconds) and get joined song lyrics
|
||||
title, song_speaker, joined_song, utterances = extract_opening_song_title(utterances)
|
||||
|
||||
# Merge non-word utterances
|
||||
merged = merge_utterances(utterances)
|
||||
|
||||
# Format lines
|
||||
lines = []
|
||||
|
||||
# Add title as first line if found (use "Song" as speaker)
|
||||
if title:
|
||||
lines.append(f"[00:00](Song) {title}")
|
||||
|
||||
# Add joined opening song as second line if exists (use "Song" as speaker)
|
||||
if joined_song:
|
||||
lines.append(f"[00:01](Song) {joined_song}")
|
||||
|
||||
# Format remaining lines (skip those within first 15s as they're in the joined song)
|
||||
for utt in merged:
|
||||
# Skip utterances within opening song window (they're already included in joined_song)
|
||||
if utt.get("start", 0) < 15000:
|
||||
continue
|
||||
|
||||
text = utt.get("text", "").strip()
|
||||
|
||||
# Skip standalone non-words unless they're at the end
|
||||
@@ -155,10 +330,10 @@ def process_transcript(input_path: Path) -> Path:
|
||||
with open(input_path, 'r', encoding='utf-8') as f:
|
||||
transcript_data = json.load(f)
|
||||
|
||||
utterance_count = len(transcript_data.get("utterances", []))
|
||||
print(f" Loaded {utterance_count} utterances")
|
||||
raw_count = len(transcript_data.get("utterances", []))
|
||||
print(f" Loaded {raw_count} raw utterances")
|
||||
|
||||
# Format lines
|
||||
# Format lines (includes splitting by pauses)
|
||||
formatted_text = format_lines(transcript_data)
|
||||
|
||||
# Save output
|
||||
|
||||
Reference in New Issue
Block a user