#!/usr/bin/env python3
"""
Step 1: Extract transcript from video using AssemblyAI.

Input:  Video files in "episodes/" folder
Output: Raw AssemblyAI transcript JSON in "_assembleai/" folder

Usage:
    uv run step1_transcribe.py
    uv run step1_transcribe.py -f   # Force reprocessing
"""

import os
import re
import sys
import json
from pathlib import Path

import assemblyai as aai

# ============== Configuration ==============
EPISODES_DIR = Path("episodes")
OUTPUT_DIR = Path("_assembleai")
PROGRESS_FILE = Path(".step1_progress.json")


def ensure_dirs():
    """Ensure output directories exist."""
    OUTPUT_DIR.mkdir(exist_ok=True)


def split_words_by_sentences(words: list) -> list:
    """
    Split a list of words into sentence segments based on punctuation.

    Args:
        words: List of word dictionaries with 'text' key

    Returns:
        List of word segments, each representing a sentence
    """
    if not words:
        return []

    segments = []
    current_segment = []

    # Pattern for sentence-ending punctuation, optionally followed by
    # closing quotes/brackets (e.g. `word."` or `word!)`)
    sentence_end_pattern = re.compile(r'[.!?]+["\')\]]*$')

    for word in words:
        current_segment.append(word)
        text = word.get("text", "")

        # A word ending with sentence punctuation closes the current segment
        if sentence_end_pattern.search(text):
            segments.append(current_segment)
            current_segment = []

    # Flush any trailing words that never hit sentence punctuation
    if current_segment:
        segments.append(current_segment)

    return segments


def ends_with_sentence_punctuation(text: str) -> bool:
    """Check if text ends with sentence-ending punctuation."""
    text = text.strip()
    return bool(re.search(r'[.!?]["\'\)\]]*$', text))


def merge_incomplete_sentences(utterances: list) -> list:
    """
    Merge consecutive utterances where the first doesn't end with sentence punctuation.

    This handles cases where AssemblyAI splits mid-sentence between speakers.
    Uses the first speaker's label for merged utterances.
    """
    if not utterances:
        return utterances

    result = []
    current = utterances[0].copy()

    for i in range(1, len(utterances)):
        next_utt = utterances[i]
        current_text = current.get("text", "")

        if not ends_with_sentence_punctuation(current_text):
            # Mid-sentence break: absorb the next utterance into the current one
            current["words"] = current.get("words", []) + next_utt.get("words", [])
            current["text"] = current_text + " " + next_utt.get("text", "")
            current["end"] = next_utt.get("end", current["end"])
            # Keep the first speaker's label (don't change to "?")
        else:
            # Current utterance is a complete sentence; emit it and advance
            result.append(current)
            current = next_utt.copy()

    # Emit the final accumulated utterance
    result.append(current)
    return result


def split_utterances_by_pauses(utterances: list, pause_threshold_ms: int = 1500) -> list:
    """
    Split long utterances based on pauses between words and sentence boundaries.

    Args:
        utterances: List of utterance dictionaries from AssemblyAI
        pause_threshold_ms: Minimum gap (in milliseconds) to create a new utterance

    Returns:
        List of split utterances
    """
    # First, merge consecutive utterances that don't end with sentence punctuation
    utterances = merge_incomplete_sentences(utterances)

    result = []
    for utt in utterances:
        words = utt.get("words", [])
        if not words:
            # No word-level data available; keep the utterance unchanged
            result.append(utt)
            continue

        speaker = utt.get("speaker", "?")
        current_segment_words = []
        segments = []

        for word in words:
            if not current_segment_words:
                # First word in a fresh segment
                current_segment_words.append(word)
                continue

            # Gap between this word and the previous one, in milliseconds
            prev_word = current_segment_words[-1]
            gap = word.get("start", 0) - prev_word.get("end", 0)

            if gap >= pause_threshold_ms:
                # Long pause: close out the segment, further split by sentences
                for seg_words in split_words_by_sentences(current_segment_words):
                    segments.append({
                        "speaker": speaker,
                        "words": seg_words,
                        "start": seg_words[0]["start"],
                        "end": seg_words[-1]["end"]
                    })
                current_segment_words = [word]
            else:
                current_segment_words.append(word)

        # Flush the last segment, also split by sentences
        if current_segment_words:
            for seg_words in split_words_by_sentences(current_segment_words):
                segments.append({
                    "speaker": speaker,
                    "words": seg_words,
                    "start": seg_words[0]["start"],
                    "end": seg_words[-1]["end"]
                })

        # Convert segments back to utterance dictionaries
        for seg in segments:
            text = " ".join(w.get("text", "") for w in seg["words"]).strip()
            if text:  # Only add non-empty segments
                result.append({
                    "speaker": seg["speaker"],
                    "text": text,
                    "start": seg["start"],
                    "end": seg["end"],
                    "words": seg["words"]
                })

    return result


def load_progress() -> dict:
    """Load progress tracking."""
    if PROGRESS_FILE.exists():
        with open(PROGRESS_FILE, 'r', encoding='utf-8') as f:
            return json.load(f)
    return {}


def save_progress(progress: dict):
    """Save progress tracking."""
    with open(PROGRESS_FILE, 'w', encoding='utf-8') as f:
        json.dump(progress, f, indent=2)


def transcribe_video(video_path: Path) -> dict:
    """
    Transcribe video using AssemblyAI with speaker diarization.

    Returns the raw transcript as a dictionary.

    Raises:
        ValueError: if ASSEMBLYAI_API_KEY is not set.
        Exception: if the AssemblyAI transcription job fails.
    """
    api_key = os.getenv("ASSEMBLYAI_API_KEY")
    if not api_key:
        raise ValueError("ASSEMBLYAI_API_KEY environment variable is required")

    aai.settings.api_key = api_key

    print(f"  Uploading {video_path.name}...")

    # Speaker diarization config
    # By default, AssemblyAI detects 1-10 speakers
    # If you know the expected number, you can set speakers_expected
    # Or set speaker_options for a range
    speaker_options = aai.SpeakerOptions(
        min_speakers=2,
        max_speakers=10  # Allow up to 10 speakers
    )

    # NOTE(review): the SDK documents `speech_model` (singular) on
    # TranscriptionConfig; confirm `speech_models=[...]` is accepted by the
    # installed assemblyai version.
    config = aai.TranscriptionConfig(
        speaker_labels=True,
        speech_models=["universal-2"],
        language_detection=True,  # Auto-detect language
        speaker_options=speaker_options,
    )

    transcriber = aai.Transcriber(config=config)
    transcript = transcriber.transcribe(str(video_path))

    if transcript.status == aai.TranscriptStatus.error:
        raise Exception(f"Transcription failed: {transcript.error}")

    print("  Transcription complete!")

    # Convert utterances to plain dictionaries first
    raw_utterances = []
    for utt in transcript.utterances:
        raw_utterances.append({
            "speaker": utt.speaker,
            "text": utt.text.strip(),
            "start": utt.start,
            "end": utt.end,
            "confidence": utt.confidence if hasattr(utt, 'confidence') else None,
            "words": [
                {
                    "text": w.text,
                    "start": w.start,
                    "end": w.end,
                    "speaker": w.speaker if hasattr(w, 'speaker') else None
                }
                for w in (utt.words if hasattr(utt, 'words') else [])
            ]
        })

    # Split long utterances based on pauses
    original_count = len(raw_utterances)
    split_utterances = split_utterances_by_pauses(raw_utterances, pause_threshold_ms=1500)
    new_count = len(split_utterances)
    if new_count > original_count:
        print(f"  Split {original_count} utterances into {new_count} (based on 1.5s pauses)")

    # Convert transcript to a JSON-serializable dictionary
    result = {
        "id": transcript.id,
        "status": str(transcript.status),
        "audio_url": transcript.audio_url,
        "text": transcript.text,
        "confidence": transcript.confidence,
        "audio_duration": transcript.audio_duration,
        "language_code": transcript.json_response.get("language_code", "unknown"),
        "utterances": split_utterances
    }
    return result


def process_video(video_path: Path, force: bool = False) -> Path:
    """
    Process a single video file.

    Returns the path to the output JSON file.
    """
    progress = load_progress()
    filename = video_path.name

    # Skip files already completed in a previous run (unless forced)
    if not force and filename in progress and progress[filename].get("status") == "completed":
        output_path = Path(progress[filename]["output_file"])
        if output_path.exists():
            # BUGFIX: previously printed the literal text "(unknown)"
            print(f"Skipping {filename} (already processed)")
            return output_path

    print(f"\n{'='*50}")
    # BUGFIX: previously printed the literal text "(unknown)"
    print(f"Processing: {filename}")
    print(f"{'='*50}")

    try:
        # Record in-flight status so an interrupted run is visible
        progress[filename] = {"status": "transcribing"}
        save_progress(progress)

        transcript_data = transcribe_video(video_path)

        # Save transcript to JSON
        output_filename = video_path.stem + "_assemblyai.json"
        output_path = OUTPUT_DIR / output_filename
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(transcript_data, f, indent=2, ensure_ascii=False)

        # Update progress
        progress[filename] = {
            "status": "completed",
            "output_file": str(output_path),
            "utterance_count": len(transcript_data["utterances"])
        }
        save_progress(progress)

        print(f"  Saved to: {output_path}")
        print(f"  Utterances: {len(transcript_data['utterances'])}")
        return output_path

    except Exception as e:
        progress[filename] = {"status": "error", "error": str(e)}
        save_progress(progress)
        print(f"  ERROR: {e}")
        raise


def get_input_files() -> list[Path]:
    """Discover all video files in episodes/ folder."""
    if not EPISODES_DIR.exists():
        return []

    # Support common video formats
    video_extensions = {'.mp4', '.mkv', '.avi', '.mov', '.webm'}
    files = []
    for f in EPISODES_DIR.iterdir():
        if f.is_file() and f.suffix.lower() in video_extensions:
            files.append(f)
    return sorted(files)


def main():
    """Entry point: transcribe every video in EPISODES_DIR."""
    ensure_dirs()

    # Check for force flag
    force = "--force" in sys.argv or "-f" in sys.argv

    # Discover input files
    video_files = get_input_files()
    if not video_files:
        print(f"No video files found in {EPISODES_DIR}/")
        print("Supported formats: .mp4, .mkv, .avi, .mov, .webm")
        sys.exit(1)

    print(f"Found {len(video_files)} video(s) in {EPISODES_DIR}/")
    if force:
        print("Force mode: ON (reprocessing all files)")
    print("")

    # Process all videos
    success_count = 0
    fail_count = 0
    for video_path in video_files:
        try:
            process_video(video_path, force=force)
            success_count += 1
        except Exception as e:
            print(f"\n❌ Failed to process {video_path.name}: {e}")
            fail_count += 1

    # Summary
    print("\n" + "="*50)
    print(f"Step 1 Complete: {success_count} succeeded, {fail_count} failed")
    print("="*50)

    if fail_count > 0:
        sys.exit(1)


if __name__ == "__main__":
    main()