Files
malabar/step1_transcribe.py
2026-03-03 17:20:29 +08:00

391 lines
12 KiB
Python

#!/usr/bin/env python3
"""
Step 1: Extract transcript from video using AssemblyAI.
Input: Video files in "episodes/" folder
Output: Raw AssemblyAI transcript JSON in "_assembleai/" folder
Usage:
uv run step1_transcribe.py
uv run step1_transcribe.py -f # Force reprocessing
"""
import os
import re
import sys
import json
from pathlib import Path
import assemblyai as aai
# ============== Configuration ==============
EPISODES_DIR = Path("episodes")  # input folder scanned for video files
OUTPUT_DIR = Path("_assembleai")  # raw AssemblyAI transcript JSON is written here
PROGRESS_FILE = Path(".step1_progress.json")  # per-file status map used to resume/skip
def ensure_dirs() -> None:
    """Ensure output directories exist.

    Uses parents=True so the call also succeeds if an intermediate
    directory is missing (harmless for the current single-level path,
    robust if OUTPUT_DIR is ever made nested).
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
def split_words_by_sentences(words: list) -> list:
    """
    Split a list of words into sentence segments based on punctuation.

    Args:
        words: List of word dictionaries with 'text' key

    Returns:
        List of word segments (lists of the original dicts), one per sentence;
        trailing words without closing punctuation form a final segment.
    """
    if not words:
        return []

    # A sentence ends on '.', '!' or '?', optionally followed by closing
    # quotes/brackets (e.g. «done."» or «done.)»).
    end_of_sentence = re.compile(r'[.!?]+["\')\]]*$')

    # Indices of words that terminate a sentence.
    breaks = [
        idx for idx, w in enumerate(words)
        if end_of_sentence.search(w.get("text", ""))
    ]

    segments = []
    start = 0
    for stop in breaks:
        segments.append(words[start:stop + 1])
        start = stop + 1
    # Any words after the last sentence boundary become a final segment.
    if start < len(words):
        segments.append(words[start:])
    return segments
def ends_with_sentence_punctuation(text: str) -> bool:
    """Check if text ends with sentence-ending punctuation.

    Trailing whitespace and closing quotes/brackets (" ' ) ]) are ignored,
    so both «done.» and «done."» count as sentence ends.
    """
    trimmed = text.strip().rstrip("\"')]")
    return trimmed.endswith((".", "!", "?"))
def merge_incomplete_sentences(utterances: list) -> list:
    """
    Merge consecutive utterances where the first doesn't end with sentence punctuation.

    This handles cases where AssemblyAI splits mid-sentence between speakers.
    The merged utterance keeps the FIRST speaker's label.
    """
    if not utterances:
        return utterances

    merged = []
    pending = utterances[0].copy()
    for utt in utterances[1:]:
        if ends_with_sentence_punctuation(pending.get("text", "")):
            # Pending utterance is a complete sentence: emit it and start fresh.
            merged.append(pending)
            pending = utt.copy()
        else:
            # Mid-sentence split: absorb the next utterance into the pending
            # one (words, text, end time), keeping the first speaker's label.
            pending["words"] = pending.get("words", []) + utt.get("words", [])
            pending["text"] = pending.get("text", "") + " " + utt.get("text", "")
            pending["end"] = utt.get("end", pending["end"])
    # Emit whatever is still pending at the end.
    merged.append(pending)
    return merged
def split_utterances_by_pauses(utterances: list, pause_threshold_ms: int = 1500) -> list:
    """
    Split long utterances based on pauses between words and sentence boundaries.

    Args:
        utterances: List of utterance dictionaries from AssemblyAI
        pause_threshold_ms: Minimum gap (in milliseconds) to create a new utterance

    Returns:
        List of split utterances (speaker/text/start/end/words dicts);
        utterances without word-level data pass through unchanged.
    """
    # First, merge consecutive utterances that don't end with sentence punctuation.
    utterances = merge_incomplete_sentences(utterances)

    result = []
    for utt in utterances:
        words = utt.get("words", [])
        if not words:
            # No word-level data, keep original.
            result.append(utt)
            continue

        speaker = utt.get("speaker", "?")
        segments = []

        def flush(segment_words: list) -> None:
            # Split a run of words at sentence boundaries and record each piece.
            for seg_words in split_words_by_sentences(segment_words):
                segments.append({
                    "speaker": speaker,
                    "words": seg_words,
                    "start": seg_words[0]["start"],
                    "end": seg_words[-1]["end"]
                })

        current_segment_words = []
        for word in words:
            if current_segment_words:
                # Gap between this word and the previous one, in milliseconds.
                gap = word.get("start", 0) - current_segment_words[-1].get("end", 0)
                if gap >= pause_threshold_ms:
                    # Long pause: close out the current run before starting a new one.
                    flush(current_segment_words)
                    current_segment_words = [word]
                    continue
            current_segment_words.append(word)
        # Don't forget the trailing run of words.
        if current_segment_words:
            flush(current_segment_words)

        # Convert segments to utterance format, dropping empty-text segments.
        for seg in segments:
            text = " ".join(w.get("text", "") for w in seg["words"]).strip()
            if text:
                result.append({
                    "speaker": seg["speaker"],
                    "text": text,
                    "start": seg["start"],
                    "end": seg["end"],
                    "words": seg["words"]
                })
    return result
def load_progress() -> dict:
    """Return the saved progress mapping, or an empty dict if none exists."""
    if not PROGRESS_FILE.exists():
        return {}
    with PROGRESS_FILE.open("r", encoding="utf-8") as fh:
        return json.load(fh)
def save_progress(progress: dict) -> None:
    """Persist the progress mapping to the tracking file as pretty-printed JSON."""
    with PROGRESS_FILE.open("w", encoding="utf-8") as fh:
        json.dump(progress, fh, indent=2)
def transcribe_video(video_path: Path) -> dict:
    """
    Transcribe video using AssemblyAI with speaker diarization.

    Args:
        video_path: Local video file to upload to AssemblyAI.

    Returns:
        The raw transcript as a dictionary, with utterances split on long
        pauses and sentence boundaries (see split_utterances_by_pauses).

    Raises:
        ValueError: If ASSEMBLYAI_API_KEY is not set in the environment.
        Exception: If AssemblyAI reports a failed transcription.
    """
    api_key = os.getenv("ASSEMBLYAI_API_KEY")
    if not api_key:
        raise ValueError("ASSEMBLYAI_API_KEY environment variable is required")
    aai.settings.api_key = api_key
    print(f" Uploading {video_path.name}...")
    # Speaker diarization config
    # By default, AssemblyAI detects 1-10 speakers
    # If you know the expected number, you can set speakers_expected
    # Or set speaker_options for a range
    speaker_options = aai.SpeakerOptions(
        min_speakers=2,
        max_speakers=10  # Allow up to 10 speakers
    )
    # NOTE(review): the AssemblyAI SDK documents a singular `speech_model`
    # option on TranscriptionConfig — confirm that `speech_models` (plural,
    # list) is accepted by the SDK version pinned for this project.
    config = aai.TranscriptionConfig(
        speaker_labels=True,
        speech_models=["universal-2"],
        language_detection=True,  # Auto-detect language
        speaker_options=speaker_options,
    )
    transcriber = aai.Transcriber(config=config)
    # Blocking call: uploads the file and waits for processing to finish.
    transcript = transcriber.transcribe(str(video_path))
    if transcript.status == aai.TranscriptStatus.error:
        raise Exception(f"Transcription failed: {transcript.error}")
    print(f" Transcription complete!")
    # Convert utterances to dictionaries first
    raw_utterances = []
    for utt in transcript.utterances:
        raw_utterances.append({
            "speaker": utt.speaker,
            "text": utt.text.strip(),
            "start": utt.start,
            "end": utt.end,
            # hasattr guards: some SDK objects may lack these attributes.
            "confidence": utt.confidence if hasattr(utt, 'confidence') else None,
            "words": [
                {
                    "text": w.text,
                    "start": w.start,
                    "end": w.end,
                    "speaker": w.speaker if hasattr(w, 'speaker') else None
                }
                for w in (utt.words if hasattr(utt, 'words') else [])
            ]
        })
    # Split long utterances based on pauses
    original_count = len(raw_utterances)
    split_utterances = split_utterances_by_pauses(raw_utterances, pause_threshold_ms=1500)
    new_count = len(split_utterances)
    if new_count > original_count:
        print(f" Split {original_count} utterances into {new_count} (based on 1.5s pauses)")
    # Convert transcript to serializable dictionary
    result = {
        "id": transcript.id,
        "status": str(transcript.status),
        "audio_url": transcript.audio_url,
        "text": transcript.text,
        "confidence": transcript.confidence,
        "audio_duration": transcript.audio_duration,
        # The language code lives in the raw JSON response, not on the object.
        "language_code": transcript.json_response.get("language_code", "unknown"),
        "utterances": split_utterances
    }
    return result
def process_video(video_path: Path, force: bool = False) -> Path:
    """
    Process a single video file end-to-end: transcribe and save JSON.

    Args:
        video_path: Video file to transcribe.
        force: When True, reprocess even if a completed output already exists.

    Returns:
        Path to the output JSON file.

    Raises:
        Exception: Re-raises any transcription/serialization failure after
            recording it in the progress file.
    """
    progress = load_progress()
    filename = video_path.name
    # Skip work that already completed, unless forced or the output vanished.
    if not force and filename in progress and progress[filename].get("status") == "completed":
        output_path = Path(progress[filename]["output_file"])
        if output_path.exists():
            # BUGFIX: these prints contained literal "(unknown)" instead of
            # the filename (placeholder lost in an f-string) — restored.
            print(f"Skipping {filename} (already processed)")
            return output_path
    print(f"\n{'='*50}")
    print(f"Processing: {filename}")
    print(f"{'='*50}")
    try:
        # Mark as in-flight so an interrupted run is visible in the progress file.
        progress[filename] = {"status": "transcribing"}
        save_progress(progress)
        transcript_data = transcribe_video(video_path)
        # Save the raw transcript JSON alongside other outputs.
        output_filename = video_path.stem + "_assemblyai.json"
        output_path = OUTPUT_DIR / output_filename
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(transcript_data, f, indent=2, ensure_ascii=False)
        # Record success with enough metadata to skip this file next run.
        progress[filename] = {
            "status": "completed",
            "output_file": str(output_path),
            "utterance_count": len(transcript_data["utterances"])
        }
        save_progress(progress)
        print(f" Saved to: {output_path}")
        print(f" Utterances: {len(transcript_data['utterances'])}")
        return output_path
    except Exception as e:
        # Record the failure, then propagate so the caller can count it.
        progress[filename] = {"status": "error", "error": str(e)}
        save_progress(progress)
        print(f" ERROR: {e}")
        raise
def get_input_files() -> list[Path]:
    """Discover all video files in episodes/ folder, sorted by name."""
    if not EPISODES_DIR.exists():
        return []
    # Support common video formats
    video_extensions = {'.mp4', '.mkv', '.avi', '.mov', '.webm'}
    return sorted(
        entry for entry in EPISODES_DIR.iterdir()
        if entry.is_file() and entry.suffix.lower() in video_extensions
    )
def main() -> None:
    """Entry point: transcribe every video found in the episodes folder.

    Exits with status 1 if no input files are found or if any video fails.
    """
    ensure_dirs()
    # Check for force flag
    force = "--force" in sys.argv or "-f" in sys.argv
    # Discover input files
    video_files = get_input_files()
    if not video_files:
        print(f"No video files found in {EPISODES_DIR}/")
        print("Supported formats: .mp4, .mkv, .avi, .mov, .webm")
        sys.exit(1)
    print(f"Found {len(video_files)} video(s) in {EPISODES_DIR}/")
    if force:
        print("Force mode: ON (reprocessing all files)")
    print("")
    # Process all videos, counting outcomes rather than aborting on the
    # first failure. (Removed an unused local that held the return value.)
    success_count = 0
    fail_count = 0
    for video_path in video_files:
        try:
            process_video(video_path, force=force)
            success_count += 1
        except Exception as e:
            print(f"\n❌ Failed to process {video_path.name}: {e}")
            fail_count += 1
    # Summary
    print("\n" + "="*50)
    print(f"Step 1 Complete: {success_count} succeeded, {fail_count} failed")
    print("="*50)
    # Non-zero exit signals partial failure to the calling pipeline.
    if fail_count > 0:
        sys.exit(1)
# Allow importing this module (e.g. for reuse of the splitting helpers)
# without triggering the transcription pipeline.
if __name__ == "__main__":
    main()