224 lines
6.5 KiB
Python
224 lines
6.5 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Step 1: Extract transcript from video using AssemblyAI.
|
|
|
|
Input: Video files in "episodes/" folder
|
|
Output: Raw AssemblyAI transcript JSON in "_assembleai/" folder
|
|
|
|
Usage:
|
|
uv run step1_transcribe.py
|
|
uv run step1_transcribe.py -f # Force reprocessing
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import json
|
|
from pathlib import Path
|
|
|
|
import assemblyai as aai
|
|
|
|
# ============== Configuration ==============

# Folder scanned for input video files.
EPISODES_DIR = Path("episodes")
# Folder that receives one transcript JSON file per processed video.
# NOTE(review): the folder name is spelled "_assembleai" (not "assemblyai");
# presumably other pipeline steps read from this exact name - confirm before
# renaming.
OUTPUT_DIR = Path("_assembleai")
# JSON file that records the per-video processing status between runs.
PROGRESS_FILE = Path(".step1_progress.json")
|
|
|
|
|
|
def ensure_dirs() -> None:
    """Create the transcript output directory if it does not exist yet.

    ``parents=True`` keeps this working when ``OUTPUT_DIR`` points at a nested
    path; ``exist_ok=True`` makes repeated calls a no-op.
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
|
|
|
|
|
|
def load_progress() -> dict:
    """Load the per-file progress record from ``PROGRESS_FILE``.

    Returns:
        The mapping stored in the progress file, or an empty dict when the
        file is missing, cannot be parsed as JSON (for example because an
        earlier run was interrupted while writing it), or does not hold a
        JSON object.
    """
    try:
        with open(PROGRESS_FILE, 'r', encoding='utf-8') as f:
            progress = json.load(f)
    except FileNotFoundError:
        # First run: nothing has been recorded yet.
        return {}
    except (json.JSONDecodeError, UnicodeDecodeError) as exc:
        # A damaged progress file must not block every later run; falling
        # back to an empty record just means nothing is treated as done.
        print(f"Warning: ignoring unreadable progress file {PROGRESS_FILE}: {exc}")
        return {}

    if not isinstance(progress, dict):
        # Callers index the result by filename, so anything else is unusable.
        print(f"Warning: ignoring progress file {PROGRESS_FILE}: expected a JSON object")
        return {}
    return progress
|
|
|
|
|
|
def save_progress(progress: dict) -> None:
    """Write the progress record to ``PROGRESS_FILE`` as indented JSON.

    The data is first written to a temporary file next to the real one and
    then moved into place with ``os.replace``. If the write is interrupted or
    serialization fails, the previous progress file is left untouched instead
    of being truncated.

    Args:
        progress: Mapping of video filename to its status record.

    Raises:
        TypeError: If ``progress`` contains values that are not JSON
            serializable.
        OSError: If the file cannot be written or moved into place.
    """
    tmp_path = PROGRESS_FILE.with_name(PROGRESS_FILE.name + ".tmp")
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(progress, f, indent=2)
        os.replace(tmp_path, PROGRESS_FILE)
    except BaseException:
        # Do not leave a half-written temporary file behind.
        tmp_path.unlink(missing_ok=True)
        raise
|
|
|
|
|
|
def _word_to_dict(word) -> dict:
    """Convert one transcript word object into a JSON-serializable dict."""
    return {
        "text": word.text,
        "start": word.start,
        "end": word.end,
        # The speaker attribute may be absent on a word object.
        "speaker": getattr(word, 'speaker', None),
    }


def _utterance_to_dict(utt) -> dict:
    """Convert one transcript utterance (and its words) into a plain dict."""
    return {
        "speaker": utt.speaker,
        "text": utt.text.strip(),
        "start": utt.start,
        "end": utt.end,
        "confidence": getattr(utt, 'confidence', None),
        # A missing or None word list serializes as an empty list.
        "words": [_word_to_dict(w) for w in (getattr(utt, 'words', None) or [])],
    }


def transcribe_video(video_path: Path) -> dict:
    """
    Transcribe a video with AssemblyAI, with speaker labels enabled.

    Args:
        video_path: Path of the local video file handed to the transcriber.

    Returns:
        A JSON-serializable dict with the transcript id, status, audio URL,
        full text, confidence, audio duration, language code and the list of
        utterances (each with its words). The utterance list is empty when
        the transcript carries no utterances.

    Raises:
        ValueError: If the ASSEMBLYAI_API_KEY environment variable is unset
            or empty.
        RuntimeError: If the transcript comes back with an error status.
    """
    api_key = os.getenv("ASSEMBLYAI_API_KEY")
    if not api_key:
        raise ValueError("ASSEMBLYAI_API_KEY environment variable is required")

    aai.settings.api_key = api_key

    print(f" Uploading {video_path.name}...")

    # Speaker diarization config: between 2 and 10 speakers.
    # NOTE(review): the original author's note says a lower
    # speaker_sensitivity makes diarization more sensitive to speaker changes
    # (yielding more speakers) - confirm these keyword names and their
    # meaning against the installed SDK version.
    speaker_options = aai.SpeakerOptions(
        min_speakers=2,
        max_speakers=10,
        speaker_sensitivity=0.2,
    )

    config = aai.TranscriptionConfig(
        speaker_labels=True,
        speech_models=["universal-2"],
        language_detection=True,  # Auto-detect language
        speaker_options=speaker_options,
    )

    transcriber = aai.Transcriber(config=config)
    transcript = transcriber.transcribe(str(video_path))

    if transcript.status == aai.TranscriptStatus.error:
        raise RuntimeError(f"Transcription failed: {transcript.error}")

    print(" Transcription complete!")

    # Guard against None so a transcript without a raw response or without
    # utterances still serializes instead of raising.
    raw_response = transcript.json_response or {}
    utterances = transcript.utterances or []

    # Flatten the transcript into plain data; apart from stripping the
    # utterance text, values are copied through unchanged.
    return {
        "id": transcript.id,
        "status": str(transcript.status),
        "audio_url": transcript.audio_url,
        "text": transcript.text,
        "confidence": transcript.confidence,
        "audio_duration": transcript.audio_duration,
        "language_code": raw_response.get("language_code", "unknown"),
        "utterances": [_utterance_to_dict(utt) for utt in utterances],
    }
|
|
|
|
|
|
def process_video(video_path: Path, force: bool = False) -> Path:
    """
    Transcribe one video and write its transcript JSON into the output folder.

    The progress file is updated before transcription starts ("transcribing"),
    after the transcript is saved ("completed") and on failure ("error").
    A video already marked completed whose output file still exists is
    skipped unless ``force`` is true.

    Returns the path of the transcript JSON file; any exception raised while
    processing is recorded in the progress file and re-raised.
    """
    tracker = load_progress()
    name = video_path.name

    # Reuse an earlier result unless a re-run was requested.
    if not force:
        previous = tracker.get(name, {})
        if previous.get("status") == "completed":
            cached = Path(previous["output_file"])
            if cached.exists():
                print(f"Skipping {name} (already processed)")
                return cached

    rule = "=" * 50
    print(f"\n{rule}")
    print(f"Processing: {name}")
    print(rule)

    try:
        # Mark the file as in flight before the transcription call.
        tracker[name] = {"status": "transcribing"}
        save_progress(tracker)

        transcript_data = transcribe_video(video_path)

        # Persist the transcript next to the other outputs.
        destination = OUTPUT_DIR / f"{video_path.stem}_assemblyai.json"
        with destination.open('w', encoding='utf-8') as handle:
            json.dump(transcript_data, handle, indent=2, ensure_ascii=False)

        utterance_total = len(transcript_data["utterances"])

        # Record success so later runs can skip this file.
        tracker[name] = {
            "status": "completed",
            "output_file": str(destination),
            "utterance_count": utterance_total,
        }
        save_progress(tracker)

        print(f" Saved to: {destination}")
        print(f" Utterances: {utterance_total}")

        return destination

    except Exception as err:
        # Record the failure, then let the caller decide what to do with it.
        tracker[name] = {"status": "error", "error": str(err)}
        save_progress(tracker)
        print(f" ERROR: {err}")
        raise
|
|
|
|
|
|
def get_input_files() -> list[Path]:
    """Return the video files found directly in the episodes folder, sorted.

    Only regular files whose extension (compared case-insensitively) is one
    of the supported video formats are included. An empty list is returned
    when the episodes folder does not exist.
    """
    if not EPISODES_DIR.exists():
        return []

    # Common video container formats accepted as input.
    video_extensions = {'.mp4', '.mkv', '.avi', '.mov', '.webm'}
    return sorted(
        entry
        for entry in EPISODES_DIR.iterdir()
        if entry.is_file() and entry.suffix.lower() in video_extensions
    )
|
|
|
|
|
|
def main() -> None:
    """Transcribe every video in the episodes folder and print a summary.

    Passing ``--force`` or ``-f`` on the command line reprocesses videos that
    were already completed. A failure on one video is reported and does not
    stop the remaining videos from being processed.

    Exits with status 1 when no video files are found or when at least one
    video failed.
    """
    ensure_dirs()

    # Check for force flag
    force = "--force" in sys.argv or "-f" in sys.argv

    # Discover input files
    video_files = get_input_files()

    if not video_files:
        print(f"No video files found in {EPISODES_DIR}/")
        print("Supported formats: .mp4, .mkv, .avi, .mov, .webm")
        sys.exit(1)

    print(f"Found {len(video_files)} video(s) in {EPISODES_DIR}/")
    if force:
        print("Force mode: ON (reprocessing all files)")
    print("")

    # Process all videos; skipped (already processed) videos count as success.
    success_count = 0
    fail_count = 0

    for video_path in video_files:
        try:
            process_video(video_path, force=force)
        except Exception as e:
            print(f"\n❌ Failed to process {video_path.name}: {e}")
            fail_count += 1
        else:
            success_count += 1

    # Summary
    print("\n" + "="*50)
    print(f"Step 1 Complete: {success_count} succeeded, {fail_count} failed")
    print("="*50)

    if fail_count > 0:
        sys.exit(1)
|
|
|
|
|
|
# Run the transcription step only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|