a
This commit is contained in:
747
transcribe_episodes.py
Normal file
747
transcribe_episodes.py
Normal file
@@ -0,0 +1,747 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Transcribe episodes with speaker diarization and name inference.
|
||||
|
||||
Requirements:
|
||||
uv sync
|
||||
# or: uv run transcribe_episodes.py (auto-installs deps)
|
||||
|
||||
Environment Variables:
|
||||
ASSEMBLYAI_API_KEY - Your AssemblyAI API key
|
||||
OPENAI_API_KEY - Your OpenAI/Kimi API key
|
||||
OPENAI_BASE_URL - (Optional) API base URL
|
||||
- Regular Kimi: https://api.moonshot.cn/v1
|
||||
- Kimi Code: https://api.kimi.com/coding/v1
|
||||
LLM_MODEL - (Optional) Model name, e.g., "kimi-for-coding"
|
||||
|
||||
Usage:
|
||||
uv run transcribe_episodes.py
|
||||
uv run transcribe_episodes.py status
|
||||
uv run transcribe_episodes.py reset [filename]
|
||||
uv run transcribe_episodes.py test-llm
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import re
|
||||
from pathlib import Path
|
||||
from dataclasses import dataclass, asdict
|
||||
from typing import List, Optional, Dict
|
||||
from datetime import timedelta
|
||||
|
||||
import assemblyai as aai
|
||||
from openai import OpenAI
|
||||
|
||||
|
||||
# ============== Configuration ==============

EPISODES_DIR = Path("episodes")                        # input video files
OUTPUT_DIR = Path("transcripts")                       # formatted transcript output
PROGRESS_FILE = Path(".transcription_progress.json")   # resume-state tracking

# Characters to recognize
CHARACTERS = ["Malabar", "Sun", "Jupiter", "Kangarro", "Mole"]

# LLM Configuration
# For Kimi Code API: set OPENAI_BASE_URL="https://api.kimi.com/coding/v1" and LLM_MODEL="kimi-for-coding"
DEFAULT_LLM_BASE_URL = "https://api.moonshot.cn/v1"  # Default to regular Kimi
DEFAULT_LLM_MODEL = "kimi-latest"

# Patterns for non-word utterances to merge with adjacent lines
# These are sounds, modal particles, short acknowledgments
#
# Fix: the Chinese particles were previously written inside a character class
# ([嗯|啊|...]). Inside [...] the `|` separators are literal pipe characters and
# the multi-character particles (啊哈, 哎呀, ...) decay to their individual
# characters, so the old pattern also matched strings like "|||". They are now
# a proper alternation group.
NON_WORD_PATTERNS = [
    r'^\s*(?:嗯|啊|哦|呃|唉|哎|哈|哼|哟|唔|呦|啊哈|哦豁|哎呀|哎哟|呜呼)+\s*$',  # Chinese modal particles
    r'^\s*[Mm]hm+\s*$',     # Mhm
    r'^\s*[Uu]h+\s*$',      # Uh
    r'^\s*[Uu]m+\s*$',      # Um
    r'^\s*[Aa]h+\s*$',      # Ah
    r'^\s*[Oo]h+\s*$',      # Oh
    r'^\s*[Hh]uh+\s*$',     # Huh
    r'^\s*[Hh]mm+\s*$',     # Hmm
    r'^\s*[Yy]eah?\s*$',    # Yeah (standalone)
    r'^\s*[Nn]o+\s*$',      # No (standalone)
    r'^\s*[Oo]k+\s*$',      # Ok
    r'^\s*[Oo]kay+\s*$',    # Okay
    r'^\s*[Rr]ight+\s*$',   # Right (standalone)
    r'^\s*[Ww]hat+\s*$',    # What (standalone)
    r'^\s*\([^)]*\)\s*$',   # (laughs), (coughs), etc.
    r'^\s*\[[^\]]*\]\s*$',  # [laughs], [coughs], etc.
]

# One combined regex; each pattern is parenthesized so the top-level `|` join
# cannot bleed across pattern boundaries.
NON_WORD_REGEX = re.compile('|'.join(f'({p})' for p in NON_WORD_PATTERNS), re.IGNORECASE)
|
||||
|
||||
|
||||
# ============== Data Classes ==============
|
||||
|
||||
@dataclass
class Utterance:
    """One diarized utterance: who spoke, what was said, and when (ms)."""
    speaker: str  # Original speaker label from diarization (A, B, C...)
    text: str
    start_ms: int
    end_ms: int
    inferred_name: Optional[str] = None  # Filled in after LLM naming

    @property
    def timestamp(self) -> str:
        """Render the start time as a "[mm:ss]" transcript tag."""
        minutes, secs = divmod(self.start_ms // 1000, 60)
        return f"[{minutes:02d}:{secs:02d}]"
|
||||
|
||||
|
||||
@dataclass
class EpisodeProgress:
    """Track progress for an episode."""
    # Source video filename; used as the key in the progress JSON.
    filename: str
    status: str  # "pending", "transcribing", "naming", "completed", "error"
    # Populated only when status == "error".
    error_message: Optional[str] = None
    # Path of the written transcript once completed.
    output_file: Optional[str] = None
    # NOTE(review): this dataclass is never instantiated in this file —
    # ProgressManager stores plain dicts with the same keys. It documents the
    # record shape; confirm whether it can be wired in or removed.
|
||||
|
||||
|
||||
# ============== Progress Manager ==============
|
||||
|
||||
class ProgressManager:
    """Persists per-file processing state so completed work is not redone."""

    def __init__(self, progress_file: Path = PROGRESS_FILE):
        self.progress_file = progress_file
        self.progress: Dict[str, dict] = self._load()

    def _load(self) -> Dict[str, dict]:
        """Read the progress JSON from disk, or start empty if absent."""
        if not self.progress_file.exists():
            return {}
        with open(self.progress_file, 'r', encoding='utf-8') as f:
            return json.load(f)

    def save(self):
        """Write the in-memory progress map back to disk."""
        with open(self.progress_file, 'w', encoding='utf-8') as f:
            json.dump(self.progress, f, indent=2, ensure_ascii=False)

    def get_status(self, filename: str) -> Optional[str]:
        """Return the recorded status for a file, or None if unknown."""
        return self.progress.get(filename, {}).get('status')

    def set_status(self, filename: str, status: str, error_message: str = None, output_file: str = None):
        """Record a status (plus optional error/output metadata) and persist."""
        entry = self.progress.setdefault(filename, {})
        entry['status'] = status
        if error_message:
            entry['error'] = error_message
        if output_file:
            entry['output_file'] = output_file
        self.save()

    def is_completed(self, filename: str) -> bool:
        """True when the file has already been fully processed."""
        return self.get_status(filename) == "completed"

    def reset(self, filename: str = None):
        """Reset progress for a file or all files."""
        if filename:
            self.progress.pop(filename, None)
        else:
            self.progress = {}
        self.save()
|
||||
|
||||
|
||||
# ============== Transcription Service ==============
|
||||
|
||||
class AssemblyAITranscriber:
    """Handles transcription with speaker diarization using AssemblyAI."""

    def __init__(self, api_key: str):
        # The AssemblyAI SDK takes the key as module-level state rather than a
        # per-client constructor argument.
        aai.settings.api_key = api_key

    def transcribe(self, audio_file: Path, on_progress=None) -> List[Utterance]:
        """
        Transcribe audio file with speaker diarization.
        Returns list of Utterance objects.

        NOTE(review): `on_progress` is accepted but never invoked here —
        confirm whether any caller relies on it before removing.
        """
        config = aai.TranscriptionConfig(
            speaker_labels=True,
            # NOTE(review): verify `speech_models` against the installed SDK —
            # current AssemblyAI SDKs expose a singular `speech_model` option;
            # an unknown kwarg may be silently ignored or raise.
            speech_models=["universal-2"],
            language_detection=True,  # Auto-detect language (English, Chinese, etc.)
        )

        transcriber = aai.Transcriber(config=config)

        # Blocking call: uploads the file and polls until the job finishes.
        print(f" Uploading {audio_file.name}...")
        transcript = transcriber.transcribe(str(audio_file))

        if transcript.status == aai.TranscriptStatus.error:
            raise Exception(f"Transcription failed: {transcript.error}")

        print(f" Transcription complete. Processing utterances...")

        # Convert SDK utterance objects into our own dataclass so the rest of
        # the pipeline does not depend on the SDK types.
        utterances = []
        for utt in transcript.utterances:
            utterances.append(Utterance(
                speaker=utt.speaker,
                text=utt.text.strip(),
                start_ms=utt.start,
                end_ms=utt.end
            ))

        return utterances
|
||||
|
||||
|
||||
# ============== Speaker Naming Service ==============
|
||||
|
||||
class SpeakerNamer:
    """Uses LLM to infer speaker names from context."""

    # Kimi/Moonshot API endpoints to try
    KIMI_ENDPOINTS = [
        "https://api.moonshot.cn/v1",
        "https://api.moonshot.ai/v1",
    ]

    def __init__(self, api_key: str, base_url: Optional[str] = None, model: Optional[str] = None):
        """Configure an OpenAI-compatible chat client for speaker naming.

        Args:
            api_key: OpenAI or Kimi/Moonshot API key.
            base_url: Endpoint URL; falls back to DEFAULT_LLM_BASE_URL.
            model: Explicit model name; otherwise chosen by endpoint type.
        """
        # Determine API type
        base_url = base_url or DEFAULT_LLM_BASE_URL
        is_kimi = "moonshot" in base_url or "kimi" in base_url

        # Use provided model or default based on API
        if model:
            self.model = model
        elif is_kimi:
            self.model = DEFAULT_LLM_MODEL
        else:
            self.model = "gpt-4o-mini"

        # Debug: Show which API is being used (without exposing the key)
        print(f" LLM Config: Using {'Kimi' if is_kimi else 'OpenAI'} API")
        print(f" Base URL: {base_url}")
        print(f" Model: {self.model}")
        print(f" API Key set: {'Yes (starts with ' + api_key[:8] + '...)' if api_key else 'NO - MISSING!'}")

        self.api_key = api_key
        self.base_url = base_url
        self.is_kimi = is_kimi

        self.client = OpenAI(
            api_key=api_key,
            base_url=base_url
        )

    def _try_infer_with_endpoint(self, endpoint: str, prompt: str, speakers: List[str]) -> Optional[Dict[str, str]]:
        """Try to infer speakers using a specific endpoint.

        Builds a throwaway client for `endpoint` (the shared self.client is
        bound to the configured base_url). Exceptions propagate to the caller,
        which handles endpoint fallback.
        """
        client = OpenAI(api_key=self.api_key, base_url=endpoint)

        response = client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": "You are a helpful assistant that identifies speakers in a dialogue based on context and speaking style."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.3
        )

        result = response.choices[0].message.content
        return self._parse_naming_response(result, speakers)

    def infer_speakers(self, utterances: List[Utterance]) -> Dict[str, str]:
        """
        Infer speaker names based on context.
        Returns mapping of speaker_label -> name

        Falls back to generic "Speaker X" labels when every endpoint fails,
        so transcription output is still produced.
        """
        # Get unique speaker labels
        speakers = sorted(set(utt.speaker for utt in utterances))

        # Build context with samples from each speaker
        speaker_samples = {s: [] for s in speakers}
        for utt in utterances:
            if len(speaker_samples[utt.speaker]) < 5:  # Sample up to 5 utterances
                speaker_samples[utt.speaker].append(utt.text)

        # Build prompt
        prompt = self._build_naming_prompt(speaker_samples)

        print(f" Sending to LLM for speaker naming...")

        # Try different endpoints for Kimi
        endpoints_to_try = []
        if self.is_kimi:
            # Start with configured base_url, then try alternatives
            if self.base_url:
                endpoints_to_try.append(self.base_url)
            for ep in self.KIMI_ENDPOINTS:
                if ep not in endpoints_to_try:
                    endpoints_to_try.append(ep)
        else:
            # Non-Kimi: single attempt against the configured (or default) URL.
            endpoints_to_try = [self.base_url] if self.base_url else [None]

        last_error = None
        for endpoint in endpoints_to_try:
            try:
                print(f" Trying endpoint: {endpoint or 'default (OpenAI)'}")
                if self.is_kimi:
                    mapping = self._try_infer_with_endpoint(endpoint, prompt, speakers)
                else:
                    # Use default client for OpenAI
                    response = self.client.chat.completions.create(
                        model=self.model,
                        messages=[
                            {"role": "system", "content": "You are a helpful assistant that identifies speakers in a dialogue based on context and speaking style."},
                            {"role": "user", "content": prompt}
                        ],
                        temperature=0.3
                    )
                    result = response.choices[0].message.content
                    mapping = self._parse_naming_response(result, speakers)

                print(f" Identified speakers: {mapping}")
                return mapping

            except Exception as e:
                # Deliberate best-effort: remember the error and move on to the
                # next endpoint rather than aborting the episode.
                last_error = str(e)
                print(f" Failed: {last_error[:100]}...")
                continue  # Try next endpoint

        # All endpoints failed
        print(f"\n Warning: Speaker naming failed on all endpoints.")
        print(f" Last error: {last_error}")

        # Provide helpful guidance for authentication errors
        if "401" in str(last_error) or "403" in str(last_error) or "Invalid Authentication" in str(last_error):
            print("\n === Authentication Error ===")
            print(" Your API key was rejected. Please check:")
            print(" 1. Is OPENAI_API_KEY set to your Kimi API key?")
            print(" Get your key from: https://platform.moonshot.cn/")
            print(" 2. Does your API key have sufficient balance/credits?")
            print(" 3. Kimi Code API (api.kimi.com/coding) requires special access.")
            print(" For regular Kimi, use: export OPENAI_BASE_URL='https://api.moonshot.cn/v1'")
            print(" 4. The script will continue with generic speaker labels (Speaker A, B, etc.)")
            print(" =============================\n")

        # Fallback to speaker labels
        return {s: f"Speaker {s}" for s in speakers}

    def _build_naming_prompt(self, speaker_samples: Dict[str, List[str]]) -> str:
        """Build the user prompt: character roster plus sampled quotes per speaker."""
        prompt = """I have a transcript from a video featuring the following characters: """ + ", ".join(CHARACTERS) + """.

Below are sample utterances from each speaker. Based on the context, speaking style, and content, please identify which character is which speaker.

"""
        for speaker, samples in speaker_samples.items():
            prompt += f"Speaker {speaker}:\n"
            for i, sample in enumerate(samples, 1):
                prompt += f" {i}. \"{sample}\"\n"
            prompt += "\n"

        prompt += """Please respond with a JSON object mapping speaker labels to character names.
Example: {"A": "Malabar", "B": "Sun", "C": "Jupiter"}

Only use the character names from the list provided. If you're unsure, make your best guess based on the speaking style and context."""

        return prompt

    def _parse_naming_response(self, response: str, valid_speakers: List[str]) -> Dict[str, str]:
        """Parse LLM response to extract speaker mapping."""
        # Try to find JSON in the response
        # NOTE(review): redundant local import — json is already imported at
        # module level.
        import json

        # Look for JSON block. [^}]+ grabs the first single-level {...} span;
        # nested braces would not occur in the expected flat mapping.
        json_match = re.search(r'\{[^}]+\}', response)
        if json_match:
            try:
                mapping = json.loads(json_match.group())
                # Validate keys
                return {k: v for k, v in mapping.items() if k in valid_speakers}
            except json.JSONDecodeError:
                pass  # fall through to the regex fallback below

        # Fallback: look for "Speaker X: Name" pattern
        mapping = {}
        for speaker in valid_speakers:
            pattern = rf'["\']?{speaker}["\']?\s*[:=]\s*["\']?([^"\'\n,]+)'
            match = re.search(pattern, response, re.IGNORECASE)
            if match:
                name = match.group(1).strip()
                # Ensure name is in our character list; substring match lets
                # e.g. "probably Malabar" normalize to "Malabar".
                for char in CHARACTERS:
                    if char.lower() in name.lower():
                        mapping[speaker] = char
                        break
                else:
                    # for/else: no known character matched — keep the raw name.
                    mapping[speaker] = name

        # Fill in any missing speakers
        for speaker in valid_speakers:
            if speaker not in mapping:
                mapping[speaker] = f"Speaker {speaker}"

        return mapping
|
||||
|
||||
|
||||
# ============== Output Formatter ==============
|
||||
|
||||
class OutputFormatter:
    """Formats and merges utterances for output."""

    @staticmethod
    def is_non_word(text: str) -> bool:
        """Check if text is a non-word utterance (sound, modal particle)."""
        return bool(NON_WORD_REGEX.match(text.strip()))

    @classmethod
    def merge_utterances(cls, utterances: List[Utterance]) -> List[Utterance]:
        """
        Merge consecutive utterances from the same speaker that are non-words
        with adjacent meaningful utterances.

        NOTE: mutates the Utterance objects in the input list in place (text
        and start_ms of merge targets are rewritten).

        NOTE(review): when several same-speaker non-words precede a meaningful
        utterance, only the first is merged forward; the intermediate ones at
        indices i+1..j-1 are skipped (i jumps straight to j) and dropped from
        the output — confirm this is intended filler-dropping behavior.
        """
        if not utterances:
            return []

        merged = []
        i = 0

        while i < len(utterances):
            current = utterances[i]

            # Check if current is a non-word
            if cls.is_non_word(current.text):
                # Look ahead to find the next meaningful utterance from same speaker
                j = i + 1
                while j < len(utterances) and utterances[j].speaker == current.speaker:
                    if not cls.is_non_word(utterances[j].text):
                        # Merge current into the next meaningful one
                        utterances[j].text = current.text + " " + utterances[j].text
                        utterances[j].start_ms = current.start_ms
                        i = j
                        break
                    j += 1
                else:
                    # while/else: loop ran out (speaker changed or end of list)
                    # without finding a meaningful target — keep the non-word.
                    merged.append(current)
                    i += 1
            else:
                # Check if previous was a non-word from same speaker
                if merged and merged[-1].speaker == current.speaker and cls.is_non_word(merged[-1].text):
                    # Merge previous into current
                    current.text = merged[-1].text + " " + current.text
                    current.start_ms = merged[-1].start_ms
                    merged.pop()

                merged.append(current)
                i += 1

        return merged

    @classmethod
    def format_output(cls, utterances: List[Utterance], speaker_mapping: Dict[str, str]) -> str:
        """Format utterances to output string.

        Produces one "[mm:ss](Name) text" line per merged utterance.
        """
        # Apply speaker names
        for utt in utterances:
            utt.inferred_name = speaker_mapping.get(utt.speaker, f"Speaker {utt.speaker}")

        # Merge consecutive non-words
        merged = cls.merge_utterances(utterances)

        # Format lines
        lines = []
        for utt in merged:
            # Skip standalone non-words unless they're at the end.
            # NOTE(review): `utt != merged[-1]` is dataclass *value* equality,
            # not identity — an earlier utterance that happens to equal the
            # last one field-for-field would also be kept; verify.
            if cls.is_non_word(utt.text) and utt != merged[-1]:
                continue

            speaker_name = utt.inferred_name or f"Speaker {utt.speaker}"
            lines.append(f"{utt.timestamp}({speaker_name}) {utt.text}")

        return '\n'.join(lines)
|
||||
|
||||
|
||||
# ============== Main Processor ==============
|
||||
|
||||
class EpisodeProcessor:
    """Main processor that orchestrates transcription and naming."""

    def __init__(self):
        """Validate API keys from the environment and wire up the services.

        Raises:
            ValueError: if ASSEMBLYAI_API_KEY or OPENAI_API_KEY is unset.
        """
        # Check API keys
        assembly_key = os.getenv("ASSEMBLYAI_API_KEY")
        openai_key = os.getenv("OPENAI_API_KEY")

        if not assembly_key:
            raise ValueError("ASSEMBLYAI_API_KEY environment variable is required")
        if not openai_key:
            raise ValueError("OPENAI_API_KEY environment variable is required")

        self.transcriber = AssemblyAITranscriber(assembly_key)

        # Get LLM configuration from environment
        openai_base = os.getenv("OPENAI_BASE_URL", DEFAULT_LLM_BASE_URL)
        llm_model = os.getenv("LLM_MODEL")  # e.g., "kimi-for-coding"

        self.namer = SpeakerNamer(openai_key, openai_base, llm_model)

        self.progress = ProgressManager()
        self.formatter = OutputFormatter()

        # Ensure output directory exists
        OUTPUT_DIR.mkdir(exist_ok=True)

    def process_episode(self, video_file: Path) -> bool:
        """Process a single episode (transcribe -> name -> format -> save).

        Returns True on success (including already-completed and empty
        episodes); False when an exception was recorded.
        """
        filename = video_file.name

        # Check if already completed
        if self.progress.is_completed(filename):
            # Fix: these messages previously printed the literal "(unknown)"
            # instead of interpolating the filename.
            print(f"Skipping {filename} (already completed)")
            return True

        print(f"\n{'='*50}")
        print(f"Processing: {filename}")
        print(f"{'='*50}")

        try:
            # Step 1: Transcription
            self.progress.set_status(filename, "transcribing")
            utterances = self.transcriber.transcribe(video_file)

            if not utterances:
                # Nothing to transcribe is not an error; mark done with no output.
                print(f" No utterances found in {filename}")
                self.progress.set_status(filename, "completed", output_file=None)
                return True

            print(f" Found {len(utterances)} utterances")

            # Step 2: Speaker naming
            self.progress.set_status(filename, "naming")
            speaker_mapping = self.namer.infer_speakers(utterances)

            # Step 3: Format and save
            output_text = self.formatter.format_output(utterances, speaker_mapping)

            output_filename = video_file.stem + ".txt"
            output_path = OUTPUT_DIR / output_filename

            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(output_text)

            self.progress.set_status(filename, "completed", output_file=str(output_path))
            print(f" Saved to: {output_path}")

            return True

        except Exception as e:
            # Record the failure so `status` can surface it; other episodes
            # keep processing.
            error_msg = str(e)
            self.progress.set_status(filename, "error", error_message=error_msg)
            print(f" ERROR: {error_msg}")
            return False

    def process_all(self):
        """Process all video files in episodes directory."""
        # Robustness: a missing episodes directory previously raised an
        # unhandled FileNotFoundError from iterdir().
        if not EPISODES_DIR.is_dir():
            print(f"No video files found in {EPISODES_DIR}")
            return

        # Find all video files
        video_extensions = {'.mp4', '.mkv', '.avi', '.mov', '.webm'}
        video_files = [
            f for f in EPISODES_DIR.iterdir()
            if f.is_file() and f.suffix.lower() in video_extensions
        ]

        if not video_files:
            print(f"No video files found in {EPISODES_DIR}")
            return

        print(f"Found {len(video_files)} video file(s) to process")

        success_count = 0
        fail_count = 0

        for video_file in sorted(video_files):
            if self.process_episode(video_file):
                success_count += 1
            else:
                fail_count += 1

        print(f"\n{'='*50}")
        print(f"Processing complete: {success_count} succeeded, {fail_count} failed")
        print(f"Transcripts saved to: {OUTPUT_DIR}")
        print(f"Progress tracked in: {PROGRESS_FILE}")
|
||||
|
||||
|
||||
# ============== CLI ==============
|
||||
|
||||
def print_usage():
    """Print the CLI help text: commands, environment variables, examples."""
    usage_text = """
Usage: uv run transcribe_episodes.py [command]

Commands:
  (none)        Process all episodes
  reset         Reset progress for all files (will re-process everything)
  reset <file>  Reset progress for specific file
  status        Show current progress status
  test-llm      Test LLM API connection (diagnostic)

Environment Variables:
  ASSEMBLYAI_API_KEY  Required - Your AssemblyAI API key
  OPENAI_API_KEY      Required - Your OpenAI/Kimi API key
  OPENAI_BASE_URL     Optional - API endpoint URL
                      - Regular Kimi: https://api.moonshot.cn/v1
                      - Kimi Code: https://api.kimi.com/coding/v1
  LLM_MODEL           Optional - Model name (e.g., "kimi-for-coding")

Examples:
  # Regular Kimi
  export ASSEMBLYAI_API_KEY="your-assembly-key"
  export OPENAI_API_KEY="your-kimi-key"
  uv run transcribe_episodes.py

  # Kimi Code
  export ASSEMBLYAI_API_KEY="your-assembly-key"
  export OPENAI_API_KEY="your-kimi-code-key"
  export OPENAI_BASE_URL="https://api.kimi.com/coding/v1"
  export LLM_MODEL="kimi-for-coding"
  uv run transcribe_episodes.py

  # Test connection:
  uv run transcribe_episodes.py test-llm
"""
    print(usage_text)
|
||||
|
||||
|
||||
def test_llm_connection():
    """Test LLM API connection and print diagnostic info.

    Reads the same environment variables as the main pipeline, sends one tiny
    chat completion, and prints success or targeted troubleshooting guidance.
    Purely diagnostic: returns None in all cases.
    """
    api_key = os.getenv("OPENAI_API_KEY")
    base_url = os.getenv("OPENAI_BASE_URL", DEFAULT_LLM_BASE_URL)
    model = os.getenv("LLM_MODEL")  # e.g., "kimi-for-coding"

    print("=" * 50)
    print("LLM API Connection Test")
    print("=" * 50)
    print(f"\nConfiguration:")
    print(f" Base URL: {base_url}")
    print(f" Model: {model or '(auto-detect)'}")
    # Only a key prefix is shown, never the full secret.
    print(f" API Key: {'Set (starts with ' + api_key[:12] + '...)' if api_key else 'NOT SET'}")

    if not api_key:
        print("\n❌ ERROR: OPENAI_API_KEY environment variable is not set!")
        print("\nTo fix:")
        print(" export OPENAI_API_KEY='your-api-key-here'")
        return

    # Try connecting
    print(f"\nTesting connection...")

    # Same endpoint/model auto-detection as SpeakerNamer.__init__.
    is_kimi = "moonshot" in base_url or "kimi" in base_url
    if not model:
        model = "kimi-latest" if is_kimi else "gpt-4o-mini"

    endpoints_to_try = [base_url]

    for endpoint in endpoints_to_try:
        print(f"\n Trying endpoint: {endpoint}")
        print(f" Model: {model}")
        try:
            client = OpenAI(api_key=api_key, base_url=endpoint)

            # Minimal round-trip: one short prompt, capped at 10 tokens.
            response = client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": "Say 'Hello' and nothing else."}],
                max_tokens=10
            )

            print(f" ✅ SUCCESS!")
            print(f" Response: '{response.choices[0].message.content}'")
            print(f"\n✅ LLM API is working correctly!")
            return

        except Exception as e:
            error_str = str(e)
            print(f" ❌ Failed: {error_str}")

            # Auth failures get dedicated guidance; anything else just falls
            # through to the generic failure message below.
            if "401" in error_str or "403" in error_str:
                print("\n Authentication Error - Possible causes:")
                print(" 1. API key is incorrect or has been revoked")
                print(" 2. API key has no credits remaining")
                print(" 3. Wrong API key for the selected service")
                print(" 4. Kimi Code API requires special access (only for Coding Agents)")
                print("\n To fix:")
                print(" - For regular Kimi, use: export OPENAI_BASE_URL='https://api.moonshot.cn/v1'")
                print(" - For Kimi Code: Ensure your account has Coding Agent access")
                print(" - Get a regular Kimi API key from: https://platform.moonshot.cn/")

    print("\n❌ All endpoints failed. Please check your API key and try again.")
|
||||
|
||||
|
||||
def show_status():
    """Print a human-readable summary of the progress file.

    Lists every tracked episode with a status icon, plus the output path for
    completed ones and the error message for failed ones.
    """
    progress = ProgressManager()

    print(f"\nProgress Status ({PROGRESS_FILE}):")
    print("-" * 50)

    if not progress.progress:
        print("No progress recorded yet.")
        return

    for filename, data in sorted(progress.progress.items()):
        status = data.get('status', 'unknown')
        output = data.get('output_file', '-')
        error = data.get('error', '')

        status_icon = "✓" if status == "completed" else "✗" if status == "error" else "⋯"
        # Fix: this line previously printed the literal "(unknown)" instead of
        # interpolating the filename being reported.
        print(f"{status_icon} {filename}: {status}")
        if output and status == "completed":
            print(f" Output: {output}")
        if error:
            print(f" Error: {error}")
|
||||
|
||||
|
||||
def main():
    """CLI entry point: route the optional subcommand, else run the pipeline."""
    args = sys.argv[1:]

    if args:
        command = args[0]

        if command in ("help", "--help", "-h"):
            print_usage()
            return

        if command == "status":
            show_status()
            return

        if command == "reset":
            manager = ProgressManager()
            if len(args) > 1:
                target = args[1]
                manager.reset(target)
                print(f"Reset progress for: {target}")
            else:
                manager.reset()
                print("Reset all progress")
            return

        if command == "test-llm":
            test_llm_connection()
            return

        print(f"Unknown command: {command}")
        print_usage()
        return

    # No subcommand: transcribe every episode found on disk.
    try:
        EpisodeProcessor().process_all()
    except ValueError as e:
        # Missing API keys surface as ValueError from EpisodeProcessor.__init__.
        print(f"Error: {e}")
        print("\nPlease set the required environment variables:")
        print(" export ASSEMBLYAI_API_KEY='your-key'")
        print(" export OPENAI_API_KEY='your-key'")
        print("\nOptional (for Kimi Code):")
        print(" export OPENAI_BASE_URL='https://api.kimi.com/coding/v1'")
        print(" export LLM_MODEL='kimi-for-coding'")
        print("\nFor regular Kimi, the base URL defaults to https://api.moonshot.cn/v1")
        sys.exit(1)
    except Exception as e:
        print(f"Unexpected error: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user