
Transcribe Audio with Multiple Speakers

Code
  • import whisperx
    import os
    import tkinter as tk
    from tkinter import filedialog, ttk, scrolledtext
    from pydub import AudioSegment
    import logging
    import subprocess
    import sys
    import threading
    import shutil
    import warnings
    
    # Suppress noisy library warnings
    warnings.filterwarnings("ignore", category=UserWarning)
    
    # Set up logging
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
    logger = logging.getLogger()
    
    # Force WhisperX to use local VAD model to avoid redirect error
    os.environ["WHISPERX_VAD_MODEL_PATH"] = r"D:\\PY\\models\\vad\\pytorch_model.bin"
    
    # Redirect logging to GUI log window
    class TextHandler(logging.Handler):
        def __init__(self, text_widget):
            super().__init__()
            self.text_widget = text_widget
    
        def emit(self, record):
            try:
                msg = self.format(record)
                if self.text_widget.winfo_exists():  # Check if widget still exists
                    self.text_widget.insert(tk.END, msg + '\n')
                    self.text_widget.see(tk.END)
                    self.text_widget.update()
            except tk.TclError:
                pass  # Ignore errors if GUI is closed
    
    # Supported audio/video formats
    SUPPORTED_FORMATS = ['.wav', '.m4a', '.mp3', '.mp4', '.mkv']
    
    # Function to convert audio to WAV if not already WAV
    
    def convert_to_wav(input_file, output_dir, temp_dir):
        file_ext = os.path.splitext(input_file)[1].lower()
        if file_ext == '.wav':
            logger.info(f"Input file {input_file} is already WAV. No conversion needed.")
            return input_file
    
        output_file = os.path.join(temp_dir, os.path.splitext(os.path.basename(input_file))[0] + '.wav')
        try:
            format_param = 'matroska' if file_ext == '.mkv' else file_ext[1:]
            audio = AudioSegment.from_file(input_file, format=format_param)
            audio.export(output_file, format='wav')
            logger.info(f"Converted {input_file} to {output_file}")
            return output_file
        except Exception as e:
            logger.error(f"Error converting {input_file} to WAV: {str(e)}")
            try:
                result = subprocess.run(
                    ['ffmpeg', '-i', input_file],
                    capture_output=True, text=True, check=False
                )
                logger.error(f"FFmpeg output: {result.stderr}")
            except Exception as ffmpeg_e:
                logger.error(f"Could not run FFmpeg to diagnose file: {str(ffmpeg_e)}")
            raise
    
    # Main transcription function
    
    def transcribe_audio(input_file, output_dir, temp_dir):
        wav_file = None
        try:
            # Convert to WAV if necessary
            wav_file = convert_to_wav(input_file, output_dir, temp_dir)
    
            # Load the model
            logger.info("Loading WhisperX model...")
            asr_options = {
                "max_new_tokens": 448,
                "clip_timestamps": False,
                "hallucination_silence_threshold": 0.6
            }
            model = whisperx.load_model("base", device="cpu", compute_type="float32", asr_options=asr_options)
    
            # Transcribe
            logger.info("Transcribing audio...")
            result = model.transcribe(
                wav_file,
                batch_size=16,
                language=None,
            )
    
            # Align timestamps
            logger.info("Aligning timestamps...")
            model_a, metadata = whisperx.load_align_model(language_code=result["language"], device="cpu")
            result = whisperx.align(result["segments"], model_a, metadata, wav_file, device="cpu")
    
            # Diarization
            logger.info("Performing diarization...")
            hf_token = "YOUR_HF_TOKEN"  # Replace with your Hugging Face token (required for the diarization models)
            try:
                diarize_model = whisperx.diarize.DiarizationPipeline(use_auth_token=hf_token, device="cpu")
                diarize_segments = diarize_model(wav_file)
                result = whisperx.assign_word_speakers(diarize_segments, result)
            except AttributeError:
                logger.warning("DiarizationPipeline not available in this whisperx version. Skipping diarization.")
                for segment in result["segments"]:
                    segment["speaker"] = "Unknown"
    
            # Save output
            output_file = os.path.join(output_dir, "transcription_with_speakers.txt")
            with open(output_file, "w", encoding="utf-8") as f:
                for segment in result["segments"]:
                    start = segment["start"]
                    end = segment["end"]
                    text = segment["text"]
                    speaker = segment.get("speaker", "Unknown")
                    f.write(f"[{start:.2f}s - {end:.2f}s] Speaker {speaker}: {text}\n")
    
            logger.info(f"Transcription complete. Output saved to {output_file}")
    
        except Exception as e:
            logger.error(f"Error during transcription: {str(e)}")
            raise
        finally:
            if wav_file and wav_file != input_file and os.path.exists(wav_file):
                try:
                    os.remove(wav_file)
                    logger.info(f"Removed temporary WAV file: {wav_file}")
                except Exception as e:
                    logger.warning(f"Could not remove temporary WAV file {wav_file}: {str(e)}")
    
    # GUI Application
    
    class TranscriptionApp:
        def __init__(self, root):
            self.root = root
            self.root.title("Audio Transcription")
            self.root.geometry("600x600")
    
            tk.Label(root, text="Input Audio File:").pack(pady=5)
            self.input_entry = tk.Entry(root, width=50)
            self.input_entry.pack(pady=5)
            tk.Button(root, text="Browse", command=self.browse_input).pack(pady=5)
    
            tk.Label(root, text="Output Directory:").pack(pady=5)
            self.output_entry = tk.Entry(root, width=50)
            self.output_entry.pack(pady=5)
            tk.Button(root, text="Browse", command=self.browse_output).pack(pady=5)
    
            tk.Label(root, text="Temporary Directory (for WAV files):").pack(pady=5)
            self.temp_entry = tk.Entry(root, width=50)
            self.temp_entry.insert(0, "D:\\PY\\temp")
            self.temp_entry.pack(pady=5)
            tk.Button(root, text="Browse", command=self.browse_temp).pack(pady=5)
    
            tk.Button(root, text="Transcribe", command=self.start_transcription).pack(pady=10)
    
            tk.Label(root, text="Log:").pack(pady=5)
            self.log_text = scrolledtext.ScrolledText(root, height=10, width=60, wrap=tk.WORD)
            self.log_text.pack(pady=5)
    
            text_handler = TextHandler(self.log_text)
            text_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
            logger.addHandler(text_handler)
    
        def browse_input(self):
            file_path = filedialog.askopenfilename(filetypes=[("Audio/Video Files", "*.wav *.m4a *.mp3 *.mp4 *.mkv")])
            if file_path:
                self.input_entry.delete(0, tk.END)
                self.input_entry.insert(0, file_path)
    
        def browse_output(self):
            dir_path = filedialog.askdirectory()
            if dir_path:
                self.output_entry.delete(0, tk.END)
                self.output_entry.insert(0, dir_path)
    
        def browse_temp(self):
            dir_path = filedialog.askdirectory()
            if dir_path:
                self.temp_entry.delete(0, tk.END)
                self.temp_entry.insert(0, dir_path)
    
        def start_transcription(self):
            input_file = self.input_entry.get()
            output_dir = self.output_entry.get()
            temp_dir = self.temp_entry.get()
    
            if not input_file or not output_dir or not temp_dir:
                logger.error("Please select input file, output directory, and temporary directory.")
                return
    
            if not os.path.exists(input_file):
                logger.error(f"Input file {input_file} does not exist.")
                return
    
            if os.path.splitext(input_file)[1].lower() not in SUPPORTED_FORMATS:
                logger.error(f"Unsupported file format. Supported formats: {', '.join(SUPPORTED_FORMATS)}")
                return
    
            if not os.path.exists(output_dir):
                try:
                    os.makedirs(output_dir)
                    logger.info(f"Created output directory: {output_dir}")
                except Exception as e:
                    logger.error(f"Could not create output directory {output_dir}: {str(e)}")
                    return
    
            if not os.path.exists(temp_dir):
                try:
                    os.makedirs(temp_dir)
                    logger.info(f"Created temporary directory: {temp_dir}")
                except Exception as e:
                    logger.error(f"Could not create temporary directory {temp_dir}: {str(e)}")
                    return
    
            # Run transcription in a background thread so the GUI stays responsive
            threading.Thread(target=transcribe_audio, args=(input_file, output_dir, temp_dir), daemon=True).start()
    
    # Main execution
    if __name__ == "__main__":
        try:
            subprocess.run(['ffmpeg', '-version'], capture_output=True, check=True)
        except (subprocess.CalledProcessError, FileNotFoundError):
            logger.error("FFmpeg is not installed or not found. Please install FFmpeg to proceed.")
            sys.exit(1)
    
        default_temp_dir = "D:\\PY\\temp"
        if not os.path.exists(default_temp_dir):
            try:
                os.makedirs(default_temp_dir)
            except Exception as e:
                logger.error(f"Could not create default temporary directory {default_temp_dir}: {str(e)}")
                sys.exit(1)
    
        root = tk.Tk()
        app = TranscriptionApp(root)
        root.mainloop()
    
        try:
            if os.path.exists(default_temp_dir):
                shutil.rmtree(default_temp_dir)
                logger.info(f"Cleaned up default temporary directory: {default_temp_dir}")
        except Exception as e:
            logger.warning(f"Could not clean up default temporary directory {default_temp_dir}: {str(e)}")
    

🧠 WhisperX Offline Transcription Setup with GUI

📝 Summary

This guide details how to set up and patch WhisperX to transcribe long audio files (MP3, MP4, etc.) offline using a GUI-based Python app, bypassing VAD model downloads and network dependencies.


📦 Project Overview

  • Platform: Python 3.12 with WhisperX + SpeechBrain

  • Goal: Offline GUI app for audio transcription with speaker diarization

  • Input: Audio/Video file

  • Output: Timestamped transcript with speaker labels


✅ Key Features

  • No network requirement for VAD

  • GUI with file selection and logging

  • Long file support (tested on 3hr+ MP3)

  • Speaker diarization using speechbrain

  • Chunk-based transcription (VAD manually bypassed)


🛠️ Setup Instructions

1. 🐍 Python Environment

bash
python -m venv .venv
.venv\Scripts\activate
# tkinter ships with Python's standard library; no pip install needed
pip install whisperx torchaudio pydub speechbrain

2. 📁 Folder Structure

bash
/transcriber/
├── transcribe.py                  # GUI application
├── models/vad/pytorch_model.bin   # Downloaded manually
└── .venv/...

3. 🔧 Environment Variable (Set in transcribe.py)

python
os.environ["WHISPERX_VAD_MODEL_PATH"] = r"D:\PY\models\vad\pytorch_model.bin"
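
As a quick sanity check (a minimal sketch, assuming the path set above), you can fail fast if the model file is missing before any transcription starts:

python
import os

vad_path = os.environ["WHISPERX_VAD_MODEL_PATH"]
if not os.path.exists(vad_path):
    raise FileNotFoundError(f"Local VAD model not found at {vad_path}")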

4. 🎯 GUI Usage

Run:

bash
python transcribe.py

Then:

  • Select audio file

  • Choose output and temp folders

  • Click Transcribe
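
After a successful run, transcription_with_speakers.txt appears in the output directory, one line per segment in the format written by the code above, e.g. (illustrative values):

[0.00s - 4.20s] Speaker SPEAKER_00: Hello and welcome.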


🔧 WhisperX Modifications

vad.py Patch

  • Replaced Hugging Face model download with local load

  • Stubbed merge_chunks() for compatibility

python
def load_vad_model(...):
    model_fp = os.environ.get("WHISPERX_VAD_MODEL_PATH")
    if not model_fp or not os.path.exists(model_fp):
        raise FileNotFoundError("Local VAD model path invalid.")
    print(f"Using local VAD model at: {model_fp}")
    bundle = torchaudio.pipelines.HUBERT_BASE
    return bundle.get_model().to(device).eval()

def merge_chunks(chunks, *args, **kwargs):
    return chunks
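
The HuBERT bundle here is only a stand-in so that load_vad_model() returns a model-shaped object; with merge_chunks() stubbed and chunking injected in asr.py (below), its output is never actually used for segmentation.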

asr.py Patch

  • Skipped internal VAD model logic

  • Injected manual chunking (30s per segment)

Modified transcribe() inside FasterWhisperPipeline:

python
duration = audio.shape[0] / SAMPLE_RATE
chunk_duration = 30.0
vad_segments = []
start = 0.0
while start < duration:
    end = min(start + chunk_duration, duration)
    vad_segments.append({"start": start, "end": end})
    start = end
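
The 30-second chunk length matches Whisper's fixed input window, so each segment fits in a single decoding pass without further padding or splitting.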

🐛 Issues Resolved

  • TranscriptionOptions.__new__() missing args → manually passed asr_options with the required fields

  • HTTP 301 when fetching the VAD model → replaced the remote load with the offline .bin path

  • 'dict' object has no attribute 'ndim' → the dummy VAD model returned an incompatible type, so VAD was fully bypassed

  • vad_segments unexpected argument → removed the invalid parameter from the transcribe() call

  • Input shape (1, 80, 782456) too large → manual chunking into 30 s segments

📁 Final Notes

  • Long audio files (2–3 hrs) may take 30–60+ minutes depending on CPU speed

  • Recommended: run on GPU or chunk files into 1-hour batches (see the sketch after this list)

  • Supports .mp3, .wav, .mp4, .mkv, .m4a
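
A minimal batching sketch, assuming pydub and ffmpeg are available; split_into_batches() is a hypothetical helper, not part of transcribe.py:

python
import os
from pydub import AudioSegment

# Hypothetical helper: split a long recording into ~1-hour WAV batches
# that can then be transcribed one at a time.
def split_into_batches(path, out_dir, batch_ms=60 * 60 * 1000):
    audio = AudioSegment.from_file(path)  # durations and slices are in milliseconds
    batches = []
    for i, start in enumerate(range(0, len(audio), batch_ms)):
        out = os.path.join(out_dir, f"batch_{i:02d}.wav")
        audio[start:start + batch_ms].export(out, format="wav")
        batches.append(out)
    return batches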


💾 Files to Backup for Future Use

  • transcribe.py

  • Patched: whisperx/vad.py

  • Patched: whisperx/asr.py

  • pytorch_model.bin saved locally


🧩 Future Improvements

  • Optional: add GUI dropdown for model size (base/medium/large); see the sketch after this list

  • Optional: progress bar and chunk counters

  • Optional: automatic chunked transcription and merge
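
A minimal sketch of the model-size dropdown (a standalone example; in TranscriptionApp it would live in __init__, with whisperx.load_model reading model_var.get() instead of the hardcoded "base"):

python
import tkinter as tk
from tkinter import ttk

root = tk.Tk()
model_var = tk.StringVar(value="base")
tk.Label(root, text="Model Size:").pack(pady=5)
ttk.Combobox(root, textvariable=model_var,
             values=["base", "medium", "large"],
             state="readonly").pack(pady=5)
root.mainloop()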