# Python Scripts

# Transcribe Audio with Multiple Speakers

<details id="bkmrk-code-%C2%A0"><summary>Code</summary>

- ```python
    import whisperx
    import os
    import tkinter as tk
    from tkinter import filedialog, ttk, scrolledtext
    from pydub import AudioSegment
    import logging
    import subprocess
    import sys
    import shutil
    import warnings
    import threading
    
    # Suppress noisy UserWarning messages (e.g., deprecation notices)
    warnings.filterwarnings("ignore", category=UserWarning)
    
    # Set up logging
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
    logger = logging.getLogger()
    
    # Force WhisperX to use local VAD model to avoid redirect error
    os.environ["WHISPERX_VAD_MODEL_PATH"] = r"D:\\PY\\models\\vad\\pytorch_model.bin"
    
    # Redirect logging to GUI log window
    class TextHandler(logging.Handler):
        def __init__(self, text_widget):
            super().__init__()
            self.text_widget = text_widget
    
        def emit(self, record):
            try:
                msg = self.format(record)
                if self.text_widget.winfo_exists():  # Check if widget still exists
                    self.text_widget.insert(tk.END, msg + '\n')
                    self.text_widget.see(tk.END)
                    self.text_widget.update()
            except tk.TclError:
                pass  # Ignore errors if GUI is closed
    
    # Supported audio/video formats
    SUPPORTED_FORMATS = ['.wav', '.m4a', '.mp3', '.mp4', '.mkv']
    
    # Function to convert audio to WAV if not already WAV
    
    def convert_to_wav(input_file, output_dir, temp_dir):
        file_ext = os.path.splitext(input_file)[1].lower()
        if file_ext == '.wav':
            logger.info(f"Input file {input_file} is already WAV. No conversion needed.")
            return input_file
    
        output_file = os.path.join(temp_dir, os.path.splitext(os.path.basename(input_file))[0] + '.wav')
        try:
            format_param = 'matroska' if file_ext == '.mkv' else file_ext[1:]
            audio = AudioSegment.from_file(input_file, format=format_param)
            audio.export(output_file, format='wav')
            logger.info(f"Converted {input_file} to {output_file}")
            return output_file
        except Exception as e:
            logger.error(f"Error converting {input_file} to WAV: {str(e)}")
            try:
                result = subprocess.run(
                    ['ffmpeg', '-i', input_file],
                    capture_output=True, text=True, check=False
                )
                logger.error(f"FFmpeg output: {result.stderr}")
            except Exception as ffmpeg_e:
                logger.error(f"Could not run FFmpeg to diagnose file: {str(ffmpeg_e)}")
            raise
    
    # Main transcription function
    
    def transcribe_audio(input_file, output_dir, temp_dir):
        wav_file = None
        try:
            # Convert to WAV if necessary
            wav_file = convert_to_wav(input_file, output_dir, temp_dir)
    
            # Load the model
            logger.info("Loading WhisperX model...")
            asr_options = {
                "max_new_tokens": 448,
                "clip_timestamps": False,
                "hallucination_silence_threshold": 0.6
            }
            model = whisperx.load_model("base", device="cpu", compute_type="float32", asr_options=asr_options)
    
            # Transcribe
            logger.info("Transcribing audio...")
            result = model.transcribe(
                wav_file,
                batch_size=16,
                language=None,
            )
    
            # Align timestamps
            logger.info("Aligning timestamps...")
            model_a, metadata = whisperx.load_align_model(language_code=result["language"], device="cpu")
            result = whisperx.align(result["segments"], model_a, metadata, wav_file, device="cpu")
    
            # Diarization
            logger.info("Performing diarization...")
            hf_token = "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"  # Replace with your token
            try:
                diarize_model = whisperx.diarize.DiarizationPipeline(use_auth_token=hf_token, device="cpu")
                diarize_segments = diarize_model(wav_file)
                result = whisperx.assign_word_speakers(diarize_segments, result)
            except AttributeError:
                logger.warning("DiarizationPipeline not available in this whisperx version. Skipping diarization.")
                for segment in result["segments"]:
                    segment["speaker"] = "Unknown"
    
            # Save output
            output_file = os.path.join(output_dir, "transcription_with_speakers.txt")
            with open(output_file, "w") as f:
                for segment in result["segments"]:
                    start = segment["start"]
                    end = segment["end"]
                    text = segment["text"]
                    speaker = segment.get("speaker", "Unknown")
                    f.write(f"[{start:.2f}s - {end:.2f}s] Speaker {speaker}: {text}\n")
    
            logger.info(f"Transcription complete. Output saved to {output_file}")
    
        except Exception as e:
            logger.error(f"Error during transcription: {str(e)}")
            raise
        finally:
            if wav_file and wav_file != input_file and os.path.exists(wav_file):
                try:
                    os.remove(wav_file)
                    logger.info(f"Removed temporary WAV file: {wav_file}")
                except Exception as e:
                    logger.warning(f"Could not remove temporary WAV file {wav_file}: {str(e)}")
    
    # GUI Application
    
    class TranscriptionApp:
        def __init__(self, root):
            self.root = root
            self.root.title("Audio Transcription")
            self.root.geometry("600x600")
    
            tk.Label(root, text="Input Audio File:").pack(pady=5)
            self.input_entry = tk.Entry(root, width=50)
            self.input_entry.pack(pady=5)
            tk.Button(root, text="Browse", command=self.browse_input).pack(pady=5)
    
            tk.Label(root, text="Output Directory:").pack(pady=5)
            self.output_entry = tk.Entry(root, width=50)
            self.output_entry.pack(pady=5)
            tk.Button(root, text="Browse", command=self.browse_output).pack(pady=5)
    
            tk.Label(root, text="Temporary Directory (for WAV files):").pack(pady=5)
            self.temp_entry = tk.Entry(root, width=50)
            self.temp_entry.insert(0, "D:\\PY\\temp")
            self.temp_entry.pack(pady=5)
            tk.Button(root, text="Browse", command=self.browse_temp).pack(pady=5)
    
            tk.Button(root, text="Transcribe", command=self.start_transcription).pack(pady=10)
    
            tk.Label(root, text="Log:").pack(pady=5)
            self.log_text = scrolledtext.ScrolledText(root, height=10, width=60, wrap=tk.WORD)
            self.log_text.pack(pady=5)
    
            text_handler = TextHandler(self.log_text)
            text_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
            logger.addHandler(text_handler)
    
        def browse_input(self):
            file_path = filedialog.askopenfilename(filetypes=[("Audio/Video Files", "*.wav *.m4a *.mp3 *.mp4 *.mkv")])
            if file_path:
                self.input_entry.delete(0, tk.END)
                self.input_entry.insert(0, file_path)
    
        def browse_output(self):
            dir_path = filedialog.askdirectory()
            if dir_path:
                self.output_entry.delete(0, tk.END)
                self.output_entry.insert(0, dir_path)
    
        def browse_temp(self):
            dir_path = filedialog.askdirectory()
            if dir_path:
                self.temp_entry.delete(0, tk.END)
                self.temp_entry.insert(0, dir_path)
    
        def start_transcription(self):
            input_file = self.input_entry.get()
            output_dir = self.output_entry.get()
            temp_dir = self.temp_entry.get()
    
            if not input_file or not output_dir or not temp_dir:
                logger.error("Please select input file, output directory, and temporary directory.")
                return
    
            if not os.path.exists(input_file):
                logger.error(f"Input file {input_file} does not exist.")
                return
    
            if os.path.splitext(input_file)[1].lower() not in SUPPORTED_FORMATS:
                logger.error(f"Unsupported file format. Supported formats: {', '.join(SUPPORTED_FORMATS)}")
                return
    
            if not os.path.exists(output_dir):
                try:
                    os.makedirs(output_dir)
                    logger.info(f"Created output directory: {output_dir}")
                except Exception as e:
                    logger.error(f"Could not create output directory {output_dir}: {str(e)}")
                    return
    
            if not os.path.exists(temp_dir):
                try:
                    os.makedirs(temp_dir)
                    logger.info(f"Created temporary directory: {temp_dir}")
                except Exception as e:
                    logger.error(f"Could not create temporary directory {temp_dir}: {str(e)}")
                    return
    
            threading.Thread(target=transcribe_audio, args=(input_file, output_dir, temp_dir), daemon=True).start()
    
    # Main execution
    if __name__ == "__main__":
        try:
            subprocess.run(['ffmpeg', '-version'], capture_output=True, check=True)
        except (subprocess.CalledProcessError, FileNotFoundError):
            logger.error("FFmpeg is not installed or not found. Please install FFmpeg to proceed.")
            sys.exit(1)
    
        default_temp_dir = "D:\\PY\\temp"
        if not os.path.exists(default_temp_dir):
            try:
                os.makedirs(default_temp_dir)
            except Exception as e:
                logger.error(f"Could not create default temporary directory {default_temp_dir}: {str(e)}")
                sys.exit(1)
    
        root = tk.Tk()
        app = TranscriptionApp(root)
        root.mainloop()
    
        try:
            if os.path.exists(default_temp_dir):
                shutil.rmtree(default_temp_dir)
                logger.info(f"Cleaned up default temporary directory: {default_temp_dir}")
        except Exception as e:
            logger.warning(f"Could not clean up default temporary directory {default_temp_dir}: {str(e)}")
    
    ```

</details>

## 🧠 WhisperX Offline Transcription Setup with GUI

### 📝 Summary

This guide details how to set up and patch WhisperX to transcribe long audio files (MP3, MP4, etc.) **offline** using a GUI-based Python app, bypassing VAD model downloads and network dependencies.

---

### 📦 Project Overview

- **Platform**: Python 3.12 with WhisperX + SpeechBrain
- **Goal**: Offline GUI app for audio transcription with speaker diarization
- **Input**: Audio/Video file
- **Output**: Timestamped transcript with speaker labels

---

### ✅ Key Features

- No network requirement for VAD
- GUI with file selection and logging
- Long file support (tested on 3hr+ MP3)
- Speaker diarization using `speechbrain`
- Chunk-based transcription (VAD manually bypassed)

---

### 🛠️ Setup Instructions

#### 1. 🐍 Python Environment

<div class="contain-inline-size rounded-md border-[0.5px] border-token-border-medium relative bg-token-sidebar-surface-primary" id="bkmrk-bash-copyedit-python"><div class="flex items-center text-token-text-secondary px-4 py-2 text-xs font-sans justify-between h-9 bg-token-sidebar-surface-primary dark:bg-token-main-surface-secondary select-none rounded-t-[5px]">bash</div><div class="sticky top-9"><div class="absolute end-0 bottom-0 flex h-9 items-center pe-2"><div class="bg-token-sidebar-surface-primary text-token-text-secondary dark:bg-token-main-surface-secondary flex items-center rounded-sm px-2 font-sans text-xs"><span></span></div></div></div><div class="overflow-y-auto p-4" dir="ltr">`python -m venv .venv.venv\Scripts\activatepip install whisperx torchaudio pydub tkinter speechbrain`</div></div>#### 2. 📁 Folder Structure

<div class="contain-inline-size rounded-md border-[0.5px] border-token-border-medium relative bg-token-sidebar-surface-primary" id="bkmrk-bash-copyedit-%2Ftrans"><div class="flex items-center text-token-text-secondary px-4 py-2 text-xs font-sans justify-between h-9 bg-token-sidebar-surface-primary dark:bg-token-main-surface-secondary select-none rounded-t-[5px]">bash</div><div class="sticky top-9"><div class="absolute end-0 bottom-0 flex h-9 items-center pe-2"><div class="bg-token-sidebar-surface-primary text-token-text-secondary dark:bg-token-main-surface-secondary flex items-center rounded-sm px-2 font-sans text-xs"><span></span></div></div></div><div class="overflow-y-auto p-4" dir="ltr">`/transcriber/├── transcribe.py            <span class="hljs-comment"># GUI application</span>├── models/vad/pytorch_model.bin  <span class="hljs-comment"># Downloaded manually</span>├── .venv/...`</div></div>#### 3. 🔧 Environment Variable (Set in `transcribe.py`)

<div class="contain-inline-size rounded-md border-[0.5px] border-token-border-medium relative bg-token-sidebar-surface-primary" id="bkmrk-python-copyedit-os.e"><div class="flex items-center text-token-text-secondary px-4 py-2 text-xs font-sans justify-between h-9 bg-token-sidebar-surface-primary dark:bg-token-main-surface-secondary select-none rounded-t-[5px]">python</div><div class="sticky top-9"><div class="absolute end-0 bottom-0 flex h-9 items-center pe-2"><div class="bg-token-sidebar-surface-primary text-token-text-secondary dark:bg-token-main-surface-secondary flex items-center rounded-sm px-2 font-sans text-xs"><span></span></div></div></div><div class="overflow-y-auto p-4" dir="ltr">`os.environ[<span class="hljs-string">"WHISPERX_VAD_MODEL_PATH"</span>] = <span class="hljs-string">r"D:\\PY\\models\\vad\\pytorch_model.bin"</span>`</div></div>#### 4. 🎯 GUI Usage

#### 4. 🎯 GUI Usage

Run:

<div class="contain-inline-size rounded-md border-[0.5px] border-token-border-medium relative bg-token-sidebar-surface-primary" id="bkmrk-bash-copyedit-python-1"><div class="flex items-center text-token-text-secondary px-4 py-2 text-xs font-sans justify-between h-9 bg-token-sidebar-surface-primary dark:bg-token-main-surface-secondary select-none rounded-t-[5px]">bash</div><div class="sticky top-9"><div class="absolute end-0 bottom-0 flex h-9 items-center pe-2"><div class="bg-token-sidebar-surface-primary text-token-text-secondary dark:bg-token-main-surface-secondary flex items-center rounded-sm px-2 font-sans text-xs"><span></span></div></div></div><div class="overflow-y-auto p-4" dir="ltr">`python transcribe.py`</div></div>Then:

- Select audio file
- Choose output and temp folders
- Click **Transcribe**
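
Each line of the resulting `transcription_with_speakers.txt` follows the format written by `transcribe_audio()`. An illustrative example (timestamps, speaker IDs, and text are made up):

```text
[0.00s - 4.52s] Speaker SPEAKER_00: Welcome everyone, let's get started.
[4.52s - 9.10s] Speaker SPEAKER_01: Thanks. First item on the agenda...
```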

---

### 🔧 WhisperX Modifications

#### ✅ `vad.py` Patch

- Replaced Hugging Face model download with local load
- Stubbed `merge_chunks()` for compatibility

<div class="contain-inline-size rounded-md border-[0.5px] border-token-border-medium relative bg-token-sidebar-surface-primary" id="bkmrk-python-copyedit-def-"><div class="flex items-center text-token-text-secondary px-4 py-2 text-xs font-sans justify-between h-9 bg-token-sidebar-surface-primary dark:bg-token-main-surface-secondary select-none rounded-t-[5px]">python</div><div class="sticky top-9"><div class="absolute end-0 bottom-0 flex h-9 items-center pe-2"><div class="bg-token-sidebar-surface-primary text-token-text-secondary dark:bg-token-main-surface-secondary flex items-center rounded-sm px-2 font-sans text-xs"><span></span></div></div></div><div class="overflow-y-auto p-4" dir="ltr">`<span class="hljs-keyword">def</span> <span class="hljs-title function_">load_vad_model</span>(<span class="hljs-params">...</span>):    model_fp = os.environ.get(<span class="hljs-string">"WHISPERX_VAD_MODEL_PATH"</span>)    <span class="hljs-keyword">if</span> <span class="hljs-keyword">not</span> model_fp <span class="hljs-keyword">or</span> <span class="hljs-keyword">not</span> os.path.exists(model_fp):        <span class="hljs-keyword">raise</span> FileNotFoundError(<span class="hljs-string">"Local VAD model path invalid."</span>)    <span class="hljs-built_in">print</span>(<span class="hljs-string">f"Using local VAD model at: <span class="hljs-subst">{model_fp}</span></span>")    bundle = torchaudio.pipelines.HUBERT_BASE    <span class="hljs-keyword">return</span> bundle.get_model().to(device).<span class="hljs-built_in">eval</span>()<span class="hljs-keyword">def</span> <span class="hljs-title function_">merge_chunks</span>(<span class="hljs-params">chunks, *args, **kwargs</span>):    <span class="hljs-keyword">return</span> chunks`</div></div>#### ✅ `asr.py` Patch

- Skipped internal VAD model logic
- Injected manual chunking (30s per segment)

**Modified `transcribe()` inside `FasterWhisperPipeline`:**

<div class="contain-inline-size rounded-md border-[0.5px] border-token-border-medium relative bg-token-sidebar-surface-primary" id="bkmrk-python-copyedit-dura"><div class="flex items-center text-token-text-secondary px-4 py-2 text-xs font-sans justify-between h-9 bg-token-sidebar-surface-primary dark:bg-token-main-surface-secondary select-none rounded-t-[5px]">python</div><div class="sticky top-9"><div class="absolute end-0 bottom-0 flex h-9 items-center pe-2"><div class="bg-token-sidebar-surface-primary text-token-text-secondary dark:bg-token-main-surface-secondary flex items-center rounded-sm px-2 font-sans text-xs"><span></span></div></div></div><div class="overflow-y-auto p-4" dir="ltr">`duration = audio.shape[<span class="hljs-number">0</span>] / SAMPLE_RATEchunk_duration = <span class="hljs-number">30.0</span>vad_segments = []start = <span class="hljs-number">0.0</span><span class="hljs-keyword">while</span> start < duration:    end = <span class="hljs-built_in">min</span>(start + chunk_duration, duration)    vad_segments.append({<span class="hljs-string">"start"</span>: start, <span class="hljs-string">"end"</span>: end})    start = end`</div></div>---

### 🐛 Issues Resolved

<div class="_tableContainer_16hzy_1" id="bkmrk-issue-resolution-tra"><div class="_tableWrapper_16hzy_14 group flex w-fit flex-col-reverse" tabindex="-1"><table class="w-fit min-w-(--thread-content-width)"><thead><tr><th>Issue</th><th>Resolution</th></tr></thead><tbody><tr><td>`TranscriptionOptions.__new__()` missing args</td><td>Manually passed `asr_options` with required fields</td></tr><tr><td>HTTP 301 for VAD model</td><td>Replaced remote load with offline `.bin` path</td></tr><tr><td>`'dict' has no attribute 'ndim'`</td><td>Dummy VAD model returned incompatible type → fully bypassed</td></tr><tr><td>`vad_segments` unexpected argument</td><td>Removed invalid param from `transcribe()` call</td></tr><tr><td>`input shape (1, 80, 782456)` too large</td><td>Manual chunking into 30s segments</td></tr></tbody></table>

<div class="sticky end-(--thread-content-margin) h-0 self-end select-none"><div class="absolute end-0 flex items-end"><span></span></div></div></div></div>---

### 📁 Final Notes

- Long audio files (2–3 hrs) may take 30–60+ minutes depending on CPU speed
- Recommended: run on GPU or chunk files into 1-hour batches (see the sketch after this list)
- Supports `.mp3`, `.wav`, `.mp4`, `.mkv`, `.m4a`
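
A minimal sketch of the 1-hour batching idea using `pydub`, which is already a dependency of this project; the example paths in the comment are hypothetical:

```python
from pydub import AudioSegment
import os

def split_into_hour_batches(input_file, out_dir, batch_ms=60 * 60 * 1000):
    """Split a long recording into ~1-hour WAV batches for separate runs."""
    audio = AudioSegment.from_file(input_file)
    os.makedirs(out_dir, exist_ok=True)
    base = os.path.splitext(os.path.basename(input_file))[0]
    paths = []
    for i, start in enumerate(range(0, len(audio), batch_ms)):
        chunk = audio[start:start + batch_ms]  # pydub slices by milliseconds
        path = os.path.join(out_dir, f"{base}_part{i + 1:02d}.wav")
        chunk.export(path, format="wav")
        paths.append(path)
    return paths

# Example (hypothetical paths):
# split_into_hour_batches(r"D:\PY\long_meeting.mp3", r"D:\PY\batches")
```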

---

### 💾 Files to Backup for Future Use

- `transcribe.py`
- Patched: `whisperx/vad.py`
- Patched: `whisperx/asr.py`
- `pytorch_model.bin` saved locally

---

### 🧩 Future Improvements

- Optional: add GUI dropdown for model size (base/medium/large); a sketch follows this list
- Optional: progress bar and chunk counters
- Optional: automatic chunked transcription and merge
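
For the first item, a standalone sketch of how a model-size dropdown could look using `ttk.Combobox` (already imported by `transcribe.py`); the widget and variable names here are hypothetical:

```python
import tkinter as tk
from tkinter import ttk

# Standalone demo of the proposed model-size dropdown (names are hypothetical)
root = tk.Tk()
root.title("Model size selector demo")

tk.Label(root, text="Model Size:").pack(pady=5)
model_var = tk.StringVar(value="base")
ttk.Combobox(root, textvariable=model_var,
             values=("base", "medium", "large"), state="readonly").pack(pady=5)

# In the real app, start_transcription would read model_var.get() and pass it
# to whisperx.load_model(...) in place of the hard-coded "base".
root.mainloop()
```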