# Misty-Rhetorik-Coach/old/whisper_terminal_check.py

import re
import subprocess
import sys
import time
import unicodedata
from collections import Counter

import numpy as np
import whisper

from config import RTSP_URL

# German filler words that analyze_text looks for (matched as whole,
# lower-cased words after punctuation has been removed).
FILLER_WORDS = ["äh", "ähhm", "ähm", "mhm", "halt", "quasi", "sozusagen", "eigentlich"]


def analyze_text(text):
    """Count occurrences of the FILLER_WORDS in *text*.

    The text is lower-cased, Unicode-normalised to NFC, stripped of every
    character that is neither a word character nor whitespace, and split on
    whitespace. Only whole words are matched.

    Args:
        text: The transcript to analyse.

    Returns:
        A tuple ``(total, found)`` where ``found`` maps each filler word that
        occurs at least once to its count (in FILLER_WORDS order) and
        ``total`` is the sum of those counts.
    """
    # Normalise to NFC first: in decomposed input ("a" + U+0308) the combining
    # mark is not a word character, so the regex below would strip it and
    # turn "ähm" into "ahm", which never matches a filler word.
    normalized = unicodedata.normalize("NFC", text.lower())
    text_clean = re.sub(r'[^\w\s]', '', normalized)
    # One counting pass instead of a list scan per filler word.
    counts = Counter(text_clean.split())
    found_fillers = {w: counts[w] for w in FILLER_WORDS if counts[w]}
    return sum(found_fillers.values()), found_fillers

def _pcm_samples(raw_chunk):
    """Decode raw signed 16-bit PCM bytes into an int16 NumPy array.

    A trailing odd byte (possible on a short final read before EOF) is
    dropped, because np.frombuffer rejects buffers that do not hold a whole
    number of 2-byte samples.
    """
    usable = len(raw_chunk) - len(raw_chunk) % 2
    if usable == 0:
        return np.empty(0, dtype=np.int16)
    return np.frombuffer(raw_chunk[:usable], dtype=np.int16)


def _rms_amplitude(samples):
    """Return the root-mean-square level of a block of int16 samples."""
    if samples.size == 0:
        return 0.0
    # Square in float64: squaring int16 values directly wraps around for
    # |x| > 181, which corrupts the level and can even produce NaN.
    return float(np.sqrt(np.mean(samples.astype(np.float64) ** 2)))


def _stop_process(process):
    """Kill the child if it is still running, close its pipe and reap it."""
    if process.poll() is None:
        process.kill()
    if process.stdout is not None:
        process.stdout.close()
    process.wait()


def run_adaptive_whisper():
    """Record one utterance from the RTSP stream, transcribe it and report fillers.

    Loads the Whisper "base" model, starts ffmpeg to convert RTSP_URL into
    16 kHz mono signed 16-bit PCM on stdout, and collects audio until more
    than SILENCE_DURATION seconds of chunks below THRESHOLD (RMS) have
    passed or the stream ends. The collected audio is transcribed as German
    and the transcript plus the filler-word count from analyze_text are
    printed.

    Errors raised while reading or transcribing are printed, not re-raised.
    The ffmpeg process is always stopped and reaped before returning.
    """
    print("--- M2 Live-Coach: Analyse läuft ---")
    print("Lade KI-Modell...")
    model = whisper.load_model("base")

    # ffmpeg: force TCP transport, emit raw 16 kHz mono s16le PCM on stdout.
    command = [
        'ffmpeg',
        '-rtsp_transport', 'tcp',
        '-i', RTSP_URL,
        '-ar', '16000', '-ac', '1', '-f', 's16le', '-'
    ]

    print(f"Verbinde zu: {RTSP_URL}")
    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL)

    audio_buffer = []
    silence_start = None
    THRESHOLD = 300  # RMS level below which a chunk counts as silence
    SILENCE_DURATION = 2.5  # seconds of continuous silence that end the recording
    CHUNK_BYTES = 3200  # 1600 int16 samples = 0.1 s at 16 kHz mono

    print("[WARTE AUF STREAM...]")

    try:
        # Up to 100 attempts (~10 s) to get the first bytes of the stream.
        for _ in range(100):
            raw_chunk = process.stdout.read(CHUNK_BYTES)
            if raw_chunk:
                print("[MISTY HÖRT ZU] - Daten fließen!")
                audio_buffer.append(_pcm_samples(raw_chunk))
                break
            if process.poll() is not None:
                # ffmpeg has already exited, so no data will ever arrive.
                break
            time.sleep(0.1)

        if not audio_buffer:
            print("❌ Fehler: Misty sendet keine Daten auf Port 1936.")
            return

        while True:
            raw_chunk = process.stdout.read(CHUNK_BYTES)
            if not raw_chunk:
                break

            chunk_np = _pcm_samples(raw_chunk)
            audio_buffer.append(chunk_np)

            if _rms_amplitude(chunk_np) < THRESHOLD:
                # Monotonic clock: unaffected by system clock adjustments.
                now = time.monotonic()
                if silence_start is None:
                    silence_start = now
                elif now - silence_start > SILENCE_DURATION and len(audio_buffer) > 20:
                    # Enough silence and more than 20 chunks collected.
                    break
            else:
                # Progress dot for every chunk that is not silence.
                sys.stdout.write(".")
                sys.stdout.flush()
                silence_start = None

        process.terminate()
        # Scale int16 samples to float32 in [-1.0, 1.0) before transcribing.
        full_audio = np.concatenate(audio_buffer).astype(np.float32) / 32768.0
        # NOTE(review): the initial prompt presumably biases Whisper toward
        # keeping filler words in the transcript - confirm.
        result = model.transcribe(full_audio, language="de", initial_prompt="Äh, ähm, mhm.")

        text = result['text'].strip()
        count, _ = analyze_text(text)

        print("\n" + "═"*45)
        print(f"TEXT: {text}")
        print(f"FÜLLWÖRTER: {count}")
        print("═"*45)

    except Exception as e:
        print(f"\nFehler: {e}")
    finally:
        _stop_process(process)

# Start the live analysis only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    run_adaptive_whisper()