import re
import subprocess
import sys
import time
from collections import Counter

import numpy as np
import whisper

from config import RTSP_URL

# Filler words (German) that the analysis counts in a transcription.
FILLER_WORDS = [
    "äh",
    "ähhm",
    "ähm",
    "mhm",
    "halt",
    "quasi",
    "sozusagen",
    "eigentlich",
]

def analyze_text(text, filler_words=None):
    """Count filler words in a transcribed text.

    The text is lower-cased and every punctuation character is replaced by a
    space before splitting into words, so fillers that are joined only by
    punctuation ("äh-ähm", "äh...ähm") are still counted individually.

    Args:
        text: The transcription to scan.
        filler_words: Optional iterable of lower-case filler words to look
            for. Defaults to the module-level ``FILLER_WORDS``.

    Returns:
        A tuple ``(total, per_word)``: the total number of filler occurrences
        and a dict mapping each filler that occurred to its count, in the
        order the fillers were given.
    """
    if filler_words is None:
        filler_words = FILLER_WORDS

    # Replace (rather than delete) punctuation: deleting it would glue
    # neighbouring words together whenever no whitespace separates them.
    words = re.sub(r'[^\w\s]', ' ', text.lower()).split()

    # One pass over the words instead of one list.count() per filler.
    counts = Counter(words)
    found_fillers = {w: counts[w] for w in filler_words if w in counts}
    return sum(found_fillers.values()), found_fillers

def _rms_amplitude(samples):
    """Return the root-mean-square level of an array of int16 PCM samples."""
    # Widen before squaring: int16 arithmetic wraps around for |x| > 181,
    # which corrupts the mean and can even make it negative (sqrt -> NaN).
    return float(np.sqrt(np.mean(samples.astype(np.float64) ** 2)))


def _stop_process(process):
    """Terminate a subprocess, kill it if it does not exit, and reap it."""
    if process.poll() is None:
        process.terminate()
        try:
            process.wait(timeout=2)
        except subprocess.TimeoutExpired:
            process.kill()
            process.wait()
    # Release the pipe's file descriptor; closing twice is harmless.
    if process.stdout is not None:
        process.stdout.close()


def run_adaptive_whisper():
    """Record speech from the RTSP stream until a pause, then analyse it.

    Loads the Whisper "base" model and starts FFmpeg to decode ``RTSP_URL``
    into 16 kHz mono signed 16-bit PCM on its stdout. The audio is read in
    3200-byte chunks and buffered. Recording stops when the chunk RMS level
    has stayed below ``THRESHOLD`` for more than ``SILENCE_DURATION`` seconds,
    or when FFmpeg closes the stream. The buffered audio is then transcribed
    as German and the text plus its filler-word statistics are printed.

    Exceptions raised while reading or transcribing are printed rather than
    propagated. The FFmpeg process is always stopped and reaped on exit.

    Returns:
        None. All results are written to stdout.
    """
    print("--- M2 Live-Coach: Analyse läuft ---")

    # Load the model before FFmpeg starts so no audio piles up meanwhile.
    print("Lade Modell (base)...")
    model = whisper.load_model("base")

    # FFmpeg command; RTSP over TCP for a more stable connection.
    command = [
        'ffmpeg',
        '-rtsp_transport', 'tcp',
        '-i', RTSP_URL,
        '-ar', '16000',
        '-ac', '1',
        '-f', 's16le',
        '-',
    ]

    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL)

    audio_buffer = []
    silence_start = None

    # Pause-detection settings.
    # NOTE(review): THRESHOLD was presumably tuned against the previous,
    # overflowing RMS computation - re-check the value on real audio.
    THRESHOLD = 350         # RMS level below which a chunk counts as silence
    SILENCE_DURATION = 2.5  # seconds of continuous silence that end recording
    # NOTE(review): the buffer also contains the silent chunks, so this
    # minimum is likely already met after 2.5 s of silence - confirm intent.
    MIN_AUDIO_LENGTH = 15   # minimum number of buffered chunks
    CHUNK_BYTES = 3200      # 1600 int16 samples = 0.1 s at 16 kHz

    print(f"\nVerbindung zu: {RTSP_URL}")
    print("[MISTY HÖRT ZU] Bitte sprich jetzt (2.5s Pause zum Beenden)...")

    try:
        while True:
            raw_chunk = process.stdout.read(CHUNK_BYTES)
            if not raw_chunk:
                break  # FFmpeg closed the stream

            # A short final read may end in the middle of a sample; drop the
            # dangling byte so np.frombuffer does not raise ValueError.
            usable = len(raw_chunk) - len(raw_chunk) % 2
            if usable == 0:
                continue
            chunk_np = np.frombuffer(raw_chunk[:usable], dtype=np.int16)
            audio_buffer.append(chunk_np)

            if _rms_amplitude(chunk_np) >= THRESHOLD:
                # Speech: show progress and restart the silence timer.
                sys.stdout.write(".")
                sys.stdout.flush()
                silence_start = None
                continue

            # Silence: use a monotonic clock so system clock adjustments
            # cannot shorten or stretch the measured pause.
            now = time.monotonic()
            if silence_start is None:
                silence_start = now
            elif (now - silence_start > SILENCE_DURATION
                    and len(audio_buffer) > MIN_AUDIO_LENGTH):
                print("\n[Pause erkannt - Analyse startet]")
                break

        # Stop FFmpeg before the (slow) transcription starts.
        _stop_process(process)

        if not audio_buffer:
            print("\n❌ Fehler: Keine Audiodaten empfangen.")
            return

        print("Verarbeite Audio...")
        # Whisper expects float32 samples scaled to [-1.0, 1.0).
        full_audio = np.concatenate(audio_buffer).astype(np.float32) / 32768.0

        # The initial prompt nudges Whisper towards keeping filler words.
        result = model.transcribe(full_audio, language="de", initial_prompt="Äh, ähm, mhm.")

        text = result['text'].strip()
        count, details = analyze_text(text)

        # --- Output ---
        print("\n" + "═"*45)
        print(f"ERKANNT: {text}")
        print("─"*45)
        print(f"ANALYSE: {count} Füllwörter gefunden.")
        if count > 0:
            for w, n in details.items():
                print(f" -> '{w}': {n}x")
        print("═"*45 + "\n")

    except Exception as e:
        print(f"\nFehler: {e}")
    finally:
        # Also covers early exits (errors, Ctrl+C); safe to call twice.
        _stop_process(process)

# Run one record-and-analyse pass when the file is executed as a script.
if __name__ == "__main__":
    run_adaptive_whisper()