Misty-Rhetorik-Coach/old/test
2026-05-06 12:46:05 +00:00

108 lines
3.3 KiB
Text

import re
import subprocess
import sys
import time
from collections import Counter

import numpy as np
import whisper

from config import RTSP_URL
# Filler words (German) that are counted in a transcript.
FILLER_WORDS = ["äh", "ähhm", "ähm", "mhm", "halt", "quasi", "sozusagen", "eigentlich"]


def analyze_text(text):
    """Count the filler words that occur in a transcribed text.

    The text is lower-cased, punctuation is turned into whitespace and the
    result is split into words; every word equal to an entry of
    ``FILLER_WORDS`` is counted.

    Args:
        text: The recognised text to analyse.

    Returns:
        A tuple ``(total, details)`` where ``details`` maps each filler word
        that occurred at least once to its number of occurrences (in
        ``FILLER_WORDS`` order) and ``total`` is the sum of those counts.
    """
    # Replace punctuation with a space rather than deleting it: otherwise
    # tokens that are joined only by punctuation ("äh...ähm", "äh-ähm")
    # would fuse into one unknown word and the fillers would be missed.
    text_clean = re.sub(r'[^\w\s]', ' ', text.lower())
    # One pass over the words instead of one list.count() scan per filler.
    counts = Counter(text_clean.split())
    # Counter returns 0 for absent keys, so unseen fillers are skipped.
    found_fillers = {w: counts[w] for w in FILLER_WORDS if counts[w]}
    return sum(found_fillers.values()), found_fillers
def _chunk_rms(samples):
    """Return the root-mean-square level of an array of int16 PCM samples."""
    # Widen before squaring: int16 ** 2 stays int16 and wraps around for
    # |sample| > 181, which makes the mean meaningless (possibly negative,
    # so the square root becomes NaN and never compares below a threshold).
    return float(np.sqrt(np.mean(samples.astype(np.float64) ** 2)))


def _stop_ffmpeg(process):
    """Stop the FFmpeg child, close its stdout pipe and reap it (idempotent)."""
    if process.poll() is None:
        process.terminate()
    if process.stdout is not None and not process.stdout.closed:
        process.stdout.close()
    try:
        # Reap the child so it does not linger as a zombie.
        process.wait(timeout=2)
    except subprocess.TimeoutExpired:
        process.kill()
        process.wait()


def run_adaptive_whisper():
    """Record speech from the RTSP stream until a pause, then report fillers.

    Loads the Whisper ``base`` model, starts FFmpeg to decode ``RTSP_URL`` to
    raw 16 kHz mono signed 16-bit PCM on stdout, and buffers the audio in
    3200-byte chunks (1600 samples, i.e. 0.1 s at that format). Recording
    stops when the stream ends, or when the chunk RMS has stayed below the
    threshold for more than ``SILENCE_DURATION`` seconds and more than
    ``MIN_AUDIO_LENGTH`` chunks have been buffered. The buffered audio is then
    transcribed as German and the filler-word analysis is printed.

    Returns:
        None. All results and errors are printed to stdout; exceptions raised
        while recording or transcribing are caught and reported.
    """
    print("--- M2 Live-Coach: Analyse läuft ---")

    # Load the model
    print("Lade Modell (base)...")
    model = whisper.load_model("base")

    # FFmpeg command; RTSP over TCP for a more stable connection
    command = [
        'ffmpeg',
        '-rtsp_transport', 'tcp',
        '-i', RTSP_URL,
        '-ar', '16000',
        '-ac', '1',
        '-f', 's16le',
        '-'
    ]

    # Start the FFmpeg process
    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL)

    audio_buffer = []
    silence_start = None

    # Settings for the pause detection
    THRESHOLD = 350          # RMS level below which a chunk counts as silence
    SILENCE_DURATION = 2.5   # seconds of silence that end the recording
    MIN_AUDIO_LENGTH = 15    # minimum number of buffered chunks before stopping
    CHUNK_BYTES = 3200       # bytes requested from FFmpeg per read

    print(f"\nVerbindung zu: {RTSP_URL}")
    print("[MISTY HÖRT ZU] Bitte sprich jetzt (2.5s Pause zum Beenden)...")

    try:
        while True:
            raw_chunk = process.stdout.read(CHUNK_BYTES)
            if not raw_chunk:
                break

            # The last read before EOF may end in the middle of a sample;
            # read only whole int16 samples so frombuffer cannot raise.
            chunk_np = np.frombuffer(raw_chunk, dtype=np.int16, count=len(raw_chunk) // 2)
            if chunk_np.size == 0:
                continue

            audio_buffer.append(chunk_np)
            amplitude = _chunk_rms(chunk_np)

            if amplitude < THRESHOLD:
                # Monotonic clock: unaffected by system clock adjustments.
                if silence_start is None:
                    silence_start = time.monotonic()
                elif time.monotonic() - silence_start > SILENCE_DURATION:
                    if len(audio_buffer) > MIN_AUDIO_LENGTH:
                        print("\n[Pause erkannt - Analyse startet]")
                        break
            else:
                # One dot per loud chunk as a visual "hearing you" indicator.
                sys.stdout.write(".")
                sys.stdout.flush()
                silence_start = None

        # Recording is over; release FFmpeg before the slow transcription.
        _stop_ffmpeg(process)

        if not audio_buffer:
            print("\n❌ Fehler: Keine Audiodaten empfangen.")
            return

        print("Verarbeite Audio...")
        # Scale int16 samples to float32 in [-1.0, 1.0).
        full_audio = np.concatenate(audio_buffer).astype(np.float32) / 32768.0

        # Transcription; the prompt nudges the model to keep filler words
        result = model.transcribe(full_audio, language="de", initial_prompt="Äh, ähm, mhm.")
        text = result['text'].strip()

        count, details = analyze_text(text)

        # --- OUTPUT ---
        print("\n" + "═"*45)
        print(f"ERKANNT: {text}")
        print("─"*45)
        print(f"ANALYSE: {count} Füllwörter gefunden.")
        if count > 0:
            for w, n in details.items():
                print(f" -> '{w}': {n}x")
        print("═"*45 + "\n")

    except Exception as e:
        # Top-level boundary of the script: report instead of crashing.
        print(f"\nFehler: {e}")
    finally:
        # Also covers error paths and Ctrl+C; harmless if already stopped.
        _stop_ffmpeg(process)
# Script entry point: start the live coach only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    run_adaptive_whisper()