108 lines
3.3 KiB
Text
108 lines
3.3 KiB
Text
import re
import subprocess
import sys
import time
import unicodedata
from collections import Counter

import numpy as np
import whisper

from config import RTSP_URL
|
|
|
|
# Filler words (German) that the analysis looks for in a transcript.
FILLER_WORDS = ["äh", "ähhm", "ähm", "mhm", "halt", "quasi", "sozusagen", "eigentlich"]

# Matches every character that is neither a word character nor whitespace.
_PUNCTUATION_RE = re.compile(r'[^\w\s]')


def analyze_text(text):
    """Count occurrences of the configured filler words in a transcript.

    The text is lower-cased, normalised to Unicode NFC, stripped of
    punctuation and split on whitespace. Only whole words are compared
    against ``FILLER_WORDS``, so a filler that merely appears inside a
    longer word is not counted.

    Args:
        text: The recognised text. ``None`` or an empty string is treated
            as a transcript without any words.

    Returns:
        A tuple ``(total, per_word)`` where ``per_word`` maps each filler
        word that occurred at least once to its number of occurrences and
        ``total`` is the sum of those counts.
    """
    if not text:
        return 0, {}

    # Compose characters first: in decomposed form an umlaut is a base letter
    # plus a combining mark, and the punctuation filter below would strip the
    # mark (turning "ähm" into "ahm") so the filler would never match.
    normalized = unicodedata.normalize("NFC", text.lower())

    # Drop punctuation, then split the remainder into words.
    words = _PUNCTUATION_RE.sub('', normalized).split()

    # Tally all words once instead of rescanning the list for every filler.
    tally = Counter(words)

    found_fillers = {}
    for filler in FILLER_WORDS:
        hits = tally[unicodedata.normalize("NFC", filler)]
        if hits:
            found_fillers[filler] = hits

    return sum(found_fillers.values()), found_fillers
|
|
|
|
def _rms_amplitude(samples):
    """Return the root-mean-square level of an array of int16 PCM samples."""
    # Widen before squaring: squaring int16 values wraps around for any
    # |sample| > 181, which would corrupt the level used for pause detection.
    widened = samples.astype(np.float64)
    return float(np.sqrt(np.mean(np.square(widened))))


def _record_until_pause(stream, threshold, silence_duration, min_chunks,
                        chunk_bytes=3200):
    """Read PCM chunks from *stream* until a sustained pause or end of stream.

    Args:
        stream: Binary file-like object delivering signed 16-bit
            little-endian samples.
        threshold: RMS level below which a chunk counts as silent.
        silence_duration: Seconds of uninterrupted silence that end the
            recording.
        min_chunks: The recording only ends on a pause once more than this
            many chunks have been collected.
        chunk_bytes: Number of bytes requested per read.

    Returns:
        A list of int16 NumPy arrays, one per chunk read (silent chunks
        included). The list is empty if the stream delivered no data.
    """
    chunks = []
    silence_start = None

    while True:
        raw_chunk = stream.read(chunk_bytes)
        if not raw_chunk:
            break

        # Each sample is two bytes; a short read at end of stream may end on
        # half a sample, which np.frombuffer would reject with ValueError.
        usable = len(raw_chunk) - (len(raw_chunk) % 2)
        chunk_np = np.frombuffer(raw_chunk[:usable], dtype=np.int16)
        if chunk_np.size == 0:
            continue

        chunks.append(chunk_np)

        if _rms_amplitude(chunk_np) < threshold:
            # Monotonic clock: the measured pause must not be affected by
            # wall-clock adjustments.
            now = time.monotonic()
            if silence_start is None:
                silence_start = now
            elif now - silence_start > silence_duration and len(chunks) > min_chunks:
                print("\n[Pause erkannt - Analyse startet]")
                break
        else:
            # Progress indicator: one dot per chunk that is above the threshold.
            sys.stdout.write(".")
            sys.stdout.flush()
            silence_start = None

    return chunks


def run_adaptive_whisper():
    """Record speech from the RTSP stream, transcribe it and report filler words.

    Loads the Whisper "base" model, starts ffmpeg to decode ``RTSP_URL`` into
    16 kHz mono signed 16-bit PCM on its stdout, and records until a pause of
    2.5 seconds is detected or the stream ends. The recording is transcribed
    as German, passed to ``analyze_text`` and the result is printed.

    Errors raised while recording or transcribing are printed rather than
    propagated. The ffmpeg process is always stopped and reaped on exit.

    Returns:
        None.
    """
    print("--- M2 Live-Coach: Analyse läuft ---")

    # Load the model.
    print("Lade Modell (base)...")
    model = whisper.load_model("base")

    # ffmpeg command; RTSP over TCP for a more stable connection.
    command = [
        'ffmpeg',
        '-rtsp_transport', 'tcp',
        '-i', RTSP_URL,
        '-ar', '16000',
        '-ac', '1',
        '-f', 's16le',
        '-',
    ]

    # Start the ffmpeg process; its diagnostics on stderr are discarded.
    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL)

    # Pause-detection settings.
    threshold = 350          # RMS level below which a chunk counts as silence
    silence_duration = 2.5   # seconds of silence that end the recording
    # Minimum number of chunks before a pause may end the recording. Each
    # 3200-byte read is 1600 samples, i.e. 0.1 s at 16 kHz mono.
    min_audio_length = 15

    print(f"\nVerbindung zu: {RTSP_URL}")
    print("[MISTY HÖRT ZU] Bitte sprich jetzt (2.5s Pause zum Beenden)...")

    try:
        audio_buffer = _record_until_pause(
            process.stdout, threshold, silence_duration, min_audio_length
        )

        process.terminate()

        if not audio_buffer:
            print("\n❌ Fehler: Keine Audiodaten empfangen.")
            return

        print("Verarbeite Audio...")
        # Scale int16 samples to float32 in [-1.0, 1.0).
        full_audio = np.concatenate(audio_buffer).astype(np.float32) / 32768.0

        # The initial prompt lists filler words, presumably to bias the model
        # towards transcribing them instead of dropping them.
        result = model.transcribe(full_audio, language="de", initial_prompt="Äh, ähm, mhm.")

        text = result['text'].strip()
        count, details = analyze_text(text)

        # --- Output ---
        print("\n" + "═"*45)
        print(f"ERKANNT: {text}")
        print("─"*45)
        print(f"ANALYSE: {count} Füllwörter gefunden.")
        if count > 0:
            for w, n in details.items():
                print(f"   -> '{w}': {n}x")
        print("═"*45 + "\n")

    except Exception as e:
        print(f"\nFehler: {e}")
    finally:
        # Make sure ffmpeg is gone, then release the pipe and reap the child
        # so neither a file descriptor nor a zombie process is left behind.
        if process.poll() is None:
            process.kill()
        if process.stdout is not None:
            process.stdout.close()
        process.wait()
|
|
|
|
# Run the live analysis only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    run_adaptive_whisper()
|