90 lines
2.8 KiB
Python
90 lines
2.8 KiB
Python
import re
import subprocess
import sys
import time
from collections import Counter

import numpy as np
import whisper

from config import RTSP_URL
|
|
|
|
# German filler words counted by the transcript analysis (lower-case, one
# token each, because the analysis compares whole lower-cased tokens).
FILLER_WORDS = [
    "äh",
    "ähhm",
    "ähm",
    "mhm",
    "halt",
    "quasi",
    "sozusagen",
    "eigentlich",
]
|
|
|
|
def analyze_text(text, filler_words=None):
    """Count filler words in a transcript.

    The text is lower-cased, every character that is neither a word
    character nor whitespace is removed, and the remainder is split on
    whitespace. Each resulting token is compared for exact equality against
    the filler list.

    Args:
        text: Transcript to analyse.
        filler_words: Optional iterable of lower-case filler words to look
            for. Defaults to the module-level ``FILLER_WORDS``.

    Returns:
        A ``(total, per_word)`` tuple. ``per_word`` maps every filler word
        that occurs at least once to its number of occurrences, in the order
        of the filler list; ``total`` is the sum of those counts.
    """
    if filler_words is None:
        filler_words = FILLER_WORDS

    # Punctuation is deleted rather than replaced by a space: "äh," -> "äh".
    normalized = re.sub(r'[^\w\s]', '', text.lower())
    # One counting pass over the tokens instead of a list.count() scan per
    # filler word.
    occurrences = Counter(normalized.split())

    found_fillers = {
        word: occurrences[word] for word in filler_words if word in occurrences
    }
    return sum(found_fillers.values()), found_fillers
|
|
|
|
def _pcm_to_samples(raw_chunk):
    """Decode raw s16le bytes into int16 samples, ignoring a trailing odd byte."""
    sample_count = len(raw_chunk) // 2
    if sample_count == 0:
        return np.empty(0, dtype="<i2")
    # `count` lets a short final read with an odd byte count decode without
    # raising "buffer size must be a multiple of element size".
    return np.frombuffer(raw_chunk, dtype="<i2", count=sample_count)


def _rms_amplitude(samples):
    """Return the root-mean-square level of a block of int16 samples."""
    if samples.size == 0:
        return 0.0
    # Square in float64: squaring int16 values wraps around for |x| > 181 and
    # would yield a wrong (possibly NaN) level for loud audio.
    as_float = samples.astype(np.float64)
    return float(np.sqrt(np.mean(as_float * as_float)))


def _stop_process(process):
    """Kill the child if it is still running, close its pipe and reap it."""
    if process is None:
        return
    if process.poll() is None:
        process.kill()
    if process.stdout is not None:
        process.stdout.close()
    # Without wait() the exited child would linger as a zombie.
    process.wait()


def run_adaptive_whisper():
    """Record one utterance from the RTSP stream, transcribe it and report fillers.

    Steps:
      1. Load the Whisper "base" model.
      2. Start ffmpeg to pull ``RTSP_URL`` over TCP and emit 16 kHz mono
         signed 16-bit little-endian PCM on stdout.
      3. Read the PCM in 3200-byte chunks (0.1 s each) until the chunk RMS
         has stayed below the threshold for more than 2.5 s and more than 20
         chunks have been collected, or until the stream ends.
      4. Transcribe the collected audio as German and print the text
         together with the number of filler words from ``analyze_text``.

    Returns:
        None. All results and errors are printed to stdout; exceptions
        raised while streaming or transcribing are caught and reported.
    """
    chunk_bytes = 3200        # 1600 int16 samples = 0.1 s at 16 kHz mono
    threshold = 300           # RMS level below which a chunk counts as silence
    silence_duration = 2.5    # seconds of continuous silence that end a recording
    min_chunks = 20           # never stop before more than this many chunks exist
    startup_attempts = 100    # empty reads tolerated while waiting for the stream

    print("--- M2 Live-Coach: Analyse läuft ---")
    print("Lade KI-Modell...")
    model = whisper.load_model("base")

    # Force RTSP over TCP and decode to raw 16 kHz mono s16le PCM on stdout.
    command = [
        'ffmpeg',
        '-rtsp_transport', 'tcp',
        '-i', RTSP_URL,
        '-ar', '16000', '-ac', '1', '-f', 's16le', '-',
    ]

    print(f"Verbinde zu: {RTSP_URL}")

    process = None
    audio_buffer = []
    silence_start = None

    try:
        # Started inside the try block so that a launch failure (for example
        # ffmpeg not installed) is reported like every other error.
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL
        )

        print("[WARTE AUF STREAM...]")

        # Wait for the first audio bytes. An empty read means ffmpeg closed
        # its stdout, so retrying is pointless once the process has exited.
        for _ in range(startup_attempts):
            raw_chunk = process.stdout.read(chunk_bytes)
            if raw_chunk:
                print("[MISTY HÖRT ZU] - Daten fließen!")
                audio_buffer.append(_pcm_to_samples(raw_chunk))
                break
            if process.poll() is not None:
                break
            time.sleep(0.1)

        if not audio_buffer:
            print("❌ Fehler: Misty sendet keine Daten auf Port 1936.")
            return

        while True:
            raw_chunk = process.stdout.read(chunk_bytes)
            if not raw_chunk:
                break  # stream ended

            samples = _pcm_to_samples(raw_chunk)
            audio_buffer.append(samples)

            if _rms_amplitude(samples) < threshold:
                # Monotonic clock: the interval must not be affected by
                # wall-clock adjustments.
                now = time.monotonic()
                if silence_start is None:
                    silence_start = now
                elif (
                    now - silence_start > silence_duration
                    and len(audio_buffer) > min_chunks
                ):
                    break
            else:
                # Speech detected: show progress and restart the silence timer.
                sys.stdout.write(".")
                sys.stdout.flush()
                silence_start = None

        # Stop pulling the stream before the (slow) transcription starts.
        process.terminate()

        # Scale int16 PCM to float32 in [-1.0, 1.0) before handing it to Whisper.
        full_audio = np.concatenate(audio_buffer).astype(np.float32) / 32768.0
        # The prompt consists of filler words, presumably to nudge the model
        # into transcribing them instead of dropping them.
        result = model.transcribe(
            full_audio, language="de", initial_prompt="Äh, ähm, mhm."
        )

        text = result['text'].strip()
        count, _ = analyze_text(text)

        print("\n" + "═"*45)
        print(f"TEXT: {text}")
        print(f"FÜLLWÖRTER: {count}")
        print("═"*45)

    except Exception as e:
        # Top-level boundary of the script: report the error and fall through
        # to the cleanup below.
        print(f"\nFehler: {e}")
    finally:
        _stop_process(process)
|
|
|
|
# Script entry point: run a single listen / transcribe / report cycle when the
# file is executed directly (nothing happens on import).
if __name__ == "__main__":
    run_adaptive_whisper()
|