# arioch-assistant/assistant/stt.py

"""
Transcription vocale temps réel via Voxtral Mini Transcribe Realtime.

Flux : microphone (PCM 16kHz) → WebSocket Voxtral → texte transcrit
"""
import asyncio
import sys
from typing import AsyncIterator

import numpy as np
import sounddevice as sd
from mistralai.client import Mistral
from mistralai.client.models import (
    AudioFormat,
    TranscriptionStreamDone,
    TranscriptionStreamTextDelta,
)

from . import config

# Model identifier passed to the realtime transcription endpoint.
STT_MODEL = "voxtral-mini-transcribe-realtime-2602"

# Microphone capture format: sample rate in Hz and number of channels.
SAMPLE_RATE = 16_000
CHANNELS = 1

# Frames per capture block: one tenth of a second (100 ms) of audio.
CHUNK_FRAMES = SAMPLE_RATE // 10


async def _mic_stream(stop_event: asyncio.Event) -> AsyncIterator[bytes]:
    """Capture the microphone and yield PCM chunks until ``stop_event`` is set.

    Audio is recorded as float32 at ``SAMPLE_RATE`` Hz in blocks of
    ``CHUNK_FRAMES`` frames. The first channel of every block is converted
    to signed 16-bit little-endian PCM before being yielded.

    Args:
        stop_event: Set by the caller to end the capture. It is re-checked
            at least every 0.2 s, so the generator stops shortly after the
            event is set.

    Yields:
        Raw PCM bytes, one captured block per chunk. Blocks that are still
        queued when the capture stops are yielded before the generator ends.
    """
    # The audio callback is a plain function, so it needs an explicit handle
    # on the loop running this generator in order to hand chunks over.
    loop = asyncio.get_running_loop()
    queue: asyncio.Queue[bytes] = asyncio.Queue()

    def callback(indata: np.ndarray, frames: int, time, status) -> None:
        # Report whatever status flags the audio backend passes along.
        if status:
            print(f"[Mic] {status}", file=sys.stderr)
        # Clamp to [-1.0, 1.0] first: an out-of-range float would otherwise
        # overflow when cast to int16. "<i2" forces little-endian byte order
        # regardless of the host CPU, matching the declared PCM format.
        samples = np.clip(indata[:, 0], -1.0, 1.0)
        pcm = (samples * 32767).astype("<i2").tobytes()
        # asyncio queues are not thread-safe; schedule the put on the loop
        # instead of touching the queue directly from the callback.
        loop.call_soon_threadsafe(queue.put_nowait, pcm)

    stream = sd.InputStream(
        samplerate=SAMPLE_RATE,
        channels=CHANNELS,
        dtype="float32",
        blocksize=CHUNK_FRAMES,
        callback=callback,
    )

    with stream:
        while not stop_event.is_set():
            try:
                chunk = await asyncio.wait_for(queue.get(), timeout=0.2)
            except asyncio.TimeoutError:
                # Nothing captured yet; loop around to re-check stop_event.
                continue
            yield chunk

    # Flush the blocks that were queued but not yet consumed.
    while not queue.empty():
        chunk = queue.get_nowait()
        if chunk:
            yield chunk


async def transcribe_from_mic() -> str:
    """Record from the microphone until Enter is pressed and return the text.

    Microphone audio is streamed to the realtime transcription endpoint
    while partial results are echoed to stdout as they arrive. Recording
    stops when the user presses Enter (or when stdin reaches end-of-file).

    Returns:
        The transcript carried by the final "done" event, stripped of
        surrounding whitespace. If the event stream ends without a "done"
        event, the concatenation of the partial results received so far is
        returned instead (an empty string when nothing was transcribed).
    """
    client = Mistral(api_key=config.MISTRAL_API_KEY)
    stop_event = asyncio.Event()
    loop = asyncio.get_running_loop()

    print("🎤 Parlez... (Entrée pour arrêter)")

    async def wait_for_enter() -> None:
        # input() blocks, so it runs in the default executor to keep the
        # event loop free for audio capture and the transcription stream.
        try:
            await loop.run_in_executor(None, input)
        except EOFError:
            # stdin is closed: Enter can never arrive, so treat it as "stop"
            # rather than recording with no way to end the capture.
            pass
        finally:
            stop_event.set()

    enter_task = asyncio.create_task(wait_for_enter())

    audio_fmt = AudioFormat(
        encoding="pcm_s16le",
        sample_rate=SAMPLE_RATE,
    )

    final_text: str | None = None
    partial_texts: list[str] = []

    try:
        async for event in client.audio.realtime.transcribe_stream(
            audio_stream=_mic_stream(stop_event),
            model=STT_MODEL,
            audio_format=audio_fmt,
            target_streaming_delay_ms=300,
        ):
            if isinstance(event, TranscriptionStreamTextDelta):
                # Live display of the partial text, kept as a fallback result.
                print(event.text, end="", flush=True)
                partial_texts.append(event.text)
            elif isinstance(event, TranscriptionStreamDone):
                final_text = event.text
                print()  # terminate the live-display line
                break
    finally:
        # Stop the microphone generator and the Enter watcher on every exit
        # path, including errors raised by the transcription stream.
        # NOTE: cancelling the task does not interrupt a worker thread that
        # is already blocked inside input(); it only stops waiting for it.
        stop_event.set()
        enter_task.cancel()
        try:
            await enter_task
        except asyncio.CancelledError:
            pass

    if final_text is None:
        # The stream ended without a "done" event: fall back to what was
        # already shown to the user instead of discarding it.
        if partial_texts:
            print()
        final_text = "".join(partial_texts)

    return final_text.strip()