model.py

from abc import ABC, abstractmethod
from typing import ContextManager, Iterator, List, Optional

from .message import SttMessage, LlmMessage


class AudioSource(ContextManager[None], Iterator[bytes], ABC):
    """
    Audio sources provide the input for speech recognition as interface for interaction, typically a microphone.

    Must be configured by ``source`` (``alsa``, ``pulse``, ``pyaudio``, ``wave``) and the corresponding per-class
    objects ``alsa_source``, ``pulse_source``, ``pyaudio_source``, or ``wave_source``.
    """
    pass


class AudioSink(ContextManager[None], ABC):
    """
    Sinks receive the text-to-speech audio data as generated by the :class:`.Synthesizer`, typically speaker or
    headphone audio devices.

    Must be configured by ``sink`` (``alsa``, ``pulse``, ``pyaudio``, ``wave``), and the per-class ``alsa_sink``,
    ``pulse_sink``, ``pyaudio_sink``, or ``wave_sink`` objects, respectively.
    """

    @abstractmethod
    def play(self, buffer: bytes) -> None:
        """Possibly blocking playback of the given PCM, depending on internal buffering."""
        raise NotImplementedError

    def play_all(self, buffers: Iterator[bytes]) -> None:
        """Convenience helper to sequentially consume all generated playback buffers."""
        for buffer in buffers:
            self.play(buffer)

    def drain(self) -> None:
        """Flush buffers and wait until playback stops, in especially in order to start :class:`.AudioSource` again."""
        pass


class SpeechSegmenter(ContextManager[None], ABC):
    """
    Not all voice recognition implementations provide support for a fully streamed operation, i.e., are able to
    continuously receive audio frames, detect silence or activity, and transcribe speech on-the-fly.
    Thus, this explicit and exchangeable pre-processing step monitors input audio and yields buffers that contain a
    whole utterance, as separated by short breaks of silence.

    As this particular aspect of the pipeline largely depends on environmental conditions, choosing an implementation
    and its config might need some trial-and-error approach.

    Configured by ``speech_segmenter`` (``simple`` as default, ``median``, ``band``, ``sphinx``) and the corresponding
    per-class objects ``simple_speech_segmenter``, ``median_speech_segmenter``, ``band_speech_segmenter``, or
    ``sphinx_speech_segmenter``.

    The ``speech_buffer_limit`` (30.0) and ``speech_pause_limit`` (30.0) configuration values limit the allowed
    utterance and silence durations. This should prevent excessive buffering in case of mis-detected spurious speech
    activity detection or 'start' keyword.
    """

    @abstractmethod
    def push(self, buffer: bytes) -> Iterator[bytes]:
        """Append recorded :class:`.AudioSource` PCM, possibly flushing the whole utterance for :class:`.Recognizer`."""
        raise NotImplementedError


class Recognizer(ABC):
    """Actual instantiation, as created by the :class:`RecognizerModel`."""

    @abstractmethod
    def sample_rate(self) -> int:
        """Accepted sampling rate, which the :class:`.AudioSource` must provide."""
        raise NotImplementedError

    @abstractmethod
    def accept(self, buffer: bytes) -> Iterator[SttMessage]:
        """Speech recognition, accepting PCM buffers from :class:`.SpeechSegmenter`."""
        raise NotImplementedError

    def reset(self) -> None:
        """Reset internal state to start listening from scratch."""
        pass


class RecognizerModel(ContextManager[Recognizer], ABC):
    """
    Voice recognizers transcribe speech from audio buffers.
    Apart from generic prompts, utterances that only consist of a single keyword are detected.
    By the ``keywords`` object, alternate hotwords for ``start``, ``reset``, ``commit``, or ``stop`` can be configured.

    Must be configured by ``recognizer`` (``sphinx``, ``vosk``, ``whisper``) and the corresponding per-class
    ``sphinx_recognizer``, ``vosk_recognizer``, or ``whisper_recognizer`` objects.
    """
    pass


class Processor(ContextManager, ABC):
    """Actual instantiation, as created by the :class:`ProcessorModel`."""

    @abstractmethod
    def generate(self, utterance: List[str]) -> Iterator[str]:
        """Run LLM session on the given prompt/s, yielding tokens to be piped to a :class:`.SentenceSegmenter`."""
        raise NotImplementedError


class ProcessorModel(ContextManager[Processor], ABC):
    """
    Processors are the core functionality, formed by LLMs, which receive transcribed prompts and yield tokens to be
    synthesized to speech output.

    Must be configured by ``processor`` (``noop``, ``ollama``, ``gpt4all``) and the corresponding ``ollama_processor``
    or ``gpt4all_processor`` objects, respectively.
    """
    pass


class SentenceSegmenter(ABC):
    """
    Not all synthesizers support a streaming operation, i.e., are able to continuously receive text/token input while
    yielding internally buffered chunks of audio.
    In its simplest form, sentence segmenters thus combine and flush tokens until certain boundaries are found, for
    example full stop periods.
    By this means, playback can start as soon as the first sentence is available, while further tokens and synthesized
    output is still generated in the background.

    Configured by ``sentence_segmenter`` (``split`` as default, ``sbd``) and the per-class config objects
    ``split_sentence_segmenter``, or ``sbd_sentence_segmenter``, respectively.
    """

    @abstractmethod
    def push(self, utterance: str) -> Iterator[str]:
        """Batch tokens from a :class:`.Processor` to 'sentences' suitable for :class:`.Synthesizer`."""
        raise NotImplementedError

    @abstractmethod
    def drain(self, utterance: Optional[str]) -> Iterator[str]:
        """Flush remaining utterance/s at the end of the token input stream."""
        raise NotImplementedError


class OutputFilter(ABC):
    """
    In a post-processing step, output filters can opt to add either further text or PCM to be played.
    For example, beep sounds can indicate readiness or end of output.

    Configured by ``feedback`` (``noop``, ``speech``, ``beep`` as default).
    """

    @abstractmethod
    def accept(self, message: LlmMessage) -> Iterator[LlmMessage]:
        """Possibly filter out, augment, or replace for example keywords with utterances."""
        raise NotImplementedError

    @abstractmethod
    def generate(self, message: LlmMessage) -> Iterator[bytes]:
        """Possibly give for example signalling PCM audio for keywords."""
        raise NotImplementedError


class Synthesizer(ABC):
    """Actual instantiation, as created by the :class:`SynthesizerModel`."""

    @abstractmethod
    def sample_rate(self) -> int:
        """Provided sampling rate, which the :class:`.AudioSink` must accept."""
        raise NotImplementedError

    @abstractmethod
    def generate(self, utterance: str) -> Iterator[bytes]:
        """Text-to-speech implementation, giving PCM buffers for individual :class:`.SentenceSegmenter` utterances."""
        raise NotImplementedError


class SynthesizerModel(ContextManager[Synthesizer], ABC):
    """
    As actual text-to-speech implementation, synthesizers receive tokens/sentences and yield audio buffer streams.

    Must be configured by ``synthesizer`` (``espeak``, ``coqui``) and the per-class ``espeak_synthesizer`` or
    ``coqui_synthesizer`` objects, respectively.
    """
    pass