from abc import ABC, abstractmethod
from typing import ContextManager, Iterator, List, Optional
from .message import SttMessage, LlmMessage
class AudioSource(ContextManager[None], Iterator[bytes], ABC):
"""
    Audio sources provide the input for speech recognition and thus serve as the interface for interaction, typically a
    microphone.
Must be configured by ``source`` (``alsa``, ``pulse``, ``pyaudio``, ``wave``) and the corresponding per-class
objects ``alsa_source``, ``pulse_source``, ``pyaudio_source``, or ``wave_source``.
"""
pass
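
# Illustrative sketch only: a conceivable file-backed source along the lines of
# ``wave_source``. The class name, frame size, and file handling are assumptions
# for demonstration, not the actual implementation.
import wave

class _WaveFileSource(AudioSource):
    def __init__(self, path: str, frames: int = 1024) -> None:
        self._path, self._frames = path, frames
        self._wave: Optional[wave.Wave_read] = None

    def __enter__(self) -> None:
        self._wave = wave.open(self._path, "rb")

    def __exit__(self, *exc_info) -> None:
        assert self._wave is not None
        self._wave.close()

    def __next__(self) -> bytes:
        assert self._wave is not None
        buffer = self._wave.readframes(self._frames)
        if not buffer:  # end of file reached
            raise StopIteration
        return buffer
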
class AudioSink(ContextManager[None], ABC):
"""
    Sinks, typically speaker or headphone audio devices, receive the text-to-speech audio data generated by the
    :class:`.Synthesizer`.
Must be configured by ``sink`` (``alsa``, ``pulse``, ``pyaudio``, ``wave``), and the per-class ``alsa_sink``,
``pulse_sink``, ``pyaudio_sink``, or ``wave_sink`` objects, respectively.
"""
@abstractmethod
def play(self, buffer: bytes) -> None:
"""Possibly blocking playback of the given PCM, depending on internal buffering."""
raise NotImplementedError
def play_all(self, buffers: Iterator[bytes]) -> None:
"""Convenience helper to sequentially consume all generated playback buffers."""
for buffer in buffers:
self.play(buffer)
def drain(self) -> None:
"""Flush buffers and wait until playback stops, in especially in order to start :class:`.AudioSource` again."""
pass
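
# Illustrative sketch only: a conceivable file-backed sink along the lines of
# ``wave_sink``, appending 16-bit mono PCM to a WAV file (reuses the ``wave``
# import from the sketch above). Names and format parameters are assumptions.
class _WaveFileSink(AudioSink):
    def __init__(self, path: str, rate: int = 16000) -> None:
        self._path, self._rate = path, rate
        self._wave: Optional[wave.Wave_write] = None

    def __enter__(self) -> None:
        self._wave = wave.open(self._path, "wb")
        self._wave.setnchannels(1)   # mono
        self._wave.setsampwidth(2)   # 16-bit signed PCM
        self._wave.setframerate(self._rate)

    def __exit__(self, *exc_info) -> None:
        assert self._wave is not None
        self._wave.close()

    def play(self, buffer: bytes) -> None:
        assert self._wave is not None
        self._wave.writeframes(buffer)
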
class SpeechSegmenter(ContextManager[None], ABC):
"""
Not all voice recognition implementations provide support for a fully streamed operation, i.e., are able to
continuously receive audio frames, detect silence or activity, and transcribe speech on-the-fly.
Thus, this explicit and exchangeable pre-processing step monitors input audio and yields buffers that contain a
whole utterance, as separated by short breaks of silence.
    As this particular aspect of the pipeline largely depends on environmental conditions, choosing an implementation
    and its configuration might require some trial and error.
Configured by ``speech_segmenter`` (``simple`` as default, ``median``, ``band``, ``sphinx``) and the corresponding
per-class objects ``simple_speech_segmenter``, ``median_speech_segmenter``, ``band_speech_segmenter``, or
``sphinx_speech_segmenter``.
The ``speech_buffer_limit`` (30.0) and ``speech_pause_limit`` (30.0) configuration values limit the allowed
    utterance and silence durations. This should prevent excessive buffering in case of spuriously detected speech
    activity or a mis-detected 'start' keyword.
"""
@abstractmethod
def push(self, buffer: bytes) -> Iterator[bytes]:
"""Append recorded :class:`.AudioSource` PCM, possibly flushing the whole utterance for :class:`.Recognizer`."""
raise NotImplementedError
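
# Illustrative sketch only: a naive energy-based segmentation strategy in the
# spirit of the ``simple`` implementation. The peak threshold and frame counting
# are assumptions for demonstration, not the actual defaults.
import struct

class _EnergySpeechSegmenter(SpeechSegmenter):
    def __init__(self, threshold: int = 500, silence_frames: int = 20) -> None:
        self._threshold = threshold            # peak amplitude regarded as speech
        self._silence_frames = silence_frames  # consecutive quiet frames to flush
        self._quiet, self._utterance = 0, bytearray()

    def push(self, buffer: bytes) -> Iterator[bytes]:
        # Peak amplitude of signed 16-bit little-endian mono PCM frames.
        peak = max((abs(s) for (s,) in struct.iter_unpack("<h", buffer)), default=0)
        if peak >= self._threshold:
            self._quiet = 0
            self._utterance.extend(buffer)
        elif self._utterance:
            self._quiet += 1
            self._utterance.extend(buffer)
            if self._quiet >= self._silence_frames:
                yield bytes(self._utterance)  # whole utterance, separated by silence
                self._utterance.clear()
                self._quiet = 0
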
class Recognizer(ABC):
"""Actual instantiation, as created by the :class:`RecognizerModel`."""
@abstractmethod
def sample_rate(self) -> int:
"""Accepted sampling rate, which the :class:`.AudioSource` must provide."""
raise NotImplementedError
@abstractmethod
def accept(self, buffer: bytes) -> Iterator[SttMessage]:
"""Speech recognition, accepting PCM buffers from :class:`.SpeechSegmenter`."""
raise NotImplementedError
def reset(self) -> None:
"""Reset internal state to start listening from scratch."""
pass
class RecognizerModel(ContextManager[Recognizer], ABC):
"""
Voice recognizers transcribe speech from audio buffers.
Apart from generic prompts, utterances that only consist of a single keyword are detected.
    Via the ``keywords`` object, alternate hotwords for ``start``, ``reset``, ``commit``, or ``stop`` can be configured.
Must be configured by ``recognizer`` (``sphinx``, ``vosk``, ``whisper``) and the corresponding per-class
``sphinx_recognizer``, ``vosk_recognizer``, or ``whisper_recognizer`` objects.
"""
pass
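
# Illustrative sketch only: how the input half of the pipeline might be driven.
# All arguments stand in for concrete configured instances; error handling,
# sample rate negotiation, and keyword handling are elided for brevity.
def _transcribe(model: RecognizerModel, source: AudioSource,
                segmenter: SpeechSegmenter) -> Iterator[SttMessage]:
    with model as recognizer, source, segmenter:
        for frame in source:
            for utterance in segmenter.push(frame):
                yield from recognizer.accept(utterance)
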
class Processor(ContextManager, ABC):
"""Actual instantiation, as created by the :class:`ProcessorModel`."""
@abstractmethod
def generate(self, utterance: List[str]) -> Iterator[str]:
"""Run LLM session on the given prompt/s, yielding tokens to be piped to a :class:`.SentenceSegmenter`."""
raise NotImplementedError
class ProcessorModel(ContextManager[Processor], ABC):
"""
Processors are the core functionality, formed by LLMs, which receive transcribed prompts and yield tokens to be
synthesized to speech output.
Must be configured by ``processor`` (``noop``, ``ollama``, ``gpt4all``) and the corresponding ``ollama_processor``
or ``gpt4all_processor`` objects, respectively.
"""
pass
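
# Illustrative sketch only: a minimal echoing pair in the spirit of the ``noop``
# processor (its actual behaviour is an assumption here), yielding the prompts
# back as naive whitespace-separated "tokens".
class _EchoProcessor(Processor):
    def generate(self, utterance: List[str]) -> Iterator[str]:
        for prompt in utterance:
            yield from prompt.split()

class _EchoProcessorModel(ProcessorModel):
    def __enter__(self) -> Processor:
        return _EchoProcessor()
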
class SentenceSegmenter(ABC):
"""
Not all synthesizers support a streaming operation, i.e., are able to continuously receive text/token input while
yielding internally buffered chunks of audio.
    In their simplest form, sentence segmenters thus combine and flush tokens until certain boundaries are found, for
    example sentence-terminating full stops.
    By this means, playback can start as soon as the first sentence is available, while further tokens and synthesized
    output are still being generated in the background.
Configured by ``sentence_segmenter`` (``split`` as default, ``sbd``) and the per-class config objects
``split_sentence_segmenter``, or ``sbd_sentence_segmenter``, respectively.
"""
@abstractmethod
def push(self, utterance: str) -> Iterator[str]:
"""Batch tokens from a :class:`.Processor` to 'sentences' suitable for :class:`.Synthesizer`."""
raise NotImplementedError
@abstractmethod
def drain(self, utterance: Optional[str]) -> Iterator[str]:
"""Flush remaining utterance/s at the end of the token input stream."""
raise NotImplementedError
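
# Illustrative sketch only: a punctuation-splitting strategy in the spirit of
# the ``split`` default; the boundary set and buffering are assumptions for
# demonstration.
class _SplitSentenceSegmenter(SentenceSegmenter):
    _BOUNDARIES = (".", "!", "?", ":")

    def __init__(self) -> None:
        self._pending: List[str] = []

    def push(self, utterance: str) -> Iterator[str]:
        self._pending.append(utterance)
        if utterance.rstrip().endswith(self._BOUNDARIES):
            yield "".join(self._pending)
            self._pending.clear()

    def drain(self, utterance: Optional[str]) -> Iterator[str]:
        if utterance is not None:
            self._pending.append(utterance)
        if self._pending:  # flush whatever is left, boundary or not
            yield "".join(self._pending)
            self._pending.clear()
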
class OutputFilter(ABC):
"""
In a post-processing step, output filters can opt to add either further text or PCM to be played.
For example, beep sounds can indicate readiness or end of output.
Configured by ``feedback`` (``noop``, ``speech``, ``beep`` as default).
"""
@abstractmethod
def accept(self, message: LlmMessage) -> Iterator[LlmMessage]:
"""Possibly filter out, augment, or replace for example keywords with utterances."""
raise NotImplementedError
@abstractmethod
def generate(self, message: LlmMessage) -> Iterator[bytes]:
"""Possibly give for example signalling PCM audio for keywords."""
raise NotImplementedError
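
# Illustrative sketch only: a pass-through filter along the lines of the
# ``noop`` feedback setting (as assumed here), neither altering messages nor
# adding any signalling audio.
class _NoopOutputFilter(OutputFilter):
    def accept(self, message: LlmMessage) -> Iterator[LlmMessage]:
        yield message

    def generate(self, message: LlmMessage) -> Iterator[bytes]:
        yield from ()  # no extra PCM to play
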
class Synthesizer(ABC):
"""Actual instantiation, as created by the :class:`SynthesizerModel`."""
@abstractmethod
def sample_rate(self) -> int:
"""Provided sampling rate, which the :class:`.AudioSink` must accept."""
raise NotImplementedError
@abstractmethod
def generate(self, utterance: str) -> Iterator[bytes]:
"""Text-to-speech implementation, giving PCM buffers for individual :class:`.SentenceSegmenter` utterances."""
raise NotImplementedError
class SynthesizerModel(ContextManager[Synthesizer], ABC):
"""
As actual text-to-speech implementation, synthesizers receive tokens/sentences and yield audio buffer streams.
Must be configured by ``synthesizer`` (``espeak``, ``coqui``) and the per-class ``espeak_synthesizer`` or
``coqui_synthesizer`` objects, respectively.
"""
pass
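
# Illustrative sketch only: the output half of the pipeline, mirroring the
# ``_transcribe`` sketch above. All arguments stand in for concrete configured
# instances; output filtering is elided for brevity.
def _speak(model: SynthesizerModel, segmenter: SentenceSegmenter,
           sink: AudioSink, tokens: Iterator[str]) -> None:
    with model as synthesizer, sink:
        for token in tokens:
            for sentence in segmenter.push(token):
                sink.play_all(synthesizer.generate(sentence))
        for sentence in segmenter.drain(None):  # flush any trailing utterance
            sink.play_all(synthesizer.generate(sentence))
        sink.drain()  # wait for playback to finish before listening again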