import logging
from pathlib import Path
from typing import Optional, Iterator, Dict
import pocketsphinx
from sttts.api.message import Message, SttMessageType
from sttts.api.model import RecognizerModel, Recognizer
from sttts.utils.utils import StringUtils


class SphinxRecognizer(RecognizerModel, Recognizer):
"""
Use the `Python bindings <https://pocketsphinx.readthedocs.io/en/latest/pocketsphinx.html>`__ of the
`PocketSphinx <https://pypi.org/project/pocketsphinx/>`__ speech recognizer package.
    Direct :class:`pocketsphinx.Decoder` access is used instead of the package's oversimplified wrappers for audio
    files or live speech, since the :class:`.speech.sphinx.SphinxSegmenter` already applies a
    :class:`pocketsphinx.Endpointer` beforehand.
    Generic transcription quality appears rather poor by today's standards, which makes this recognizer better
    suited to detecting specific phrases from a limited dictionary.
More recent models than the ``en-us`` one that the package ships with might be available on the
`PocketSphinx <https://github.com/cmusphinx/pocketsphinx>`__ project page or from
`SpeechRecognition <https://github.com/Uberi/speech_recognition>`__.
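
    A rough usage sketch, assuming a ``pcm_buffer`` of 16kHz mono PCM as produced by the segmenter; the key
    phrase and its mapped ``SttMessageType`` member are purely illustrative::

        model = SphinxRecognizer({"over and out": SttMessageType.Utterance})
        with model as recognizer:
            for message in recognizer.accept(pcm_buffer):
                print(message)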
"""

    def __init__(self, keywords: Dict[str, SttMessageType], *,
                 model_path: Optional[str] = None,
                 hmm: Optional[str] = None, lm: Optional[str] = None, dct: Optional[str] = None,
                 **kwargs) -> None:
"""
        :param keywords: Key phrases, matched case-insensitively and ignoring punctuation, mapped to the
            :class:`SttMessageType` to emit for them.
        :param str model_path: Common base path for the ``hmm``, ``lm``, and ``dct`` arguments.
            Defaults to the ``en-us`` model that ships with :mod:`pocketsphinx`.
:param str hmm: Sub-path to the directory containing acoustic model files, such as ``acoustic-model``.
:param str lm: Sub-path to the N-Gram language model, such as ``language-model.lm.bin``.
:param str dct: Sub-path to the pronunciation dictionary, such as ``pronounciation-dictionary.dict``.
:param kwargs: Extra :class:`pocketsphinx.Config`
`parameters <https://pocketsphinx.readthedocs.io/en/latest/config_params.html>`__
passed to :class:`pocketsphinx.Decoder`.
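
        A hypothetical example pointing at a downloaded model directory (the path is made up, the file names
        follow the layout described above)::

            SphinxRecognizer({}, model_path="/opt/models/de-DE",
                             hmm="acoustic-model", lm="language-model.lm.bin",
                             dct="pronounciation-dictionary.dict")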
"""
self._logger: logging.Logger = logging.getLogger(self.__class__.__name__)
pocketsphinx.set_loglevel("DEBUG" if self._logger.getEffectiveLevel() == logging.DEBUG else "WARN")
        base_path: Path = Path(model_path if model_path is not None else pocketsphinx.get_model_path())
        # Only pass explicit model files if all three components are given; otherwise pocketsphinx falls
        # back to its defaults, i.e. the bundled en-us model.
        model_args: Dict = {
            "hmm": str(base_path / hmm),
            "lm": str(base_path / lm),
            "dict": str(base_path / dct),
        } if hmm and lm and dct else {}
self._sample_rate: int = 16000
self._recognizer: pocketsphinx.Decoder = pocketsphinx.Decoder(pocketsphinx.Config(**kwargs),
loglevel="INFO",
samprate=self._sample_rate,
**model_args)
self._recording: bool = False
self._keywords: Dict[str, SttMessageType] = keywords

    def _transform_utterance_type(self, utterance: str) -> Message[SttMessageType]:
        # Known keywords (matched case-insensitively, ignoring punctuation) map to their configured
        # message type; anything else is passed on as a plain utterance line.
        try:
            return Message(self._keywords[StringUtils.strip_punctuation(utterance).lower()], utterance)
        except KeyError:
            return Message(SttMessageType.Utterance, utterance.strip() + "\n")

    def sample_rate(self) -> int:
        return self._sample_rate

    def accept(self, buffer: bytes) -> Iterator[Message[SttMessageType]]:
        # NB: The buffer arrives already segmented by the sphinx speech segmenter, so there is no need for
        # incremental processing via get_in_speech(); each buffer is decoded as one full utterance.
        self._recognizer.start_utt()
        self._recognizer.process_raw(buffer, full_utt=True)
        self._recognizer.end_utt()
        hypothesis = self._recognizer.hyp()  # None if the decoder produced no hypothesis
        result: str = hypothesis.hypstr if hypothesis is not None else ""
        if result.strip():
            yield self._transform_utterance_type(result)

    def __enter__(self) -> Recognizer:
        self._logger.info(f"Entering {self._recognizer}")
        return self

    def __exit__(self, *args) -> None:
        del self._recognizer