import logging
from pathlib import Path
from typing import Optional, Iterator, Dict
import pocketsphinx
from sttts.api.message import Message, SttMessageType
from sttts.api.model import RecognizerModel, Recognizer
from sttts.utils.utils import StringUtils


class SphinxRecognizer(RecognizerModel, Recognizer):
"""
Use the `Python bindings <https://pocketsphinx.readthedocs.io/en/latest/pocketsphinx.html>`__ of the
`PocketSphinx <https://pypi.org/project/pocketsphinx/>`__ speech recognizer package.
    Direct :class:`pocketsphinx.Decoder` access is used instead of the package's oversimplified wrappers for audio
    files or live speech, since the :class:`.speech.sphinx.SphinxSegmenter` already applies a
    :class:`pocketsphinx.Endpointer` beforehand.
    Generic transcription quality appears rather poor by today's standards, which makes this recognizer better
    suited to detecting specific phrases from a limited dictionary.
More recent models than the ``en-us`` one that the package ships with might be available on the
`PocketSphinx <https://github.com/cmusphinx/pocketsphinx>`__ project page or from
`SpeechRecognition <https://github.com/Uberi/speech_recognition>`__.
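
    A rough usage sketch, assuming a ``pcm_buffer`` of 16kHz mono PCM as produced by the segmenter; the key
    phrase and its mapped ``SttMessageType`` member are purely illustrative::

        model = SphinxRecognizer({"over and out": SttMessageType.Utterance})
        with model as recognizer:
            for message in recognizer.accept(pcm_buffer):
                print(message)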
"""

    def __init__(self, keywords: Dict[str, SttMessageType], *,
                 model_path: Optional[str] = None,
                 hmm: Optional[str] = None, lm: Optional[str] = None, dct: Optional[str] = None,
                 **kwargs) -> None:
"""
        :param keywords: Key phrases, matched case-insensitively and ignoring punctuation, mapped to the
            :class:`SttMessageType` to emit for them.
        :param str model_path: Common base path for the ``hmm``, ``lm``, and ``dct`` arguments.
            Defaults to the ``en-us`` model that ships with :mod:`pocketsphinx`.
:param str hmm: Sub-path to the directory containing acoustic model files, such as ``acoustic-model``.
:param str lm: Sub-path to the N-Gram language model, such as ``language-model.lm.bin``.
:param str dct: Sub-path to the pronunciation dictionary, such as ``pronounciation-dictionary.dict``.
:param kwargs: Extra :class:`pocketsphinx.Config`
`parameters <https://pocketsphinx.readthedocs.io/en/latest/config_params.html>`__
passed to :class:`pocketsphinx.Decoder`.
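
        A hypothetical example pointing at a downloaded model directory (the path is made up, the file names
        follow the layout described above)::

            SphinxRecognizer({}, model_path="/opt/models/de-DE",
                             hmm="acoustic-model", lm="language-model.lm.bin",
                             dct="pronounciation-dictionary.dict")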
"""
self._logger: logging.Logger = logging.getLogger(self.__class__.__name__)
pocketsphinx.set_loglevel("DEBUG" if self._logger.getEffectiveLevel() == logging.DEBUG else "WARN")
        base_path: Path = Path(model_path if model_path is not None else pocketsphinx.get_model_path())
        # Only pass explicit model files if all three components are given; otherwise pocketsphinx falls
        # back to its defaults, i.e. the bundled en-us model.
        model_args: Dict = {
            "hmm": str(base_path / hmm),
            "lm": str(base_path / lm),
            "dict": str(base_path / dct),
        } if hmm and lm and dct else {}
self._sample_rate: int = 16000
self._recognizer: pocketsphinx.Decoder = pocketsphinx.Decoder(pocketsphinx.Config(**kwargs),
loglevel="INFO",
samprate=self._sample_rate,
**model_args)
self._recording: bool = False
self._keywords: Dict[str, SttMessageType] = keywords

    def _transform_utterance_type(self, utterance: str) -> Message[SttMessageType]:
        # Known keywords (matched case-insensitively, ignoring punctuation) map to their configured
        # message type; anything else is passed on as a plain utterance line.
        try:
            return Message(self._keywords[StringUtils.strip_punctuation(utterance).lower()], utterance)
        except KeyError:
            return Message(SttMessageType.Utterance, utterance.strip() + "\n")

    def sample_rate(self) -> int:
        return self._sample_rate

    def accept(self, buffer: bytes) -> Iterator[Message[SttMessageType]]:
        # NB: The buffer arrives already segmented by the sphinx speech segmenter, so there is no need for
        # incremental processing via get_in_speech(); each buffer is decoded as one full utterance.
        self._recognizer.start_utt()
        self._recognizer.process_raw(buffer, full_utt=True)
        self._recognizer.end_utt()
        hypothesis = self._recognizer.hyp()  # None if the decoder produced no hypothesis
        result: str = hypothesis.hypstr if hypothesis is not None else ""
        if result.strip():
            yield self._transform_utterance_type(result)

    def __enter__(self) -> Recognizer:
        self._logger.info(f"Entering {self._recognizer}")
        return self

    def __exit__(self, *args) -> None:
        del self._recognizer