import json
import logging
from pathlib import Path
from typing import Optional, Iterator, Dict
import requests
import vosk
from sttts.api.message import Message, SttMessageType, ModelNotFoundError
from sttts.api.model import RecognizerModel, Recognizer
from sttts.utils.utils import StringUtils, PerfCounter
class VoskRecognizer(RecognizerModel, Recognizer):
    """
    Use the `Vosk <https://pypi.org/project/vosk/>`__ `speech recognition toolkit <https://alphacephei.com/vosk/>`__,
    with a wide range of `models <https://alphacephei.com/vosk/models>`__ available.
    This implementation seems to provide good detection capabilities and also runs on low-end hardware.
    """

    # Seconds to wait for the online model index; without a timeout a stalled server would block forever.
    _MODEL_LIST_TIMEOUT: float = 10.0

    def __init__(self, keywords: Dict[str, SttMessageType], *,
                 model_name: Optional[str] = None, download: bool = False,
                 sample_rate: int = 16000) -> None:
        """
        :param dict keywords: Maps a recognized phrase to the message type to emit instead of a plain utterance.
               Lookup is done with the utterance passed through ``StringUtils.strip_punctuation`` and lower-cased,
               so keys should be given in that form.
        :param str model_name: Model to use, for example ``vosk-model-small-en-us-0.15``.
               Omitted to list available models if *download* is enabled.
        :param bool download: Opt-in model search and automatic download.
               Otherwise, ensure the model exists beforehand.
        :param int sample_rate: Accepted input sampling rate, default 16000.
               Might be changed if 16K is not supported by the input recording device.
        :raises ModelNotFoundError: If no *model_name* is given (carrying the online model list as options when
                *download* is enabled), or if the model is not found locally and *download* is disabled.
        """
        self._logger: logging.Logger = logging.getLogger(self.__class__.__name__)
        # Let Vosk emit its own log output only when debugging; "<=" also covers custom levels below DEBUG.
        verbose: bool = self._logger.getEffectiveLevel() <= logging.DEBUG
        vosk.SetLogLevel(0 if verbose else -1)

        self._sample_rate: int = sample_rate
        self._keywords: Dict[str, SttMessageType] = keywords

        if model_name is None:
            # Nothing to load: report the error, with the available models as a hint if going online is allowed.
            raise ModelNotFoundError(self.__class__.__name__, model=None, msg=None,
                                     options=list(self._list_models()) if download else None)
        if download:
            # Lookup by name lets Vosk search its model directories and fetch the model when missing.
            model: vosk.Model = vosk.Model(model_name=model_name)
        else:
            # An explicit path avoids any implicit download.
            model = vosk.Model(model_path=self._find_model(model_name))

        self._model_name: str = model_name
        self._recognizer: vosk.KaldiRecognizer = vosk.KaldiRecognizer(model, self._sample_rate)

    @classmethod
    def _find_model(cls, model_name: str) -> str:
        """
        Locate an already installed model by its directory name.

        :param str model_name: Exact name of the model directory to look for.
        :return: POSIX-style path of the first matching model directory.
        :raises ModelNotFoundError: If no directory in ``vosk.MODEL_DIRS`` contains the model;
                the searched directories are passed as options.
        """
        # duplicate check, as cannot search for models without downloading them, so need to pass an explicit path
        for directory in (Path(entry) for entry in vosk.MODEL_DIRS if entry):
            if not directory.is_dir():
                continue
            for model_dir in directory.iterdir():
                if model_dir.is_dir() and model_dir.name == model_name:
                    return model_dir.as_posix()
        raise ModelNotFoundError(cls.__name__, model=model_name, msg=None,
                                 options=[str(entry) for entry in vosk.MODEL_DIRS if entry])

    @classmethod
    def _list_models(cls) -> Iterator[str]:
        """
        Fetch the online model index and yield a description of each model not marked obsolete.

        :return: Strings of the form ``name (language, size)``.
        :raises requests.RequestException: If the index cannot be fetched in time or the server reports an error.
        """
        response = requests.get(vosk.MODEL_LIST_URL, timeout=cls._MODEL_LIST_TIMEOUT)
        # Fail with a clear HTTP error instead of trying to parse an error page as JSON.
        response.raise_for_status()
        for model in response.json():
            # The index encodes the flag as the string "false", not as a JSON boolean.
            if model["obsolete"] == "false":
                yield f"{model['name']} ({model['lang_text']}, {model['size_text']})"

    def _create_utterance(self, utterance: str) -> Iterator[Message[SttMessageType]]:
        """
        Turn recognized text into at most one message.

        Blank text yields nothing. Text matching a configured keyword yields a message of the keyword's type
        with the unmodified text, anything else a plain utterance message with stripped text and a newline added.
        """
        if not utterance.strip():
            return
        try:
            # Keep the try body to the lookup only, so unrelated KeyErrors are not mistaken for "no keyword".
            message_type: SttMessageType = self._keywords[StringUtils.strip_punctuation(utterance).lower()]
        except KeyError:
            yield Message(SttMessageType.Utterance, utterance.strip() + "\n")
        else:
            yield Message(message_type, utterance)

    def sample_rate(self) -> int:
        """Return the input sampling rate the recognizer was created with."""
        return self._sample_rate

    def reset(self) -> None:
        """Reset the underlying Vosk recognizer."""
        self._recognizer.Reset()

    def accept(self, buffer: bytes) -> Iterator[Message[SttMessageType]]:
        """
        Run recognition on a complete audio buffer and yield the resulting message, if any.

        :param bytes buffer: Raw audio; the reported duration assumes two bytes per sample
               (presumably 16-bit mono PCM at the configured sampling rate).
        """
        # even if no silence detected by accept(), ignore any PartialResult and end the stream, flushing buffers
        with PerfCounter(self._logger, logging.INFO, "msec") as counter:
            self._recognizer.AcceptWaveform(buffer)
            utterance: str = json.loads(self._recognizer.FinalResult())["text"]
            # Hand the audio length in milliseconds to the counter: bytes -> samples -> seconds -> msec.
            counter(round(len(buffer) / 2.0 / self._sample_rate * 1000.0))
        yield from self._create_utterance(utterance)

    def __enter__(self) -> Recognizer:
        """Log the recognizer and model in use and return this instance."""
        self._logger.info(f"Entering {self._recognizer}[{self._model_name}]")
        return self

    def __exit__(self, *args) -> None:
        """Reset the underlying recognizer when the context is left; exceptions are not suppressed."""
        self._recognizer.Reset()