from typing import Iterator, List, Optional
import pysbd
from sttts.api.message import ModelError
from sttts.api.model import SentenceSegmenter
class SentenceBoundarySegmenter(SentenceSegmenter):
    """
    Use the `pySBD <https://github.com/nipunsadvilkar/pySBD>`__ module for sentence boundary disambiguation.

    Pushed text accumulates in an internal buffer. Each :meth:`push` re-segments the whole buffer and emits every
    segment but the last one, which is held back as the new buffer. :meth:`drain` flushes whatever is left.
    """

    def __init__(self, *, language: str = "en") -> None:
        """
        :param str language: Language code handed to ``pysbd.Segmenter``, default ``en``.
        :raises ModelError: If ``pysbd.Segmenter`` rejects its arguments with a ``ValueError``.
        """
        self._buffer: str = ""
        # Only the segmenter construction can raise the ValueError handled here.
        try:
            self._segmenter: pysbd.Segmenter = pysbd.Segmenter(
                language=language,
                clean=False, char_span=False,  # XXX: cannot clean *and* get offsets
            )
        except ValueError as e:
            # Chain explicitly so the original pysbd error stays attached as the cause.
            raise ModelError(self.__class__.__name__, language, str(e)) from e

    def push(self, utterance: str) -> Iterator[str]:
        """
        Append text to the buffer and emit the segments that precede the last one.

        This is a generator: the buffer is only extended and re-segmented once iteration starts, and the held-back
        remainder is only stored after all preceding segments have been consumed.

        :param str utterance: Text to append to the pending buffer.
        :return: Iterator over the stripped, non-empty segments before the last one.
        """
        self._buffer += utterance
        parts: List[str] = self._segmenter.segment(self._buffer)
        if len(parts) < 2:
            # Zero or one segment: nothing to emit yet, keep the buffer untouched.
            return

        # The last segment is held back as the new buffer, everything before it is emitted.
        *finished, remainder = parts
        for part in finished:
            span: str = part.strip()
            if span:
                yield span
        self._buffer = remainder

    def drain(self, utterance: Optional[str]) -> Iterator[str]:
        """
        Flush the buffer, optionally followed by a final piece of text.

        :param utterance: Optional trailing text, yielded unmodified after the buffered remainder if non-empty.
        :return: Iterator over the stripped buffer content (if non-empty) and then ``utterance`` (if non-empty).
        """
        # Reset the buffer in the same step as reading it, so the segmenter starts empty afterwards.
        sentence, self._buffer = self._buffer.strip(), ""
        if sentence:
            yield sentence
        if utterance:
            yield utterance