Fixes re: zognia's testing #2
4 changed files with 133 additions and 104 deletions
|
@ -25,71 +25,59 @@ os.close(old_stderr)
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
def check_rate(index, channels, rate):
|
|
||||||
try:
|
|
||||||
return pyaudio.is_format_supported(rate,
|
|
||||||
output_channels=channels,
|
|
||||||
output_device=index,
|
|
||||||
output_format=pya.paFloat32)
|
|
||||||
except ValueError:
|
|
||||||
return False
|
|
||||||
|
|
||||||
alt_rates = [44100, 48000]
|
|
||||||
class Clip:
    """An audio file decoded into memory as a float32 sample array.

    Attributes set by ``__init__``:
        path: The path the clip was loaded from.
        channels: Channel count of ``raw`` (2 when a mono file was duplicated
            because ``force_stereo`` was set).
        samplerate: Rate ``raw`` is stored at.
        raw: C-contiguous float32 samples. For more than one channel the array
            is the transpose of the channel-first layout used while loading,
            so its first axis is what ``length`` divides by ``samplerate``.
    """

    def __init__(self, path, samplerate=None, speed=1, force_stereo=True):
        """Load ``path``, optionally duplicate mono to stereo, resample and time-stretch.

        Args:
            path: Audio file handed to ``librosa.load``.
            samplerate: Target rate; falsy means keep the file's native rate.
            speed: Factor passed to ``stretch`` when it is not 1.
            force_stereo: Duplicate a single channel into two identical ones.
        """
        self.path = path
        raw, native_rate = librosa.load(self.path, sr=None, dtype='float32', mono=False)

        self.channels = raw.shape[0] if raw.ndim == 2 else 1
        if self.channels == 1:
            # Normalise a possible (1, n) array to (n,) so the mono layout is
            # the same no matter how the loader shaped it.
            raw = raw.reshape(-1)
            if force_stereo:
                raw = np.tile(raw, (2, 1))
                self.channels = 2

        self.samplerate = samplerate or native_rate
        if native_rate != self.samplerate:
            # Sample rates are passed by keyword: the names are stable across
            # librosa releases, while newer releases reject positional rates.
            raw = librosa.resample(raw, orig_sr=native_rate, target_sr=self.samplerate,
                                   fix=True, scale=True)

        self.raw = np.ascontiguousarray(self._stereo_transpose(raw), dtype='float32')

        if speed != 1:
            self.stretch(speed)

    @property
    def length(self):
        """Size of the first axis of ``raw`` divided by ``samplerate``."""
        return self.raw.shape[0] / self.samplerate

    def _stereo_transpose(self, ndata):
        """Return ``ndata`` unchanged for one channel, otherwise its transpose."""
        return ndata if self.channels == 1 else ndata.T

    def stretch(self, speed):
        """Replace ``raw`` with the result of ``tsm.wsola`` applied at ``speed``.

        The data is transposed before the call and transposed back afterwards
        (both are no-ops for a single channel).
        """
        stretched = tsm.wsola(self._stereo_transpose(self.raw), speed)
        self.raw = np.ascontiguousarray(self._stereo_transpose(stretched), dtype='float32')

    def save(self, filename):
        """Write the transposed sample data to ``filename`` via ``soundfile.write``."""
        soundfile.write(filename, self._stereo_transpose(self.raw), self.samplerate)
|
||||||
|
|
||||||
|
|
||||||
|
class Stream:
|
||||||
|
def __init__(self, clip, output_index, buffer_length=4096):
    """Bind ``clip`` to a new, not-yet-started output stream.

    Args:
        clip: Object providing ``channels``, ``samplerate`` and ``raw``.
        output_index: Device index forwarded as ``output_device_index``.
        buffer_length: Forwarded as ``frames_per_buffer``.
    """
    self.clip = clip
    self.pos = 0
    self.playing = False
    self._end_event = AioEvent()

    # The stream is opened with start=False; `_play` starts it on demand.
    stream_options = dict(
        output_device_index=output_index,
        format=pya.paFloat32,
        channels=self.clip.channels,
        rate=self.clip.samplerate,
        frames_per_buffer=buffer_length,
        output=True,
        stream_callback=self._read_callback,
        start=False,
    )
    self._stream = pyaudio.open(**stream_options)
|
||||||
|
|
||||||
@property
|
|
||||||
def length(self):
|
|
||||||
return self._raw.shape[0] / self._samplerate
|
|
||||||
|
|
||||||
def _stereo_transpose(self, ndata):
|
|
||||||
return ndata if self._channels == 1 else ndata.T
|
|
||||||
|
|
||||||
def stretch(self, speed):
|
|
||||||
stretched = tsm.wsola(self._stereo_transpose(self._raw), speed)
|
|
||||||
self._raw = np.ascontiguousarray(self._stereo_transpose(stretched), dtype='float32')
|
|
||||||
|
|
||||||
def save(self, filename):
|
|
||||||
soundfile.write(filename, self._stereo_transpose(self._raw), self._samplerate)
|
|
||||||
|
|
||||||
def _play(self):
    """Restart playback from the first frame, starting the stream if it is not active."""
    # Rewind BEFORE raising the flag. `_read_callback` ends playback as soon
    # as it sees `playing` set with `pos` at or past the end of the clip, so
    # if it ran between the two assignments in the opposite order, a replay
    # of a finished clip could be cut off immediately.
    # NOTE(review): assumes the callback runs on a separate thread - confirm.
    self.pos = 0
    self.playing = True

    if not self._stream.is_active():
        self._stream.start_stream()
|
||||||
|
@ -97,38 +85,48 @@ class Clip:
|
||||||
def play(self):
    """Start playback and block until the end event fires or the clip's length has elapsed."""
    finished = self._end_event
    finished.clear()
    self._play()
    # The wait is bounded by the clip length, so a missed event cannot block forever.
    finished.wait(timeout=self.clip.length)
|
||||||
|
|
||||||
async def aplay(self):
    """Start playback and await the end event, bounded by the clip's length.

    If the awaiting task is cancelled, playback is flagged as stopped and the
    stream is stopped; the cancellation itself is not re-raised.
    """
    self._end_event.clear()
    self._play()
    pending = self._end_event.coro_wait(timeout=self.clip.length)
    try:
        await pending
    except asyncio.CancelledError:
        self.playing = False
        self._stream.stop_stream()
|
||||||
|
|
||||||
def close(self):
    """Close the underlying output stream."""
    stream = self._stream
    stream.close()
|
||||||
|
|
||||||
def _read_callback(self, in_data, frame_count, time_info, status):
    """Stream callback: hand back the next ``frame_count`` frames of the clip.

    The returned buffer starts zero-filled, so a short final chunk (or a
    stream that is not playing) yields silence for the remainder. Always
    returns ``pya.paContinue``; the end of the clip is signalled through
    ``playing`` and the end event rather than the return flag.
    """
    clip = self.clip
    shape = (frame_count, clip.channels) if clip.channels > 1 else (frame_count,)
    buffer = np.zeros(shape, dtype='float32')

    if not self.playing:
        return buffer, pya.paContinue

    start = self.pos
    stop = start + frame_count
    chunk = clip.raw[start:stop]
    self.pos = stop
    buffer[:chunk.shape[0]] = chunk

    # Past the last frame: flag playback as finished and wake any waiter.
    if stop >= clip.raw.shape[0]:
        self.playing = False
        self._end_event.set()

    return buffer, pya.paContinue
|
||||||
|
|
||||||
|
@staticmethod
def check_rate(index, channels, rate):
    """Return what ``pyaudio.is_format_supported`` reports for float32 output.

    Args:
        index: Output device index.
        channels: Number of output channels to test.
        rate: Sample rate to test.

    Returns:
        The result of ``pyaudio.is_format_supported``, or ``False`` when that
        call raises ``ValueError``.
    """
    try:
        supported = pyaudio.is_format_supported(
            rate,
            output_channels=channels,
            output_device=index,
            output_format=pya.paFloat32)
    except ValueError:
        return False
    return supported
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def find_output_index(output):
|
def find_output_index(output):
|
||||||
if output is None:
|
if output is None:
|
|
@ -1,3 +1,3 @@
|
||||||
from .WebsocketServerProcess import WebsocketServerProcess
|
from .WebsocketServerProcess import WebsocketServerProcess
|
||||||
from .MainProcess import MainProcess
|
from .MainProcess import MainProcess
|
||||||
from .Clip import Clip
|
from .Audio import Clip, Stream
|
||||||
|
|
|
@ -1,47 +1,75 @@
|
||||||
import asyncio
|
import asyncio
|
||||||
from collections import deque
|
from collections import deque
|
||||||
|
|
||||||
|
import maya
|
||||||
|
|
||||||
from ovtk_audiencekit.plugins import PluginBase
|
from ovtk_audiencekit.plugins import PluginBase
|
||||||
from ovtk_audiencekit.core import Clip
|
from ovtk_audiencekit.core import Clip, Stream
|
||||||
|
|
||||||
class AudioAlert(PluginBase):
    """Plugin that plays audio files, caching decoded clips and open streams.

    ``clips`` maps ``'{path}@{speed}x'`` to ``[clip, last_used]``.
    ``streams`` maps a path to ``(deque_of_streams, keys_referencing_it)``, so
    every speed variant of one file shares the same pool of output streams.
    """

    def setup(self, output=None, timeout_min=1, sample_rate=None, buffer_length=4096, force_stereo=True):
        """Initialise caches, resolve the output device and start the cleanup task.

        Args:
            output: Device selector handed to ``Stream.find_output_index``.
            timeout_min: Idle minutes (in addition to the clip's own length)
                after which a cached clip is dropped.
            sample_rate: Rate clips are loaded at. When ``None`` the first of
                44100 / 48000 accepted by ``Stream.check_rate`` is used; if
                neither is accepted, clips keep their native rate.
            buffer_length: Frames per buffer for every ``Stream``.
            force_stereo: Forwarded to ``Clip``.
        """
        self.force_stereo = force_stereo
        self.timeout_min = timeout_min
        self.clips = {}
        self.streams = {}
        # Strong references to fire-and-forget playback tasks, so they cannot
        # be garbage-collected before they finish.
        self._play_tasks = set()
        self.buffer_length = int(buffer_length)
        self.output_index = Stream.find_output_index(output)

        if sample_rate is None:
            sample_rate = next(
                (rate for rate in (44100, 48000)
                 if Stream.check_rate(self.output_index, 1, rate)),
                None,
            )
            if sample_rate is None:
                self.logger.warning('Target audio device does not claim to support common sample rates! Attempting playback at native rate of audio')
        # Stored so `run` can actually load clips at the probed rate.
        self.sample_rate = sample_rate

        self._cleanup_task = asyncio.create_task(self._cleanup())

    def run(self, path, speed=1, immediate=True, poly=1, **kwargs):
        """Play ``path`` at ``speed``.

        Args:
            path: Audio file to play.
            speed: Time-stretch factor forwarded to ``Clip``.
            immediate: When true, schedule playback as a task and return;
                otherwise call the stream's blocking ``play``.
            poly: Number of streams kept per path (values below 1 are
                treated as 1). Fixed by the first call for a path.
            **kwargs: Accepted and ignored.
        """
        poly = max(1, int(poly))
        key = f'{path}@{speed}x'
        clip = self._get_clip(path, speed, key)
        stream = self._get_stream(path, key, clip, poly)

        if immediate:
            task = asyncio.create_task(stream.aplay())
            self._play_tasks.add(task)
            task.add_done_callback(self._play_tasks.discard)
        else:
            stream.play()

    def _get_clip(self, path, speed, key):
        """Return the cached clip for ``key``, loading it on a miss, and refresh its timestamp."""
        entry = self.clips.get(key)
        if entry is None:
            clip = Clip(path, samplerate=self.sample_rate, speed=speed,
                        force_stereo=self.force_stereo)
            self.clips[key] = [clip, maya.now()]
            return clip
        entry[1] = maya.now()
        return entry[0]

    def _get_stream(self, path, key, clip, poly):
        """Return a stream for ``path`` bound to ``clip``, opening one while the pool has room."""
        entry = self.streams.get(path)
        if entry is None:
            entry = (deque(maxlen=poly), set())
            self.streams[path] = entry
        stream_dq, refs = entry
        refs.add(key)

        if stream_dq.maxlen != poly:
            self.logger.warning('Cannot change poly while streams are active!')

        # Compare against the pool's own capacity, not the requested poly:
        # appending to a full deque would silently drop an unclosed stream.
        if len(stream_dq) < stream_dq.maxlen:
            stream = Stream(clip, self.output_index,
                            buffer_length=self.buffer_length)
            stream_dq.append(stream)
        else:
            stream_dq.rotate(1)
            stream = stream_dq[0]
            # A reused stream may have been created for another speed variant
            # of this path; point it at the clip that was actually requested.
            stream.clip = clip
        return stream

    async def _cleanup(self):
        """Once a minute, drop clips that have been idle for too long."""
        while True:
            await asyncio.sleep(60)
            self._evict_expired(maya.now())

    def _evict_expired(self, now):
        """Drop every clip whose idle deadline has passed and close streams nobody references."""
        for key, (clip, last_used) in list(self.clips.items()):
            if now >= last_used.add(minutes=self.timeout_min, seconds=clip.length):
                del self.clips[key]
                self.logger.debug(f'Dropping {key}')

                entry = self.streams.get(clip.path)
                if entry is None:
                    continue
                streams, refs = entry
                # discard, not remove: a missing key must not kill the cleanup task.
                refs.discard(key)
                self.logger.debug(f'Stream {clip.path} now refs {refs}')
                if not refs:
                    self.logger.debug('Closing streams...')
                    for stream in streams:
                        stream.close()
                    del self.streams[clip.path]
|
||||||
|
|
|
@ -8,7 +8,7 @@ from TTS.config import load_config
|
||||||
|
|
||||||
from ovtk_audiencekit.plugins import PluginBase
|
from ovtk_audiencekit.plugins import PluginBase
|
||||||
from ovtk_audiencekit.events import Message, SysMessage
|
from ovtk_audiencekit.events import Message, SysMessage
|
||||||
from ovtk_audiencekit.core import Clip
|
from ovtk_audiencekit.core import Clip, Stream
|
||||||
from ovtk_audiencekit.core.Data import CACHE_DIR
|
from ovtk_audiencekit.core.Data import CACHE_DIR
|
||||||
|
|
||||||
|
|
||||||
|
@ -18,12 +18,12 @@ class TextToSpeechPlugin(PluginBase):
|
||||||
|
|
||||||
self.speaker_wav = speaker_wav
|
self.speaker_wav = speaker_wav
|
||||||
|
|
||||||
self._output_index = Clip.find_output_index(output)
|
self.output_index = Stream.find_output_index(output)
|
||||||
|
|
||||||
conf_overrides = {k[2:]: v for k, v in kwargs.items() if k.startswith('o_')}
|
conf_overrides = {k[2:]: v for k, v in kwargs.items() if k.startswith('o_')}
|
||||||
|
|
||||||
self._cache = os.path.join(CACHE_DIR, 'tts')
|
self.cache_dir = os.path.join(CACHE_DIR, 'tts')
|
||||||
os.makedirs(os.path.dirname(self._cache), exist_ok=True)
|
os.makedirs(os.path.dirname(self.cache_dir), exist_ok=True)
|
||||||
|
|
||||||
self.cuda = cuda
|
self.cuda = cuda
|
||||||
|
|
||||||
|
@ -36,7 +36,7 @@ class TextToSpeechPlugin(PluginBase):
|
||||||
vocoder_path, vocoder_config_path = None, None
|
vocoder_path, vocoder_config_path = None, None
|
||||||
|
|
||||||
if conf_overrides:
|
if conf_overrides:
|
||||||
override_conf_path = os.path.join(self._cache, f'{self._name}_override.json')
|
override_conf_path = os.path.join(self.cache_dir, f'{self._name}_override.json')
|
||||||
|
|
||||||
config = load_config(config_path)
|
config = load_config(config_path)
|
||||||
for key, value in conf_overrides.items():
|
for key, value in conf_overrides.items():
|
||||||
|
@ -55,7 +55,7 @@ class TextToSpeechPlugin(PluginBase):
|
||||||
|
|
||||||
def make_tts_wav(self, text, filename=None):
|
def make_tts_wav(self, text, filename=None):
|
||||||
if filename is None:
|
if filename is None:
|
||||||
filename = os.path.join(self._cache, f'{uuid.uuid1()}.wav')
|
filename = os.path.join(self.cache_dir, f'{uuid.uuid1()}.wav')
|
||||||
|
|
||||||
if self.speaker_wav:
|
if self.speaker_wav:
|
||||||
wav = self.synthesizer.tts(text, None, 'en', self.speaker_wav)
|
wav = self.synthesizer.tts(text, None, 'en', self.speaker_wav)
|
||||||
|
@ -72,17 +72,20 @@ class TextToSpeechPlugin(PluginBase):
|
||||||
text += '.'
|
text += '.'
|
||||||
filename = self.make_tts_wav(text)
|
filename = self.make_tts_wav(text)
|
||||||
# TODO: Play direct from memory
|
# TODO: Play direct from memory
|
||||||
clip = Clip(filename, self._output_index, force_stereo=False)
|
clip = Clip(filename, force_stereo=True)
|
||||||
|
stream = Stream(clip, self.output_index)
|
||||||
if wait:
|
if wait:
|
||||||
async def play():
|
async def play():
|
||||||
await clip.aplay()
|
await stream.aplay()
|
||||||
clip.close()
|
stream.close()
|
||||||
|
os.remove(os.path.join(self.cache_dir, filename))
|
||||||
asyncio.create_task(play())
|
asyncio.create_task(play())
|
||||||
else:
|
else:
|
||||||
clip.play()
|
stream.play()
|
||||||
clip.close()
|
stream.close()
|
||||||
|
os.remove(os.path.join(self.cache_dir, filename))
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(e)
|
self.logger.error(f"Failed to make speech from input: {e}")
|
||||||
if source_event := _ctx.get('event'):
|
if source_event := _ctx.get('event'):
|
||||||
msg = SysMessage(self._name, 'Failed to make speech from input!!')
|
msg = SysMessage(self._name, 'Failed to make speech from input!!')
|
||||||
|
|
||||||
|
|
Loading…
Add table
Reference in a new issue