Fixes re: zognia's testing #2
4 changed files with 133 additions and 104 deletions
|
@ -25,7 +25,99 @@ os.close(old_stderr)
|
|||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class Clip:
    """Audio file decoded into memory, optionally resampled and time-stretched.

    Attributes (all assigned in ``__init__``):
        path: the path that was handed to ``librosa.load``.
        channels: number of channels held in ``raw``.
        samplerate: sample rate ``raw`` is stored at.
        raw: contiguous float32 samples; for multi-channel clips this is the
            transpose of the array ``librosa.load`` returned, so axis 0 is
            the one ``length`` divides by the sample rate.
    """

    def __init__(self, path, samplerate=None, speed=1, force_stereo=True):
        """Load ``path`` and prepare its samples.

        Args:
            path: audio file to load.
            samplerate: rate to resample to; a falsy value keeps the file's
                own rate.
            speed: factor handed to ``stretch``; 1 skips stretching.
            force_stereo: turn a single-channel file into a two-channel clip.
        """
        self.path = path
        raw, native_rate = librosa.load(self.path, sr=None, dtype='float32', mono=False)

        self.channels = raw.shape[0] if len(raw.shape) == 2 else 1
        if force_stereo and self.channels == 1:
            # np.resize fills the larger shape with repeated copies of the
            # input, so both rows carry the mono signal.
            raw = np.resize(raw, (2, *raw.shape))
            self.channels = 2

        self.samplerate = samplerate or native_rate
        if native_rate != self.samplerate:
            # The rates are passed by keyword: recent librosa releases made
            # orig_sr / target_sr keyword-only, older ones accept both forms.
            raw = librosa.resample(
                raw,
                orig_sr=native_rate,
                target_sr=self.samplerate,
                fix=True,
                scale=True,
            )

        self.raw = np.ascontiguousarray(self._stereo_transpose(raw), dtype='float32')

        if speed != 1:
            self.stretch(speed)

    @property
    def length(self):
        """Clip duration: size of ``raw`` along axis 0 divided by ``samplerate``."""
        return self.raw.shape[0] / self.samplerate

    def _stereo_transpose(self, ndata):
        """Return ``ndata`` unchanged for one channel, otherwise its transpose."""
        return ndata if self.channels == 1 else ndata.T

    def stretch(self, speed):
        """Replace ``raw`` with a ``tsm.wsola`` time-stretched copy.

        ``raw`` is transposed before the call and the result transposed back,
        so the stored layout stays the same.
        """
        stretched = tsm.wsola(self._stereo_transpose(self.raw), speed)
        self.raw = np.ascontiguousarray(self._stereo_transpose(stretched), dtype='float32')

    def save(self, filename):
        """Write the clip to ``filename`` at ``samplerate`` using ``soundfile``.

        ``raw`` is already stored with frames along axis 0, which is the
        layout ``soundfile.write`` takes, so it is written as-is. Transposing
        it again would present the channel axis as the frame axis.
        """
        soundfile.write(filename, self.raw, self.samplerate)
|
||||
|
||||
|
||||
class Stream:
|
||||
def __init__(self, clip, output_index, buffer_length=4096):
    """Bind ``clip`` to a callback-driven output stream that is not yet started.

    Args:
        clip: supplies ``channels``, ``samplerate`` and the samples to play.
        output_index: forwarded as ``output_device_index``.
        buffer_length: forwarded as ``frames_per_buffer``.
    """
    self.clip = clip
    # Index into clip.raw of the next frame _read_callback will hand out.
    self.pos = 0
    self.playing = False
    self._end_event = AioEvent()

    stream_options = dict(
        output_device_index=output_index,
        format=pya.paFloat32,
        channels=self.clip.channels,
        rate=self.clip.samplerate,
        frames_per_buffer=buffer_length,
        output=True,
        stream_callback=self._read_callback,
        start=False,
    )
    self._stream = pyaudio.open(**stream_options)
|
||||
|
||||
def _play(self):
|
||||
self.playing = True
|
||||
self.pos = 0
|
||||
|
||||
if not self._stream.is_active():
|
||||
self._stream.start_stream()
|
||||
|
||||
def play(self):
|
||||
self._end_event.clear()
|
||||
self._play()
|
||||
self._end_event.wait(timeout=self.clip.length)
|
||||
|
||||
async def aplay(self):
|
||||
self._end_event.clear()
|
||||
self._play()
|
||||
try:
|
||||
await self._end_event.coro_wait(timeout=self.clip.length)
|
||||
except asyncio.CancelledError:
|
||||
self.playing = False
|
||||
self._stream.stop_stream()
|
||||
|
||||
def close(self):
|
||||
self._stream.close()
|
||||
|
||||
def _read_callback(self, in_data, frame_count, time_info, status):
    """Stream callback: return ``frame_count`` frames, zero-filled where the clip has none.

    ``in_data``, ``time_info`` and ``status`` are not used. While ``playing``
    is false the buffer is all zeros. The flag returned is always
    ``pya.paContinue``.
    """
    clip = self.clip
    if clip.channels > 1:
        shape = (frame_count, clip.channels)
    else:
        shape = (frame_count,)
    out = np.zeros(shape, dtype='float32')

    if self.playing:
        start = self.pos
        stop = start + frame_count
        # Slicing past the end just yields a shorter chunk; the tail of
        # `out` stays zero.
        chunk = clip.raw[start:stop]
        self.pos = stop
        out[:chunk.shape[0]] = chunk

        if self.pos >= clip.raw.shape[0]:
            # Every frame has been handed out: clear the flag, wake waiters.
            self.playing = False
            self._end_event.set()

    return out, pya.paContinue
|
||||
|
||||
@staticmethod
|
||||
def check_rate(index, channels, rate):
|
||||
try:
|
||||
return pyaudio.is_format_supported(rate,
|
||||
|
@ -35,100 +127,6 @@ def check_rate(index, channels, rate):
|
|||
except ValueError:
|
||||
return False
|
||||
|
||||
alt_rates = [44100, 48000]
|
||||
class Clip:
|
||||
def __init__(self, path, output_index, buffer_length=2048, speed=1, force_stereo=True):
    """Load ``path``, fit it to the output device and open a stopped stream.

    Args:
        path: audio file handed to ``librosa.load``.
        output_index: device index used for the rate checks and forwarded
            as ``output_device_index``.
        buffer_length: forwarded as ``frames_per_buffer``.
        speed: factor handed to ``stretch``; 1 skips stretching.
        force_stereo: turn a single-channel file into a two-channel clip.
    """
    _raw, native_rate = librosa.load(path, sr=None, dtype='float32', mono=False)
    self._channels = _raw.shape[0] if len(_raw.shape) == 2 else 1
    if force_stereo and self._channels == 1:
        # np.resize fills the larger shape with repeated copies of the
        # input, so both rows carry the mono signal.
        _raw = np.resize(_raw, (2, *_raw.shape))
        self._channels = 2

    # Prefer the file's own rate; otherwise take the first alternative the
    # device accepts, and fall back to the native rate if none is accepted.
    target_samplerate = native_rate
    if not check_rate(output_index, self._channels, native_rate):
        try:
            target_samplerate = next(
                rate for rate in alt_rates
                if check_rate(output_index, self._channels, rate)
            )
        except StopIteration:
            # Logger.warn is a deprecated alias; warning() is the supported name.
            logger.warning('Target audio device does not claim to support any sample rates! Attempting playback at native rate')
    self._samplerate = target_samplerate

    if native_rate != self._samplerate:
        # The rates are passed by keyword: recent librosa releases made
        # orig_sr / target_sr keyword-only, older ones accept both forms.
        _raw = librosa.resample(
            _raw,
            orig_sr=native_rate,
            target_sr=self._samplerate,
            fix=True,
            scale=True,
        )

    self._raw = np.ascontiguousarray(self._stereo_transpose(_raw), dtype='float32')

    if speed != 1:
        self.stretch(speed)

    # Index into _raw of the next frame _read_callback will hand out.
    self._pos = 0
    self._playing = False
    self._end_event = AioEvent()
    self._stream = pyaudio.open(
        output_device_index=output_index,
        format=pya.paFloat32,
        channels=self._channels,
        rate=self._samplerate,
        frames_per_buffer=buffer_length,
        output=True,
        stream_callback=self._read_callback,
        start=False)
|
||||
|
||||
@property
|
||||
def length(self):
|
||||
return self._raw.shape[0] / self._samplerate
|
||||
|
||||
def _stereo_transpose(self, ndata):
|
||||
return ndata if self._channels == 1 else ndata.T
|
||||
|
||||
def stretch(self, speed):
    """Replace ``_raw`` with a ``tsm.wsola`` time-stretched copy.

    The samples are transposed before the call and the result transposed
    back, so the stored layout is unchanged.
    """
    wsola_input = self._stereo_transpose(self._raw)
    stretched = tsm.wsola(wsola_input, speed)
    restored = self._stereo_transpose(stretched)
    self._raw = np.ascontiguousarray(restored, dtype='float32')
|
||||
|
||||
def save(self, filename):
    """Write the clip to ``filename`` at the clip's sample rate using ``soundfile``.

    ``_raw`` is already stored with frames along axis 0 (``length`` and
    ``_read_callback`` both index it that way), which is the layout
    ``soundfile.write`` takes, so it is written as-is. Transposing it again
    would present the channel axis as the frame axis.
    """
    soundfile.write(filename, self._raw, self._samplerate)
|
||||
|
||||
def _play(self):
|
||||
self._playing = True
|
||||
self._pos = 0
|
||||
|
||||
if not self._stream.is_active():
|
||||
self._stream.start_stream()
|
||||
|
||||
def play(self):
|
||||
self._end_event.clear()
|
||||
self._play()
|
||||
self._end_event.wait(timeout=self.length)
|
||||
|
||||
async def aplay(self):
|
||||
self._end_event.clear()
|
||||
self._play()
|
||||
try:
|
||||
await self._end_event.coro_wait(timeout=self.length)
|
||||
except asyncio.CancelledError:
|
||||
self._playing = False
|
||||
self._stream.stop_stream()
|
||||
|
||||
def close(self):
|
||||
self._stream.close()
|
||||
|
||||
def _read_callback(self, in_data, frame_count, time_info, status):
    """Stream callback: return ``frame_count`` frames, zero-filled where the clip has none.

    ``in_data``, ``time_info`` and ``status`` are not used. While ``_playing``
    is false the buffer is all zeros. The flag returned is always
    ``pya.paContinue``.
    """
    if self._channels > 1:
        shape = (frame_count, self._channels)
    else:
        shape = (frame_count,)
    out = np.zeros(shape, dtype='float32')

    if self._playing:
        start = self._pos
        stop = start + frame_count
        # Slicing past the end just yields a shorter chunk; the tail of
        # `out` stays zero.
        chunk = self._raw[start:stop]
        self._pos = stop
        out[:chunk.shape[0]] = chunk

        if self._pos >= self._raw.shape[0]:
            # Every frame has been handed out: clear the flag, wake waiters.
            self._playing = False
            self._end_event.set()

    return out, pya.paContinue
|
||||
|
||||
@staticmethod
|
||||
def find_output_index(output):
|
||||
if output is None:
|
|
@ -1,3 +1,3 @@
|
|||
from .WebsocketServerProcess import WebsocketServerProcess
|
||||
from .MainProcess import MainProcess
|
||||
from .Clip import Clip
|
||||
from .Audio import Clip, Stream
|
||||
|
|
|
@ -1,47 +1,75 @@
|
|||
import asyncio
|
||||
from collections import deque
|
||||
|
||||
import maya
|
||||
|
||||
from ovtk_audiencekit.plugins import PluginBase
|
||||
from ovtk_audiencekit.core import Clip
|
||||
from ovtk_audiencekit.core import Clip, Stream
|
||||
|
||||
class AudioAlert(PluginBase):
|
||||
def setup(self, output=None, buffer_length=2048, cutoff_prevention_buffers=None):
|
||||
if cutoff_prevention_buffers:
|
||||
self.logger.info('`cutoff_prevention_buffers` are depricated')
|
||||
def setup(self, output=None, timeout_min=1, sample_rate=None, buffer_length=4096, force_stereo=True):
|
||||
self.force_stereo = force_stereo
|
||||
self.timeout_min = timeout_min
|
||||
self.clips = {}
|
||||
self.streams = {}
|
||||
self.buffer_length = int(buffer_length)
|
||||
self.output_index = Stream.find_output_index(output)
|
||||
if sample_rate is None:
|
||||
try:
|
||||
sample_rate = next((rate for rate in [44100, 48000] if Stream.check_rate(self.output_index, 1, rate)))
|
||||
except StopIteration:
|
||||
self.logger.warn('Target audio device does not claim to support common sample rates! Attempting playback at native rate of audio')
|
||||
|
||||
self.sounds = {}
|
||||
self._buffer_length = int(buffer_length)
|
||||
self._output_index = Clip.find_output_index(output)
|
||||
self._cleanup_task = asyncio.create_task(self._cleanup())
|
||||
|
||||
def run(self, path, speed=1, immediate=True, poly=1, **kwargs):
|
||||
sound = None
|
||||
|
||||
if poly != 1:
|
||||
poly = int(poly)
|
||||
sound_dq = self.sounds.get(path)
|
||||
if sound_dq is None or type(sound_dq) != deque or sound_dq.maxlen != poly:
|
||||
sound_dq = deque(maxlen=poly)
|
||||
self.sounds[path] = sound_dq
|
||||
if len(sound_dq) != poly:
|
||||
self.logger.debug("filling", len(sound_dq), poly, sound_dq)
|
||||
sound = Clip(path,
|
||||
self._output_index,
|
||||
buffer_length=self._buffer_length,
|
||||
speed=speed)
|
||||
sound_dq.append(sound)
|
||||
key = f'{path}@{speed}x'
|
||||
clip = self.clips.get(key, [None, None])[0]
|
||||
|
||||
if clip is None:
|
||||
clip = Clip(path, speed=speed, force_stereo=self.force_stereo)
|
||||
self.clips[key] = [clip, maya.now()]
|
||||
else:
|
||||
self.logger.debug("rotate", len(sound_dq), poly, sound_dq)
|
||||
sound_dq.rotate(1)
|
||||
sound = sound_dq[0]
|
||||
self.clips[key][1] = maya.now()
|
||||
|
||||
stream_dq, refs = self.streams.get(path, (None, set()))
|
||||
if stream_dq is None:
|
||||
stream_dq = deque(maxlen=poly)
|
||||
self.streams[path] = (stream_dq, refs)
|
||||
refs.add(key)
|
||||
|
||||
if stream_dq.maxlen != poly:
|
||||
self.logger.warn('Cannot change poly while streams are active!')
|
||||
|
||||
if len(stream_dq) != poly:
|
||||
stream = Stream(clip, self.output_index,
|
||||
buffer_length=self.buffer_length)
|
||||
stream_dq.append(stream)
|
||||
else:
|
||||
if self.sounds.get(path) is None:
|
||||
self.sounds[path] = Clip(path,
|
||||
self._output_index,
|
||||
buffer_length=self._buffer_length,
|
||||
speed=speed)
|
||||
sound = self.sounds.get(path)
|
||||
stream_dq.rotate(1)
|
||||
stream = stream_dq[0]
|
||||
|
||||
if immediate:
|
||||
asyncio.create_task(sound.aplay())
|
||||
asyncio.create_task(stream.aplay())
|
||||
else:
|
||||
sound.play()
|
||||
stream.play()
|
||||
|
||||
async def _cleanup(self):
    """Forever: every 60 seconds, evict stale clips and close unreferenced streams.

    A clip is dropped once ``timeout_min`` minutes plus the clip's own length
    have passed since its last-used timestamp. Its key is then removed from
    the reference set kept alongside the streams for ``clip.path``. When that
    set becomes empty, every stream for the path is closed and the entry
    deleted.
    """
    while True:
        await asyncio.sleep(60)
        now = maya.now()
        # Iterate over a snapshot: entries are deleted during the walk.
        for key, (clip, last_used) in list(self.clips.items()):
            expires_at = last_used.add(minutes=self.timeout_min, seconds=clip.length)
            if now < expires_at:
                continue

            del self.clips[key]
            self.logger.debug(f'Dropping {key}')

            streams, refs = self.streams.get(clip.path, (None, None))
            if not refs:
                continue

            # discard, not remove: a key that was never registered in this
            # set would raise KeyError and silently end the cleanup task.
            refs.discard(key)
            self.logger.debug(f'Stream {clip.path} now refs {refs}')
            if not refs:
                self.logger.debug('Closing streams...')
                for stream in streams:
                    stream.close()
                del self.streams[clip.path]
|
||||
|
|
|
@ -8,7 +8,7 @@ from TTS.config import load_config
|
|||
|
||||
from ovtk_audiencekit.plugins import PluginBase
|
||||
from ovtk_audiencekit.events import Message, SysMessage
|
||||
from ovtk_audiencekit.core import Clip
|
||||
from ovtk_audiencekit.core import Clip, Stream
|
||||
from ovtk_audiencekit.core.Data import CACHE_DIR
|
||||
|
||||
|
||||
|
@ -18,12 +18,12 @@ class TextToSpeechPlugin(PluginBase):
|
|||
|
||||
self.speaker_wav = speaker_wav
|
||||
|
||||
self._output_index = Clip.find_output_index(output)
|
||||
self.output_index = Stream.find_output_index(output)
|
||||
|
||||
conf_overrides = {k[2:]: v for k, v in kwargs.items() if k.startswith('o_')}
|
||||
|
||||
self._cache = os.path.join(CACHE_DIR, 'tts')
|
||||
os.makedirs(os.path.dirname(self._cache), exist_ok=True)
|
||||
self.cache_dir = os.path.join(CACHE_DIR, 'tts')
|
||||
os.makedirs(os.path.dirname(self.cache_dir), exist_ok=True)
|
||||
|
||||
self.cuda = cuda
|
||||
|
||||
|
@ -36,7 +36,7 @@ class TextToSpeechPlugin(PluginBase):
|
|||
vocoder_path, vocoder_config_path = None, None
|
||||
|
||||
if conf_overrides:
|
||||
override_conf_path = os.path.join(self._cache, f'{self._name}_override.json')
|
||||
override_conf_path = os.path.join(self.cache_dir, f'{self._name}_override.json')
|
||||
|
||||
config = load_config(config_path)
|
||||
for key, value in conf_overrides.items():
|
||||
|
@ -55,7 +55,7 @@ class TextToSpeechPlugin(PluginBase):
|
|||
|
||||
def make_tts_wav(self, text, filename=None):
|
||||
if filename is None:
|
||||
filename = os.path.join(self._cache, f'{uuid.uuid1()}.wav')
|
||||
filename = os.path.join(self.cache_dir, f'{uuid.uuid1()}.wav')
|
||||
|
||||
if self.speaker_wav:
|
||||
wav = self.synthesizer.tts(text, None, 'en', self.speaker_wav)
|
||||
|
@ -72,17 +72,20 @@ class TextToSpeechPlugin(PluginBase):
|
|||
text += '.'
|
||||
filename = self.make_tts_wav(text)
|
||||
# TODO: Play direct from memory
|
||||
clip = Clip(filename, self._output_index, force_stereo=False)
|
||||
clip = Clip(filename, force_stereo=True)
|
||||
stream = Stream(clip, self.output_index)
|
||||
if wait:
|
||||
async def play():
|
||||
await clip.aplay()
|
||||
clip.close()
|
||||
await stream.aplay()
|
||||
stream.close()
|
||||
os.remove(os.path.join(self.cache_dir, filename))
|
||||
asyncio.create_task(play())
|
||||
else:
|
||||
clip.play()
|
||||
clip.close()
|
||||
stream.play()
|
||||
stream.close()
|
||||
os.remove(os.path.join(self.cache_dir, filename))
|
||||
except Exception as e:
|
||||
print(e)
|
||||
self.logger.error(f"Failed to make speech from input: {e}")
|
||||
if source_event := _ctx.get('event'):
|
||||
msg = SysMessage(self._name, 'Failed to make speech from input!!')
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue