Fixes re: zognia's testing #2

Merged
skeh merged 11 commits from feat/zogpog into main 2025-01-24 08:28:14 +00:00
4 changed files with 133 additions and 104 deletions
Showing only changes of commit 27f0997d6a - Show all commits

View file

@@ -25,71 +25,59 @@ os.close(old_stderr)
logger = logging.getLogger(__name__)
def check_rate(index, channels, rate):
try:
return pyaudio.is_format_supported(rate,
output_channels=channels,
output_device=index,
output_format=pya.paFloat32)
except ValueError:
return False
alt_rates = [44100, 48000]
class Clip:
def __init__(self, path, output_index, buffer_length=2048, speed=1, force_stereo=True):
_raw, native_rate = librosa.load(path, sr=None, dtype='float32', mono=False)
self._channels = _raw.shape[0] if len(_raw.shape) == 2 else 1
if force_stereo and self._channels == 1:
_raw = np.resize(_raw, (2,*_raw.shape))
self._channels = 2
def __init__(self, path, samplerate=None, speed=1, force_stereo=True):
self.path = path
raw, native_rate = librosa.load(self.path, sr=None, dtype='float32', mono=False)
target_samplerate = native_rate
if not check_rate(output_index, self._channels , native_rate):
try:
target_samplerate = next((rate for rate in alt_rates if check_rate(output_index, self._channels , rate)))
except StopIteration:
logger.warn('Target audio device does not claim to support any sample rates! Attempting playback at native rate')
self._samplerate = target_samplerate
self.channels = raw.shape[0] if len(raw.shape) == 2 else 1
if force_stereo and self.channels == 1:
raw = np.resize(raw, (2,*raw.shape))
self.channels = 2
if native_rate != self._samplerate:
_raw = librosa.resample(_raw, native_rate, self._samplerate, fix=True, scale=True)
self.samplerate = samplerate or native_rate
if native_rate != self.samplerate:
raw = librosa.resample(raw, native_rate, self.samplerate, fix=True, scale=True)
self._raw = np.ascontiguousarray(self._stereo_transpose(_raw), dtype='float32')
self.raw = np.ascontiguousarray(self._stereo_transpose(raw), dtype='float32')
if speed != 1:
self.stretch(speed)
self._pos = 0
self._playing = False
@property
def length(self):
return self.raw.shape[0] / self.samplerate
def _stereo_transpose(self, ndata):
return ndata if self.channels == 1 else ndata.T
def stretch(self, speed):
stretched = tsm.wsola(self._stereo_transpose(self.raw), speed)
self.raw = np.ascontiguousarray(self._stereo_transpose(stretched), dtype='float32')
def save(self, filename):
soundfile.write(filename, self._stereo_transpose(self.raw), self.samplerate)
class Stream:
def __init__(self, clip, output_index, buffer_length=4096):
self.clip = clip
self.pos = 0
self.playing = False
self._end_event = AioEvent()
self._stream = pyaudio.open(
output_device_index=output_index,
format=pya.paFloat32,
channels=self._channels,
rate=self._samplerate,
channels=self.clip.channels,
rate=self.clip.samplerate,
frames_per_buffer=buffer_length,
output=True,
stream_callback=self._read_callback,
start=False)
@property
def length(self):
return self._raw.shape[0] / self._samplerate
def _stereo_transpose(self, ndata):
return ndata if self._channels == 1 else ndata.T
def stretch(self, speed):
stretched = tsm.wsola(self._stereo_transpose(self._raw), speed)
self._raw = np.ascontiguousarray(self._stereo_transpose(stretched), dtype='float32')
def save(self, filename):
soundfile.write(filename, self._stereo_transpose(self._raw), self._samplerate)
def _play(self):
self._playing = True
self._pos = 0
self.playing = True
self.pos = 0
if not self._stream.is_active():
self._stream.start_stream()
@@ -97,38 +85,48 @@ class Clip:
def play(self):
self._end_event.clear()
self._play()
self._end_event.wait(timeout=self.length)
self._end_event.wait(timeout=self.clip.length)
async def aplay(self):
self._end_event.clear()
self._play()
try:
await self._end_event.coro_wait(timeout=self.length)
await self._end_event.coro_wait(timeout=self.clip.length)
except asyncio.CancelledError:
self._playing = False
self.playing = False
self._stream.stop_stream()
def close(self):
self._stream.close()
def _read_callback(self, in_data, frame_count, time_info, status):
if self._channels > 1:
buffer = np.zeros((frame_count, self._channels), dtype='float32')
if self.clip.channels > 1:
buffer = np.zeros((frame_count, self.clip.channels), dtype='float32')
else:
buffer = np.zeros((frame_count,), dtype='float32')
if self._playing:
newpos = self._pos + frame_count
clip_chunk = self._raw[self._pos:newpos]
self._pos = newpos
if self.playing:
newpos = self.pos + frame_count
clip_chunk = self.clip.raw[self.pos:newpos]
self.pos = newpos
buffer[0:clip_chunk.shape[0]] = clip_chunk
if self._pos >= self._raw.shape[0]:
self._playing = False
if self.pos >= self.clip.raw.shape[0]:
self.playing = False
self._end_event.set()
return buffer, pya.paContinue
@staticmethod
def check_rate(index, channels, rate):
try:
return pyaudio.is_format_supported(rate,
output_channels=channels,
output_device=index,
output_format=pya.paFloat32)
except ValueError:
return False
@staticmethod
def find_output_index(output):
if output is None:

View file

@@ -1,3 +1,3 @@
from .WebsocketServerProcess import WebsocketServerProcess
from .MainProcess import MainProcess
from .Clip import Clip
from .Audio import Clip, Stream

View file

@@ -1,47 +1,75 @@
import asyncio
from collections import deque
import maya
from ovtk_audiencekit.plugins import PluginBase
from ovtk_audiencekit.core import Clip
from ovtk_audiencekit.core import Clip, Stream
class AudioAlert(PluginBase):
def setup(self, output=None, buffer_length=2048, cutoff_prevention_buffers=None):
if cutoff_prevention_buffers:
self.logger.info('`cutoff_prevention_buffers` are depricated')
def setup(self, output=None, timeout_min=1, sample_rate=None, buffer_length=4096, force_stereo=True):
self.force_stereo = force_stereo
self.timeout_min = timeout_min
self.clips = {}
self.streams = {}
self.buffer_length = int(buffer_length)
self.output_index = Stream.find_output_index(output)
if sample_rate is None:
try:
sample_rate = next((rate for rate in [44100, 48000] if Stream.check_rate(self.output_index, 1, rate)))
except StopIteration:
self.logger.warn('Target audio device does not claim to support common sample rates! Attempting playback at native rate of audio')
self.sounds = {}
self._buffer_length = int(buffer_length)
self._output_index = Clip.find_output_index(output)
self._cleanup_task = asyncio.create_task(self._cleanup())
def run(self, path, speed=1, immediate=True, poly=1, **kwargs):
sound = None
poly = int(poly)
key = f'{path}@{speed}x'
clip = self.clips.get(key, [None, None])[0]
if poly != 1:
poly = int(poly)
sound_dq = self.sounds.get(path)
if sound_dq is None or type(sound_dq) != deque or sound_dq.maxlen != poly:
sound_dq = deque(maxlen=poly)
self.sounds[path] = sound_dq
if len(sound_dq) != poly:
self.logger.debug("filling", len(sound_dq), poly, sound_dq)
sound = Clip(path,
self._output_index,
buffer_length=self._buffer_length,
speed=speed)
sound_dq.append(sound)
else:
self.logger.debug("rotate", len(sound_dq), poly, sound_dq)
sound_dq.rotate(1)
sound = sound_dq[0]
if clip is None:
clip = Clip(path, speed=speed, force_stereo=self.force_stereo)
self.clips[key] = [clip, maya.now()]
else:
if self.sounds.get(path) is None:
self.sounds[path] = Clip(path,
self._output_index,
buffer_length=self._buffer_length,
speed=speed)
sound = self.sounds.get(path)
self.clips[key][1] = maya.now()
stream_dq, refs = self.streams.get(path, (None, set()))
if stream_dq is None:
stream_dq = deque(maxlen=poly)
self.streams[path] = (stream_dq, refs)
refs.add(key)
if stream_dq.maxlen != poly:
self.logger.warn('Cannot change poly while streams are active!')
if len(stream_dq) != poly:
stream = Stream(clip, self.output_index,
buffer_length=self.buffer_length)
stream_dq.append(stream)
else:
stream_dq.rotate(1)
stream = stream_dq[0]
if immediate:
asyncio.create_task(sound.aplay())
asyncio.create_task(stream.aplay())
else:
sound.play()
stream.play()
async def _cleanup(self):
while True:
await asyncio.sleep(60)
now = maya.now()
for key, [clip, last_used] in list(self.clips.items()):
if now >= last_used.add(minutes=self.timeout_min, seconds=clip.length):
del self.clips[key]
self.logger.debug(f'Dropping {key}')
streams, refs = self.streams.get(clip.path, (None, None))
if refs:
refs.remove(key)
self.logger.debug(f'Stream {clip.path} now refs {refs}')
if len(refs) == 0:
self.logger.debug('Closing streams...')
for stream in streams:
stream.close()
del self.streams[clip.path]

View file

@@ -8,7 +8,7 @@ from TTS.config import load_config
from ovtk_audiencekit.plugins import PluginBase
from ovtk_audiencekit.events import Message, SysMessage
from ovtk_audiencekit.core import Clip
from ovtk_audiencekit.core import Clip, Stream
from ovtk_audiencekit.core.Data import CACHE_DIR
@@ -18,12 +18,12 @@ class TextToSpeechPlugin(PluginBase):
self.speaker_wav = speaker_wav
self._output_index = Clip.find_output_index(output)
self.output_index = Stream.find_output_index(output)
conf_overrides = {k[2:]: v for k, v in kwargs.items() if k.startswith('o_')}
self._cache = os.path.join(CACHE_DIR, 'tts')
os.makedirs(os.path.dirname(self._cache), exist_ok=True)
self.cache_dir = os.path.join(CACHE_DIR, 'tts')
os.makedirs(os.path.dirname(self.cache_dir), exist_ok=True)
self.cuda = cuda
@@ -36,7 +36,7 @@ class TextToSpeechPlugin(PluginBase):
vocoder_path, vocoder_config_path = None, None
if conf_overrides:
override_conf_path = os.path.join(self._cache, f'{self._name}_override.json')
override_conf_path = os.path.join(self.cache_dir, f'{self._name}_override.json')
config = load_config(config_path)
for key, value in conf_overrides.items():
@@ -55,7 +55,7 @@ class TextToSpeechPlugin(PluginBase):
def make_tts_wav(self, text, filename=None):
if filename is None:
filename = os.path.join(self._cache, f'{uuid.uuid1()}.wav')
filename = os.path.join(self.cache_dir, f'{uuid.uuid1()}.wav')
if self.speaker_wav:
wav = self.synthesizer.tts(text, None, 'en', self.speaker_wav)
@@ -72,17 +72,20 @@ class TextToSpeechPlugin(PluginBase):
text += '.'
filename = self.make_tts_wav(text)
# TODO: Play direct from memory
clip = Clip(filename, self._output_index, force_stereo=False)
clip = Clip(filename, force_stereo=True)
stream = Stream(clip, self.output_index)
if wait:
async def play():
await clip.aplay()
clip.close()
await stream.aplay()
stream.close()
os.remove(os.path.join(self.cache_dir, filename))
asyncio.create_task(play())
else:
clip.play()
clip.close()
stream.play()
stream.close()
os.remove(os.path.join(self.cache_dir, filename))
except Exception as e:
print(e)
self.logger.error(f"Failed to make speech from input: {e}")
if source_event := _ctx.get('event'):
msg = SysMessage(self._name, 'Failed to make speech from input!!')