diff --git a/homeassistant/components/assist_pipeline/audio_enhancer.py b/homeassistant/components/assist_pipeline/audio_enhancer.py index ff2b122187a7ede9117a6b6579fbb435b64c11c7..1fabc7790e73f5baf2e76376bfa61a29d814f252 100644 --- a/homeassistant/components/assist_pipeline/audio_enhancer.py +++ b/homeassistant/components/assist_pipeline/audio_enhancer.py @@ -22,8 +22,8 @@ class EnhancedAudioChunk: timestamp_ms: int """Timestamp relative to start of audio stream (milliseconds)""" - is_speech: bool | None - """True if audio chunk likely contains speech, False if not, None if unknown""" + speech_probability: float | None + """Probability that audio chunk contains speech (0-1), None if unknown""" class AudioEnhancer(ABC): @@ -70,27 +70,27 @@ class MicroVadSpeexEnhancer(AudioEnhancer): ) self.vad: MicroVad | None = None - self.threshold = 0.5 if self.is_vad_enabled: self.vad = MicroVad() - _LOGGER.debug("Initialized microVAD with threshold=%s", self.threshold) + _LOGGER.debug("Initialized microVAD") def enhance_chunk(self, audio: bytes, timestamp_ms: int) -> EnhancedAudioChunk: """Enhance 10ms chunk of PCM audio @ 16Khz with 16-bit mono samples.""" - is_speech: bool | None = None + speech_probability: float | None = None assert len(audio) == BYTES_PER_CHUNK if self.vad is not None: # Run VAD - speech_prob = self.vad.Process10ms(audio) - is_speech = speech_prob > self.threshold + speech_probability = self.vad.Process10ms(audio) if self.audio_processor is not None: # Run noise suppression and auto gain audio = self.audio_processor.Process10ms(audio).audio return EnhancedAudioChunk( - audio=audio, timestamp_ms=timestamp_ms, is_speech=is_speech + audio=audio, + timestamp_ms=timestamp_ms, + speech_probability=speech_probability, ) diff --git a/homeassistant/components/assist_pipeline/pipeline.py b/homeassistant/components/assist_pipeline/pipeline.py index a4255e377568b17f4ed02d4c690f8700e295c414..a55e23ae05189b81a9ba2fcf3c73d6b2799d2c62 100644 --- a/homeassistant/components/assist_pipeline/pipeline.py +++ b/homeassistant/components/assist_pipeline/pipeline.py @@ -780,7 +780,9 @@ class PipelineRun: # speaking the voice command. audio_chunks_for_stt.extend( EnhancedAudioChunk( - audio=chunk_ts[0], timestamp_ms=chunk_ts[1], is_speech=False + audio=chunk_ts[0], + timestamp_ms=chunk_ts[1], + speech_probability=None, ) for chunk_ts in result.queued_audio ) @@ -827,7 +829,7 @@ class PipelineRun: if wake_word_vad is not None: chunk_seconds = (len(chunk.audio) // sample_width) / sample_rate - if not wake_word_vad.process(chunk_seconds, chunk.is_speech): + if not wake_word_vad.process(chunk_seconds, chunk.speech_probability): raise WakeWordTimeoutError( code="wake-word-timeout", message="Wake word was not detected" ) @@ -955,7 +957,7 @@ class PipelineRun: if stt_vad is not None: chunk_seconds = (len(chunk.audio) // sample_width) / sample_rate - if not stt_vad.process(chunk_seconds, chunk.is_speech): + if not stt_vad.process(chunk_seconds, chunk.speech_probability): # Silence detected at the end of voice command self.process_event( PipelineEvent( @@ -1221,7 +1223,7 @@ class PipelineRun: yield EnhancedAudioChunk( audio=sub_chunk, timestamp_ms=timestamp_ms, - is_speech=None, # no VAD + speech_probability=None, # no VAD ) timestamp_ms += MS_PER_CHUNK diff --git a/homeassistant/components/assist_pipeline/vad.py b/homeassistant/components/assist_pipeline/vad.py index 4782d14dee47da294e56c06db62721a03c074528..deae5b9b7b38764b5e8283bce38b9797b214d8db 100644 --- a/homeassistant/components/assist_pipeline/vad.py +++ b/homeassistant/components/assist_pipeline/vad.py @@ -75,7 +75,7 @@ class AudioBuffer: class VoiceCommandSegmenter: """Segments an audio stream into voice commands.""" - speech_seconds: float = 0.3 + speech_seconds: float = 0.1 """Seconds of speech before voice command has started.""" command_seconds: float = 1.0 @@ -96,6 +96,12 @@ class VoiceCommandSegmenter: timed_out: bool = False """True a timeout occurred during voice command.""" + before_command_speech_threshold: float = 0.2 + """Probability threshold for speech before voice command.""" + + in_command_speech_threshold: float = 0.5 + """Probability threshold for speech during voice command.""" + _speech_seconds_left: float = 0.0 """Seconds left before considering voice command as started.""" @@ -124,7 +130,7 @@ class VoiceCommandSegmenter: self._reset_seconds_left = self.reset_seconds self.in_command = False - def process(self, chunk_seconds: float, is_speech: bool | None) -> bool: + def process(self, chunk_seconds: float, speech_probability: float | None) -> bool: """Process samples using external VAD. Returns False when command is done. @@ -142,7 +148,12 @@ class VoiceCommandSegmenter: self.timed_out = True return False + if speech_probability is None: + speech_probability = 0.0 + if not self.in_command: + # Before command + is_speech = speech_probability > self.before_command_speech_threshold if is_speech: self._reset_seconds_left = self.reset_seconds self._speech_seconds_left -= chunk_seconds @@ -160,24 +171,29 @@ class VoiceCommandSegmenter: if self._reset_seconds_left <= 0: self._speech_seconds_left = self.speech_seconds self._reset_seconds_left = self.reset_seconds - elif not is_speech: - # Silence in command - self._reset_seconds_left = self.reset_seconds - self._silence_seconds_left -= chunk_seconds - self._command_seconds_left -= chunk_seconds - if (self._silence_seconds_left <= 0) and (self._command_seconds_left <= 0): - # Command finished successfully - self.reset() - _LOGGER.debug("Voice command finished") - return False else: - # Speech in command. - # Reset silence counter if enough speech. - self._reset_seconds_left -= chunk_seconds - self._command_seconds_left -= chunk_seconds - if self._reset_seconds_left <= 0: - self._silence_seconds_left = self.silence_seconds + # In command + is_speech = speech_probability > self.in_command_speech_threshold + if not is_speech: + # Silence in command self._reset_seconds_left = self.reset_seconds + self._silence_seconds_left -= chunk_seconds + self._command_seconds_left -= chunk_seconds + if (self._silence_seconds_left <= 0) and ( + self._command_seconds_left <= 0 + ): + # Command finished successfully + self.reset() + _LOGGER.debug("Voice command finished") + return False + else: + # Speech in command. + # Reset silence counter if enough speech. + self._reset_seconds_left -= chunk_seconds + self._command_seconds_left -= chunk_seconds + if self._reset_seconds_left <= 0: + self._silence_seconds_left = self.silence_seconds + self._reset_seconds_left = self.reset_seconds return True @@ -226,6 +242,9 @@ class VoiceActivityTimeout: reset_seconds: float = 0.5 """Seconds of speech before resetting timeout.""" + speech_threshold: float = 0.5 + """Threshold for speech.""" + _silence_seconds_left: float = 0.0 """Seconds left before considering voice command as stopped.""" @@ -241,12 +260,15 @@ class VoiceActivityTimeout: self._silence_seconds_left = self.silence_seconds self._reset_seconds_left = self.reset_seconds - def process(self, chunk_seconds: float, is_speech: bool | None) -> bool: + def process(self, chunk_seconds: float, speech_probability: float | None) -> bool: """Process samples using external VAD. Returns False when timeout is reached. """ - if is_speech: + if speech_probability is None: + speech_probability = 0.0 + + if speech_probability > self.speech_threshold: # Speech self._reset_seconds_left -= chunk_seconds if self._reset_seconds_left <= 0: diff --git a/tests/components/assist_pipeline/test_vad.py b/tests/components/assist_pipeline/test_vad.py index fda26d2fb94e66f74a39ce9b95a4349de25edd71..bd07601cd5d10f2459eda98de5701407e29c1169 100644 --- a/tests/components/assist_pipeline/test_vad.py +++ b/tests/components/assist_pipeline/test_vad.py @@ -16,7 +16,7 @@ def test_silence() -> None: segmenter = VoiceCommandSegmenter() # True return value indicates voice command has not finished - assert segmenter.process(_ONE_SECOND * 3, False) + assert segmenter.process(_ONE_SECOND * 3, 0.0) assert not segmenter.in_command @@ -26,15 +26,15 @@ def test_speech() -> None: segmenter = VoiceCommandSegmenter() # silence - assert segmenter.process(_ONE_SECOND, False) + assert segmenter.process(_ONE_SECOND, 0.0) # "speech" - assert segmenter.process(_ONE_SECOND, True) + assert segmenter.process(_ONE_SECOND, 1.0) assert segmenter.in_command # silence # False return value indicates voice command is finished - assert not segmenter.process(_ONE_SECOND, False) + assert not segmenter.process(_ONE_SECOND, 0.0) assert not segmenter.in_command @@ -112,19 +112,19 @@ def test_silence_seconds() -> None: segmenter = VoiceCommandSegmenter(silence_seconds=1.0) # silence - assert segmenter.process(_ONE_SECOND, False) + assert segmenter.process(_ONE_SECOND, 0.0) assert not segmenter.in_command # "speech" - assert segmenter.process(_ONE_SECOND, True) + assert segmenter.process(_ONE_SECOND, 1.0) assert segmenter.in_command # not enough silence to end - assert segmenter.process(_ONE_SECOND * 0.5, False) + assert segmenter.process(_ONE_SECOND * 0.5, 0.0) assert segmenter.in_command # exactly enough silence now - assert not segmenter.process(_ONE_SECOND * 0.5, False) + assert not segmenter.process(_ONE_SECOND * 0.5, 0.0) assert not segmenter.in_command @@ -134,27 +134,27 @@ def test_silence_reset() -> None: segmenter = VoiceCommandSegmenter(silence_seconds=1.0, reset_seconds=0.5) # silence - assert segmenter.process(_ONE_SECOND, False) + assert segmenter.process(_ONE_SECOND, 0.0) assert not segmenter.in_command # "speech" - assert segmenter.process(_ONE_SECOND, True) + assert segmenter.process(_ONE_SECOND, 1.0) assert segmenter.in_command # not enough silence to end - assert segmenter.process(_ONE_SECOND * 0.5, False) + assert segmenter.process(_ONE_SECOND * 0.5, 0.0) assert segmenter.in_command # speech should reset silence detection - assert segmenter.process(_ONE_SECOND * 0.5, True) + assert segmenter.process(_ONE_SECOND * 0.5, 1.0) assert segmenter.in_command # not enough silence to end - assert segmenter.process(_ONE_SECOND * 0.5, False) + assert segmenter.process(_ONE_SECOND * 0.5, 0.0) assert segmenter.in_command # exactly enough silence now - assert not segmenter.process(_ONE_SECOND * 0.5, False) + assert not segmenter.process(_ONE_SECOND * 0.5, 0.0) assert not segmenter.in_command @@ -166,23 +166,23 @@ def test_speech_reset() -> None: ) # silence - assert segmenter.process(_ONE_SECOND, False) + assert segmenter.process(_ONE_SECOND, 0.0) assert not segmenter.in_command # not enough speech to start voice command - assert segmenter.process(_ONE_SECOND * 0.5, True) + assert segmenter.process(_ONE_SECOND * 0.5, 1.0) assert not segmenter.in_command # silence should reset speech detection - assert segmenter.process(_ONE_SECOND, False) + assert segmenter.process(_ONE_SECOND, 0.0) assert not segmenter.in_command # not enough speech to start voice command - assert segmenter.process(_ONE_SECOND * 0.5, True) + assert segmenter.process(_ONE_SECOND * 0.5, 1.0) assert not segmenter.in_command # exactly enough speech now - assert segmenter.process(_ONE_SECOND * 0.5, True) + assert segmenter.process(_ONE_SECOND * 0.5, 1.0) assert segmenter.in_command @@ -193,18 +193,18 @@ def test_timeout() -> None: # not enough to time out assert not segmenter.timed_out - assert segmenter.process(_ONE_SECOND * 0.5, False) + assert segmenter.process(_ONE_SECOND * 0.5, 0.0) assert not segmenter.timed_out # enough to time out - assert not segmenter.process(_ONE_SECOND * 0.5, True) + assert not segmenter.process(_ONE_SECOND * 0.5, 1.0) assert segmenter.timed_out # flag resets with more audio - assert segmenter.process(_ONE_SECOND * 0.5, True) + assert segmenter.process(_ONE_SECOND * 0.5, 1.0) assert not segmenter.timed_out - assert not segmenter.process(_ONE_SECOND * 0.5, False) + assert not segmenter.process(_ONE_SECOND * 0.5, 0.0) assert segmenter.timed_out @@ -215,14 +215,38 @@ def test_command_seconds() -> None: command_seconds=3, speech_seconds=1, silence_seconds=1, reset_seconds=1 ) - assert segmenter.process(_ONE_SECOND, True) + assert segmenter.process(_ONE_SECOND, 1.0) # Silence counts towards total command length - assert segmenter.process(_ONE_SECOND * 0.5, False) + assert segmenter.process(_ONE_SECOND * 0.5, 0.0) # Enough to finish command now - assert segmenter.process(_ONE_SECOND, True) - assert segmenter.process(_ONE_SECOND * 0.5, False) + assert segmenter.process(_ONE_SECOND, 1.0) + assert segmenter.process(_ONE_SECOND * 0.5, 0.0) # Silence to finish - assert not segmenter.process(_ONE_SECOND * 0.5, False) + assert not segmenter.process(_ONE_SECOND * 0.5, 0.0) + + +def test_speech_thresholds() -> None: + """Test before/in command speech thresholds.""" + + segmenter = VoiceCommandSegmenter( + before_command_speech_threshold=0.2, + in_command_speech_threshold=0.5, + command_seconds=2, + speech_seconds=1, + silence_seconds=1, + ) + + # Not high enough probability to trigger command + assert segmenter.process(_ONE_SECOND, 0.1) + assert not segmenter.in_command + + # Triggers command + assert segmenter.process(_ONE_SECOND, 0.3) + assert segmenter.in_command + + # Now that same probability is considered silence. + # Finishes command. + assert not segmenter.process(_ONE_SECOND, 0.3)