From 7f4dabf546f93efa3271152c0f300a824ea06fa0 Mon Sep 17 00:00:00 2001
From: Michael Hansen <mike@rhasspy.org>
Date: Wed, 31 Jul 2024 02:42:45 -0500
Subject: [PATCH] Switch from WebRTC to microVAD (#122861)

* Switch WebRTC to microVAD

* Remove webrtc-noise-gain from licenses
---
 .../assist_pipeline/audio_enhancer.py         |  82 +++++++++
 .../components/assist_pipeline/const.py       |   5 +
 .../components/assist_pipeline/manifest.json  |   2 +-
 .../components/assist_pipeline/pipeline.py    | 164 +++++++++---------
 .../components/assist_pipeline/vad.py         |  66 ++-----
 .../assist_pipeline/websocket_api.py          |  20 ++-
 homeassistant/components/voip/voip.py         |  38 ++--
 homeassistant/package_constraints.txt         |   2 +-
 requirements_all.txt                          |   6 +-
 requirements_test_all.txt                     |   6 +-
 script/licenses.py                            |   1 -
 tests/components/assist_pipeline/test_init.py |  43 ++---
 tests/components/assist_pipeline/test_vad.py  | 124 ++++---------
 .../assist_pipeline/test_websocket.py         |  61 ++-----
 tests/components/voip/test_voip.py            |  55 +++---
 15 files changed, 324 insertions(+), 351 deletions(-)
 create mode 100644 homeassistant/components/assist_pipeline/audio_enhancer.py

diff --git a/homeassistant/components/assist_pipeline/audio_enhancer.py b/homeassistant/components/assist_pipeline/audio_enhancer.py
new file mode 100644
index 00000000000..e7a149bd00e
--- /dev/null
+++ b/homeassistant/components/assist_pipeline/audio_enhancer.py
@@ -0,0 +1,82 @@
+"""Audio enhancement for Assist."""
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+import logging
+
+from pymicro_vad import MicroVad
+
+_LOGGER = logging.getLogger(__name__)
+
+
+@dataclass(frozen=True, slots=True)
+class EnhancedAudioChunk:
+    """Immutable chunk of audio with its stream timestamp and VAD result."""
+
+    audio: bytes
+    """Raw PCM audio @ 16Khz with 16-bit mono samples"""
+
+    timestamp_ms: int
+    """Timestamp relative to start of audio stream (milliseconds)"""
+
+    is_speech: bool | None
+    """True if audio chunk likely contains speech, False if not, None if unknown"""
+
+
+class AudioEnhancer(ABC):
+    """Abstract interface for chunk-wise audio enhancement and VAD."""
+
+    def __init__(
+        self, auto_gain: int, noise_suppression: int, is_vad_enabled: bool
+    ) -> None:
+        """Store the enhancement settings; subclasses decide how to use them."""
+        self.auto_gain = auto_gain
+        self.noise_suppression = noise_suppression
+        self.is_vad_enabled = is_vad_enabled
+
+    @abstractmethod
+    def enhance_chunk(self, audio: bytes, timestamp_ms: int) -> EnhancedAudioChunk:
+        """Enhance one chunk of 16 kHz, 16-bit mono PCM audio and return it."""
+
+    @property
+    @abstractmethod
+    def samples_per_chunk(self) -> int | None:
+        """Return samples required per chunk, or None if chunking isn't required."""
+
+
+class MicroVadEnhancer(AudioEnhancer):
+    """Audio enhancer that only runs microVAD; audio is returned unmodified."""
+
+    def __init__(
+        self, auto_gain: int, noise_suppression: int, is_vad_enabled: bool
+    ) -> None:
+        """Store settings and create the microVAD instance only if VAD is enabled."""
+        super().__init__(auto_gain, noise_suppression, is_vad_enabled)
+
+        self.vad: MicroVad | None = None
+        self.threshold = 0.5  # VAD result above this counts as speech
+
+        if self.is_vad_enabled:
+            self.vad = MicroVad()
+            _LOGGER.debug("Initialized microVAD with threshold=%s", self.threshold)
+
+    def enhance_chunk(self, audio: bytes, timestamp_ms: int) -> EnhancedAudioChunk:
+        """Tag a 16 kHz 16-bit mono PCM chunk with a VAD result; audio is unchanged."""
+        is_speech: bool | None = None
+
+        if self.vad is not None:
+            # Run VAD; is_speech stays None (unknown) when VAD is disabled
+            speech_prob = self.vad.Process10ms(audio)
+            is_speech = speech_prob > self.threshold
+
+        return EnhancedAudioChunk(
+            audio=audio, timestamp_ms=timestamp_ms, is_speech=is_speech
+        )
+
+    @property
+    def samples_per_chunk(self) -> int | None:
+        """Return 160 samples when VAD is enabled, or None if chunking isn't needed."""
+        if self.is_vad_enabled:
+            return 160  # 10 ms @ 16 kHz, the chunk size handed to Process10ms
+
+        return None
diff --git a/homeassistant/components/assist_pipeline/const.py b/homeassistant/components/assist_pipeline/const.py
index 36b72dad69c..14b93a90372 100644
--- a/homeassistant/components/assist_pipeline/const.py
+++ b/homeassistant/components/assist_pipeline/const.py
@@ -15,3 +15,8 @@ DATA_LAST_WAKE_UP = f"{DOMAIN}.last_wake_up"
 WAKE_WORD_COOLDOWN = 2  # seconds
 
 EVENT_RECORDING = f"{DOMAIN}_recording"
+
+SAMPLE_RATE = 16000  # hertz
+SAMPLE_WIDTH = 2  # bytes
+SAMPLE_CHANNELS = 1  # mono
+SAMPLES_PER_CHUNK = 240  # 15 ms @ 16 kHz (240 / 16000 s)
diff --git a/homeassistant/components/assist_pipeline/manifest.json b/homeassistant/components/assist_pipeline/manifest.json
index 31b3b0d4e32..b22ce72b1eb 100644
--- a/homeassistant/components/assist_pipeline/manifest.json
+++ b/homeassistant/components/assist_pipeline/manifest.json
@@ -6,5 +6,5 @@
   "documentation": "https://www.home-assistant.io/integrations/assist_pipeline",
   "iot_class": "local_push",
   "quality_scale": "internal",
-  "requirements": ["webrtc-noise-gain==1.2.3"]
+  "requirements": ["pymicro-vad==1.0.0"]
 }
diff --git a/homeassistant/components/assist_pipeline/pipeline.py b/homeassistant/components/assist_pipeline/pipeline.py
index ecf361cb67c..845950caf8d 100644
--- a/homeassistant/components/assist_pipeline/pipeline.py
+++ b/homeassistant/components/assist_pipeline/pipeline.py
@@ -13,14 +13,11 @@ from pathlib import Path
 from queue import Empty, Queue
 from threading import Thread
 import time
-from typing import TYPE_CHECKING, Any, Final, Literal, cast
+from typing import Any, Literal, cast
 import wave
 
 import voluptuous as vol
 
-if TYPE_CHECKING:
-    from webrtc_noise_gain import AudioProcessor
-
 from homeassistant.components import (
     conversation,
     media_source,
@@ -52,12 +49,17 @@ from homeassistant.util import (
 )
 from homeassistant.util.limited_size_dict import LimitedSizeDict
 
+from .audio_enhancer import AudioEnhancer, EnhancedAudioChunk, MicroVadEnhancer
 from .const import (
     CONF_DEBUG_RECORDING_DIR,
     DATA_CONFIG,
     DATA_LAST_WAKE_UP,
     DATA_MIGRATIONS,
     DOMAIN,
+    SAMPLE_CHANNELS,
+    SAMPLE_RATE,
+    SAMPLE_WIDTH,
+    SAMPLES_PER_CHUNK,
     WAKE_WORD_COOLDOWN,
 )
 from .error import (
@@ -111,9 +113,6 @@ STORED_PIPELINE_RUNS = 10
 
 SAVE_DELAY = 10
 
-AUDIO_PROCESSOR_SAMPLES: Final = 160  # 10 ms @ 16 Khz
-AUDIO_PROCESSOR_BYTES: Final = AUDIO_PROCESSOR_SAMPLES * 2  # 16-bit samples
-
 
 @callback
 def _async_resolve_default_pipeline_settings(
@@ -503,8 +502,8 @@ class AudioSettings:
     is_vad_enabled: bool = True
     """True if VAD is used to determine the end of the voice command."""
 
-    is_chunking_enabled: bool = True
-    """True if audio is automatically split into 10 ms chunks (required for VAD, etc.)"""
+    samples_per_chunk: int | None = None
+    """Number of samples that will be in each audio chunk (None for no chunking)."""
 
     def __post_init__(self) -> None:
         """Verify settings post-initialization."""
@@ -514,9 +513,6 @@ class AudioSettings:
         if (self.auto_gain_dbfs < 0) or (self.auto_gain_dbfs > 31):
             raise ValueError("auto_gain_dbfs must be in [0, 31]")
 
-        if self.needs_processor and (not self.is_chunking_enabled):
-            raise ValueError("Chunking must be enabled for audio processing")
-
     @property
     def needs_processor(self) -> bool:
         """True if an audio processor is needed."""
@@ -526,19 +522,10 @@ class AudioSettings:
             or (self.auto_gain_dbfs > 0)
         )
 
-
-@dataclass(frozen=True, slots=True)
-class ProcessedAudioChunk:
-    """Processed audio chunk and metadata."""
-
-    audio: bytes
-    """Raw PCM audio @ 16Khz with 16-bit mono samples"""
-
-    timestamp_ms: int
-    """Timestamp relative to start of audio stream (milliseconds)"""
-
-    is_speech: bool | None
-    """True if audio chunk likely contains speech, False if not, None if unknown"""
+    @property
+    def is_chunking_enabled(self) -> bool:
+        """True if chunk size is set."""
+        return self.samples_per_chunk is not None
 
 
 @dataclass
@@ -573,10 +560,10 @@ class PipelineRun:
     debug_recording_queue: Queue[str | bytes | None] | None = None
     """Queue to communicate with debug recording thread"""
 
-    audio_processor: AudioProcessor | None = None
+    audio_enhancer: AudioEnhancer | None = None
     """VAD/noise suppression/auto gain"""
 
-    audio_processor_buffer: AudioBuffer = field(init=False, repr=False)
+    audio_chunking_buffer: AudioBuffer | None = None
     """Buffer used when splitting audio into chunks for audio processing"""
 
     _device_id: str | None = None
@@ -601,19 +588,16 @@ class PipelineRun:
         pipeline_data.pipeline_runs.add_run(self)
 
         # Initialize with audio settings
-        self.audio_processor_buffer = AudioBuffer(AUDIO_PROCESSOR_BYTES)
-        if self.audio_settings.needs_processor:
-            # Delay import of webrtc so HA start up is not crashing
-            # on older architectures (armhf).
-            #
-            # pylint: disable=import-outside-toplevel
-            from webrtc_noise_gain import AudioProcessor
-
-            self.audio_processor = AudioProcessor(
+        if self.audio_settings.needs_processor and (self.audio_enhancer is None):
+            # Default audio enhancer
+            self.audio_enhancer = MicroVadEnhancer(
                 self.audio_settings.auto_gain_dbfs,
                 self.audio_settings.noise_suppression_level,
+                self.audio_settings.is_vad_enabled,
             )
 
+        self.audio_chunking_buffer = AudioBuffer(self.samples_per_chunk * SAMPLE_WIDTH)
+
     def __eq__(self, other: object) -> bool:
         """Compare pipeline runs by id."""
         if isinstance(other, PipelineRun):
@@ -621,6 +605,14 @@ class PipelineRun:
 
         return False
 
+    @property
+    def samples_per_chunk(self) -> int:
+        """Return number of samples expected in each audio chunk."""
+        if self.audio_enhancer is not None:
+            return self.audio_enhancer.samples_per_chunk or SAMPLES_PER_CHUNK
+
+        return self.audio_settings.samples_per_chunk or SAMPLES_PER_CHUNK
+
     @callback
     def process_event(self, event: PipelineEvent) -> None:
         """Log an event and call listener."""
@@ -688,8 +680,8 @@ class PipelineRun:
 
     async def wake_word_detection(
         self,
-        stream: AsyncIterable[ProcessedAudioChunk],
-        audio_chunks_for_stt: list[ProcessedAudioChunk],
+        stream: AsyncIterable[EnhancedAudioChunk],
+        audio_chunks_for_stt: list[EnhancedAudioChunk],
     ) -> wake_word.DetectionResult | None:
         """Run wake-word-detection portion of pipeline. Returns detection result."""
         metadata_dict = asdict(
@@ -732,10 +724,11 @@ class PipelineRun:
         # Audio chunk buffer. This audio will be forwarded to speech-to-text
         # after wake-word-detection.
         num_audio_chunks_to_buffer = int(
-            (wake_word_settings.audio_seconds_to_buffer * 16000)
-            / AUDIO_PROCESSOR_SAMPLES
+            (wake_word_settings.audio_seconds_to_buffer * SAMPLE_RATE)
+            / self.samples_per_chunk
         )
-        stt_audio_buffer: deque[ProcessedAudioChunk] | None = None
+
+        stt_audio_buffer: deque[EnhancedAudioChunk] | None = None
         if num_audio_chunks_to_buffer > 0:
             stt_audio_buffer = deque(maxlen=num_audio_chunks_to_buffer)
 
@@ -797,7 +790,7 @@ class PipelineRun:
                 # speech-to-text so the user does not have to pause before
                 # speaking the voice command.
                 audio_chunks_for_stt.extend(
-                    ProcessedAudioChunk(
+                    EnhancedAudioChunk(
                         audio=chunk_ts[0], timestamp_ms=chunk_ts[1], is_speech=False
                     )
                     for chunk_ts in result.queued_audio
@@ -819,18 +812,17 @@ class PipelineRun:
 
     async def _wake_word_audio_stream(
         self,
-        audio_stream: AsyncIterable[ProcessedAudioChunk],
-        stt_audio_buffer: deque[ProcessedAudioChunk] | None,
+        audio_stream: AsyncIterable[EnhancedAudioChunk],
+        stt_audio_buffer: deque[EnhancedAudioChunk] | None,
         wake_word_vad: VoiceActivityTimeout | None,
-        sample_rate: int = 16000,
-        sample_width: int = 2,
+        sample_rate: int = SAMPLE_RATE,
+        sample_width: int = SAMPLE_WIDTH,
     ) -> AsyncIterable[tuple[bytes, int]]:
         """Yield audio chunks with timestamps (milliseconds since start of stream).
 
         Adds audio to a ring buffer that will be forwarded to speech-to-text after
         detection. Times out if VAD detects enough silence.
         """
-        chunk_seconds = AUDIO_PROCESSOR_SAMPLES / sample_rate
         async for chunk in audio_stream:
             if self.abort_wake_word_detection:
                 raise WakeWordDetectionAborted
@@ -845,6 +837,7 @@ class PipelineRun:
                 stt_audio_buffer.append(chunk)
 
             if wake_word_vad is not None:
+                chunk_seconds = (len(chunk.audio) // sample_width) / sample_rate
                 if not wake_word_vad.process(chunk_seconds, chunk.is_speech):
                     raise WakeWordTimeoutError(
                         code="wake-word-timeout", message="Wake word was not detected"
@@ -881,7 +874,7 @@ class PipelineRun:
     async def speech_to_text(
         self,
         metadata: stt.SpeechMetadata,
-        stream: AsyncIterable[ProcessedAudioChunk],
+        stream: AsyncIterable[EnhancedAudioChunk],
     ) -> str:
         """Run speech-to-text portion of pipeline. Returns the spoken text."""
         # Create a background task to prepare the conversation agent
@@ -957,18 +950,18 @@ class PipelineRun:
 
     async def _speech_to_text_stream(
         self,
-        audio_stream: AsyncIterable[ProcessedAudioChunk],
+        audio_stream: AsyncIterable[EnhancedAudioChunk],
         stt_vad: VoiceCommandSegmenter | None,
-        sample_rate: int = 16000,
-        sample_width: int = 2,
+        sample_rate: int = SAMPLE_RATE,
+        sample_width: int = SAMPLE_WIDTH,
     ) -> AsyncGenerator[bytes]:
         """Yield audio chunks until VAD detects silence or speech-to-text completes."""
-        chunk_seconds = AUDIO_PROCESSOR_SAMPLES / sample_rate
         sent_vad_start = False
         async for chunk in audio_stream:
             self._capture_chunk(chunk.audio)
 
             if stt_vad is not None:
+                chunk_seconds = (len(chunk.audio) // sample_width) / sample_rate
                 if not stt_vad.process(chunk_seconds, chunk.is_speech):
                     # Silence detected at the end of voice command
                     self.process_event(
@@ -1072,8 +1065,8 @@ class PipelineRun:
             tts_options[tts.ATTR_PREFERRED_FORMAT] = self.tts_audio_output
             if self.tts_audio_output == "wav":
                 # 16 Khz, 16-bit mono
-                tts_options[tts.ATTR_PREFERRED_SAMPLE_RATE] = 16000
-                tts_options[tts.ATTR_PREFERRED_SAMPLE_CHANNELS] = 1
+                tts_options[tts.ATTR_PREFERRED_SAMPLE_RATE] = SAMPLE_RATE
+                tts_options[tts.ATTR_PREFERRED_SAMPLE_CHANNELS] = SAMPLE_CHANNELS
 
         try:
             options_supported = await tts.async_support_options(
@@ -1220,12 +1213,15 @@ class PipelineRun:
     async def process_volume_only(
         self,
         audio_stream: AsyncIterable[bytes],
-        sample_rate: int = 16000,
-        sample_width: int = 2,
-    ) -> AsyncGenerator[ProcessedAudioChunk]:
+        sample_rate: int = SAMPLE_RATE,
+        sample_width: int = SAMPLE_WIDTH,
+    ) -> AsyncGenerator[EnhancedAudioChunk]:
         """Apply volume transformation only (no VAD/audio enhancements) with optional chunking."""
+        assert self.audio_chunking_buffer is not None
+
+        bytes_per_chunk = self.samples_per_chunk * sample_width
         ms_per_sample = sample_rate // 1000
-        ms_per_chunk = (AUDIO_PROCESSOR_SAMPLES // sample_width) // ms_per_sample
+        ms_per_chunk = self.samples_per_chunk // ms_per_sample
         timestamp_ms = 0
 
         async for chunk in audio_stream:
@@ -1233,19 +1229,18 @@ class PipelineRun:
                 chunk = _multiply_volume(chunk, self.audio_settings.volume_multiplier)
 
             if self.audio_settings.is_chunking_enabled:
-                # 10 ms chunking
-                for chunk_10ms in chunk_samples(
-                    chunk, AUDIO_PROCESSOR_BYTES, self.audio_processor_buffer
+                for sub_chunk in chunk_samples(
+                    chunk, bytes_per_chunk, self.audio_chunking_buffer
                 ):
-                    yield ProcessedAudioChunk(
-                        audio=chunk_10ms,
+                    yield EnhancedAudioChunk(
+                        audio=sub_chunk,
                         timestamp_ms=timestamp_ms,
                         is_speech=None,  # no VAD
                     )
                     timestamp_ms += ms_per_chunk
             else:
                 # No chunking
-                yield ProcessedAudioChunk(
+                yield EnhancedAudioChunk(
                     audio=chunk,
                     timestamp_ms=timestamp_ms,
                     is_speech=None,  # no VAD
@@ -1255,14 +1250,19 @@ class PipelineRun:
     async def process_enhance_audio(
         self,
         audio_stream: AsyncIterable[bytes],
-        sample_rate: int = 16000,
-        sample_width: int = 2,
-    ) -> AsyncGenerator[ProcessedAudioChunk]:
+        sample_rate: int = SAMPLE_RATE,
+        sample_width: int = SAMPLE_WIDTH,
+    ) -> AsyncGenerator[EnhancedAudioChunk]:
         """Split audio into 10 ms chunks and apply VAD/noise suppression/auto gain/volume transformation."""
-        assert self.audio_processor is not None
+        assert self.audio_enhancer is not None
+        assert self.audio_enhancer.samples_per_chunk is not None
+        assert self.audio_chunking_buffer is not None
 
+        bytes_per_chunk = self.audio_enhancer.samples_per_chunk * sample_width
         ms_per_sample = sample_rate // 1000
-        ms_per_chunk = (AUDIO_PROCESSOR_SAMPLES // sample_width) // ms_per_sample
+        # samples_per_chunk counts samples (not bytes), so it must not be
+        # divided by sample_width; "ms_per_sample" is really samples per ms
+        ms_per_chunk = self.audio_enhancer.samples_per_chunk // ms_per_sample
         timestamp_ms = 0
 
         async for dirty_samples in audio_stream:
@@ -1272,17 +1272,11 @@ class PipelineRun:
                     dirty_samples, self.audio_settings.volume_multiplier
                 )
 
-            # Split into 10ms chunks for audio enhancements/VAD
-            for dirty_10ms_chunk in chunk_samples(
-                dirty_samples, AUDIO_PROCESSOR_BYTES, self.audio_processor_buffer
+            # Split into chunks for audio enhancements/VAD
+            for dirty_chunk in chunk_samples(
+                dirty_samples, bytes_per_chunk, self.audio_chunking_buffer
             ):
-                ap_result = self.audio_processor.Process10ms(dirty_10ms_chunk)
-                yield ProcessedAudioChunk(
-                    audio=ap_result.audio,
-                    timestamp_ms=timestamp_ms,
-                    is_speech=ap_result.is_speech,
-                )
-
+                yield self.audio_enhancer.enhance_chunk(dirty_chunk, timestamp_ms)
                 timestamp_ms += ms_per_chunk
 
 
@@ -1323,9 +1317,9 @@ def _pipeline_debug_recording_thread_proc(
 
                 wav_path = run_recording_dir / f"{message}.wav"
                 wav_writer = wave.open(str(wav_path), "wb")
-                wav_writer.setframerate(16000)
-                wav_writer.setsampwidth(2)
-                wav_writer.setnchannels(1)
+                wav_writer.setframerate(SAMPLE_RATE)
+                wav_writer.setsampwidth(SAMPLE_WIDTH)
+                wav_writer.setnchannels(SAMPLE_CHANNELS)
             elif isinstance(message, bytes):
                 # Chunk of 16-bit mono audio at 16Khz
                 if wav_writer is not None:
@@ -1368,8 +1362,8 @@ class PipelineInput:
         """Run pipeline."""
         self.run.start(device_id=self.device_id)
         current_stage: PipelineStage | None = self.run.start_stage
-        stt_audio_buffer: list[ProcessedAudioChunk] = []
-        stt_processed_stream: AsyncIterable[ProcessedAudioChunk] | None = None
+        stt_audio_buffer: list[EnhancedAudioChunk] = []
+        stt_processed_stream: AsyncIterable[EnhancedAudioChunk] | None = None
 
         if self.stt_stream is not None:
             if self.run.audio_settings.needs_processor:
@@ -1423,7 +1417,7 @@ class PipelineInput:
                     # Send audio in the buffer first to speech-to-text, then move on to stt_stream.
                     # This is basically an async itertools.chain.
                     async def buffer_then_audio_stream() -> (
-                        AsyncGenerator[ProcessedAudioChunk]
+                        AsyncGenerator[EnhancedAudioChunk]
                     ):
                         # Buffered audio
                         for chunk in stt_audio_buffer:
diff --git a/homeassistant/components/assist_pipeline/vad.py b/homeassistant/components/assist_pipeline/vad.py
index 5b3d1408f58..e3b425a2a7b 100644
--- a/homeassistant/components/assist_pipeline/vad.py
+++ b/homeassistant/components/assist_pipeline/vad.py
@@ -2,12 +2,11 @@
 
 from __future__ import annotations
 
-from abc import ABC, abstractmethod
-from collections.abc import Iterable
+from collections.abc import Callable, Iterable
 from dataclasses import dataclass
 from enum import StrEnum
 import logging
-from typing import Final, cast
+from typing import Final
 
 _LOGGER = logging.getLogger(__name__)
 
@@ -35,44 +34,6 @@ class VadSensitivity(StrEnum):
         return 1.0
 
 
-class VoiceActivityDetector(ABC):
-    """Base class for voice activity detectors (VAD)."""
-
-    @abstractmethod
-    def is_speech(self, chunk: bytes) -> bool:
-        """Return True if audio chunk contains speech."""
-
-    @property
-    @abstractmethod
-    def samples_per_chunk(self) -> int | None:
-        """Return number of samples per chunk or None if chunking is not required."""
-
-
-class WebRtcVad(VoiceActivityDetector):
-    """Voice activity detector based on webrtc."""
-
-    def __init__(self) -> None:
-        """Initialize webrtcvad."""
-        # Delay import of webrtc so HA start up is not crashing
-        # on older architectures (armhf).
-        #
-        # pylint: disable=import-outside-toplevel
-        from webrtc_noise_gain import AudioProcessor
-
-        # Just VAD: no noise suppression or auto gain
-        self._audio_processor = AudioProcessor(0, 0)
-
-    def is_speech(self, chunk: bytes) -> bool:
-        """Return True if audio chunk contains speech."""
-        result = self._audio_processor.Process10ms(chunk)
-        return cast(bool, result.is_speech)
-
-    @property
-    def samples_per_chunk(self) -> int | None:
-        """Return 10 ms."""
-        return int(0.01 * _SAMPLE_RATE)  # 10 ms
-
-
 class AudioBuffer:
     """Fixed-sized audio buffer with variable internal length."""
 
@@ -176,29 +137,38 @@ class VoiceCommandSegmenter:
                 if self._speech_seconds_left <= 0:
                     # Inside voice command
                     self.in_command = True
+                    self._silence_seconds_left = self.silence_seconds
+                    _LOGGER.debug("Voice command started")
             else:
                 # Reset if enough silence
                 self._reset_seconds_left -= chunk_seconds
                 if self._reset_seconds_left <= 0:
                     self._speech_seconds_left = self.speech_seconds
+                    self._reset_seconds_left = self.reset_seconds
         elif not is_speech:
+            # Silence in command
             self._reset_seconds_left = self.reset_seconds
             self._silence_seconds_left -= chunk_seconds
             if self._silence_seconds_left <= 0:
+                # Command finished successfully
                 self.reset()
+                _LOGGER.debug("Voice command finished")
                 return False
         else:
-            # Reset if enough speech
+            # Speech in command.
+            # Reset silence counter if enough speech.
             self._reset_seconds_left -= chunk_seconds
             if self._reset_seconds_left <= 0:
                 self._silence_seconds_left = self.silence_seconds
+                self._reset_seconds_left = self.reset_seconds
 
         return True
 
     def process_with_vad(
         self,
         chunk: bytes,
-        vad: VoiceActivityDetector,
+        vad_samples_per_chunk: int | None,
+        vad_is_speech: Callable[[bytes], bool],
         leftover_chunk_buffer: AudioBuffer | None,
     ) -> bool:
         """Process an audio chunk using an external VAD.
@@ -207,20 +177,20 @@ class VoiceCommandSegmenter:
 
         Returns False when voice command is finished.
         """
-        if vad.samples_per_chunk is None:
+        if vad_samples_per_chunk is None:
             # No chunking
             chunk_seconds = (len(chunk) // _SAMPLE_WIDTH) / _SAMPLE_RATE
-            is_speech = vad.is_speech(chunk)
+            is_speech = vad_is_speech(chunk)
             return self.process(chunk_seconds, is_speech)
 
         if leftover_chunk_buffer is None:
             raise ValueError("leftover_chunk_buffer is required when vad uses chunking")
 
         # With chunking
-        seconds_per_chunk = vad.samples_per_chunk / _SAMPLE_RATE
-        bytes_per_chunk = vad.samples_per_chunk * _SAMPLE_WIDTH
+        seconds_per_chunk = vad_samples_per_chunk / _SAMPLE_RATE
+        bytes_per_chunk = vad_samples_per_chunk * _SAMPLE_WIDTH
         for vad_chunk in chunk_samples(chunk, bytes_per_chunk, leftover_chunk_buffer):
-            is_speech = vad.is_speech(vad_chunk)
+            is_speech = vad_is_speech(vad_chunk)
             if not self.process(seconds_per_chunk, is_speech):
                 return False
 
diff --git a/homeassistant/components/assist_pipeline/websocket_api.py b/homeassistant/components/assist_pipeline/websocket_api.py
index 3855bd7afc5..c96af655589 100644
--- a/homeassistant/components/assist_pipeline/websocket_api.py
+++ b/homeassistant/components/assist_pipeline/websocket_api.py
@@ -24,6 +24,9 @@ from .const import (
     DEFAULT_WAKE_WORD_TIMEOUT,
     DOMAIN,
     EVENT_RECORDING,
+    SAMPLE_CHANNELS,
+    SAMPLE_RATE,
+    SAMPLE_WIDTH,
 )
 from .error import PipelineNotFound
 from .pipeline import (
@@ -92,7 +95,6 @@ def async_register_websocket_api(hass: HomeAssistant) -> None:
                             vol.Optional("volume_multiplier"): float,
                             # Advanced use cases/testing
                             vol.Optional("no_vad"): bool,
-                            vol.Optional("no_chunking"): bool,
                         }
                     },
                     extra=vol.ALLOW_EXTRA,
@@ -170,9 +172,14 @@ async def websocket_run(
 
             # Yield until we receive an empty chunk
             while chunk := await audio_queue.get():
-                if incoming_sample_rate != 16000:
+                if incoming_sample_rate != SAMPLE_RATE:
                     chunk, state = audioop.ratecv(
-                        chunk, 2, 1, incoming_sample_rate, 16000, state
+                        chunk,
+                        SAMPLE_WIDTH,
+                        SAMPLE_CHANNELS,
+                        incoming_sample_rate,
+                        SAMPLE_RATE,
+                        state,
                     )
                 yield chunk
 
@@ -206,7 +213,6 @@ async def websocket_run(
             auto_gain_dbfs=msg_input.get("auto_gain_dbfs", 0),
             volume_multiplier=msg_input.get("volume_multiplier", 1.0),
             is_vad_enabled=not msg_input.get("no_vad", False),
-            is_chunking_enabled=not msg_input.get("no_chunking", False),
         )
     elif start_stage == PipelineStage.INTENT:
         # Input to conversation agent
@@ -424,9 +430,9 @@ def websocket_list_languages(
     connection.send_result(
         msg["id"],
         {
-            "languages": sorted(pipeline_languages)
-            if pipeline_languages
-            else pipeline_languages
+            "languages": (
+                sorted(pipeline_languages) if pipeline_languages else pipeline_languages
+            )
         },
     )
 
diff --git a/homeassistant/components/voip/voip.py b/homeassistant/components/voip/voip.py
index 5770d9d2b4a..243909629cf 100644
--- a/homeassistant/components/voip/voip.py
+++ b/homeassistant/components/voip/voip.py
@@ -31,12 +31,14 @@ from homeassistant.components.assist_pipeline import (
     async_pipeline_from_audio_stream,
     select as pipeline_select,
 )
+from homeassistant.components.assist_pipeline.audio_enhancer import (
+    AudioEnhancer,
+    MicroVadEnhancer,
+)
 from homeassistant.components.assist_pipeline.vad import (
     AudioBuffer,
     VadSensitivity,
-    VoiceActivityDetector,
     VoiceCommandSegmenter,
-    WebRtcVad,
 )
 from homeassistant.const import __version__
 from homeassistant.core import Context, HomeAssistant
@@ -233,13 +235,13 @@ class PipelineRtpDatagramProtocol(RtpDatagramProtocol):
         try:
             # Wait for speech before starting pipeline
             segmenter = VoiceCommandSegmenter(silence_seconds=self.silence_seconds)
-            vad = WebRtcVad()
+            audio_enhancer = MicroVadEnhancer(0, 0, True)
             chunk_buffer: deque[bytes] = deque(
                 maxlen=self.buffered_chunks_before_speech,
             )
             speech_detected = await self._wait_for_speech(
                 segmenter,
-                vad,
+                audio_enhancer,
                 chunk_buffer,
             )
             if not speech_detected:
@@ -253,7 +255,7 @@ class PipelineRtpDatagramProtocol(RtpDatagramProtocol):
                 try:
                     async for chunk in self._segment_audio(
                         segmenter,
-                        vad,
+                        audio_enhancer,
                         chunk_buffer,
                     ):
                         yield chunk
@@ -317,7 +319,7 @@ class PipelineRtpDatagramProtocol(RtpDatagramProtocol):
     async def _wait_for_speech(
         self,
         segmenter: VoiceCommandSegmenter,
-        vad: VoiceActivityDetector,
+        audio_enhancer: AudioEnhancer,
         chunk_buffer: MutableSequence[bytes],
     ):
         """Buffer audio chunks until speech is detected.
@@ -329,13 +331,18 @@ class PipelineRtpDatagramProtocol(RtpDatagramProtocol):
         async with asyncio.timeout(self.audio_timeout):
             chunk = await self._audio_queue.get()
 
-        assert vad.samples_per_chunk is not None
-        vad_buffer = AudioBuffer(vad.samples_per_chunk * WIDTH)
+        assert audio_enhancer.samples_per_chunk is not None
+        vad_buffer = AudioBuffer(audio_enhancer.samples_per_chunk * WIDTH)
 
         while chunk:
             chunk_buffer.append(chunk)
 
-            segmenter.process_with_vad(chunk, vad, vad_buffer)
+            segmenter.process_with_vad(
+                chunk,
+                audio_enhancer.samples_per_chunk,
+                lambda x: audio_enhancer.enhance_chunk(x, 0).is_speech is True,
+                vad_buffer,
+            )
             if segmenter.in_command:
                 # Buffer until command starts
                 if len(vad_buffer) > 0:
@@ -351,7 +358,7 @@ class PipelineRtpDatagramProtocol(RtpDatagramProtocol):
     async def _segment_audio(
         self,
         segmenter: VoiceCommandSegmenter,
-        vad: VoiceActivityDetector,
+        audio_enhancer: AudioEnhancer,
         chunk_buffer: Sequence[bytes],
     ) -> AsyncIterable[bytes]:
         """Yield audio chunks until voice command has finished."""
@@ -364,11 +371,16 @@ class PipelineRtpDatagramProtocol(RtpDatagramProtocol):
         async with asyncio.timeout(self.audio_timeout):
             chunk = await self._audio_queue.get()
 
-        assert vad.samples_per_chunk is not None
-        vad_buffer = AudioBuffer(vad.samples_per_chunk * WIDTH)
+        assert audio_enhancer.samples_per_chunk is not None
+        vad_buffer = AudioBuffer(audio_enhancer.samples_per_chunk * WIDTH)
 
         while chunk:
-            if not segmenter.process_with_vad(chunk, vad, vad_buffer):
+            if not segmenter.process_with_vad(
+                chunk,
+                audio_enhancer.samples_per_chunk,
+                lambda x: audio_enhancer.enhance_chunk(x, 0).is_speech is True,
+                vad_buffer,
+            ):
                 # Voice command is finished
                 break
 
diff --git a/homeassistant/package_constraints.txt b/homeassistant/package_constraints.txt
index 43e737a002d..c52ccfa6a8c 100644
--- a/homeassistant/package_constraints.txt
+++ b/homeassistant/package_constraints.txt
@@ -45,6 +45,7 @@ Pillow==10.4.0
 pip>=21.3.1
 psutil-home-assistant==0.0.1
 PyJWT==2.8.0
+pymicro-vad==1.0.0
 PyNaCl==1.5.0
 pyOpenSSL==24.2.1
 pyserial==3.5
@@ -60,7 +61,6 @@ urllib3>=1.26.5,<2
 voluptuous-openapi==0.0.5
 voluptuous-serialize==2.6.0
 voluptuous==0.15.2
-webrtc-noise-gain==1.2.3
 yarl==1.9.4
 zeroconf==0.132.2
 
diff --git a/requirements_all.txt b/requirements_all.txt
index 1abd6ec5be2..bc1a13f19ab 100644
--- a/requirements_all.txt
+++ b/requirements_all.txt
@@ -2007,6 +2007,9 @@ pymelcloud==2.5.9
 # homeassistant.components.meteoclimatic
 pymeteoclimatic==0.1.0
 
+# homeassistant.components.assist_pipeline
+pymicro-vad==1.0.0
+
 # homeassistant.components.xiaomi_tv
 pymitv==1.4.3
 
@@ -2896,9 +2899,6 @@ weatherflow4py==0.2.21
 # homeassistant.components.webmin
 webmin-xmlrpc==0.0.2
 
-# homeassistant.components.assist_pipeline
-webrtc-noise-gain==1.2.3
-
 # homeassistant.components.whirlpool
 whirlpool-sixth-sense==0.18.8
 
diff --git a/requirements_test_all.txt b/requirements_test_all.txt
index 463cbae4cdf..22de281aa61 100644
--- a/requirements_test_all.txt
+++ b/requirements_test_all.txt
@@ -1603,6 +1603,9 @@ pymelcloud==2.5.9
 # homeassistant.components.meteoclimatic
 pymeteoclimatic==0.1.0
 
+# homeassistant.components.assist_pipeline
+pymicro-vad==1.0.0
+
 # homeassistant.components.mochad
 pymochad==0.2.0
 
@@ -2282,9 +2285,6 @@ weatherflow4py==0.2.21
 # homeassistant.components.webmin
 webmin-xmlrpc==0.0.2
 
-# homeassistant.components.assist_pipeline
-webrtc-noise-gain==1.2.3
-
 # homeassistant.components.whirlpool
 whirlpool-sixth-sense==0.18.8
 
diff --git a/script/licenses.py b/script/licenses.py
index f2298e473a2..ad5ae8476b3 100644
--- a/script/licenses.py
+++ b/script/licenses.py
@@ -172,7 +172,6 @@ EXCEPTIONS = {
     "tapsaff",  # https://github.com/bazwilliams/python-taps-aff/pull/5
     "tellduslive",  # https://github.com/molobrakos/tellduslive/pull/24
     "tellsticknet",  # https://github.com/molobrakos/tellsticknet/pull/33
-    "webrtc_noise_gain",  # https://github.com/rhasspy/webrtc-noise-gain/pull/24
     "vincenty",  # Public domain
     "zeversolar",  # https://github.com/kvanzuijlen/zeversolar/pull/46
 }
diff --git a/tests/components/assist_pipeline/test_init.py b/tests/components/assist_pipeline/test_init.py
index f9b91af3bf1..8fb7ce5b5a5 100644
--- a/tests/components/assist_pipeline/test_init.py
+++ b/tests/components/assist_pipeline/test_init.py
@@ -75,9 +75,7 @@ async def test_pipeline_from_audio_stream_auto(
             channel=stt.AudioChannels.CHANNEL_MONO,
         ),
         stt_stream=audio_data(),
-        audio_settings=assist_pipeline.AudioSettings(
-            is_vad_enabled=False, is_chunking_enabled=False
-        ),
+        audio_settings=assist_pipeline.AudioSettings(is_vad_enabled=False),
     )
 
     assert process_events(events) == snapshot
@@ -140,9 +138,7 @@ async def test_pipeline_from_audio_stream_legacy(
         ),
         stt_stream=audio_data(),
         pipeline_id=pipeline_id,
-        audio_settings=assist_pipeline.AudioSettings(
-            is_vad_enabled=False, is_chunking_enabled=False
-        ),
+        audio_settings=assist_pipeline.AudioSettings(is_vad_enabled=False),
     )
 
     assert process_events(events) == snapshot
@@ -205,9 +201,7 @@ async def test_pipeline_from_audio_stream_entity(
         ),
         stt_stream=audio_data(),
         pipeline_id=pipeline_id,
-        audio_settings=assist_pipeline.AudioSettings(
-            is_vad_enabled=False, is_chunking_enabled=False
-        ),
+        audio_settings=assist_pipeline.AudioSettings(is_vad_enabled=False),
     )
 
     assert process_events(events) == snapshot
@@ -271,9 +265,7 @@ async def test_pipeline_from_audio_stream_no_stt(
             ),
             stt_stream=audio_data(),
             pipeline_id=pipeline_id,
-            audio_settings=assist_pipeline.AudioSettings(
-                is_vad_enabled=False, is_chunking_enabled=False
-            ),
+            audio_settings=assist_pipeline.AudioSettings(is_vad_enabled=False),
         )
 
     assert not events
@@ -335,24 +327,25 @@ async def test_pipeline_from_audio_stream_wake_word(
     # [0, 2, ...]
     wake_chunk_2 = bytes(it.islice(it.cycle(range(0, 256, 2)), BYTES_ONE_SECOND))
 
-    bytes_per_chunk = int(0.01 * BYTES_ONE_SECOND)
+    samples_per_chunk = 160
+    bytes_per_chunk = samples_per_chunk * 2  # 16-bit
 
     async def audio_data():
-        # 1 second in 10 ms chunks
+        # 1 second in chunks
         i = 0
         while i < len(wake_chunk_1):
             yield wake_chunk_1[i : i + bytes_per_chunk]
             i += bytes_per_chunk
 
-        # 1 second in 30 ms chunks
+        # 1 second in chunks
         i = 0
         while i < len(wake_chunk_2):
             yield wake_chunk_2[i : i + bytes_per_chunk]
             i += bytes_per_chunk
 
-        yield b"wake word!"
-        yield b"part1"
-        yield b"part2"
+        for chunk in (b"wake word!", b"part1", b"part2"):
+            yield chunk + bytes(bytes_per_chunk - len(chunk))
+
         yield b""
 
     await assist_pipeline.async_pipeline_from_audio_stream(
@@ -373,7 +366,7 @@ async def test_pipeline_from_audio_stream_wake_word(
             audio_seconds_to_buffer=1.5
         ),
         audio_settings=assist_pipeline.AudioSettings(
-            is_vad_enabled=False, is_chunking_enabled=False
+            is_vad_enabled=False, samples_per_chunk=samples_per_chunk
         ),
     )
 
@@ -390,7 +383,9 @@ async def test_pipeline_from_audio_stream_wake_word(
     )
     assert first_chunk == wake_chunk_1[len(wake_chunk_1) // 2 :] + wake_chunk_2
 
-    assert mock_stt_provider.received[-3:] == [b"queued audio", b"part1", b"part2"]
+    assert mock_stt_provider.received[-3] == b"queued audio"
+    assert mock_stt_provider.received[-2].startswith(b"part1")
+    assert mock_stt_provider.received[-1].startswith(b"part2")
 
 
 async def test_pipeline_save_audio(
@@ -438,9 +433,7 @@ async def test_pipeline_save_audio(
             pipeline_id=pipeline.id,
             start_stage=assist_pipeline.PipelineStage.WAKE_WORD,
             end_stage=assist_pipeline.PipelineStage.STT,
-            audio_settings=assist_pipeline.AudioSettings(
-                is_vad_enabled=False, is_chunking_enabled=False
-            ),
+            audio_settings=assist_pipeline.AudioSettings(is_vad_enabled=False),
         )
 
         pipeline_dirs = list(temp_dir.iterdir())
@@ -685,9 +678,7 @@ async def test_wake_word_detection_aborted(
             wake_word_settings=assist_pipeline.WakeWordSettings(
                 audio_seconds_to_buffer=1.5
             ),
-            audio_settings=assist_pipeline.AudioSettings(
-                is_vad_enabled=False, is_chunking_enabled=False
-            ),
+            audio_settings=assist_pipeline.AudioSettings(is_vad_enabled=False),
         ),
     )
     await pipeline_input.validate()
diff --git a/tests/components/assist_pipeline/test_vad.py b/tests/components/assist_pipeline/test_vad.py
index 139ae915263..17cb73a9139 100644
--- a/tests/components/assist_pipeline/test_vad.py
+++ b/tests/components/assist_pipeline/test_vad.py
@@ -1,11 +1,9 @@
 """Tests for voice command segmenter."""
 
 import itertools as it
-from unittest.mock import patch
 
 from homeassistant.components.assist_pipeline.vad import (
     AudioBuffer,
-    VoiceActivityDetector,
     VoiceCommandSegmenter,
     chunk_samples,
 )
@@ -44,59 +42,41 @@ def test_speech() -> None:
 def test_audio_buffer() -> None:
     """Test audio buffer wrapping."""
 
-    class DisabledVad(VoiceActivityDetector):
-        def is_speech(self, chunk):
-            return False
+    samples_per_chunk = 160  # 10 ms
+    bytes_per_chunk = samples_per_chunk * 2
+    leftover_buffer = AudioBuffer(bytes_per_chunk)
 
-        @property
-        def samples_per_chunk(self):
-            return 160  # 10 ms
+    # Partially fill audio buffer
+    half_chunk = bytes(it.islice(it.cycle(range(256)), bytes_per_chunk // 2))
+    chunks = list(chunk_samples(half_chunk, bytes_per_chunk, leftover_buffer))
 
-    vad = DisabledVad()
-    bytes_per_chunk = vad.samples_per_chunk * 2
-    vad_buffer = AudioBuffer(bytes_per_chunk)
-    segmenter = VoiceCommandSegmenter()
+    assert not chunks
+    assert leftover_buffer.bytes() == half_chunk
+
+    # Fill and wrap with 1/4 chunk left over
+    three_quarters_chunk = bytes(
+        it.islice(it.cycle(range(256)), int(0.75 * bytes_per_chunk))
+    )
+    chunks = list(chunk_samples(three_quarters_chunk, bytes_per_chunk, leftover_buffer))
+
+    assert len(chunks) == 1
+    assert (
+        leftover_buffer.bytes()
+        == three_quarters_chunk[len(three_quarters_chunk) - (bytes_per_chunk // 4) :]
+    )
+    assert chunks[0] == half_chunk + three_quarters_chunk[: bytes_per_chunk // 2]
+
+    # Run 2 chunks through
+    leftover_buffer.clear()
+    assert len(leftover_buffer) == 0
 
-    with patch.object(vad, "is_speech", return_value=False) as mock_process:
-        # Partially fill audio buffer
-        half_chunk = bytes(it.islice(it.cycle(range(256)), bytes_per_chunk // 2))
-        segmenter.process_with_vad(half_chunk, vad, vad_buffer)
-
-        assert not mock_process.called
-        assert vad_buffer is not None
-        assert vad_buffer.bytes() == half_chunk
-
-        # Fill and wrap with 1/4 chunk left over
-        three_quarters_chunk = bytes(
-            it.islice(it.cycle(range(256)), int(0.75 * bytes_per_chunk))
-        )
-        segmenter.process_with_vad(three_quarters_chunk, vad, vad_buffer)
-
-        assert mock_process.call_count == 1
-        assert (
-            vad_buffer.bytes()
-            == three_quarters_chunk[
-                len(three_quarters_chunk) - (bytes_per_chunk // 4) :
-            ]
-        )
-        assert (
-            mock_process.call_args[0][0]
-            == half_chunk + three_quarters_chunk[: bytes_per_chunk // 2]
-        )
-
-        # Run 2 chunks through
-        segmenter.reset()
-        vad_buffer.clear()
-        assert len(vad_buffer) == 0
-
-        mock_process.reset_mock()
-        two_chunks = bytes(it.islice(it.cycle(range(256)), bytes_per_chunk * 2))
-        segmenter.process_with_vad(two_chunks, vad, vad_buffer)
-
-        assert mock_process.call_count == 2
-        assert len(vad_buffer) == 0
-        assert mock_process.call_args_list[0][0][0] == two_chunks[:bytes_per_chunk]
-        assert mock_process.call_args_list[1][0][0] == two_chunks[bytes_per_chunk:]
+    two_chunks = bytes(it.islice(it.cycle(range(256)), bytes_per_chunk * 2))
+    chunks = list(chunk_samples(two_chunks, bytes_per_chunk, leftover_buffer))
+
+    assert len(chunks) == 2
+    assert len(leftover_buffer) == 0
+    assert chunks[0] == two_chunks[:bytes_per_chunk]
+    assert chunks[1] == two_chunks[bytes_per_chunk:]
 
 
 def test_partial_chunk() -> None:
@@ -125,43 +105,3 @@ def test_chunk_samples_leftover() -> None:
 
     assert len(chunks) == 1
     assert leftover_chunk_buffer.bytes() == bytes([5, 6])
-
-
-def test_vad_no_chunking() -> None:
-    """Test VAD that doesn't require chunking."""
-
-    class VadNoChunk(VoiceActivityDetector):
-        def is_speech(self, chunk: bytes) -> bool:
-            return sum(chunk) > 0
-
-        @property
-        def samples_per_chunk(self) -> int | None:
-            return None
-
-    vad = VadNoChunk()
-    segmenter = VoiceCommandSegmenter(
-        speech_seconds=1.0, silence_seconds=1.0, reset_seconds=0.5
-    )
-    silence = bytes([0] * 16000)
-    speech = bytes([255] * (16000 // 2))
-
-    # Test with differently-sized chunks
-    assert vad.is_speech(speech)
-    assert not vad.is_speech(silence)
-
-    # Simulate voice command
-    assert segmenter.process_with_vad(silence, vad, None)
-    # begin
-    assert segmenter.process_with_vad(speech, vad, None)
-    assert segmenter.process_with_vad(speech, vad, None)
-    assert segmenter.process_with_vad(speech, vad, None)
-    # reset with silence
-    assert segmenter.process_with_vad(silence, vad, None)
-    # resume
-    assert segmenter.process_with_vad(speech, vad, None)
-    assert segmenter.process_with_vad(speech, vad, None)
-    assert segmenter.process_with_vad(speech, vad, None)
-    assert segmenter.process_with_vad(speech, vad, None)
-    # end
-    assert segmenter.process_with_vad(silence, vad, None)
-    assert not segmenter.process_with_vad(silence, vad, None)
diff --git a/tests/components/assist_pipeline/test_websocket.py b/tests/components/assist_pipeline/test_websocket.py
index de8ddc7ccc7..7d4a9b18c12 100644
--- a/tests/components/assist_pipeline/test_websocket.py
+++ b/tests/components/assist_pipeline/test_websocket.py
@@ -259,12 +259,7 @@ async def test_audio_pipeline_with_wake_word_no_timeout(
             "type": "assist_pipeline/run",
             "start_stage": "wake_word",
             "end_stage": "tts",
-            "input": {
-                "sample_rate": 16000,
-                "timeout": 0,
-                "no_vad": True,
-                "no_chunking": True,
-            },
+            "input": {"sample_rate": 16000, "timeout": 0, "no_vad": True},
         }
     )
 
@@ -1876,11 +1871,7 @@ async def test_wake_word_cooldown_same_id(
             "type": "assist_pipeline/run",
             "start_stage": "wake_word",
             "end_stage": "tts",
-            "input": {
-                "sample_rate": 16000,
-                "no_vad": True,
-                "no_chunking": True,
-            },
+            "input": {"sample_rate": 16000, "no_vad": True},
         }
     )
 
@@ -1889,11 +1880,7 @@ async def test_wake_word_cooldown_same_id(
             "type": "assist_pipeline/run",
             "start_stage": "wake_word",
             "end_stage": "tts",
-            "input": {
-                "sample_rate": 16000,
-                "no_vad": True,
-                "no_chunking": True,
-            },
+            "input": {"sample_rate": 16000, "no_vad": True},
         }
     )
 
@@ -1967,11 +1954,7 @@ async def test_wake_word_cooldown_different_ids(
                 "type": "assist_pipeline/run",
                 "start_stage": "wake_word",
                 "end_stage": "tts",
-                "input": {
-                    "sample_rate": 16000,
-                    "no_vad": True,
-                    "no_chunking": True,
-                },
+                "input": {"sample_rate": 16000, "no_vad": True},
             }
         )
 
@@ -1980,11 +1963,7 @@ async def test_wake_word_cooldown_different_ids(
                 "type": "assist_pipeline/run",
                 "start_stage": "wake_word",
                 "end_stage": "tts",
-                "input": {
-                    "sample_rate": 16000,
-                    "no_vad": True,
-                    "no_chunking": True,
-                },
+                "input": {"sample_rate": 16000, "no_vad": True},
             }
         )
 
@@ -2094,11 +2073,7 @@ async def test_wake_word_cooldown_different_entities(
             "pipeline": pipeline_id_1,
             "start_stage": "wake_word",
             "end_stage": "tts",
-            "input": {
-                "sample_rate": 16000,
-                "no_vad": True,
-                "no_chunking": True,
-            },
+            "input": {"sample_rate": 16000, "no_vad": True},
         }
     )
 
@@ -2109,11 +2084,7 @@ async def test_wake_word_cooldown_different_entities(
             "pipeline": pipeline_id_2,
             "start_stage": "wake_word",
             "end_stage": "tts",
-            "input": {
-                "sample_rate": 16000,
-                "no_vad": True,
-                "no_chunking": True,
-            },
+            "input": {"sample_rate": 16000, "no_vad": True},
         }
     )
 
@@ -2210,11 +2181,7 @@ async def test_device_capture(
             "type": "assist_pipeline/run",
             "start_stage": "stt",
             "end_stage": "stt",
-            "input": {
-                "sample_rate": 16000,
-                "no_vad": True,
-                "no_chunking": True,
-            },
+            "input": {"sample_rate": 16000, "no_vad": True},
             "device_id": satellite_device.id,
         }
     )
@@ -2315,11 +2282,7 @@ async def test_device_capture_override(
             "type": "assist_pipeline/run",
             "start_stage": "stt",
             "end_stage": "stt",
-            "input": {
-                "sample_rate": 16000,
-                "no_vad": True,
-                "no_chunking": True,
-            },
+            "input": {"sample_rate": 16000, "no_vad": True},
             "device_id": satellite_device.id,
         }
     )
@@ -2464,11 +2427,7 @@ async def test_device_capture_queue_full(
             "type": "assist_pipeline/run",
             "start_stage": "stt",
             "end_stage": "stt",
-            "input": {
-                "sample_rate": 16000,
-                "no_vad": True,
-                "no_chunking": True,
-            },
+            "input": {"sample_rate": 16000, "no_vad": True},
             "device_id": satellite_device.id,
         }
     )
diff --git a/tests/components/voip/test_voip.py b/tests/components/voip/test_voip.py
index 6c292241237..c2978afc17f 100644
--- a/tests/components/voip/test_voip.py
+++ b/tests/components/voip/test_voip.py
@@ -43,9 +43,12 @@ async def test_pipeline(
     """Test that pipeline function is called from RTP protocol."""
     assert await async_setup_component(hass, "voip", {})
 
-    def is_speech(self, chunk):
+    def process_10ms(self, chunk):
         """Anything non-zero is speech."""
-        return sum(chunk) > 0
+        if sum(chunk) > 0:
+            return 1
+
+        return 0
 
     done = asyncio.Event()
 
@@ -98,8 +101,8 @@ async def test_pipeline(
 
     with (
         patch(
-            "homeassistant.components.assist_pipeline.vad.WebRtcVad.is_speech",
-            new=is_speech,
+            "pymicro_vad.MicroVad.Process10ms",
+            new=process_10ms,
         ),
         patch(
             "homeassistant.components.voip.voip.async_pipeline_from_audio_stream",
@@ -238,9 +241,12 @@ async def test_tts_timeout(
     """Test that TTS will time out based on its length."""
     assert await async_setup_component(hass, "voip", {})
 
-    def is_speech(self, chunk):
+    def process_10ms(self, chunk):
         """Anything non-zero is speech."""
-        return sum(chunk) > 0
+        if sum(chunk) > 0:
+            return 1
+
+        return 0
 
     done = asyncio.Event()
 
@@ -298,8 +304,8 @@ async def test_tts_timeout(
 
     with (
         patch(
-            "homeassistant.components.assist_pipeline.vad.WebRtcVad.is_speech",
-            new=is_speech,
+            "pymicro_vad.MicroVad.Process10ms",
+            new=process_10ms,
         ),
         patch(
             "homeassistant.components.voip.voip.async_pipeline_from_audio_stream",
@@ -361,9 +367,12 @@ async def test_tts_wrong_extension(
     """Test that TTS will only stream WAV audio."""
     assert await async_setup_component(hass, "voip", {})
 
-    def is_speech(self, chunk):
+    def process_10ms(self, chunk):
         """Anything non-zero is speech."""
-        return sum(chunk) > 0
+        if sum(chunk) > 0:
+            return 1
+
+        return 0
 
     done = asyncio.Event()
 
@@ -403,8 +412,8 @@ async def test_tts_wrong_extension(
 
     with (
         patch(
-            "homeassistant.components.assist_pipeline.vad.WebRtcVad.is_speech",
-            new=is_speech,
+            "pymicro_vad.MicroVad.Process10ms",
+            new=process_10ms,
         ),
         patch(
             "homeassistant.components.voip.voip.async_pipeline_from_audio_stream",
@@ -456,9 +465,12 @@ async def test_tts_wrong_wav_format(
     """Test that TTS will only stream WAV audio with a specific format."""
     assert await async_setup_component(hass, "voip", {})
 
-    def is_speech(self, chunk):
+    def process_10ms(self, chunk):
         """Anything non-zero is speech."""
-        return sum(chunk) > 0
+        if sum(chunk) > 0:
+            return 1
+
+        return 0
 
     done = asyncio.Event()
 
@@ -505,8 +517,8 @@ async def test_tts_wrong_wav_format(
 
     with (
         patch(
-            "homeassistant.components.assist_pipeline.vad.WebRtcVad.is_speech",
-            new=is_speech,
+            "pymicro_vad.MicroVad.Process10ms",
+            new=process_10ms,
         ),
         patch(
             "homeassistant.components.voip.voip.async_pipeline_from_audio_stream",
@@ -558,9 +570,12 @@ async def test_empty_tts_output(
     """Test that TTS will not stream when output is empty."""
     assert await async_setup_component(hass, "voip", {})
 
-    def is_speech(self, chunk):
+    def process_10ms(self, chunk):
         """Anything non-zero is speech."""
-        return sum(chunk) > 0
+        if sum(chunk) > 0:
+            return 1
+
+        return 0
 
     async def async_pipeline_from_audio_stream(*args, **kwargs):
         stt_stream = kwargs["stt_stream"]
@@ -591,8 +606,8 @@ async def test_empty_tts_output(
 
     with (
         patch(
-            "homeassistant.components.assist_pipeline.vad.WebRtcVad.is_speech",
-            new=is_speech,
+            "pymicro_vad.MicroVad.Process10ms",
+            new=process_10ms,
         ),
         patch(
             "homeassistant.components.voip.voip.async_pipeline_from_audio_stream",
-- 
GitLab