diff --git a/habitat/utils/visualizations/utils.py b/habitat/utils/visualizations/utils.py
index b497517711b4556c4f7a778e18d4b5231f12b76f..f86d586fb979af6d7827acf6610fe8675ab22358 100644
--- a/habitat/utils/visualizations/utils.py
+++ b/habitat/utils/visualizations/utils.py
@@ -164,18 +164,26 @@ def observations_to_image(observation: Dict, info: Dict) -> np.ndarray:
     Returns:
         generated image of a single frame.
     """
-    observation_size = observation["rgb"].shape[0]
-    egocentric_view = observation["rgb"][:, :, :3]
-    # draw collision
-    if "collisions" in info and info["collisions"]["is_collision"]:
-        egocentric_view = draw_collision(egocentric_view)
+    egocentric_view = []
+    if "rgb" in observation:
+        observation_size = observation["rgb"].shape[0]
+        egocentric_view.append(observation["rgb"][:, :, :3])
 
     # draw depth map if observation has depth info
     if "depth" in observation:
+        observation_size = observation["depth"].shape[0]
         depth_map = (observation["depth"].squeeze() * 255).astype(np.uint8)
         depth_map = np.stack([depth_map for _ in range(3)], axis=2)
+        egocentric_view.append(depth_map)
+
+    assert (
+        len(egocentric_view) > 0
+    ), "Expected at least one visual sensor enabled."
+    egocentric_view = np.concatenate(egocentric_view, axis=1)
 
-        egocentric_view = np.concatenate((egocentric_view, depth_map), axis=1)
+    # draw collision
+    if "collisions" in info and info["collisions"]["is_collision"]:
+        egocentric_view = draw_collision(egocentric_view)
 
     frame = egocentric_view
 
diff --git a/habitat_baselines/rl/requirements.txt b/habitat_baselines/rl/requirements.txt
index 1f31a714d5a20cec3a7a9adcb58d7c8d75372df9..3c0a2cfc40e26fec7a6809da834938864cfb8442 100644
--- a/habitat_baselines/rl/requirements.txt
+++ b/habitat_baselines/rl/requirements.txt
@@ -1,3 +1,4 @@
+moviepy>=1.0.1
 torch==1.1.0
 # full tensorflow required for tensorboard video support
 tensorflow==1.13.1
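
For reference, a minimal sketch (not part of the patch) of how the revised
observations_to_image now assembles the frame: each enabled visual sensor is
appended to a list and the views are concatenated side by side before any
collision overlay is drawn. The shapes and observation/info values below are
illustrative only, assuming the default 256x256 RGB and depth sensors:

    import numpy as np

    # illustrative observations; real ones come from the simulator
    observation = {
        "rgb": np.zeros((256, 256, 3), dtype=np.uint8),
        "depth": np.ones((256, 256, 1), dtype=np.float32),
    }
    info = {"collisions": {"is_collision": False}}

    views = []
    if "rgb" in observation:
        views.append(observation["rgb"][:, :, :3])
    if "depth" in observation:
        # scale depth to 0-255 and tile to 3 channels so it can sit next to RGB
        depth_map = (observation["depth"].squeeze() * 255).astype(np.uint8)
        views.append(np.stack([depth_map] * 3, axis=2))

    assert len(views) > 0, "Expected at least one visual sensor enabled."
    frame = np.concatenate(views, axis=1)  # e.g. (256, 512, 3) with both sensors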