From 3c65db2947271de3bd1927dc66a044da385de4da Mon Sep 17 00:00:00 2001
From: Massimiliano Pippi <mpippi@gmail.com>
Date: Thu, 27 Feb 2025 22:14:21 +0100
Subject: [PATCH] fix: respect max_depth in KnowledgeBaseWebReader (#17949)

---
 .../readers/web/knowledge_base/base.py            | 15 +++++++++++++--
 .../llama-index-readers-web/pyproject.toml        |  2 +-
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/knowledge_base/base.py b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/knowledge_base/base.py
index faddf4dd2..b61cb881b 100644
--- a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/knowledge_base/base.py
+++ b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/knowledge_base/base.py
@@ -127,7 +127,12 @@ class KnowledgeBaseWebReader(BaseReader):
         return {"title": title, "subtitle": subtitle, "body": body, "url": url}
 
     def get_article_urls(
-        self, browser: Any, root_url: str, current_url: str, max_depth: int = 100
+        self,
+        browser: Any,
+        root_url: str,
+        current_url: str,
+        max_depth: int = 100,
+        depth: int = 0,
     ) -> List[str]:
         """
         Recursively crawl through the knowledge base to find a list of articles.
@@ -136,11 +141,17 @@ class KnowledgeBaseWebReader(BaseReader):
             browser (Any): a Playwright Chromium browser.
             root_url (str): root URL of the knowledge base.
             current_url (str): current URL that is being crawled.
+            max_depth (int): maximum recursion depth of the crawler.
+            depth (int): current recursion depth (0 for the root call).
 
         Returns:
             List[str]: a list of URLs of found articles.
 
         """
+        if depth >= max_depth:
+            print(f"Reached max depth ({max_depth}): {current_url}")
+            return []
+
         page = browser.new_page(ignore_https_errors=True)
         page.set_default_timeout(60000)
         page.goto(current_url, wait_until="domcontentloaded")
@@ -162,7 +173,7 @@ class KnowledgeBaseWebReader(BaseReader):
         for link in links:
             url = root_url + page.evaluate("(node) => node.getAttribute('href')", link)
             article_urls.extend(
-                self.get_article_urls(browser, root_url, url, max_depth)
+                self.get_article_urls(browser, root_url, url, max_depth, depth + 1)
             )
 
         page.close()
diff --git a/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml b/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml
index 32b719a42..d8bb4dff7 100644
--- a/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml
+++ b/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml
@@ -46,7 +46,7 @@ license = "GPL-3.0-or-later"
 maintainers = ["HawkClaws", "Hironsan", "NA", "an-bluecat", "bborn", "jasonwcfan", "kravetsmic", "pandazki", "ruze00", "selamanse", "thejessezhang"]
 name = "llama-index-readers-web"
 readme = "README.md"
-version = "0.3.5"
+version = "0.3.6"
 
 [tool.poetry.dependencies]
 python = ">=3.9,<4.0"
-- 
GitLab