From 3c65db2947271de3bd1927dc66a044da385de4da Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi <mpippi@gmail.com> Date: Thu, 27 Feb 2025 22:14:21 +0100 Subject: [PATCH] fix: respect max_depth in KnowledgeBaseWebReader (#17949) --- .../readers/web/knowledge_base/base.py | 15 +++++++++++++-- .../llama-index-readers-web/pyproject.toml | 2 +- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/knowledge_base/base.py b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/knowledge_base/base.py index faddf4dd2..b61cb881b 100644 --- a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/knowledge_base/base.py +++ b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/knowledge_base/base.py @@ -127,7 +127,12 @@ class KnowledgeBaseWebReader(BaseReader): return {"title": title, "subtitle": subtitle, "body": body, "url": url} def get_article_urls( - self, browser: Any, root_url: str, current_url: str, max_depth: int = 100 + self, + browser: Any, + root_url: str, + current_url: str, + max_depth: int = 100, + depth: int = 0, ) -> List[str]: """ Recursively crawl through the knowledge base to find a list of articles. @@ -136,11 +141,17 @@ class KnowledgeBaseWebReader(BaseReader): browser (Any): a Playwright Chromium browser. root_url (str): root URL of the knowledge base. current_url (str): current URL that is being crawled. + max_depth (int): maximum recursion depth of the crawl. + depth (int): current recursion depth (0 for the root call). Returns: List[str]: a list of URLs of found articles. 
""" + if depth >= max_depth: + print(f"Reached max depth ({max_depth}): {current_url}") + return [] + page = browser.new_page(ignore_https_errors=True) page.set_default_timeout(60000) page.goto(current_url, wait_until="domcontentloaded") @@ -162,7 +173,7 @@ class KnowledgeBaseWebReader(BaseReader): for link in links: url = root_url + page.evaluate("(node) => node.getAttribute('href')", link) article_urls.extend( - self.get_article_urls(browser, root_url, url, max_depth) + self.get_article_urls(browser, root_url, url, max_depth, depth + 1) ) page.close() diff --git a/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml b/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml index 32b719a42..d8bb4dff7 100644 --- a/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml +++ b/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml @@ -46,7 +46,7 @@ license = "GPL-3.0-or-later" maintainers = ["HawkClaws", "Hironsan", "NA", "an-bluecat", "bborn", "jasonwcfan", "kravetsmic", "pandazki", "ruze00", "selamanse", "thejessezhang"] name = "llama-index-readers-web" readme = "README.md" -version = "0.3.5" +version = "0.3.6" [tool.poetry.dependencies] python = ">=3.9,<4.0" -- GitLab