diff --git a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/knowledge_base/base.py b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/knowledge_base/base.py index faddf4dd205f798f3ffb012b1bf5266ea199666f..b61cb881b9fc0142ab7d866e13696d8c29cb79ba 100644 --- a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/knowledge_base/base.py +++ b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/knowledge_base/base.py @@ -127,7 +127,12 @@ class KnowledgeBaseWebReader(BaseReader): return {"title": title, "subtitle": subtitle, "body": body, "url": url} def get_article_urls( - self, browser: Any, root_url: str, current_url: str, max_depth: int = 100 + self, + browser: Any, + root_url: str, + current_url: str, + max_depth: int = 100, + depth: int = 0, ) -> List[str]: """ Recursively crawl through the knowledge base to find a list of articles. @@ -136,11 +141,17 @@ class KnowledgeBaseWebReader(BaseReader): browser (Any): a Playwright Chromium browser. root_url (str): root URL of the knowledge base. current_url (str): current URL that is being crawled. + max_depth (int): maximum recursion level for the crawler + depth (int): current depth level Returns: List[str]: a list of URLs of found articles. """ + if depth >= max_depth: + print(f"Reached max depth ({max_depth}): {current_url}") + return [] + page = browser.new_page(ignore_https_errors=True) page.set_default_timeout(60000) page.goto(current_url, wait_until="domcontentloaded") @@ -162,7 +173,7 @@ class KnowledgeBaseWebReader(BaseReader): for link in links: url = root_url + page.evaluate("(node) => node.getAttribute('href')", link) article_urls.extend( - self.get_article_urls(browser, root_url, url, max_depth) + self.get_article_urls(browser, root_url, url, max_depth, depth + 1) ) page.close() diff --git a/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml b/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml index 32b719a4228d3151ae14cb25236fc8d82dc84083..d8bb4dff767841c9b70fd4753a7ba004f2144405 100644 --- a/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml +++ b/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml @@ -46,7 +46,7 @@ license = "GPL-3.0-or-later" maintainers = ["HawkClaws", "Hironsan", "NA", "an-bluecat", "bborn", "jasonwcfan", "kravetsmic", "pandazki", "ruze00", "selamanse", "thejessezhang"] name = "llama-index-readers-web" readme = "README.md" -version = "0.3.5" +version = "0.3.6" [tool.poetry.dependencies] python = ">=3.9,<4.0"