Skip to content
Snippets Groups Projects
Unverified commit 3c65db29, authored by Massimiliano Pippi and committed by GitHub
Browse files

fix: respect max_depth in KnowledgeBaseWebReader (#17949)

parent 1da81885
Branches
Tags
No related merge requests found
......@@ -127,7 +127,12 @@ class KnowledgeBaseWebReader(BaseReader):
return {"title": title, "subtitle": subtitle, "body": body, "url": url}
def get_article_urls(
self, browser: Any, root_url: str, current_url: str, max_depth: int = 100
self,
browser: Any,
root_url: str,
current_url: str,
max_depth: int = 100,
depth: int = 0,
) -> List[str]:
"""
Recursively crawl through the knowledge base to find a list of articles.
......@@ -136,11 +141,17 @@ class KnowledgeBaseWebReader(BaseReader):
browser (Any): a Playwright Chromium browser.
root_url (str): root URL of the knowledge base.
current_url (str): current URL that is being crawled.
max_depth (int): maximum recursion level for the crawler
depth (int): current depth level
Returns:
List[str]: a list of URLs of found articles.
"""
if depth >= max_depth:
print(f"Reached max depth ({max_depth}): {current_url}")
return []
page = browser.new_page(ignore_https_errors=True)
page.set_default_timeout(60000)
page.goto(current_url, wait_until="domcontentloaded")
......@@ -162,7 +173,7 @@ class KnowledgeBaseWebReader(BaseReader):
for link in links:
url = root_url + page.evaluate("(node) => node.getAttribute('href')", link)
article_urls.extend(
self.get_article_urls(browser, root_url, url, max_depth)
self.get_article_urls(browser, root_url, url, max_depth, depth + 1)
)
page.close()
......
......@@ -46,7 +46,7 @@ license = "GPL-3.0-or-later"
maintainers = ["HawkClaws", "Hironsan", "NA", "an-bluecat", "bborn", "jasonwcfan", "kravetsmic", "pandazki", "ruze00", "selamanse", "thejessezhang"]
name = "llama-index-readers-web"
readme = "README.md"
version = "0.3.5"
version = "0.3.6"
[tool.poetry.dependencies]
python = ">=3.9,<4.0"
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment