diff --git a/llama-index-core/llama_index/core/embeddings/loading.py b/llama-index-core/llama_index/core/embeddings/loading.py
index 658df384db35dc6a642400e4b0165c8563f99b02..fd84ee64ed7f1b44e75869c3910442d8585f98f8 100644
--- a/llama-index-core/llama_index/core/embeddings/loading.py
+++ b/llama-index-core/llama_index/core/embeddings/loading.py
@@ -7,6 +7,34 @@ RECOGNIZED_EMBEDDINGS: Dict[str, Type[BaseEmbedding]] = {
     MockEmbedding.class_name(): MockEmbedding,
 }
 
+# conditionals for llama-cloud support
+try:
+    from llama_index.embeddings.openai import OpenAIEmbedding  # pants: no-infer-dep
+
+    RECOGNIZED_EMBEDDINGS[OpenAIEmbedding.class_name()] = OpenAIEmbedding
+except ImportError:
+    pass
+
+try:
+    from llama_index.embeddings.azure_openai import (
+        AzureOpenAIEmbedding,
+    )  # pants: no-infer-dep
+
+    RECOGNIZED_EMBEDDINGS[AzureOpenAIEmbedding.class_name()] = AzureOpenAIEmbedding
+except ImportError:
+    pass
+
+try:
+    from llama_index.embeddings.huggingface import (
+        HuggingFaceInferenceAPIEmbedding,
+    )  # pants: no-infer-dep
+
+    RECOGNIZED_EMBEDDINGS[
+        HuggingFaceInferenceAPIEmbedding.class_name()
+    ] = HuggingFaceInferenceAPIEmbedding
+except ImportError:
+    pass
+
 
 def load_embed_model(data: dict) -> BaseEmbedding:
     """Load Embedding by name."""
diff --git a/llama-index-core/llama_index/core/llms/loading.py b/llama-index-core/llama_index/core/llms/loading.py
index 4b7ffbda961611e4fc6603753233effd16584747..7271115f1e9657ae8bf86dac043854c6dbd3389d 100644
--- a/llama-index-core/llama_index/core/llms/loading.py
+++ b/llama-index-core/llama_index/core/llms/loading.py
@@ -9,6 +9,30 @@ RECOGNIZED_LLMS: Dict[str, Type[LLM]] = {
     CustomLLM.class_name(): CustomLLM,
 }
 
+# Conditionals for llama-cloud support
+try:
+    from llama_index.llms.openai import OpenAI  # pants: no-infer-dep
+
+    RECOGNIZED_LLMS[OpenAI.class_name()] = OpenAI
+except ImportError:
+    pass
+
+try:
+    from llama_index.llms.azure_openai import AzureOpenAI  # pants: no-infer-dep
+
+    RECOGNIZED_LLMS[AzureOpenAI.class_name()] = AzureOpenAI
+except ImportError:
+    pass
+
+try:
+    from llama_index.llms.huggingface import (
+        HuggingFaceInferenceAPI,
+    )  # pants: no-infer-dep
+
+    RECOGNIZED_LLMS[HuggingFaceInferenceAPI.class_name()] = HuggingFaceInferenceAPI
+except ImportError:
+    pass
+
 
 def load_llm(data: dict) -> LLM:
     """Load LLM by name."""
diff --git a/llama-index-core/llama_index/core/node_parser/__init__.py b/llama-index-core/llama_index/core/node_parser/__init__.py
index 6bf4bbf5ee58751d313831337e97ddf26e2b03a9..13a3c11c4972b49ef42f12a8c32bff8a8fe23486 100644
--- a/llama-index-core/llama_index/core/node_parser/__init__.py
+++ b/llama-index-core/llama_index/core/node_parser/__init__.py
@@ -13,6 +13,8 @@ from llama_index.core.node_parser.relational.hierarchical import (
     HierarchicalNodeParser,
     get_leaf_nodes,
     get_root_nodes,
+    get_child_nodes,
+    get_deeper_nodes,
 )
 from llama_index.core.node_parser.relational.markdown_element import (
     MarkdownElementNodeParser,
@@ -53,6 +55,8 @@ __all__ = [
     "UnstructuredElementNodeParser",
     "get_leaf_nodes",
     "get_root_nodes",
+    "get_child_nodes",
+    "get_deeper_nodes",
     # deprecated, for backwards compatibility
     "SimpleNodeParser",
 ]
diff --git a/llama-index-core/llama_index/core/node_parser/relational/hierarchical.py b/llama-index-core/llama_index/core/node_parser/relational/hierarchical.py
index 96077a9147decb4acfa4251c0f1ad0818147abdb..e390978724350eebc931bed154f7591a538c6532 100644
--- a/llama-index-core/llama_index/core/node_parser/relational/hierarchical.py
+++ b/llama-index-core/llama_index/core/node_parser/relational/hierarchical.py
@@ -40,6 +40,54 @@ def get_root_nodes(nodes: List[BaseNode]) -> List[BaseNode]:
     return root_nodes
 
 
+def get_child_nodes(nodes: List[BaseNode], all_nodes: List[BaseNode]) -> List[BaseNode]:
+    """Get child nodes of nodes from given all_nodes.
+
+    Args:
+        nodes: Parent nodes whose CHILD relationships are followed.
+        all_nodes: Candidate nodes that are searched for those children.
+
+    Returns:
+        The nodes of all_nodes whose node_id is listed as a child of any
+        node in nodes, in the order they appear in all_nodes.
+    """
+    # Collect ids into a set so each membership test below is O(1) rather
+    # than a scan over a list of ids for every candidate in all_nodes.
+    children_ids = {
+        child.node_id
+        for node in nodes
+        for child in node.relationships.get(NodeRelationship.CHILD, [])
+    }
+    return [node for node in all_nodes if node.node_id in children_ids]
+
+
+def get_deeper_nodes(nodes: List[BaseNode], depth: int = 1) -> List[BaseNode]:
+    """Get children of root nodes in given nodes that have given depth.
+
+    Args:
+        nodes: Nodes to search; root nodes are found via get_root_nodes.
+        depth: Number of child levels to descend from the root nodes.
+            A depth of 0 returns the root nodes themselves.
+
+    Returns:
+        The nodes reached after following child links depth times.
+
+    Raises:
+        ValueError: If depth is negative or nodes has no root nodes.
+    """
+    if depth < 0:
+        raise ValueError("Depth cannot be a negative number!")
+    root_nodes = get_root_nodes(nodes)
+    if not root_nodes:
+        raise ValueError("There is no root nodes in given nodes!")
+
+    deeper_nodes = root_nodes
+    for _ in range(depth):
+        deeper_nodes = get_child_nodes(deeper_nodes, nodes)
+
+    return deeper_nodes
+
+
 class HierarchicalNodeParser(NodeParser):
     """Hierarchical node parser.
 
diff --git a/llama-index-core/tests/node_parser/test_hierarchical.py b/llama-index-core/tests/node_parser/test_hierarchical.py
new file mode 100644
index 0000000000000000000000000000000000000000..3515dafd4450ef98f4b74f463cd425b0171e865b
--- /dev/null
+++ b/llama-index-core/tests/node_parser/test_hierarchical.py
@@ -0,0 +1,69 @@
+import pytest
+
+from llama_index.core import Document
+from llama_index.core.node_parser import (
+    HierarchicalNodeParser,
+    get_child_nodes,
+    get_deeper_nodes,
+    get_leaf_nodes,
+    get_root_nodes,
+)
+
+ROOT_NODES_LEN = 1
+CHILDREN_NODES_LEN = 3
+GRAND_CHILDREN_NODES_LEN = 7
+
+
+@pytest.fixture(scope="module")
+def nodes() -> list:
+    node_parser = HierarchicalNodeParser.from_defaults(
+        chunk_sizes=[512, 128, 64],
+        chunk_overlap=10,
+    )
+    return node_parser.get_nodes_from_documents([Document.example()])
+
+
+def test_get_root_nodes(nodes: list) -> None:
+    root_nodes = get_root_nodes(nodes)
+    assert len(root_nodes) == ROOT_NODES_LEN
+
+
+def test_get_root_nodes_empty(nodes: list) -> None:
+    root_nodes = get_root_nodes(get_leaf_nodes(nodes))
+    assert root_nodes == []
+
+
+def test_get_leaf_nodes(nodes: list) -> None:
+    leaf_nodes = get_leaf_nodes(nodes)
+    assert len(leaf_nodes) == GRAND_CHILDREN_NODES_LEN
+
+
+def test_get_child_nodes(nodes: list) -> None:
+    child_nodes = get_child_nodes(get_root_nodes(nodes), all_nodes=nodes)
+    assert len(child_nodes) == CHILDREN_NODES_LEN
+
+
+def test_get_deeper_nodes(nodes: list) -> None:
+    deep_nodes = get_deeper_nodes(nodes, depth=0)
+    assert deep_nodes == get_root_nodes(nodes)
+
+    deep_nodes = get_deeper_nodes(nodes, depth=1)
+    assert deep_nodes == get_child_nodes(get_root_nodes(nodes), nodes)
+
+    deep_nodes = get_deeper_nodes(nodes, depth=2)
+    assert deep_nodes == get_leaf_nodes(nodes)
+
+    deep_nodes = get_deeper_nodes(nodes, depth=2)
+    assert deep_nodes == get_child_nodes(
+        get_child_nodes(get_root_nodes(nodes), nodes), nodes
+    )
+
+
+def test_get_deeper_nodes_with_no_root_nodes(nodes: list) -> None:
+    with pytest.raises(ValueError, match="There is no*"):
+        get_deeper_nodes(get_leaf_nodes(nodes))
+
+
+def test_get_deeper_nodes_with_negative_depth(nodes: list) -> None:
+    with pytest.raises(ValueError, match="Depth cannot be*"):
+        get_deeper_nodes(nodes, -1)
diff --git a/llama-index-integrations/indices/llama-index-indices-managed-llama-cloud/pyproject.toml b/llama-index-integrations/indices/llama-index-indices-managed-llama-cloud/pyproject.toml
index ee2ab768920a3786d96251daa6caf4f497476ecd..0b897b81e82584b23408f886f5a167621570cbf6 100644
--- a/llama-index-integrations/indices/llama-index-indices-managed-llama-cloud/pyproject.toml
+++ b/llama-index-integrations/indices/llama-index-indices-managed-llama-cloud/pyproject.toml
@@ -9,6 +9,11 @@ check-hidden = true
 # work through many typos (--write-changes and --interactive will help)
 skip = "*.csv,*.html,*.json,*.jsonl,*.pdf,*.txt,*.ipynb"
 
+[tool.llamahub]
+classes = ["LlamaCloudIndex", "LlamaCloudRetriever"]
+contains_example = false
+import_path = "llama_index.indices.managed.llama_cloud"
+
 [tool.mypy]
 disallow_untyped_defs = true
 # Remove venv skip when integrated with pre-commit
@@ -20,7 +25,7 @@ python_version = "3.8"
 authors = ["Logan Markewich <logan@llamaindex.ai>"]
 description = "llama-index indices llama-cloud integration"
 license = "MIT"
-name = "llama-index-indices-llama-cloud"
+name = "llama-index-indices-managed-llama-cloud"
 packages = [{include = "llama_index/"}]
 readme = "README.md"
 version = "0.1.0"
diff --git a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/beautiful_soup_web/base.py b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/beautiful_soup_web/base.py
index 32fa11234dab0cdb105a725401020cd6b8df8cdc..384ee2d1814768183245ad9fe4c2dca55001ecc9 100644
--- a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/beautiful_soup_web/base.py
+++ b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/beautiful_soup_web/base.py
@@ -4,6 +4,7 @@ import logging
 from typing import Any, Callable, Dict, List, Optional, Tuple
 from urllib.parse import urljoin
 
+from llama_index.core.bridge.pydantic import PrivateAttr
 from llama_index.core.readers.base import BasePydanticReader
 from llama_index.core.schema import Document
 
@@ -144,7 +145,11 @@ class BeautifulSoupWebReader(BasePydanticReader):
     """
 
     is_remote: bool = True
-    website_extractor: Dict[str, Callable] = DEFAULT_WEBSITE_EXTRACTOR
+    _website_extractor: Dict[str, Callable] = PrivateAttr()
+
+    def __init__(self, website_extractor: Optional[Dict[str, Callable]] = None) -> None:
+        self._website_extractor = website_extractor or DEFAULT_WEBSITE_EXTRACTOR
+        super().__init__()
 
     @classmethod
     def class_name(cls) -> str:
@@ -187,8 +192,8 @@ class BeautifulSoupWebReader(BasePydanticReader):
 
             data = ""
             extra_info = {"URL": url}
-            if hostname in self.website_extractor:
-                data, metadata = self.website_extractor[hostname](
+            if hostname in self._website_extractor:
+                data, metadata = self._website_extractor[hostname](
                     soup=soup, url=url, include_url_in_text=include_url_in_text
                 )
                 extra_info.update(metadata)