diff --git a/.github/workflows/publish_release.yml b/.github/workflows/publish_release.yml
index a9f0017858a17f1bb109ebac7b25c85d7258c6a4..bebd6d67a2166a9527ad321b2b18266a58246f6f 100644
--- a/.github/workflows/publish_release.yml
+++ b/.github/workflows/publish_release.yml
@@ -7,6 +7,10 @@ on:
   workflow_dispatch:
 
+env:
+  POETRY_VERSION: "1.6.1"
+  PYTHON_VERSION: "3.9"
+
 jobs:
   build-n-publish:
     name: Build and publish to PyPI
 
@@ -14,6 +18,23 @@ jobs:
 
     steps:
       - uses: actions/checkout@v3
+      - name: Set up python ${{ env.PYTHON_VERSION }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ env.PYTHON_VERSION }}
+      - name: Install Poetry
+        uses: snok/install-poetry@v1
+        with:
+          version: ${{ env.POETRY_VERSION }}
+      - name: Install deps
+        shell: bash
+        run: poetry install
+      - name: Cache tiktoken and nltk files
+        shell: bash
+        run: python -c "from llama_index import get_tokenizer; get_tokenizer()"
+      - name: Clean up zip files
+        shell: bash
+        run: rm -rf llama_index/_static/nltk_cache/corpora/stopwords.zip llama_index/_static/nltk_cache/tokenizers/punkt.zip
       - name: Build and publish to pypi
         uses: JRubics/poetry-publish@v1.17
         with:
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index fd0ace912792f84c5998a4475ee30b322406b8e1..e3ae90f4736c9b94735c1bb3e33732b287049de9 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -10,24 +10,30 @@ repos:
       - id: check-merge-conflict
       - id: check-symlinks
       - id: check-toml
+        exclude: llama_index/_static
       - id: check-yaml
+        exclude: llama_index/_static
      - id: detect-private-key
       - id: end-of-file-fixer
+        exclude: llama_index/_static
       - id: mixed-line-ending
+        exclude: llama_index/_static
       - id: trailing-whitespace
+        exclude: llama_index/_static
   - repo: https://github.com/charliermarsh/ruff-pre-commit
     rev: v0.1.5
     hooks:
       - id: ruff
         args: [--fix, --exit-non-zero-on-fix]
+        exclude: llama_index/_static
   - repo: https://github.com/psf/black-pre-commit-mirror
     rev: 23.10.1
     hooks:
       - id: black-jupyter
         name: black-src
         alias: black
-        exclude: docs/
+        exclude: ^(docs/|llama_index/_static)
   - repo: https://github.com/psf/black-pre-commit-mirror
     rev: 23.10.1
     hooks:
@@ -51,11 +57,13 @@
     rev: v3.0.3
     hooks:
       - id: prettier
+        exclude: llama_index/_static
   - repo: https://github.com/codespell-project/codespell
     rev: v2.2.6
     hooks:
       - id: codespell
         additional_dependencies: [tomli]
+        exclude: llama_index/_static
   - repo: https://github.com/srstevenson/nb-clean
     rev: 3.1.0
     hooks:
@@ -65,4 +73,4 @@
     rev: v0.23.1
     hooks:
       - id: toml-sort-fix
-        exclude: poetry.lock
+        exclude: ^(poetry.lock|llama_index/_static)
diff --git a/llama_index/_static/nltk_cache/.gitignore b/llama_index/_static/nltk_cache/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..046c31c1546f89ae46ab263f05e2ff11dc927442
--- /dev/null
+++ b/llama_index/_static/nltk_cache/.gitignore
@@ -0,0 +1,2 @@
+# Include this file
+!.gitignore
diff --git a/llama_index/_static/tiktoken_cache/.gitignore b/llama_index/_static/tiktoken_cache/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..046c31c1546f89ae46ab263f05e2ff11dc927442
--- /dev/null
+++ b/llama_index/_static/tiktoken_cache/.gitignore
@@ -0,0 +1,2 @@
+# Include this file
+!.gitignore
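The two cache-related workflow steps above work by side effect: per the `llama_index/utils.py` changes later in this diff, constructing `GlobalsHelper` downloads the NLTK stopwords and punkt data into `llama_index/_static/nltk_cache`, so a bare import of `llama_index` (which creates the module-level `globals_helper` instance) should populate the NLTK cache, while calling `get_tokenizer()` fetches the tiktoken BPE file into `llama_index/_static/tiktoken_cache`. A minimal local sketch of that warm-up, with assertion paths inferred from this diff rather than taken from the workflow itself:

# warm_caches.py -- hypothetical local equivalent of the "Cache tiktoken and
# nltk files" CI step; run from the repo root before `poetry build`.
import os

# Importing triggers GlobalsHelper(), which downloads stopwords/punkt into
# llama_index/_static/nltk_cache; calling get_tokenizer() populates
# llama_index/_static/tiktoken_cache (assuming TIKTOKEN_CACHE_DIR is unset).
from llama_index import get_tokenizer

get_tokenizer()

static_dir = os.path.join("llama_index", "_static")
assert os.listdir(os.path.join(static_dir, "nltk_cache")), "nltk cache is empty"
assert os.listdir(os.path.join(static_dir, "tiktoken_cache")), "tiktoken cache is empty"

The follow-up `rm -rf` step then keeps the extracted NLTK data but drops the redundant download archives, so the zips are not shipped in the published package.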
diff --git a/llama_index/finetuning/cross_encoders/dataset_gen.py b/llama_index/finetuning/cross_encoders/dataset_gen.py
index a594c221ab8ca540fb906e780b9a4c0236d58c03..4fe35f555b9e20a33347feac551fff6c0b0f5890 100644
--- a/llama_index/finetuning/cross_encoders/dataset_gen.py
+++ b/llama_index/finetuning/cross_encoders/dataset_gen.py
@@ -3,10 +3,9 @@ import re
 from dataclasses import dataclass
 from typing import List, Optional
 
-import tiktoken
 from tqdm.auto import tqdm
 
-from llama_index import VectorStoreIndex
+from llama_index import VectorStoreIndex, get_tokenizer
 from llama_index.llms import ChatMessage, OpenAI
 from llama_index.llms.llm import LLM
 from llama_index.node_parser import TokenTextSplitter
@@ -46,7 +45,7 @@ def generate_synthetic_queries_over_documents(
         chunk_size=max_chunk_length,
         chunk_overlap=0,
         backup_separators=["\n"],
-        tokenizer=tiktoken.encoding_for_model("gpt-3.5-turbo").encode,
+        tokenizer=get_tokenizer(),
     )
 
     llm = llm or OpenAI(model="gpt-3.5-turbo-16k", temperature=0.3)
@@ -123,7 +122,7 @@ def generate_ce_fine_tuning_dataset(
         chunk_size=max_chunk_length,
         chunk_overlap=0,
         backup_separators=["\n"],
-        tokenizer=tiktoken.encoding_for_model("gpt-3.5-turbo").encode,
+        tokenizer=get_tokenizer(),
     )
 
     # Use logit bias in case of OpenAI for the tokens for Yes and No
diff --git a/llama_index/indices/keyword_table/utils.py b/llama_index/indices/keyword_table/utils.py
index e3d05648620cd32ae89fedb9a053c639d6df01a7..d6ec7363c1095150da81e8ede349a03f5576a34a 100644
--- a/llama_index/indices/keyword_table/utils.py
+++ b/llama_index/indices/keyword_table/utils.py
@@ -29,8 +29,6 @@ def rake_extract_keywords(
     """Extract keywords with RAKE."""
     try:
         import nltk
-
-        nltk.download("punkt")
     except ImportError:
         raise ImportError("Please install nltk: `pip install nltk`")
     try:
@@ -38,7 +36,10 @@ def rake_extract_keywords(
     except ImportError:
         raise ImportError("Please install rake_nltk: `pip install rake_nltk`")
 
-    r = Rake()
+    r = Rake(
+        sentence_tokenizer=nltk.tokenize.sent_tokenize,
+        word_tokenizer=nltk.tokenize.wordpunct_tokenize,
+    )
     r.extract_keywords_from_text(text_chunk)
     keywords = r.get_ranked_phrases()[:max_keywords]
     if expand_with_subtokens:
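Both `TokenTextSplitter` call sites above trade the inline `tiktoken.encoding_for_model("gpt-3.5-turbo").encode` for the shared `get_tokenizer()` helper, which builds the same encoder once, caches its BPE file under `_static/tiktoken_cache`, and registers it globally. A rough equivalence sketch, assuming `tiktoken` is installed (variable names are illustrative only):

from functools import partial

import tiktoken

from llama_index import get_tokenizer

text = "Generate synthetic queries over these documents."

# What each call site previously constructed inline:
old_tokenize = tiktoken.encoding_for_model("gpt-3.5-turbo").encode

# What get_tokenizer() now returns (per llama_index/utils.py in this diff);
# allowed_special="all" only changes how special tokens are treated.
enc = tiktoken.encoding_for_model("gpt-3.5-turbo")
new_tokenize = partial(enc.encode, allowed_special="all")

# Identical token ids for ordinary text.
assert old_tokenize(text) == new_tokenize(text) == get_tokenizer()(text)

The `Rake(...)` change in `rake_extract_keywords` serves the same goal: `wordpunct_tokenize` is regex-based and needs no downloaded model, and `sent_tokenize` resolves punkt from the bundled cache, replacing the removed ad-hoc `nltk.download("punkt")` call.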
diff --git a/llama_index/memory/chat_memory_buffer.py b/llama_index/memory/chat_memory_buffer.py
index baa22c299dab88d4f7e2f7c53730c4df59c789dd..8592394ac465ee70d3701df86f4bbf09727b9b6c 100644
--- a/llama_index/memory/chat_memory_buffer.py
+++ b/llama_index/memory/chat_memory_buffer.py
@@ -1,10 +1,10 @@
-from typing import Any, Callable, Dict, List, Optional, cast
+from typing import Any, Callable, Dict, List, Optional
 
 from llama_index.bridge.pydantic import Field, root_validator
 from llama_index.llms.llm import LLM
 from llama_index.llms.types import ChatMessage, MessageRole
 from llama_index.memory.types import BaseMemory
-from llama_index.utils import GlobalsHelper
+from llama_index.utils import get_tokenizer
 
 DEFUALT_TOKEN_LIMIT_RATIO = 0.75
 DEFAULT_TOKEN_LIMIT = 3000
@@ -16,7 +16,7 @@ class ChatMemoryBuffer(BaseMemory):
     token_limit: int
     tokenizer_fn: Callable[[str], List] = Field(
         # NOTE: mypy does not handle the typing here well, hence the cast
-        default_factory=cast(Callable[[], Any], GlobalsHelper().tokenizer),
+        default_factory=get_tokenizer,
         exclude=True,
     )
     chat_history: List[ChatMessage] = Field(default_factory=list)
@@ -42,7 +42,7 @@ class ChatMemoryBuffer(BaseMemory):
         # Validate tokenizer -- this avoids errors when loading from json/dict
         tokenizer_fn = values.get("tokenizer_fn", None)
         if tokenizer_fn is None:
-            values["tokenizer_fn"] = GlobalsHelper().tokenizer
+            values["tokenizer_fn"] = get_tokenizer()
 
         return values
 
@@ -63,7 +63,7 @@ class ChatMemoryBuffer(BaseMemory):
 
         return cls(
             token_limit=token_limit,
-            tokenizer_fn=tokenizer_fn or GlobalsHelper().tokenizer,
+            tokenizer_fn=tokenizer_fn or get_tokenizer(),
             chat_history=chat_history or [],
         )
 
diff --git a/llama_index/node_parser/text/utils.py b/llama_index/node_parser/text/utils.py
index 1f581c43c369757af2ffcf5eceecebc28a1408cb..67465770e4598dd599a59be74d1d814d3e8ded13 100644
--- a/llama_index/node_parser/text/utils.py
+++ b/llama_index/node_parser/text/utils.py
@@ -35,31 +35,8 @@ def split_by_char() -> Callable[[str], List[str]]:
 
 
 def split_by_sentence_tokenizer() -> Callable[[str], List[str]]:
-    import os
-
     import nltk
 
-    from llama_index.utils import get_cache_dir
-
-    cache_dir = get_cache_dir()
-    nltk_data_dir = os.environ.get("NLTK_DATA", cache_dir)
-
-    # update nltk path for nltk so that it finds the data
-    if nltk_data_dir not in nltk.data.path:
-        nltk.data.path.append(nltk_data_dir)
-
-    try:
-        nltk.data.find("tokenizers/punkt")
-    except LookupError:
-        try:
-            nltk.download("punkt", download_dir=nltk_data_dir)
-        except FileExistsError:
-            logger.info(
-                "Tried to re-download NLTK files but already exists. "
-                "This could happen in multi-theaded deployments, "
-                "should be benign"
-            )
-
     tokenizer = nltk.tokenize.PunktSentenceTokenizer()
 
     # get the spans and then return the sentences
diff --git a/llama_index/postprocessor/optimizer.py b/llama_index/postprocessor/optimizer.py
index b5b80fe4961751ceb424d83f888036a94e6335fa..c811e76413e3fc4f0a8ced172d136d92df80ee82 100644
--- a/llama_index/postprocessor/optimizer.py
+++ b/llama_index/postprocessor/optimizer.py
@@ -66,24 +66,8 @@ class SentenceEmbeddingOptimizer(BaseNodePostprocessor):
         self._embed_model = embed_model or OpenAIEmbedding()
 
         if tokenizer_fn is None:
-            import os
-
             import nltk.data
 
-            from llama_index.utils import get_cache_dir
-
-            cache_dir = get_cache_dir()
-            nltk_data_dir = os.environ.get("NLTK_DATA", cache_dir)
-
-            # update nltk path for nltk so that it finds the data
-            if nltk_data_dir not in nltk.data.path:
-                nltk.data.path.append(nltk_data_dir)
-
-            try:
-                nltk.data.find("tokenizers/punkt")
-            except LookupError:
-                nltk.download("punkt", download_dir=nltk_data_dir)
-
             tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
             tokenizer_fn = tokenizer.tokenize
         self._tokenizer_fn = tokenizer_fn
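With the `default_factory` switched to `get_tokenizer`, a `ChatMemoryBuffer` built without an explicit `tokenizer_fn` counts tokens with the shared tiktoken-backed callable instead of the deprecated `GlobalsHelper().tokenizer`. A small usage sketch (the message text and limit are arbitrary):

from llama_index.llms import ChatMessage, MessageRole
from llama_index.memory.chat_memory_buffer import ChatMemoryBuffer

# No tokenizer_fn given, so the Field default_factory (get_tokenizer) applies;
# the same fallback guards the root_validator and from_defaults paths above.
buffer = ChatMemoryBuffer.from_defaults(token_limit=64)
buffer.put(ChatMessage(role=MessageRole.USER, content="hello world"))

# History is trimmed against token_limit using that tokenizer.
print(buffer.get())

The node parser and optimizer hunks delete their per-call download logic for the same reason: punkt is already on `nltk.data.path` by the time these functions run, because `GlobalsHelper` set it up at import time.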
diff --git a/llama_index/utils.py b/llama_index/utils.py
index 32a0e582c8e74fc272add381abf771cbc01a789e..585f5be9d491845379f6cbeecba00b819d441621 100644
--- a/llama_index/utils.py
+++ b/llama_index/utils.py
@@ -25,7 +25,6 @@ from typing import (
     Set,
     Type,
     Union,
-    cast,
     runtime_checkable,
 )
 
@@ -38,24 +37,34 @@ class GlobalsHelper:
 
     """
 
-    _tokenizer: Optional[Callable[[str], List]] = None
     _stopwords: Optional[List[str]] = None
+    _nltk_data_dir: Optional[str] = None
+
+    def __init__(self) -> None:
+        """Initialize NLTK stopwords and punkt."""
+        import nltk
+
+        self._nltk_data_dir = os.environ.get(
+            "NLTK_DATA",
+            os.path.join(
+                os.path.dirname(os.path.abspath(__file__)),
+                "_static/nltk_cache",
+            ),
+        )
 
-    @property
-    def tokenizer(self) -> Callable[[str], List]:
-        """Get tokenizer. TODO: Deprecated."""
-        if self._tokenizer is None:
-            tiktoken_import_err = (
-                "`tiktoken` package not found, please run `pip install tiktoken`"
-            )
-            try:
-                import tiktoken
-            except ImportError:
-                raise ImportError(tiktoken_import_err)
-            enc = tiktoken.get_encoding("gpt2")
-            self._tokenizer = cast(Callable[[str], List], enc.encode)
-            self._tokenizer = partial(self._tokenizer, allowed_special="all")
-        return self._tokenizer  # type: ignore
+        if self._nltk_data_dir not in nltk.data.path:
+            nltk.data.path.append(self._nltk_data_dir)
+
+        # ensure access to data is there
+        try:
+            nltk.data.find("corpora/stopwords", paths=[self._nltk_data_dir])
+        except LookupError:
+            nltk.download("stopwords", download_dir=self._nltk_data_dir)
+
+        try:
+            nltk.data.find("tokenizers/punkt", paths=[self._nltk_data_dir])
+        except LookupError:
+            nltk.download("punkt", download_dir=self._nltk_data_dir)
 
     @property
     def stopwords(self) -> List[str]:
@@ -69,19 +78,10 @@ class GlobalsHelper:
                 "`nltk` package not found, please run `pip install nltk`"
             )
 
-            from llama_index.utils import get_cache_dir
-
-            cache_dir = get_cache_dir()
-            nltk_data_dir = os.environ.get("NLTK_DATA", cache_dir)
-
-            # update nltk path for nltk so that it finds the data
-            if nltk_data_dir not in nltk.data.path:
-                nltk.data.path.append(nltk_data_dir)
-
             try:
-                nltk.data.find("corpora/stopwords")
+                nltk.data.find("corpora/stopwords", paths=[self._nltk_data_dir])
             except LookupError:
-                nltk.download("stopwords", download_dir=nltk_data_dir)
+                nltk.download("stopwords", download_dir=self._nltk_data_dir)
 
             self._stopwords = stopwords.words("english")
         return self._stopwords
@@ -116,10 +116,23 @@ def get_tokenizer() -> Callable[[str], List]:
             import tiktoken
         except ImportError:
             raise ImportError(tiktoken_import_err)
+
+        # set tokenizer cache temporarily
+        should_revert = False
+        if "TIKTOKEN_CACHE_DIR" not in os.environ:
+            should_revert = True
+            os.environ["TIKTOKEN_CACHE_DIR"] = os.path.join(
+                os.path.dirname(os.path.abspath(__file__)),
+                "_static/tiktoken_cache",
+            )
+
         enc = tiktoken.encoding_for_model("gpt-3.5-turbo")
         tokenizer = partial(enc.encode, allowed_special="all")
         set_global_tokenizer(tokenizer)
 
+        if should_revert:
+            del os.environ["TIKTOKEN_CACHE_DIR"]
+
         assert llama_index.global_tokenizer is not None
         return llama_index.global_tokenizer
 
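`get_tokenizer` only redirects tiktoken to the bundled cache when the caller has not set `TIKTOKEN_CACHE_DIR`, and it deletes the variable again afterwards so the default never leaks into user code. The guard in isolation, as a sketch with a placeholder loader (note the hunk above has no try/finally, so an exception while loading would leave the variable set):

import os

def load_with_default_cache(cache_dir: str) -> None:
    """Mirror get_tokenizer()'s temporary TIKTOKEN_CACHE_DIR handling."""
    should_revert = False
    if "TIKTOKEN_CACHE_DIR" not in os.environ:
        should_revert = True
        os.environ["TIKTOKEN_CACHE_DIR"] = cache_dir
    try:
        ...  # load the encoding here; tiktoken reads TIKTOKEN_CACHE_DIR
    finally:
        # Revert only if we set the variable; a user-provided value stays.
        if should_revert:
            del os.environ["TIKTOKEN_CACHE_DIR"]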
diff --git a/pyproject.toml b/pyproject.toml
index 4eee9431aec03b84582907c24952ac786613e671..1c0ea8965fd7c176a749fb2e4884fd8b8f9412d0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -8,12 +8,12 @@ check-hidden = true
 ignore-words-list = "astroid,gallary,momento,narl,ot,rouge"
 # Feel free to un-skip examples, and experimental, you will just need to
 # work through many typos (--write-changes and --interactive will help)
-skip = "./examples,./experimental,*.csv,*.html,*.json,*.jsonl,*.pdf,*.txt,*.ipynb"
+skip = "./llama_index/_static,./examples,./experimental,*.csv,*.html,*.json,*.jsonl,*.pdf,*.txt,*.ipynb"
 
 [tool.mypy]
 disallow_untyped_defs = true
 # Remove venv skip when integrated with pre-commit
-exclude = ["build", "examples", "notebooks", "venv"]
+exclude = ["_static", "build", "examples", "notebooks", "venv"]
 ignore_missing_imports = true
 python_version = "3.8"
 
@@ -27,6 +27,7 @@ classifiers = [
 description = "Interface between LLMs and your data"
 documentation = "https://docs.llamaindex.ai/en/stable/"
 homepage = "https://llamaindex.ai"
+include = ["llama_index/_static"]
 keywords = ["LLM", "NLP", "RAG", "data", "devtools", "index", "retrieval"]
 license = "MIT"
 maintainers = [
@@ -147,6 +148,7 @@ llamaindex-cli = 'llama_index.command_line.command_line:main'
 
 [tool.ruff]
 exclude = [
+    "_static",
     "examples",
     "notebooks",
 ]
diff --git a/tests/memory/test_chat_memory_buffer.py b/tests/memory/test_chat_memory_buffer.py
index 7709f665c58878be077a1eb9fd4a294c1f28f90a..5e08180b8bf4876904b930090770b4dc8511ad01 100644
--- a/tests/memory/test_chat_memory_buffer.py
+++ b/tests/memory/test_chat_memory_buffer.py
@@ -3,9 +3,9 @@ import pickle
 import pytest
 from llama_index.llms import ChatMessage, MessageRole
 from llama_index.memory.chat_memory_buffer import ChatMemoryBuffer
-from llama_index.utils import GlobalsHelper
+from llama_index.utils import get_tokenizer
 
-tokenizer = GlobalsHelper().tokenizer
+tokenizer = get_tokenizer()
 USER_CHAT_MESSAGE = ChatMessage(role=MessageRole.USER, content="first message")
 USER_CHAT_MESSAGE_TOKENS = len(tokenizer(str(USER_CHAT_MESSAGE.content)))
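The new `include = ["llama_index/_static"]` entry is what ships the warmed caches inside the sdist and wheel, so `get_tokenizer()` and the NLTK lookups resolve offline after a plain `pip install`. A hypothetical post-build sanity check, not part of this PR:

# check_wheel.py -- hypothetical sanity check; run as
#   python check_wheel.py dist/llama_index-<version>-py3-none-any.whl
import sys
import zipfile

with zipfile.ZipFile(sys.argv[1]) as wheel:
    names = wheel.namelist()

assert any("llama_index/_static/tiktoken_cache/" in name for name in names)
assert any("llama_index/_static/nltk_cache/" in name for name in names)
print("bundled caches present")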