From cfb2d7a58a8b8fe070ece26322b7df39e8d2b804 Mon Sep 17 00:00:00 2001 From: Ravi Theja <ravi03071991@gmail.com> Date: Tue, 26 Mar 2024 18:45:29 +0530 Subject: [PATCH] Add logging to RAFT llamapack (#12275) * Add logging * Add logging * resolve errors --- .../llama_index/packs/raft_dataset/base.py | 17 ++++++++++------- .../pyproject.toml | 2 +- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/llama-index-packs/llama-index-packs-raft-dataset/llama_index/packs/raft_dataset/base.py b/llama-index-packs/llama-index-packs-raft-dataset/llama_index/packs/raft_dataset/base.py index 88f49c5939..ecb7c13f59 100644 --- a/llama-index-packs/llama-index-packs-raft-dataset/llama_index/packs/raft_dataset/base.py +++ b/llama-index-packs/llama-index-packs-raft-dataset/llama_index/packs/raft_dataset/base.py @@ -4,8 +4,12 @@ from typing import Any, List import random +import logging from datasets import Dataset +# Configure logging to output to the console, with messages of level INFO and above +logging.basicConfig(level=logging.INFO) + from llama_index.core.llama_pack.base import BaseLlamaPack from llama_index.core import SimpleDirectoryReader @@ -30,7 +34,6 @@ class RAFTDatasetPack(BaseLlamaPack): num_distract_docs: int = 3, chunk_size: int = DEFAULT_CHUNK_SIZE, default_breakpoint_percentile_threshold=DEFAULT_BREAKPOINT_PERCENTILE_THRESHOLD, - **kwargs: Any, ): self.file_path = file_path self.num_questions_per_chunk = num_questions_per_chunk @@ -116,10 +119,7 @@ class RAFTDatasetPack(BaseLlamaPack): Takes in a `file_path`, retrieves the document, breaks it down into chunks of size `chunk_size`, and returns the chunks. 
""" - chunks = [] - documents = SimpleDirectoryReader(input_files=[file_path]).load_data() - # TODO: Should be changed to SemanticSplitterNodeParser splitter = SemanticSplitterNodeParser( buffer_size=1, breakpoint_percentile_threshold=self.default_breakpoint_percentile_threshold, @@ -156,7 +156,7 @@ class RAFTDatasetPack(BaseLlamaPack): datapt["type"] = "general" datapt["question"] = q - # add 4 distractor docs + # add distractor docs docs = [chunk] indices = list(range(len(chunks))) indices.remove(i) @@ -199,15 +199,18 @@ class RAFTDatasetPack(BaseLlamaPack): else: self.ds = self.ds.add_item(datapt) - def run(self, *args: Any, **kwargs: Any) -> Any: + def run(self) -> Any: """Run the pipeline.""" chunks = self.get_chunks(self.file_path, self.chunk_size) + logging.info(f"Number of chunks created: {len(chunks)}") + self.num_distract_docs = ( min(self.num_distract_docs, len(chunks)) - 1 ) # should be less than number of chunks/ nodes created - for chunk in chunks: + for index, chunk in enumerate(chunks): + logging.info(f"Processing chunk: {index}") self.add_chunk_to_dataset( chunks, chunk, self.num_questions_per_chunk, self.num_distract_docs ) diff --git a/llama-index-packs/llama-index-packs-raft-dataset/pyproject.toml b/llama-index-packs/llama-index-packs-raft-dataset/pyproject.toml index 6adc633277..3417dd564a 100644 --- a/llama-index-packs/llama-index-packs-raft-dataset/pyproject.toml +++ b/llama-index-packs/llama-index-packs-raft-dataset/pyproject.toml @@ -29,7 +29,7 @@ license = "MIT" maintainers = ["ravi-theja"] name = "llama-index-packs-raft-dataset" readme = "README.md" -version = "0.1.2" +version = "0.1.3" [tool.poetry.dependencies] python = ">=3.8.1,<4.0" -- GitLab