From cfb2d7a58a8b8fe070ece26322b7df39e8d2b804 Mon Sep 17 00:00:00 2001
From: Ravi Theja <ravi03071991@gmail.com>
Date: Tue, 26 Mar 2024 18:45:29 +0530
Subject: [PATCH] Add logging to RAFT llamapack (#12275)

* Add logging

* Add logging

* resolve errors
---
 .../llama_index/packs/raft_dataset/base.py      | 17 ++++++++++-------
 .../pyproject.toml                              |  2 +-
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/llama-index-packs/llama-index-packs-raft-dataset/llama_index/packs/raft_dataset/base.py b/llama-index-packs/llama-index-packs-raft-dataset/llama_index/packs/raft_dataset/base.py
index 88f49c5939..ecb7c13f59 100644
--- a/llama-index-packs/llama-index-packs-raft-dataset/llama_index/packs/raft_dataset/base.py
+++ b/llama-index-packs/llama-index-packs-raft-dataset/llama_index/packs/raft_dataset/base.py
@@ -4,8 +4,12 @@
 
 from typing import Any, List
 import random
+import logging
 from datasets import Dataset
 
+# Configure logging to output to the console, with messages of level INFO and above
+logging.basicConfig(level=logging.INFO)
+
 from llama_index.core.llama_pack.base import BaseLlamaPack
 from llama_index.core import SimpleDirectoryReader
 
@@ -30,7 +34,6 @@ class RAFTDatasetPack(BaseLlamaPack):
         num_distract_docs: int = 3,
         chunk_size: int = DEFAULT_CHUNK_SIZE,
         default_breakpoint_percentile_threshold=DEFAULT_BREAKPOINT_PERCENTILE_THRESHOLD,
-        **kwargs: Any,
     ):
         self.file_path = file_path
         self.num_questions_per_chunk = num_questions_per_chunk
@@ -116,10 +119,7 @@ class RAFTDatasetPack(BaseLlamaPack):
         Takes in a `file_path`, retrieves the document, breaks it down into chunks of size
         `chunk_size`, and returns the chunks.
         """
-        chunks = []
-
         documents = SimpleDirectoryReader(input_files=[file_path]).load_data()
-        # TODO: Should be changed to SemanticSplitterNodeParser
         splitter = SemanticSplitterNodeParser(
             buffer_size=1,
             breakpoint_percentile_threshold=self.default_breakpoint_percentile_threshold,
@@ -156,7 +156,7 @@ class RAFTDatasetPack(BaseLlamaPack):
             datapt["type"] = "general"
             datapt["question"] = q
 
-            # add 4 distractor docs
+            # add distractor docs
             docs = [chunk]
             indices = list(range(len(chunks)))
             indices.remove(i)
@@ -199,15 +199,18 @@ class RAFTDatasetPack(BaseLlamaPack):
             else:
                 self.ds = self.ds.add_item(datapt)
 
-    def run(self, *args: Any, **kwargs: Any) -> Any:
+    def run(self) -> Any:
         """Run the pipeline."""
         chunks = self.get_chunks(self.file_path, self.chunk_size)
 
+        logging.info(f"Number of chunks created: {len(chunks)}")
+
         self.num_distract_docs = (
             min(self.num_distract_docs, len(chunks)) - 1
         )  # should be less than number of chunks/ nodes created
 
-        for chunk in chunks:
+        for index, chunk in enumerate(chunks):
+            logging.info(f"Processing chunk: {index}")
             self.add_chunk_to_dataset(
                 chunks, chunk, self.num_questions_per_chunk, self.num_distract_docs
             )
diff --git a/llama-index-packs/llama-index-packs-raft-dataset/pyproject.toml b/llama-index-packs/llama-index-packs-raft-dataset/pyproject.toml
index 6adc633277..3417dd564a 100644
--- a/llama-index-packs/llama-index-packs-raft-dataset/pyproject.toml
+++ b/llama-index-packs/llama-index-packs-raft-dataset/pyproject.toml
@@ -29,7 +29,7 @@ license = "MIT"
 maintainers = ["ravi-theja"]
 name = "llama-index-packs-raft-dataset"
 readme = "README.md"
-version = "0.1.2"
+version = "0.1.3"
 
 [tool.poetry.dependencies]
 python = ">=3.8.1,<4.0"
-- 
GitLab