diff --git a/llama-index-integrations/readers/llama-index-readers-confluence/llama_index/readers/confluence/base.py b/llama-index-integrations/readers/llama-index-readers-confluence/llama_index/readers/confluence/base.py index 234aab4677537db94b5ba583b51a74b6d05854cd..c057b3f5b49e33e18d1937a9f29ceb7e38a4a553 100644 --- a/llama-index-integrations/readers/llama-index-readers-confluence/llama_index/readers/confluence/base.py +++ b/llama-index-integrations/readers/llama-index-readers-confluence/llama_index/readers/confluence/base.py @@ -4,6 +4,7 @@ import logging import os from typing import Dict, List, Optional from urllib.parse import unquote +import requests from llama_index.core.readers.base import BaseReader from llama_index.core.schema import Document @@ -322,6 +323,8 @@ class ConfluenceReader(BaseReader): return function(**kwargs) def process_page(self, page, include_attachments, text_maker): + logger.info("Processing " + self.base_url + page["_links"]["webui"]) + if include_attachments: attachment_texts = self.process_attachment(page["id"]) else: @@ -357,23 +360,74 @@ class ConfluenceReader(BaseReader): absolute_url = self.base_url + attachment["_links"]["download"] title = attachment["title"] if media_type == "application/pdf": + logger.info("Processing PDF attachment " + absolute_url) text = title + self.process_pdf(absolute_url) elif ( media_type == "image/png" or media_type == "image/jpg" or media_type == "image/jpeg" + or media_type == "image/webp" ): + logger.info("Processing image attachment " + absolute_url) text = title + self.process_image(absolute_url) elif ( media_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" ): + logger.info("Processing Word document attachment " + absolute_url) text = title + self.process_doc(absolute_url) - elif media_type == "application/vnd.ms-excel": - text = title + self.process_xls(absolute_url) + elif ( + media_type == "application/vnd.ms-excel" + or media_type + == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" + or media_type == "application/vnd.ms-excel.sheet.macroenabled.12" + ): + if title.endswith(".csv") or absolute_url.endswith(".csv"): + logger.info("Processing CSV attachment " + absolute_url) + text = title + self.process_csv(absolute_url) + else: + logger.info("Processing XLS attachment " + absolute_url) + text = title + self.process_xls(absolute_url) + elif media_type == "application/vnd.ms-excel.sheet.binary.macroenabled.12": + logger.info("Processing XLSB attachment " + absolute_url) + text = title + self.process_xlsb(absolute_url) + elif media_type == "text/csv": + logger.info("Processing CSV attachment " + absolute_url) + text = title + self.process_csv(absolute_url) + elif media_type == "application/vnd.ms-outlook": + logger.info("Processing Outlook message attachment " + absolute_url) + text = title + self.process_msg(absolute_url) + elif media_type == "text/html": + logger.info(" Processing HTML attachment " + absolute_url) + text = title + self.process_html(absolute_url) + elif media_type == "text/plain": + if title.endswith(".csv") or absolute_url.endswith(".csv"): + logger.info("Processing CSV attachment " + absolute_url) + text = title + self.process_csv(absolute_url) + else: + logger.info("Processing Text attachment " + absolute_url) + text = title + self.process_txt(absolute_url) elif media_type == "image/svg+xml": + logger.info("Processing SVG attachment " + absolute_url) text = title + self.process_svg(absolute_url) + elif ( + media_type + == "application/vnd.openxmlformats-officedocument.presentationml.presentation" + or media_type + == "application/vnd.ms-powerpoint.presentation.macroenabled.12" + ): + logger.info( + "Processing PowerPoint attachment " + + absolute_url + + " (" + + media_type + + ")" + ) + text = title + self.process_ppt(absolute_url) else: + logger.info( + f"Skipping unsupported attachment {absolute_url} of media_type {media_type}" + ) continue texts.append(text) @@ -409,6 +463,85 @@ class ConfluenceReader(BaseReader): return text + def process_html(self, link): + try: + from bs4 import BeautifulSoup # type: ignore + import requests + except ImportError: + raise ImportError( + "`beautifulsoup4` or `requests` package not found, please run `pip install beautifulsoup4 requests`" + ) + + try: + response = requests.get(link) + if response.status_code != 200: + return "Error fetching HTML content: HTTP Status Code {}".format( + response.status_code + ) + + # Parse the HTML content and extract text + soup = BeautifulSoup(response.content, "html.parser") + return soup.get_text(separator=" ", strip=True) + except Exception as e: + logger.error(f"Error processing HTML file at {link}: {e}") + return f"Error processing HTML file: {link}. An error occurred while fetching or parsing the content." + + def process_txt(self, link): + try: + import requests + except ImportError: + raise ImportError( + "`requests` package not found, please run `pip install requests`" + ) + + try: + response = requests.get(link) + if response.status_code != 200: + return "Error fetching text content: HTTP Status Code {}".format( + response.status_code + ) + return response.text + except Exception as e: + logger.error(f"Error processing text file at {link}: {e}") + return f"Error processing text file: {link}. An error occurred while fetching the content." + + def process_msg(self, link): + try: + import extract_msg # type: ignore + from io import BytesIO + except ImportError: + raise ImportError( + "`extract-msg` package not found, please run `pip install extract-msg`" + ) + + response = self.confluence.request(path=link, absolute=True) + text = "" + + if response.status_code != 200 or response.content in [b"", None]: + logger.error(f"Failed to download .msg file from {link}") + return text + + file_data = BytesIO(response.content) + + try: + # Load the .msg file content + with extract_msg.Message(file_data) as msg: + subject = msg.subject + sender = msg.sender + to = msg.to + cc = msg.cc + body = msg.body + + # Compile the extracted information into a text string + text = ( + f"Subject: {subject}\nFrom: {sender}\nTo: {to}\nCC: {cc}\n\n{body}" + ) + except Exception as e: + logger.error(f"Error processing .msg file at {link}: {e}") + return "Error processing .msg file." + + return text + def process_image(self, link): try: from io import BytesIO # type: ignore @@ -421,6 +554,77 @@ class ConfluenceReader(BaseReader): " pytesseract Pillow`" ) + text = "" + + try: + response = self.confluence.request(path=link, absolute=True) + # Check if the response status code indicates success (200 OK) + if response.status_code == 200 and response.content: + try: + image = Image.open(BytesIO(response.content)) + text = pytesseract.image_to_string(image) + except OSError: + # Handle errors that occur while opening or processing the image + logger.error( + f"Error processing image at {link}: Unable to open or read the image content." + ) + return text + else: + # Log non-200 responses here if needed + logger.error( + f"Error fetching image at {link}: HTTP status code {response.status_code}." + ) + return text + except requests.exceptions.RequestException as e: + # This catches any Requests-related exceptions, including HTTPError, ConnectionError, etc. + logger.error(f"Request error while fetching image at {link}: {e}") + return text + + return text + + def process_doc(self, link): + try: + from io import BytesIO + import docx2txt + import zipfile # Import zipfile to catch BadZipFile exceptions + except ImportError: + raise ImportError( + "`docx2txt` package not found, please run `pip install docx2txt`" + ) + + text = "" + + try: + response = self.confluence.request(path=link, absolute=True) + if response.status_code != 200 or response.content in [b"", None]: + logger.error( + f"Error fetching document at {link}: HTTP status code {response.status_code}." + ) + return text + + file_data = BytesIO(response.content) + try: + text = docx2txt.process(file_data) + except zipfile.BadZipFile: + logger.error( + f"Error processing Word document at {link}: File is not a zip file." + ) + return text + except Exception as e: + logger.error(f"Unexpected error processing document at {link}: {e}") + return text + + return text + + def process_ppt(self, link): + try: + from io import BytesIO + from pptx import Presentation # type: ignore + except ImportError: + raise ImportError( + "`python-pptx` package not found, please run `pip install python-pptx`" + ) + response = self.confluence.request(path=link, absolute=True) text = "" @@ -430,21 +634,69 @@ class ConfluenceReader(BaseReader): or response.content is None ): return text + + file_data = BytesIO(response.content) + + try: + presentation = Presentation(file_data) + for slide in presentation.slides: + for shape in slide.shapes: + if hasattr(shape, "text"): + text += shape.text + " " + except ( + Exception + ) as e: # Catching a general exception to handle any unexpected errors + logger.error(f"Error processing PowerPoint file at {link}: {e}") + text = f"Error processing PowerPoint file: {link}. The file might be corrupt or not a valid PowerPoint file." + + return text.strip() # Remove any leading/trailing whitespace + + def process_xls(self, link): try: - image = Image.open(BytesIO(response.content)) - except OSError: + import pandas as pd # type: ignore + except ImportError: + raise ImportError( + "`pandas` package not found, please run `pip install pandas`" + ) + try: + from io import BytesIO + except ImportError: + raise ImportError("Failed to import BytesIO from io") + + response = self.confluence.request(path=link, absolute=True) + text = "" + + if ( + response.status_code != 200 + or response.content == b"" + or response.content is None + ): return text - return pytesseract.image_to_string(image) + file_data = BytesIO(response.content) - def process_doc(self, link): + # Try to read the Excel file try: - from io import BytesIO # type: ignore + # Use pandas to read all sheets; returns a dict of DataFrame + sheets = pd.read_excel(file_data, sheet_name=None, engine="openpyxl") + except Exception as e: + return f"Failed to read Excel file: {e!s}" + + for sheet_name, sheet_data in sheets.items(): + text += f"{sheet_name}:\n" + for row_index, row in sheet_data.iterrows(): + text += "\t".join(str(value) for value in row) + "\n" + text += "\n" - import docx2txt # type: ignore + return text.strip() + + def process_xlsb(self, link): + try: + import pandas as pd + from io import BytesIO except ImportError: raise ImportError( - "`docx2txt` package not found, please run `pip install docx2txt`" + "`pandas` package not found, please run `pip install pandas`" ) response = self.confluence.request(path=link, absolute=True) @@ -456,15 +708,31 @@ class ConfluenceReader(BaseReader): or response.content is None ): return text + file_data = BytesIO(response.content) - return docx2txt.process(file_data) + try: + # Use pandas to read the .xlsb file, specifying pyxlsb as the engine + df = pd.read_excel(file_data, engine="pyxlsb") + # Convert the DataFrame to a text string + text_rows = [] + for index, row in df.iterrows(): + text_rows.append(", ".join(row.astype(str))) + text = "\n".join(text_rows) + except Exception as e: + logger.error(f"Error processing XLSB file at {link}: {e}") + text = "Error processing XLSB file." - def process_xls(self, link): + return text + + def process_csv(self, link): try: - import xlrd # type: ignore + import pandas as pd + from io import BytesIO except ImportError: - raise ImportError("`xlrd` package not found, please run `pip install xlrd`") + raise ImportError( + "`pandas` package not found, please run `pip install pandas`" + ) response = self.confluence.request(path=link, absolute=True) text = "" @@ -476,14 +744,19 @@ class ConfluenceReader(BaseReader): ): return text - workbook = xlrd.open_workbook(file_contents=response.content) - for sheet in workbook.sheets(): - text += f"{sheet.name}:\n" - for row in range(sheet.nrows): - for col in range(sheet.ncols): - text += f"{sheet.cell_value(row, col)}\t" - text += "\n" - text += "\n" + file_data = BytesIO(response.content) + + try: + # Assuming CSV uses default comma delimiter. If delimiter varies, consider detecting it. + df = pd.read_csv(file_data, low_memory=False) + # Convert the DataFrame to a text string, including headers + text_rows = [] + for index, row in df.iterrows(): + text_rows.append(", ".join(row.astype(str))) + text = "\n".join(text_rows) + except Exception as e: + logger.error(f"Error processing CSV file: {e}") + text = "Error processing CSV file." return text diff --git a/llama-index-integrations/readers/llama-index-readers-confluence/pyproject.toml b/llama-index-integrations/readers/llama-index-readers-confluence/pyproject.toml index 691d68005d711192b37df3bb3eb2930eb93c1b9f..e204c7cc477f96f0566c4472114e4c30e4245eac 100644 --- a/llama-index-integrations/readers/llama-index-readers-confluence/pyproject.toml +++ b/llama-index-integrations/readers/llama-index-readers-confluence/pyproject.toml @@ -28,7 +28,7 @@ license = "MIT" maintainers = ["zywilliamli"] name = "llama-index-readers-confluence" readme = "README.md" -version = "0.1.3" +version = "0.1.4" [tool.poetry.dependencies] python = ">=3.8.1,<4.0"