Skip to content
Snippets Groups Projects
Unverified Commit 6db70f81 authored by Erik Talvola's avatar Erik Talvola Committed by GitHub
Browse files

[Feature Request]: add PowerPoint support to Confluence reader #10592 (#11454)

parent 68013e14
No related branches found
No related tags found
No related merge requests found
...@@ -4,6 +4,7 @@ import logging ...@@ -4,6 +4,7 @@ import logging
import os import os
from typing import Dict, List, Optional from typing import Dict, List, Optional
from urllib.parse import unquote from urllib.parse import unquote
import requests
from llama_index.core.readers.base import BaseReader from llama_index.core.readers.base import BaseReader
from llama_index.core.schema import Document from llama_index.core.schema import Document
...@@ -322,6 +323,8 @@ class ConfluenceReader(BaseReader): ...@@ -322,6 +323,8 @@ class ConfluenceReader(BaseReader):
return function(**kwargs) return function(**kwargs)
def process_page(self, page, include_attachments, text_maker): def process_page(self, page, include_attachments, text_maker):
logger.info("Processing " + self.base_url + page["_links"]["webui"])
if include_attachments: if include_attachments:
attachment_texts = self.process_attachment(page["id"]) attachment_texts = self.process_attachment(page["id"])
else: else:
...@@ -357,23 +360,74 @@ class ConfluenceReader(BaseReader): ...@@ -357,23 +360,74 @@ class ConfluenceReader(BaseReader):
absolute_url = self.base_url + attachment["_links"]["download"] absolute_url = self.base_url + attachment["_links"]["download"]
title = attachment["title"] title = attachment["title"]
if media_type == "application/pdf": if media_type == "application/pdf":
logger.info("Processing PDF attachment " + absolute_url)
text = title + self.process_pdf(absolute_url) text = title + self.process_pdf(absolute_url)
elif ( elif (
media_type == "image/png" media_type == "image/png"
or media_type == "image/jpg" or media_type == "image/jpg"
or media_type == "image/jpeg" or media_type == "image/jpeg"
or media_type == "image/webp"
): ):
logger.info("Processing image attachment " + absolute_url)
text = title + self.process_image(absolute_url) text = title + self.process_image(absolute_url)
elif ( elif (
media_type media_type
== "application/vnd.openxmlformats-officedocument.wordprocessingml.document" == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
): ):
logger.info("Processing Word document attachment " + absolute_url)
text = title + self.process_doc(absolute_url) text = title + self.process_doc(absolute_url)
elif media_type == "application/vnd.ms-excel": elif (
text = title + self.process_xls(absolute_url) media_type == "application/vnd.ms-excel"
or media_type
== "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
or media_type == "application/vnd.ms-excel.sheet.macroenabled.12"
):
if title.endswith(".csv") or absolute_url.endswith(".csv"):
logger.info("Processing CSV attachment " + absolute_url)
text = title + self.process_csv(absolute_url)
else:
logger.info("Processing XLS attachment " + absolute_url)
text = title + self.process_xls(absolute_url)
elif media_type == "application/vnd.ms-excel.sheet.binary.macroenabled.12":
logger.info("Processing XLSB attachment " + absolute_url)
text = title + self.process_xlsb(absolute_url)
elif media_type == "text/csv":
logger.info("Processing CSV attachment " + absolute_url)
text = title + self.process_csv(absolute_url)
elif media_type == "application/vnd.ms-outlook":
logger.info("Processing Outlook message attachment " + absolute_url)
text = title + self.process_msg(absolute_url)
elif media_type == "text/html":
logger.info(" Processing HTML attachment " + absolute_url)
text = title + self.process_html(absolute_url)
elif media_type == "text/plain":
if title.endswith(".csv") or absolute_url.endswith(".csv"):
logger.info("Processing CSV attachment " + absolute_url)
text = title + self.process_csv(absolute_url)
else:
logger.info("Processing Text attachment " + absolute_url)
text = title + self.process_txt(absolute_url)
elif media_type == "image/svg+xml": elif media_type == "image/svg+xml":
logger.info("Processing SVG attachment " + absolute_url)
text = title + self.process_svg(absolute_url) text = title + self.process_svg(absolute_url)
elif (
media_type
== "application/vnd.openxmlformats-officedocument.presentationml.presentation"
or media_type
== "application/vnd.ms-powerpoint.presentation.macroenabled.12"
):
logger.info(
"Processing PowerPoint attachment "
+ absolute_url
+ " ("
+ media_type
+ ")"
)
text = title + self.process_ppt(absolute_url)
else: else:
logger.info(
f"Skipping unsupported attachment {absolute_url} of media_type {media_type}"
)
continue continue
texts.append(text) texts.append(text)
...@@ -409,6 +463,85 @@ class ConfluenceReader(BaseReader): ...@@ -409,6 +463,85 @@ class ConfluenceReader(BaseReader):
return text return text
def process_html(self, link):
try:
from bs4 import BeautifulSoup # type: ignore
import requests
except ImportError:
raise ImportError(
"`beautifulsoup4` or `requests` package not found, please run `pip install beautifulsoup4 requests`"
)
try:
response = requests.get(link)
if response.status_code != 200:
return "Error fetching HTML content: HTTP Status Code {}".format(
response.status_code
)
# Parse the HTML content and extract text
soup = BeautifulSoup(response.content, "html.parser")
return soup.get_text(separator=" ", strip=True)
except Exception as e:
logger.error(f"Error processing HTML file at {link}: {e}")
return f"Error processing HTML file: {link}. An error occurred while fetching or parsing the content."
def process_txt(self, link):
try:
import requests
except ImportError:
raise ImportError(
"`requests` package not found, please run `pip install requests`"
)
try:
response = requests.get(link)
if response.status_code != 200:
return "Error fetching text content: HTTP Status Code {}".format(
response.status_code
)
return response.text
except Exception as e:
logger.error(f"Error processing text file at {link}: {e}")
return f"Error processing text file: {link}. An error occurred while fetching the content."
def process_msg(self, link):
try:
import extract_msg # type: ignore
from io import BytesIO
except ImportError:
raise ImportError(
"`extract-msg` package not found, please run `pip install extract-msg`"
)
response = self.confluence.request(path=link, absolute=True)
text = ""
if response.status_code != 200 or response.content in [b"", None]:
logger.error(f"Failed to download .msg file from {link}")
return text
file_data = BytesIO(response.content)
try:
# Load the .msg file content
with extract_msg.Message(file_data) as msg:
subject = msg.subject
sender = msg.sender
to = msg.to
cc = msg.cc
body = msg.body
# Compile the extracted information into a text string
text = (
f"Subject: {subject}\nFrom: {sender}\nTo: {to}\nCC: {cc}\n\n{body}"
)
except Exception as e:
logger.error(f"Error processing .msg file at {link}: {e}")
return "Error processing .msg file."
return text
def process_image(self, link): def process_image(self, link):
try: try:
from io import BytesIO # type: ignore from io import BytesIO # type: ignore
...@@ -421,6 +554,77 @@ class ConfluenceReader(BaseReader): ...@@ -421,6 +554,77 @@ class ConfluenceReader(BaseReader):
" pytesseract Pillow`" " pytesseract Pillow`"
) )
text = ""
try:
response = self.confluence.request(path=link, absolute=True)
# Check if the response status code indicates success (200 OK)
if response.status_code == 200 and response.content:
try:
image = Image.open(BytesIO(response.content))
text = pytesseract.image_to_string(image)
except OSError:
# Handle errors that occur while opening or processing the image
logger.error(
f"Error processing image at {link}: Unable to open or read the image content."
)
return text
else:
# Log non-200 responses here if needed
logger.error(
f"Error fetching image at {link}: HTTP status code {response.status_code}."
)
return text
except requests.exceptions.RequestException as e:
# This catches any Requests-related exceptions, including HTTPError, ConnectionError, etc.
logger.error(f"Request error while fetching image at {link}: {e}")
return text
return text
def process_doc(self, link):
try:
from io import BytesIO
import docx2txt
import zipfile # Import zipfile to catch BadZipFile exceptions
except ImportError:
raise ImportError(
"`docx2txt` package not found, please run `pip install docx2txt`"
)
text = ""
try:
response = self.confluence.request(path=link, absolute=True)
if response.status_code != 200 or response.content in [b"", None]:
logger.error(
f"Error fetching document at {link}: HTTP status code {response.status_code}."
)
return text
file_data = BytesIO(response.content)
try:
text = docx2txt.process(file_data)
except zipfile.BadZipFile:
logger.error(
f"Error processing Word document at {link}: File is not a zip file."
)
return text
except Exception as e:
logger.error(f"Unexpected error processing document at {link}: {e}")
return text
return text
def process_ppt(self, link):
try:
from io import BytesIO
from pptx import Presentation # type: ignore
except ImportError:
raise ImportError(
"`python-pptx` package not found, please run `pip install python-pptx`"
)
response = self.confluence.request(path=link, absolute=True) response = self.confluence.request(path=link, absolute=True)
text = "" text = ""
...@@ -430,21 +634,69 @@ class ConfluenceReader(BaseReader): ...@@ -430,21 +634,69 @@ class ConfluenceReader(BaseReader):
or response.content is None or response.content is None
): ):
return text return text
file_data = BytesIO(response.content)
try:
presentation = Presentation(file_data)
for slide in presentation.slides:
for shape in slide.shapes:
if hasattr(shape, "text"):
text += shape.text + " "
except (
Exception
) as e: # Catching a general exception to handle any unexpected errors
logger.error(f"Error processing PowerPoint file at {link}: {e}")
text = f"Error processing PowerPoint file: {link}. The file might be corrupt or not a valid PowerPoint file."
return text.strip() # Remove any leading/trailing whitespace
def process_xls(self, link):
try: try:
image = Image.open(BytesIO(response.content)) import pandas as pd # type: ignore
except OSError: except ImportError:
raise ImportError(
"`pandas` package not found, please run `pip install pandas`"
)
try:
from io import BytesIO
except ImportError:
raise ImportError("Failed to import BytesIO from io")
response = self.confluence.request(path=link, absolute=True)
text = ""
if (
response.status_code != 200
or response.content == b""
or response.content is None
):
return text return text
return pytesseract.image_to_string(image) file_data = BytesIO(response.content)
def process_doc(self, link): # Try to read the Excel file
try: try:
from io import BytesIO # type: ignore # Use pandas to read all sheets; returns a dict of DataFrame
sheets = pd.read_excel(file_data, sheet_name=None, engine="openpyxl")
except Exception as e:
return f"Failed to read Excel file: {e!s}"
for sheet_name, sheet_data in sheets.items():
text += f"{sheet_name}:\n"
for row_index, row in sheet_data.iterrows():
text += "\t".join(str(value) for value in row) + "\n"
text += "\n"
import docx2txt # type: ignore return text.strip()
def process_xlsb(self, link):
try:
import pandas as pd
from io import BytesIO
except ImportError: except ImportError:
raise ImportError( raise ImportError(
"`docx2txt` package not found, please run `pip install docx2txt`" "`pandas` package not found, please run `pip install pandas`"
) )
response = self.confluence.request(path=link, absolute=True) response = self.confluence.request(path=link, absolute=True)
...@@ -456,15 +708,31 @@ class ConfluenceReader(BaseReader): ...@@ -456,15 +708,31 @@ class ConfluenceReader(BaseReader):
or response.content is None or response.content is None
): ):
return text return text
file_data = BytesIO(response.content) file_data = BytesIO(response.content)
return docx2txt.process(file_data) try:
# Use pandas to read the .xlsb file, specifying pyxlsb as the engine
df = pd.read_excel(file_data, engine="pyxlsb")
# Convert the DataFrame to a text string
text_rows = []
for index, row in df.iterrows():
text_rows.append(", ".join(row.astype(str)))
text = "\n".join(text_rows)
except Exception as e:
logger.error(f"Error processing XLSB file at {link}: {e}")
text = "Error processing XLSB file."
def process_xls(self, link): return text
def process_csv(self, link):
try: try:
import xlrd # type: ignore import pandas as pd
from io import BytesIO
except ImportError: except ImportError:
raise ImportError("`xlrd` package not found, please run `pip install xlrd`") raise ImportError(
"`pandas` package not found, please run `pip install pandas`"
)
response = self.confluence.request(path=link, absolute=True) response = self.confluence.request(path=link, absolute=True)
text = "" text = ""
...@@ -476,14 +744,19 @@ class ConfluenceReader(BaseReader): ...@@ -476,14 +744,19 @@ class ConfluenceReader(BaseReader):
): ):
return text return text
workbook = xlrd.open_workbook(file_contents=response.content) file_data = BytesIO(response.content)
for sheet in workbook.sheets():
text += f"{sheet.name}:\n" try:
for row in range(sheet.nrows): # Assuming CSV uses default comma delimiter. If delimiter varies, consider detecting it.
for col in range(sheet.ncols): df = pd.read_csv(file_data, low_memory=False)
text += f"{sheet.cell_value(row, col)}\t" # Convert the DataFrame to a text string, including headers
text += "\n" text_rows = []
text += "\n" for index, row in df.iterrows():
text_rows.append(", ".join(row.astype(str)))
text = "\n".join(text_rows)
except Exception as e:
logger.error(f"Error processing CSV file: {e}")
text = "Error processing CSV file."
return text return text
......
...@@ -28,7 +28,7 @@ license = "MIT" ...@@ -28,7 +28,7 @@ license = "MIT"
maintainers = ["zywilliamli"] maintainers = ["zywilliamli"]
name = "llama-index-readers-confluence" name = "llama-index-readers-confluence"
readme = "README.md" readme = "README.md"
version = "0.1.3" version = "0.1.4"
[tool.poetry.dependencies] [tool.poetry.dependencies]
python = ">=3.8.1,<4.0" python = ">=3.8.1,<4.0"
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment