Skip to content
Snippets Groups Projects
Unverified Commit 7d9f10fd authored by Anirban Basu's avatar Anirban Basu Committed by GitHub
Browse files

Added multilingual support for the Wikipedia reader. (#12616)

* Added multilingual support for the Wikipedia reader.

* fix: Added a check for the language prefix.

Checks that the provided language prefix is one of the language codes supported by Wikipedia.
parent 2200c13c
Branches
No related tags found
No related merge requests found
"""Simple reader that reads wikipedia.""" """Simple reader that reads wikipedia."""
from typing import Any, List from typing import Any, List
from llama_index.core.readers.base import BasePydanticReader from llama_index.core.readers.base import BasePydanticReader
...@@ -27,15 +28,26 @@ class WikipediaReader(BasePydanticReader): ...@@ -27,15 +28,26 @@ class WikipediaReader(BasePydanticReader):
def class_name(cls) -> str: def class_name(cls) -> str:
return "WikipediaReader" return "WikipediaReader"
def load_data(self, pages: List[str], **load_kwargs: Any) -> List[Document]: def load_data(
self, pages: List[str], lang_prefix: str = "en", **load_kwargs: Any
) -> List[Document]:
"""Load data from the input directory. """Load data from the input directory.
Args: Args:
pages (List[str]): List of pages to read. pages (List[str]): List of pages to read.
lang_prefix (str): Language prefix for Wikipedia. Defaults to English. Valid Wikipedia language codes
can be found at https://en.wikipedia.org/wiki/List_of_Wikipedias.
""" """
import wikipedia import wikipedia
if lang_prefix.lower() != "en":
if lang_prefix.lower() in wikipedia.languages():
wikipedia.set_lang(lang_prefix.lower())
else:
raise ValueError(
f"Language prefix '{lang_prefix}' for Wikipedia is not supported. Check supported languages at https://en.wikipedia.org/wiki/List_of_Wikipedias."
)
results = [] results = []
for page in pages: for page in pages:
wiki_page = wikipedia.page(page, **load_kwargs) wiki_page = wikipedia.page(page, **load_kwargs)
......
...@@ -28,7 +28,7 @@ license = "MIT" ...@@ -28,7 +28,7 @@ license = "MIT"
maintainers = ["jerryjliu"] maintainers = ["jerryjliu"]
name = "llama-index-readers-wikipedia" name = "llama-index-readers-wikipedia"
readme = "README.md" readme = "README.md"
version = "0.1.3" version = "0.1.4"
[tool.poetry.dependencies] [tool.poetry.dependencies]
python = ">=3.8.1,<4.0" python = ">=3.8.1,<4.0"
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment