Unverified commit efb2c7e3 authored by Jerry Liu, committed by GitHub

add wikipedia reader (#31)

parent a2e03a5d
%% Cell type:code id:c5d167a5-81f8-4d2c-b42f-0a190577132f tags:
``` python
# My OpenAI Key
import os
os.environ['OPENAI_API_KEY'] = "INSERT OPENAI KEY"
```
%% Cell type:code id:5f60348e-731d-4a95-bae2-426e184a914e tags:
``` python
from gpt_index import GPTKeywordTableIndex, WikipediaReader
```
%% Cell type:code id:952c4659-7fbb-447e-8caf-06916412cc37 tags:
``` python
wiki_docs = WikipediaReader().load_data(pages=['Covid-19'])
```
%% Output
page: Covid-19
%% Cell type:code id:3be202db-a4c7-41d2-ba7d-446d1f934830 tags:
``` python
index = GPTKeywordTableIndex(wiki_docs)
```
%% Cell type:code id:7f5667a9-6758-447b-9af2-5e5a4d008a29 tags:
``` python
# save index to disk
index.save_to_disk('index_covid.json')
```
%% Cell type:code id:77340460-8319-474f-91eb-545ea5790127 tags:
``` python
new_index = GPTKeywordTableIndex.load_from_disk('index_covid.json')
```
%% Cell type:code id:28d7163e-f26f-4ad8-89d5-9cb7662c4d9c tags:
``` python
# GPT doesn't find the corresponding evidence in the leaf node, but still gives the correct answer
# try verbose=True for more detailed outputs
new_index.query("Which country included tocilizumab in treatment for covid-19?")
```
%% Output
> Starting query: Which country included tocilizumab in treatment for covid-19?
Extracted keywords: ['tocilizumab', 'treatment', 'covid-19', 'covid', '19']
> Querying with idx: 1105763466456338724: of age or older weighing at least 40 kilograms ...
> Querying with idx: 2820318727532393752: Coronavirus disease 2019 (COVID-19) is a contag...
> Querying with idx: 897499143815831368: if the mask includes an exhalation valve, a wea...
> Querying with idx: 8628144746434065339: pulmonary fibrosis, cystic fibrosis. Evidence s...
'\n\nChina'
%% Cell type:code id:60ae24ad-0aa1-4844-8e9a-caf30c74643e tags:
``` python
```
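As the comment in the query cell suggests, passing `verbose=True` to `query` prints more detail about the retrieval. A minimal sketch re-using the index loaded above (output will vary from run to run):

``` python
# Sketch: same question as above, but with verbose output so the extracted
# keywords and queried nodes are printed as the keyword table index is queried.
response = new_index.query(
    "Which country included tocilizumab in treatment for covid-19?",
    verbose=True,
)
print(response)
```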
gpt_index/__init__.py
@@ -17,11 +17,13 @@ from gpt_index.prompts.base import Prompt
# readers
from gpt_index.readers.simple_reader import SimpleDirectoryReader
+from gpt_index.readers.wikipedia import WikipediaReader

__all__ = [
    "GPTKeywordTableIndex",
    "GPTListIndex",
    "GPTTreeIndex",
    "Prompt",
+    "WikipediaReader",
    "SimpleDirectoryReader",
]
"""Base reader class."""
from abc import abstractmethod
from typing import List
from typing import Any, List
from gpt_index.schema import Document
......@@ -9,5 +9,5 @@ class BaseReader:
"""Utilities for loading data from a directory."""
@abstractmethod
def load_data(self) -> List[Document]:
def load_data(self, **load_kwargs: Any) -> List[Document]:
"""Load data from the input directory."""
"""Simple reader that ."""
from pathlib import Path
from typing import List
from typing import Any, List
from gpt_index.readers.base import BaseReader
from gpt_index.schema import Document
......@@ -22,7 +22,7 @@ class SimpleDirectoryReader(BaseReader):
raise ValueError(f"Expected {input_file} to be a file.")
self.input_files = input_files
def load_data(self) -> List[Document]:
def load_data(self, **load_kwargs: Any) -> List[Document]:
"""Load data from the input directory."""
data = ""
for input_file in self.input_files:
......
"""Simple reader that ."""
from typing import Any, List
from gpt_index.readers.base import BaseReader
from gpt_index.schema import Document
class WikipediaReader(BaseReader):
"""Wikipedia reader.
Reads a page.
"""
def __init__(self) -> None:
"""Initialize with parameters."""
try:
import wikipedia # noqa: F401
except ImportError:
raise ValueError(
"`wikipedia` package not found, please run `pip install wikipedia`"
)
def load_data(self, **load_kwargs: Any) -> List[Document]:
"""Load data from the input directory."""
import wikipedia
pages: List[str] = load_kwargs.pop("pages", None)
if pages is None:
raise ValueError('Must specify a "page" in `load_kwargs`.')
results = []
for page in pages:
page_content = wikipedia.page(page).content
results.append(Document(page_content))
return results
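A short usage sketch of the new reader (page titles are only examples; the `wikipedia` package must be installed, and omitting `pages` raises the `ValueError` above):

``` python
from gpt_index import GPTKeywordTableIndex, WikipediaReader

# Each requested page becomes one Document.
docs = WikipediaReader().load_data(pages=["Covid-19", "Coronavirus"])
assert len(docs) == 2

# Build a keyword table index over the fetched pages, as in the notebook above
# (this step calls the OpenAI API, so OPENAI_API_KEY must be set).
index = GPTKeywordTableIndex(docs)
```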
requirements file:
-e .

# third-party
+wikipedia

# linting
black
isort