Unverified commit 7edfccaf authored by Sean Hatfield, committed by GitHub

Adding url uploads to document picker (#375)


* WIP adding url uploads to document picker

* fix manual script for uploading url to custom-documents

* fix metadata for url scraping

* wip url parsing

* update how async link scraping works

* docker-compose defaults added

* no autocomplete on URLs

---------

Co-authored-by: timothycarambat <rambat1010@gmail.com>
parent d766d128
@@ -11,5 +11,6 @@ collector/outputs/**
**/__pycache__/
**/.env
**/.env.*
**/bundleinspector.html
!docker/.env.example
!frontend/.env.production
\ No newline at end of file
@@ -2,6 +2,7 @@ import os
from flask import Flask, json, request
from scripts.watch.process_single import process_single
from scripts.watch.filetypes import ACCEPTED_MIMES
from scripts.link import process_single_link

api = Flask(__name__)
WATCH_DIRECTORY = "hotdir"
@@ -13,6 +14,15 @@ def process_file():
  success, reason = process_single(WATCH_DIRECTORY, target_filename)
  return json.dumps({'filename': target_filename, 'success': success, 'reason': reason})

@api.route('/process-link', methods=['POST'])
async def process_link():
  content = request.json
  url = content.get('link')
  print(f"Processing {url}")
  success, reason = await process_single_link(url)
  return json.dumps({'url': url, 'success': success, 'reason': reason})

@api.route('/accepts', methods=['GET'])
def get_accepted_filetypes():
  return json.dumps(ACCEPTED_MIMES)
......
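Because the new `process_link` view is declared `async`, Flask needs its async extra installed, which is presumably why the `asgiref` pin shows up in the requirements below. A quick way to exercise the route; the port is an assumption, adjust to wherever the collector's Flask app is bound:

```python
import requests

# Hypothetical smoke test for the new /process-link route (not part of the
# commit). Assumes the collector Flask app is listening on localhost:8888.
resp = requests.post(
    "http://localhost:8888/process-link",
    json={"link": "https://example.com/some-article"},
)
print(resp.json())  # -> {'url': ..., 'success': ..., 'reason': ...}
```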
@@ -5,6 +5,7 @@ alive-progress==3.1.2
anyio==3.7.0
appdirs==1.4.4
argilla==1.8.0
asgiref==3.7.2
async-timeout==4.0.2
attrs==23.1.0
backoff==2.2.1
......
@@ -2,11 +2,11 @@ import os, json, tempfile
from urllib.parse import urlparse
from requests_html import HTMLSession
from langchain.document_loaders import UnstructuredHTMLLoader
from .link_utils import append_meta, AsyncHTMLSessionFixed
from .utils import tokenize, ada_v2_cost
import requests
from bs4 import BeautifulSoup

# Example Channel URL https://tim.blog/2022/08/09/nft-insider-trading-policy/
def link():
  totalTokens = 0
@@ -21,7 +21,7 @@ def link():
  if(req.ok == False):
    print("Could not reach this url!")
    exit(1)

  req.html.render()
  full_text = None
  with tempfile.NamedTemporaryFile(mode = "w") as tmp:
@@ -31,7 +31,7 @@ def link():
    data = loader.load()[0]
    full_text = data.page_content
    tmp.close()

  link = append_meta(req, full_text, True)
  if(len(full_text) > 0):
    totalTokens += len(tokenize(full_text))
@@ -39,8 +39,8 @@ def link():
    output_filename = f"website-{source.netloc}-{source.path.replace('/','_')}.json"
    output_path = f"./outputs/website-logs"

    transaction_output_filename = f"website-{source.path.replace('/','_')}.json"
    transaction_output_dir = f"../server/storage/documents/custom-documents"

    if os.path.isdir(output_path) == False:
      os.makedirs(output_path)
@@ -64,6 +64,57 @@ def link():
  print(f"////////////////////////////")
  exit(0)
async def process_single_link(url):
  session = None
  try:
    print(f"Working on {url}...")
    session = AsyncHTMLSessionFixed()
    req = await session.get(url)
    await req.html.arender()
    await session.close()

    if not req.ok:
      return False, "Could not reach this URL."

    full_text = None
    with tempfile.NamedTemporaryFile(mode = "w") as tmp:
      tmp.write(req.html.html)
      tmp.seek(0)
      loader = UnstructuredHTMLLoader(tmp.name)
      data = loader.load()[0]
      full_text = data.page_content
      tmp.close()

    if full_text:
      link_meta = append_meta(req, full_text, True)
      source = urlparse(req.url)
      transaction_output_dir = "../server/storage/documents/custom-documents"
      transaction_output_filename = f"website-{source.netloc}-{source.path.replace('/', '_')}.json"
      if not os.path.isdir(transaction_output_dir):
        os.makedirs(transaction_output_dir)

      file_path = os.path.join(transaction_output_dir, transaction_output_filename)
      with open(file_path, 'w', encoding='utf-8') as file:
        json.dump(link_meta, file, ensure_ascii=False, indent=4)

      return True, "Content fetched and saved."
    else:
      return False, "Could not parse any meaningful data from this URL."
  except Exception as e:
    if session is not None:
      await session.close()  # Kill the hanging browser session.
    return False, str(e)

def crawler():
  prompt = "Paste in root URI of the pages of interest: "
  new_link = input(prompt)
@@ -91,17 +142,17 @@ def crawler():
        print (data + " does not apply for linking...")
      except:
        print (data + " does not apply for linking...")

  #parse the links found
  parse_links(links)

def links():
  links = []
  prompt = "Paste in the URL of an online article or blog: "
  done = False

  while(done == False):
    new_link = input(prompt)
    if(len(new_link) == 0):
      done = True
      links = [*set(links)]
      continue
@@ -119,17 +170,17 @@ def links():

# parse links from array
def parse_links(links):
  totalTokens = 0
  for link in links:
    print(f"Working on {link}...")
    session = HTMLSession()

    req = session.get(link, timeout=20)
    if not req.ok:
      print(f"Could not reach {link} - skipping!")
      continue

    req.html.render(timeout=10)
    full_text = None
    with tempfile.NamedTemporaryFile(mode="w") as tmp:
@@ -139,15 +190,15 @@ def parse_links(links):
      data = loader.load()[0]
      full_text = data.page_content
      tmp.close()

    link = append_meta(req, full_text, True)
    if len(full_text) > 0:
      source = urlparse(req.url)
      output_filename = f"website-{source.netloc}-{source.path.replace('/','_')}.json"
      output_path = f"./outputs/website-logs"

      transaction_output_filename = f"website-{source.path.replace('/','_')}.json"
      transaction_output_dir = f"../server/storage/documents/custom-documents"

      if not os.path.isdir(output_path):
        os.makedirs(output_path)
@@ -168,7 +219,7 @@ def parse_links(links):
      req.session.close()
    else:
      print(f"Could not parse any meaningful data from {link}.")
      continue

  print(f"\n\n[Success]: {len(links)} article or link contents fetched!")
  print(f"////////////////////////////")
......
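`process_single_link` can also be driven outside Flask; a minimal sketch, assuming the module path `scripts.link` from the collector API's import above:

```python
import asyncio

from scripts.link import process_single_link  # path assumed from this diff

async def main():
    # Scrape, render, and persist one URL into custom-documents.
    success, reason = await process_single_link("https://example.com/some-article")
    print(f"success={success} reason={reason}")

if __name__ == "__main__":
    asyncio.run(main())
```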
import json, pyppeteer
from datetime import datetime
from dotenv import load_dotenv
from .watch.utils import guid
from .utils import tokenize
from requests_html import AsyncHTMLSession

load_dotenv()
def normalize_url(url):
  if(url.endswith('.web')):
    return url
  return f"{url}.web"

def append_meta(request, text, metadata_only = False):
  meta = {
    'id': guid(),
    'url': normalize_url(request.url),
    'title': request.html.find('title', first=True).text if len(request.html.find('title')) != 0 else '',
    'docAuthor': 'N/A',
    'description': request.html.find('meta[name="description"]', first=True).attrs.get('content') if request.html.find('meta[name="description"]', first=True) != None else '',
    'docSource': 'web page',
    'chunkSource': request.url,
    'published': request.html.find('meta[property="article:published_time"]', first=True).attrs.get('content') if request.html.find('meta[property="article:published_time"]', first=True) != None else datetime.today().strftime('%Y-%m-%d %H:%M:%S'),
    'wordCount': len(text.split(' ')),
    'pageContent': text,
    'token_count_estimate': len(tokenize(text)),
  }
  return "Article JSON Metadata:\n" + json.dumps(meta) + "\n\n\nText Content:\n" + text if metadata_only == False else meta

class AsyncHTMLSessionFixed(AsyncHTMLSession):
  """
  pip3 install websockets==6.0 --force-reinstall
  """
  def __init__(self, **kwargs):
    super(AsyncHTMLSessionFixed, self).__init__(**kwargs)
    self.__browser_args = kwargs.get("browser_args", ["--no-sandbox"])

  @property
  async def browser(self):
    if not hasattr(self, "_browser"):
      self._browser = await pyppeteer.launch(ignoreHTTPSErrors=not(self.verify), headless=True, handleSIGINT=False, handleSIGTERM=False, handleSIGHUP=False, args=self.__browser_args)
    return self._browser
\ No newline at end of file
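The `AsyncHTMLSessionFixed` subclass overrides the `browser` property so Chromium launches with `handleSIGINT`/`handleSIGTERM`/`handleSIGHUP` disabled; pyppeteer can only install signal handlers on the main thread, so this avoids crashes when rendering runs inside a server worker (my reading of the intent, along with the `websockets==6.0` pin its docstring calls for). A usage sketch, assuming the class is importable from `scripts.link_utils`:

```python
import asyncio

from scripts.link_utils import AsyncHTMLSessionFixed  # path assumed

async def rendered_html(url):
    # Fetch a page, execute its JavaScript in headless Chromium, return HTML.
    session = AsyncHTMLSessionFixed()
    req = await session.get(url)
    await req.html.arender()
    html = req.html.html
    await session.close()
    return html

print(asyncio.run(rendered_html("https://example.com"))[:200])
```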
@@ -15,14 +15,14 @@ services:
      context: ../.
      dockerfile: ./docker/Dockerfile
      args:
        ARG_UID: ${UID:-1000}
        ARG_GID: ${GID:-1000}
    volumes:
      - "./.env:/app/server/.env"
      - "../server/storage:/app/server/storage"
      - "../collector/hotdir/:/app/collector/hotdir"
      - "../collector/outputs/:/app/collector/outputs"
    user: "${UID:-1000}:${GID:-1000}"
    ports:
      - "3001:3001"
    env_file:
......
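`${UID:-1000}` is Compose's shell-style parameter expansion: when `UID` is unset or empty, `1000` is substituted, so the build no longer fails on hosts that don't export `UID`/`GID`. The same fallback logic, sketched in Python:

```python
import os

# Mirrors Compose's ${UID:-1000} / ${GID:-1000} defaults: fall back to
# "1000" when the variable is unset or empty.
uid = os.environ.get("UID") or "1000"
gid = os.environ.get("GID") or "1000"
print(f"user={uid}:{gid}")  # "1000:1000" unless the host exports both
```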
@@ -3,6 +3,7 @@ import PreLoader from "../../../../Preloader";
import { useEffect, useState } from "react";
import FolderRow from "./FolderRow";
import pluralize from "pluralize";
import Workspace from "../../../../../models/workspace";

export default function Directory({
  files,
@@ -139,6 +140,7 @@ export default function Directory({
            fileTypes={fileTypes}
            workspace={workspace}
            fetchKeys={fetchKeys}
            setLoading={setLoading}
          />
        </div>
      </div>
......
@@ -5,10 +5,38 @@ import System from "../../../../../models/system";
import { useDropzone } from "react-dropzone";
import { v4 } from "uuid";
import FileUploadProgress from "./FileUploadProgress";
import Workspace from "../../../../../models/workspace";

export default function UploadFile({
  workspace,
  fileTypes,
  fetchKeys,
  setLoading,
}) {
  const [ready, setReady] = useState(false);
  const [files, setFiles] = useState([]);
  const [fetchingUrl, setFetchingUrl] = useState(false);

  const handleSendLink = async (e) => {
    e.preventDefault();
    setLoading(true);
    setFetchingUrl(true);
    const formEl = e.target;
    const form = new FormData(formEl);
    const { response, data } = await Workspace.uploadLink(
      workspace.slug,
      form.get("link")
    );
    if (!response.ok) {
      showToast(`Error uploading link: ${data.error}`, "error");
    } else {
      fetchKeys(true);
      showToast("Link uploaded successfully", "success");
      formEl.reset();
    }
    setLoading(false);
    setFetchingUrl(false);
  };

  const handleUploadSuccess = () => {
    fetchKeys(true);
@@ -103,6 +131,26 @@ export default function UploadFile({ workspace, fileTypes, fetchKeys }) {
          </div>
        )}
      </div>
      <div className="text-center text-white text-opacity-50 text-xs font-medium w-[560px] py-2">
        or submit a link
      </div>
      <form onSubmit={handleSendLink} className="flex gap-x-2">
        <input
          disabled={fetchingUrl}
          name="link"
          type="url"
          className="disabled:bg-zinc-600 disabled:text-slate-300 bg-zinc-900 text-white text-sm rounded-lg focus:ring-blue-500 focus:border-blue-500 block w-3/4 p-2.5"
          placeholder={"https://example.com"}
          autoComplete="off"
        />
        <button
          disabled={fetchingUrl}
          type="submit"
          className="disabled:bg-white/20 disabled:text-slate-300 disabled:border-slate-400 disabled:cursor-wait bg-transparent hover:bg-slate-200 hover:text-slate-800 w-auto border border-white text-sm text-white p-2.5 rounded-lg transition-all duration-300"
        >
          {fetchingUrl ? "Fetching..." : "Fetch website"}
        </button>
      </form>
      <div className="mt-6 text-center text-white text-opacity-80 text-xs font-medium w-[560px]">
        These files will be uploaded to the document processor running on this
        AnythingLLM instance. These files are not sent or shared with a third
......
@@ -138,6 +138,16 @@ const Workspace = {
    const data = await response.json();
    return { response, data };
  },
  uploadLink: async function (slug, link) {
    const response = await fetch(`${API_BASE}/workspace/${slug}/upload-link`, {
      method: "POST",
      body: JSON.stringify({ link }),
      headers: baseHeaders(),
    });
    const data = await response.json();
    return { response, data };
  },

  // TODO: Deprecated and should be removed from frontend.
  sendChat: async function ({ slug }, message, mode = "query") {
......
@@ -8,8 +8,7 @@ export function formatDate(dateString) {
}

export function getFileExtension(path) {
  return path?.split(".")?.slice(-1)?.[0] || "file";
}

export function truncate(str, n) {
......
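The regex in `getFileExtension` gives way to a plain split. The two mostly agree on simple paths but diverge on URLs carrying query strings; rough Python mirrors of both behaviors (names are illustrative, not from the codebase):

```python
import re

def ext_old(path):
    # Mirrors the removed regex approach.
    m = re.search(r'[^/\\&?]+\.\w{1,4}(?=([?&].*$|$))', path)
    return m.group(0).split('.')[-1] if m else 'file'

def ext_new(path):
    # Mirrors path?.split(".")?.slice(-1)?.[0] || "file".
    return (path.split('.')[-1] if path else '') or 'file'

print(ext_old("https://x.com/doc.pdf?dl=1"))  # 'pdf'
print(ext_new("https://x.com/doc.pdf?dl=1"))  # 'pdf?dl=1' -- query string leaks in
```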
@@ -9,6 +9,7 @@ const { setupMulter } = require("../utils/files/multer");
const {
  checkPythonAppAlive,
  processDocument,
  processLink,
} = require("../utils/files/documentProcessor");
const { validatedRequest } = require("../utils/middleware/validatedRequest");
const { Telemetry } = require("../models/telemetry");
@@ -107,6 +108,38 @@ function workspaceEndpoints(app) {
    }
  );
  app.post(
    "/workspace/:slug/upload-link",
    [validatedRequest],
    async (request, response) => {
      const { link = "" } = reqBody(request);
      const processingOnline = await checkPythonAppAlive();

      if (!processingOnline) {
        response
          .status(500)
          .json({
            success: false,
            error: `Python processing API is not online. Link ${link} will not be processed automatically.`,
          })
          .end();
        return;
      }

      const { success, reason } = await processLink(link);
      if (!success) {
        response.status(500).json({ success: false, error: reason }).end();
        return;
      }

      console.log(
        `Link ${link} processed successfully. It is now available in documents.`
      );
      await Telemetry.sendTelemetry("link_uploaded");
      response.status(200).json({ success: true, error: null });
    }
  );
  app.post(
    "/workspace/:slug/update-embeddings",
    [validatedRequest],
......
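From outside the frontend, the new workspace route can be exercised directly; a sketch assuming the server on localhost:3001 with routes mounted under `/api` (the frontend's usual `API_BASE`), and whatever auth header `validatedRequest` expects in your setup:

```python
import requests

# Illustrative call to the new upload-link endpoint (not from the repo).
resp = requests.post(
    "http://localhost:3001/api/workspace/my-workspace/upload-link",
    json={"link": "https://example.com/some-article"},
    headers={"Authorization": "Bearer <token-if-auth-enabled>"},  # assumption
)
print(resp.status_code, resp.json())
```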
@@ -39,8 +39,29 @@ async function processDocument(filename = "") {
  });
}
async function processLink(link = "") {
  if (!link) return false;
  return await fetch(`${PYTHON_API}/process-link`, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
    },
    body: JSON.stringify({ link }),
  })
    .then((res) => {
      if (!res.ok) throw new Error("Response could not be completed");
      return res.json();
    })
    .then((res) => res)
    .catch((e) => {
      console.log(e.message);
      return { success: false, reason: e.message };
    });
}
}
module.exports = {
  checkPythonAppAlive,
  processDocument,
  processLink,
  acceptedFileTypes,
};