From b35feede879c543e2d6cb58c89f973b29073ecc0 Mon Sep 17 00:00:00 2001
From: Timothy Carambat <rambat1010@gmail.com>
Date: Tue, 16 Jan 2024 16:04:22 -0800
Subject: [PATCH] 570 document api return object (#608)

* Add support for fetching single document in documents folder

* Add document object to upload + support link scraping via API

* hotfixes for documentation

* update api docs
---
 collector/index.js                            |  16 ++-
 collector/processLink/convert/generic.js      |  13 +-
 .../processSingleFile/convert/asAudio.js      |  14 +-
 collector/processSingleFile/convert/asDocx.js |  13 +-
 collector/processSingleFile/convert/asMbox.js |  15 ++-
 .../processSingleFile/convert/asOfficeMime.js |  13 +-
 collector/processSingleFile/convert/asPDF.js  |  13 +-
 collector/processSingleFile/convert/asTxt.js  |  13 +-
 collector/processSingleFile/index.js          |   4 +
 collector/utils/files/index.js                |  19 ++-
 server/endpoints/api/document/index.js        | 121 +++++++++++++++++-
 server/endpoints/api/workspace/index.js       |   4 +-
 server/swagger/openapi.json                   | 105 ++++++++++++++-
 server/utils/files/documentProcessor.js       |   4 +-
 14 files changed, 324 insertions(+), 43 deletions(-)

diff --git a/collector/index.js b/collector/index.js
index 5070ae72f..062d78959 100644
--- a/collector/index.js
+++ b/collector/index.js
@@ -29,14 +29,21 @@ app.post("/process", async function (request, response) {
     const targetFilename = path
       .normalize(filename)
       .replace(/^(\.\.(\/|\\|$))+/, "");
-    const { success, reason } = await processSingleFile(targetFilename);
-    response.status(200).json({ filename: targetFilename, success, reason });
+    const {
+      success,
+      reason,
+      documents = [],
+    } = await processSingleFile(targetFilename);
+    response
+      .status(200)
+      .json({ filename: targetFilename, success, reason, documents });
   } catch (e) {
     console.error(e);
     response.status(200).json({
       filename: filename,
       success: false,
       reason: "A processing error occurred.",
+      documents: [],
     });
   }
   return;
@@ -45,14 +52,15 @@ app.post("/process", async function (request, response) {
 app.post("/process-link", async function (request, response) {
   const { link } = reqBody(request);
   try {
-    const { success, reason } = await processLink(link);
-    response.status(200).json({ url: link, success, reason });
+    const { success, reason, documents = [] } = await processLink(link);
+    response.status(200).json({ url: link, success, reason, documents });
   } catch (e) {
     console.error(e);
     response.status(200).json({
       url: link,
       success: false,
       reason: "A processing error occurred.",
+      documents: [],
     });
   }
   return;
diff --git a/collector/processLink/convert/generic.js b/collector/processLink/convert/generic.js
index f42dcd171..c6431d733 100644
--- a/collector/processLink/convert/generic.js
+++ b/collector/processLink/convert/generic.js
@@ -12,7 +12,11 @@ async function scrapeGenericUrl(link) {
 
   if (!content.length) {
     console.error(`Resulting URL content was empty at ${link}.`);
-    return { success: false, reason: `No URL content found at ${link}.` };
+    return {
+      success: false,
+      reason: `No URL content found at ${link}.`,
+      documents: [],
+    };
   }
 
   const url = new URL(link);
@@ -32,9 +36,12 @@ async function scrapeGenericUrl(link) {
     token_count_estimate: tokenizeString(content).length,
   };
 
-  writeToServerDocuments(data, `url-${slugify(filename)}-${data.id}`);
+  const document = writeToServerDocuments(
+    data,
+    `url-${slugify(filename)}-${data.id}`
+  );
   console.log(`[SUCCESS]: URL ${link} converted & ready for embedding.\n`);
-  return { success: true, reason: null };
+  return { success: true, reason: null, documents: [document] };
 }
 
 async function getPageContent(link) {
diff --git a/collector/processSingleFile/convert/asAudio.js b/collector/processSingleFile/convert/asAudio.js
index a15207fba..7688d7b85 100644
--- a/collector/processSingleFile/convert/asAudio.js
+++ b/collector/processSingleFile/convert/asAudio.js
@@ -31,6 +31,7 @@ async function asAudio({ fullFilePath = "", filename = "" }) {
     return {
       success: false,
       reason: `Failed to parse content from ${filename}.`,
+      documents: [],
     };
   }
 
@@ -43,7 +44,11 @@ async function asAudio({ fullFilePath = "", filename = "" }) {
   if (!content.length) {
     console.error(`Resulting text content was empty for ${filename}.`);
     trashFile(fullFilePath);
-    return { success: false, reason: `No text content found in ${filename}.` };
+    return {
+      success: false,
+      reason: `No text content found in ${filename}.`,
+      documents: [],
+    };
   }
 
   const data = {
@@ -60,12 +65,15 @@ async function asAudio({ fullFilePath = "", filename = "" }) {
     token_count_estimate: tokenizeString(content).length,
   };
 
-  writeToServerDocuments(data, `${slugify(filename)}-${data.id}`);
+  const document = writeToServerDocuments(
+    data,
+    `${slugify(filename)}-${data.id}`
+  );
   trashFile(fullFilePath);
   console.log(
     `[SUCCESS]: ${filename} transcribed, converted & ready for embedding.\n`
   );
-  return { success: true, reason: null };
+  return { success: true, reason: null, documents: [document] };
 }
 
 async function convertToWavAudioData(sourcePath) {
diff --git a/collector/processSingleFile/convert/asDocx.js b/collector/processSingleFile/convert/asDocx.js
index 7a64a042d..b4fe7d2c9 100644
--- a/collector/processSingleFile/convert/asDocx.js
+++ b/collector/processSingleFile/convert/asDocx.js
@@ -24,7 +24,11 @@ async function asDocX({ fullFilePath = "", filename = "" }) {
   if (!pageContent.length) {
     console.error(`Resulting text content was empty for ${filename}.`);
     trashFile(fullFilePath);
-    return { success: false, reason: `No text content found in ${filename}.` };
+    return {
+      success: false,
+      reason: `No text content found in ${filename}.`,
+      documents: [],
+    };
   }
 
   const content = pageContent.join("");
@@ -42,10 +46,13 @@ async function asDocX({ fullFilePath = "", filename = "" }) {
     token_count_estimate: tokenizeString(content).length,
   };
 
-  writeToServerDocuments(data, `${slugify(filename)}-${data.id}`);
+  const document = writeToServerDocuments(
+    data,
+    `${slugify(filename)}-${data.id}`
+  );
   trashFile(fullFilePath);
   console.log(`[SUCCESS]: ${filename} converted & ready for embedding.\n`);
-  return { success: true, reason: null };
+  return { success: true, reason: null, documents: [document] };
 }
 
 module.exports = asDocX;
diff --git a/collector/processSingleFile/convert/asMbox.js b/collector/processSingleFile/convert/asMbox.js
index 30883f21b..f62f6b2ba 100644
--- a/collector/processSingleFile/convert/asMbox.js
+++ b/collector/processSingleFile/convert/asMbox.js
@@ -22,10 +22,15 @@ async function asMbox({ fullFilePath = "", filename = "" }) {
   if (!mails.length) {
     console.error(`Resulting mail items was empty for ${filename}.`);
     trashFile(fullFilePath);
-    return { success: false, reason: `No mail items found in ${filename}.` };
+    return {
+      success: false,
+      reason: `No mail items found in ${filename}.`,
+      documents: [],
+    };
   }
 
   let item = 1;
+  const documents = [];
   for (const mail of mails) {
     if (!mail.hasOwnProperty("text")) continue;
 
@@ -52,14 +57,18 @@ async function asMbox({ fullFilePath = "", filename = "" }) {
     };
 
     item++;
-    writeToServerDocuments(data, `${slugify(filename)}-${data.id}-msg-${item}`);
+    const document = writeToServerDocuments(
+      data,
+      `${slugify(filename)}-${data.id}-msg-${item}`
+    );
+    documents.push(document);
   }
 
   trashFile(fullFilePath);
   console.log(
     `[SUCCESS]: ${filename} messages converted & ready for embedding.\n`
   );
-  return { success: true, reason: null };
+  return { success: true, reason: null, documents };
 }
 
 module.exports = asMbox;
diff --git a/collector/processSingleFile/convert/asOfficeMime.js b/collector/processSingleFile/convert/asOfficeMime.js
index a6eb0351a..45b316610 100644
--- a/collector/processSingleFile/convert/asOfficeMime.js
+++ b/collector/processSingleFile/convert/asOfficeMime.js
@@ -20,7 +20,11 @@ async function asOfficeMime({ fullFilePath = "", filename = "" }) {
   if (!content.length) {
     console.error(`Resulting text content was empty for ${filename}.`);
     trashFile(fullFilePath);
-    return { success: false, reason: `No text content found in ${filename}.` };
+    return {
+      success: false,
+      reason: `No text content found in ${filename}.`,
+      documents: [],
+    };
   }
 
   const data = {
@@ -37,10 +41,13 @@ async function asOfficeMime({ fullFilePath = "", filename = "" }) {
     token_count_estimate: tokenizeString(content).length,
   };
 
-  writeToServerDocuments(data, `${slugify(filename)}-${data.id}`);
+  const document = writeToServerDocuments(
+    data,
+    `${slugify(filename)}-${data.id}`
+  );
   trashFile(fullFilePath);
   console.log(`[SUCCESS]: ${filename} converted & ready for embedding.\n`);
-  return { success: true, reason: null };
+  return { success: true, reason: null, documents: [document] };
 }
 
 module.exports = asOfficeMime;
diff --git a/collector/processSingleFile/convert/asPDF.js b/collector/processSingleFile/convert/asPDF.js
index f6d869d5c..b89b97411 100644
--- a/collector/processSingleFile/convert/asPDF.js
+++ b/collector/processSingleFile/convert/asPDF.js
@@ -29,7 +29,11 @@ async function asPDF({ fullFilePath = "", filename = "" }) {
   if (!pageContent.length) {
     console.error(`Resulting text content was empty for ${filename}.`);
     trashFile(fullFilePath);
-    return { success: false, reason: `No text content found in ${filename}.` };
+    return {
+      success: false,
+      reason: `No text content found in ${filename}.`,
+      documents: [],
+    };
   }
 
   const content = pageContent.join("");
@@ -47,10 +51,13 @@ async function asPDF({ fullFilePath = "", filename = "" }) {
     token_count_estimate: tokenizeString(content).length,
   };
 
-  writeToServerDocuments(data, `${slugify(filename)}-${data.id}`);
+  const document = writeToServerDocuments(
+    data,
+    `${slugify(filename)}-${data.id}`
+  );
   trashFile(fullFilePath);
   console.log(`[SUCCESS]: ${filename} converted & ready for embedding.\n`);
-  return { success: true, reason: null };
+  return { success: true, reason: null, documents: [document] };
 }
 
 module.exports = asPDF;
diff --git a/collector/processSingleFile/convert/asTxt.js b/collector/processSingleFile/convert/asTxt.js
index ad35e5476..cf7260d4b 100644
--- a/collector/processSingleFile/convert/asTxt.js
+++ b/collector/processSingleFile/convert/asTxt.js
@@ -19,7 +19,11 @@ async function asTxt({ fullFilePath = "", filename = "" }) {
   if (!content?.length) {
     console.error(`Resulting text content was empty for ${filename}.`);
     trashFile(fullFilePath);
-    return { success: false, reason: `No text content found in ${filename}.` };
+    return {
+      success: false,
+      reason: `No text content found in ${filename}.`,
+      documents: [],
+    };
   }
 
   console.log(`-- Working ${filename} --`);
@@ -37,10 +41,13 @@ async function asTxt({ fullFilePath = "", filename = "" }) {
     token_count_estimate: tokenizeString(content).length,
   };
 
-  writeToServerDocuments(data, `${slugify(filename)}-${data.id}`);
+  const document = writeToServerDocuments(
+    data,
+    `${slugify(filename)}-${data.id}`
+  );
   trashFile(fullFilePath);
   console.log(`[SUCCESS]: ${filename} converted & ready for embedding.\n`);
-  return { success: true, reason: null };
+  return { success: true, reason: null, documents: [document] };
 }
 
 module.exports = asTxt;
diff --git a/collector/processSingleFile/index.js b/collector/processSingleFile/index.js
index 37c9fd5c5..9efd3a70f 100644
--- a/collector/processSingleFile/index.js
+++ b/collector/processSingleFile/index.js
@@ -13,11 +13,13 @@ async function processSingleFile(targetFilename) {
     return {
       success: false,
       reason: "Filename is a reserved filename and cannot be processed.",
+      documents: [],
     };
   if (!fs.existsSync(fullFilePath))
     return {
       success: false,
       reason: "File does not exist in upload directory.",
+      documents: [],
     };
 
   const fileExtension = path.extname(fullFilePath).toLowerCase();
@@ -25,6 +27,7 @@ async function processSingleFile(targetFilename) {
     return {
       success: false,
       reason: `No file extension found. This file cannot be processed.`,
+      documents: [],
     };
   }
 
@@ -33,6 +36,7 @@ async function processSingleFile(targetFilename) {
     return {
       success: false,
       reason: `File extension ${fileExtension} not supported for parsing.`,
+      documents: [],
     };
   }
 
diff --git a/collector/utils/files/index.js b/collector/utils/files/index.js
index 915c4ac10..caf33c888 100644
--- a/collector/utils/files/index.js
+++ b/collector/utils/files/index.js
@@ -38,14 +38,19 @@ function writeToServerDocuments(
       );
   if (!fs.existsSync(destination))
     fs.mkdirSync(destination, { recursive: true });
-  const destinationFilePath = path.resolve(destination, filename);
+  const destinationFilePath = path.resolve(destination, filename) + ".json";
 
-  fs.writeFileSync(
-    destinationFilePath + ".json",
-    JSON.stringify(data, null, 4),
-    { encoding: "utf-8" }
-  );
-  return;
+  fs.writeFileSync(destinationFilePath, JSON.stringify(data, null, 4), {
+    encoding: "utf-8",
+  });
+
+  return {
+    ...data,
+    // Relative "<folder>/<file>" location accepted by the /update-embeddings api.
+    // Folders are only 1 level deep (GitHub/YouTube integrations included), so the
+    // last two segments suffice; split on path.sep so Windows paths work too.
+    location: destinationFilePath.split(path.sep).slice(-2).join("/"),
+  };
 }
 
 // When required we can wipe the entire collector hotdir and tmp storage in case
diff --git a/server/endpoints/api/document/index.js b/server/endpoints/api/document/index.js
index f1282e7c2..817043526 100644
--- a/server/endpoints/api/document/index.js
+++ b/server/endpoints/api/document/index.js
@@ -5,11 +5,13 @@ const {
   checkProcessorAlive,
   acceptedFileTypes,
   processDocument,
+  processLink,
 } = require("../../../utils/files/documentProcessor");
 const {
   viewLocalFiles,
   findDocumentInDocuments,
 } = require("../../../utils/files");
+const { reqBody } = require("../../../utils/http");
 const { handleUploads } = setupMulter();
 
 function apiDocumentEndpoints(app) {
@@ -23,7 +25,6 @@ function apiDocumentEndpoints(app) {
       /* 
     #swagger.tags = ['Documents']
     #swagger.description = 'Upload a new file to AnythingLLM to be parsed and prepared for embedding.'
-
     #swagger.requestBody = {
       description: 'File to be uploaded.',
       required: true,
@@ -50,6 +51,21 @@ function apiDocumentEndpoints(app) {
             example: {
               success: true,
               error: null,
+              documents: [
+                {
+                  "location": "custom-documents/anythingllm.txt-6e8be64c-c162-4b43-9997-b068c0071e8b.json",
+                  "name": "anythingllm.txt-6e8be64c-c162-4b43-9997-b068c0071e8b.json",
+                  "url": "file:///Users/tim/Documents/anything-llm/collector/hotdir/anythingllm.txt",
+                  "title": "anythingllm.txt",
+                  "docAuthor": "Unknown",
+                  "description": "Unknown",
+                  "docSource": "a text file uploaded by the user.",
+                  "chunkSource": "anythingllm.txt",
+                  "published": "1/16/2024, 3:07:00 PM",
+                  "wordCount": 93,
+                  "token_count_estimate": 115,
+                }
+              ]
             }
           }
         }           
@@ -75,16 +91,113 @@ function apiDocumentEndpoints(app) {
             .end();
         }
 
-        const { success, reason } = await processDocument(originalname);
+        const { success, reason, documents } =
+          await processDocument(originalname);
         if (!success) {
-          response.status(500).json({ success: false, error: reason }).end();
+          response
+            .status(500)
+            .json({ success: false, error: reason, documents })
+            .end();
+          return;
         }
 
         console.log(
           `Document ${originalname} uploaded processed and successfully. It is now available in documents.`
         );
         await Telemetry.sendTelemetry("document_uploaded");
-        response.status(200).json({ success: true, error: null });
+        response.status(200).json({ success: true, error: null, documents });
+      } catch (e) {
+        console.log(e.message, e);
+        response.sendStatus(500).end();
+      }
+    }
+  );
+
+  app.post(
+    "/v1/document/upload-link",
+    [validApiKey],
+    async (request, response) => {
+      /* 
+    #swagger.tags = ['Documents']
+    #swagger.description = 'Upload a valid URL for AnythingLLM to scrape and prepare for embedding.'
+    #swagger.requestBody = {
+      description: 'Link of web address to be scraped.',
+      required: true,
+      type: 'file',
+      content: {
+          "application/json": {
+            schema: {
+              type: 'object',
+              example: {
+                "link": "https://useanything.com"
+              }
+            }
+          }           
+        }
+    }
+    #swagger.responses[200] = {
+      content: {
+        "application/json": {
+          schema: {
+            type: 'object',
+            example: {
+              success: true,
+              error: null,
+              documents: [
+                {
+                  "id": "c530dbe6-bff1-4b9e-b87f-710d539d20bc",
+                  "url": "file://useanything_com.html",
+                  "title": "useanything_com.html",
+                  "docAuthor": "no author found",
+                  "description": "No description found.",
+                  "docSource": "URL link uploaded by the user.",
+                  "chunkSource": "https:useanything.com.html",
+                  "published": "1/16/2024, 3:46:33 PM",
+                  "wordCount": 252,
+                  "pageContent": "AnythingLLM is the best....",
+                  "token_count_estimate": 447,
+                  "location": "custom-documents/url-useanything_com-c530dbe6-bff1-4b9e-b87f-710d539d20bc.json"
+                }
+              ]
+            }
+          }
+        }           
+      }
+    }  
+    #swagger.responses[403] = {
+      schema: {
+        "$ref": "#/definitions/InvalidAPIKey"
+      }
+    }
+    */
+      try {
+        const { link } = reqBody(request);
+        const processingOnline = await checkProcessorAlive();
+
+        if (!processingOnline) {
+          // Stop after reporting that the collector is offline. Without this
+          // return the handler falls through to processLink() and then tries
+          // to send a second response on the same request.
+          return response.status(500).json({
+            success: false,
+            error: `Document processing API is not online. Link ${link} will not be processed automatically.`,
+          });
+        }
+
+        const { success, reason, documents } = await processLink(link);
+        if (!success) {
+          response
+            .status(500)
+            .json({ success: false, error: reason, documents })
+            .end();
+          return;
+        }
+
+        console.log(
+          `Link ${link} uploaded processed and successfully. It is now available in documents.`
+        );
+        await Telemetry.sendTelemetry("document_uploaded");
+        response.status(200).json({ success: true, error: null, documents });
       } catch (e) {
         console.log(e.message, e);
         response.sendStatus(500).end();
diff --git a/server/endpoints/api/workspace/index.js b/server/endpoints/api/workspace/index.js
index 365e8b014..c1642ce4a 100644
--- a/server/endpoints/api/workspace/index.js
+++ b/server/endpoints/api/workspace/index.js
@@ -381,8 +381,8 @@ function apiWorkspaceEndpoints(app) {
       content: {
         "application/json": {
           example: {
-            adds: [],
-            deletes: ["custom-documents/anythingllm-hash.json"]
+            adds: ["custom-documents/my-pdf.pdf-hash.json"],
+            deletes: ["custom-documents/anythingllm.txt-hash.json"]
           }
         }
       }
diff --git a/server/swagger/openapi.json b/server/swagger/openapi.json
index 7d91579fd..c7532059d 100644
--- a/server/swagger/openapi.json
+++ b/server/swagger/openapi.json
@@ -845,7 +845,22 @@
                   "type": "object",
                   "example": {
                     "success": true,
-                    "error": null
+                    "error": null,
+                    "documents": [
+                      {
+                        "location": "custom-documents/anythingllm.txt-6e8be64c-c162-4b43-9997-b068c0071e8b.json",
+                        "name": "anythingllm.txt-6e8be64c-c162-4b43-9997-b068c0071e8b.json",
+                        "url": "file://Users/tim/Documents/anything-llm/collector/hotdir/anythingllm.txt",
+                        "title": "anythingllm.txt",
+                        "docAuthor": "Unknown",
+                        "description": "Unknown",
+                        "docSource": "a text file uploaded by the user.",
+                        "chunkSource": "anythingllm.txt",
+                        "published": "1/16/2024, 3:07:00 PM",
+                        "wordCount": 93,
+                        "token_count_estimate": 115
+                      }
+                    ]
                   }
                 }
               }
@@ -890,6 +905,88 @@
         }
       }
     },
+    "/v1/document/upload-link": {
+      "post": {
+        "tags": [
+          "Documents"
+        ],
+        "description": "Upload a valid URL for AnythingLLM to scrape and prepare for embedding.",
+        "parameters": [
+          {
+            "name": "Authorization",
+            "in": "header",
+            "schema": {
+              "type": "string"
+            }
+          }
+        ],
+        "responses": {
+          "200": {
+            "description": "OK",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "type": "object",
+                  "example": {
+                    "success": true,
+                    "error": null,
+                    "documents": [
+                      {
+                        "id": "c530dbe6-bff1-4b9e-b87f-710d539d20bc",
+                        "url": "file://useanything_com.html",
+                        "title": "useanything_com.html",
+                        "docAuthor": "no author found",
+                        "description": "No description found.",
+                        "docSource": "URL link uploaded by the user.",
+                        "chunkSource": "https:useanything.com.html",
+                        "published": "1/16/2024, 3:46:33 PM",
+                        "wordCount": 252,
+                        "pageContent": "AnythingLLM is the best....",
+                        "token_count_estimate": 447,
+                        "location": "custom-documents/url-useanything_com-c530dbe6-bff1-4b9e-b87f-710d539d20bc.json"
+                      }
+                    ]
+                  }
+                }
+              }
+            }
+          },
+          "403": {
+            "description": "Forbidden",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/InvalidAPIKey"
+                }
+              },
+              "application/xml": {
+                "schema": {
+                  "$ref": "#/components/schemas/InvalidAPIKey"
+                }
+              }
+            }
+          },
+          "500": {
+            "description": "Internal Server Error"
+          }
+        },
+        "requestBody": {
+          "description": "Link of web address to be scraped.",
+          "required": true,
+          "type": "file",
+          "content": {
+            "application/json": {
+              "schema": {
+                "type": "object",
+                "example": {
+                  "link": "https://useanything.com"
+                }
+              }
+            }
+          }
+        }
+      }
+    },
     "/v1/documents": {
       "get": {
         "tags": [
@@ -1593,9 +1690,11 @@
           "content": {
             "application/json": {
               "example": {
-                "adds": [],
+                "adds": [
+                  "custom-documents/my-pdf.pdf-hash.json"
+                ],
                 "deletes": [
-                  "custom-documents/anythingllm-hash.json"
+                  "custom-documents/anythingllm.txt-hash.json"
                 ]
               }
             }
diff --git a/server/utils/files/documentProcessor.js b/server/utils/files/documentProcessor.js
index 5239a8708..27d0f5f2b 100644
--- a/server/utils/files/documentProcessor.js
+++ b/server/utils/files/documentProcessor.js
@@ -35,7 +35,7 @@ async function processDocument(filename = "") {
     .then((res) => res)
     .catch((e) => {
       console.log(e.message);
-      return { success: false, reason: e.message };
+      return { success: false, reason: e.message, documents: [] };
     });
 }
 
@@ -55,7 +55,7 @@ async function processLink(link = "") {
     .then((res) => res)
     .catch((e) => {
       console.log(e.message);
-      return { success: false, reason: e.message };
+      return { success: false, reason: e.message, documents: [] };
     });
 }
 
-- 
GitLab