Feature/DocumentStore (#2106)

* datasource: initial commit

* datasource: datasource details and chunks

* datasource: Document Store Node

* more changes

* Document Store - Base functionality

* Document Store Loader Component

* Document Store Loader Component

* before merging the modularity PR

* after merging the modularity PR

* preview mode

* initial draft PR

* fixes

* minor updates and fixes

* preview with loader and splitter

* preview with credential

* show stored chunks

* preview update...

* edit config

* save, preview and other changes

* save, preview and other changes

* save, process and other changes

* save, process and other changes

* alpha1 - for internal testing

* rerouting urls

* bug fix on new loader create

* pagination support for chunks

* delete document store

* Update pnpm-lock.yaml

* doc store card view

* Update store files to use updated storage functions, Document Store Table View and other changes

* ui changes

* add expanded chunk dialog, improve ui

* change throw Error to InternalError

* Bug Fixes and removal of subFolder, adding of view chunks for store

* lint fixes

* merge changes

* DocumentStoreStatus component

* ui changes for doc store

* add remove metadata key field, add custom document loader

* add chatflows used doc store chips

* add types/interfaces to DocumentStore Services

* document loader list dialog title bar color change

* update interfaces

* Whereused Chatflow Name and Added chunkNo to retain order of created chunks.

* use typeorm order chunkNo, ui changes

---------

Co-authored-by: Henry <hzj94@hotmail.com>
Co-authored-by: Henry Heng <henryheng@flowiseai.com>
This commit is contained in:
Vinod Kiran
2024-05-06 19:53:27 +05:30
committed by GitHub
parent af4e28aa91
commit 40e36d1b39
91 changed files with 38713 additions and 32791 deletions
@@ -1,8 +1,9 @@
import axios, { AxiosRequestConfig } from 'axios'
import { omit } from 'lodash'
import { Document } from '@langchain/core/documents'
import { TextSplitter } from 'langchain/text_splitter'
import { BaseDocumentLoader } from 'langchain/document_loaders/base'
import { ICommonObject, INode, INodeData, INodeParams } from '../../../src/Interface'
import { ICommonObject, IDocument, INode, INodeData, INodeParams } from '../../../src/Interface'
class API_DocumentLoaders implements INode {
label: string
@@ -66,6 +67,25 @@ class API_DocumentLoaders implements INode {
'JSON body for the POST request. If not specified, agent will try to figure out itself from AIPlugin if provided',
additionalParams: true,
optional: true
},
{
label: 'Additional Metadata',
name: 'metadata',
type: 'json',
description: 'Additional metadata to be added to the extracted documents',
optional: true,
additionalParams: true
},
{
label: 'Omit Metadata Keys',
name: 'omitMetadataKeys',
type: 'string',
rows: 4,
description:
'Each document loader comes with a default set of metadata keys that are extracted from the document. You can use this field to omit some of the default metadata keys. The value should be a list of keys, seperated by comma',
placeholder: 'key1, key2, key3.nestedKey1',
optional: true,
additionalParams: true
}
]
}
@@ -76,6 +96,12 @@ class API_DocumentLoaders implements INode {
const method = nodeData.inputs?.method as string
const textSplitter = nodeData.inputs?.textSplitter as TextSplitter
const metadata = nodeData.inputs?.metadata
const _omitMetadataKeys = nodeData.inputs?.omitMetadataKeys as string
let omitMetadataKeys: string[] = []
if (_omitMetadataKeys) {
omitMetadataKeys = _omitMetadataKeys.split(',').map((key) => key.trim())
}
const options: ApiLoaderParams = {
url,
@@ -94,7 +120,7 @@ class API_DocumentLoaders implements INode {
const loader = new ApiLoader(options)
let docs = []
let docs: IDocument[] = []
if (textSplitter) {
docs = await loader.loadAndSplit(textSplitter)
@@ -104,18 +130,26 @@ class API_DocumentLoaders implements INode {
if (metadata) {
const parsedMetadata = typeof metadata === 'object' ? metadata : JSON.parse(metadata)
let finaldocs = []
for (const doc of docs) {
const newdoc = {
...doc,
metadata: {
docs = docs.map((doc) => ({
...doc,
metadata: omit(
{
...doc.metadata,
...parsedMetadata
}
}
finaldocs.push(newdoc)
}
return finaldocs
},
omitMetadataKeys
)
}))
} else {
docs = docs.map((doc) => ({
...doc,
metadata: omit(
{
...doc.metadata
},
omitMetadataKeys
)
}))
}
return docs
@@ -146,7 +180,7 @@ class ApiLoader extends BaseDocumentLoader {
this.method = method
}
public async load(): Promise<Document[]> {
public async load(): Promise<IDocument[]> {
if (this.method === 'POST') {
return this.executePostRequest(this.url, this.headers, this.body)
} else {
@@ -154,7 +188,7 @@ class ApiLoader extends BaseDocumentLoader {
}
}
protected async executeGetRequest(url: string, headers?: ICommonObject): Promise<Document[]> {
protected async executeGetRequest(url: string, headers?: ICommonObject): Promise<IDocument[]> {
try {
const config: AxiosRequestConfig = {}
if (headers) {
@@ -174,7 +208,7 @@ class ApiLoader extends BaseDocumentLoader {
}
}
protected async executePostRequest(url: string, headers?: ICommonObject, body?: ICommonObject): Promise<Document[]> {
protected async executePostRequest(url: string, headers?: ICommonObject, body?: ICommonObject): Promise<IDocument[]> {
try {
const config: AxiosRequestConfig = {}
if (headers) {
@@ -1,9 +1,10 @@
import axios from 'axios'
import { omit } from 'lodash'
import { Document } from '@langchain/core/documents'
import { TextSplitter } from 'langchain/text_splitter'
import { BaseDocumentLoader } from 'langchain/document_loaders/base'
import { getCredentialData, getCredentialParam } from '../../../src/utils'
import { ICommonObject, INode, INodeData, INodeParams } from '../../../src/Interface'
import { IDocument, ICommonObject, INode, INodeData, INodeParams } from '../../../src/Interface'
class Airtable_DocumentLoaders implements INode {
label: string
@@ -93,9 +94,21 @@ class Airtable_DocumentLoaders implements INode {
description: 'Number of results to return. Ignored when Return All is enabled.'
},
{
label: 'Metadata',
label: 'Additional Metadata',
name: 'metadata',
type: 'json',
description: 'Additional metadata to be added to the extracted documents',
optional: true,
additionalParams: true
},
{
label: 'Omit Metadata Keys',
name: 'omitMetadataKeys',
type: 'string',
rows: 4,
description:
'Each document loader comes with a default set of metadata keys that are extracted from the document. You can use this field to omit some of the default metadata keys. The value should be a list of keys, seperated by comma',
placeholder: 'key1, key2, key3.nestedKey1',
optional: true,
additionalParams: true
}
@@ -111,6 +124,12 @@ class Airtable_DocumentLoaders implements INode {
const limit = nodeData.inputs?.limit as string
const textSplitter = nodeData.inputs?.textSplitter as TextSplitter
const metadata = nodeData.inputs?.metadata
const _omitMetadataKeys = nodeData.inputs?.omitMetadataKeys as string
let omitMetadataKeys: string[] = []
if (_omitMetadataKeys) {
omitMetadataKeys = _omitMetadataKeys.split(',').map((key) => key.trim())
}
const credentialData = await getCredentialData(nodeData.credential ?? '', options)
const accessToken = getCredentialParam('accessToken', credentialData, nodeData)
@@ -131,7 +150,7 @@ class Airtable_DocumentLoaders implements INode {
throw new Error('Base ID and Table ID must be provided.')
}
let docs = []
let docs: IDocument[] = []
if (textSplitter) {
docs = await loader.loadAndSplit(textSplitter)
@@ -141,18 +160,26 @@ class Airtable_DocumentLoaders implements INode {
if (metadata) {
const parsedMetadata = typeof metadata === 'object' ? metadata : JSON.parse(metadata)
let finaldocs = []
for (const doc of docs) {
const newdoc = {
...doc,
metadata: {
docs = docs.map((doc) => ({
...doc,
metadata: omit(
{
...doc.metadata,
...parsedMetadata
}
}
finaldocs.push(newdoc)
}
return finaldocs
},
omitMetadataKeys
)
}))
} else {
docs = docs.map((doc) => ({
...doc,
metadata: omit(
{
...doc.metadata
},
omitMetadataKeys
)
}))
}
return docs
@@ -213,7 +240,7 @@ class AirtableLoader extends BaseDocumentLoader {
this.returnAll = returnAll
}
public async load(): Promise<Document[]> {
public async load(): Promise<IDocument[]> {
if (this.returnAll) {
return this.loadAll()
}
@@ -238,7 +265,7 @@ class AirtableLoader extends BaseDocumentLoader {
}
}
private createDocumentFromPage(page: AirtableLoaderPage): Document {
private createDocumentFromPage(page: AirtableLoaderPage): IDocument {
// Generate the URL
const pageUrl = `https://api.airtable.com/v0/${this.baseId}/${this.tableId}/${page.id}`
@@ -251,7 +278,7 @@ class AirtableLoader extends BaseDocumentLoader {
})
}
private async loadLimit(): Promise<Document[]> {
private async loadLimit(): Promise<IDocument[]> {
let data: AirtableLoaderRequest = {
maxRecords: this.limit,
view: this.viewId
@@ -282,7 +309,7 @@ class AirtableLoader extends BaseDocumentLoader {
return returnPages.map((page) => this.createDocumentFromPage(page))
}
private async loadAll(): Promise<Document[]> {
private async loadAll(): Promise<IDocument[]> {
let data: AirtableLoaderRequest = {
view: this.viewId
}
@@ -1,3 +1,4 @@
import { omit } from 'lodash'
import { INode, INodeData, INodeParams, ICommonObject } from '../../../src/Interface'
import { getCredentialData, getCredentialParam } from '../../../src/utils'
import { TextSplitter } from 'langchain/text_splitter'
@@ -92,9 +93,21 @@ class ApifyWebsiteContentCrawler_DocumentLoaders implements INode {
additionalParams: true
},
{
label: 'Metadata',
label: 'Additional Metadata',
name: 'metadata',
type: 'json',
description: 'Additional metadata to be added to the extracted documents',
optional: true,
additionalParams: true
},
{
label: 'Omit Metadata Keys',
name: 'omitMetadataKeys',
type: 'string',
rows: 4,
description:
'Each document loader comes with a default set of metadata keys that are extracted from the document. You can use this field to omit some of the default metadata keys. The value should be a list of keys, seperated by comma',
placeholder: 'key1, key2, key3.nestedKey1',
optional: true,
additionalParams: true
}
@@ -110,6 +123,12 @@ class ApifyWebsiteContentCrawler_DocumentLoaders implements INode {
async init(nodeData: INodeData, _: string, options: ICommonObject): Promise<any> {
const textSplitter = nodeData.inputs?.textSplitter as TextSplitter
const metadata = nodeData.inputs?.metadata
const _omitMetadataKeys = nodeData.inputs?.omitMetadataKeys as string
let omitMetadataKeys: string[] = []
if (_omitMetadataKeys) {
omitMetadataKeys = _omitMetadataKeys.split(',').map((key) => key.trim())
}
// Get input options and merge with additional input
const urls = nodeData.inputs?.urls as string
@@ -153,18 +172,26 @@ class ApifyWebsiteContentCrawler_DocumentLoaders implements INode {
if (metadata) {
const parsedMetadata = typeof metadata === 'object' ? metadata : JSON.parse(metadata)
let finaldocs = []
for (const doc of docs) {
const newdoc = {
...doc,
metadata: {
docs = docs.map((doc) => ({
...doc,
metadata: omit(
{
...doc.metadata,
...parsedMetadata
}
}
finaldocs.push(newdoc)
}
return finaldocs
},
omitMetadataKeys
)
}))
} else {
docs = docs.map((doc) => ({
...doc,
metadata: omit(
{
...doc.metadata
},
omitMetadataKeys
)
}))
}
return docs
@@ -1,10 +1,11 @@
import { ICommonObject, INode, INodeData, INodeParams } from '../../../src/Interface'
import { TextSplitter } from 'langchain/text_splitter'
import { omit } from 'lodash'
import { CheerioWebBaseLoader, WebBaseLoaderParams } from 'langchain/document_loaders/web/cheerio'
import { test } from 'linkifyjs'
import { parse } from 'css-what'
import { webCrawl, xmlScrape } from '../../../src'
import { SelectorType } from 'cheerio'
import { ICommonObject, IDocument, INode, INodeData, INodeParams } from '../../../src/Interface'
class Cheerio_DocumentLoaders implements INode {
label: string
@@ -55,6 +56,7 @@ class Cheerio_DocumentLoaders implements INode {
description: 'Scrape relative links from XML sitemap URL'
}
],
default: 'webCrawl',
optional: true,
additionalParams: true
},
@@ -78,9 +80,21 @@ class Cheerio_DocumentLoaders implements INode {
additionalParams: true
},
{
label: 'Metadata',
label: 'Additional Metadata',
name: 'metadata',
type: 'json',
description: 'Additional metadata to be added to the extracted documents',
optional: true,
additionalParams: true
},
{
label: 'Omit Metadata Keys',
name: 'omitMetadataKeys',
type: 'string',
rows: 4,
description:
'Each document loader comes with a default set of metadata keys that are extracted from the document. You can use this field to omit some of the default metadata keys. The value should be a list of keys, seperated by comma',
placeholder: 'key1, key2, key3.nestedKey1',
optional: true,
additionalParams: true
}
@@ -94,6 +108,13 @@ class Cheerio_DocumentLoaders implements INode {
const selectedLinks = nodeData.inputs?.selectedLinks as string[]
let limit = parseInt(nodeData.inputs?.limit as string)
const _omitMetadataKeys = nodeData.inputs?.omitMetadataKeys as string
let omitMetadataKeys: string[] = []
if (_omitMetadataKeys) {
omitMetadataKeys = _omitMetadataKeys.split(',').map((key) => key.trim())
}
let url = nodeData.inputs?.url as string
url = url.trim()
if (!test(url)) {
@@ -123,7 +144,8 @@ class Cheerio_DocumentLoaders implements INode {
}
}
let docs = []
let docs: IDocument[] = []
if (relativeLinksMethod) {
if (process.env.DEBUG === 'true') options.logger.info(`Start ${relativeLinksMethod}`)
// if limit is 0 we don't want it to default to 10 so we check explicitly for null or undefined
@@ -154,18 +176,26 @@ class Cheerio_DocumentLoaders implements INode {
if (metadata) {
const parsedMetadata = typeof metadata === 'object' ? metadata : JSON.parse(metadata)
let finaldocs = []
for (const doc of docs) {
const newdoc = {
...doc,
metadata: {
docs = docs.map((doc) => ({
...doc,
metadata: omit(
{
...doc.metadata,
...parsedMetadata
}
}
finaldocs.push(newdoc)
}
return finaldocs
},
omitMetadataKeys
)
}))
} else {
docs = docs.map((doc) => ({
...doc,
metadata: omit(
{
...doc.metadata
},
omitMetadataKeys
)
}))
}
return docs
@@ -1,3 +1,4 @@
import { omit } from 'lodash'
import { ICommonObject, INode, INodeData, INodeParams } from '../../../src/Interface'
import { TextSplitter } from 'langchain/text_splitter'
import { ConfluencePagesLoader, ConfluencePagesLoaderParams } from 'langchain/document_loaders/web/confluence'
@@ -59,9 +60,21 @@ class Confluence_DocumentLoaders implements INode {
optional: true
},
{
label: 'Metadata',
label: 'Additional Metadata',
name: 'metadata',
type: 'json',
description: 'Additional metadata to be added to the extracted documents',
optional: true,
additionalParams: true
},
{
label: 'Omit Metadata Keys',
name: 'omitMetadataKeys',
type: 'string',
rows: 4,
description:
'Each document loader comes with a default set of metadata keys that are extracted from the document. You can use this field to omit some of the default metadata keys. The value should be a list of keys, seperated by comma',
placeholder: 'key1, key2, key3.nestedKey1',
optional: true,
additionalParams: true
}
@@ -74,6 +87,12 @@ class Confluence_DocumentLoaders implements INode {
const limit = nodeData.inputs?.limit as number
const textSplitter = nodeData.inputs?.textSplitter as TextSplitter
const metadata = nodeData.inputs?.metadata
const _omitMetadataKeys = nodeData.inputs?.omitMetadataKeys as string
let omitMetadataKeys: string[] = []
if (_omitMetadataKeys) {
omitMetadataKeys = _omitMetadataKeys.split(',').map((key) => key.trim())
}
const credentialData = await getCredentialData(nodeData.credential ?? '', options)
const accessToken = getCredentialParam('accessToken', credentialData, nodeData)
@@ -107,18 +126,26 @@ class Confluence_DocumentLoaders implements INode {
if (metadata) {
const parsedMetadata = typeof metadata === 'object' ? metadata : JSON.parse(metadata)
let finaldocs = []
for (const doc of docs) {
const newdoc = {
...doc,
metadata: {
docs = docs.map((doc) => ({
...doc,
metadata: omit(
{
...doc.metadata,
...parsedMetadata
}
}
finaldocs.push(newdoc)
}
return finaldocs
},
omitMetadataKeys
)
}))
} else {
docs = docs.map((doc) => ({
...doc,
metadata: omit(
{
...doc.metadata
},
omitMetadataKeys
)
}))
}
return docs
@@ -1,4 +1,5 @@
import { ICommonObject, INode, INodeData, INodeParams } from '../../../src/Interface'
import { omit } from 'lodash'
import { ICommonObject, IDocument, INode, INodeData, INodeParams } from '../../../src/Interface'
import { TextSplitter } from 'langchain/text_splitter'
import { CSVLoader } from 'langchain/document_loaders/fs/csv'
import { getFileFromStorage } from '../../../src'
@@ -45,9 +46,21 @@ class Csv_DocumentLoaders implements INode {
optional: true
},
{
label: 'Metadata',
label: 'Additional Metadata',
name: 'metadata',
type: 'json',
description: 'Additional metadata to be added to the extracted documents',
optional: true,
additionalParams: true
},
{
label: 'Omit Metadata Keys',
name: 'omitMetadataKeys',
type: 'string',
rows: 4,
description:
'Each document loader comes with a default set of metadata keys that are extracted from the document. You can use this field to omit some of the default metadata keys. The value should be a list of keys, seperated by comma',
placeholder: 'key1, key2, key3.nestedKey1',
optional: true,
additionalParams: true
}
@@ -59,8 +72,14 @@ class Csv_DocumentLoaders implements INode {
const csvFileBase64 = nodeData.inputs?.csvFile as string
const columnName = nodeData.inputs?.columnName as string
const metadata = nodeData.inputs?.metadata
const _omitMetadataKeys = nodeData.inputs?.omitMetadataKeys as string
let alldocs = []
let omitMetadataKeys: string[] = []
if (_omitMetadataKeys) {
omitMetadataKeys = _omitMetadataKeys.split(',').map((key) => key.trim())
}
let docs: IDocument[] = []
let files: string[] = []
if (csvFileBase64.startsWith('FILE-STORAGE::')) {
@@ -78,11 +97,9 @@ class Csv_DocumentLoaders implements INode {
const loader = new CSVLoader(blob, columnName.trim().length === 0 ? undefined : columnName.trim())
if (textSplitter) {
const docs = await loader.loadAndSplit(textSplitter)
alldocs.push(...docs)
docs.push(...(await loader.loadAndSplit(textSplitter)))
} else {
const docs = await loader.load()
alldocs.push(...docs)
// No text splitter is configured in this branch: load the rows as-is rather than splitting them.
docs.push(...(await loader.load()))
}
}
} else {
@@ -100,32 +117,38 @@ class Csv_DocumentLoaders implements INode {
const loader = new CSVLoader(blob, columnName.trim().length === 0 ? undefined : columnName.trim())
if (textSplitter) {
const docs = await loader.loadAndSplit(textSplitter)
alldocs.push(...docs)
docs.push(...(await loader.loadAndSplit(textSplitter)))
} else {
const docs = await loader.load()
alldocs.push(...docs)
docs.push(...(await loader.load()))
}
}
}
if (metadata) {
const parsedMetadata = typeof metadata === 'object' ? metadata : JSON.parse(metadata)
let finaldocs = []
for (const doc of alldocs) {
const newdoc = {
...doc,
metadata: {
docs = docs.map((doc) => ({
...doc,
metadata: omit(
{
...doc.metadata,
...parsedMetadata
}
}
finaldocs.push(newdoc)
}
return finaldocs
},
omitMetadataKeys
)
}))
} else {
docs = docs.map((doc) => ({
...doc,
metadata: omit(
{
...doc.metadata
},
omitMetadataKeys
)
}))
}
return alldocs
return docs
}
}
@@ -0,0 +1,163 @@
import { ICommonObject, IDatabaseEntity, INode, INodeData, INodeOutputsValue, INodeParams } from '../../../src/Interface'
import { NodeVM } from 'vm2'
import { DataSource } from 'typeorm'
import { availableDependencies, defaultAllowBuiltInDep, getVars, handleEscapeCharacters, prepareSandboxVars } from '../../../src/utils'
/**
 * Node that executes a user-supplied JavaScript function inside a vm2 `NodeVM`
 * and returns the function's result as documents or as text.
 *
 * The function body comes from the `javascriptFunction` input. It can read:
 *  - `$input`  : the `input` argument passed to `init`
 *  - `$vars`   : the result of `prepareSandboxVars(getVars(...))`
 *  - `$flow`   : `{ chatflowId, sessionId, chatId, input }` taken from `options`
 *  - `$<name>` : one entry per key of the `functionInputVariables` input
 */
class CustomDocumentLoader_DocumentLoaders implements INode {
    label: string
    name: string
    version: number
    description: string
    type: string
    icon: string
    category: string
    badge: string
    baseClasses: string[]
    inputs: INodeParams[]
    outputs: INodeOutputsValue[]

    constructor() {
        this.label = 'Custom Document Loader'
        this.name = 'customDocumentLoader'
        this.version = 1.0
        this.type = 'Document'
        this.icon = 'customDocLoader.svg'
        this.category = 'Document Loaders'
        this.badge = 'NEW'
        this.description = `Custom function for loading documents`
        this.baseClasses = [this.type]
        this.inputs = [
            {
                label: 'Input Variables',
                name: 'functionInputVariables',
                description: 'Input variables can be used in the function with prefix $. For example: $var',
                type: 'json',
                optional: true,
                acceptVariable: true,
                list: true
            },
            {
                label: 'Javascript Function',
                name: 'javascriptFunction',
                type: 'code',
                description: `Must return an array of document objects containing metadata and pageContent if "Document" is selected in the output. If "Text" is selected in the output, it must return a string.`,
                // The placeholder is a runtime string; its content is reproduced verbatim.
                placeholder: `return [
{
pageContent: 'Document Content',
metadata: {
title: 'Document Title',
}
}
]`
            }
        ]
        this.outputs = [
            {
                label: 'Document',
                name: 'document',
                description: 'Array of document objects containing metadata and pageContent',
                baseClasses: [...this.baseClasses, 'json']
            },
            {
                label: 'Text',
                name: 'text',
                description: 'Concatenated string from pageContent of documents',
                baseClasses: ['string', 'json']
            }
        ]
    }

    /**
     * Runs the configured JavaScript function and returns its result.
     *
     * @param nodeData - node configuration; reads `inputs.javascriptFunction`,
     *   `inputs.functionInputVariables` and `outputs.output`
     * @param input - value exposed to the function as `$input` and `$flow.input`
     * @param options - runtime options; reads `appDataSource`, `databaseEntities`,
     *   `chatflowid`, `sessionId` and `chatId`
     * @returns the function's return value. When the selected output is `text` and the
     *   value is a string, it is first passed through `handleEscapeCharacters(value, false)`.
     * @throws Error when the input variables are not valid JSON, when the `document` output
     *   is selected and the first returned element lacks a non-empty string `pageContent`
     *   or an object `metadata`, or when the function itself throws.
     */
    async init(nodeData: INodeData, input: string, options: ICommonObject): Promise<any> {
        const output = nodeData.outputs?.output as string
        const javascriptFunction = nodeData.inputs?.javascriptFunction as string
        const functionInputVariablesRaw = nodeData.inputs?.functionInputVariables
        const appDataSource = options.appDataSource as DataSource
        const databaseEntities = options.databaseEntities as IDatabaseEntity

        const variables = await getVars(appDataSource, databaseEntities, nodeData)
        const flow = {
            chatflowId: options.chatflowid,
            sessionId: options.sessionId,
            chatId: options.chatId,
            input
        }

        let inputVars: ICommonObject = {}
        if (functionInputVariablesRaw) {
            try {
                inputVars =
                    typeof functionInputVariablesRaw === 'object' ? functionInputVariablesRaw : JSON.parse(functionInputVariablesRaw)
            } catch (exception) {
                throw new Error('Invalid JSON in the Custom Document Loader Input Variables: ' + exception)
            }
        }

        // Some values might be a stringified JSON object: parse those, keep everything else as-is.
        for (const key of Object.keys(inputVars)) {
            let value = inputVars[key]
            if (typeof value !== 'string') continue
            value = handleEscapeCharacters(value, true)
            if (value.startsWith('{') && value.endsWith('}')) {
                try {
                    value = JSON.parse(value)
                } catch (e) {
                    // Not valid JSON after all: deliberately keep the plain string.
                }
            }
            inputVars[key] = value
        }

        const sandbox: any = { $input: input }
        sandbox['$vars'] = prepareSandboxVars(variables)
        sandbox['$flow'] = flow
        // Each input variable is exposed to the function under a `$`-prefixed name.
        for (const item of Object.keys(inputVars)) {
            sandbox[`$${item}`] = inputVars[item]
        }

        const builtinDeps = process.env.TOOL_FUNCTION_BUILTIN_DEP
            ? defaultAllowBuiltInDep.concat(process.env.TOOL_FUNCTION_BUILTIN_DEP.split(','))
            : defaultAllowBuiltInDep
        const externalDeps = process.env.TOOL_FUNCTION_EXTERNAL_DEP ? process.env.TOOL_FUNCTION_EXTERNAL_DEP.split(',') : []
        const deps = availableDependencies.concat(externalDeps)

        const nodeVMOptions = {
            console: 'inherit',
            sandbox,
            require: {
                external: { modules: deps },
                builtin: builtinDeps
            }
        } as any

        const vm = new NodeVM(nodeVMOptions)
        try {
            // The user code becomes the body of an async function that is invoked immediately.
            const response = await vm.run(`module.exports = async function() {${javascriptFunction}}()`, __dirname)

            if (output === 'document' && Array.isArray(response)) {
                if (response.length === 0) return response
                // Only the first element is inspected as a shape sanity check.
                const first = response[0]
                const looksLikeDocument =
                    first.pageContent && typeof first.pageContent === 'string' && first.metadata && typeof first.metadata === 'object'
                if (looksLikeDocument) return response
                throw new Error('Document object must contain pageContent and metadata')
            }

            if (output === 'text' && typeof response === 'string') {
                return handleEscapeCharacters(response, false)
            }

            return response
        } catch (e) {
            // Re-throw real Error instances untouched so their message and stack survive;
            // anything else is wrapped exactly as `new Error(e)` would stringify it.
            throw e instanceof Error ? e : new Error(String(e))
        }
    }
}
module.exports = { nodeClass: CustomDocumentLoader_DocumentLoaders }
@@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="icon icon-tabler icons-tabler-outline icon-tabler-writing"><path stroke="none" d="M0 0h24v24H0z" fill="none"/><path d="M20 17v-12c0 -1.121 -.879 -2 -2 -2s-2 .879 -2 2v12l2 2l2 -2z" /><path d="M16 7h4" /><path d="M18 19h-13a2 2 0 1 1 0 -4h4a2 2 0 1 0 0 -4h-3" /></svg>

After

Width:  |  Height:  |  Size: 465 B

@@ -0,0 +1,95 @@
import { ICommonObject, IDatabaseEntity, INode, INodeData, INodeOptionsValue, INodeOutputsValue, INodeParams } from '../../../src/Interface'
import { DataSource } from 'typeorm'
import { Document } from '@langchain/core/documents'
/**
 * Node that loads the stored chunks of a pre-configured document store and
 * returns them as `Document` objects.
 */
class DocStore_DocumentLoaders implements INode {
    label: string
    name: string
    version: number
    description: string
    type: string
    icon: string
    category: string
    baseClasses: string[]
    inputs: INodeParams[]
    outputs: INodeOutputsValue[]
    badge: string

    constructor() {
        this.label = 'Document Store'
        this.name = 'documentStore'
        this.version = 1.0
        this.type = 'Document'
        this.icon = 'dstore.svg'
        this.badge = 'NEW'
        this.category = 'Document Loaders'
        this.description = `Load data from pre-configured document stores`
        this.baseClasses = [this.type]
        this.inputs = [
            {
                label: 'Select Store',
                name: 'selectedStore',
                type: 'asyncOptions',
                loadMethod: 'listStores'
            }
        ]
        this.outputs = [
            {
                label: 'Document',
                name: 'document',
                description: 'Array of document objects containing metadata and pageContent',
                baseClasses: [...this.baseClasses, 'json']
            },
            {
                label: 'Text',
                name: 'text',
                description: 'Concatenated string from pageContent of documents',
                baseClasses: ['string', 'json']
            }
        ]
    }

    //@ts-ignore
    loadMethods = {
        /**
         * Lists the document stores offered in the "Select Store" dropdown.
         * Only stores whose `status` is `'SYNC'` are returned.
         *
         * @param _ - unused node data
         * @param options - runtime options; reads `appDataSource` and `databaseEntities`
         * @returns one option per matching store (`name` = store id, `label` = store name),
         *   or an empty list when no data source is available
         */
        async listStores(_: INodeData, options: ICommonObject): Promise<INodeOptionsValue[]> {
            const appDataSource = options.appDataSource as DataSource
            const databaseEntities = options.databaseEntities as IDatabaseEntity

            if (!appDataSource) {
                return []
            }

            const stores = await appDataSource.getRepository(databaseEntities['DocumentStore']).find()
            return stores
                .filter((store) => store.status === 'SYNC')
                .map(
                    (store): INodeOptionsValue => ({
                        name: store.id,
                        label: store.name,
                        description: store.description
                    })
                )
        }
    }

    /**
     * Loads every chunk of the selected store and converts it into a `Document`.
     *
     * @param nodeData - node configuration; reads `inputs.selectedStore` (the store id)
     * @param _ - unused input string
     * @param options - runtime options; reads `appDataSource` and `databaseEntities`
     * @returns an array of `Document` objects, one per stored chunk
     *
     * NOTE(review): the declared `text` output is not handled here; the document array is
     * returned regardless of the selected output. Confirm whether that is intended.
     * NOTE(review): no explicit ordering is requested from the repository, so chunk order
     * depends on the database. Confirm whether an `order` clause is wanted.
     */
    async init(nodeData: INodeData, _: string, options: ICommonObject): Promise<any> {
        const selectedStore = nodeData.inputs?.selectedStore as string
        const appDataSource = options.appDataSource as DataSource
        const databaseEntities = options.databaseEntities as IDatabaseEntity

        const chunks = await appDataSource
            .getRepository(databaseEntities['DocumentStoreFileChunk'])
            .find({ where: { storeId: selectedStore } })

        return chunks.map(
            (chunk) =>
                new Document({
                    pageContent: chunk.pageContent,
                    // Metadata is stored as a JSON string; an empty or missing value becomes an
                    // empty object instead of making JSON.parse throw and abort the whole load.
                    metadata: chunk.metadata ? JSON.parse(chunk.metadata) : {}
                })
        )
    }
}
module.exports = { nodeClass: DocStore_DocumentLoaders }
@@ -0,0 +1,15 @@
<svg
xmlns="http://www.w3.org/2000/svg"
width="24"
height="24"
viewBox="0 0 24 24"
fill="none"
stroke="currentColor"
stroke-width="2"
stroke-linecap="round"
stroke-linejoin="round"
>
<path d="M12 4l-8 4l8 4l8 -4l-8 -4" />
<path d="M4 12l8 4l8 -4" />
<path d="M4 16l8 4l8 -4" />
</svg>

After

Width:  |  Height:  |  Size: 305 B

@@ -1,4 +1,5 @@
import { ICommonObject, INode, INodeData, INodeParams } from '../../../src/Interface'
import { omit } from 'lodash'
import { ICommonObject, IDocument, INode, INodeData, INodeParams } from '../../../src/Interface'
import { TextSplitter } from 'langchain/text_splitter'
import { DocxLoader } from 'langchain/document_loaders/fs/docx'
import { getFileFromStorage } from '../../../src'
@@ -37,9 +38,21 @@ class Docx_DocumentLoaders implements INode {
optional: true
},
{
label: 'Metadata',
label: 'Additional Metadata',
name: 'metadata',
type: 'json',
description: 'Additional metadata to be added to the extracted documents',
optional: true,
additionalParams: true
},
{
label: 'Omit Metadata Keys',
name: 'omitMetadataKeys',
type: 'string',
rows: 4,
description:
'Each document loader comes with a default set of metadata keys that are extracted from the document. You can use this field to omit some of the default metadata keys. The value should be a list of keys, seperated by comma',
placeholder: 'key1, key2, key3.nestedKey1',
optional: true,
additionalParams: true
}
@@ -50,8 +63,14 @@ class Docx_DocumentLoaders implements INode {
const textSplitter = nodeData.inputs?.textSplitter as TextSplitter
const docxFileBase64 = nodeData.inputs?.docxFile as string
const metadata = nodeData.inputs?.metadata
const _omitMetadataKeys = nodeData.inputs?.omitMetadataKeys as string
let alldocs = []
let omitMetadataKeys: string[] = []
if (_omitMetadataKeys) {
omitMetadataKeys = _omitMetadataKeys.split(',').map((key) => key.trim())
}
let docs: IDocument[] = []
let files: string[] = []
if (docxFileBase64.startsWith('FILE-STORAGE::')) {
@@ -69,11 +88,9 @@ class Docx_DocumentLoaders implements INode {
const loader = new DocxLoader(blob)
if (textSplitter) {
const docs = await loader.loadAndSplit(textSplitter)
alldocs.push(...docs)
docs.push(...(await loader.loadAndSplit(textSplitter)))
} else {
const docs = await loader.load()
alldocs.push(...docs)
docs.push(...(await loader.load()))
}
}
} else {
@@ -91,32 +108,38 @@ class Docx_DocumentLoaders implements INode {
const loader = new DocxLoader(blob)
if (textSplitter) {
const docs = await loader.loadAndSplit(textSplitter)
alldocs.push(...docs)
docs.push(...(await loader.loadAndSplit(textSplitter)))
} else {
const docs = await loader.load()
alldocs.push(...docs)
docs.push(...(await loader.load()))
}
}
}
if (metadata) {
const parsedMetadata = typeof metadata === 'object' ? metadata : JSON.parse(metadata)
let finaldocs = []
for (const doc of alldocs) {
const newdoc = {
...doc,
metadata: {
docs = docs.map((doc) => ({
...doc,
metadata: omit(
{
...doc.metadata,
...parsedMetadata
}
}
finaldocs.push(newdoc)
}
return finaldocs
},
omitMetadataKeys
)
}))
} else {
docs = docs.map((doc) => ({
...doc,
metadata: omit(
{
...doc.metadata
},
omitMetadataKeys
)
}))
}
return alldocs
return docs
}
}
@@ -1,3 +1,4 @@
import { omit } from 'lodash'
import { getCredentialData, getCredentialParam } from '../../../src'
import { ICommonObject, INode, INodeData, INodeParams } from '../../../src/Interface'
import { FigmaFileLoader, FigmaLoaderParams } from 'langchain/document_loaders/web/figma'
@@ -60,9 +61,21 @@ class Figma_DocumentLoaders implements INode {
optional: true
},
{
label: 'Metadata',
label: 'Additional Metadata',
name: 'metadata',
type: 'json',
description: 'Additional metadata to be added to the extracted documents',
optional: true,
additionalParams: true
},
{
label: 'Omit Metadata Keys',
name: 'omitMetadataKeys',
type: 'string',
rows: 4,
description:
'Each document loader comes with a default set of metadata keys that are extracted from the document. You can use this field to omit some of the default metadata keys. The value should be a list of keys, seperated by comma',
placeholder: 'key1, key2, key3.nestedKey1',
optional: true,
additionalParams: true
}
@@ -74,6 +87,12 @@ class Figma_DocumentLoaders implements INode {
const fileKey = nodeData.inputs?.fileKey as string
const textSplitter = nodeData.inputs?.textSplitter as TextSplitter
const metadata = nodeData.inputs?.metadata
const _omitMetadataKeys = nodeData.inputs?.omitMetadataKeys as string
let omitMetadataKeys: string[] = []
if (_omitMetadataKeys) {
omitMetadataKeys = _omitMetadataKeys.split(',').map((key) => key.trim())
}
const credentialData = await getCredentialData(nodeData.credential ?? '', options)
const accessToken = getCredentialParam('accessToken', credentialData, nodeData)
@@ -86,19 +105,30 @@ class Figma_DocumentLoaders implements INode {
const loader = new FigmaFileLoader(figmaOptions)
const docs = textSplitter ? await loader.loadAndSplit() : await loader.load()
// Pass the configured splitter through; calling loadAndSplit() without it ignores the user's choice.
let docs = textSplitter ? await loader.loadAndSplit(textSplitter) : await loader.load()
if (metadata) {
const parsedMetadata = typeof metadata === 'object' ? metadata : JSON.parse(metadata)
return docs.map((doc) => {
return {
...doc,
metadata: {
docs = docs.map((doc) => ({
...doc,
metadata: omit(
{
...doc.metadata,
...parsedMetadata
}
}
})
},
omitMetadataKeys
)
}))
} else {
docs = docs.map((doc) => ({
...doc,
metadata: omit(
{
...doc.metadata
},
omitMetadataKeys
)
}))
}
return docs
@@ -1,3 +1,4 @@
import { omit } from 'lodash'
import { INode, INodeData, INodeParams } from '../../../src/Interface'
import { TextSplitter } from 'langchain/text_splitter'
import { TextLoader } from 'langchain/document_loaders/fs/text'
@@ -65,9 +66,21 @@ class Folder_DocumentLoaders implements INode {
additionalParams: true
},
{
label: 'Metadata',
label: 'Additional Metadata',
name: 'metadata',
type: 'json',
description: 'Additional metadata to be added to the extracted documents',
optional: true,
additionalParams: true
},
{
label: 'Omit Metadata Keys',
name: 'omitMetadataKeys',
type: 'string',
rows: 4,
description:
'Each document loader comes with a default set of metadata keys that are extracted from the document. You can use this field to omit some of the default metadata keys. The value should be a list of keys, seperated by comma',
placeholder: 'key1, key2, key3.nestedKey1',
optional: true,
additionalParams: true
}
@@ -80,6 +93,12 @@ class Folder_DocumentLoaders implements INode {
const metadata = nodeData.inputs?.metadata
const recursive = nodeData.inputs?.recursive as boolean
const pdfUsage = nodeData.inputs?.pdfUsage
const _omitMetadataKeys = nodeData.inputs?.omitMetadataKeys as string
let omitMetadataKeys: string[] = []
if (_omitMetadataKeys) {
omitMetadataKeys = _omitMetadataKeys.split(',').map((key) => key.trim())
}
const loader = new DirectoryLoader(
folderPath,
@@ -141,18 +160,26 @@ class Folder_DocumentLoaders implements INode {
if (metadata) {
const parsedMetadata = typeof metadata === 'object' ? metadata : JSON.parse(metadata)
let finaldocs = []
for (const doc of docs) {
const newdoc = {
...doc,
metadata: {
docs = docs.map((doc) => ({
...doc,
metadata: omit(
{
...doc.metadata,
...parsedMetadata
}
}
finaldocs.push(newdoc)
}
return finaldocs
},
omitMetadataKeys
)
}))
} else {
docs = docs.map((doc) => ({
...doc,
metadata: omit(
{
...doc.metadata
},
omitMetadataKeys
)
}))
}
return docs
@@ -1,3 +1,4 @@
import { omit } from 'lodash'
import { INode, INodeData, INodeParams } from '../../../src/Interface'
import { TextSplitter } from 'langchain/text_splitter'
import { GitbookLoader } from 'langchain/document_loaders/web/gitbook'
@@ -44,9 +45,21 @@ class Gitbook_DocumentLoaders implements INode {
optional: true
},
{
label: 'Metadata',
label: 'Additional Metadata',
name: 'metadata',
type: 'json',
description: 'Additional metadata to be added to the extracted documents',
optional: true,
additionalParams: true
},
{
label: 'Omit Metadata Keys',
name: 'omitMetadataKeys',
type: 'string',
rows: 4,
description:
'Each document loader comes with a default set of metadata keys that are extracted from the document. You can use this field to omit some of the default metadata keys. The value should be a list of keys, seperated by comma',
placeholder: 'key1, key2, key3.nestedKey1',
optional: true,
additionalParams: true
}
@@ -57,22 +70,39 @@ class Gitbook_DocumentLoaders implements INode {
const shouldLoadAllPaths = nodeData.inputs?.shouldLoadAllPaths as boolean
const textSplitter = nodeData.inputs?.textSplitter as TextSplitter
const metadata = nodeData.inputs?.metadata
const _omitMetadataKeys = nodeData.inputs?.omitMetadataKeys as string
let omitMetadataKeys: string[] = []
if (_omitMetadataKeys) {
omitMetadataKeys = _omitMetadataKeys.split(',').map((key) => key.trim())
}
const loader = shouldLoadAllPaths ? new GitbookLoader(webPath, { shouldLoadAllPaths }) : new GitbookLoader(webPath)
const docs = textSplitter ? await loader.loadAndSplit() : await loader.load()
let docs = textSplitter ? await loader.loadAndSplit() : await loader.load()
if (metadata) {
const parsedMetadata = typeof metadata === 'object' ? metadata : JSON.parse(metadata)
return docs.map((doc) => {
return {
...doc,
metadata: {
docs = docs.map((doc) => ({
...doc,
metadata: omit(
{
...doc.metadata,
...parsedMetadata
}
}
})
},
omitMetadataKeys
)
}))
} else {
docs = docs.map((doc) => ({
...doc,
metadata: omit(
{
...doc.metadata
},
omitMetadataKeys
)
}))
}
return docs
@@ -1,3 +1,4 @@
import { omit } from 'lodash'
import { ICommonObject, INode, INodeData, INodeParams } from '../../../src/Interface'
import { TextSplitter } from 'langchain/text_splitter'
import { GithubRepoLoader, GithubRepoLoaderParams } from 'langchain/document_loaders/web/github'
@@ -86,9 +87,21 @@ class Github_DocumentLoaders implements INode {
optional: true
},
{
label: 'Metadata',
label: 'Additional Metadata',
name: 'metadata',
type: 'json',
description: 'Additional metadata to be added to the extracted documents',
optional: true,
additionalParams: true
},
{
label: 'Omit Metadata Keys',
name: 'omitMetadataKeys',
type: 'string',
rows: 4,
description:
'Each document loader comes with a default set of metadata keys that are extracted from the document. You can use this field to omit some of the default metadata keys. The value should be a list of keys, seperated by comma',
placeholder: 'key1, key2, key3.nestedKey1',
optional: true,
additionalParams: true
}
@@ -104,6 +117,12 @@ class Github_DocumentLoaders implements INode {
const maxConcurrency = nodeData.inputs?.maxConcurrency as string
const maxRetries = nodeData.inputs?.maxRetries as string
const ignorePath = nodeData.inputs?.ignorePath as string
const _omitMetadataKeys = nodeData.inputs?.omitMetadataKeys as string
let omitMetadataKeys: string[] = []
if (_omitMetadataKeys) {
omitMetadataKeys = _omitMetadataKeys.split(',').map((key) => key.trim())
}
const credentialData = await getCredentialData(nodeData.credential ?? '', options)
const accessToken = getCredentialParam('accessToken', credentialData, nodeData)
@@ -120,19 +139,30 @@ class Github_DocumentLoaders implements INode {
if (ignorePath) githubOptions.ignorePaths = JSON.parse(ignorePath)
const loader = new GithubRepoLoader(repoLink, githubOptions)
const docs = textSplitter ? await loader.loadAndSplit(textSplitter) : await loader.load()
let docs = textSplitter ? await loader.loadAndSplit(textSplitter) : await loader.load()
if (metadata) {
const parsedMetadata = typeof metadata === 'object' ? metadata : JSON.parse(metadata)
return docs.map((doc) => {
return {
...doc,
metadata: {
docs = docs.map((doc) => ({
...doc,
metadata: omit(
{
...doc.metadata,
...parsedMetadata
}
}
})
},
omitMetadataKeys
)
}))
} else {
docs = docs.map((doc) => ({
...doc,
metadata: omit(
{
...doc.metadata
},
omitMetadataKeys
)
}))
}
return docs
@@ -1,4 +1,5 @@
import { ICommonObject, INode, INodeData, INodeParams } from '../../../src/Interface'
import { omit } from 'lodash'
import { ICommonObject, IDocument, INode, INodeData, INodeParams } from '../../../src/Interface'
import { TextSplitter } from 'langchain/text_splitter'
import { JSONLoader } from 'langchain/document_loaders/fs/json'
import { getFileFromStorage } from '../../../src'
@@ -45,9 +46,21 @@ class Json_DocumentLoaders implements INode {
optional: true
},
{
label: 'Metadata',
label: 'Additional Metadata',
name: 'metadata',
type: 'json',
description: 'Additional metadata to be added to the extracted documents',
optional: true,
additionalParams: true
},
{
label: 'Omit Metadata Keys',
name: 'omitMetadataKeys',
type: 'string',
rows: 4,
description:
'Each document loader comes with a default set of metadata keys that are extracted from the document. You can use this field to omit some of the default metadata keys. The value should be a list of keys, seperated by comma',
placeholder: 'key1, key2, key3.nestedKey1',
optional: true,
additionalParams: true
}
@@ -59,6 +72,12 @@ class Json_DocumentLoaders implements INode {
const jsonFileBase64 = nodeData.inputs?.jsonFile as string
const pointersName = nodeData.inputs?.pointersName as string
const metadata = nodeData.inputs?.metadata
const _omitMetadataKeys = nodeData.inputs?.omitMetadataKeys as string
let omitMetadataKeys: string[] = []
if (_omitMetadataKeys) {
omitMetadataKeys = _omitMetadataKeys.split(',').map((key) => key.trim())
}
let pointers: string[] = []
if (pointersName) {
@@ -66,7 +85,7 @@ class Json_DocumentLoaders implements INode {
pointers = outputString.split(',').map((pointer) => '/' + pointer.trim())
}
let alldocs = []
let docs: IDocument[] = []
let files: string[] = []
//FILE-STORAGE::["CONTRIBUTING.md","LICENSE.md","README.md"]
@@ -85,11 +104,9 @@ class Json_DocumentLoaders implements INode {
const loader = new JSONLoader(blob, pointers.length != 0 ? pointers : undefined)
if (textSplitter) {
const docs = await loader.loadAndSplit(textSplitter)
alldocs.push(...docs)
docs.push(...(await loader.loadAndSplit(textSplitter)))
} else {
const docs = await loader.load()
alldocs.push(...docs)
docs.push(...(await loader.load()))
}
}
} else {
@@ -107,32 +124,38 @@ class Json_DocumentLoaders implements INode {
const loader = new JSONLoader(blob, pointers.length != 0 ? pointers : undefined)
if (textSplitter) {
const docs = await loader.loadAndSplit(textSplitter)
alldocs.push(...docs)
docs.push(...(await loader.loadAndSplit(textSplitter)))
} else {
const docs = await loader.load()
alldocs.push(...docs)
docs.push(...(await loader.load()))
}
}
}
if (metadata) {
const parsedMetadata = typeof metadata === 'object' ? metadata : JSON.parse(metadata)
let finaldocs = []
for (const doc of alldocs) {
const newdoc = {
...doc,
metadata: {
docs = docs.map((doc) => ({
...doc,
metadata: omit(
{
...doc.metadata,
...parsedMetadata
}
}
finaldocs.push(newdoc)
}
return finaldocs
},
omitMetadataKeys
)
}))
} else {
docs = docs.map((doc) => ({
...doc,
metadata: omit(
{
...doc.metadata
},
omitMetadataKeys
)
}))
}
return alldocs
return docs
}
}
@@ -1,4 +1,5 @@
import { ICommonObject, INode, INodeData, INodeParams } from '../../../src/Interface'
import { omit } from 'lodash'
import { ICommonObject, IDocument, INode, INodeData, INodeParams } from '../../../src/Interface'
import { TextSplitter } from 'langchain/text_splitter'
import { JSONLinesLoader } from 'langchain/document_loaders/fs/json'
import { getFileFromStorage } from '../../../src'
@@ -44,9 +45,21 @@ class Jsonlines_DocumentLoaders implements INode {
optional: false
},
{
label: 'Metadata',
label: 'Additional Metadata',
name: 'metadata',
type: 'json',
description: 'Additional metadata to be added to the extracted documents',
optional: true,
additionalParams: true
},
{
label: 'Omit Metadata Keys',
name: 'omitMetadataKeys',
type: 'string',
rows: 4,
description:
'Each document loader comes with a default set of metadata keys that are extracted from the document. You can use this field to omit some of the default metadata keys. The value should be a list of keys, seperated by comma',
placeholder: 'key1, key2, key3.nestedKey1',
optional: true,
additionalParams: true
}
@@ -58,8 +71,14 @@ class Jsonlines_DocumentLoaders implements INode {
const jsonLinesFileBase64 = nodeData.inputs?.jsonlinesFile as string
const pointerName = nodeData.inputs?.pointerName as string
const metadata = nodeData.inputs?.metadata
const _omitMetadataKeys = nodeData.inputs?.omitMetadataKeys as string
let alldocs = []
let omitMetadataKeys: string[] = []
if (_omitMetadataKeys) {
omitMetadataKeys = _omitMetadataKeys.split(',').map((key) => key.trim())
}
let docs: IDocument[] = []
let files: string[] = []
let pointer = '/' + pointerName.trim()
@@ -79,11 +98,9 @@ class Jsonlines_DocumentLoaders implements INode {
const loader = new JSONLinesLoader(blob, pointer)
if (textSplitter) {
const docs = await loader.loadAndSplit(textSplitter)
alldocs.push(...docs)
docs.push(...(await loader.loadAndSplit(textSplitter)))
} else {
const docs = await loader.load()
alldocs.push(...docs)
docs.push(...(await loader.load()))
}
}
} else {
@@ -101,32 +118,38 @@ class Jsonlines_DocumentLoaders implements INode {
const loader = new JSONLinesLoader(blob, pointer)
if (textSplitter) {
const docs = await loader.loadAndSplit(textSplitter)
alldocs.push(...docs)
docs.push(...(await loader.loadAndSplit(textSplitter)))
} else {
const docs = await loader.load()
alldocs.push(...docs)
docs.push(...(await loader.load()))
}
}
}
if (metadata) {
const parsedMetadata = typeof metadata === 'object' ? metadata : JSON.parse(metadata)
let finaldocs = []
for (const doc of alldocs) {
const newdoc = {
...doc,
metadata: {
docs = docs.map((doc) => ({
...doc,
metadata: omit(
{
...doc.metadata,
...parsedMetadata
}
}
finaldocs.push(newdoc)
}
return finaldocs
},
omitMetadataKeys
)
}))
} else {
docs = docs.map((doc) => ({
...doc,
metadata: omit(
{
...doc.metadata
},
omitMetadataKeys
)
}))
}
return alldocs
return docs
}
}
@@ -1,4 +1,5 @@
import { ICommonObject, INode, INodeData, INodeParams } from '../../../src/Interface'
import { omit } from 'lodash'
import { ICommonObject, IDocument, INode, INodeData, INodeParams } from '../../../src/Interface'
import { TextSplitter } from 'langchain/text_splitter'
import { NotionAPILoader, NotionAPILoaderOptions } from 'langchain/document_loaders/web/notionapi'
import { getCredentialData, getCredentialParam } from '../../../src'
@@ -44,9 +45,21 @@ class NotionDB_DocumentLoaders implements INode {
description: 'If your URL looks like - https://www.notion.so/abcdefh?v=long_hash_2, then abcdefh is the database ID'
},
{
label: 'Metadata',
label: 'Additional Metadata',
name: 'metadata',
type: 'json',
description: 'Additional metadata to be added to the extracted documents',
optional: true,
additionalParams: true
},
{
label: 'Omit Metadata Keys',
name: 'omitMetadataKeys',
type: 'string',
rows: 4,
description:
'Each document loader comes with a default set of metadata keys that are extracted from the document. You can use this field to omit some of the default metadata keys. The value should be a list of keys, seperated by comma',
placeholder: 'key1, key2, key3.nestedKey1',
optional: true,
additionalParams: true
}
@@ -57,6 +70,12 @@ class NotionDB_DocumentLoaders implements INode {
const textSplitter = nodeData.inputs?.textSplitter as TextSplitter
const databaseId = nodeData.inputs?.databaseId as string
const metadata = nodeData.inputs?.metadata
const _omitMetadataKeys = nodeData.inputs?.omitMetadataKeys as string
let omitMetadataKeys: string[] = []
if (_omitMetadataKeys) {
omitMetadataKeys = _omitMetadataKeys.split(',').map((key) => key.trim())
}
const credentialData = await getCredentialData(nodeData.credential ?? '', options)
const notionIntegrationToken = getCredentialParam('notionIntegrationToken', credentialData, nodeData)
@@ -74,7 +93,7 @@ class NotionDB_DocumentLoaders implements INode {
}
const loader = new NotionAPILoader(obj)
let docs = []
let docs: IDocument[] = []
if (textSplitter) {
docs = await loader.loadAndSplit(textSplitter)
} else {
@@ -83,18 +102,26 @@ class NotionDB_DocumentLoaders implements INode {
if (metadata) {
const parsedMetadata = typeof metadata === 'object' ? metadata : JSON.parse(metadata)
let finaldocs = []
for (const doc of docs) {
const newdoc = {
...doc,
metadata: {
docs = docs.map((doc) => ({
...doc,
metadata: omit(
{
...doc.metadata,
...parsedMetadata
}
}
finaldocs.push(newdoc)
}
return finaldocs
},
omitMetadataKeys
)
}))
} else {
docs = docs.map((doc) => ({
...doc,
metadata: omit(
{
...doc.metadata
},
omitMetadataKeys
)
}))
}
return docs
@@ -1,4 +1,5 @@
import { INode, INodeData, INodeParams } from '../../../src/Interface'
import { omit } from 'lodash'
import { IDocument, INode, INodeData, INodeParams } from '../../../src/Interface'
import { TextSplitter } from 'langchain/text_splitter'
import { NotionLoader } from 'langchain/document_loaders/fs/notion'
@@ -37,9 +38,21 @@ class NotionFolder_DocumentLoaders implements INode {
optional: true
},
{
label: 'Metadata',
label: 'Additional Metadata',
name: 'metadata',
type: 'json',
description: 'Additional metadata to be added to the extracted documents',
optional: true,
additionalParams: true
},
{
label: 'Omit Metadata Keys',
name: 'omitMetadataKeys',
type: 'string',
rows: 4,
description:
'Each document loader comes with a default set of metadata keys that are extracted from the document. You can use this field to omit some of the default metadata keys. The value should be a list of keys, seperated by comma',
placeholder: 'key1, key2, key3.nestedKey1',
optional: true,
additionalParams: true
}
@@ -50,9 +63,15 @@ class NotionFolder_DocumentLoaders implements INode {
const textSplitter = nodeData.inputs?.textSplitter as TextSplitter
const notionFolder = nodeData.inputs?.notionFolder as string
const metadata = nodeData.inputs?.metadata
const _omitMetadataKeys = nodeData.inputs?.omitMetadataKeys as string
let omitMetadataKeys: string[] = []
if (_omitMetadataKeys) {
omitMetadataKeys = _omitMetadataKeys.split(',').map((key) => key.trim())
}
const loader = new NotionLoader(notionFolder)
let docs = []
let docs: IDocument[] = []
if (textSplitter) {
docs = await loader.loadAndSplit(textSplitter)
@@ -62,18 +81,26 @@ class NotionFolder_DocumentLoaders implements INode {
if (metadata) {
const parsedMetadata = typeof metadata === 'object' ? metadata : JSON.parse(metadata)
let finaldocs = []
for (const doc of docs) {
const newdoc = {
...doc,
metadata: {
docs = docs.map((doc) => ({
...doc,
metadata: omit(
{
...doc.metadata,
...parsedMetadata
}
}
finaldocs.push(newdoc)
}
return finaldocs
},
omitMetadataKeys
)
}))
} else {
docs = docs.map((doc) => ({
...doc,
metadata: omit(
{
...doc.metadata
},
omitMetadataKeys
)
}))
}
return docs
@@ -1,4 +1,5 @@
import { ICommonObject, INode, INodeData, INodeParams } from '../../../src/Interface'
import { omit } from 'lodash'
import { ICommonObject, IDocument, INode, INodeData, INodeParams } from '../../../src/Interface'
import { TextSplitter } from 'langchain/text_splitter'
import { NotionAPILoader, NotionAPILoaderOptions } from 'langchain/document_loaders/web/notionapi'
import { getCredentialData, getCredentialParam } from '../../../src'
@@ -45,9 +46,21 @@ class NotionPage_DocumentLoaders implements INode {
'The last The 32 char hex in the url path. For example: https://www.notion.so/skarard/LangChain-Notion-API-b34ca03f219c4420a6046fc4bdfdf7b4, b34ca03f219c4420a6046fc4bdfdf7b4 is the Page ID'
},
{
label: 'Metadata',
label: 'Additional Metadata',
name: 'metadata',
type: 'json',
description: 'Additional metadata to be added to the extracted documents',
optional: true,
additionalParams: true
},
{
label: 'Omit Metadata Keys',
name: 'omitMetadataKeys',
type: 'string',
rows: 4,
description:
'Each document loader comes with a default set of metadata keys that are extracted from the document. You can use this field to omit some of the default metadata keys. The value should be a list of keys, seperated by comma',
placeholder: 'key1, key2, key3.nestedKey1',
optional: true,
additionalParams: true
}
@@ -58,6 +71,12 @@ class NotionPage_DocumentLoaders implements INode {
const textSplitter = nodeData.inputs?.textSplitter as TextSplitter
const pageId = nodeData.inputs?.pageId as string
const metadata = nodeData.inputs?.metadata
const _omitMetadataKeys = nodeData.inputs?.omitMetadataKeys as string
let omitMetadataKeys: string[] = []
if (_omitMetadataKeys) {
omitMetadataKeys = _omitMetadataKeys.split(',').map((key) => key.trim())
}
const credentialData = await getCredentialData(nodeData.credential ?? '', options)
const notionIntegrationToken = getCredentialParam('notionIntegrationToken', credentialData, nodeData)
@@ -71,7 +90,7 @@ class NotionPage_DocumentLoaders implements INode {
}
const loader = new NotionAPILoader(obj)
let docs = []
let docs: IDocument[] = []
if (textSplitter) {
docs = await loader.loadAndSplit(textSplitter)
} else {
@@ -80,18 +99,26 @@ class NotionPage_DocumentLoaders implements INode {
if (metadata) {
const parsedMetadata = typeof metadata === 'object' ? metadata : JSON.parse(metadata)
let finaldocs = []
for (const doc of docs) {
const newdoc = {
...doc,
metadata: {
docs = docs.map((doc) => ({
...doc,
metadata: omit(
{
...doc.metadata,
...parsedMetadata
}
}
finaldocs.push(newdoc)
}
return finaldocs
},
omitMetadataKeys
)
}))
} else {
docs = docs.map((doc) => ({
...doc,
metadata: omit(
{
...doc.metadata
},
omitMetadataKeys
)
}))
}
return docs
@@ -1,4 +1,5 @@
import { ICommonObject, INode, INodeData, INodeParams } from '../../../src/Interface'
import { omit } from 'lodash'
import { IDocument, ICommonObject, INode, INodeData, INodeParams } from '../../../src/Interface'
import { TextSplitter } from 'langchain/text_splitter'
import { PDFLoader } from 'langchain/document_loaders/fs/pdf'
import { getFileFromStorage } from '../../../src'
@@ -60,9 +61,21 @@ class Pdf_DocumentLoaders implements INode {
additionalParams: true
},
{
label: 'Metadata',
label: 'Additional Metadata',
name: 'metadata',
type: 'json',
description: 'Additional metadata to be added to the extracted documents',
optional: true,
additionalParams: true
},
{
label: 'Omit Metadata Keys',
name: 'omitMetadataKeys',
type: 'string',
rows: 4,
description:
'Each document loader comes with a default set of metadata keys that are extracted from the document. You can use this field to omit some of the default metadata keys. The value should be a list of keys, seperated by comma',
placeholder: 'key1, key2, key3.nestedKey1',
optional: true,
additionalParams: true
}
@@ -75,8 +88,14 @@ class Pdf_DocumentLoaders implements INode {
const usage = nodeData.inputs?.usage as string
const metadata = nodeData.inputs?.metadata
const legacyBuild = nodeData.inputs?.legacyBuild as boolean
const _omitMetadataKeys = nodeData.inputs?.omitMetadataKeys as string
let alldocs: any[] = []
let omitMetadataKeys: string[] = []
if (_omitMetadataKeys) {
omitMetadataKeys = _omitMetadataKeys.split(',').map((key) => key.trim())
}
let docs: IDocument[] = []
let files: string[] = []
//FILE-STORAGE::["CONTRIBUTING.md","LICENSE.md","README.md"]
@@ -92,7 +111,7 @@ class Pdf_DocumentLoaders implements INode {
for (const file of files) {
const fileData = await getFileFromStorage(file, chatflowid)
const bf = Buffer.from(fileData)
await this.extractDocs(usage, bf, legacyBuild, textSplitter, alldocs)
await this.extractDocs(usage, bf, legacyBuild, textSplitter, docs)
}
} else {
if (pdfFileBase64.startsWith('[') && pdfFileBase64.endsWith(']')) {
@@ -105,30 +124,38 @@ class Pdf_DocumentLoaders implements INode {
const splitDataURI = file.split(',')
splitDataURI.pop()
const bf = Buffer.from(splitDataURI.pop() || '', 'base64')
await this.extractDocs(usage, bf, legacyBuild, textSplitter, alldocs)
await this.extractDocs(usage, bf, legacyBuild, textSplitter, docs)
}
}
if (metadata) {
const parsedMetadata = typeof metadata === 'object' ? metadata : JSON.parse(metadata)
let finaldocs = []
for (const doc of alldocs) {
const newdoc = {
...doc,
metadata: {
docs = docs.map((doc) => ({
...doc,
metadata: omit(
{
...doc.metadata,
...parsedMetadata
}
}
finaldocs.push(newdoc)
}
return finaldocs
},
omitMetadataKeys
)
}))
} else {
docs = docs.map((doc) => ({
...doc,
metadata: omit(
{
...doc.metadata
},
omitMetadataKeys
)
}))
}
return alldocs
return docs
}
private async extractDocs(usage: string, bf: Buffer, legacyBuild: boolean, textSplitter: TextSplitter, alldocs: any[]) {
private async extractDocs(usage: string, bf: Buffer, legacyBuild: boolean, textSplitter: TextSplitter, docs: IDocument[]) {
if (usage === 'perFile') {
const loader = new PDFLoader(new Blob([bf]), {
splitPages: false,
@@ -137,11 +164,9 @@ class Pdf_DocumentLoaders implements INode {
legacyBuild ? import('pdfjs-dist/legacy/build/pdf.js') : import('pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js')
})
if (textSplitter) {
const docs = await loader.loadAndSplit(textSplitter)
alldocs.push(...docs)
docs.push(...(await loader.loadAndSplit(textSplitter)))
} else {
const docs = await loader.load()
alldocs.push(...docs)
docs.push(...(await loader.load()))
}
} else {
const loader = new PDFLoader(new Blob([bf]), {
@@ -150,11 +175,9 @@ class Pdf_DocumentLoaders implements INode {
legacyBuild ? import('pdfjs-dist/legacy/build/pdf.js') : import('pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js')
})
if (textSplitter) {
const docs = await loader.loadAndSplit(textSplitter)
alldocs.push(...docs)
docs.push(...(await loader.loadAndSplit(textSplitter)))
} else {
const docs = await loader.load()
alldocs.push(...docs)
docs.push(...(await loader.load()))
}
}
}
@@ -1,4 +1,5 @@
import { INode, INodeData, INodeOutputsValue, INodeParams } from '../../../src/Interface'
import { omit } from 'lodash'
import { IDocument, INode, INodeData, INodeOutputsValue, INodeParams } from '../../../src/Interface'
import { TextSplitter } from 'langchain/text_splitter'
import { Document } from '@langchain/core/documents'
import { handleEscapeCharacters } from '../../../src'
@@ -40,9 +41,21 @@ class PlainText_DocumentLoaders implements INode {
optional: true
},
{
label: 'Metadata',
label: 'Additional Metadata',
name: 'metadata',
type: 'json',
description: 'Additional metadata to be added to the extracted documents',
optional: true,
additionalParams: true
},
{
label: 'Omit Metadata Keys',
name: 'omitMetadataKeys',
type: 'string',
rows: 4,
description:
'Each document loader comes with a default set of metadata keys that are extracted from the document. You can use this field to omit some of the default metadata keys. The value should be a list of keys, seperated by comma',
placeholder: 'key1, key2, key3.nestedKey1',
optional: true,
additionalParams: true
}
@@ -68,42 +81,54 @@ class PlainText_DocumentLoaders implements INode {
const text = nodeData.inputs?.text as string
const metadata = nodeData.inputs?.metadata
const output = nodeData.outputs?.output as string
const _omitMetadataKeys = nodeData.inputs?.omitMetadataKeys as string
let alldocs: Document<Record<string, any>>[] = []
let omitMetadataKeys: string[] = []
if (_omitMetadataKeys) {
omitMetadataKeys = _omitMetadataKeys.split(',').map((key) => key.trim())
}
let docs: IDocument[] = []
if (textSplitter) {
const docs = await textSplitter.createDocuments([text])
alldocs.push(...docs)
docs.push(...(await textSplitter.createDocuments([text])))
} else {
alldocs.push(
docs.push(
new Document({
pageContent: text
})
)
}
let finaldocs: Document<Record<string, any>>[] = []
if (metadata) {
const parsedMetadata = typeof metadata === 'object' ? metadata : JSON.parse(metadata)
for (const doc of alldocs) {
const newdoc = {
...doc,
metadata: {
docs = docs.map((doc) => ({
...doc,
metadata: omit(
{
...doc.metadata,
...parsedMetadata
}
}
finaldocs.push(newdoc)
}
},
omitMetadataKeys
)
}))
} else {
finaldocs = alldocs
docs = docs.map((doc) => ({
...doc,
metadata: omit(
{
...doc.metadata
},
omitMetadataKeys
)
}))
}
if (output === 'document') {
return finaldocs
return docs
} else {
let finaltext = ''
for (const doc of finaldocs) {
for (const doc of docs) {
finaltext += `${doc.pageContent}\n`
}
return handleEscapeCharacters(finaltext, false)
@@ -1,4 +1,5 @@
import { ICommonObject, INode, INodeData, INodeParams } from '../../../src/Interface'
import { omit } from 'lodash'
import { ICommonObject, IDocument, INode, INodeData, INodeParams } from '../../../src/Interface'
import { TextSplitter } from 'langchain/text_splitter'
import { Browser, Page, PlaywrightWebBaseLoader, PlaywrightWebBaseLoaderOptions } from 'langchain/document_loaders/web/playwright'
import { test } from 'linkifyjs'
@@ -53,6 +54,7 @@ class Playwright_DocumentLoaders implements INode {
description: 'Scrape relative links from XML sitemap URL'
}
],
default: 'webCrawl',
optional: true,
additionalParams: true
},
@@ -106,9 +108,21 @@ class Playwright_DocumentLoaders implements INode {
description: 'CSS selectors like .div or #div'
},
{
label: 'Metadata',
label: 'Additional Metadata',
name: 'metadata',
type: 'json',
description: 'Additional metadata to be added to the extracted documents',
optional: true,
additionalParams: true
},
{
label: 'Omit Metadata Keys',
name: 'omitMetadataKeys',
type: 'string',
rows: 4,
description:
'Each document loader comes with a default set of metadata keys that are extracted from the document. You can use this field to omit some of the default metadata keys. The value should be a list of keys, seperated by comma',
placeholder: 'key1, key2, key3.nestedKey1',
optional: true,
additionalParams: true
}
@@ -123,6 +137,12 @@ class Playwright_DocumentLoaders implements INode {
let limit = parseInt(nodeData.inputs?.limit as string)
let waitUntilGoToOption = nodeData.inputs?.waitUntilGoToOption as 'load' | 'domcontentloaded' | 'networkidle' | 'commit' | undefined
let waitForSelector = nodeData.inputs?.waitForSelector as string
const _omitMetadataKeys = nodeData.inputs?.omitMetadataKeys as string
let omitMetadataKeys: string[] = []
if (_omitMetadataKeys) {
omitMetadataKeys = _omitMetadataKeys.split(',').map((key) => key.trim())
}
let url = nodeData.inputs?.url as string
url = url.trim()
@@ -164,7 +184,7 @@ class Playwright_DocumentLoaders implements INode {
}
}
let docs = []
let docs: IDocument[] = []
if (relativeLinksMethod) {
if (process.env.DEBUG === 'true') options.logger.info(`Start ${relativeLinksMethod}`)
// if limit is 0 we don't want it to default to 10 so we check explicitly for null or undefined
@@ -195,18 +215,26 @@ class Playwright_DocumentLoaders implements INode {
if (metadata) {
const parsedMetadata = typeof metadata === 'object' ? metadata : JSON.parse(metadata)
let finaldocs = []
for (const doc of docs) {
const newdoc = {
...doc,
metadata: {
docs = docs.map((doc) => ({
...doc,
metadata: omit(
{
...doc.metadata,
...parsedMetadata
}
}
finaldocs.push(newdoc)
}
return finaldocs
},
omitMetadataKeys
)
}))
} else {
docs = docs.map((doc) => ({
...doc,
metadata: omit(
{
...doc.metadata
},
omitMetadataKeys
)
}))
}
return docs
@@ -1,4 +1,5 @@
import { ICommonObject, INode, INodeData, INodeParams } from '../../../src/Interface'
import { omit } from 'lodash'
import { ICommonObject, IDocument, INode, INodeData, INodeParams } from '../../../src/Interface'
import { TextSplitter } from 'langchain/text_splitter'
import { Browser, Page, PuppeteerWebBaseLoader, PuppeteerWebBaseLoaderOptions } from 'langchain/document_loaders/web/puppeteer'
import { test } from 'linkifyjs'
@@ -54,6 +55,7 @@ class Puppeteer_DocumentLoaders implements INode {
description: 'Scrape relative links from XML sitemap URL'
}
],
default: 'webCrawl',
optional: true,
additionalParams: true
},
@@ -107,9 +109,21 @@ class Puppeteer_DocumentLoaders implements INode {
description: 'CSS selectors like .div or #div'
},
{
label: 'Metadata',
label: 'Additional Metadata',
name: 'metadata',
type: 'json',
description: 'Additional metadata to be added to the extracted documents',
optional: true,
additionalParams: true
},
{
label: 'Omit Metadata Keys',
name: 'omitMetadataKeys',
type: 'string',
rows: 4,
description:
'Each document loader comes with a default set of metadata keys that are extracted from the document. You can use this field to omit some of the default metadata keys. The value should be a list of keys, seperated by comma',
placeholder: 'key1, key2, key3.nestedKey1',
optional: true,
additionalParams: true
}
@@ -124,6 +138,12 @@ class Puppeteer_DocumentLoaders implements INode {
let limit = parseInt(nodeData.inputs?.limit as string)
let waitUntilGoToOption = nodeData.inputs?.waitUntilGoToOption as PuppeteerLifeCycleEvent
let waitForSelector = nodeData.inputs?.waitForSelector as string
const _omitMetadataKeys = nodeData.inputs?.omitMetadataKeys as string
let omitMetadataKeys: string[] = []
if (_omitMetadataKeys) {
omitMetadataKeys = _omitMetadataKeys.split(',').map((key) => key.trim())
}
let url = nodeData.inputs?.url as string
url = url.trim()
@@ -165,7 +185,7 @@ class Puppeteer_DocumentLoaders implements INode {
}
}
let docs = []
let docs: IDocument[] = []
if (relativeLinksMethod) {
if (process.env.DEBUG === 'true') options.logger.info(`Start ${relativeLinksMethod}`)
// if limit is 0 we don't want it to default to 10 so we check explicitly for null or undefined
@@ -196,18 +216,26 @@ class Puppeteer_DocumentLoaders implements INode {
if (metadata) {
const parsedMetadata = typeof metadata === 'object' ? metadata : JSON.parse(metadata)
let finaldocs = []
for (const doc of docs) {
const newdoc = {
...doc,
metadata: {
docs = docs.map((doc) => ({
...doc,
metadata: omit(
{
...doc.metadata,
...parsedMetadata
}
}
finaldocs.push(newdoc)
}
return finaldocs
},
omitMetadataKeys
)
}))
} else {
docs = docs.map((doc) => ({
...doc,
metadata: omit(
{
...doc.metadata
},
omitMetadataKeys
)
}))
}
return docs
@@ -1,3 +1,4 @@
import { omit } from 'lodash'
import { ICommonObject, INode, INodeData, INodeOptionsValue, INodeParams } from '../../../src/Interface'
import { S3Loader } from 'langchain/document_loaders/web/s3'
import {
@@ -413,9 +414,21 @@ class S3_DocumentLoaders implements INode {
default: '500'
},
{
label: 'Metadata',
label: 'Additional Metadata',
name: 'metadata',
type: 'json',
description: 'Additional metadata to be added to the extracted documents',
optional: true,
additionalParams: true
},
{
    // Free-text input: a comma-separated list of metadata keys; init() splits it on ',' and
    // trims each entry before passing the list to lodash `omit` on every document's metadata.
    label: 'Omit Metadata Keys',
    name: 'omitMetadataKeys',
    type: 'string',
    rows: 4,
    description:
        'Each document loader comes with a default set of metadata keys that are extracted from the document. You can use this field to omit some of the default metadata keys. The value should be a list of keys, separated by comma',
    placeholder: 'key1, key2, key3.nestedKey1',
    optional: true,
    additionalParams: true
}
@@ -451,6 +464,12 @@ class S3_DocumentLoaders implements INode {
const combineUnderNChars = nodeData.inputs?.combineUnderNChars as number
const newAfterNChars = nodeData.inputs?.newAfterNChars as number
const maxCharacters = nodeData.inputs?.maxCharacters as number
const _omitMetadataKeys = nodeData.inputs?.omitMetadataKeys as string
let omitMetadataKeys: string[] = []
if (_omitMetadataKeys) {
omitMetadataKeys = _omitMetadataKeys.split(',').map((key) => key.trim())
}
let credentials: S3ClientConfig['credentials'] | undefined
@@ -542,19 +561,25 @@ class S3_DocumentLoaders implements INode {
const parsedMetadata = typeof metadata === 'object' ? metadata : JSON.parse(metadata)
docs = docs.map((doc) => ({
...doc,
metadata: {
...doc.metadata,
...parsedMetadata,
[sourceIdKey]: doc.metadata[sourceIdKey] || sourceIdKey
}
metadata: omit(
{
...doc.metadata,
...parsedMetadata,
[sourceIdKey]: doc.metadata[sourceIdKey] || sourceIdKey
},
omitMetadataKeys
)
}))
} else {
docs = docs.map((doc) => ({
...doc,
metadata: {
...doc.metadata,
[sourceIdKey]: doc.metadata[sourceIdKey] || sourceIdKey
}
metadata: omit(
{
...doc.metadata,
[sourceIdKey]: doc.metadata[sourceIdKey] || sourceIdKey
},
omitMetadataKeys
)
}))
}
@@ -1,3 +1,4 @@
import { omit } from 'lodash'
import { ICommonObject, INode, INodeData, INodeParams } from '../../../src/Interface'
import { TextSplitter } from 'langchain/text_splitter'
import { SearchApiLoader } from 'langchain/document_loaders/web/searchapi'
@@ -54,9 +55,21 @@ class SearchAPI_DocumentLoaders implements INode {
optional: true
},
{
label: 'Metadata',
label: 'Additional Metadata',
name: 'metadata',
type: 'json',
description: 'Additional metadata to be added to the extracted documents',
optional: true,
additionalParams: true
},
{
    // Free-text input: a comma-separated list of metadata keys; init() splits it on ',' and
    // trims each entry before passing the list to lodash `omit` on every document's metadata.
    label: 'Omit Metadata Keys',
    name: 'omitMetadataKeys',
    type: 'string',
    rows: 4,
    description:
        'Each document loader comes with a default set of metadata keys that are extracted from the document. You can use this field to omit some of the default metadata keys. The value should be a list of keys, separated by comma',
    placeholder: 'key1, key2, key3.nestedKey1',
    optional: true,
    additionalParams: true
}
@@ -68,6 +81,12 @@ class SearchAPI_DocumentLoaders implements INode {
const query = nodeData.inputs?.query as string
const customParameters = nodeData.inputs?.customParameters
const metadata = nodeData.inputs?.metadata
const _omitMetadataKeys = nodeData.inputs?.omitMetadataKeys as string
let omitMetadataKeys: string[] = []
if (_omitMetadataKeys) {
omitMetadataKeys = _omitMetadataKeys.split(',').map((key) => key.trim())
}
// Fetch the API credentials for this node
const credentialData = await getCredentialData(nodeData.credential ?? '', options)
@@ -87,19 +106,30 @@ class SearchAPI_DocumentLoaders implements INode {
const loader = new SearchApiLoader(loaderConfig)
// Fetch documents, split if a text splitter is provided
const docs = textSplitter ? await loader.loadAndSplit() : await loader.load()
let docs = textSplitter ? await loader.loadAndSplit() : await loader.load()
if (metadata) {
const parsedMetadata = typeof metadata === 'object' ? metadata : JSON.parse(metadata)
return docs.map((doc) => {
return {
...doc,
metadata: {
docs = docs.map((doc) => ({
...doc,
metadata: omit(
{
...doc.metadata,
...parsedMetadata
}
}
})
},
omitMetadataKeys
)
}))
} else {
docs = docs.map((doc) => ({
...doc,
metadata: omit(
{
...doc.metadata
},
omitMetadataKeys
)
}))
}
return docs
@@ -1,3 +1,4 @@
import { omit } from 'lodash'
import { ICommonObject, INode, INodeData, INodeParams } from '../../../src/Interface'
import { TextSplitter } from 'langchain/text_splitter'
import { SerpAPILoader } from 'langchain/document_loaders/web/serpapi'
@@ -44,9 +45,21 @@ class SerpAPI_DocumentLoaders implements INode {
optional: true
},
{
label: 'Metadata',
label: 'Additional Metadata',
name: 'metadata',
type: 'json',
description: 'Additional metadata to be added to the extracted documents',
optional: true,
additionalParams: true
},
{
    // Free-text input: a comma-separated list of metadata keys; init() splits it on ',' and
    // trims each entry before passing the list to lodash `omit` on every document's metadata.
    label: 'Omit Metadata Keys',
    name: 'omitMetadataKeys',
    type: 'string',
    rows: 4,
    description:
        'Each document loader comes with a default set of metadata keys that are extracted from the document. You can use this field to omit some of the default metadata keys. The value should be a list of keys, separated by comma',
    placeholder: 'key1, key2, key3.nestedKey1',
    optional: true,
    additionalParams: true
}
@@ -57,23 +70,40 @@ class SerpAPI_DocumentLoaders implements INode {
const textSplitter = nodeData.inputs?.textSplitter as TextSplitter
const query = nodeData.inputs?.query as string
const metadata = nodeData.inputs?.metadata
const _omitMetadataKeys = nodeData.inputs?.omitMetadataKeys as string
let omitMetadataKeys: string[] = []
if (_omitMetadataKeys) {
omitMetadataKeys = _omitMetadataKeys.split(',').map((key) => key.trim())
}
const credentialData = await getCredentialData(nodeData.credential ?? '', options)
const serpApiKey = getCredentialParam('serpApiKey', credentialData, nodeData)
const loader = new SerpAPILoader({ q: query, apiKey: serpApiKey })
const docs = textSplitter ? await loader.loadAndSplit() : await loader.load()
let docs = textSplitter ? await loader.loadAndSplit() : await loader.load()
if (metadata) {
const parsedMetadata = typeof metadata === 'object' ? metadata : JSON.parse(metadata)
return docs.map((doc) => {
return {
...doc,
metadata: {
docs = docs.map((doc) => ({
...doc,
metadata: omit(
{
...doc.metadata,
...parsedMetadata
}
}
})
},
omitMetadataKeys
)
}))
} else {
docs = docs.map((doc) => ({
...doc,
metadata: omit(
{
...doc.metadata
},
omitMetadataKeys
)
}))
}
return docs
@@ -1,7 +1,7 @@
import { ICommonObject, INode, INodeData, INodeOutputsValue, INodeParams } from '../../../src/Interface'
import { omit } from 'lodash'
import { ICommonObject, IDocument, INode, INodeData, INodeOutputsValue, INodeParams } from '../../../src/Interface'
import { TextSplitter } from 'langchain/text_splitter'
import { TextLoader } from 'langchain/document_loaders/fs/text'
import { Document } from '@langchain/core/documents'
import { getFileFromStorage, handleEscapeCharacters } from '../../../src'
class Text_DocumentLoaders implements INode {
@@ -40,9 +40,21 @@ class Text_DocumentLoaders implements INode {
optional: true
},
{
label: 'Metadata',
label: 'Additional Metadata',
name: 'metadata',
type: 'json',
description: 'Additional metadata to be added to the extracted documents',
optional: true,
additionalParams: true
},
{
    // Free-text input: a comma-separated list of metadata keys; init() splits it on ',' and
    // trims each entry before passing the list to lodash `omit` on every document's metadata.
    label: 'Omit Metadata Keys',
    name: 'omitMetadataKeys',
    type: 'string',
    rows: 4,
    description:
        'Each document loader comes with a default set of metadata keys that are extracted from the document. You can use this field to omit some of the default metadata keys. The value should be a list of keys, separated by comma',
    placeholder: 'key1, key2, key3.nestedKey1',
    optional: true,
    additionalParams: true
}
@@ -68,8 +80,14 @@ class Text_DocumentLoaders implements INode {
const txtFileBase64 = nodeData.inputs?.txtFile as string
const metadata = nodeData.inputs?.metadata
const output = nodeData.outputs?.output as string
const _omitMetadataKeys = nodeData.inputs?.omitMetadataKeys as string
let alldocs = []
let omitMetadataKeys: string[] = []
if (_omitMetadataKeys) {
omitMetadataKeys = _omitMetadataKeys.split(',').map((key) => key.trim())
}
let docs: IDocument[] = []
let files: string[] = []
//FILE-STORAGE::["CONTRIBUTING.md","LICENSE.md","README.md"]
@@ -88,11 +106,9 @@ class Text_DocumentLoaders implements INode {
const loader = new TextLoader(blob)
if (textSplitter) {
const docs = await loader.loadAndSplit(textSplitter)
alldocs.push(...docs)
docs.push(...(await loader.loadAndSplit(textSplitter)))
} else {
const docs = await loader.load()
alldocs.push(...docs)
docs.push(...(await loader.load()))
}
}
} else {
@@ -110,37 +126,42 @@ class Text_DocumentLoaders implements INode {
const loader = new TextLoader(blob)
if (textSplitter) {
const docs = await loader.loadAndSplit(textSplitter)
alldocs.push(...docs)
docs.push(...(await loader.loadAndSplit(textSplitter)))
} else {
const docs = await loader.load()
alldocs.push(...docs)
docs.push(...(await loader.load()))
}
}
}
let finaldocs: Document<Record<string, any>>[] = []
if (metadata) {
const parsedMetadata = typeof metadata === 'object' ? metadata : JSON.parse(metadata)
for (const doc of alldocs) {
const newdoc = {
...doc,
metadata: {
docs = docs.map((doc) => ({
...doc,
metadata: omit(
{
...doc.metadata,
...parsedMetadata
}
}
finaldocs.push(newdoc)
}
},
omitMetadataKeys
)
}))
} else {
finaldocs = alldocs
docs = docs.map((doc) => ({
...doc,
metadata: omit(
{
...doc.metadata
},
omitMetadataKeys
)
}))
}
if (output === 'document') {
return finaldocs
return docs
} else {
let finaltext = ''
for (const doc of finaldocs) {
for (const doc of docs) {
finaltext += `${doc.pageContent}\n`
}
return handleEscapeCharacters(finaltext, false)
@@ -1,4 +1,5 @@
import { ICommonObject, INode, INodeData, INodeParams } from '../../../src/Interface'
import { omit } from 'lodash'
import { ICommonObject, IDocument, INode, INodeData, INodeParams } from '../../../src/Interface'
import {
UnstructuredLoaderOptions,
UnstructuredLoaderStrategy,
@@ -400,9 +401,21 @@ class UnstructuredFile_DocumentLoaders implements INode {
default: '500'
},
{
label: 'Metadata',
label: 'Additional Metadata',
name: 'metadata',
type: 'json',
description: 'Additional metadata to be added to the extracted documents',
optional: true,
additionalParams: true
},
{
    // Free-text input: a comma-separated list of metadata keys; init() splits it on ',' and
    // trims each entry before passing the list to lodash `omit` on every document's metadata.
    label: 'Omit Metadata Keys',
    name: 'omitMetadataKeys',
    type: 'string',
    rows: 4,
    description:
        'Each document loader comes with a default set of metadata keys that are extracted from the document. You can use this field to omit some of the default metadata keys. The value should be a list of keys, separated by comma',
    placeholder: 'key1, key2, key3.nestedKey1',
    optional: true,
    additionalParams: true
}
@@ -429,6 +442,12 @@ class UnstructuredFile_DocumentLoaders implements INode {
const combineUnderNChars = nodeData.inputs?.combineUnderNChars as number
const newAfterNChars = nodeData.inputs?.newAfterNChars as number
const maxCharacters = nodeData.inputs?.maxCharacters as number
const _omitMetadataKeys = nodeData.inputs?.omitMetadataKeys as string
let omitMetadataKeys: string[] = []
if (_omitMetadataKeys) {
omitMetadataKeys = _omitMetadataKeys.split(',').map((key) => key.trim())
}
const fileBase64 = nodeData.inputs?.fileObject as string
const obj: UnstructuredLoaderOptions = {
@@ -452,7 +471,7 @@ class UnstructuredFile_DocumentLoaders implements INode {
const unstructuredAPIKey = getCredentialParam('unstructuredAPIKey', credentialData, nodeData)
if (unstructuredAPIKey) obj.apiKey = unstructuredAPIKey
let docs: any[] = []
let docs: IDocument[] = []
let files: string[] = []
if (fileBase64) {
@@ -499,19 +518,25 @@ class UnstructuredFile_DocumentLoaders implements INode {
const parsedMetadata = typeof metadata === 'object' ? metadata : JSON.parse(metadata)
docs = docs.map((doc) => ({
...doc,
metadata: {
...doc.metadata,
...parsedMetadata,
[sourceIdKey]: doc.metadata[sourceIdKey] || sourceIdKey
}
metadata: omit(
{
...doc.metadata,
...parsedMetadata,
[sourceIdKey]: doc.metadata[sourceIdKey] || sourceIdKey
},
omitMetadataKeys
)
}))
} else {
docs = docs.map((doc) => ({
...doc,
metadata: {
...doc.metadata,
[sourceIdKey]: doc.metadata[sourceIdKey] || sourceIdKey
}
metadata: omit(
{
...doc.metadata,
[sourceIdKey]: doc.metadata[sourceIdKey] || sourceIdKey
},
omitMetadataKeys
)
}))
}
@@ -1,3 +1,4 @@
import { omit } from 'lodash'
import { ICommonObject, INode, INodeData, INodeParams } from '../../../src/Interface'
import {
UnstructuredDirectoryLoader,
@@ -379,9 +380,21 @@ class UnstructuredFolder_DocumentLoaders implements INode {
default: '500'
},
{
label: 'Metadata',
label: 'Additional Metadata',
name: 'metadata',
type: 'json',
description: 'Additional metadata to be added to the extracted documents',
optional: true,
additionalParams: true
},
{
    // Free-text input: a comma-separated list of metadata keys; init() splits it on ',' and
    // trims each entry before passing the list to lodash `omit` on every document's metadata.
    label: 'Omit Metadata Keys',
    name: 'omitMetadataKeys',
    type: 'string',
    rows: 4,
    description:
        'Each document loader comes with a default set of metadata keys that are extracted from the document. You can use this field to omit some of the default metadata keys. The value should be a list of keys, separated by comma',
    placeholder: 'key1, key2, key3.nestedKey1',
    optional: true,
    additionalParams: true
}
@@ -408,6 +421,12 @@ class UnstructuredFolder_DocumentLoaders implements INode {
const combineUnderNChars = nodeData.inputs?.combineUnderNChars as number
const newAfterNChars = nodeData.inputs?.newAfterNChars as number
const maxCharacters = nodeData.inputs?.maxCharacters as number
const _omitMetadataKeys = nodeData.inputs?.omitMetadataKeys as string
let omitMetadataKeys: string[] = []
if (_omitMetadataKeys) {
omitMetadataKeys = _omitMetadataKeys.split(',').map((key) => key.trim())
}
const obj: UnstructuredLoaderOptions = {
apiUrl: unstructuredAPIUrl,
@@ -437,19 +456,25 @@ class UnstructuredFolder_DocumentLoaders implements INode {
const parsedMetadata = typeof metadata === 'object' ? metadata : JSON.parse(metadata)
docs = docs.map((doc) => ({
...doc,
metadata: {
...doc.metadata,
...parsedMetadata,
[sourceIdKey]: doc.metadata[sourceIdKey] || sourceIdKey
}
metadata: omit(
{
...doc.metadata,
...parsedMetadata,
[sourceIdKey]: doc.metadata[sourceIdKey] || sourceIdKey
},
omitMetadataKeys
)
}))
} else {
docs = docs.map((doc) => ({
...doc,
metadata: {
...doc.metadata,
[sourceIdKey]: doc.metadata[sourceIdKey] || sourceIdKey
}
metadata: omit(
{
...doc.metadata,
[sourceIdKey]: doc.metadata[sourceIdKey] || sourceIdKey
},
omitMetadataKeys
)
}))
}
@@ -27,6 +27,7 @@ class CharacterTextSplitter_TextSplitters implements INode {
label: 'Chunk Size',
name: 'chunkSize',
type: 'number',
description: 'Number of characters in each chunk. Default is 1000.',
default: 1000,
optional: true
},
@@ -34,6 +35,8 @@ class CharacterTextSplitter_TextSplitters implements INode {
label: 'Chunk Overlap',
name: 'chunkOverlap',
type: 'number',
description: 'Number of characters to overlap between chunks. Default is 200.',
default: 200,
optional: true
},
{
@@ -101,6 +101,7 @@ class CodeTextSplitter_TextSplitters implements INode {
label: 'Chunk Size',
name: 'chunkSize',
type: 'number',
description: 'Number of characters in each chunk. Default is 1000.',
default: 1000,
optional: true
},
@@ -108,6 +109,8 @@ class CodeTextSplitter_TextSplitters implements INode {
label: 'Chunk Overlap',
name: 'chunkOverlap',
type: 'number',
description: 'Number of characters to overlap between chunks. Default is 200.',
default: 200,
optional: true
}
]
@@ -28,6 +28,7 @@ class HtmlToMarkdownTextSplitter_TextSplitters implements INode {
label: 'Chunk Size',
name: 'chunkSize',
type: 'number',
description: 'Number of characters in each chunk. Default is 1000.',
default: 1000,
optional: true
},
@@ -35,6 +36,8 @@ class HtmlToMarkdownTextSplitter_TextSplitters implements INode {
label: 'Chunk Overlap',
name: 'chunkOverlap',
type: 'number',
description: 'Number of characters to overlap between chunks. Default is 200.',
default: 200,
optional: true
}
]
@@ -27,6 +27,7 @@ class MarkdownTextSplitter_TextSplitters implements INode {
label: 'Chunk Size',
name: 'chunkSize',
type: 'number',
description: 'Number of characters in each chunk. Default is 1000.',
default: 1000,
optional: true
},
@@ -34,6 +35,8 @@ class MarkdownTextSplitter_TextSplitters implements INode {
label: 'Chunk Overlap',
name: 'chunkOverlap',
type: 'number',
description: 'Number of characters to overlap between chunks. Default is 200.',
default: 200,
optional: true
}
]
@@ -27,6 +27,7 @@ class RecursiveCharacterTextSplitter_TextSplitters implements INode {
label: 'Chunk Size',
name: 'chunkSize',
type: 'number',
description: 'Number of characters in each chunk. Default is 1000.',
default: 1000,
optional: true
},
@@ -34,6 +35,8 @@ class RecursiveCharacterTextSplitter_TextSplitters implements INode {
label: 'Chunk Overlap',
name: 'chunkOverlap',
type: 'number',
description: 'Number of characters to overlap between chunks. Default is 200.',
default: 200,
optional: true
},
{
@@ -56,6 +56,7 @@ class TokenTextSplitter_TextSplitters implements INode {
label: 'Chunk Size',
name: 'chunkSize',
type: 'number',
description: 'Number of tokens in each chunk. Default is 1000.',
default: 1000,
optional: true
},
@@ -63,6 +64,8 @@ class TokenTextSplitter_TextSplitters implements INode {
label: 'Chunk Overlap',
name: 'chunkOverlap',
type: 'number',
description: 'Number of tokens to overlap between chunks. Default is 200.',
default: 200,
optional: true
}
]
+5
View File
@@ -176,6 +176,11 @@ export type MessageContentImageUrl = {
}
}
/**
 * Minimal shape of a document handled by the loaders: text content plus a metadata record.
 * NOTE(review): appears to mirror the `pageContent`/`metadata` shape of LangChain's `Document` — confirm.
 *
 * @typeParam Metadata - type of the metadata record; defaults to an open string-keyed record.
 */
export interface IDocument<Metadata extends Record<string, any> = Record<string, any>> {
// Text content of the document.
pageContent: string
// Metadata record attached to the document.
metadata: Metadata
}
/**
* Classes
*/
+15
View File
@@ -135,6 +135,21 @@ export const removeFilesFromStorage = async (...paths: string[]) => {
}
}
/**
 * Removes the single stored item addressed by the given path segments.
 *
 * When the storage type is 's3', the segments are concatenated with '/' into one key
 * (without a leading slash) and handed to `_deleteS3Folder`. For any other storage type
 * the segments are joined under `getStoragePath()` and that file is unlinked synchronously.
 *
 * @param paths - ordered path segments locating the file inside the storage root
 * @returns a promise that resolves once the removal has completed
 */
export const removeSpecificFileFromStorage = async (...paths: string[]) => {
    // Local storage: resolve the absolute location and delete it.
    if (getStorageType() !== 's3') {
        const target = path.join(getStoragePath(), ...paths)
        fs.unlinkSync(target)
        return
    }

    // S3: prefix every segment with '/', glue them together, then drop the first '/' if present.
    const slashPrefixed = paths.map((segment) => '/' + segment).join('')
    const objectKey = slashPrefixed.startsWith('/') ? slashPrefixed.substring(1) : slashPrefixed
    await _deleteS3Folder(objectKey)
}
export const removeFolderFromStorage = async (...paths: string[]) => {
const storageType = getStorageType()
if (storageType === 's3') {