Feature/Add ability to create new doc store on upsert (#3965)

Add ability to create a new doc store on upsert; update FireCrawl properties
This commit is contained in:
Henry Heng
2025-02-01 14:28:50 +00:00
committed by GitHub
parent a49177f7fb
commit 20a797d2e0
5 changed files with 122 additions and 3 deletions
@@ -679,6 +679,11 @@ paths:
type: string type: string
format: binary format: binary
description: Files to be uploaded description: Files to be uploaded
docId:
type: string
nullable: true
example: '603a7b51-ae7c-4b0a-8865-e454ed2f6766'
description: Document ID to use existing configuration
loader: loader:
type: string type: string
nullable: true nullable: true
@@ -704,6 +709,32 @@ paths:
nullable: true nullable: true
example: '{"name":"postgresRecordManager"}' example: '{"name":"postgresRecordManager"}'
description: Record Manager configurations description: Record Manager configurations
metadata:
type: object
nullable: true
description: Metadata associated with the document
example: { 'foo': 'bar' }
replaceExisting:
type: boolean
nullable: true
description: Whether to replace existing document loader with the new upserted chunks. However this does not delete the existing embeddings in the vector store
createNewDocStore:
type: boolean
nullable: true
description: Whether to create a new document store
docStore:
type: object
nullable: true
description: Only when createNewDocStore is true, pass in the new document store configuration
properties:
name:
type: string
example: plainText
description: Name of the new document store to be created
description:
type: string
example: plainText
description: Description of the new document store to be created
required: required:
- files - files
required: true required: true
@@ -2350,16 +2381,37 @@ components:
docId: docId:
type: string type: string
format: uuid format: uuid
nullable: true
description: Document ID within the store. If provided, existing configuration from the document will be used for the new document description: Document ID within the store. If provided, existing configuration from the document will be used for the new document
metadata: metadata:
type: object type: object
nullable: true
description: Metadata associated with the document description: Metadata associated with the document
example: { 'foo': 'bar' } example: { 'foo': 'bar' }
replaceExisting: replaceExisting:
type: boolean type: boolean
nullable: true
description: Whether to replace existing document loader with the new upserted chunks. However this does not delete the existing embeddings in the vector store description: Whether to replace existing document loader with the new upserted chunks. However this does not delete the existing embeddings in the vector store
createNewDocStore:
type: boolean
nullable: true
description: Whether to create a new document store
docStore:
type: object
nullable: true
description: Only when createNewDocStore is true, pass in the new document store configuration
properties:
name:
type: string
example: plainText
description: Name of the new document store to be created
description:
type: string
example: plainText
description: Description of the new document store to be created
loader: loader:
type: object type: object
nullable: true
properties: properties:
name: name:
type: string type: string
@@ -2370,6 +2422,7 @@ components:
description: Configuration for the loader description: Configuration for the loader
splitter: splitter:
type: object type: object
nullable: true
properties: properties:
name: name:
type: string type: string
@@ -2380,6 +2433,7 @@ components:
description: Configuration for the text splitter description: Configuration for the text splitter
embedding: embedding:
type: object type: object
nullable: true
properties: properties:
name: name:
type: string type: string
@@ -2390,6 +2444,7 @@ components:
description: Configuration for the embedding generator description: Configuration for the embedding generator
vectorStore: vectorStore:
type: object type: object
nullable: true
properties: properties:
name: name:
type: string type: string
@@ -2400,6 +2455,7 @@ components:
description: Configuration for the vector store description: Configuration for the vector store
recordManager: recordManager:
type: object type: object
nullable: true
properties: properties:
name: name:
type: string type: string
@@ -266,7 +266,7 @@ class FireCrawl_DocumentLoaders implements INode {
this.name = 'fireCrawl' this.name = 'fireCrawl'
this.type = 'Document' this.type = 'Document'
this.icon = 'firecrawl.png' this.icon = 'firecrawl.png'
this.version = 2.0 this.version = 2.1
this.category = 'Document Loaders' this.category = 'Document Loaders'
this.description = 'Load data from URL using FireCrawl' this.description = 'Load data from URL using FireCrawl'
this.baseClasses = [this.type] this.baseClasses = [this.type]
@@ -307,6 +307,42 @@ class FireCrawl_DocumentLoaders implements INode {
} }
], ],
default: 'crawl' default: 'crawl'
},
{
// maxCrawlPages
label: 'Max Crawl Pages',
name: 'maxCrawlPages',
type: 'string',
description: 'Maximum number of pages to crawl',
optional: true,
additionalParams: true
},
{
// generateImgAltText
label: 'Generate Image Alt Text',
name: 'generateImgAltText',
type: 'boolean',
description: 'Generate alt text for images',
optional: true,
additionalParams: true
},
{
// returnOnlyUrls
label: 'Return Only URLs',
name: 'returnOnlyUrls',
type: 'boolean',
description: 'Return only URLs of the crawled pages',
optional: true,
additionalParams: true
},
{
// onlyMainContent
label: 'Only Main Content',
name: 'onlyMainContent',
type: 'boolean',
description: 'Extract only the main content of the page',
optional: true,
additionalParams: true
} }
// ... (other input parameters) // ... (other input parameters)
] ]
@@ -76,6 +76,8 @@ export interface IDocumentStoreUpsertData {
docId: string docId: string
metadata?: string | object metadata?: string | object
replaceExisting?: boolean replaceExisting?: boolean
createNewDocStore?: boolean
docStore?: IDocumentStore
loader?: { loader?: {
name: string name: string
config: ICommonObject config: ICommonObject
@@ -32,7 +32,8 @@ import {
INodeData, INodeData,
MODE, MODE,
IOverrideConfig, IOverrideConfig,
IExecutePreviewLoader IExecutePreviewLoader,
DocumentStoreDTO
} from '../../Interface' } from '../../Interface'
import { DocumentStoreFileChunk } from '../../database/entities/DocumentStoreFileChunk' import { DocumentStoreFileChunk } from '../../database/entities/DocumentStoreFileChunk'
import { v4 as uuidv4 } from 'uuid' import { v4 as uuidv4 } from 'uuid'
@@ -1464,6 +1465,7 @@ const upsertDocStore = async (
} }
} }
const replaceExisting = data.replaceExisting ?? false const replaceExisting = data.replaceExisting ?? false
const createNewDocStore = data.createNewDocStore ?? false
const newLoader = typeof data.loader === 'string' ? JSON.parse(data.loader) : data.loader const newLoader = typeof data.loader === 'string' ? JSON.parse(data.loader) : data.loader
const newSplitter = typeof data.splitter === 'string' ? JSON.parse(data.splitter) : data.splitter const newSplitter = typeof data.splitter === 'string' ? JSON.parse(data.splitter) : data.splitter
const newVectorStore = typeof data.vectorStore === 'string' ? JSON.parse(data.vectorStore) : data.vectorStore const newVectorStore = typeof data.vectorStore === 'string' ? JSON.parse(data.vectorStore) : data.vectorStore
@@ -1533,6 +1535,15 @@ const upsertDocStore = async (
recordManagerConfig = JSON.parse(entity.recordManagerConfig || '{}')?.config recordManagerConfig = JSON.parse(entity.recordManagerConfig || '{}')?.config
} }
if (createNewDocStore) {
const docStoreBody = typeof data.docStore === 'string' ? JSON.parse(data.docStore) : data.docStore
const newDocumentStore = docStoreBody ?? { name: `Document Store ${Date.now().toString()}` }
const docStore = DocumentStoreDTO.toEntity(newDocumentStore)
const documentStore = appDataSource.getRepository(DocumentStore).create(docStore)
const dbResponse = await appDataSource.getRepository(DocumentStore).save(documentStore)
storeId = dbResponse.id
}
// Step 2: Replace with new values // Step 2: Replace with new values
loaderName = newLoader?.name ? getComponentLabelFromName(newLoader?.name) : loaderName loaderName = newLoader?.name ? getComponentLabelFromName(newLoader?.name) : loaderName
loaderId = newLoader?.name || loaderId loaderId = newLoader?.name || loaderId
@@ -1687,6 +1698,7 @@ const upsertDocStore = async (
isVectorStoreInsert: true isVectorStoreInsert: true
}) })
res.docId = newDocId res.docId = newDocId
if (createNewDocStore) res.storeId = storeId
return res return res
} catch (error) { } catch (error) {
@@ -41,11 +41,13 @@ body_data = {
"docId": "${dialogProps.loaderId}", "docId": "${dialogProps.loaderId}",
"metadata": {}, # Add additional metadata to the document chunks "metadata": {}, # Add additional metadata to the document chunks
"replaceExisting": True, # Replace existing document with the new upserted chunks "replaceExisting": True, # Replace existing document with the new upserted chunks
"createNewDocStore": False, # Create a new document store
"splitter": json.dumps({"config":{"chunkSize":20000}}) # Override existing configuration "splitter": json.dumps({"config":{"chunkSize":20000}}) # Override existing configuration
# "loader": "", # "loader": "",
# "vectorStore": "", # "vectorStore": "",
# "embedding": "", # "embedding": "",
# "recordManager": "", # "recordManager": "",
# "docStore": ""
} }
headers = { headers = {
@@ -71,11 +73,14 @@ formData.append("splitter", JSON.stringify({"config":{"chunkSize":20000}}));
formData.append("metadata", "{}"); formData.append("metadata", "{}");
// Replace existing document with the new upserted chunks // Replace existing document with the new upserted chunks
formData.append("replaceExisting", "true"); formData.append("replaceExisting", "true");
// Create a new document store
formData.append("createNewDocStore", "false");
// Override existing configuration // Override existing configuration
// formData.append("loader", ""); // formData.append("loader", "");
// formData.append("embedding", ""); // formData.append("embedding", "");
// formData.append("vectorStore", ""); // formData.append("vectorStore", "");
// formData.append("recordManager", ""); // formData.append("recordManager", "");
// formData.append("docStore", "");
async function query(formData) { async function query(formData) {
const response = await fetch( const response = await fetch(
@@ -105,11 +110,13 @@ curl -X POST http://localhost:3000/api/v1/document-store/upsert/${dialogProps.st
-F "splitter={"config":{"chunkSize":20000}}" \\ -F "splitter={"config":{"chunkSize":20000}}" \\
-F "metadata={}" \\ -F "metadata={}" \\
-F "replaceExisting=true" \\ -F "replaceExisting=true" \\
-F "createNewDocStore=false" \\
# Override existing configuration: # Override existing configuration:
# -F "loader=" \\ # -F "loader=" \\
# -F "embedding=" \\ # -F "embedding=" \\
# -F "vectorStore=" \\ # -F "vectorStore=" \\
# -F "recordManager=" # -F "recordManager=" \\
# -F "docStore="
\`\`\` \`\`\`
` `
} }
@@ -135,6 +142,7 @@ output = query({
"docId": "${dialogProps.loaderId}", "docId": "${dialogProps.loaderId}",
"metadata": "{}", # Add additional metadata to the document chunks "metadata": "{}", # Add additional metadata to the document chunks
"replaceExisting": True, # Replace existing document with the new upserted chunks "replaceExisting": True, # Replace existing document with the new upserted chunks
"createNewDocStore": False, # Create a new document store
# Override existing configuration # Override existing configuration
"loader": { "loader": {
"config": { "config": {
@@ -149,6 +157,7 @@ output = query({
# embedding: {}, # embedding: {},
# vectorStore: {}, # vectorStore: {},
# recordManager: {} # recordManager: {}
# docStore: {}
}) })
print(output) print(output)
\`\`\` \`\`\`
@@ -174,6 +183,7 @@ query({
"docId": "${dialogProps.loaderId}, "docId": "${dialogProps.loaderId},
"metadata": "{}", // Add additional metadata to the document chunks "metadata": "{}", // Add additional metadata to the document chunks
"replaceExisting": true, // Replace existing document with the new upserted chunks "replaceExisting": true, // Replace existing document with the new upserted chunks
"createNewDocStore": false, // Create a new document store
// Override existing configuration // Override existing configuration
"loader": { "loader": {
"config": { "config": {
@@ -188,6 +198,7 @@ query({
// embedding: {}, // embedding: {},
// vectorStore: {}, // vectorStore: {},
// recordManager: {} // recordManager: {}
// docStore: {}
}).then((response) => { }).then((response) => {
console.log(response); console.log(response);
}); });
@@ -201,6 +212,7 @@ curl -X POST http://localhost:3000/api/v1/document-store/upsert/${dialogProps.st
"docId": "${dialogProps.loaderId}", "docId": "${dialogProps.loaderId}",
"metadata": "{}", "metadata": "{}",
"replaceExisting": true, "replaceExisting": true,
"createNewDocStore": false,
"loader": { "loader": {
"config": { "config": {
"text": "This is a new text" "text": "This is a new text"
@@ -215,6 +227,7 @@ curl -X POST http://localhost:3000/api/v1/document-store/upsert/${dialogProps.st
// "embedding": {}, // "embedding": {},
// "vectorStore": {}, // "vectorStore": {},
// "recordManager": {} // "recordManager": {}
// "docStore": {}
}' }'
\`\`\` \`\`\`