Feature/Add new doc store upsert and refresh API (#3556)

Add new doc store upsert and refresh API.
This commit is contained in:
Henry Heng
2024-11-25 15:47:13 +00:00
committed by GitHub
parent 36496b1611
commit a2c36b4447
15 changed files with 1424 additions and 803 deletions
@@ -26,7 +26,7 @@ class UnstructuredFile_DocumentLoaders implements INode {
constructor() {
this.label = 'Unstructured File Loader'
this.name = 'unstructuredFileLoader'
this.version = 3.0
this.version = 4.0
this.type = 'Document'
this.icon = 'unstructured-file.svg'
this.category = 'Document Loaders'
@@ -40,6 +40,7 @@ class UnstructuredFile_DocumentLoaders implements INode {
optional: true
}
this.inputs = [
/** Deprecated
{
label: 'File Path',
name: 'filePath',
@@ -49,6 +50,7 @@ class UnstructuredFile_DocumentLoaders implements INode {
warning:
'Use the File Upload instead of File path. If file is uploaded, this path is ignored. Path will be deprecated in future releases.'
},
*/
{
label: 'Files Upload',
name: 'fileObject',
@@ -200,7 +202,7 @@ class UnstructuredFile_DocumentLoaders implements INode {
{
label: 'Hi-Res Model Name',
name: 'hiResModelName',
description: 'The name of the inference model used when strategy is hi_res. Default: detectron2_onnx.',
description: 'The name of the inference model used when strategy is hi_res',
type: 'options',
options: [
{
@@ -227,8 +229,7 @@ class UnstructuredFile_DocumentLoaders implements INode {
}
],
optional: true,
additionalParams: true,
default: 'detectron2_onnx'
additionalParams: true
},
{
label: 'Chunking Strategy',
@@ -241,9 +242,21 @@ class UnstructuredFile_DocumentLoaders implements INode {
label: 'None',
name: 'None'
},
{
label: 'Basic',
name: 'basic'
},
{
label: 'By Title',
name: 'by_title'
},
{
label: 'By Page',
name: 'by_page'
},
{
label: 'By Similarity',
name: 'by_similarity'
}
],
optional: true,
@@ -434,15 +447,15 @@ class UnstructuredFile_DocumentLoaders implements INode {
: ([] as SkipInferTableTypes[])
const hiResModelName = nodeData.inputs?.hiResModelName as HiResModelName
const includePageBreaks = nodeData.inputs?.includePageBreaks as boolean
const chunkingStrategy = nodeData.inputs?.chunkingStrategy as 'None' | 'by_title'
const chunkingStrategy = nodeData.inputs?.chunkingStrategy as string
const metadata = nodeData.inputs?.metadata
const sourceIdKey = (nodeData.inputs?.sourceIdKey as string) || 'source'
const ocrLanguages = nodeData.inputs?.ocrLanguages ? JSON.parse(nodeData.inputs?.ocrLanguages as string) : ([] as string[])
const xmlKeepTags = nodeData.inputs?.xmlKeepTags as boolean
const multiPageSections = nodeData.inputs?.multiPageSections as boolean
const combineUnderNChars = nodeData.inputs?.combineUnderNChars as number
const newAfterNChars = nodeData.inputs?.newAfterNChars as number
const maxCharacters = nodeData.inputs?.maxCharacters as number
const combineUnderNChars = nodeData.inputs?.combineUnderNChars as string
const newAfterNChars = nodeData.inputs?.newAfterNChars as string
const maxCharacters = nodeData.inputs?.maxCharacters as string
const _omitMetadataKeys = nodeData.inputs?.omitMetadataKeys as string
let omitMetadataKeys: string[] = []
@@ -471,10 +484,19 @@ class UnstructuredFile_DocumentLoaders implements INode {
chunkingStrategy,
ocrLanguages,
xmlKeepTags,
multiPageSections,
combineUnderNChars,
newAfterNChars,
maxCharacters
multiPageSections
}
if (combineUnderNChars) {
obj.combineUnderNChars = parseInt(combineUnderNChars, 10)
}
if (newAfterNChars) {
obj.newAfterNChars = parseInt(newAfterNChars, 10)
}
if (maxCharacters) {
obj.maxCharacters = parseInt(maxCharacters, 10)
}
const credentialData = await getCredentialData(nodeData.credential ?? '', options)