Merge pull request #1046 from FlowiseAI/feature/UnstructuredLoader

Feature/unstructured loader
This commit is contained in:
Henry Heng
2023-10-13 13:45:32 +01:00
committed by GitHub
9 changed files with 455 additions and 2 deletions
@@ -0,0 +1,26 @@
import { INodeParams, INodeCredential } from '../src/Interface'
/**
 * Credential definition for the Unstructured API.
 *
 * Declares a single password-type input, `unstructuredAPIKey`, which the
 * Unstructured loader nodes look up by that name.
 */
class UnstructuredApi implements INodeCredential {
    label: string = 'Unstructured API'
    name: string = 'unstructuredApi'
    version: number = 1.0
    // Rendered as HTML: contains a link to the page explaining how to get a key.
    description: string =
        'Refer to <a target="_blank" href="https://unstructured.io/#get-api-key">official guide</a> on how to get api key on Unstructured'
    inputs: INodeParams[] = [
        {
            label: 'API Key',
            name: 'unstructuredAPIKey',
            type: 'password'
        }
    ]
}
// Expose the credential class under the `credClass` key of the CommonJS exports.
module.exports = { credClass: UnstructuredApi }
@@ -0,0 +1,162 @@
import { ICommonObject, INode, INodeData, INodeParams } from '../../../src/Interface'
import { UnstructuredLoader, UnstructuredLoaderOptions } from 'langchain/document_loaders/fs/unstructured'
import { getCredentialData, getCredentialParam } from '../../../src/utils'
/**
 * Document-loader node that loads one file through langchain's
 * `UnstructuredLoader` and returns the resulting documents, optionally
 * restricted to selected element categories and enriched with extra metadata.
 */
class UnstructuredFile_DocumentLoaders implements INode {
    /** Element categories offered by the "Element Type" multi-select, in display order. */
    private static readonly ELEMENT_TYPE_NAMES = [
        'FigureCaption',
        'NarrativeText',
        'ListItem',
        'Title',
        'Address',
        'Table',
        'PageBreak',
        'Header',
        'Footer',
        'UncategorizedText',
        'Image',
        'Formula'
    ]

    label: string
    name: string
    version: number
    description: string
    type: string
    icon: string
    category: string
    baseClasses: string[]
    credential: INodeParams
    inputs: INodeParams[]

    constructor() {
        this.label = 'Unstructured File Loader'
        this.name = 'unstructuredFileLoader'
        this.version = 1.0
        this.type = 'Document'
        this.icon = 'unstructured.png'
        this.category = 'Document Loaders'
        this.description = 'Use Unstructured.io to load data from a file path'
        this.baseClasses = [this.type]
        // The API key is optional: it is only forwarded to the loader when a credential supplies one.
        this.credential = {
            label: 'Connect Credential',
            name: 'credential',
            type: 'credential',
            credentialNames: ['unstructuredApi'],
            optional: true
        }
        this.inputs = [
            {
                label: 'File Path',
                name: 'filePath',
                type: 'string',
                placeholder: ''
            },
            {
                label: 'Unstructured API URL',
                name: 'unstructuredAPIUrl',
                description:
                    'Unstructured API URL. Read <a target="_blank" href="https://unstructured-io.github.io/unstructured/introduction.html#getting-started">more</a> on how to get started',
                type: 'string',
                default: 'http://localhost:8000/general/v0/general'
            },
            {
                label: 'Element Type',
                name: 'elementType',
                description:
                    'Unstructured partition document into different types, select the types to return. If not selected, all types will be returned',
                type: 'multiOptions',
                // Label and name are identical for every category, so derive both from one list.
                options: UnstructuredFile_DocumentLoaders.ELEMENT_TYPE_NAMES.map((elementName) => ({
                    label: elementName,
                    name: elementName
                })),
                default: [],
                optional: true,
                additionalParams: true
            },
            {
                label: 'Metadata',
                name: 'metadata',
                type: 'json',
                optional: true,
                additionalParams: true
            }
        ]
    }

    /**
     * Normalises the raw `elementType` input into a list of category names.
     *
     * Accepts either an actual array or a JSON-encoded array string. Anything
     * else (empty string, invalid JSON, a JSON scalar or object) yields an empty
     * list, which callers treat as "no filtering". Non-string array entries are dropped.
     *
     * @param raw value read from `nodeData.inputs.elementType`
     * @returns the selected category names, possibly empty
     */
    private static parseElementTypes(raw: unknown): string[] {
        let candidate: unknown = raw
        if (typeof raw === 'string') {
            if (!raw.trim()) return []
            try {
                candidate = JSON.parse(raw)
            } catch {
                // Best effort: an unparsable selection means "return every element type".
                return []
            }
        }
        if (!Array.isArray(candidate)) return []
        return candidate.filter((item): item is string => typeof item === 'string')
    }

    /**
     * Loads the configured file and returns its documents.
     *
     * @param nodeData node configuration; reads `inputs.filePath`, `inputs.unstructuredAPIUrl`,
     *                 `inputs.elementType`, `inputs.metadata` and `credential`
     * @param _ unused
     * @param options passed through to `getCredentialData`
     * @returns the loaded documents, limited to the selected element categories (all of them
     *          when none is selected), each with the extra metadata merged over its own
     * @throws SyntaxError when `inputs.metadata` is a string that is not valid JSON
     */
    async init(nodeData: INodeData, _: string, options: ICommonObject): Promise<any> {
        const filePath = nodeData.inputs?.filePath as string
        const unstructuredAPIUrl = nodeData.inputs?.unstructuredAPIUrl as string
        const metadata = nodeData.inputs?.metadata

        // Validate user input before the remote call so bad metadata fails fast.
        const elementTypes = UnstructuredFile_DocumentLoaders.parseElementTypes(nodeData.inputs?.elementType)
        const extraMetadata = metadata ? (typeof metadata === 'object' ? metadata : JSON.parse(metadata)) : undefined

        const obj: UnstructuredLoaderOptions = { apiUrl: unstructuredAPIUrl }

        const credentialData = await getCredentialData(nodeData.credential ?? '', options)
        const unstructuredAPIKey = getCredentialParam('unstructuredAPIKey', credentialData, nodeData)
        if (unstructuredAPIKey) obj.apiKey = unstructuredAPIKey

        const loader = new UnstructuredLoader(filePath, obj)
        const docs = await loader.load()

        // Filter on the category reported by the loader, before user metadata is merged,
        // so a custom `category` key cannot interfere with the selection.
        const selectedDocs = elementTypes.length ? docs.filter((doc) => elementTypes.includes(doc.metadata.category)) : docs
        if (!extraMetadata) return selectedDocs

        return selectedDocs.map((doc) => ({
            ...doc,
            metadata: {
                ...doc.metadata,
                ...extraMetadata
            }
        }))
    }
}
// Expose the file-loader node class under the `nodeClass` key of the CommonJS exports.
module.exports = { nodeClass: UnstructuredFile_DocumentLoaders }
@@ -0,0 +1,162 @@
import { ICommonObject, INode, INodeData, INodeParams } from '../../../src/Interface'
import { UnstructuredDirectoryLoader, UnstructuredLoaderOptions } from 'langchain/document_loaders/fs/unstructured'
import { getCredentialData, getCredentialParam } from '../../../src/utils'
/**
 * Document-loader node that loads a folder through langchain's
 * `UnstructuredDirectoryLoader` and returns the resulting documents, optionally
 * restricted to selected element categories and enriched with extra metadata.
 */
class UnstructuredFolder_DocumentLoaders implements INode {
    /** Element categories offered by the "Element Type" multi-select, in display order. */
    private static readonly ELEMENT_TYPE_NAMES = [
        'FigureCaption',
        'NarrativeText',
        'ListItem',
        'Title',
        'Address',
        'Table',
        'PageBreak',
        'Header',
        'Footer',
        'UncategorizedText',
        'Image',
        'Formula'
    ]

    label: string
    name: string
    version: number
    description: string
    type: string
    icon: string
    category: string
    baseClasses: string[]
    credential: INodeParams
    inputs: INodeParams[]

    constructor() {
        this.label = 'Unstructured Folder Loader'
        this.name = 'unstructuredFolderLoader'
        this.version = 1.0
        this.type = 'Document'
        this.icon = 'unstructured.png'
        this.category = 'Document Loaders'
        this.description = 'Use Unstructured.io to load data from a folder'
        this.baseClasses = [this.type]
        // The API key is optional: it is only forwarded to the loader when a credential supplies one.
        this.credential = {
            label: 'Connect Credential',
            name: 'credential',
            type: 'credential',
            credentialNames: ['unstructuredApi'],
            optional: true
        }
        this.inputs = [
            {
                label: 'Folder Path',
                name: 'folderPath',
                type: 'string',
                placeholder: ''
            },
            {
                label: 'Unstructured API URL',
                name: 'unstructuredAPIUrl',
                description:
                    'Unstructured API URL. Read <a target="_blank" href="https://unstructured-io.github.io/unstructured/introduction.html#getting-started">more</a> on how to get started',
                type: 'string',
                default: 'http://localhost:8000/general/v0/general'
            },
            {
                label: 'Element Type',
                name: 'elementType',
                description:
                    'Unstructured partition document into different types, select the types to return. If not selected, all types will be returned',
                type: 'multiOptions',
                // Label and name are identical for every category, so derive both from one list.
                options: UnstructuredFolder_DocumentLoaders.ELEMENT_TYPE_NAMES.map((elementName) => ({
                    label: elementName,
                    name: elementName
                })),
                default: [],
                optional: true,
                additionalParams: true
            },
            {
                label: 'Metadata',
                name: 'metadata',
                type: 'json',
                optional: true,
                additionalParams: true
            }
        ]
    }

    /**
     * Normalises the raw `elementType` input into a list of category names.
     *
     * Accepts either an actual array or a JSON-encoded array string. Anything
     * else (empty string, invalid JSON, a JSON scalar or object) yields an empty
     * list, which callers treat as "no filtering". Non-string array entries are dropped.
     *
     * @param raw value read from `nodeData.inputs.elementType`
     * @returns the selected category names, possibly empty
     */
    private static parseElementTypes(raw: unknown): string[] {
        let candidate: unknown = raw
        if (typeof raw === 'string') {
            if (!raw.trim()) return []
            try {
                candidate = JSON.parse(raw)
            } catch {
                // Best effort: an unparsable selection means "return every element type".
                return []
            }
        }
        if (!Array.isArray(candidate)) return []
        return candidate.filter((item): item is string => typeof item === 'string')
    }

    /**
     * Loads the configured folder and returns its documents.
     *
     * @param nodeData node configuration; reads `inputs.folderPath`, `inputs.unstructuredAPIUrl`,
     *                 `inputs.elementType`, `inputs.metadata` and `credential`
     * @param _ unused
     * @param options passed through to `getCredentialData`
     * @returns the loaded documents, limited to the selected element categories (all of them
     *          when none is selected), each with the extra metadata merged over its own
     * @throws SyntaxError when `inputs.metadata` is a string that is not valid JSON
     */
    async init(nodeData: INodeData, _: string, options: ICommonObject): Promise<any> {
        const folderPath = nodeData.inputs?.folderPath as string
        const unstructuredAPIUrl = nodeData.inputs?.unstructuredAPIUrl as string
        const metadata = nodeData.inputs?.metadata

        // Validate user input before the remote call so bad metadata fails fast.
        const elementTypes = UnstructuredFolder_DocumentLoaders.parseElementTypes(nodeData.inputs?.elementType)
        const extraMetadata = metadata ? (typeof metadata === 'object' ? metadata : JSON.parse(metadata)) : undefined

        const obj: UnstructuredLoaderOptions = { apiUrl: unstructuredAPIUrl }

        const credentialData = await getCredentialData(nodeData.credential ?? '', options)
        const unstructuredAPIKey = getCredentialParam('unstructuredAPIKey', credentialData, nodeData)
        if (unstructuredAPIKey) obj.apiKey = unstructuredAPIKey

        const loader = new UnstructuredDirectoryLoader(folderPath, obj)
        const docs = await loader.load()

        // Filter on the category reported by the loader, before user metadata is merged,
        // so a custom `category` key cannot interfere with the selection.
        const selectedDocs = elementTypes.length ? docs.filter((doc) => elementTypes.includes(doc.metadata.category)) : docs
        if (!extraMetadata) return selectedDocs

        return selectedDocs.map((doc) => ({
            ...doc,
            metadata: {
                ...doc.metadata,
                ...extraMetadata
            }
        }))
    }
}
// Expose the folder-loader node class under the `nodeClass` key of the CommonJS exports.
module.exports = { nodeClass: UnstructuredFolder_DocumentLoaders }
Binary file not shown.

After

Width:  |  Height:  |  Size: 16 KiB

@@ -18,7 +18,7 @@ class ElasicsearchExisting_VectorStores extends ElasticSearchBase implements INo
async constructVectorStore(
embeddings: Embeddings,
elasticSearchClientArgs: ElasticClientArgs,
docs: Document<Record<string, any>>[] | undefined
_: Document<Record<string, any>>[] | undefined
): Promise<VectorStore> {
return await ElasticVectorSearch.fromExistingIndex(embeddings, elasticSearchClientArgs)
}
+1
View File
@@ -5,6 +5,7 @@
export type NodeParamsType =
| 'asyncOptions'
| 'options'
| 'multiOptions'
| 'string'
| 'number'
| 'boolean'
@@ -0,0 +1,79 @@
import { useState } from 'react'
import { useSelector } from 'react-redux'
import { Popper, FormControl, TextField, Box, Typography } from '@mui/material'
import Autocomplete, { autocompleteClasses } from '@mui/material/Autocomplete'
import { styled } from '@mui/material/styles'
import PropTypes from 'prop-types'
// Popper used for the Autocomplete option list: adds a drop shadow and rounded
// corners, and gives any <ul> nested inside the listbox padding and margin.
const StyledPopper = styled(Popper)({
    boxShadow: '0px 8px 10px -5px rgb(0 0 0 / 20%), 0px 16px 24px 2px rgb(0 0 0 / 14%), 0px 6px 30px 5px rgb(0 0 0 / 12%)',
    borderRadius: '10px',
    // Target the listbox through the class name exported by MUI rather than a hard-coded string.
    ['& .' + autocompleteClasses.listbox]: {
        boxSizing: 'border-box',
        '& ul': { padding: 10, margin: 10 }
    }
})
export const MultiDropdown = ({ name, value, options, onSelect, disabled = false, disableClearable = false }) => {
const customization = useSelector((state) => state.customization)
const findMatchingOptions = (options = [], internalValue) => {
let values = []
if (internalValue && typeof internalValue === 'string') values = JSON.parse(internalValue)
else values = internalValue
return options.filter((option) => values.includes(option.name))
}
const getDefaultOptionValue = () => []
let [internalValue, setInternalValue] = useState(value ?? [])
return (
<FormControl sx={{ mt: 1, width: '100%' }} size='small'>
<Autocomplete
id={name}
disabled={disabled}
disableClearable={disableClearable}
size='small'
multiple
filterSelectedOptions
options={options || []}
value={findMatchingOptions(options, internalValue) || getDefaultOptionValue()}
onChange={(e, selections) => {
let value = ''
if (selections.length) {
const selectionNames = []
for (let i = 0; i < selections.length; i += 1) {
selectionNames.push(selections[i].name)
}
value = JSON.stringify(selectionNames)
}
setInternalValue(value)
onSelect(value)
}}
PopperComponent={StyledPopper}
renderInput={(params) => <TextField {...params} value={internalValue} />}
renderOption={(props, option) => (
<Box component='li' {...props}>
<div style={{ display: 'flex', flexDirection: 'column' }}>
<Typography variant='h5'>{option.label}</Typography>
{option.description && (
<Typography sx={{ color: customization.isDarkMode ? '#9e9e9e' : '' }}>{option.description}</Typography>
)}
</div>
</Box>
)}
/>
</FormControl>
)
}
// Runtime prop validation for MultiDropdown.
MultiDropdown.propTypes = {
    name: PropTypes.string,
    // Either the JSON-encoded array string that onSelect emits, or a plain array
    // (for example an input definition's `default: []`).
    value: PropTypes.oneOfType([PropTypes.string, PropTypes.array]),
    options: PropTypes.array,
    onSelect: PropTypes.func,
    disabled: PropTypes.bool,
    disableClearable: PropTypes.bool
}
+14 -1
View File
@@ -39,7 +39,20 @@ export const initNode = (nodeData, newNodeId) => {
const incoming = nodeData.inputs ? nodeData.inputs.length : 0
const outgoing = 1
const whitelistTypes = ['asyncOptions', 'options', 'string', 'number', 'boolean', 'password', 'json', 'code', 'date', 'file', 'folder']
const whitelistTypes = [
'asyncOptions',
'options',
'multiOptions',
'string',
'number',
'boolean',
'password',
'json',
'code',
'date',
'file',
'folder'
]
// Inputs
for (let i = 0; i < incoming; i += 1) {
@@ -11,6 +11,7 @@ import { IconArrowsMaximize, IconEdit, IconAlertTriangle } from '@tabler/icons'
// project import
import { Dropdown } from 'ui-component/dropdown/Dropdown'
import { MultiDropdown } from 'ui-component/dropdown/MultiDropdown'
import { AsyncDropdown } from 'ui-component/dropdown/AsyncDropdown'
import { Input } from 'ui-component/input/Input'
import { File } from 'ui-component/file/File'
@@ -308,6 +309,15 @@ const NodeInputHandler = ({ inputAnchor, inputParam, data, disabled = false, isA
value={data.inputs[inputParam.name] ?? inputParam.default ?? 'choose an option'}
/>
)}
{inputParam.type === 'multiOptions' && (
<MultiDropdown
disabled={disabled}
name={inputParam.name}
options={inputParam.options}
onSelect={(newValue) => (data.inputs[inputParam.name] = newValue)}
value={data.inputs[inputParam.name] ?? inputParam.default ?? 'choose an option'}
/>
)}
{inputParam.type === 'asyncOptions' && (
<>
{data.inputParams.length === 1 && <div style={{ marginTop: 10 }} />}