Merge pull request #322 from FlowiseAI/bugfix/Weaviate-PDF

Bugfix/pdf loader add legacy option
This commit is contained in:
Henry Heng
2023-06-16 15:28:07 +01:00
committed by GitHub
4 changed files with 22 additions and 5 deletions
+2
View File
@@ -6,6 +6,8 @@
FROM node:18-alpine FROM node:18-alpine
RUN apk add --update libc6-compat python3 make g++ RUN apk add --update libc6-compat python3 make g++
# needed for pdfjs-dist
RUN apk add --no-cache build-base cairo-dev pango-dev
WORKDIR /usr/src/packages WORKDIR /usr/src/packages
+2
View File
@@ -4,6 +4,8 @@ USER root
RUN apk add --no-cache git RUN apk add --no-cache git
RUN apk add --no-cache python3 py3-pip make g++ RUN apk add --no-cache python3 py3-pip make g++
# needed for pdfjs-dist
RUN apk add --no-cache build-base cairo-dev pango-dev
# You can install a specific version like: flowise@1.0.0 # You can install a specific version like: flowise@1.0.0
RUN npm install -g flowise RUN npm install -g flowise
@@ -49,6 +49,13 @@ class Pdf_DocumentLoaders implements INode {
], ],
default: 'perPage' default: 'perPage'
}, },
{
label: 'Use Legacy Build',
name: 'legacyBuild',
type: 'boolean',
optional: true,
additionalParams: true
},
{ {
label: 'Metadata', label: 'Metadata',
name: 'metadata', name: 'metadata',
@@ -64,6 +71,7 @@ class Pdf_DocumentLoaders implements INode {
const pdfFileBase64 = nodeData.inputs?.pdfFile as string const pdfFileBase64 = nodeData.inputs?.pdfFile as string
const usage = nodeData.inputs?.usage as string const usage = nodeData.inputs?.usage as string
const metadata = nodeData.inputs?.metadata const metadata = nodeData.inputs?.metadata
const legacyBuild = nodeData.inputs?.legacyBuild as boolean
let alldocs = [] let alldocs = []
let files: string[] = [] let files: string[] = []
@@ -81,8 +89,9 @@ class Pdf_DocumentLoaders implements INode {
if (usage === 'perFile') { if (usage === 'perFile') {
const loader = new PDFLoader(new Blob([bf]), { const loader = new PDFLoader(new Blob([bf]), {
splitPages: false, splitPages: false,
// @ts-ignore pdfjs: () =>
pdfjs: () => import('pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js') // @ts-ignore
legacyBuild ? import('pdfjs-dist/legacy/build/pdf.js') : import('pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js')
}) })
if (textSplitter) { if (textSplitter) {
const docs = await loader.loadAndSplit(textSplitter) const docs = await loader.loadAndSplit(textSplitter)
@@ -92,8 +101,11 @@ class Pdf_DocumentLoaders implements INode {
alldocs.push(...docs) alldocs.push(...docs)
} }
} else { } else {
// @ts-ignore const loader = new PDFLoader(new Blob([bf]), {
const loader = new PDFLoader(new Blob([bf]), { pdfjs: () => import('pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js') }) pdfjs: () =>
// @ts-ignore
legacyBuild ? import('pdfjs-dist/legacy/build/pdf.js') : import('pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js')
})
if (textSplitter) { if (textSplitter) {
const docs = await loader.loadAndSplit(textSplitter) const docs = await loader.loadAndSplit(textSplitter)
alldocs.push(...docs) alldocs.push(...docs)
+2 -1
View File
@@ -39,9 +39,10 @@
"moment": "^2.29.3", "moment": "^2.29.3",
"node-fetch": "^2.6.11", "node-fetch": "^2.6.11",
"pdf-parse": "^1.1.1", "pdf-parse": "^1.1.1",
"pdfjs-dist": "^3.7.107",
"playwright": "^1.35.0", "playwright": "^1.35.0",
"srt-parser-2": "^1.2.3",
"puppeteer": "^20.7.1", "puppeteer": "^20.7.1",
"srt-parser-2": "^1.2.3",
"weaviate-ts-client": "^1.1.0", "weaviate-ts-client": "^1.1.0",
"ws": "^8.9.0" "ws": "^8.9.0"
}, },