diff --git a/Dockerfile b/Dockerfile index fc76cd00..e9470c31 100644 --- a/Dockerfile +++ b/Dockerfile @@ -6,6 +6,8 @@ FROM node:18-alpine RUN apk add --update libc6-compat python3 make g++ +# needed for pdfjs-dist +RUN apk add --no-cache build-base cairo-dev pango-dev WORKDIR /usr/src/packages diff --git a/docker/Dockerfile b/docker/Dockerfile index e4bf704a..15c4e0ac 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -4,6 +4,8 @@ USER root RUN apk add --no-cache git RUN apk add --no-cache python3 py3-pip make g++ +# needed for pdfjs-dist +RUN apk add --no-cache build-base cairo-dev pango-dev # You can install a specific version like: flowise@1.0.0 RUN npm install -g flowise diff --git a/packages/components/nodes/documentloaders/Pdf/Pdf.ts b/packages/components/nodes/documentloaders/Pdf/Pdf.ts index bc36f8cb..ddb7edb8 100644 --- a/packages/components/nodes/documentloaders/Pdf/Pdf.ts +++ b/packages/components/nodes/documentloaders/Pdf/Pdf.ts @@ -49,6 +49,13 @@ class Pdf_DocumentLoaders implements INode { ], default: 'perPage' }, + { + label: 'Use Legacy Build', + name: 'legacyBuild', + type: 'boolean', + optional: true, + additionalParams: true + }, { label: 'Metadata', name: 'metadata', @@ -64,6 +71,7 @@ class Pdf_DocumentLoaders implements INode { const pdfFileBase64 = nodeData.inputs?.pdfFile as string const usage = nodeData.inputs?.usage as string const metadata = nodeData.inputs?.metadata + const legacyBuild = nodeData.inputs?.legacyBuild as boolean let alldocs = [] let files: string[] = [] @@ -81,8 +89,9 @@ class Pdf_DocumentLoaders implements INode { if (usage === 'perFile') { const loader = new PDFLoader(new Blob([bf]), { splitPages: false, - // @ts-ignore - pdfjs: () => import('pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js') + pdfjs: () => + // @ts-ignore + legacyBuild ? import('pdfjs-dist/legacy/build/pdf.js') : import('pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js') }) if (textSplitter) { const docs = await loader.loadAndSplit(textSplitter) @@ -92,8 +101,11 @@ class Pdf_DocumentLoaders implements INode { alldocs.push(...docs) } } else { - // @ts-ignore - const loader = new PDFLoader(new Blob([bf]), { pdfjs: () => import('pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js') }) + const loader = new PDFLoader(new Blob([bf]), { + pdfjs: () => + // @ts-ignore + legacyBuild ? import('pdfjs-dist/legacy/build/pdf.js') : import('pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js') + }) if (textSplitter) { const docs = await loader.loadAndSplit(textSplitter) alldocs.push(...docs) diff --git a/packages/components/package.json b/packages/components/package.json index c9ceeea9..738c7752 100644 --- a/packages/components/package.json +++ b/packages/components/package.json @@ -39,9 +39,10 @@ "moment": "^2.29.3", "node-fetch": "^2.6.11", "pdf-parse": "^1.1.1", + "pdfjs-dist": "^3.7.107", "playwright": "^1.35.0", - "srt-parser-2": "^1.2.3", "puppeteer": "^20.7.1", + "srt-parser-2": "^1.2.3", "weaviate-ts-client": "^1.1.0", "ws": "^8.9.0" },