mirror of
https://github.com/farcasclaudiu/Flowise.git
synced 2026-06-28 21:00:58 +03:00
Use selected links if available when scraping in cheerio, puppeteer, and playwright nodes
This commit is contained in:
@@ -90,6 +90,7 @@ class Cheerio_DocumentLoaders implements INode {
|
|||||||
const textSplitter = nodeData.inputs?.textSplitter as TextSplitter
|
const textSplitter = nodeData.inputs?.textSplitter as TextSplitter
|
||||||
const metadata = nodeData.inputs?.metadata
|
const metadata = nodeData.inputs?.metadata
|
||||||
const relativeLinksMethod = nodeData.inputs?.relativeLinksMethod as string
|
const relativeLinksMethod = nodeData.inputs?.relativeLinksMethod as string
|
||||||
|
const selectedLinks = nodeData.inputs?.selectedLinks as string[]
|
||||||
let limit = nodeData.inputs?.limit as string
|
let limit = nodeData.inputs?.limit as string
|
||||||
|
|
||||||
let url = nodeData.inputs?.url as string
|
let url = nodeData.inputs?.url as string
|
||||||
@@ -127,13 +128,22 @@ class Cheerio_DocumentLoaders implements INode {
|
|||||||
if (!limit) limit = '10'
|
if (!limit) limit = '10'
|
||||||
else if (parseInt(limit) < 0) throw new Error('Limit cannot be less than 0')
|
else if (parseInt(limit) < 0) throw new Error('Limit cannot be less than 0')
|
||||||
const pages: string[] =
|
const pages: string[] =
|
||||||
relativeLinksMethod === 'webCrawl' ? await webCrawl(url, parseInt(limit)) : await xmlScrape(url, parseInt(limit))
|
selectedLinks && selectedLinks.length > 0
|
||||||
|
? selectedLinks.slice(0, parseInt(limit))
|
||||||
|
: relativeLinksMethod === 'webCrawl'
|
||||||
|
? await webCrawl(url, parseInt(limit))
|
||||||
|
: await xmlScrape(url, parseInt(limit))
|
||||||
if (process.env.DEBUG === 'true') console.info(`pages: ${JSON.stringify(pages)}, length: ${pages.length}`)
|
if (process.env.DEBUG === 'true') console.info(`pages: ${JSON.stringify(pages)}, length: ${pages.length}`)
|
||||||
if (!pages || pages.length === 0) throw new Error('No relative links found')
|
if (!pages || pages.length === 0) throw new Error('No relative links found')
|
||||||
for (const page of pages) {
|
for (const page of pages) {
|
||||||
docs.push(...(await cheerioLoader(page)))
|
docs.push(...(await cheerioLoader(page)))
|
||||||
}
|
}
|
||||||
if (process.env.DEBUG === 'true') console.info(`Finish ${relativeLinksMethod}`)
|
if (process.env.DEBUG === 'true') console.info(`Finish ${relativeLinksMethod}`)
|
||||||
|
} else if (selectedLinks && selectedLinks.length > 0) {
|
||||||
|
if (process.env.DEBUG === 'true') console.info(`pages: ${JSON.stringify(selectedLinks)}, length: ${selectedLinks.length}`)
|
||||||
|
for (const page of selectedLinks) {
|
||||||
|
docs.push(...(await cheerioLoader(page)))
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
docs = await cheerioLoader(url)
|
docs = await cheerioLoader(url)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -118,6 +118,7 @@ class Playwright_DocumentLoaders implements INode {
|
|||||||
const textSplitter = nodeData.inputs?.textSplitter as TextSplitter
|
const textSplitter = nodeData.inputs?.textSplitter as TextSplitter
|
||||||
const metadata = nodeData.inputs?.metadata
|
const metadata = nodeData.inputs?.metadata
|
||||||
const relativeLinksMethod = nodeData.inputs?.relativeLinksMethod as string
|
const relativeLinksMethod = nodeData.inputs?.relativeLinksMethod as string
|
||||||
|
const selectedLinks = nodeData.inputs?.selectedLinks as string[]
|
||||||
let limit = nodeData.inputs?.limit as string
|
let limit = nodeData.inputs?.limit as string
|
||||||
let waitUntilGoToOption = nodeData.inputs?.waitUntilGoToOption as 'load' | 'domcontentloaded' | 'networkidle' | 'commit' | undefined
|
let waitUntilGoToOption = nodeData.inputs?.waitUntilGoToOption as 'load' | 'domcontentloaded' | 'networkidle' | 'commit' | undefined
|
||||||
let waitForSelector = nodeData.inputs?.waitForSelector as string
|
let waitForSelector = nodeData.inputs?.waitForSelector as string
|
||||||
@@ -168,13 +169,22 @@ class Playwright_DocumentLoaders implements INode {
|
|||||||
if (!limit) limit = '10'
|
if (!limit) limit = '10'
|
||||||
else if (parseInt(limit) < 0) throw new Error('Limit cannot be less than 0')
|
else if (parseInt(limit) < 0) throw new Error('Limit cannot be less than 0')
|
||||||
const pages: string[] =
|
const pages: string[] =
|
||||||
relativeLinksMethod === 'webCrawl' ? await webCrawl(url, parseInt(limit)) : await xmlScrape(url, parseInt(limit))
|
selectedLinks && selectedLinks.length > 0
|
||||||
|
? selectedLinks.slice(0, parseInt(limit))
|
||||||
|
: relativeLinksMethod === 'webCrawl'
|
||||||
|
? await webCrawl(url, parseInt(limit))
|
||||||
|
: await xmlScrape(url, parseInt(limit))
|
||||||
if (process.env.DEBUG === 'true') console.info(`pages: ${JSON.stringify(pages)}, length: ${pages.length}`)
|
if (process.env.DEBUG === 'true') console.info(`pages: ${JSON.stringify(pages)}, length: ${pages.length}`)
|
||||||
if (!pages || pages.length === 0) throw new Error('No relative links found')
|
if (!pages || pages.length === 0) throw new Error('No relative links found')
|
||||||
for (const page of pages) {
|
for (const page of pages) {
|
||||||
docs.push(...(await playwrightLoader(page)))
|
docs.push(...(await playwrightLoader(page)))
|
||||||
}
|
}
|
||||||
if (process.env.DEBUG === 'true') console.info(`Finish ${relativeLinksMethod}`)
|
if (process.env.DEBUG === 'true') console.info(`Finish ${relativeLinksMethod}`)
|
||||||
|
} else if (selectedLinks && selectedLinks.length > 0) {
|
||||||
|
if (process.env.DEBUG === 'true') console.info(`pages: ${JSON.stringify(selectedLinks)}, length: ${selectedLinks.length}`)
|
||||||
|
for (const page of selectedLinks) {
|
||||||
|
docs.push(...(await playwrightLoader(page)))
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
docs = await playwrightLoader(url)
|
docs = await playwrightLoader(url)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -119,6 +119,7 @@ class Puppeteer_DocumentLoaders implements INode {
|
|||||||
const textSplitter = nodeData.inputs?.textSplitter as TextSplitter
|
const textSplitter = nodeData.inputs?.textSplitter as TextSplitter
|
||||||
const metadata = nodeData.inputs?.metadata
|
const metadata = nodeData.inputs?.metadata
|
||||||
const relativeLinksMethod = nodeData.inputs?.relativeLinksMethod as string
|
const relativeLinksMethod = nodeData.inputs?.relativeLinksMethod as string
|
||||||
|
const selectedLinks = nodeData.inputs?.selectedLinks as string[]
|
||||||
let limit = nodeData.inputs?.limit as string
|
let limit = nodeData.inputs?.limit as string
|
||||||
let waitUntilGoToOption = nodeData.inputs?.waitUntilGoToOption as PuppeteerLifeCycleEvent
|
let waitUntilGoToOption = nodeData.inputs?.waitUntilGoToOption as PuppeteerLifeCycleEvent
|
||||||
let waitForSelector = nodeData.inputs?.waitForSelector as string
|
let waitForSelector = nodeData.inputs?.waitForSelector as string
|
||||||
@@ -169,13 +170,22 @@ class Puppeteer_DocumentLoaders implements INode {
|
|||||||
if (!limit) limit = '10'
|
if (!limit) limit = '10'
|
||||||
else if (parseInt(limit) < 0) throw new Error('Limit cannot be less than 0')
|
else if (parseInt(limit) < 0) throw new Error('Limit cannot be less than 0')
|
||||||
const pages: string[] =
|
const pages: string[] =
|
||||||
relativeLinksMethod === 'webCrawl' ? await webCrawl(url, parseInt(limit)) : await xmlScrape(url, parseInt(limit))
|
selectedLinks && selectedLinks.length > 0
|
||||||
|
? selectedLinks.slice(0, parseInt(limit))
|
||||||
|
: relativeLinksMethod === 'webCrawl'
|
||||||
|
? await webCrawl(url, parseInt(limit))
|
||||||
|
: await xmlScrape(url, parseInt(limit))
|
||||||
if (process.env.DEBUG === 'true') console.info(`pages: ${JSON.stringify(pages)}, length: ${pages.length}`)
|
if (process.env.DEBUG === 'true') console.info(`pages: ${JSON.stringify(pages)}, length: ${pages.length}`)
|
||||||
if (!pages || pages.length === 0) throw new Error('No relative links found')
|
if (!pages || pages.length === 0) throw new Error('No relative links found')
|
||||||
for (const page of pages) {
|
for (const page of pages) {
|
||||||
docs.push(...(await puppeteerLoader(page)))
|
docs.push(...(await puppeteerLoader(page)))
|
||||||
}
|
}
|
||||||
if (process.env.DEBUG === 'true') console.info(`Finish ${relativeLinksMethod}`)
|
if (process.env.DEBUG === 'true') console.info(`Finish ${relativeLinksMethod}`)
|
||||||
|
} else if (selectedLinks && selectedLinks.length > 0) {
|
||||||
|
if (process.env.DEBUG === 'true') console.info(`pages: ${JSON.stringify(selectedLinks)}, length: ${selectedLinks.length}`)
|
||||||
|
for (const page of selectedLinks) {
|
||||||
|
docs.push(...(await puppeteerLoader(page)))
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
docs = await puppeteerLoader(url)
|
docs = await puppeteerLoader(url)
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user