Mirror of https://github.com/farcasclaudiu/Flowise.git (synced 2026-06-28 23:01:09 +03:00)
Merge pull request #1740 from ape-nq/bugfix/webscrape-relative-links
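Use the two-argument URL constructor, `new URL(href, base)`, to handle all types of scraped links (relative and absolute) in a single code path, and resolve them against the URL of the page currently being crawled instead of the crawl's base URL.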
This commit is contained in:
@@ -290,22 +290,12 @@ function getURLsFromHTML(htmlBody: string, baseURL: string): string[] {
|
|||||||
const linkElements = dom.window.document.querySelectorAll('a')
|
const linkElements = dom.window.document.querySelectorAll('a')
|
||||||
const urls: string[] = []
|
const urls: string[] = []
|
||||||
for (const linkElement of linkElements) {
|
for (const linkElement of linkElements) {
|
||||||
if (linkElement.href.slice(0, 1) === '/') {
|
try {
|
||||||
try {
|
const urlObj = new URL(linkElement.href, baseURL)
|
||||||
const urlObj = new URL(baseURL + linkElement.href)
|
urls.push(urlObj.href)
|
||||||
urls.push(urlObj.href) //relative
|
} catch (err) {
|
||||||
} catch (err) {
|
if (process.env.DEBUG === 'true') console.error(`error with scraped URL: ${err.message}`)
|
||||||
if (process.env.DEBUG === 'true') console.error(`error with relative url: ${err.message}`)
|
continue
|
||||||
continue
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
try {
|
|
||||||
const urlObj = new URL(linkElement.href)
|
|
||||||
urls.push(urlObj.href) //absolute
|
|
||||||
} catch (err) {
|
|
||||||
if (process.env.DEBUG === 'true') console.error(`error with absolute url: ${err.message}`)
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return urls
|
return urls
|
||||||
@@ -365,7 +355,7 @@ async function crawl(baseURL: string, currentURL: string, pages: string[], limit
|
|||||||
}
|
}
|
||||||
|
|
||||||
const htmlBody = await resp.text()
|
const htmlBody = await resp.text()
|
||||||
const nextURLs = getURLsFromHTML(htmlBody, baseURL)
|
const nextURLs = getURLsFromHTML(htmlBody, currentURL)
|
||||||
for (const nextURL of nextURLs) {
|
for (const nextURL of nextURLs) {
|
||||||
pages = await crawl(baseURL, nextURL, pages, limit)
|
pages = await crawl(baseURL, nextURL, pages, limit)
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user