Merge pull request #1740 from ape-nq/bugfix/webscrape-relative-links

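Use the URL constructor's base argument (`new URL(href, base)`) to resolve scraped links, so relative and absolute hrefs are handled by a single code path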
This commit is contained in:
Ong Chung Yau
2024-02-27 11:25:39 +08:00
committed by GitHub
+7 -17
View File
@@ -290,22 +290,12 @@ function getURLsFromHTML(htmlBody: string, baseURL: string): string[] {
const linkElements = dom.window.document.querySelectorAll('a') const linkElements = dom.window.document.querySelectorAll('a')
const urls: string[] = [] const urls: string[] = []
for (const linkElement of linkElements) { for (const linkElement of linkElements) {
if (linkElement.href.slice(0, 1) === '/') { try {
try { const urlObj = new URL(linkElement.href, baseURL)
const urlObj = new URL(baseURL + linkElement.href) urls.push(urlObj.href)
urls.push(urlObj.href) //relative } catch (err) {
} catch (err) { if (process.env.DEBUG === 'true') console.error(`error with scraped URL: ${err.message}`)
if (process.env.DEBUG === 'true') console.error(`error with relative url: ${err.message}`) continue
continue
}
} else {
try {
const urlObj = new URL(linkElement.href)
urls.push(urlObj.href) //absolute
} catch (err) {
if (process.env.DEBUG === 'true') console.error(`error with absolute url: ${err.message}`)
continue
}
} }
} }
return urls return urls
@@ -365,7 +355,7 @@ async function crawl(baseURL: string, currentURL: string, pages: string[], limit
} }
const htmlBody = await resp.text() const htmlBody = await resp.text()
const nextURLs = getURLsFromHTML(htmlBody, baseURL) const nextURLs = getURLsFromHTML(htmlBody, currentURL)
for (const nextURL of nextURLs) { for (const nextURL of nextURLs) {
pages = await crawl(baseURL, nextURL, pages, limit) pages = await crawl(baseURL, nextURL, pages, limit)
} }