Merge pull request #1740 from ape-nq/bugfix/webscrape-relative-links

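Use the URL constructor with a base URL (`new URL(href, base)`) to resolve all types of scraped links, relative or absolute, against the page currently being crawled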
2026-06-28 23:01:09 +03:00 · 2024-02-27 11:25:39 +08:00
parent 89307b13c9 92bdf1cc51
commit 714f82a234
1 changed file with 7 additions and 17 deletions
@@ -290,22 +290,12 @@ function getURLsFromHTML(htmlBody: string, baseURL: string): string[] {
    const linkElements = dom.window.document.querySelectorAll('a')
    const urls: string[] = []
    for (const linkElement of linkElements) {
-        if (linkElement.href.slice(0, 1) === '/') {
+        try {
-            try {
+            const urlObj = new URL(linkElement.href, baseURL)
-                const urlObj = new URL(baseURL + linkElement.href)
+            urls.push(urlObj.href)
-                urls.push(urlObj.href) //relative
+        } catch (err) {
-            } catch (err) {
+            if (process.env.DEBUG === 'true') console.error(`error with scraped URL: ${err.message}`)
-                if (process.env.DEBUG === 'true') console.error(`error with relative url: ${err.message}`)
+            continue
-                continue
-            }
-        } else {
-            try {
-                const urlObj = new URL(linkElement.href)
-                urls.push(urlObj.href) //absolute
-            } catch (err) {
-                if (process.env.DEBUG === 'true') console.error(`error with absolute url: ${err.message}`)
-                continue
-            }
        }
    }
    return urls
@@ -365,7 +355,7 @@ async function crawl(baseURL: string, currentURL: string, pages: string[], limit
        }
        const htmlBody = await resp.text()
-        const nextURLs = getURLsFromHTML(htmlBody, baseURL)
+        const nextURLs = getURLsFromHTML(htmlBody, currentURL)
        for (const nextURL of nextURLs) {
            pages = await crawl(baseURL, nextURL, pages, limit)
        }