Merge pull request #1687 from 0xi4o/bug/scrap-limit

Fix: relative links method and limit not applying to manage links
This commit is contained in:
Ilango
2024-02-12 12:35:40 +05:30
committed by GitHub
7 changed files with 73 additions and 17 deletions
@@ -126,7 +126,9 @@ class Cheerio_DocumentLoaders implements INode {
let docs = [] let docs = []
if (relativeLinksMethod) { if (relativeLinksMethod) {
if (process.env.DEBUG === 'true') options.logger.info(`Start ${relativeLinksMethod}`) if (process.env.DEBUG === 'true') options.logger.info(`Start ${relativeLinksMethod}`)
if (!limit) limit = 10 // if limit is 0 we don't want it to default to 10 so we check explicitly for null or undefined
// so when limit is 0 we can fetch all the links
if (limit === null || limit === undefined) limit = 10
else if (limit < 0) throw new Error('Limit cannot be less than 0') else if (limit < 0) throw new Error('Limit cannot be less than 0')
const pages: string[] = const pages: string[] =
selectedLinks && selectedLinks.length > 0 selectedLinks && selectedLinks.length > 0
@@ -143,7 +145,7 @@ class Cheerio_DocumentLoaders implements INode {
} else if (selectedLinks && selectedLinks.length > 0) { } else if (selectedLinks && selectedLinks.length > 0) {
if (process.env.DEBUG === 'true') if (process.env.DEBUG === 'true')
options.logger.info(`pages: ${JSON.stringify(selectedLinks)}, length: ${selectedLinks.length}`) options.logger.info(`pages: ${JSON.stringify(selectedLinks)}, length: ${selectedLinks.length}`)
for (const page of selectedLinks) { for (const page of selectedLinks.slice(0, limit)) {
docs.push(...(await cheerioLoader(page))) docs.push(...(await cheerioLoader(page)))
} }
} else { } else {
@@ -167,7 +167,9 @@ class Playwright_DocumentLoaders implements INode {
let docs = [] let docs = []
if (relativeLinksMethod) { if (relativeLinksMethod) {
if (process.env.DEBUG === 'true') options.logger.info(`Start ${relativeLinksMethod}`) if (process.env.DEBUG === 'true') options.logger.info(`Start ${relativeLinksMethod}`)
if (!limit) limit = 10 // if limit is 0 we don't want it to default to 10 so we check explicitly for null or undefined
// so when limit is 0 we can fetch all the links
if (limit === null || limit === undefined) limit = 10
else if (limit < 0) throw new Error('Limit cannot be less than 0') else if (limit < 0) throw new Error('Limit cannot be less than 0')
const pages: string[] = const pages: string[] =
selectedLinks && selectedLinks.length > 0 selectedLinks && selectedLinks.length > 0
@@ -184,7 +186,7 @@ class Playwright_DocumentLoaders implements INode {
} else if (selectedLinks && selectedLinks.length > 0) { } else if (selectedLinks && selectedLinks.length > 0) {
if (process.env.DEBUG === 'true') if (process.env.DEBUG === 'true')
options.logger.info(`pages: ${JSON.stringify(selectedLinks)}, length: ${selectedLinks.length}`) options.logger.info(`pages: ${JSON.stringify(selectedLinks)}, length: ${selectedLinks.length}`)
for (const page of selectedLinks) { for (const page of selectedLinks.slice(0, limit)) {
docs.push(...(await playwrightLoader(page))) docs.push(...(await playwrightLoader(page)))
} }
} else { } else {
@@ -168,7 +168,9 @@ class Puppeteer_DocumentLoaders implements INode {
let docs = [] let docs = []
if (relativeLinksMethod) { if (relativeLinksMethod) {
if (process.env.DEBUG === 'true') options.logger.info(`Start ${relativeLinksMethod}`) if (process.env.DEBUG === 'true') options.logger.info(`Start ${relativeLinksMethod}`)
if (!limit) limit = 10 // if limit is 0 we don't want it to default to 10 so we check explicitly for null or undefined
// so when limit is 0 we can fetch all the links
if (limit === null || limit === undefined) limit = 10
else if (limit < 0) throw new Error('Limit cannot be less than 0') else if (limit < 0) throw new Error('Limit cannot be less than 0')
const pages: string[] = const pages: string[] =
selectedLinks && selectedLinks.length > 0 selectedLinks && selectedLinks.length > 0
@@ -185,7 +187,7 @@ class Puppeteer_DocumentLoaders implements INode {
} else if (selectedLinks && selectedLinks.length > 0) { } else if (selectedLinks && selectedLinks.length > 0) {
if (process.env.DEBUG === 'true') if (process.env.DEBUG === 'true')
options.logger.info(`pages: ${JSON.stringify(selectedLinks)}, length: ${selectedLinks.length}`) options.logger.info(`pages: ${JSON.stringify(selectedLinks)}, length: ${selectedLinks.length}`)
for (const page of selectedLinks) { for (const page of selectedLinks.slice(0, limit)) {
docs.push(...(await puppeteerLoader(page))) docs.push(...(await puppeteerLoader(page)))
} }
} else { } else {
+7 -1
View File
@@ -1149,8 +1149,14 @@ export class App {
// Returns the links discovered from `url` using the requested relative links method.
// Query parameters:
//   url                 - URI-encoded page to start from
//   relativeLinksMethod - 'webCrawl' selects webCrawl; any other non-empty value selects xmlScrape
//   limit               - optional integer forwarded to the scraper as its link limit
this.app.get('/api/v1/fetch-links', async (req: Request, res: Response) => {
    const url = decodeURIComponent(req.query.url as string)
    const relativeLinksMethod = req.query.relativeLinksMethod as string
    if (!relativeLinksMethod) {
        return res.status(500).send('Please choose a Relative Links Method in Additional Parameters.')
    }

    // parseInt returns NaN when `limit` is absent or not numeric (e.g. the literal strings
    // "undefined" or "NaN"). Fall back to 0, the value this route passed before it accepted a limit.
    const parsedLimit = parseInt(req.query.limit as string, 10)
    const limit = Number.isNaN(parsedLimit) ? 0 : parsedLimit

    if (process.env.DEBUG === 'true') console.info(`Start ${relativeLinksMethod}`)
    const links: string[] = relativeLinksMethod === 'webCrawl' ? await webCrawl(url, limit) : await xmlScrape(url, limit)
    if (process.env.DEBUG === 'true') console.info(`Finish ${relativeLinksMethod}`)

    res.json({ status: 'OK', links })
})
+3 -3
View File
@@ -1,8 +1,8 @@
import client from './client' import client from './client'
/**
 * Ask the server for the links reachable from `url`.
 *
 * @param url page to start from
 * @param relativeLinksMethod value sent as the `relativeLinksMethod` query parameter
 * @param relativeLinksLimit value sent as the `limit` query parameter; anything that does
 *        not parse as an integer is sent as 0 instead of the literal "undefined"/"NaN"
 * @returns whatever `client.get` returns for the request
 */
const fetchLinks = (url, relativeLinksMethod, relativeLinksLimit) => {
    const parsedLimit = parseInt(relativeLinksLimit, 10)
    const limit = Number.isNaN(parsedLimit) ? 0 : parsedLimit

    // Every value is encoded so that characters such as `&` or `#` cannot break the query string.
    const query = [
        `url=${encodeURIComponent(url)}`,
        `relativeLinksMethod=${encodeURIComponent(relativeLinksMethod)}`,
        `limit=${limit}`
    ].join('&')

    return client.get(`/fetch-links?${query}`)
}

export default {
    fetchLinks
}
@@ -16,7 +16,7 @@ import {
Stack, Stack,
Typography Typography
} from '@mui/material' } from '@mui/material'
import { IconTrash } from '@tabler/icons' import { IconTrash, IconX } from '@tabler/icons'
import PerfectScrollbar from 'react-perfect-scrollbar' import PerfectScrollbar from 'react-perfect-scrollbar'
import { BackdropLoader } from 'ui-component/loading/BackdropLoader' import { BackdropLoader } from 'ui-component/loading/BackdropLoader'
@@ -24,12 +24,23 @@ import { StyledButton } from 'ui-component/button/StyledButton'
import scraperApi from 'api/scraper' import scraperApi from 'api/scraper'
import { HIDE_CANVAS_DIALOG, SHOW_CANVAS_DIALOG } from 'store/actions' import useNotifier from 'utils/useNotifier'
import {
HIDE_CANVAS_DIALOG,
SHOW_CANVAS_DIALOG,
enqueueSnackbar as enqueueSnackbarAction,
closeSnackbar as closeSnackbarAction
} from 'store/actions'
const ManageScrapedLinksDialog = ({ show, dialogProps, onCancel, onSave }) => { const ManageScrapedLinksDialog = ({ show, dialogProps, onCancel, onSave }) => {
const portalElement = document.getElementById('portal') const portalElement = document.getElementById('portal')
const dispatch = useDispatch() const dispatch = useDispatch()
useNotifier()
const enqueueSnackbar = (...args) => dispatch(enqueueSnackbarAction(...args))
const closeSnackbar = (...args) => dispatch(closeSnackbarAction(...args))
const [loading, setLoading] = useState(false) const [loading, setLoading] = useState(false)
const [selectedLinks, setSelectedLinks] = useState([]) const [selectedLinks, setSelectedLinks] = useState([])
const [url, setUrl] = useState('') const [url, setUrl] = useState('')
@@ -53,9 +64,38 @@ const ManageScrapedLinksDialog = ({ show, dialogProps, onCancel, onSave }) => {
const handleFetchLinks = async () => { const handleFetchLinks = async () => {
setLoading(true) setLoading(true)
const fetchLinksResp = await scraperApi.fetchAllLinks(url, 'webCrawl') try {
if (fetchLinksResp.data) { const fetchLinksResp = await scraperApi.fetchLinks(url, dialogProps.relativeLinksMethod, dialogProps.limit)
setSelectedLinks(fetchLinksResp.data.links) if (fetchLinksResp.data) {
setSelectedLinks(fetchLinksResp.data.links)
enqueueSnackbar({
message: 'Successfully fetched links',
options: {
key: new Date().getTime() + Math.random(),
variant: 'success',
action: (key) => (
<Button style={{ color: 'white' }} onClick={() => closeSnackbar(key)}>
<IconX />
</Button>
)
}
})
}
} catch (error) {
const errorData = error.response.data || `${error.response.status}: ${error.response.statusText}`
enqueueSnackbar({
message: errorData,
options: {
key: new Date().getTime() + Math.random(),
variant: 'error',
persist: true,
action: (key) => (
<Button style={{ color: 'white' }} onClick={() => closeSnackbar(key)}>
<IconX />
</Button>
)
}
})
} }
setLoading(false) setLoading(false)
} }
@@ -91,9 +91,11 @@ const NodeInputHandler = ({ inputAnchor, inputParam, data, disabled = false, isA
} }
} }
const onManageLinksDialogClicked = (url, selectedLinks) => { const onManageLinksDialogClicked = (url, selectedLinks, relativeLinksMethod, limit) => {
const dialogProps = { const dialogProps = {
url, url,
relativeLinksMethod,
limit,
selectedLinks, selectedLinks,
confirmButtonName: 'Save', confirmButtonName: 'Save',
cancelButtonName: 'Cancel' cancelButtonName: 'Cancel'
@@ -475,7 +477,9 @@ const NodeInputHandler = ({ inputAnchor, inputParam, data, disabled = false, isA
onClick={() => onClick={() =>
onManageLinksDialogClicked( onManageLinksDialogClicked(
data.inputs[inputParam.name] ?? inputParam.default ?? '', data.inputs[inputParam.name] ?? inputParam.default ?? '',
data.inputs.selectedLinks data.inputs.selectedLinks,
data.inputs['relativeLinksMethod'] ?? 'webCrawl',
parseInt(data.inputs['limit']) ?? 0
) )
} }
> >