feat: add search functionality to FireCrawl with customizable parameters (#4535)

* feat: add search functionality to FireCrawl with customizable parameters

* refactor: unify request parameters in FireCrawl to include integration identifier

* Update FireCrawl Document Loader to version 4.0, enhancing parameter labels and adding conditional visibility for URL and crawler options based on selected crawler type.

---------

Co-authored-by: Henry <hzj94@hotmail.com>
This commit is contained in:
Ademílson Tonato
2025-06-07 00:06:39 +01:00
committed by GitHub
parent 30c4180d97
commit 0c5f7ea003
@@ -67,6 +67,29 @@ interface ExtractResponse {
data?: Record<string, any> data?: Record<string, any>
} }
/**
 * One entry of the `data` array in a {@link SearchResponse}.
 */
interface SearchResult {
/** Address of the result; the loader in this file stores it as `metadata.sourceURL`. */
url: string
/** Title of the result; the loader in this file stores it as `metadata.title`. */
title: string
/** Description of the result; the loader in this file uses it as the document's markdown body. */
description: string
}
/**
 * Body of a `/v1/search` reply as consumed by `FirecrawlApp.search`.
 */
interface SearchResponse {
/** When false, `search` throws instead of returning the response. */
success: boolean
/** Search hits; may be absent, in which case the loader treats it as an empty list. */
data?: SearchResult[]
/** Optional message; used as the detail text of the error raised when `success` is false. */
warning?: string
}
/**
 * Input accepted by `FirecrawlApp.search`.
 *
 * Only `query` is mandatory. Optional fields that are unset are omitted from the
 * payload posted to `/v1/search`.
 */
interface SearchRequest {
/** Search text; always sent. */
query: string
/** Maximum number of results to return (per the node's "Limit" input description). */
limit?: number
/** Forwarded as-is; meaning is defined by the Firecrawl API — NOTE(review): confirm against its docs. */
tbs?: string
/** Language code for search results, e.g. `en` (per the node's "Language" input description). */
lang?: string
/** Country code for search results, e.g. `us` (per the node's "Country" input description). */
country?: string
/** Forwarded as-is; meaning is defined by the Firecrawl API — NOTE(review): confirm against its docs. */
location?: string
/** Timeout in milliseconds for the search operation (per the node's "Timeout" input description). */
timeout?: number
/** Forwarded as-is; meaning is defined by the Firecrawl API — NOTE(review): confirm against its docs. */
ignoreInvalidURLs?: boolean
}
interface Params { interface Params {
[key: string]: any [key: string]: any
extractorOptions?: { extractorOptions?: {
@@ -161,7 +184,11 @@ class FirecrawlApp {
} }
try { try {
const response: AxiosResponse = await this.postRequest(this.apiUrl + '/v1/scrape', validParams, headers) const parameters = {
...validParams,
integration: 'flowise'
}
const response: AxiosResponse = await this.postRequest(this.apiUrl + '/v1/scrape', parameters, headers)
if (response.status === 200) { if (response.status === 200) {
const responseData = response.data const responseData = response.data
if (responseData.success) { if (responseData.success) {
@@ -259,7 +286,11 @@ class FirecrawlApp {
} }
try { try {
const response: AxiosResponse = await this.postRequest(this.apiUrl + '/v1/crawl', validParams, headers) const parameters = {
...validParams,
integration: 'flowise'
}
const response: AxiosResponse = await this.postRequest(this.apiUrl + '/v1/crawl', parameters, headers)
if (response.status === 200) { if (response.status === 200) {
const crawlResponse = response.data as CrawlResponse const crawlResponse = response.data as CrawlResponse
if (!crawlResponse.success) { if (!crawlResponse.success) {
@@ -367,7 +398,11 @@ class FirecrawlApp {
} }
try { try {
const response: AxiosResponse = await this.postRequest(this.apiUrl + '/v1/extract', validParams, headers) const parameters = {
...validParams,
integration: 'flowise'
}
const response: AxiosResponse = await this.postRequest(this.apiUrl + '/v1/extract', parameters, headers)
if (response.status === 200) { if (response.status === 200) {
const extractResponse = response.data as ExtractResponse const extractResponse = response.data as ExtractResponse
if (waitUntilDone) { if (waitUntilDone) {
@@ -384,18 +419,55 @@ class FirecrawlApp {
return { success: false, id: '', url: '' } return { success: false, id: '', url: '' }
} }
/**
 * Performs a web search by POSTing to `<apiUrl>/v1/search`.
 *
 * The payload always contains `query` and `integration: 'flowise'`. Each optional
 * {@link SearchRequest} field is added only when it carries a value: `undefined`,
 * `null` and empty strings are left out, so a blank form field (for example an
 * emptied `lang` or `country`) is not sent to the API as an empty value.
 *
 * @param request - Search text plus optional tuning fields.
 * @returns The response body when the HTTP status is 200 and `success` is true.
 * @throws Error when the HTTP call rejects, or when the body reports `success: false`
 * (message: `Search request failed: <warning | 'Unknown error'>`). Non-200 statuses are
 * delegated to `handleError`.
 */
async search(request: SearchRequest): Promise<SearchResponse> {
    const headers = this.prepareHeaders()

    // Start from the one mandatory field and add the optional ones selectively.
    const validParams: any = {
        query: request.query
    }

    const validSearchParams = ['limit', 'tbs', 'lang', 'country', 'location', 'timeout', 'ignoreInvalidURLs'] as const
    for (const param of validSearchParams) {
        const value = request[param]
        // Skip unset AND empty values; `0` and `false` are legitimate and must pass through.
        if (value !== undefined && value !== null && value !== '') {
            validParams[param] = value
        }
    }

    try {
        const parameters = {
            ...validParams,
            integration: 'flowise'
        }
        const response: AxiosResponse = await this.postRequest(this.apiUrl + '/v1/search', parameters, headers)
        if (response.status === 200) {
            const searchResponse = response.data as SearchResponse
            if (!searchResponse.success) {
                throw new Error(`Search request failed: ${searchResponse.warning || 'Unknown error'}`)
            }
            return searchResponse
        } else {
            this.handleError(response, 'perform search')
        }
    } catch (error: any) {
        // Re-wrap so callers always receive a plain Error carrying only the message.
        throw new Error(error.message)
    }

    // Fallback for the non-200 path in case handleError returns rather than throws.
    return { success: false }
}
private prepareHeaders(idempotencyKey?: string): AxiosRequestHeaders { private prepareHeaders(idempotencyKey?: string): AxiosRequestHeaders {
return { return {
'Content-Type': 'application/json', 'Content-Type': 'application/json',
Authorization: `Bearer ${this.apiKey}`, Authorization: `Bearer ${this.apiKey}`,
'X-Origin': 'flowise',
'X-Origin-Type': 'integration',
...(idempotencyKey ? { 'x-idempotency-key': idempotencyKey } : {}) ...(idempotencyKey ? { 'x-idempotency-key': idempotencyKey } : {})
} as AxiosRequestHeaders & { 'X-Origin': string; 'X-Origin-Type': string; 'x-idempotency-key'?: string } } as AxiosRequestHeaders & { 'x-idempotency-key'?: string }
} }
private postRequest(url: string, data: Params, headers: AxiosRequestHeaders): Promise<AxiosResponse> { private async postRequest(url: string, data: Params, headers: AxiosRequestHeaders): Promise<AxiosResponse> {
return axios.post(url, data, { headers }) const result = await axios.post(url, data, { headers })
return result
} }
private getRequest(url: string, headers: AxiosRequestHeaders): Promise<AxiosResponse> { private getRequest(url: string, headers: AxiosRequestHeaders): Promise<AxiosResponse> {
@@ -468,29 +540,32 @@ class FirecrawlApp {
// FireCrawl Loader // FireCrawl Loader
interface FirecrawlLoaderParameters { interface FirecrawlLoaderParameters {
url: string url?: string
query?: string
apiKey?: string apiKey?: string
apiUrl?: string apiUrl?: string
mode?: 'crawl' | 'scrape' | 'extract' mode?: 'crawl' | 'scrape' | 'extract' | 'search'
params?: Record<string, unknown> params?: Record<string, unknown>
} }
export class FireCrawlLoader extends BaseDocumentLoader { export class FireCrawlLoader extends BaseDocumentLoader {
private apiKey: string private apiKey: string
private apiUrl: string private apiUrl: string
private url: string private url?: string
private mode: 'crawl' | 'scrape' | 'extract' private query?: string
private mode: 'crawl' | 'scrape' | 'extract' | 'search'
private params?: Record<string, unknown> private params?: Record<string, unknown>
constructor(loaderParams: FirecrawlLoaderParameters) { constructor(loaderParams: FirecrawlLoaderParameters) {
super() super()
const { apiKey, apiUrl, url, mode = 'crawl', params } = loaderParams const { apiKey, apiUrl, url, query, mode = 'crawl', params } = loaderParams
if (!apiKey) { if (!apiKey) {
throw new Error('Firecrawl API key not set. You can set it as FIRECRAWL_API_KEY in your .env file, or pass it to Firecrawl.') throw new Error('Firecrawl API key not set. You can set it as FIRECRAWL_API_KEY in your .env file, or pass it to Firecrawl.')
} }
this.apiKey = apiKey this.apiKey = apiKey
this.url = url this.url = url
this.query = query
this.mode = mode this.mode = mode
this.params = params this.params = params
this.apiUrl = apiUrl || 'https://api.firecrawl.dev' this.apiUrl = apiUrl || 'https://api.firecrawl.dev'
@@ -500,13 +575,37 @@ export class FireCrawlLoader extends BaseDocumentLoader {
const app = new FirecrawlApp({ apiKey: this.apiKey, apiUrl: this.apiUrl }) const app = new FirecrawlApp({ apiKey: this.apiKey, apiUrl: this.apiUrl })
let firecrawlDocs: FirecrawlDocument[] let firecrawlDocs: FirecrawlDocument[]
if (this.mode === 'scrape') { if (this.mode === 'search') {
if (!this.query) {
throw new Error('Firecrawl: Query is required for search mode')
}
const response = await app.search({ query: this.query, ...this.params })
if (!response.success) {
throw new Error(`Firecrawl: Failed to search. Warning: ${response.warning}`)
}
// Convert search results to FirecrawlDocument format
firecrawlDocs = (response.data || []).map((result) => ({
markdown: result.description,
metadata: {
title: result.title,
sourceURL: result.url,
description: result.description
}
}))
} else if (this.mode === 'scrape') {
if (!this.url) {
throw new Error('Firecrawl: URL is required for scrape mode')
}
const response = await app.scrapeUrl(this.url, this.params) const response = await app.scrapeUrl(this.url, this.params)
if (!response.success) { if (!response.success) {
throw new Error(`Firecrawl: Failed to scrape URL. Error: ${response.error}`) throw new Error(`Firecrawl: Failed to scrape URL. Error: ${response.error}`)
} }
firecrawlDocs = [response.data as FirecrawlDocument] firecrawlDocs = [response.data as FirecrawlDocument]
} else if (this.mode === 'crawl') { } else if (this.mode === 'crawl') {
if (!this.url) {
throw new Error('Firecrawl: URL is required for crawl mode')
}
const response = await app.crawlUrl(this.url, this.params) const response = await app.crawlUrl(this.url, this.params)
if ('status' in response) { if ('status' in response) {
if (response.status === 'failed') { if (response.status === 'failed') {
@@ -520,6 +619,9 @@ export class FireCrawlLoader extends BaseDocumentLoader {
firecrawlDocs = [response.data as FirecrawlDocument] firecrawlDocs = [response.data as FirecrawlDocument]
} }
} else if (this.mode === 'extract') { } else if (this.mode === 'extract') {
if (!this.url) {
throw new Error('Firecrawl: URL is required for extract mode')
}
this.params!.urls = [this.url] this.params!.urls = [this.url]
const response = await app.extract(this.params as any as ExtractRequest) const response = await app.extract(this.params as any as ExtractRequest)
if (!response.success) { if (!response.success) {
@@ -557,7 +659,7 @@ export class FireCrawlLoader extends BaseDocumentLoader {
} }
return [] return []
} else { } else {
throw new Error(`Unrecognized mode '${this.mode}'. Expected one of 'crawl', 'scrape', 'extract'.`) throw new Error(`Unrecognized mode '${this.mode}'. Expected one of 'crawl', 'scrape', 'extract', 'search'.`)
} }
// Convert Firecrawl documents to LangChain documents // Convert Firecrawl documents to LangChain documents
@@ -602,7 +704,7 @@ class FireCrawl_DocumentLoaders implements INode {
this.name = 'fireCrawl' this.name = 'fireCrawl'
this.type = 'Document' this.type = 'Document'
this.icon = 'firecrawl.png' this.icon = 'firecrawl.png'
this.version = 3.0 this.version = 4.0
this.category = 'Document Loaders' this.category = 'Document Loaders'
this.description = 'Load data from URL using FireCrawl' this.description = 'Load data from URL using FireCrawl'
this.baseClasses = [this.type] this.baseClasses = [this.type]
@@ -620,14 +722,7 @@ class FireCrawl_DocumentLoaders implements INode {
optional: true optional: true
}, },
{ {
label: 'URLs', label: 'Type',
name: 'url',
type: 'string',
description: 'URL to be crawled/scraped/extracted',
placeholder: 'https://docs.flowiseai.com'
},
{
label: 'Crawler type',
type: 'options', type: 'options',
name: 'crawlerType', name: 'crawlerType',
options: [ options: [
@@ -645,89 +740,179 @@ class FireCrawl_DocumentLoaders implements INode {
label: 'Extract', label: 'Extract',
name: 'extract', name: 'extract',
description: 'Extract data from a URL' description: 'Extract data from a URL'
},
{
label: 'Search',
name: 'search',
description: 'Search the web using FireCrawl'
} }
], ],
default: 'crawl' default: 'crawl'
}, },
{
label: 'URLs',
name: 'url',
type: 'string',
description: 'URL to be crawled/scraped/extracted',
placeholder: 'https://docs.flowiseai.com',
optional: true,
show: {
crawlerType: ['crawl', 'scrape', 'extract']
}
},
{ {
// includeTags // includeTags
label: '[Scrape] Include Tags', label: 'Include Tags',
name: 'includeTags', name: 'includeTags',
type: 'string', type: 'string',
description: 'Tags to include in the output. Use comma to separate multiple tags.', description: 'Tags to include in the output. Use comma to separate multiple tags.',
optional: true, optional: true,
additionalParams: true additionalParams: true,
show: {
crawlerType: ['scrape']
}
}, },
{ {
// excludeTags // excludeTags
label: '[Scrape] Exclude Tags', label: 'Exclude Tags',
name: 'excludeTags', name: 'excludeTags',
type: 'string', type: 'string',
description: 'Tags to exclude from the output. Use comma to separate multiple tags.', description: 'Tags to exclude from the output. Use comma to separate multiple tags.',
optional: true, optional: true,
additionalParams: true additionalParams: true,
show: {
crawlerType: ['scrape']
}
}, },
{ {
// onlyMainContent // onlyMainContent
label: '[Scrape] Only Main Content', label: 'Only Main Content',
name: 'onlyMainContent', name: 'onlyMainContent',
type: 'boolean', type: 'boolean',
description: 'Extract only the main content of the page', description: 'Extract only the main content of the page',
optional: true, optional: true,
additionalParams: true additionalParams: true,
show: {
crawlerType: ['scrape']
}
}, },
{ {
// limit // limit
label: '[Crawl] Limit', label: 'Limit',
name: 'limit', name: 'limit',
type: 'string', type: 'string',
description: 'Maximum number of pages to crawl', description: 'Maximum number of pages to crawl',
optional: true, optional: true,
additionalParams: true, additionalParams: true,
default: '10000' default: '10000',
show: {
crawlerType: ['crawl']
}
}, },
{ {
label: '[Crawl] Include Paths', label: 'Include Paths',
name: 'includePaths', name: 'includePaths',
type: 'string', type: 'string',
description: description:
'URL pathname regex patterns that include matching URLs in the crawl. Only the paths that match the specified patterns will be included in the response.', 'URL pathname regex patterns that include matching URLs in the crawl. Only the paths that match the specified patterns will be included in the response.',
placeholder: `blog/.*, news/.*`, placeholder: `blog/.*, news/.*`,
optional: true, optional: true,
additionalParams: true additionalParams: true,
show: {
crawlerType: ['crawl']
}
}, },
{ {
label: '[Crawl] Exclude Paths', label: 'Exclude Paths',
name: 'excludePaths', name: 'excludePaths',
type: 'string', type: 'string',
description: 'URL pathname regex patterns that exclude matching URLs from the crawl.', description: 'URL pathname regex patterns that exclude matching URLs from the crawl.',
placeholder: `blog/.*, news/.*`, placeholder: `blog/.*, news/.*`,
optional: true, optional: true,
additionalParams: true additionalParams: true,
show: {
crawlerType: ['crawl']
}
}, },
{ {
label: '[Extract] Schema', label: 'Schema',
name: 'extractSchema', name: 'extractSchema',
type: 'json', type: 'json',
description: 'JSON schema for data extraction', description: 'JSON schema for data extraction',
optional: true, optional: true,
additionalParams: true additionalParams: true,
show: {
crawlerType: ['extract']
}
}, },
{ {
label: '[Extract] Prompt', label: 'Prompt',
name: 'extractPrompt', name: 'extractPrompt',
type: 'string', type: 'string',
description: 'Prompt for data extraction', description: 'Prompt for data extraction',
optional: true, optional: true,
additionalParams: true additionalParams: true,
show: {
crawlerType: ['extract']
}
}, },
{ {
label: '[Extract] Job ID', label: 'Query',
name: 'extractJobId', name: 'searchQuery',
type: 'string', type: 'string',
description: 'ID of the extract job', description: 'Search query to find relevant content',
optional: true, optional: true,
additionalParams: true show: {
crawlerType: ['search']
}
},
{
label: 'Limit',
name: 'searchLimit',
type: 'string',
description: 'Maximum number of results to return',
optional: true,
additionalParams: true,
default: '5',
show: {
crawlerType: ['search']
}
},
{
label: 'Language',
name: 'searchLang',
type: 'string',
description: 'Language code for search results (e.g., en, es, fr)',
optional: true,
additionalParams: true,
default: 'en',
show: {
crawlerType: ['search']
}
},
{
label: 'Country',
name: 'searchCountry',
type: 'string',
description: 'Country code for search results (e.g., us, uk, ca)',
optional: true,
additionalParams: true,
default: 'us',
show: {
crawlerType: ['search']
}
},
{
label: 'Timeout',
name: 'searchTimeout',
type: 'number',
description: 'Timeout in milliseconds for search operation',
optional: true,
additionalParams: true,
default: 60000,
show: {
crawlerType: ['search']
}
} }
] ]
this.outputs = [ this.outputs = [
@@ -758,6 +943,11 @@ class FireCrawl_DocumentLoaders implements INode {
const firecrawlApiUrl = getCredentialParam('firecrawlApiUrl', credentialData, nodeData, 'https://api.firecrawl.dev') const firecrawlApiUrl = getCredentialParam('firecrawlApiUrl', credentialData, nodeData, 'https://api.firecrawl.dev')
const output = nodeData.outputs?.output as string const output = nodeData.outputs?.output as string
// Validate URL only for non-search methods
if (crawlerType !== 'search' && !url) {
throw new Error('Firecrawl: URL is required for ' + crawlerType + ' mode')
}
const includePaths = nodeData.inputs?.includePaths ? (nodeData.inputs.includePaths.split(',') as string[]) : undefined const includePaths = nodeData.inputs?.includePaths ? (nodeData.inputs.includePaths.split(',') as string[]) : undefined
const excludePaths = nodeData.inputs?.excludePaths ? (nodeData.inputs.excludePaths.split(',') as string[]) : undefined const excludePaths = nodeData.inputs?.excludePaths ? (nodeData.inputs.excludePaths.split(',') as string[]) : undefined
@@ -767,9 +957,16 @@ class FireCrawl_DocumentLoaders implements INode {
const extractSchema = nodeData.inputs?.extractSchema const extractSchema = nodeData.inputs?.extractSchema
const extractPrompt = nodeData.inputs?.extractPrompt as string const extractPrompt = nodeData.inputs?.extractPrompt as string
const searchQuery = nodeData.inputs?.searchQuery as string
const searchLimit = nodeData.inputs?.searchLimit as string
const searchLang = nodeData.inputs?.searchLang as string
const searchCountry = nodeData.inputs?.searchCountry as string
const searchTimeout = nodeData.inputs?.searchTimeout as number
const input: FirecrawlLoaderParameters = { const input: FirecrawlLoaderParameters = {
url, url,
mode: crawlerType as 'crawl' | 'scrape' | 'extract', query: searchQuery,
mode: crawlerType as 'crawl' | 'scrape' | 'extract' | 'search',
apiKey: firecrawlApiToken, apiKey: firecrawlApiToken,
apiUrl: firecrawlApiUrl, apiUrl: firecrawlApiUrl,
params: { params: {
@@ -785,6 +982,19 @@ class FireCrawl_DocumentLoaders implements INode {
} }
} }
// Add search-specific parameters only when in search mode
if (crawlerType === 'search') {
if (!searchQuery) {
throw new Error('Firecrawl: Search query is required for search mode')
}
input.params = {
limit: searchLimit ? parseInt(searchLimit, 10) : 5,
lang: searchLang,
country: searchCountry,
timeout: searchTimeout
}
}
if (onlyMainContent === true) { if (onlyMainContent === true) {
const scrapeOptions = input.params?.scrapeOptions as any const scrapeOptions = input.params?.scrapeOptions as any
input.params!.scrapeOptions = { input.params!.scrapeOptions = {