mirror of
https://github.com/farcasclaudiu/openclaw.git
synced 2026-06-28 19:01:47 +03:00
feat(web-fetch): support Cloudflare Markdown for Agents (#15376)
Merged via /review-pr -> /prepare-pr -> /merge-pr. Prepared head SHA: d0528dc429840b16bfcef4d921f3229653d38143 Co-authored-by: Yaxuan42 <184813557+Yaxuan42@users.noreply.github.com> Co-authored-by: steipete <58493+steipete@users.noreply.github.com> Reviewed-by: @steipete
This commit is contained in:
@@ -22,6 +22,7 @@ Docs: https://docs.openclaw.ai
|
|||||||
- Sessions/Agents: pass `agentId` when resolving existing transcript paths in reply runs so non-default agents and heartbeat/chat handlers no longer fail with `Session file path must be within sessions directory`. (#15141) Thanks @Goldenmonstew.
|
- Sessions/Agents: pass `agentId` when resolving existing transcript paths in reply runs so non-default agents and heartbeat/chat handlers no longer fail with `Session file path must be within sessions directory`. (#15141) Thanks @Goldenmonstew.
|
||||||
- Sessions/Agents: pass `agentId` through status and usage transcript-resolution paths (auto-reply, gateway usage APIs, and session cost/log loaders) so non-default agents can resolve absolute session files without path-validation failures. (#15103) Thanks @jalehman.
|
- Sessions/Agents: pass `agentId` through status and usage transcript-resolution paths (auto-reply, gateway usage APIs, and session cost/log loaders) so non-default agents can resolve absolute session files without path-validation failures. (#15103) Thanks @jalehman.
|
||||||
- Signal/Install: auto-install `signal-cli` via Homebrew on non-x64 Linux architectures, avoiding x86_64 native binary `Exec format error` failures on arm64/arm hosts. (#15443) Thanks @jogvan-k.
|
- Signal/Install: auto-install `signal-cli` via Homebrew on non-x64 Linux architectures, avoiding x86_64 native binary `Exec format error` failures on arm64/arm hosts. (#15443) Thanks @jogvan-k.
|
||||||
|
- Web tools/web_fetch: prefer `text/markdown` responses for Cloudflare Markdown for Agents, add `cf-markdown` extraction for markdown bodies, and redact fetched URLs in `x-markdown-tokens` debug logs to avoid leaking raw paths/query params. (#15376) Thanks @Yaxuan42.
|
||||||
|
|
||||||
## 2026.2.12
|
## 2026.2.12
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,186 @@
|
|||||||
|
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
|
||||||
|
import * as ssrf from "../../infra/net/ssrf.js";
|
||||||
|
import * as logger from "../../logger.js";
|
||||||
|
|
||||||
|
const lookupMock = vi.fn();
|
||||||
|
const resolvePinnedHostname = ssrf.resolvePinnedHostname;
|
||||||
|
|
||||||
|
/**
 * Builds a minimal stand-in for a `Headers` object that exposes only `get`.
 *
 * Header names are matched case-insensitively on both sides: the keys of
 * `map` are lower-cased once up front and the requested key is lower-cased on
 * lookup. Entries are held in a `Map`, so inherited object members such as
 * `constructor` are never reported as headers.
 *
 * @param map - Header name/value pairs in any letter case.
 * @returns An object whose `get` returns the header value, or `null` if absent.
 */
function makeHeaders(map: Record<string, string>): { get: (key: string) => string | null } {
  const normalized = new Map<string, string>();
  for (const [name, value] of Object.entries(map)) {
    normalized.set(name.toLowerCase(), value);
  }
  return {
    get: (key) => normalized.get(key.toLowerCase()) ?? null,
  };
}
|
||||||
|
|
||||||
|
/**
 * Creates a stubbed successful (`ok`, status 200) response whose content type
 * is `text/markdown; charset=utf-8`.
 *
 * @param body - Text resolved by the stub's `text()` method.
 * @param extraHeaders - Additional headers; spread after the default
 *   content type, so an entry with the same key replaces it.
 * @returns A partial object cast to `Response` (only `ok`, `status`,
 *   `headers` and `text` are populated).
 */
function markdownResponse(body: string, extraHeaders: Record<string, string> = {}): Response {
  const headerMap: Record<string, string> = {
    "content-type": "text/markdown; charset=utf-8",
    ...extraHeaders,
  };
  return {
    ok: true,
    status: 200,
    headers: makeHeaders(headerMap),
    text: () => Promise.resolve(body),
  } as Response;
}
|
||||||
|
|
||||||
|
/**
 * Creates a stubbed successful (`ok`, status 200) response whose content type
 * is `text/html; charset=utf-8`.
 *
 * @param body - Text resolved by the stub's `text()` method.
 * @returns A partial object cast to `Response` (only `ok`, `status`,
 *   `headers` and `text` are populated).
 */
function htmlResponse(body: string): Response {
  const headers = makeHeaders({ "content-type": "text/html; charset=utf-8" });
  return {
    ok: true,
    status: 200,
    headers,
    text: () => Promise.resolve(body),
  } as Response;
}
|
||||||
|
|
||||||
|
// Covers web_fetch handling of `text/markdown` responses: the Accept header
// that is sent, the `cf-markdown` extractor, the HTML fallback, text-mode
// conversion, and `x-markdown-tokens` debug logging with a redacted URL.
describe("web_fetch Cloudflare Markdown for Agents", () => {
  const originalFetch = global.fetch;

  /** Replaces `global.fetch` with a spy that always resolves to `response`. */
  const stubFetch = (response: Response) => {
    const spy = vi.fn().mockResolvedValue(response);
    // @ts-expect-error mock fetch
    global.fetch = spy;
    return spy;
  };

  /**
   * Imports the tool module, builds the web_fetch tool with
   * `cacheTtlMinutes: 0` and Firecrawl `enabled: false`, and executes it.
   */
  const runWebFetchTool = async (args: { url: string; extractMode?: "text" }) => {
    const { createWebFetchTool } = await import("./web-tools.js");
    const tool = createWebFetchTool({
      config: {
        tools: { web: { fetch: { cacheTtlMinutes: 0, firecrawl: { enabled: false } } } },
      },
    });
    return tool?.execute?.("call", args);
  };

  beforeEach(() => {
    // Hostname pinning is routed through lookupMock, which yields a fixed address.
    lookupMock.mockResolvedValue([{ address: "93.184.216.34", family: 4 }]);
    vi.spyOn(ssrf, "resolvePinnedHostname").mockImplementation((hostname) =>
      resolvePinnedHostname(hostname, lookupMock),
    );
  });

  afterEach(() => {
    // @ts-expect-error restore
    global.fetch = originalFetch;
    lookupMock.mockReset();
    vi.restoreAllMocks();
  });

  it("sends Accept header preferring text/markdown", async () => {
    const fetchSpy = stubFetch(markdownResponse("# Test Page\n\nHello world."));

    await runWebFetchTool({ url: "https://example.com/page" });

    expect(fetchSpy).toHaveBeenCalled();
    const [, init] = fetchSpy.mock.calls[0];
    expect(init.headers.Accept).toBe("text/markdown, text/html;q=0.9, */*;q=0.1");
  });

  it("uses cf-markdown extractor for text/markdown responses", async () => {
    const markdown = "# CF Markdown\n\nThis is server-rendered markdown.";
    stubFetch(markdownResponse(markdown));

    const result = await runWebFetchTool({ url: "https://example.com/cf" });

    expect(result?.details).toMatchObject({
      status: 200,
      extractor: "cf-markdown",
      contentType: "text/markdown",
    });
    // The returned text is expected to still carry the original markdown content.
    expect(result?.details?.text).toContain("CF Markdown");
    expect(result?.details?.text).toContain("server-rendered markdown");
  });

  it("falls back to readability for text/html responses", async () => {
    const page =
      "<html><body><article><h1>HTML Page</h1><p>Content here.</p></article></body></html>";
    stubFetch(htmlResponse(page));

    const result = await runWebFetchTool({ url: "https://example.com/html" });

    expect(result?.details?.extractor).not.toBe("cf-markdown");
    expect(result?.details?.contentType).toBe("text/html");
  });

  it("logs x-markdown-tokens when header is present", async () => {
    const debugSpy = vi.spyOn(logger, "logDebug").mockImplementation(() => {});
    stubFetch(markdownResponse("# Tokens Test", { "x-markdown-tokens": "1500" }));

    await runWebFetchTool({ url: "https://example.com/tokens/private?token=secret" });

    expect(debugSpy).toHaveBeenCalledWith(
      expect.stringContaining("x-markdown-tokens: 1500 (https://example.com/...)"),
    );

    // Exactly one token log line, and it must not reveal the path or query string.
    const tokenLogs: string[] = [];
    for (const [message] of debugSpy.mock.calls) {
      const rendered = String(message);
      if (rendered.includes("x-markdown-tokens")) {
        tokenLogs.push(rendered);
      }
    }
    expect(tokenLogs).toHaveLength(1);
    expect(tokenLogs[0]).not.toContain("token=secret");
    expect(tokenLogs[0]).not.toContain("/tokens/private");
  });

  it("converts markdown to text when extractMode is text", async () => {
    const markdown = "# Heading\n\n**Bold text** and [a link](https://example.com).";
    stubFetch(markdownResponse(markdown));

    const result = await runWebFetchTool({
      url: "https://example.com/text-mode",
      extractMode: "text",
    });

    expect(result?.details).toMatchObject({
      extractor: "cf-markdown",
      extractMode: "text",
    });
    // In text mode the heading marker and the link syntax must be gone.
    expect(result?.details?.text).not.toContain("# Heading");
    expect(result?.details?.text).toContain("Heading");
    expect(result?.details?.text).not.toContain("[a link](https://example.com)");
  });

  it("does not log x-markdown-tokens when header is absent", async () => {
    const debugSpy = vi.spyOn(logger, "logDebug").mockImplementation(() => {});
    stubFetch(markdownResponse("# No tokens"));

    await runWebFetchTool({ url: "https://example.com/no-tokens" });

    const tokenLogs = debugSpy.mock.calls.filter((args) => {
      const first = args[0];
      return typeof first === "string" && first.includes("x-markdown-tokens");
    });
    expect(tokenLogs).toHaveLength(0);
  });
});
|
||||||
@@ -3,6 +3,7 @@ import type { OpenClawConfig } from "../../config/config.js";
|
|||||||
import type { AnyAgentTool } from "./common.js";
|
import type { AnyAgentTool } from "./common.js";
|
||||||
import { fetchWithSsrFGuard } from "../../infra/net/fetch-guard.js";
|
import { fetchWithSsrFGuard } from "../../infra/net/fetch-guard.js";
|
||||||
import { SsrFBlockedError } from "../../infra/net/ssrf.js";
|
import { SsrFBlockedError } from "../../infra/net/ssrf.js";
|
||||||
|
import { logDebug } from "../../logger.js";
|
||||||
import { wrapExternalContent, wrapWebContent } from "../../security/external-content.js";
|
import { wrapExternalContent, wrapWebContent } from "../../security/external-content.js";
|
||||||
import { normalizeSecretInput } from "../../utils/normalize-secret-input.js";
|
import { normalizeSecretInput } from "../../utils/normalize-secret-input.js";
|
||||||
import { stringEnum } from "../schema/typebox.js";
|
import { stringEnum } from "../schema/typebox.js";
|
||||||
@@ -212,6 +213,15 @@ function formatWebFetchErrorDetail(params: {
|
|||||||
return truncated.text;
|
return truncated.text;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Reduces a URL to a form that is safe to put in debug logs.
 *
 * Only the origin (scheme, host, port) is kept. A non-root path is replaced
 * by `/...`; query string, fragment and embedded credentials are dropped
 * because they are not part of `URL.origin`.
 *
 * @param rawUrl - The URL to redact.
 * @returns `<origin>/...` when the URL has a non-root path, `<origin>` when it
 *   does not, `<scheme>...` when the URL has an opaque origin, or
 *   `[invalid-url]` when `rawUrl` cannot be parsed.
 */
function redactUrlForDebugLog(rawUrl: string): string {
  let parsed: URL;
  try {
    parsed = new URL(rawUrl);
  } catch {
    return "[invalid-url]";
  }

  // Schemes such as file:, data: and mailto: have an opaque origin, which
  // serialises as the literal string "null"; log only the scheme instead of
  // the misleading "null/...".
  if (parsed.origin === "null") {
    return `${parsed.protocol}...`;
  }

  const hasPath = parsed.pathname !== "" && parsed.pathname !== "/";
  return hasPath ? `${parsed.origin}/...` : parsed.origin;
}
|
||||||
|
|
||||||
const WEB_FETCH_WRAPPER_WITH_WARNING_OVERHEAD = wrapWebContent("", "web_fetch").length;
|
const WEB_FETCH_WRAPPER_WITH_WARNING_OVERHEAD = wrapWebContent("", "web_fetch").length;
|
||||||
const WEB_FETCH_WRAPPER_NO_WARNING_OVERHEAD = wrapExternalContent("", {
|
const WEB_FETCH_WRAPPER_NO_WARNING_OVERHEAD = wrapExternalContent("", {
|
||||||
source: "web_fetch",
|
source: "web_fetch",
|
||||||
@@ -409,7 +419,7 @@ async function runWebFetch(params: {
|
|||||||
timeoutMs: params.timeoutSeconds * 1000,
|
timeoutMs: params.timeoutSeconds * 1000,
|
||||||
init: {
|
init: {
|
||||||
headers: {
|
headers: {
|
||||||
Accept: "*/*",
|
Accept: "text/markdown, text/html;q=0.9, */*;q=0.1",
|
||||||
"User-Agent": params.userAgent,
|
"User-Agent": params.userAgent,
|
||||||
"Accept-Language": "en-US,en;q=0.9",
|
"Accept-Language": "en-US,en;q=0.9",
|
||||||
},
|
},
|
||||||
@@ -418,6 +428,14 @@ async function runWebFetch(params: {
|
|||||||
res = result.response;
|
res = result.response;
|
||||||
finalUrl = result.finalUrl;
|
finalUrl = result.finalUrl;
|
||||||
release = result.release;
|
release = result.release;
|
||||||
|
|
||||||
|
// Cloudflare Markdown for Agents — log token budget hint when present
|
||||||
|
const markdownTokens = res.headers.get("x-markdown-tokens");
|
||||||
|
if (markdownTokens) {
|
||||||
|
logDebug(
|
||||||
|
`[web-fetch] x-markdown-tokens: ${markdownTokens} (${redactUrlForDebugLog(finalUrl)})`,
|
||||||
|
);
|
||||||
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
if (error instanceof SsrFBlockedError) {
|
if (error instanceof SsrFBlockedError) {
|
||||||
throw error;
|
throw error;
|
||||||
@@ -522,7 +540,13 @@ async function runWebFetch(params: {
|
|||||||
let title: string | undefined;
|
let title: string | undefined;
|
||||||
let extractor = "raw";
|
let extractor = "raw";
|
||||||
let text = body;
|
let text = body;
|
||||||
if (contentType.includes("text/html")) {
|
if (contentType.includes("text/markdown")) {
|
||||||
|
// Cloudflare Markdown for Agents: server returned pre-rendered markdown
|
||||||
|
extractor = "cf-markdown";
|
||||||
|
if (params.extractMode === "text") {
|
||||||
|
text = markdownToText(body);
|
||||||
|
}
|
||||||
|
} else if (contentType.includes("text/html")) {
|
||||||
if (params.readabilityEnabled) {
|
if (params.readabilityEnabled) {
|
||||||
const readable = await extractReadableContent({
|
const readable = await extractReadableContent({
|
||||||
html: body,
|
html: body,
|
||||||
|
|||||||
Reference in New Issue
Block a user