mirror of
https://github.com/farcasclaudiu/openclaw.git
synced 2026-06-28 15:01:41 +03:00
TTS: gate auto audio on inbound voice notes (#1667)
Co-authored-by: Sebastian <sebslight@gmail.com>
This commit is contained in:
+128
-1
@@ -4,7 +4,7 @@ import { completeSimple } from "@mariozechner/pi-ai";
|
||||
|
||||
import { getApiKeyForModel } from "../agents/model-auth.js";
|
||||
import { resolveModel } from "../agents/pi-embedded-runner/model.js";
|
||||
import { _test, getTtsProvider, resolveTtsConfig } from "./tts.js";
|
||||
import * as tts from "./tts.js";
|
||||
|
||||
vi.mock("@mariozechner/pi-ai", () => ({
|
||||
completeSimple: vi.fn(),
|
||||
@@ -37,6 +37,8 @@ vi.mock("../agents/model-auth.js", () => ({
|
||||
requireApiKey: vi.fn((auth: { apiKey?: string }) => auth.apiKey ?? ""),
|
||||
}));
|
||||
|
||||
const { _test, resolveTtsConfig, maybeApplyTtsToPayload, getTtsProvider } = tts;
|
||||
|
||||
const {
|
||||
isValidVoiceId,
|
||||
isValidOpenAIVoice,
|
||||
@@ -431,4 +433,129 @@ describe("tts", () => {
|
||||
);
|
||||
});
|
||||
});
|
||||
|
||||
describe("maybeApplyTtsToPayload", () => {
  // Baseline config: OpenAI provider with auto-TTS gated on inbound audio.
  const baseCfg = {
    agents: { defaults: { model: { primary: "openai/gpt-4o-mini" } } },
    messages: {
      tts: {
        auto: "inbound",
        provider: "openai",
        openai: { apiKey: "test-key", model: "gpt-4o-mini-tts", voice: "alloy" },
      },
    },
  };

  // Same config, but auto-TTS is switched to "tagged" mode.
  const taggedCfg = {
    ...baseCfg,
    messages: {
      ...baseCfg.messages,
      tts: { ...baseCfg.messages.tts, auto: "tagged" },
    },
  };

  // Stand-in for the provider HTTP call: always succeeds with a 1-byte audio body.
  const createFetchMock = () =>
    vi.fn(async () => ({
      ok: true,
      arrayBuffer: async () => new ArrayBuffer(1),
    }));

  /**
   * Runs `run` with a throwaway prefs path in CLAWDBOT_TTS_PREFS and a stubbed
   * global `fetch`, then restores both.
   *
   * The restore lives in `finally` so a failing assertion cannot leak the stub
   * or the env override into later tests.
   */
  async function withTtsSandbox(
    run: (fetchMock: ReturnType<typeof createFetchMock>) => Promise<void>,
  ): Promise<void> {
    const prevPrefs = process.env.CLAWDBOT_TTS_PREFS;
    const originalFetch = globalThis.fetch;
    const fetchMock = createFetchMock();

    process.env.CLAWDBOT_TTS_PREFS = `/tmp/tts-test-${Date.now()}.json`;
    globalThis.fetch = fetchMock as unknown as typeof fetch;

    try {
      await run(fetchMock);
    } finally {
      globalThis.fetch = originalFetch;
      // Assigning `undefined` to process.env stores the string "undefined",
      // so an originally-unset variable has to be deleted instead.
      if (prevPrefs === undefined) {
        delete process.env.CLAWDBOT_TTS_PREFS;
      } else {
        process.env.CLAWDBOT_TTS_PREFS = prevPrefs;
      }
    }
  }

  it("skips auto-TTS when inbound audio gating is on and the message is not audio", async () => {
    await withTtsSandbox(async (fetchMock) => {
      const payload = { text: "Hello world" };
      const result = await maybeApplyTtsToPayload({
        payload,
        cfg: baseCfg,
        kind: "final",
        inboundAudio: false,
      });

      expect(result).toBe(payload);
      expect(fetchMock).not.toHaveBeenCalled();
    });
  });

  it("attempts auto-TTS when inbound audio gating is on and the message is audio", async () => {
    await withTtsSandbox(async (fetchMock) => {
      const result = await maybeApplyTtsToPayload({
        payload: { text: "Hello world" },
        cfg: baseCfg,
        kind: "final",
        inboundAudio: true,
      });

      expect(result.mediaUrl).toBeDefined();
      expect(fetchMock).toHaveBeenCalledTimes(1);
    });
  });

  it("skips auto-TTS in tagged mode unless a tts tag is present", async () => {
    await withTtsSandbox(async (fetchMock) => {
      const payload = { text: "Hello world" };
      const result = await maybeApplyTtsToPayload({
        payload,
        cfg: taggedCfg,
        kind: "final",
      });

      expect(result).toBe(payload);
      expect(fetchMock).not.toHaveBeenCalled();
    });
  });

  it("runs auto-TTS in tagged mode when tags are present", async () => {
    await withTtsSandbox(async (fetchMock) => {
      const result = await maybeApplyTtsToPayload({
        payload: { text: "[[tts:text]]Hello world[[/tts:text]]" },
        cfg: taggedCfg,
        kind: "final",
      });

      expect(result.mediaUrl).toBeDefined();
      expect(fetchMock).toHaveBeenCalledTimes(1);
    });
  });
});
|
||||
});
|
||||
|
||||
+87
-18
@@ -20,6 +20,7 @@ import type { ChannelId } from "../channels/plugins/types.js";
|
||||
import type { ClawdbotConfig } from "../config/config.js";
|
||||
import type {
|
||||
TtsConfig,
|
||||
TtsAutoMode,
|
||||
TtsMode,
|
||||
TtsProvider,
|
||||
TtsModelOverrideConfig,
|
||||
@@ -75,8 +76,10 @@ const DEFAULT_OUTPUT = {
|
||||
voiceCompatible: false,
|
||||
};
|
||||
|
||||
// Accepted auto-mode names; normalizeTtsAutoMode checks candidate strings against this set.
const TTS_AUTO_MODES = new Set<TtsAutoMode>(["off", "always", "inbound", "tagged"]);
|
||||
|
||||
export type ResolvedTtsConfig = {
|
||||
enabled: boolean;
|
||||
auto: TtsAutoMode;
|
||||
mode: TtsMode;
|
||||
provider: TtsProvider;
|
||||
providerSource: "config" | "default";
|
||||
@@ -123,6 +126,7 @@ export type ResolvedTtsConfig = {
|
||||
|
||||
type TtsUserPrefs = {
|
||||
tts?: {
|
||||
auto?: TtsAutoMode;
|
||||
enabled?: boolean;
|
||||
provider?: TtsProvider;
|
||||
maxLength?: number;
|
||||
@@ -161,6 +165,7 @@ type TtsDirectiveOverrides = {
|
||||
/** Value returned by parseTtsDirectives after scanning reply text for [[tts...]] markup. */
type TtsDirectiveParseResult = {
|
||||
/** Text after the directive replacements ran; the unmodified input when the override policy is disabled. */
cleanedText: string;
|
||||
/** Mirrors `overrides.ttsText` on the normal return path; omitted when the policy is disabled. */
ttsText?: string;
|
||||
/** True once a [[tts:text]] block or a [[tts:...]] directive matched; always false when the policy is disabled. */
hasDirective: boolean;
|
||||
/** Per-reply overrides gathered while parsing (empty object when the policy is disabled). */
overrides: TtsDirectiveOverrides;
|
||||
/** Messages collected during parsing — NOTE(review): what triggers a warning is not visible in this excerpt. */
warnings: string[];
|
||||
};
|
||||
@@ -187,6 +192,15 @@ type TtsStatusEntry = {
|
||||
|
||||
let lastTtsAttempt: TtsStatusEntry | undefined;
|
||||
|
||||
/**
 * Coerces an arbitrary value into a known TTS auto mode.
 *
 * String input is trimmed and lower-cased before being looked up in
 * `TTS_AUTO_MODES`.
 *
 * @param value - Candidate mode from config, prefs, or session state.
 * @returns The canonical mode, or `undefined` for non-strings and unknown names.
 */
export function normalizeTtsAutoMode(value: unknown): TtsAutoMode | undefined {
  if (typeof value === "string") {
    const candidate = value.trim().toLowerCase() as TtsAutoMode;
    return TTS_AUTO_MODES.has(candidate) ? candidate : undefined;
  }
  return undefined;
}
|
||||
|
||||
function resolveModelOverridePolicy(
|
||||
overrides: TtsModelOverrideConfig | undefined,
|
||||
): ResolvedTtsModelOverrides {
|
||||
@@ -220,8 +234,9 @@ export function resolveTtsConfig(cfg: ClawdbotConfig): ResolvedTtsConfig {
|
||||
const raw: TtsConfig = cfg.messages?.tts ?? {};
|
||||
const providerSource = raw.provider ? "config" : "default";
|
||||
const edgeOutputFormat = raw.edge?.outputFormat?.trim();
|
||||
const auto = normalizeTtsAutoMode(raw.auto) ?? (raw.enabled ? "always" : "off");
|
||||
return {
|
||||
enabled: raw.enabled ?? false,
|
||||
auto,
|
||||
mode: raw.mode ?? "final",
|
||||
provider: raw.provider ?? "edge",
|
||||
providerSource,
|
||||
@@ -279,17 +294,48 @@ export function resolveTtsPrefsPath(config: ResolvedTtsConfig): string {
|
||||
return path.join(CONFIG_DIR, "settings", "tts.json");
|
||||
}
|
||||
|
||||
/**
 * Derives the auto mode stored in user prefs.
 *
 * A valid `tts.auto` value wins. Otherwise a boolean `tts.enabled` flag is
 * mapped to "always" (true) or "off" (false).
 *
 * @param prefs - Parsed user preferences.
 * @returns The stored mode, or `undefined` when prefs carry neither setting.
 */
function resolveTtsAutoModeFromPrefs(prefs: TtsUserPrefs): TtsAutoMode | undefined {
  const explicitMode = normalizeTtsAutoMode(prefs.tts?.auto);
  if (explicitMode !== undefined) return explicitMode;

  const enabledFlag = prefs.tts?.enabled;
  switch (enabledFlag) {
    case true:
      return "always";
    case false:
      return "off";
    default:
      return undefined;
  }
}
|
||||
|
||||
/**
 * Picks the effective TTS auto mode.
 *
 * Precedence: a valid session override, then the mode stored in the prefs
 * file, then the resolved config's `auto`. The prefs file is only read when
 * the session override is missing or invalid.
 *
 * @param params.config - Resolved TTS config supplying the fallback mode.
 * @param params.prefsPath - Path of the prefs file to consult.
 * @param params.sessionAuto - Optional per-session override (normalized before use).
 * @returns The mode to apply.
 */
export function resolveTtsAutoMode(params: {
  config: ResolvedTtsConfig;
  prefsPath: string;
  sessionAuto?: string;
}): TtsAutoMode {
  const { config, prefsPath, sessionAuto } = params;
  return (
    normalizeTtsAutoMode(sessionAuto) ??
    resolveTtsAutoModeFromPrefs(readPrefs(prefsPath)) ??
    config.auto
  );
}
|
||||
|
||||
/**
 * Builds the system-prompt hint that tells the model how voice output works.
 *
 * @param cfg - Application config from which the TTS config is resolved.
 * @returns A newline-joined hint, or `undefined` when the effective auto mode is "off".
 */
export function buildTtsSystemPromptHint(cfg: ClawdbotConfig): string | undefined {
  const config = resolveTtsConfig(cfg);
  const prefsPath = resolveTtsPrefsPath(config);

  // One resolution covers the enabled check: "off" is the only disabled mode.
  const autoMode = resolveTtsAutoMode({ config, prefsPath });
  if (autoMode === "off") return undefined;

  const maxLength = getTtsMaxLength(prefsPath);
  const summarize = isSummarizationEnabled(prefsPath) ? "on" : "off";

  // Only the gated modes need an extra sentence; "always" has none.
  const autoHint =
    autoMode === "inbound"
      ? "Only use TTS when the user's last message includes audio/voice."
      : autoMode === "tagged"
        ? "Only use TTS when you include [[tts]] or [[tts:text]] tags."
        : undefined;

  return [
    "Voice (TTS) is enabled.",
    autoHint,
    `Keep spoken text ≤${maxLength} chars to avoid auto-summary (summary ${summarize}).`,
    "Use [[tts:...]] and optional [[tts:text]]...[[/tts:text]] to control voice/expressiveness.",
  ]
    .filter(Boolean) // drop the missing autoHint so no blank line is emitted
    .join("\n");
}
|
||||
|
||||
function readPrefs(prefsPath: string): TtsUserPrefs {
|
||||
@@ -323,16 +369,25 @@ function updatePrefs(prefsPath: string, update: (prefs: TtsUserPrefs) => void):
|
||||
atomicWriteFileSync(prefsPath, JSON.stringify(prefs, null, 2));
|
||||
}
|
||||
|
||||
/**
 * Reports whether TTS is active, i.e. the effective auto mode is anything but "off".
 *
 * @param config - Resolved TTS config (fallback mode).
 * @param prefsPath - Path of the user prefs file.
 * @param sessionAuto - Optional per-session mode override.
 * @returns `true` unless the resolved auto mode is "off".
 */
export function isTtsEnabled(
  config: ResolvedTtsConfig,
  prefsPath: string,
  sessionAuto?: string,
): boolean {
  return resolveTtsAutoMode({ config, prefsPath, sessionAuto }) !== "off";
}
|
||||
|
||||
/**
 * Persists the chosen auto mode in the prefs file.
 *
 * The legacy `enabled` flag is removed in the same update so that `auto` is
 * the only stored setting.
 *
 * @param prefsPath - Path of the prefs file to update.
 * @param mode - Auto mode to store.
 */
export function setTtsAutoMode(prefsPath: string, mode: TtsAutoMode): void {
  updatePrefs(prefsPath, (prefs) => {
    const updated = { ...prefs.tts, auto: mode };
    delete updated.enabled;
    prefs.tts = updated;
  });
}
|
||||
|
||||
/**
 * Boolean toggle kept for callers that only know on/off.
 *
 * Maps `true` to the "always" auto mode and `false` to "off", then stores it
 * through setTtsAutoMode. No separate `enabled` write is made, because
 * setTtsAutoMode removes that flag from the stored prefs anyway.
 *
 * @param prefsPath - Path of the prefs file to update.
 * @param enabled - Whether TTS should be on.
 */
export function setTtsEnabled(prefsPath: string, enabled: boolean): void {
  setTtsAutoMode(prefsPath, enabled ? "always" : "off");
}
|
||||
|
||||
export function getTtsProvider(config: ResolvedTtsConfig, prefsPath: string): TtsProvider {
|
||||
@@ -485,15 +540,17 @@ function parseTtsDirectives(
|
||||
policy: ResolvedTtsModelOverrides,
|
||||
): TtsDirectiveParseResult {
|
||||
if (!policy.enabled) {
|
||||
return { cleanedText: text, overrides: {}, warnings: [] };
|
||||
return { cleanedText: text, overrides: {}, warnings: [], hasDirective: false };
|
||||
}
|
||||
|
||||
const overrides: TtsDirectiveOverrides = {};
|
||||
const warnings: string[] = [];
|
||||
let cleanedText = text;
|
||||
let hasDirective = false;
|
||||
|
||||
const blockRegex = /\[\[tts:text\]\]([\s\S]*?)\[\[\/tts:text\]\]/gi;
|
||||
cleanedText = cleanedText.replace(blockRegex, (_match, inner: string) => {
|
||||
hasDirective = true;
|
||||
if (policy.allowText && overrides.ttsText == null) {
|
||||
overrides.ttsText = inner.trim();
|
||||
}
|
||||
@@ -502,6 +559,7 @@ function parseTtsDirectives(
|
||||
|
||||
const directiveRegex = /\[\[tts:([^\]]+)\]\]/gi;
|
||||
cleanedText = cleanedText.replace(directiveRegex, (_match, body: string) => {
|
||||
hasDirective = true;
|
||||
const tokens = body.split(/\s+/).filter(Boolean);
|
||||
for (const token of tokens) {
|
||||
const eqIndex = token.indexOf("=");
|
||||
@@ -672,6 +730,7 @@ function parseTtsDirectives(
|
||||
return {
|
||||
cleanedText,
|
||||
ttsText: overrides.ttsText,
|
||||
hasDirective,
|
||||
overrides,
|
||||
warnings,
|
||||
};
|
||||
@@ -1156,13 +1215,17 @@ export async function maybeApplyTtsToPayload(params: {
|
||||
cfg: ClawdbotConfig;
|
||||
channel?: string;
|
||||
kind?: "tool" | "block" | "final";
|
||||
inboundAudio?: boolean;
|
||||
ttsAuto?: string;
|
||||
}): Promise<ReplyPayload> {
|
||||
const config = resolveTtsConfig(params.cfg);
|
||||
const prefsPath = resolveTtsPrefsPath(config);
|
||||
if (!isTtsEnabled(config, prefsPath)) return params.payload;
|
||||
|
||||
const mode = config.mode ?? "final";
|
||||
if (mode === "final" && params.kind && params.kind !== "final") return params.payload;
|
||||
const autoMode = resolveTtsAutoMode({
|
||||
config,
|
||||
prefsPath,
|
||||
sessionAuto: params.ttsAuto,
|
||||
});
|
||||
if (autoMode === "off") return params.payload;
|
||||
|
||||
const text = params.payload.text ?? "";
|
||||
const directives = parseTtsDirectives(text, config.modelOverrides);
|
||||
@@ -1183,6 +1246,12 @@ export async function maybeApplyTtsToPayload(params: {
|
||||
text: visibleText.length > 0 ? visibleText : undefined,
|
||||
};
|
||||
|
||||
if (autoMode === "tagged" && !directives.hasDirective) return nextPayload;
|
||||
if (autoMode === "inbound" && params.inboundAudio !== true) return nextPayload;
|
||||
|
||||
const mode = config.mode ?? "final";
|
||||
if (mode === "final" && params.kind && params.kind !== "final") return nextPayload;
|
||||
|
||||
if (!ttsText.trim()) return nextPayload;
|
||||
if (params.payload.mediaUrl || (params.payload.mediaUrls?.length ?? 0) > 0) return nextPayload;
|
||||
if (text.includes("MEDIA:")) return nextPayload;
|
||||
@@ -1197,7 +1266,7 @@ export async function maybeApplyTtsToPayload(params: {
|
||||
logVerbose(
|
||||
`TTS: skipping long text (${textForAudio.length} > ${maxLength}), summarization disabled.`,
|
||||
);
|
||||
return params.payload;
|
||||
return nextPayload;
|
||||
}
|
||||
|
||||
try {
|
||||
@@ -1219,7 +1288,7 @@ export async function maybeApplyTtsToPayload(params: {
|
||||
} catch (err) {
|
||||
const error = err as Error;
|
||||
logVerbose(`TTS: summarization failed: ${error.message}`);
|
||||
return params.payload;
|
||||
return nextPayload;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user