TTS: gate auto audio on inbound voice notes (#1667)

Co-authored-by: Sebastian <sebslight@gmail.com>
This commit is contained in:
Seb Slight
2026-01-24 23:35:20 -05:00
committed by GitHub
parent ede5145191
commit d4f60bf16a
20 changed files with 433 additions and 63 deletions
+128 -1
View File
@@ -4,7 +4,7 @@ import { completeSimple } from "@mariozechner/pi-ai";
import { getApiKeyForModel } from "../agents/model-auth.js";
import { resolveModel } from "../agents/pi-embedded-runner/model.js";
import { _test, getTtsProvider, resolveTtsConfig } from "./tts.js";
import * as tts from "./tts.js";
vi.mock("@mariozechner/pi-ai", () => ({
completeSimple: vi.fn(),
@@ -37,6 +37,8 @@ vi.mock("../agents/model-auth.js", () => ({
requireApiKey: vi.fn((auth: { apiKey?: string }) => auth.apiKey ?? ""),
}));
const { _test, resolveTtsConfig, maybeApplyTtsToPayload, getTtsProvider } = tts;
const {
isValidVoiceId,
isValidOpenAIVoice,
@@ -431,4 +433,129 @@ describe("tts", () => {
);
});
});
describe("maybeApplyTtsToPayload", () => {
const baseCfg = {
agents: { defaults: { model: { primary: "openai/gpt-4o-mini" } } },
messages: {
tts: {
auto: "inbound",
provider: "openai",
openai: { apiKey: "test-key", model: "gpt-4o-mini-tts", voice: "alloy" },
},
},
};
it("skips auto-TTS when inbound audio gating is on and the message is not audio", async () => {
const prevPrefs = process.env.CLAWDBOT_TTS_PREFS;
process.env.CLAWDBOT_TTS_PREFS = `/tmp/tts-test-${Date.now()}.json`;
const originalFetch = globalThis.fetch;
const fetchMock = vi.fn(async () => ({
ok: true,
arrayBuffer: async () => new ArrayBuffer(1),
}));
globalThis.fetch = fetchMock as unknown as typeof fetch;
const payload = { text: "Hello world" };
const result = await maybeApplyTtsToPayload({
payload,
cfg: baseCfg,
kind: "final",
inboundAudio: false,
});
expect(result).toBe(payload);
expect(fetchMock).not.toHaveBeenCalled();
globalThis.fetch = originalFetch;
process.env.CLAWDBOT_TTS_PREFS = prevPrefs;
});
it("attempts auto-TTS when inbound audio gating is on and the message is audio", async () => {
const prevPrefs = process.env.CLAWDBOT_TTS_PREFS;
process.env.CLAWDBOT_TTS_PREFS = `/tmp/tts-test-${Date.now()}.json`;
const originalFetch = globalThis.fetch;
const fetchMock = vi.fn(async () => ({
ok: true,
arrayBuffer: async () => new ArrayBuffer(1),
}));
globalThis.fetch = fetchMock as unknown as typeof fetch;
const result = await maybeApplyTtsToPayload({
payload: { text: "Hello world" },
cfg: baseCfg,
kind: "final",
inboundAudio: true,
});
expect(result.mediaUrl).toBeDefined();
expect(fetchMock).toHaveBeenCalledTimes(1);
globalThis.fetch = originalFetch;
process.env.CLAWDBOT_TTS_PREFS = prevPrefs;
});
it("skips auto-TTS in tagged mode unless a tts tag is present", async () => {
const prevPrefs = process.env.CLAWDBOT_TTS_PREFS;
process.env.CLAWDBOT_TTS_PREFS = `/tmp/tts-test-${Date.now()}.json`;
const originalFetch = globalThis.fetch;
const fetchMock = vi.fn(async () => ({
ok: true,
arrayBuffer: async () => new ArrayBuffer(1),
}));
globalThis.fetch = fetchMock as unknown as typeof fetch;
const cfg = {
...baseCfg,
messages: {
...baseCfg.messages,
tts: { ...baseCfg.messages.tts, auto: "tagged" },
},
};
const payload = { text: "Hello world" };
const result = await maybeApplyTtsToPayload({
payload,
cfg,
kind: "final",
});
expect(result).toBe(payload);
expect(fetchMock).not.toHaveBeenCalled();
globalThis.fetch = originalFetch;
process.env.CLAWDBOT_TTS_PREFS = prevPrefs;
});
it("runs auto-TTS in tagged mode when tags are present", async () => {
const prevPrefs = process.env.CLAWDBOT_TTS_PREFS;
process.env.CLAWDBOT_TTS_PREFS = `/tmp/tts-test-${Date.now()}.json`;
const originalFetch = globalThis.fetch;
const fetchMock = vi.fn(async () => ({
ok: true,
arrayBuffer: async () => new ArrayBuffer(1),
}));
globalThis.fetch = fetchMock as unknown as typeof fetch;
const cfg = {
...baseCfg,
messages: {
...baseCfg.messages,
tts: { ...baseCfg.messages.tts, auto: "tagged" },
},
};
const result = await maybeApplyTtsToPayload({
payload: { text: "[[tts:text]]Hello world[[/tts:text]]" },
cfg,
kind: "final",
});
expect(result.mediaUrl).toBeDefined();
expect(fetchMock).toHaveBeenCalledTimes(1);
globalThis.fetch = originalFetch;
process.env.CLAWDBOT_TTS_PREFS = prevPrefs;
});
});
});
+87 -18
View File
@@ -20,6 +20,7 @@ import type { ChannelId } from "../channels/plugins/types.js";
import type { ClawdbotConfig } from "../config/config.js";
import type {
TtsConfig,
TtsAutoMode,
TtsMode,
TtsProvider,
TtsModelOverrideConfig,
@@ -75,8 +76,10 @@ const DEFAULT_OUTPUT = {
voiceCompatible: false,
};
const TTS_AUTO_MODES = new Set<TtsAutoMode>(["off", "always", "inbound", "tagged"]);
export type ResolvedTtsConfig = {
enabled: boolean;
auto: TtsAutoMode;
mode: TtsMode;
provider: TtsProvider;
providerSource: "config" | "default";
@@ -123,6 +126,7 @@ export type ResolvedTtsConfig = {
type TtsUserPrefs = {
tts?: {
auto?: TtsAutoMode;
enabled?: boolean;
provider?: TtsProvider;
maxLength?: number;
@@ -161,6 +165,7 @@ type TtsDirectiveOverrides = {
type TtsDirectiveParseResult = {
cleanedText: string;
ttsText?: string;
hasDirective: boolean;
overrides: TtsDirectiveOverrides;
warnings: string[];
};
@@ -187,6 +192,15 @@ type TtsStatusEntry = {
let lastTtsAttempt: TtsStatusEntry | undefined;
export function normalizeTtsAutoMode(value: unknown): TtsAutoMode | undefined {
if (typeof value !== "string") return undefined;
const normalized = value.trim().toLowerCase();
if (TTS_AUTO_MODES.has(normalized as TtsAutoMode)) {
return normalized as TtsAutoMode;
}
return undefined;
}
function resolveModelOverridePolicy(
overrides: TtsModelOverrideConfig | undefined,
): ResolvedTtsModelOverrides {
@@ -220,8 +234,9 @@ export function resolveTtsConfig(cfg: ClawdbotConfig): ResolvedTtsConfig {
const raw: TtsConfig = cfg.messages?.tts ?? {};
const providerSource = raw.provider ? "config" : "default";
const edgeOutputFormat = raw.edge?.outputFormat?.trim();
const auto = normalizeTtsAutoMode(raw.auto) ?? (raw.enabled ? "always" : "off");
return {
enabled: raw.enabled ?? false,
auto,
mode: raw.mode ?? "final",
provider: raw.provider ?? "edge",
providerSource,
@@ -279,17 +294,48 @@ export function resolveTtsPrefsPath(config: ResolvedTtsConfig): string {
return path.join(CONFIG_DIR, "settings", "tts.json");
}
function resolveTtsAutoModeFromPrefs(prefs: TtsUserPrefs): TtsAutoMode | undefined {
const auto = normalizeTtsAutoMode(prefs.tts?.auto);
if (auto) return auto;
if (typeof prefs.tts?.enabled === "boolean") {
return prefs.tts.enabled ? "always" : "off";
}
return undefined;
}
export function resolveTtsAutoMode(params: {
config: ResolvedTtsConfig;
prefsPath: string;
sessionAuto?: string;
}): TtsAutoMode {
const sessionAuto = normalizeTtsAutoMode(params.sessionAuto);
if (sessionAuto) return sessionAuto;
const prefsAuto = resolveTtsAutoModeFromPrefs(readPrefs(params.prefsPath));
if (prefsAuto) return prefsAuto;
return params.config.auto;
}
export function buildTtsSystemPromptHint(cfg: ClawdbotConfig): string | undefined {
const config = resolveTtsConfig(cfg);
const prefsPath = resolveTtsPrefsPath(config);
if (!isTtsEnabled(config, prefsPath)) return undefined;
const autoMode = resolveTtsAutoMode({ config, prefsPath });
if (autoMode === "off") return undefined;
const maxLength = getTtsMaxLength(prefsPath);
const summarize = isSummarizationEnabled(prefsPath) ? "on" : "off";
const autoHint =
autoMode === "inbound"
? "Only use TTS when the user's last message includes audio/voice."
: autoMode === "tagged"
? "Only use TTS when you include [[tts]] or [[tts:text]] tags."
: undefined;
return [
"Voice (TTS) is enabled.",
autoHint,
`Keep spoken text ≤${maxLength} chars to avoid auto-summary (summary ${summarize}).`,
"Use [[tts:...]] and optional [[tts:text]]...[[/tts:text]] to control voice/expressiveness.",
].join("\n");
]
.filter(Boolean)
.join("\n");
}
function readPrefs(prefsPath: string): TtsUserPrefs {
@@ -323,16 +369,25 @@ function updatePrefs(prefsPath: string, update: (prefs: TtsUserPrefs) => void):
atomicWriteFileSync(prefsPath, JSON.stringify(prefs, null, 2));
}
export function isTtsEnabled(config: ResolvedTtsConfig, prefsPath: string): boolean {
const prefs = readPrefs(prefsPath);
if (prefs.tts?.enabled !== undefined) return prefs.tts.enabled === true;
return config.enabled;
export function isTtsEnabled(
config: ResolvedTtsConfig,
prefsPath: string,
sessionAuto?: string,
): boolean {
return resolveTtsAutoMode({ config, prefsPath, sessionAuto }) !== "off";
}
export function setTtsAutoMode(prefsPath: string, mode: TtsAutoMode): void {
updatePrefs(prefsPath, (prefs) => {
const next = { ...prefs.tts };
delete next.enabled;
next.auto = mode;
prefs.tts = next;
});
}
export function setTtsEnabled(prefsPath: string, enabled: boolean): void {
updatePrefs(prefsPath, (prefs) => {
prefs.tts = { ...prefs.tts, enabled };
});
setTtsAutoMode(prefsPath, enabled ? "always" : "off");
}
export function getTtsProvider(config: ResolvedTtsConfig, prefsPath: string): TtsProvider {
@@ -485,15 +540,17 @@ function parseTtsDirectives(
policy: ResolvedTtsModelOverrides,
): TtsDirectiveParseResult {
if (!policy.enabled) {
return { cleanedText: text, overrides: {}, warnings: [] };
return { cleanedText: text, overrides: {}, warnings: [], hasDirective: false };
}
const overrides: TtsDirectiveOverrides = {};
const warnings: string[] = [];
let cleanedText = text;
let hasDirective = false;
const blockRegex = /\[\[tts:text\]\]([\s\S]*?)\[\[\/tts:text\]\]/gi;
cleanedText = cleanedText.replace(blockRegex, (_match, inner: string) => {
hasDirective = true;
if (policy.allowText && overrides.ttsText == null) {
overrides.ttsText = inner.trim();
}
@@ -502,6 +559,7 @@ function parseTtsDirectives(
const directiveRegex = /\[\[tts:([^\]]+)\]\]/gi;
cleanedText = cleanedText.replace(directiveRegex, (_match, body: string) => {
hasDirective = true;
const tokens = body.split(/\s+/).filter(Boolean);
for (const token of tokens) {
const eqIndex = token.indexOf("=");
@@ -672,6 +730,7 @@ function parseTtsDirectives(
return {
cleanedText,
ttsText: overrides.ttsText,
hasDirective,
overrides,
warnings,
};
@@ -1156,13 +1215,17 @@ export async function maybeApplyTtsToPayload(params: {
cfg: ClawdbotConfig;
channel?: string;
kind?: "tool" | "block" | "final";
inboundAudio?: boolean;
ttsAuto?: string;
}): Promise<ReplyPayload> {
const config = resolveTtsConfig(params.cfg);
const prefsPath = resolveTtsPrefsPath(config);
if (!isTtsEnabled(config, prefsPath)) return params.payload;
const mode = config.mode ?? "final";
if (mode === "final" && params.kind && params.kind !== "final") return params.payload;
const autoMode = resolveTtsAutoMode({
config,
prefsPath,
sessionAuto: params.ttsAuto,
});
if (autoMode === "off") return params.payload;
const text = params.payload.text ?? "";
const directives = parseTtsDirectives(text, config.modelOverrides);
@@ -1183,6 +1246,12 @@ export async function maybeApplyTtsToPayload(params: {
text: visibleText.length > 0 ? visibleText : undefined,
};
if (autoMode === "tagged" && !directives.hasDirective) return nextPayload;
if (autoMode === "inbound" && params.inboundAudio !== true) return nextPayload;
const mode = config.mode ?? "final";
if (mode === "final" && params.kind && params.kind !== "final") return nextPayload;
if (!ttsText.trim()) return nextPayload;
if (params.payload.mediaUrl || (params.payload.mediaUrls?.length ?? 0) > 0) return nextPayload;
if (text.includes("MEDIA:")) return nextPayload;
@@ -1197,7 +1266,7 @@ export async function maybeApplyTtsToPayload(params: {
logVerbose(
`TTS: skipping long text (${textForAudio.length} > ${maxLength}), summarization disabled.`,
);
return params.payload;
return nextPayload;
}
try {
@@ -1219,7 +1288,7 @@ export async function maybeApplyTtsToPayload(params: {
} catch (err) {
const error = err as Error;
logVerbose(`TTS: summarization failed: ${error.message}`);
return params.payload;
return nextPayload;
}
}