fix(media): strip audio attachments after successful transcription (openclaw#9076) thanks @nobrainer-tech

Verified:
- pnpm install --frozen-lockfile
- pnpm build
- pnpm check
- pnpm test (fails in a known, unrelated telegram suite)
- pnpm vitest run src/auto-reply/media-note.test.ts src/auto-reply/reply.media-note.test.ts

Co-authored-by: nobrainer-tech <445466+nobrainer-tech@users.noreply.github.com>
2026-06-29 01:02:03 +03:00 · 2026-02-13 02:01:53 +01:00
parent a6003d6711
commit 7081dee1af
2 changed files with 151 additions and 1 deletions
@@ -106,4 +106,93 @@ describe("buildInboundMediaNote", () => {
    });
    expect(note).toBe("[media attached: /tmp/b.png | https://example.com/b.png]");
  });
  // Mixed input: one audio file (index 0) and one image (index 1). The only
  // MediaUnderstanding entry is an audio transcription pointing at index 0,
  // so the expected note lists the image alone.
  it("strips audio attachments when transcription succeeded via MediaUnderstanding (issue #4197)", () => {
    const note = buildInboundMediaNote({
      MediaPaths: ["/tmp/voice.ogg", "/tmp/image.png"],
      MediaUrls: ["https://example.com/voice.ogg", "https://example.com/image.png"],
      MediaTypes: ["audio/ogg", "image/png"],
      MediaUnderstanding: [
        {
          kind: "audio.transcription",
          attachmentIndex: 0,
          text: "Hello world",
          provider: "whisper",
        },
      ],
    });
    // Audio attachment should be stripped (already transcribed), image should remain
    expect(note).toBe(
      "[media attached: /tmp/image.png (image/png) | https://example.com/image.png]",
    );
  });
  // Two audio files, but only index 0 has a transcription entry. The expected
  // note still lists the second file, i.e. stripping is per attachment index.
  it("only strips audio attachments that were transcribed", () => {
    const note = buildInboundMediaNote({
      MediaPaths: ["/tmp/voice-1.ogg", "/tmp/voice-2.ogg"],
      MediaUrls: ["https://example.com/voice-1.ogg", "https://example.com/voice-2.ogg"],
      MediaTypes: ["audio/ogg", "audio/ogg"],
      MediaUnderstanding: [
        {
          kind: "audio.transcription",
          attachmentIndex: 0,
          text: "First transcript",
          provider: "whisper",
        },
      ],
    });
    // Only the untranscribed second voice file is expected to survive.
    expect(note).toBe(
      "[media attached: /tmp/voice-2.ogg (audio/ogg) | https://example.com/voice-2.ogg]",
    );
  });
  // Transcript-only fallback: no MediaUnderstanding entries are supplied, just
  // a top-level Transcript alongside a single audio attachment.
  it("strips audio attachments when Transcript is present (issue #4197)", () => {
    const note = buildInboundMediaNote({
      MediaPaths: ["/tmp/voice.opus"],
      MediaTypes: ["audio/opus"],
      Transcript: "Hello world from Whisper",
    });
    // Audio should be stripped when transcript is available; with nothing left
    // to list, no note is produced at all.
    expect(note).toBeUndefined();
  });
  // Counterpart to the single-attachment Transcript case: with two audio files
  // and only a top-level Transcript (no per-attachment index), both files are
  // expected to stay in the note.
  it("does not strip multiple audio attachments using transcript-only fallback", () => {
    const note = buildInboundMediaNote({
      MediaPaths: ["/tmp/voice-1.ogg", "/tmp/voice-2.ogg"],
      MediaTypes: ["audio/ogg", "audio/ogg"],
      Transcript: "Transcript text without per-attachment mapping",
    });
    // Multi-attachment format: a count header followed by one numbered line per file.
    expect(note).toBe(
      [
        "[media attached: 2 files]",
        "[media attached 1/2: /tmp/voice-1.ogg (audio/ogg)]",
        "[media attached 2/2: /tmp/voice-2.ogg (audio/ogg)]",
      ].join("\n"),
    );
  });
  // No MediaTypes are supplied here, so the audio file at index 0 can only be
  // recognised by its ".ogg" extension.
  it("strips audio by extension even without mime type (issue #4197)", () => {
    const note = buildInboundMediaNote({
      MediaPaths: ["/tmp/voice_message.ogg", "/tmp/document.pdf"],
      MediaUnderstanding: [
        {
          kind: "audio.transcription",
          attachmentIndex: 0,
          text: "Transcribed audio content",
          provider: "whisper",
        },
      ],
    });
    // Only PDF should remain, audio stripped by extension
    expect(note).toBe("[media attached: /tmp/document.pdf]");
  });
  // Baseline: neither MediaUnderstanding nor Transcript is provided, so the
  // audio attachment is expected to be listed as usual.
  it("keeps audio attachments when no transcription available", () => {
    const note = buildInboundMediaNote({
      MediaPaths: ["/tmp/voice.ogg"],
      MediaTypes: ["audio/ogg"],
    });
    // No transcription = keep audio attachment as fallback
    expect(note).toBe("[media attached: /tmp/voice.ogg (audio/ogg)]");
  });
 });
@@ -17,12 +17,45 @@ function formatMediaAttachedLine(params: {
  return `${prefix}${params.path}${typePart}${urlPart}]`;
 }
 // File-name suffixes (lower-case, leading dot included) treated as audio when
 // deciding whether an attachment is an audio file.
 const AUDIO_EXTENSIONS = new Set([
  ".ogg",
  ".opus",
  ".mp3",
  ".m4a",
  ".wav",
  ".webm",
  ".flac",
  ".aac",
  ".wma",
  ".aiff",
  ".alac",
  ".oga",
 ]);
 /**
  * Reports whether `path` ends with one of the suffixes in AUDIO_EXTENSIONS.
  *
  * The comparison is case-insensitive. A missing or empty path is never audio.
  *
  * @param path - Attachment path to inspect; may be undefined.
  * @returns true when the lower-cased path ends with a known audio suffix.
  */
 function isAudioPath(path: string | undefined): boolean {
  // An absent path normalises to "", which cannot end with any suffix.
  const normalized = (path ?? "").toLowerCase();
  return Array.from(AUDIO_EXTENSIONS).some((suffix) => normalized.endsWith(suffix));
 }
 export function buildInboundMediaNote(ctx: MsgContext): string | undefined {
  // Attachment indices follow MediaPaths/MediaUrls ordering as supplied by the channel.
  const suppressed = new Set<number>();
  const transcribedAudioIndices = new Set<number>();
  if (Array.isArray(ctx.MediaUnderstanding)) {
    for (const output of ctx.MediaUnderstanding) {
      suppressed.add(output.attachmentIndex);
      if (output.kind === "audio.transcription") {
        transcribedAudioIndices.add(output.attachmentIndex);
      }
    }
  }
  if (Array.isArray(ctx.MediaUnderstandingDecisions)) {
@@ -33,6 +66,9 @@ export function buildInboundMediaNote(ctx: MsgContext): string | undefined {
      for (const attachment of decision.attachments) {
        if (attachment.chosen?.outcome === "success") {
          suppressed.add(attachment.attachmentIndex);
          if (decision.capability === "audio") {
            transcribedAudioIndices.add(attachment.attachmentIndex);
          }
        }
      }
    }
@@ -56,6 +92,10 @@ export function buildInboundMediaNote(ctx: MsgContext): string | undefined {
    Array.isArray(ctx.MediaTypes) && ctx.MediaTypes.length === paths.length
      ? ctx.MediaTypes
      : undefined;
  const hasTranscript = Boolean(ctx.Transcript?.trim());
  // Transcript alone does not identify an attachment index; only use it as a fallback
  // when there is a single attachment to avoid stripping unrelated audio files.
  const canStripSingleAttachmentByTranscript = hasTranscript && paths.length === 1;
  const entries = paths
    .map((entry, index) => ({
@@ -64,7 +104,28 @@ export function buildInboundMediaNote(ctx: MsgContext): string | undefined {
      url: urls?.[index] ?? ctx.MediaUrl,
      index,
    }))
-    .filter((entry) => !suppressed.has(entry.index));
+    .filter((entry) => {
      if (suppressed.has(entry.index)) {
        return false;
      }
      // Strip audio attachments when transcription succeeded - the transcript is already
      // available in the context, raw audio binary would only waste tokens (issue #4197)
      // Note: Only trust MIME type from per-entry types array, not fallback ctx.MediaType
      // which could misclassify non-audio attachments (greptile review feedback)
      const hasPerEntryType = types !== undefined;
      const isAudioByMime = hasPerEntryType && entry.type?.toLowerCase().startsWith("audio/");
      const isAudioEntry = isAudioPath(entry.path) || isAudioByMime;
      if (!isAudioEntry) {
        return true;
      }
      if (
        transcribedAudioIndices.has(entry.index) ||
        (canStripSingleAttachmentByTranscript && entry.index === 0)
      ) {
        return false;
      }
      return true;
    });
  if (entries.length === 0) {
    return undefined;
  }