ResponsibleAI - Input Moderation - Simplifying the Options for OpenAI Moderation.

2026-06-28 17:01:00 +03:00 · 2023-11-22 19:19:59 +05:30
parent ad8281e553
commit 619fb4f5c1
2 changed files with 5 additions and 144 deletions
@@ -3,19 +3,7 @@ import { BaseLanguageModel } from 'langchain/base_language'
 import { OpenAIModerationChain } from 'langchain/chains'

 export class OpenAIModerationRunner implements Moderation {
-    private moderationConfig: string = 'useDefault'
    private moderationErrorMessage: string = "Text was found that violates OpenAI's content policy."
-    private sexual: number = 0.01
-    private sexualMinors: number = 0.01
-    private hate: number = 0.01
-    private hateThreatening: number = 0.01
-    private harassment: number = 0.01
-    private harassmentThreatening: number = 0.01
-    private selfHarm: number = 0.01
-    private selfHarmIntent: number = 0.01
-    private selfHarmInstructions: number = 0.01
-    private violence: number = 0.01
-    private violenceGraphic: number = 0.01

    async checkForViolations(llm: BaseLanguageModel, input: string): Promise<string> {
        const openAIApiKey = (llm as any).openAIApiKey
@@ -31,32 +19,13 @@ export class OpenAIModerationRunner implements Moderation {
        const { output: moderationOutput, results } = await moderation.call({
            input: input
        })
-        if (this.moderationConfig != 'useCustom' && results[0].flagged) {
+        if (results[0].flagged) {
            throw Error(this.moderationErrorMessage)
        }
-        if (this.moderationConfig != 'useDefault') {
-            const categoryScores = results[0].category_scores
-            if (
-                categoryScores['harassment'] > this.harassment ||
-                categoryScores['harassment/threatening'] > this.harassmentThreatening ||
-                categoryScores['self-harm'] > this.selfHarm ||
-                categoryScores['self-harm/intent'] > this.selfHarmIntent ||
-                categoryScores['self-harm/instructions'] > this.selfHarmInstructions ||
-                categoryScores['sexual'] > this.sexual ||
-                categoryScores['sexual/minors'] > this.sexualMinors ||
-                categoryScores['hate'] > this.hate ||
-                categoryScores['hate/threatening'] > this.hateThreatening ||
-                categoryScores['violence'] > this.violence ||
-                categoryScores['violence/graphic'] > this.violenceGraphic
-            ) {
-                throw Error(this.moderationErrorMessage)
-            }
-        }
        return moderationOutput
    }

-    setParameter(category: string, value: number) {
-        // @ts-ignore
-        this[category] = value
+    setErrorMessage(message: string) {
+        this.moderationErrorMessage = message
    }
 }