Bugfix/Remove Postgres vector store data on deletion (#5536)

Remove Postgres vector store data on deletion

- Introduced a new `doc_id` column in MySQL, Postgres, and SQLite record managers to support document identification.
- Updated the `update` method to handle both string and object formats for keys, allowing for better flexibility in document updates.
- Enhanced `listKeys` method to filter by `doc_id` when provided in options.
- Updated vector store integrations to utilize the new `doc_id` filtering capability.
This commit is contained in:
Henry Heng
2025-11-30 12:01:36 +00:00
committed by GitHub
parent e6e0c2d07b
commit 465005a503
20 changed files with 620 additions and 217 deletions
@@ -62,7 +62,6 @@ class MySQLRecordManager_RecordManager implements INode {
label: 'Namespace',
name: 'namespace',
type: 'string',
description: 'If not specified, chatflowid will be used',
additionalParams: true,
optional: true
},
@@ -219,7 +218,16 @@ class MySQLRecordManager implements RecordManagerInterface {
unique key \`unique_key_namespace\` (\`key\`,
\`namespace\`));`)
const columns = [`updated_at`, `key`, `namespace`, `group_id`]
// Add doc_id column if it doesn't exist (migration for existing tables)
const checkColumn = await queryRunner.manager.query(
`SELECT COUNT(1) ColumnExists FROM INFORMATION_SCHEMA.COLUMNS
WHERE table_schema=DATABASE() AND table_name='${tableName}' AND column_name='doc_id';`
)
if (checkColumn[0].ColumnExists === 0) {
await queryRunner.manager.query(`ALTER TABLE \`${tableName}\` ADD COLUMN \`doc_id\` longtext;`)
}
const columns = [`updated_at`, `key`, `namespace`, `group_id`, `doc_id`]
for (const column of columns) {
// MySQL does not support 'IF NOT EXISTS' function for Index
const Check = await queryRunner.manager.query(
@@ -261,7 +269,7 @@ class MySQLRecordManager implements RecordManagerInterface {
}
}
async update(keys: string[], updateOptions?: UpdateOptions): Promise<void> {
async update(keys: Array<{ uid: string; docId: string }> | string[], updateOptions?: UpdateOptions): Promise<void> {
if (keys.length === 0) {
return
}
@@ -277,23 +285,23 @@ class MySQLRecordManager implements RecordManagerInterface {
throw new Error(`Time sync issue with database ${updatedAt} < ${timeAtLeast}`)
}
const groupIds = _groupIds ?? keys.map(() => null)
// Handle both new format (objects with uid and docId) and old format (strings)
const isNewFormat = keys.length > 0 && typeof keys[0] === 'object' && 'uid' in keys[0]
const keyStrings = isNewFormat ? (keys as Array<{ uid: string; docId: string }>).map((k) => k.uid) : (keys as string[])
const docIds = isNewFormat ? (keys as Array<{ uid: string; docId: string }>).map((k) => k.docId) : keys.map(() => null)
if (groupIds.length !== keys.length) {
throw new Error(`Number of keys (${keys.length}) does not match number of group_ids (${groupIds.length})`)
const groupIds = _groupIds ?? keyStrings.map(() => null)
if (groupIds.length !== keyStrings.length) {
throw new Error(`Number of keys (${keyStrings.length}) does not match number of group_ids (${groupIds.length})`)
}
const recordsToUpsert = keys.map((key, i) => [
key,
this.namespace,
updatedAt,
groupIds[i] ?? null // Ensure groupIds[i] is null if undefined
])
const recordsToUpsert = keyStrings.map((key, i) => [key, this.namespace, updatedAt, groupIds[i] ?? null, docIds[i] ?? null])
const query = `
INSERT INTO \`${tableName}\` (\`key\`, \`namespace\`, \`updated_at\`, \`group_id\`)
VALUES (?, ?, ?, ?)
ON DUPLICATE KEY UPDATE \`updated_at\` = VALUES(\`updated_at\`)`
INSERT INTO \`${tableName}\` (\`key\`, \`namespace\`, \`updated_at\`, \`group_id\`, \`doc_id\`)
VALUES (?, ?, ?, ?, ?)
ON DUPLICATE KEY UPDATE \`updated_at\` = VALUES(\`updated_at\`), \`doc_id\` = VALUES(\`doc_id\`)`
// To handle multiple files upsert
try {
@@ -349,13 +357,13 @@ class MySQLRecordManager implements RecordManagerInterface {
}
}
async listKeys(options?: ListKeyOptions): Promise<string[]> {
async listKeys(options?: ListKeyOptions & { docId?: string }): Promise<string[]> {
const dataSource = await this.getDataSource()
const queryRunner = dataSource.createQueryRunner()
const tableName = this.sanitizeTableName(this.tableName)
try {
const { before, after, limit, groupIds } = options ?? {}
const { before, after, limit, groupIds, docId } = options ?? {}
let query = `SELECT \`key\` FROM \`${tableName}\` WHERE \`namespace\` = ?`
const values: (string | number | string[])[] = [this.namespace]
@@ -382,6 +390,11 @@ class MySQLRecordManager implements RecordManagerInterface {
values.push(...groupIds.filter((gid): gid is string => gid !== null))
}
if (docId) {
query += ` AND \`doc_id\` = ?`
values.push(docId)
}
query += ';'
// Directly using try/catch with async/await for cleaner flow
@@ -78,7 +78,6 @@ class PostgresRecordManager_RecordManager implements INode {
label: 'Namespace',
name: 'namespace',
type: 'string',
description: 'If not specified, chatflowid will be used',
additionalParams: true,
optional: true
},
@@ -241,6 +240,19 @@ class PostgresRecordManager implements RecordManagerInterface {
CREATE INDEX IF NOT EXISTS namespace_index ON "${tableName}" (namespace);
CREATE INDEX IF NOT EXISTS group_id_index ON "${tableName}" (group_id);`)
// Add doc_id column if it doesn't exist (migration for existing tables)
await queryRunner.manager.query(`
DO $$
BEGIN
IF NOT EXISTS (
SELECT 1 FROM information_schema.columns
WHERE table_name = '${tableName}' AND column_name = 'doc_id'
) THEN
ALTER TABLE "${tableName}" ADD COLUMN doc_id TEXT;
CREATE INDEX IF NOT EXISTS doc_id_index ON "${tableName}" (doc_id);
END IF;
END $$;`)
await queryRunner.release()
} catch (e: any) {
// This error indicates that the table already exists
@@ -286,7 +298,7 @@ class PostgresRecordManager implements RecordManagerInterface {
return `(${placeholders.join(', ')})`
}
async update(keys: string[], updateOptions?: UpdateOptions): Promise<void> {
async update(keys: Array<{ uid: string; docId: string }> | string[], updateOptions?: UpdateOptions): Promise<void> {
if (keys.length === 0) {
return
}
@@ -302,17 +314,22 @@ class PostgresRecordManager implements RecordManagerInterface {
throw new Error(`Time sync issue with database ${updatedAt} < ${timeAtLeast}`)
}
const groupIds = _groupIds ?? keys.map(() => null)
// Handle both new format (objects with uid and docId) and old format (strings)
const isNewFormat = keys.length > 0 && typeof keys[0] === 'object' && 'uid' in keys[0]
const keyStrings = isNewFormat ? (keys as Array<{ uid: string; docId: string }>).map((k) => k.uid) : (keys as string[])
const docIds = isNewFormat ? (keys as Array<{ uid: string; docId: string }>).map((k) => k.docId) : keys.map(() => null)
if (groupIds.length !== keys.length) {
throw new Error(`Number of keys (${keys.length}) does not match number of group_ids ${groupIds.length})`)
const groupIds = _groupIds ?? keyStrings.map(() => null)
if (groupIds.length !== keyStrings.length) {
throw new Error(`Number of keys (${keyStrings.length}) does not match number of group_ids ${groupIds.length})`)
}
const recordsToUpsert = keys.map((key, i) => [key, this.namespace, updatedAt, groupIds[i]])
const recordsToUpsert = keyStrings.map((key, i) => [key, this.namespace, updatedAt, groupIds[i], docIds[i]])
const valuesPlaceholders = recordsToUpsert.map((_, j) => this.generatePlaceholderForRowAt(j, recordsToUpsert[0].length)).join(', ')
const query = `INSERT INTO "${tableName}" (key, namespace, updated_at, group_id) VALUES ${valuesPlaceholders} ON CONFLICT (key, namespace) DO UPDATE SET updated_at = EXCLUDED.updated_at;`
const query = `INSERT INTO "${tableName}" (key, namespace, updated_at, group_id, doc_id) VALUES ${valuesPlaceholders} ON CONFLICT (key, namespace) DO UPDATE SET updated_at = EXCLUDED.updated_at, doc_id = EXCLUDED.doc_id;`
try {
await queryRunner.manager.query(query, recordsToUpsert.flat())
await queryRunner.release()
@@ -351,8 +368,8 @@ class PostgresRecordManager implements RecordManagerInterface {
}
}
async listKeys(options?: ListKeyOptions): Promise<string[]> {
const { before, after, limit, groupIds } = options ?? {}
async listKeys(options?: ListKeyOptions & { docId?: string }): Promise<string[]> {
const { before, after, limit, groupIds, docId } = options ?? {}
const tableName = this.sanitizeTableName(this.tableName)
let query = `SELECT key FROM "${tableName}" WHERE namespace = $1`
@@ -383,6 +400,12 @@ class PostgresRecordManager implements RecordManagerInterface {
index += 1
}
if (docId) {
values.push(docId)
query += ` AND doc_id = $${index}`
index += 1
}
query += ';'
const dataSource = await this.getDataSource()
@@ -51,7 +51,6 @@ class SQLiteRecordManager_RecordManager implements INode {
label: 'Namespace',
name: 'namespace',
type: 'string',
description: 'If not specified, chatflowid will be used',
additionalParams: true,
optional: true
},
@@ -198,6 +197,15 @@ CREATE INDEX IF NOT EXISTS key_index ON "${tableName}" (key);
CREATE INDEX IF NOT EXISTS namespace_index ON "${tableName}" (namespace);
CREATE INDEX IF NOT EXISTS group_id_index ON "${tableName}" (group_id);`)
// Add doc_id column if it doesn't exist (migration for existing tables)
const checkColumn = await queryRunner.manager.query(
`SELECT COUNT(*) as count FROM pragma_table_info('${tableName}') WHERE name='doc_id';`
)
if (checkColumn[0].count === 0) {
await queryRunner.manager.query(`ALTER TABLE "${tableName}" ADD COLUMN doc_id TEXT;`)
await queryRunner.manager.query(`CREATE INDEX IF NOT EXISTS doc_id_index ON "${tableName}" (doc_id);`)
}
await queryRunner.release()
} catch (e: any) {
// This error indicates that the table already exists
@@ -228,7 +236,7 @@ CREATE INDEX IF NOT EXISTS group_id_index ON "${tableName}" (group_id);`)
}
}
async update(keys: string[], updateOptions?: UpdateOptions): Promise<void> {
async update(keys: Array<{ uid: string; docId: string }> | string[], updateOptions?: UpdateOptions): Promise<void> {
if (keys.length === 0) {
return
}
@@ -243,23 +251,23 @@ CREATE INDEX IF NOT EXISTS group_id_index ON "${tableName}" (group_id);`)
throw new Error(`Time sync issue with database ${updatedAt} < ${timeAtLeast}`)
}
const groupIds = _groupIds ?? keys.map(() => null)
// Handle both new format (objects with uid and docId) and old format (strings)
const isNewFormat = keys.length > 0 && typeof keys[0] === 'object' && 'uid' in keys[0]
const keyStrings = isNewFormat ? (keys as Array<{ uid: string; docId: string }>).map((k) => k.uid) : (keys as string[])
const docIds = isNewFormat ? (keys as Array<{ uid: string; docId: string }>).map((k) => k.docId) : keys.map(() => null)
if (groupIds.length !== keys.length) {
throw new Error(`Number of keys (${keys.length}) does not match number of group_ids (${groupIds.length})`)
const groupIds = _groupIds ?? keyStrings.map(() => null)
if (groupIds.length !== keyStrings.length) {
throw new Error(`Number of keys (${keyStrings.length}) does not match number of group_ids (${groupIds.length})`)
}
const recordsToUpsert = keys.map((key, i) => [
key,
this.namespace,
updatedAt,
groupIds[i] ?? null // Ensure groupIds[i] is null if undefined
])
const recordsToUpsert = keyStrings.map((key, i) => [key, this.namespace, updatedAt, groupIds[i] ?? null, docIds[i] ?? null])
const query = `
INSERT INTO "${tableName}" (key, namespace, updated_at, group_id)
VALUES (?, ?, ?, ?)
ON CONFLICT (key, namespace) DO UPDATE SET updated_at = excluded.updated_at`
INSERT INTO "${tableName}" (key, namespace, updated_at, group_id, doc_id)
VALUES (?, ?, ?, ?, ?)
ON CONFLICT (key, namespace) DO UPDATE SET updated_at = excluded.updated_at, doc_id = excluded.doc_id`
try {
// To handle multiple files upsert
@@ -314,8 +322,8 @@ CREATE INDEX IF NOT EXISTS group_id_index ON "${tableName}" (group_id);`)
}
}
async listKeys(options?: ListKeyOptions): Promise<string[]> {
const { before, after, limit, groupIds } = options ?? {}
async listKeys(options?: ListKeyOptions & { docId?: string }): Promise<string[]> {
const { before, after, limit, groupIds, docId } = options ?? {}
const tableName = this.sanitizeTableName(this.tableName)
let query = `SELECT key FROM "${tableName}" WHERE namespace = ?`
@@ -344,6 +352,11 @@ CREATE INDEX IF NOT EXISTS group_id_index ON "${tableName}" (group_id);`)
values.push(...groupIds.filter((gid): gid is string => gid !== null))
}
if (docId) {
query += ` AND doc_id = ?`
values.push(docId)
}
query += ';'
const dataSource = await this.getDataSource()
@@ -186,7 +186,11 @@ class Chroma_VectorStores implements INode {
const vectorStoreName = collectionName
await recordManager.createSchema()
;(recordManager as any).namespace = (recordManager as any).namespace + '_' + vectorStoreName
const keys: string[] = await recordManager.listKeys({})
const filterKeys: ICommonObject = {}
if (options.docId) {
filterKeys.docId = options.docId
}
const keys: string[] = await recordManager.listKeys(filterKeys)
const chromaStore = new ChromaExtended(embeddings, obj)
@@ -198,7 +198,11 @@ class Elasticsearch_VectorStores implements INode {
const vectorStoreName = indexName
await recordManager.createSchema()
;(recordManager as any).namespace = (recordManager as any).namespace + '_' + vectorStoreName
const keys: string[] = await recordManager.listKeys({})
const filterKeys: ICommonObject = {}
if (options.docId) {
filterKeys.docId = options.docId
}
const keys: string[] = await recordManager.listKeys(filterKeys)
await vectorStore.delete({ ids: keys })
await recordManager.deleteKeys(keys)
@@ -212,7 +212,11 @@ class Pinecone_VectorStores implements INode {
const vectorStoreName = pineconeNamespace
await recordManager.createSchema()
;(recordManager as any).namespace = (recordManager as any).namespace + '_' + vectorStoreName
const keys: string[] = await recordManager.listKeys({})
const filterKeys: ICommonObject = {}
if (options.docId) {
filterKeys.docId = options.docId
}
const keys: string[] = await recordManager.listKeys(filterKeys)
await pineconeStore.delete({ ids: keys })
await recordManager.deleteKeys(keys)
@@ -49,7 +49,7 @@ class Postgres_VectorStores implements INode {
constructor() {
this.label = 'Postgres'
this.name = 'postgres'
this.version = 7.0
this.version = 7.1
this.type = 'Postgres'
this.icon = 'postgres.svg'
this.category = 'Vector Stores'
@@ -173,6 +173,15 @@ class Postgres_VectorStores implements INode {
additionalParams: true,
optional: true
},
{
label: 'Upsert Batch Size',
name: 'batchSize',
type: 'number',
step: 1,
description: 'Upsert in batches of size N',
additionalParams: true,
optional: true
},
{
label: 'Additional Configuration',
name: 'additionalConfig',
@@ -232,6 +241,7 @@ class Postgres_VectorStores implements INode {
const docs = nodeData.inputs?.document as Document[]
const recordManager = nodeData.inputs?.recordManager
const isFileUploadEnabled = nodeData.inputs?.fileUpload as boolean
const _batchSize = nodeData.inputs?.batchSize
const vectorStoreDriver: VectorStoreDriver = Postgres_VectorStores.getDriverFromConfig(nodeData, options)
const flattenDocs = docs && docs.length ? flatten(docs) : []
@@ -265,7 +275,15 @@ class Postgres_VectorStores implements INode {
return res
} else {
await vectorStoreDriver.fromDocuments(finalDocs)
if (_batchSize) {
const batchSize = parseInt(_batchSize, 10)
for (let i = 0; i < finalDocs.length; i += batchSize) {
const batch = finalDocs.slice(i, i + batchSize)
await vectorStoreDriver.fromDocuments(batch)
}
} else {
await vectorStoreDriver.fromDocuments(finalDocs)
}
return { numAdded: finalDocs.length, addedDocs: finalDocs }
}
@@ -285,7 +303,11 @@ class Postgres_VectorStores implements INode {
const vectorStoreName = tableName
await recordManager.createSchema()
;(recordManager as any).namespace = (recordManager as any).namespace + '_' + vectorStoreName
const keys: string[] = await recordManager.listKeys({})
const filterKeys: ICommonObject = {}
if (options.docId) {
filterKeys.docId = options.docId
}
const keys: string[] = await recordManager.listKeys(filterKeys)
await vectorStore.delete({ ids: keys })
await recordManager.deleteKeys(keys)
@@ -5,6 +5,11 @@ import { TypeORMVectorStore, TypeORMVectorStoreArgs, TypeORMVectorStoreDocument
import { VectorStore } from '@langchain/core/vectorstores'
import { Document } from '@langchain/core/documents'
import { Pool } from 'pg'
import { v4 as uuid } from 'uuid'
/**
 * Optional settings passed as the third argument of the `addVectors` override.
 *
 * `ids` - explicit row ids, matched to the documents by array index.
 */
type TypeORMAddDocumentOptions = {
ids?: string[]
}
export class TypeORMDriver extends VectorStoreDriver {
protected _postgresConnectionOptions: DataSourceOptions
@@ -95,15 +100,45 @@ export class TypeORMDriver extends VectorStoreDriver {
try {
instance.appDataSource.getRepository(instance.documentEntity).delete(ids)
} catch (e) {
console.error('Failed to delete')
console.error('Failed to delete', e)
}
}
}
const baseAddVectorsFn = instance.addVectors.bind(instance)
instance.addVectors = async (
vectors: number[][],
documents: Document[],
documentOptions?: TypeORMAddDocumentOptions
): Promise<void> => {
const rows = vectors.map((embedding, idx) => {
const embeddingString = `[${embedding.join(',')}]`
const documentRow = {
id: documentOptions?.ids?.length ? documentOptions.ids[idx] : uuid(),
pageContent: documents[idx].pageContent,
embedding: embeddingString,
metadata: documents[idx].metadata
}
return documentRow
})
instance.addVectors = async (vectors, documents) => {
return baseAddVectorsFn(vectors, this.sanitizeDocuments(documents))
const documentRepository = instance.appDataSource.getRepository(instance.documentEntity)
const _batchSize = this.nodeData.inputs?.batchSize
const chunkSize = _batchSize ? parseInt(_batchSize, 10) : 500
for (let i = 0; i < rows.length; i += chunkSize) {
const chunk = rows.slice(i, i + chunkSize)
try {
await documentRepository.save(chunk)
} catch (e) {
console.error(e)
throw new Error(`Error inserting: ${chunk[0].pageContent}`)
}
}
}
instance.addDocuments = async (documents: Document[], options?: { ids?: string[] }): Promise<void> => {
const texts = documents.map(({ pageContent }) => pageContent)
return (instance.addVectors as any)(await this.getEmbeddings().embedDocuments(texts), documents, options)
}
return instance
@@ -385,7 +385,11 @@ class Qdrant_VectorStores implements INode {
const vectorStoreName = collectionName
await recordManager.createSchema()
;(recordManager as any).namespace = (recordManager as any).namespace + '_' + vectorStoreName
const keys: string[] = await recordManager.listKeys({})
const filterKeys: ICommonObject = {}
if (options.docId) {
filterKeys.docId = options.docId
}
const keys: string[] = await recordManager.listKeys(filterKeys)
await vectorStore.delete({ ids: keys })
await recordManager.deleteKeys(keys)
@@ -197,7 +197,11 @@ class Supabase_VectorStores implements INode {
const vectorStoreName = tableName + '_' + queryName
await recordManager.createSchema()
;(recordManager as any).namespace = (recordManager as any).namespace + '_' + vectorStoreName
const keys: string[] = await recordManager.listKeys({})
const filterKeys: ICommonObject = {}
if (options.docId) {
filterKeys.docId = options.docId
}
const keys: string[] = await recordManager.listKeys(filterKeys)
await supabaseStore.delete({ ids: keys })
await recordManager.deleteKeys(keys)
@@ -187,7 +187,11 @@ class Upstash_VectorStores implements INode {
const vectorStoreName = UPSTASH_VECTOR_REST_URL
await recordManager.createSchema()
;(recordManager as any).namespace = (recordManager as any).namespace + '_' + vectorStoreName
const keys: string[] = await recordManager.listKeys({})
const filterKeys: ICommonObject = {}
if (options.docId) {
filterKeys.docId = options.docId
}
const keys: string[] = await recordManager.listKeys(filterKeys)
await upstashStore.delete({ ids: keys })
await recordManager.deleteKeys(keys)
@@ -252,7 +252,11 @@ class Weaviate_VectorStores implements INode {
const vectorStoreName = weaviateTextKey ? weaviateIndex + '_' + weaviateTextKey : weaviateIndex
await recordManager.createSchema()
;(recordManager as any).namespace = (recordManager as any).namespace + '_' + vectorStoreName
const keys: string[] = await recordManager.listKeys({})
const filterKeys: ICommonObject = {}
if (options.docId) {
filterKeys.docId = options.docId
}
const keys: string[] = await recordManager.listKeys(filterKeys)
await weaviateStore.delete({ ids: keys })
await recordManager.deleteKeys(keys)
+8 -4
View File
@@ -8,6 +8,10 @@ import { IndexingResult } from './Interface'
type Metadata = Record<string, unknown>
/**
 * Record manager contract whose `update` accepts keys in either of two shapes:
 * plain string keys, or `{ uid, docId }` objects that pair each key with a
 * document id.
 */
export interface ExtendedRecordManagerInterface extends RecordManagerInterface {
update(keys: Array<{ uid: string; docId: string }> | string[], updateOptions?: Record<string, any>): Promise<void>
}
type StringOrDocFunc = string | ((doc: DocumentInterface) => string)
export interface HashedDocumentInterface extends DocumentInterface {
@@ -207,7 +211,7 @@ export const _isBaseDocumentLoader = (arg: any): arg is BaseDocumentLoader => {
interface IndexArgs {
docsSource: BaseDocumentLoader | DocumentInterface[]
recordManager: RecordManagerInterface
recordManager: ExtendedRecordManagerInterface
vectorStore: VectorStore
options?: IndexOptions
}
@@ -275,7 +279,7 @@ export async function index(args: IndexArgs): Promise<IndexingResult> {
const uids: string[] = []
const docsToIndex: DocumentInterface[] = []
const docsToUpdate: string[] = []
const docsToUpdate: Array<{ uid: string; docId: string }> = []
const seenDocs = new Set<string>()
hashedDocs.forEach((hashedDoc, i) => {
const docExists = batchExists[i]
@@ -283,7 +287,7 @@ export async function index(args: IndexArgs): Promise<IndexingResult> {
if (forceUpdate) {
seenDocs.add(hashedDoc.uid)
} else {
docsToUpdate.push(hashedDoc.uid)
docsToUpdate.push({ uid: hashedDoc.uid, docId: hashedDoc.metadata.docId as string })
return
}
}
@@ -308,7 +312,7 @@ export async function index(args: IndexArgs): Promise<IndexingResult> {
}
await recordManager.update(
hashedDocs.map((doc) => doc.uid),
hashedDocs.map((doc) => ({ uid: doc.uid, docId: doc.metadata.docId as string })),
{ timeAtLeast: indexStartDt, groupIds: sourceIds }
)