Commit 5331e365 authored by 于飞

Update the keyword-matching rules to use synonym filtering

parent bb47c76f
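In rough terms, the commit moves the media lookup from "query with only the first matched keyword" to "drop synonymous keywords, then query once per remaining keyword". A minimal, self-contained sketch of that flow; all data below is made up, and get_keywords / MEDIA_DB are stand-ins for the project's get_key_words_nlp and MediaDal calls, while filter_similar restates the idea of the newly added function:

# Sketch only: invented data, stand-ins for the project's NLP and database calls.
SYNONYM_GROUPS = [['高兴', '快乐', '满足'], ['生气', '愤怒']]   # like SimilarDal.get_similar_by_keyword()
MEDIA_DB = {'高兴': ['smile.png'], '生气': ['frown.mp4']}        # stand-in for MediaDal.get_datas(key_word=...)

def get_keywords(user_input: str) -> list:
    # stand-in for get_key_words_nlp: exact keyword extraction
    return [w for w in ('高兴', '快乐', '生气') if w in user_input]

def filter_similar(key_words: list, similar_words: list) -> list:
    # keep one representative word per synonym group (same idea as the new filter_similar)
    kept = []
    for word in key_words:
        groups = [g for g in similar_words if word in g]
        if any(syn in kept for g in groups for syn in g):
            continue
        kept.append(word)
    return kept

def media_for(user_input: str) -> list:
    words = filter_similar(get_keywords(user_input), SYNONYM_GROUPS)
    results = []
    for word in words:                        # one exact-match query per surviving keyword
        results.extend(MEDIA_DB.get(word, []))
    return results

print(media_for('今天很高兴也很快乐'))        # ['smile.png'] -- '快乐' is dropped as a synonym of '高兴'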
@@ -123,7 +123,6 @@ class DFAFilter():
for keyword in datas:
self.add(keyword.word_name)
# Longest-match mode: make the sensitive-word filter match and replace longer sensitive words first
def filter(self, message, repl="*"):
is_sensitive = False
......
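The comment removed above describes the filter's longest-match behaviour. For reference, a minimal, self-contained sketch of longest-match sensitive-word replacement (a plain greedy scan over a word set, not the project's DFA implementation):

# Illustrative sketch of longest-match filtering, not the project's DFAFilter.
def longest_match_filter(message: str, words: set, repl: str = "*") -> str:
    out, i = [], 0
    while i < len(message):
        match = None
        for j in range(len(message), i, -1):        # try the longest span first
            if message[i:j] in words:
                match = message[i:j]
                break
        if match:
            out.append(repl * len(match))            # the longer word wins over its prefixes
            i += len(match)
        else:
            out.append(message[i])
            i += 1
    return "".join(out)

print(longest_match_filter("abcd", {"ab", "abc"}))   # '***d' rather than '**cd'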
@@ -22,6 +22,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
from typing import Any, Dict, Generic, Optional, TypeVar
from dbgpt._private.pydantic import BaseModel, ConfigDict, Field, model_to_dict
from dbgpt.app.apps.vadmin.word import crud
from dbgpt.app.apps.utils.spach_keywords import my_spacy_nlp
from dbgpt.app.apps.utils.filter import mydfafiter, mydfafiter_picture, mydfafiter_question, mydfafiter_video
@@ -84,7 +85,7 @@ def get_key_words_nlp(user_input: str) -> list:
#print(words)
return words
async def get_media_datas_by(conv_uid: str, words: str, db: AsyncSession, knownledge: str) -> list:
async def get_media_datas_by(conv_uid: str, words: list, db: AsyncSession, knownledge: str) -> list:
# Fetch the group_id
datas = []
if knownledge is not None:
@@ -97,52 +98,56 @@ async def get_media_datas_by(conv_uid: str, words: str, db: AsyncSession, knownl
for image_groups in image_datas:
image_groupid = image_groups.get('group_id')
print(f"===========>image_groupid:{image_groupid}")
# Use the matched keywords to fetch the matching images from the database
images_dic = {'page': 1, 'limit': 0, 'v_order': None, 'v_order_field': None, 'type': 1,
'group_id': image_groupid,
'key_word': ('like', words)}
images_datas, count = await MediaDal(db).get_datas(**images_dic, v_return_count=True)
print(f"-----查询到的图片为:---->:{images_datas}")
for data in images_datas:
json_image = {'type': MEDIA_TYPE1, 'file_name': data.get('file_name'), 'key_word': data.get('key_word'),
'local_path': data.get('local_path'), 'remote_path': data.get('remote_path')}
result.append(json_image)
# Iterate over the keyword list and find the images for each keyword
for word in words:
# Use the matched keyword to fetch the matching images from the database
images_dic = {'page': 1, 'limit': 0, 'v_order': None, 'v_order_field': None, 'type': 1,
'group_id': image_groupid,
'key_word': word}
images_datas, count = await MediaDal(db).get_datas(**images_dic, v_return_count=True)
print(f"-----查询到的图片为:---->:{images_datas}")
for data in images_datas:
json_image = {'type': MEDIA_TYPE1, 'file_name': data.get('file_name'),
'key_word': data.get('key_word'),
'local_path': data.get('local_path'), 'remote_path': data.get('remote_path')}
result.append(json_image)
video_datas = corrdata.get('video_group') or []
for video_groups in video_datas:
video_groupid = video_groups.get('group_id')
print(f"===========>video_groupid:{video_groupid}")
# Use the matched keywords to fetch the matching videos from the database
video_dic = {'page': 1, 'limit': 0, 'v_order': None, 'v_order_field': None, 'type': 2,
'group_id': video_groupid,
'key_word': ('like', words)}
video_datas, count = await MediaDal(db).get_datas(**video_dic, v_return_count=True)
print(f"-----查询到的视频为:---->:{video_datas}")
for videodata in video_datas:
json_video = {'type': MEDIA_TYPE2, 'file_name': videodata.get('file_name'),
'key_word': videodata.get('key_word'),
'local_path': videodata.get('local_path'),
'remote_path': videodata.get('remote_path')}
result.append(json_video)
# Iterate over the keyword list and find the videos for each keyword
for word in words:
# Use the matched keyword to fetch the matching videos from the database
video_dic = {'page': 1, 'limit': 0, 'v_order': None, 'v_order_field': None, 'type': 2,
'group_id': video_groupid,
'key_word': word}
video_datas, count = await MediaDal(db).get_datas(**video_dic, v_return_count=True)
print(f"-----查询到的视频为:---->:{video_datas}")
for videodata in video_datas:
json_video = {'type': MEDIA_TYPE2, 'file_name': videodata.get('file_name'),
'key_word': videodata.get('key_word'),
'local_path': videodata.get('local_path'), 'remote_path': videodata.get('remote_path')}
result.append(json_video)
question_datas = corrdata.get('question_group') or []
for question_groups in question_datas:
question_groupid = question_groups.get('group_id')
print(f"===========>question_groupid:{question_groupid}")
# Fetch the matching Q&A pairs from the database
question_dic = {'page': 1, 'limit': 0, 'v_order': None, 'v_order_field': None,
'group_id': question_groupid,
'key_word': ('like', words)}
question_datas, count = await QuestionDal(db).get_datas(**question_dic, v_return_count=True)
print(f"-----查询到的问答对为:---->:{question_datas}")
for questiondata in question_datas:
json_question = {'type': MEDIA_TYPE4, 'title': questiondata.get('title'),
'key_word': questiondata.get('key_word'),
'answer': questiondata.get('answer')}
result.append(json_question)
# Iterate over the keyword list and find the Q&A pairs for each keyword
for word in words:
# Fetch the matching Q&A pairs from the database
question_dic = {'page': 1, 'limit': 0, 'v_order': None, 'v_order_field': None,
'group_id': question_groupid,
'key_word': word}
question_datas, count = await QuestionDal(db).get_datas(**question_dic, v_return_count=True)
print(f"-----查询到的问答对为:---->:{question_datas}")
for questiondata in question_datas:
json_question = {'type': MEDIA_TYPE4, 'title': questiondata.get('title'),
'key_word': questiondata.get('key_word'),
'answer': questiondata.get('answer')}
result.append(json_question)
# Save to the chat-history resource database
@@ -238,6 +243,40 @@ async def get_media_datas_all(conv_uid: str, default_model: str, db: AsyncSessio
return ret_media_datas
def filter_similar(key_words: list, similar_words: list) -> list:
"""
从 key_words 中过滤掉同义词,只保留每组同义词中的一个词。
参数:
- key_words: 需要筛选的关键词列表。
- similar_words: 同义词组列表,每组是包含同义词的列表。
返回值:
- f_words: 过滤后的关键词列表,只保留非同义词或每组中的一个代表词。
示例:
key_words = ['高兴', '快乐', '满足', '生气', '愤怒']
similar_words = [['高兴', '快乐', '满足'], ['生气', '愤怒']]
result = filter_similar(key_words, similar_words)
print(result) # 输出:['高兴', '生气']
这个例子中,函数会保留“高兴”作为同义词组的代表词,并保留“生气”作为另一组的代表。
"""
f_words = []
for word in key_words:
# Check whether this word, or one of its synonyms, is already in f_words
found_similar = False
for synonym_group in similar_words:
if word in synonym_group:
# If any word from this synonym group is already in the result list, skip the current word
if any(syn in f_words for syn in synonym_group):
found_similar = True
break
if not found_similar:
f_words.append(word)
return f_words
@router.post("/get_spacy_keywords", summary="Resource list (images, videos)")
async def get_spacy_keywords(dialogue: ConversationVo = Body(), auth: Auth = Depends(OpenAuth())):
print(f"用户输入的问题:{dialogue.user_input} -- 选择的知识库为:{dialogue.select_param}")
@@ -267,18 +306,26 @@ async def get_spacy_keywords(dialogue: ConversationVo = Body(), auth: Auth = Dep
return SuccessResponse(result) # returns type=3
# When no sensitive words are found, look for related images or videos
words = get_key_words_nlp(dialogue.user_input) # exact (100%) match algorithm | only the first match is used
words = get_key_words_nlp(dialogue.user_input) # exact (100%) match algorithm
if len(words) > 0:
print(f"---算法1-匹配到的关键词--->:{words[0]}")
result = await get_media_datas_by(dialogue.conv_uid, words[0], auth.db, dialogue.select_param)
print(f"---算法1-匹配到的关键词--->:{words}")
#从数据库中加载同义词列表 | similar_words = [['高兴', '快乐', '满足'], ['生气', '愤怒']]
similar_words = await crud.SimilarDal(auth.db).get_similar_by_keyword()
key_words = filter_similar(words, similar_words) # filter out synonyms first
print(f"--- Algorithm 1 - keywords after synonym filtering --->: {key_words}")
result = await get_media_datas_by(dialogue.conv_uid, key_words, auth.db, dialogue.select_param)
return SuccessResponse(result)
else:
print(f"---算法2-begin--->")
#上面的算法没找到,换一种算法继续找
words2 = get_key_words(dialogue.user_input)
if len(words2) > 0:
print(f"---算法2-匹配到的关键词--->:{words[0]}")
result = await get_media_datas_by(dialogue.conv_uid, words2[0], auth.db, dialogue.select_param)
print(f"---算法2-匹配到的关键词--->:{words2}")
# 从数据库中加载同义词列表 | similar_words = [['高兴', '快乐', '满足'], ['生气', '愤怒']]
similar_words2 = await crud.SimilarDal(auth.db).get_similar_by_keyword()
key_words2 = filter_similar(words2, similar_words2) # 先过滤掉同义词
print(f"---算法2-过滤掉近义词后的关键词--->:{key_words2}")
result = await get_media_datas_by(dialogue.conv_uid, key_words2, auth.db, dialogue.select_param)
return SuccessResponse(result)
else:
print(f"-----没有找到需要查询的内容:---->")
......
@@ -60,6 +60,14 @@ class SimilarDal(DalBase):
else:
print(f"编号:{similar1.id} 词条:{similar1.word_name} 近义词:{similar1.similar_name} ")
async def get_similar_by_keyword(self) -> list:
"""
Load all synonym entries for keyword lookups | each entry is split on the Chinese (full-width) comma | English (half-width) commas in the database are converted to Chinese commas first
"""
similar_datas = await self.get_datas(limit=0)
result = [item['similar_name'].replace(',', ',').split(',') for item in similar_datas]
return result
async def update_similars(self, ids: list, data: SimilarUpdate):
await self.db.execute(
update(self.model).where(self.model.id.in_(ids)).values(
......
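As a quick standalone check of how get_similar_by_keyword turns stored similar_name strings into the groups consumed by filter_similar (the sample rows below are invented, not real database content):

# Invented sample rows; not real database content.
rows = [
    {'similar_name': '高兴，快乐，满足'},   # full-width (Chinese) commas
    {'similar_name': '生气,愤怒'},           # half-width comma, normalised before splitting
]
groups = [r['similar_name'].replace(',', '，').split('，') for r in rows]
print(groups)   # [['高兴', '快乐', '满足'], ['生气', '愤怒']]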