Commit 5331e365 authored by 于飞

Update the keyword-matching rules to use synonym filtering

parent bb47c76f
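In rough terms, the commit moves the media lookup from "query with only the first matched keyword" to "drop synonymous keywords, then query once per remaining keyword". A minimal, self-contained sketch of that flow; all data below is made up, and get_keywords / MEDIA_DB are stand-ins for the project's get_key_words_nlp and MediaDal calls, while filter_similar restates the idea of the newly added function:

# Sketch only: invented data, stand-ins for the project's NLP and database calls.
SYNONYM_GROUPS = [['高兴', '快乐', '满足'], ['生气', '愤怒']]   # like SimilarDal.get_similar_by_keyword()
MEDIA_DB = {'高兴': ['smile.png'], '生气': ['frown.mp4']}        # stand-in for MediaDal.get_datas(key_word=...)

def get_keywords(user_input: str) -> list:
    # stand-in for get_key_words_nlp: exact keyword extraction
    return [w for w in ('高兴', '快乐', '生气') if w in user_input]

def filter_similar(key_words: list, similar_words: list) -> list:
    # keep one representative word per synonym group (same idea as the new filter_similar)
    kept = []
    for word in key_words:
        groups = [g for g in similar_words if word in g]
        if any(syn in kept for g in groups for syn in g):
            continue
        kept.append(word)
    return kept

def media_for(user_input: str) -> list:
    words = filter_similar(get_keywords(user_input), SYNONYM_GROUPS)
    results = []
    for word in words:                        # one exact-match query per surviving keyword
        results.extend(MEDIA_DB.get(word, []))
    return results

print(media_for('今天很高兴也很快乐'))        # ['smile.png'] -- '快乐' is dropped as a synonym of '高兴'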
@@ -123,7 +123,6 @@ class DFAFilter():
for keyword in datas:
self.add(keyword.word_name)
# Longest-match mode: make the sensitive-word filter match and replace longer sensitive words first
def filter(self, message, repl="*"):
is_sensitive = False
......
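The comment removed above describes the filter's longest-match behaviour. For reference, a minimal, self-contained sketch of longest-match sensitive-word replacement (a plain greedy scan over a word set, not the project's DFA implementation):

# Illustrative sketch of longest-match filtering, not the project's DFAFilter.
def longest_match_filter(message: str, words: set, repl: str = "*") -> str:
    out, i = [], 0
    while i < len(message):
        match = None
        for j in range(len(message), i, -1):        # try the longest span first
            if message[i:j] in words:
                match = message[i:j]
                break
        if match:
            out.append(repl * len(match))            # the longer word wins over its prefixes
            i += len(match)
        else:
            out.append(message[i])
            i += 1
    return "".join(out)

print(longest_match_filter("abcd", {"ab", "abc"}))   # '***d' rather than '**cd'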
@@ -22,6 +22,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
from typing import Any, Dict, Generic, Optional, TypeVar
from dbgpt._private.pydantic import BaseModel, ConfigDict, Field, model_to_dict
from dbgpt.app.apps.vadmin.word import crud
from dbgpt.app.apps.utils.spach_keywords import my_spacy_nlp
from dbgpt.app.apps.utils.filter import mydfafiter, mydfafiter_picture, mydfafiter_question, mydfafiter_video
@@ -84,7 +85,7 @@ def get_key_words_nlp(user_input: str) -> list:
#print(words)
return words
async def get_media_datas_by(conv_uid: str, words: str, db: AsyncSession, knownledge: str) -> list:
async def get_media_datas_by(conv_uid: str, words: list, db: AsyncSession, knownledge: str) -> list:
# Fetch the group_id
datas = []
if knownledge is not None:
@@ -97,52 +98,56 @@ async def get_media_datas_by(conv_uid: str, words: str, db: AsyncSession, knownl
for image_groups in image_datas:
image_groupid = image_groups.get('group_id')
print(f"===========>image_groupid:{image_groupid}")
# Use the matched keywords to fetch the matching images from the database
images_dic = {'page': 1, 'limit': 0, 'v_order': None, 'v_order_field': None, 'type': 1,
'group_id': image_groupid,
'key_word': ('like', words)}
images_datas, count = await MediaDal(db).get_datas(**images_dic, v_return_count=True)
print(f"-----查询到的图片为:---->:{images_datas}")
for data in images_datas:
json_image = {'type': MEDIA_TYPE1, 'file_name': data.get('file_name'), 'key_word': data.get('key_word'),
'local_path': data.get('local_path'), 'remote_path': data.get('remote_path')}
result.append(json_image)
# Iterate over the keyword list and find the images for each keyword
for word in words:
# Use the matched keyword to fetch the matching images from the database
images_dic = {'page': 1, 'limit': 0, 'v_order': None, 'v_order_field': None, 'type': 1,
'group_id': image_groupid,
'key_word': word}
images_datas, count = await MediaDal(db).get_datas(**images_dic, v_return_count=True)
print(f"-----查询到的图片为:---->:{images_datas}")
for data in images_datas:
json_image = {'type': MEDIA_TYPE1, 'file_name': data.get('file_name'),
'key_word': data.get('key_word'),
'local_path': data.get('local_path'), 'remote_path': data.get('remote_path')}
result.append(json_image)
video_datas = corrdata.get('video_group') or []
for video_groups in video_datas:
video_groupid = video_groups.get('group_id')
print(f"===========>video_groupid:{video_groupid}")
# Use the matched keywords to fetch the matching videos from the database
video_dic = {'page': 1, 'limit': 0, 'v_order': None, 'v_order_field': None, 'type': 2,
'group_id': video_groupid,
'key_word': ('like', words)}
video_datas, count = await MediaDal(db).get_datas(**video_dic, v_return_count=True)
print(f"-----查询到的视频为:---->:{video_datas}")
for videodata in video_datas:
json_video = {'type': MEDIA_TYPE2, 'file_name': videodata.get('file_name'),
'key_word': videodata.get('key_word'),
'local_path': videodata.get('local_path'),
'remote_path': videodata.get('remote_path')}
result.append(json_video)
# Iterate over the keyword list and find the videos for each keyword
for word in words:
# Use the matched keyword to fetch the matching videos from the database
video_dic = {'page': 1, 'limit': 0, 'v_order': None, 'v_order_field': None, 'type': 2,
'group_id': video_groupid,
'key_word': word}
video_datas, count = await MediaDal(db).get_datas(**video_dic, v_return_count=True)
print(f"-----查询到的视频为:---->:{video_datas}")
for videodata in video_datas:
json_video = {'type': MEDIA_TYPE2, 'file_name': videodata.get('file_name'),
'key_word': videodata.get('key_word'),
'local_path': videodata.get('local_path'), 'remote_path': videodata.get('remote_path')}
result.append(json_video)
question_datas = corrdata.get('question_group') or []
for question_groups in question_datas:
question_groupid = question_groups.get('group_id')
print(f"===========>question_groupid:{question_groupid}")
# Fetch the matching Q&A pairs from the database
question_dic = {'page': 1, 'limit': 0, 'v_order': None, 'v_order_field': None,
'group_id': question_groupid,
'key_word': ('like', words)}
question_datas, count = await QuestionDal(db).get_datas(**question_dic, v_return_count=True)
print(f"-----查询到的问答对为:---->:{question_datas}")
for questiondata in question_datas:
json_question = {'type': MEDIA_TYPE4, 'title': questiondata.get('title'),
'key_word': questiondata.get('key_word'),
'answer': questiondata.get('answer')}
result.append(json_question)
# Iterate over the keyword list and find the Q&A pairs for each keyword
for word in words:
# Fetch the matching Q&A pairs from the database
question_dic = {'page': 1, 'limit': 0, 'v_order': None, 'v_order_field': None,
'group_id': question_groupid,
'key_word': word}
question_datas, count = await QuestionDal(db).get_datas(**question_dic, v_return_count=True)
print(f"-----查询到的问答对为:---->:{question_datas}")
for questiondata in question_datas:
json_question = {'type': MEDIA_TYPE4, 'title': questiondata.get('title'),
'key_word': questiondata.get('key_word'),
'answer': questiondata.get('answer')}
result.append(json_question)
# Save to the chat-history resource database
@@ -238,6 +243,40 @@ async def get_media_datas_all(conv_uid: str, default_model: str, db: AsyncSessio
return ret_media_datas
def filter_similar(key_words: list, similar_words: list) -> list:
"""
从 key_words 中过滤掉同义词,只保留每组同义词中的一个词。
参数:
- key_words: 需要筛选的关键词列表。
- similar_words: 同义词组列表,每组是包含同义词的列表。
返回值:
- f_words: 过滤后的关键词列表,只保留非同义词或每组中的一个代表词。
示例:
key_words = ['高兴', '快乐', '满足', '生气', '愤怒']
similar_words = [['高兴', '快乐', '满足'], ['生气', '愤怒']]
result = filter_similar(key_words, similar_words)
print(result) # 输出:['高兴', '生气']
这个例子中,函数会保留“高兴”作为同义词组的代表词,并保留“生气”作为另一组的代表。
"""
f_words = []
for word in key_words:
# Check whether this word, or one of its synonyms, is already in f_words
found_similar = False
for synonym_group in similar_words:
if word in synonym_group:
# If any word from this synonym group is already in the result list, skip the current word
if any(syn in f_words for syn in synonym_group):
found_similar = True
break
if not found_similar:
f_words.append(word)
return f_words
@router.post("/get_spacy_keywords", summary="Resource list (images, videos)")
async def get_spacy_keywords(dialogue: ConversationVo = Body(), auth: Auth = Depends(OpenAuth())):
print(f"用户输入的问题:{dialogue.user_input} -- 选择的知识库为:{dialogue.select_param}")
@@ -267,18 +306,26 @@ async def get_spacy_keywords(dialogue: ConversationVo = Body(), auth: Auth = Dep
return SuccessResponse(result) # returns type=3
# When no sensitive words are found, look for related images or videos
words = get_key_words_nlp(dialogue.user_input) # exact (100%) match algorithm | only the first match is used
words = get_key_words_nlp(dialogue.user_input) # exact (100%) match algorithm
if len(words) > 0:
print(f"---算法1-匹配到的关键词--->:{words[0]}")
result = await get_media_datas_by(dialogue.conv_uid, words[0], auth.db, dialogue.select_param)
print(f"---算法1-匹配到的关键词--->:{words}")
#从数据库中加载同义词列表 | similar_words = [['高兴', '快乐', '满足'], ['生气', '愤怒']]
similar_words = await crud.SimilarDal(auth.db).get_similar_by_keyword()
key_words = filter_similar(words, similar_words) # filter out synonyms first
print(f"--- Algorithm 1 - keywords after synonym filtering --->: {key_words}")
result = await get_media_datas_by(dialogue.conv_uid, key_words, auth.db, dialogue.select_param)
return SuccessResponse(result)
else:
print(f"---算法2-begin--->")
#上面的算法没找到,换一种算法继续找
words2 = get_key_words(dialogue.user_input)
if len(words2) > 0:
print(f"---算法2-匹配到的关键词--->:{words[0]}")
result = await get_media_datas_by(dialogue.conv_uid, words2[0], auth.db, dialogue.select_param)
print(f"---算法2-匹配到的关键词--->:{words2}")
# 从数据库中加载同义词列表 | similar_words = [['高兴', '快乐', '满足'], ['生气', '愤怒']]
similar_words2 = await crud.SimilarDal(auth.db).get_similar_by_keyword()
key_words2 = filter_similar(words2, similar_words2) # 先过滤掉同义词
print(f"---算法2-过滤掉近义词后的关键词--->:{key_words2}")
result = await get_media_datas_by(dialogue.conv_uid, key_words2, auth.db, dialogue.select_param)
return SuccessResponse(result)
else:
print(f"-----没有找到需要查询的内容:---->")
......
@@ -60,6 +60,14 @@ class SimilarDal(DalBase):
else:
print(f"编号:{similar1.id} 词条:{similar1.word_name} 近义词:{similar1.similar_name} ")
async def get_similar_by_keyword(self) -> list:
"""
Load all synonym entries for keyword lookups | each entry is split on the Chinese (full-width) comma | English (half-width) commas in the database are converted to Chinese commas first
"""
similar_datas = await self.get_datas(limit=0)
result = [item['similar_name'].replace(',', ',').split(',') for item in similar_datas]
return result
async def update_similars(self, ids: list, data: SimilarUpdate):
await self.db.execute(
update(self.model).where(self.model.id.in_(ids)).values(
......
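As a quick standalone check of how get_similar_by_keyword turns stored similar_name strings into the groups consumed by filter_similar (the sample rows below are invented, not real database content):

# Invented sample rows; not real database content.
rows = [
    {'similar_name': '高兴，快乐，满足'},   # full-width (Chinese) commas
    {'similar_name': '生气,愤怒'},           # half-width comma, normalised before splitting
]
groups = [r['similar_name'].replace(',', '，').split('，') for r in rows]
print(groups)   # [['高兴', '快乐', '满足'], ['生气', '愤怒']]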