linyangyang / db_gpt · Commits

Commit 5331e365
authored Oct 12, 2024 by 于飞
Modify the keyword-matching rules: filter with synonyms
parent bb47c76f
Showing 3 changed files with 97 additions and 43 deletions:

    dbgpt/app/apps/utils/filter.py           +0  -1
    dbgpt/app/apps/vadmin/keywordsviews.py   +89 -42
    dbgpt/app/apps/vadmin/word/crud.py       +8  -0
dbgpt/app/apps/utils/filter.py
@@ -123,7 +123,6 @@ class DFAFilter():
         for keyword in datas:
             self.add(keyword.word_name)

     # Longest-match mode: ensure the sensitive-word filter matches and
     # replaces the longer sensitive words first
     def filter(self, message, repl="*"):
         is_sensitive = False
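For context, the longest-match behaviour that comment describes can be shown with a minimal trie-based sketch (illustrative only; this is not the project's DFAFilter, and all names here are made up):

    # Minimal longest-match filter: when several sensitive words share a
    # prefix, the longest one is matched and replaced.
    class TinyDFAFilter:
        def __init__(self):
            self.root = {}  # nested dicts as a trie; "END" marks a word boundary

        def add(self, word):
            node = self.root
            for ch in word:
                node = node.setdefault(ch, {})
            node["END"] = True

        def filter(self, message, repl="*"):
            out, i = [], 0
            while i < len(message):
                node, j, last_end = self.root, i, -1
                while j < len(message) and message[j] in node:
                    node = node[message[j]]
                    j += 1
                    if "END" in node:
                        last_end = j  # remember the longest match seen so far
                if last_end != -1:
                    out.append(repl * (last_end - i))
                    i = last_end
                else:
                    out.append(message[i])
                    i += 1
            return "".join(out)

    # f = TinyDFAFilter(); f.add("bad"); f.add("badword")
    # f.filter("a badword here")  ->  "a ******* here"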
dbgpt/app/apps/vadmin/keywordsviews.py
@@ -22,6 +22,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
 from typing import Any, Dict, Generic, Optional, TypeVar
 from dbgpt._private.pydantic import BaseModel, ConfigDict, Field, model_to_dict
+from dbgpt.app.apps.vadmin.word import crud
 from dbgpt.app.apps.utils.spach_keywords import my_spacy_nlp
 from dbgpt.app.apps.utils.filter import mydfafiter, mydfafiter_picture, mydfafiter_question, mydfafiter_video
@@ -84,7 +85,7 @@ def get_key_words_nlp(user_input: str) -> list:
     #print(words)
     return words

-async def get_media_datas_by(conv_uid: str, words: str, db: AsyncSession, knownledge: str) -> list:
+async def get_media_datas_by(conv_uid: str, words: [], db: AsyncSession, knownledge: str) -> list:
     # Fetch the group_id
     datas = []
     if knownledge != None:
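The one-line signature change above is the pivot of the commit: `words` goes from a single string to a list of keywords (annotated as `[]`; `list` or `typing.List[str]` would be the conventional spelling), and every query in the body below is reworked to iterate over that list.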
@@ -97,52 +98,56 @@ async def get_media_datas_by(conv_uid: str, words: str, db: AsyncSession, knownl
         for image_groups in image_datas:
             image_groupid = image_groups.get('group_id')
             print(f"===========>image_groupid:{image_groupid}")
-            # Use the matched keywords to fetch the images from the database
-            images_dic = {'page': 1, 'limit': 0, 'v_order': None, 'v_order_field': None, 'type': 1, 'group_id': image_groupid, 'key_word': ('like', words)}
-            images_datas, count = await MediaDal(db).get_datas(**images_dic, v_return_count=True)
-            print(f"-----查询到的图片为:---->:{images_datas}")
-            for data in images_datas:
-                json_image = {'type': MEDIA_TYPE1, 'file_name': data.get('file_name'), 'key_word': data.get('key_word'), 'local_path': data.get('local_path'), 'remote_path': data.get('remote_path')}
-                result.append(json_image)
+            # Iterate over the keyword array -> find the images for each keyword
+            for word in words:
+                # Use the matched keyword to fetch the images from the database
+                images_dic = {'page': 1, 'limit': 0, 'v_order': None, 'v_order_field': None, 'type': 1, 'group_id': image_groupid, 'key_word': word}
+                images_datas, count = await MediaDal(db).get_datas(**images_dic, v_return_count=True)
+                print(f"-----查询到的图片为:---->:{images_datas}")
+                for data in images_datas:
+                    json_image = {'type': MEDIA_TYPE1, 'file_name': data.get('file_name'), 'key_word': data.get('key_word'), 'local_path': data.get('local_path'), 'remote_path': data.get('remote_path')}
+                    result.append(json_image)

         video_datas = corrdata.get('video_group') or []
         for video_groups in video_datas:
             video_groupid = video_groups.get('group_id')
             print(f"===========>video_groupid:{video_groupid}")
-            # Use the matched keywords to fetch the videos from the database
-            video_dic = {'page': 1, 'limit': 0, 'v_order': None, 'v_order_field': None, 'type': 2, 'group_id': video_groupid, 'key_word': ('like', words)}
-            video_datas, count = await MediaDal(db).get_datas(**video_dic, v_return_count=True)
-            print(f"-----查询到的视频为:---->:{video_datas}")
-            for videodata in video_datas:
-                json_video = {'type': MEDIA_TYPE2, 'file_name': videodata.get('file_name'), 'key_word': videodata.get('key_word'), 'local_path': videodata.get('local_path'), 'remote_path': videodata.get('remote_path')}
-                result.append(json_video)
+            # Iterate over the keyword array -> find the videos for each keyword
+            for word in words:
+                # Use the matched keyword to fetch the videos from the database
+                video_dic = {'page': 1, 'limit': 0, 'v_order': None, 'v_order_field': None, 'type': 2, 'group_id': video_groupid, 'key_word': word}
+                video_datas, count = await MediaDal(db).get_datas(**video_dic, v_return_count=True)
+                print(f"-----查询到的视频为:---->:{video_datas}")
+                for videodata in video_datas:
+                    json_video = {'type': MEDIA_TYPE2, 'file_name': videodata.get('file_name'), 'key_word': videodata.get('key_word'), 'local_path': videodata.get('local_path'), 'remote_path': videodata.get('remote_path')}
+                    result.append(json_video)

         question_datas = corrdata.get('question_group') or []
         for question_groups in question_datas:
             question_groupid = question_groups.get('group_id')
             print(f"===========>question_groupid:{question_groupid}")
-            # The matched Q&A pairs
-            question_dic = {'page': 1, 'limit': 0, 'v_order': None, 'v_order_field': None, 'group_id': question_groupid, 'key_word': ('like', words)}
-            question_datas, count = await QuestionDal(db).get_datas(**question_dic, v_return_count=True)
-            print(f"-----查询到的问答对为:---->:{question_datas}")
-            for questiondata in question_datas:
-                json_question = {'type': MEDIA_TYPE4, 'title': questiondata.get('title'), 'key_word': questiondata.get('key_word'), 'answer': questiondata.get('answer')}
-                result.append(json_question)
+            # Iterate over the keyword array -> find the Q&A pairs for each keyword
+            for word in words:
+                # The matched Q&A pairs
+                question_dic = {'page': 1, 'limit': 0, 'v_order': None, 'v_order_field': None, 'group_id': question_groupid, 'key_word': word}
+                question_datas, count = await QuestionDal(db).get_datas(**question_dic, v_return_count=True)
+                print(f"-----查询到的问答对为:---->:{question_datas}")
+                for questiondata in question_datas:
+                    json_question = {'type': MEDIA_TYPE4, 'title': questiondata.get('title'), 'key_word': questiondata.get('key_word'), 'answer': questiondata.get('answer')}
+                    result.append(json_question)

         # Save to the chat-history resource database
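Two things change in this hunk: the fuzzy `('like', words)` filter is replaced by one exact-match query per keyword, and the same query loop now appears three times (images, videos, Q&A pairs). Since the three loops are structurally identical, they could be collapsed into one shared helper. A sketch only, under the assumption that `get_datas` accepts filter values as keyword arguments, as the `**dict` calls above suggest; `fetch_for_keywords` is hypothetical and not part of the commit:

    # Hypothetical shared loop: one exact-match query per keyword, with the
    # rows reshaped into the {'type': ..., field: ...} items appended above.
    async def fetch_for_keywords(dal, words, base_query, media_type, fields):
        out = []
        for word in words:
            rows, _count = await dal.get_datas(**base_query, key_word=word, v_return_count=True)
            for row in rows:
                item = {'type': media_type}
                item.update({f: row.get(f) for f in fields})
                out.append(item)
        return out

    # Usage sketch for the image branch:
    # result += await fetch_for_keywords(
    #     MediaDal(db), words,
    #     {'page': 1, 'limit': 0, 'v_order': None, 'v_order_field': None, 'type': 1, 'group_id': image_groupid},
    #     MEDIA_TYPE1, ['file_name', 'key_word', 'local_path', 'remote_path'])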
@@ -238,6 +243,40 @@ async def get_media_datas_all(conv_uid: str, default_model: str, db: AsyncSessio
     return ret_media_datas

+def filter_similar(key_words: list, similar_words: list) -> list:
+    """
+    Filter synonyms out of key_words, keeping only one word from each synonym group.
+
+    Args:
+        key_words: the keyword list to filter.
+        similar_words: a list of synonym groups; each group is a list of synonyms.
+
+    Returns:
+        f_words: the filtered keyword list, containing the words that belong to
+        no synonym group plus one representative word per group.
+
+    Example:
+        key_words = ['高兴', '快乐', '满足', '生气', '愤怒']
+        similar_words = [['高兴', '快乐', '满足'], ['生气', '愤怒']]
+        result = filter_similar(key_words, similar_words)
+        print(result)  # ['高兴', '生气']
+
+    In this example the function keeps '高兴' as the representative of the first
+    synonym group and '生气' as the representative of the second.
+    """
+    f_words = []
+    for word in key_words:
+        # Check whether this word or one of its synonyms is already in f_words
+        found_similar = False
+        for synonym_group in similar_words:
+            if word in synonym_group:
+                # If any word of this synonym group is already in the result
+                # list, skip the current word
+                if any(syn in f_words for syn in synonym_group):
+                    found_similar = True
+                break
+        if not found_similar:
+            f_words.append(word)
+    return f_words

 @router.post("/get_spacy_keywords", summary="资源列表(图片、视频)")
 async def get_spacy_keywords(dialogue: ConversationVo = Body(), auth: Auth = Depends(OpenAuth())):
     print(f"用户输入的问题:{dialogue.user_input} -- 选择的知识库为:{dialogue.select_param}")
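filter_similar keeps whichever group member appears first in key_words, so its output is order-dependent, and every word triggers a linear scan over all groups. A set-based variant with the same first-wins semantics (a sketch, not part of the commit) trades that for one dictionary lookup per word:

    def filter_similar_fast(key_words: list, similar_words: list) -> list:
        # Map each synonym to the index of its group, once up front.
        group_of = {}
        for gid, group in enumerate(similar_words):
            for w in group:
                group_of.setdefault(w, gid)
        f_words, seen_groups = [], set()
        for word in key_words:
            gid = group_of.get(word)
            if gid is None:
                f_words.append(word)       # not in any group: always keep
            elif gid not in seen_groups:
                seen_groups.add(gid)       # first word seen from this group wins
                f_words.append(word)
        return f_words

    # filter_similar_fast(['高兴', '快乐', '满足', '生气', '愤怒'],
    #                     [['高兴', '快乐', '满足'], ['生气', '愤怒']])
    # -> ['高兴', '生气']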
@@ -267,18 +306,26 @@ async def get_spacy_keywords(dialogue: ConversationVo = Body(), auth: Auth = Dep
             return SuccessResponse(result)
     # Returns type=3
     # When there are no sensitive words, look for related images or videos
-    words = get_key_words_nlp(dialogue.user_input)  # 100% match algorithm | take only the first match
+    words = get_key_words_nlp(dialogue.user_input)  # 100% match algorithm
     if len(words) > 0:
-        print(f"---算法1-匹配到的关键词--->:{words[0]}")
-        result = await get_media_datas_by(dialogue.conv_uid, words[0], auth.db, dialogue.select_param)
+        print(f"---算法1-匹配到的关键词--->:{words}")
+        # Load the synonym groups from the database | similar_words = [['高兴', '快乐', '满足'], ['生气', '愤怒']]
+        similar_words = await crud.SimilarDal(auth.db).get_similar_by_keyword()
+        key_words = filter_similar(words, similar_words)  # filter out the synonyms first
+        print(f"---算法1-过滤掉近义词后的关键词--->:{key_words}")
+        result = await get_media_datas_by(dialogue.conv_uid, key_words, auth.db, dialogue.select_param)
         return SuccessResponse(result)
     else:
         print(f"---算法2-begin--->")
         # The algorithm above found nothing; keep looking with a different one
         words2 = get_key_words(dialogue.user_input)
         if len(words2) > 0:
-            print(f"---算法2-匹配到的关键词--->:{words[0]}")
-            result = await get_media_datas_by(dialogue.conv_uid, words2[0], auth.db, dialogue.select_param)
+            print(f"---算法2-匹配到的关键词--->:{words2}")
+            # Load the synonym groups from the database | similar_words = [['高兴', '快乐', '满足'], ['生气', '愤怒']]
+            similar_words2 = await crud.SimilarDal(auth.db).get_similar_by_keyword()
+            key_words2 = filter_similar(words2, similar_words2)  # filter out the synonyms first
+            print(f"---算法2-过滤掉近义词后的关键词--->:{key_words2}")
+            result = await get_media_datas_by(dialogue.conv_uid, key_words2, auth.db, dialogue.select_param)
             return SuccessResponse(result)
         else:
             print(f"-----没有找到需要查询的内容:---->")
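Worth noting: in the removed branch-2 code, the log line printed `words[0]` even though `words` is empty on that path (the branch is only reached when `len(words) == 0`), which would have raised an IndexError; the rewrite logs `words2` instead, and both branches now pass the full synonym-filtered keyword list to get_media_datas_by rather than only the first match.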
dbgpt/app/apps/vadmin/word/crud.py
@@ -60,6 +60,14 @@ class SimilarDal(DalBase):
         else:
             print(f"编号:{similar1.id} 词条:{similar1.word_name} 近义词:{similar1.similar_name} ")

+    async def get_similar_by_keyword(self) -> list:
+        """
+        Query all synonyms by keyword | split on the Chinese comma | if the
+        database holds ASCII commas, convert them to Chinese commas first.
+        """
+        similar_datas = await self.get_datas(limit=0)
+        result = [item['similar_name'].replace(',', ',').split(',') for item in similar_datas]
+        return result

     async def update_similars(self, ids: [], data: SimilarUpdate):
         await self.db.execute(update(self.model).where(self.model.id.in_(ids)).values(
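To make the comma normalization in get_similar_by_keyword concrete, here is how the list comprehension shapes one row (the value is illustrative; real rows come from self.get_datas):

    row = {'similar_name': '高兴,快乐,满足'}   # mixed ASCII and Chinese commas
    group = row['similar_name'].replace(',', ',').split(',')
    print(group)  # ['高兴', '快乐', '满足']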