Commit 5bb40bed authored by 林洋洋

添加图片库备用

parent e6672292
package com.ask.api.entity;
import com.baomidou.mybatisplus.annotation.TableName;
import io.swagger.v3.oas.annotations.media.Schema;
import lombok.Data;
import java.time.LocalDateTime;
/**
 * Entity mapped to the {@code ask_images_record} table; one instance represents one stored image.
 */
@Data
@TableName(value = "ask_images_record")
@Schema(description = "图片表")
public class AskImagesRecord {
// Record identifier (read back via getId() after insert by DocumentParseService).
private Long id;
// Name of the image.
private String imageName;
// Raw image content as bytes.
private byte[] imageData;
}
......@@ -31,6 +31,16 @@
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.17.0</version>
</dependency>
<!-- <dependency>-->
<!-- <groupId>org.apache.tika</groupId>-->
<!-- <artifactId>tika-parsers-ocr</artifactId>-->
<!-- <version>3.0.0</version>-->
<!-- </dependency>-->
<!-- API -->
<dependency>
<groupId>com.ask</groupId>
......
......@@ -3,9 +3,17 @@ package com.ask.mapper;
import com.ask.api.entity.AskHistoryCollectData;
import com.baomidou.mybatisplus.core.mapper.BaseMapper;
import org.apache.ibatis.annotations.Mapper;
import org.apache.ibatis.annotations.Param;
import org.apache.ibatis.annotations.Select;
import java.time.LocalDateTime;
import java.util.List;
/**
 * MyBatis mapper for {@link AskHistoryCollectData}; generic CRUD operations come from {@link BaseMapper}.
 */
@Mapper
public interface AskHistoryCollectDataMapper extends BaseMapper<AskHistoryCollectData> {
/**
 * Returns, for every requested path, the single row with the latest {@code datetime}
 * whose {@code datetime} lies between {@code startTime} and {@code endTime} (SQL BETWEEN, both ends inclusive).
 * The statement is defined in the mapper XML under the same id.
 *
 * @param paths     paths to look up; callers are expected to pass a non-empty list because the
 *                  values are expanded into an SQL {@code IN (...)} clause
 * @param startTime lower bound of the time window
 * @param endTime   upper bound of the time window
 * @return at most one row per path; paths without data in the window are absent
 */
List<AskHistoryCollectData> selectLastRecordForEachPath(@Param("paths") List<String> paths,
@Param("startTime") LocalDateTime startTime,
@Param("endTime") LocalDateTime endTime);
}
\ No newline at end of file
package com.ask.mapper;
import com.ask.api.entity.AskHistoryCollectData;
import com.ask.api.entity.AskImagesRecord;
import com.baomidou.mybatisplus.core.mapper.BaseMapper;
import org.apache.ibatis.annotations.Mapper;
/**
 * MyBatis mapper for {@link AskImagesRecord}. It declares no methods of its own;
 * all operations (insert, selectById, updateById, ...) are inherited from {@link BaseMapper}.
 */
@Mapper
public interface AskImagesRecordMapper extends BaseMapper<AskImagesRecord> {
}
package com.ask.service.impl;
import com.ask.api.entity.AskImagesRecord;
import com.ask.mapper.AskImagesRecordMapper;
import lombok.AllArgsConstructor;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.microsoft.OfficeParserConfig;
import org.apache.tika.sax.ContentHandlerDecorator;
import org.springframework.stereotype.Component;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import java.io.IOException;
import java.io.InputStream;
import java.util.LinkedHashMap;
import java.util.Map;
/**
 * Parses a document with Apache Tika and renders its text as lightweight Markdown.
 * Embedded images are stored through {@link AskImagesRecordMapper} and referenced in the
 * generated text by an {@code ![name](ImageId:id)} placeholder.
 */
@Component
@AllArgsConstructor
public class DocumentParseService {

    /** Prefix carried by the {@code src} attribute of {@code img} elements that point at embedded resources. */
    private static final String EMBEDDED_PREFIX = "embedded:";

    private final AskImagesRecordMapper askImagesRecordMapper;

    /**
     * Extracts the document text as Markdown and persists the embedded images it encounters.
     *
     * @param inputStream document content; this method does not close the stream
     * @return the generated Markdown, with one image placeholder per {@code img} element
     * @throws RuntimeException wrapping the underlying I/O, SAX or Tika failure when parsing fails
     */
    public String extractText(InputStream inputStream) {
        // Parser, metadata and context for this single parse run.
        Parser parser = new AutoDetectParser();
        Metadata metadata = new Metadata();
        ParseContext parseContext = new ParseContext();

        // Ignore headers and footers of office documents.
        OfficeParserConfig officeParserConfig = new OfficeParserConfig();
        officeParserConfig.setIncludeHeadersAndFooters(false);
        parseContext.set(OfficeParserConfig.class, officeParserConfig);

        // Image name -> id of the stored record; shared by the content handler and the extractor.
        Map<String, Long> imageMap = new LinkedHashMap<>();

        // Content handler that builds the Markdown and inserts image placeholders.
        class MarkdownImageHandler extends ContentHandlerDecorator {
            private final StringBuilder markdown = new StringBuilder();
            // Prefix owed to the next text chunk. It is emitted once per h1/p element, because
            // SAX may deliver the text of one element in several characters() calls and the
            // first text of a paragraph may sit inside a nested inline element.
            private String pendingPrefix = null;

            @Override
            public void startElement(String uri, String localName, String qName, Attributes attrs) {
                if ("h1".equals(localName)) {
                    pendingPrefix = "# ";
                } else if ("p".equals(localName)) {
                    pendingPrefix = "\n";
                } else if ("img".equals(localName)) {
                    appendImagePlaceholder(attrs);
                }
            }

            @Override
            public void endElement(String uri, String localName, String qName) {
                // An empty heading/paragraph must not leak its prefix into a following element.
                if ("h1".equals(localName) || "p".equals(localName)) {
                    pendingPrefix = null;
                }
            }

            @Override
            public void characters(char[] ch, int start, int length) {
                if (length <= 0) {
                    return;
                }
                if (pendingPrefix != null) {
                    markdown.append(pendingPrefix);
                    pendingPrefix = null;
                }
                markdown.append(ch, start, length);
            }

            /** Registers the embedded image referenced by an img element and writes its placeholder. */
            private void appendImagePlaceholder(Attributes attrs) {
                String src = attrs == null ? null : attrs.getValue("src");
                if (src == null || !src.startsWith(EMBEDDED_PREFIX)) {
                    return;
                }
                // Strip only the prefix so that names containing ':' stay intact.
                String imageName = src.substring(EMBEDDED_PREFIX.length());
                Long imageId = imageMap.get(imageName);
                if (imageId == null) {
                    // First sighting of this image: create the record now, the bytes are
                    // attached when the embedded resource itself is delivered.
                    AskImagesRecord imageRecord = new AskImagesRecord();
                    imageRecord.setImageName(imageName);
                    askImagesRecordMapper.insert(imageRecord);
                    imageId = imageRecord.getId();
                    imageMap.put(imageName, imageId);
                }
                markdown.append("![").append(imageName).append("](ImageId:").append(imageId).append(")\n");
            }

            public String getMarkdown() {
                return markdown.toString();
            }
        }
        MarkdownImageHandler contentHandler = new MarkdownImageHandler();

        EmbeddedDocumentExtractor extractor = new EmbeddedDocumentExtractor() {
            @Override
            public boolean shouldParseEmbedded(Metadata embeddedMetadata) {
                // Only image resources are handled.
                String contentType = embeddedMetadata.get(Metadata.CONTENT_TYPE);
                return contentType != null && contentType.startsWith("image/");
            }

            @Override
            public void parseEmbedded(InputStream embeddedStream, ContentHandler embeddedHandler,
                                      Metadata embeddedMetadata, boolean outputHtml) throws IOException, SAXException {
                String fileName = embeddedMetadata.get("resourceName");
                if (fileName == null) {
                    // Without a name the image cannot be matched to a placeholder.
                    return;
                }
                byte[] imageData = embeddedStream.readAllBytes();
                Long imageId = imageMap.get(fileName);
                AskImagesRecord imageRecord = imageId == null ? null : askImagesRecordMapper.selectById(imageId);
                if (imageRecord == null) {
                    // The resource arrived before (or without) its img element: store it now and
                    // register the id so a later placeholder reuses this record.
                    imageRecord = new AskImagesRecord();
                    imageRecord.setImageName(fileName);
                    imageRecord.setImageData(imageData);
                    askImagesRecordMapper.insert(imageRecord);
                    imageMap.put(fileName, imageRecord.getId());
                    return;
                }
                imageRecord.setImageData(imageData);
                askImagesRecordMapper.updateById(imageRecord);
            }
        };
        parseContext.set(EmbeddedDocumentExtractor.class, extractor);

        // Run the parse; checked parser failures are rethrown unchecked with their cause preserved.
        try {
            parser.parse(inputStream, contentHandler, metadata, parseContext);
        } catch (IOException | SAXException | TikaException e) {
            throw new RuntimeException("Failed to parse document: " + e.getMessage(), e);
        }
        return contentHandler.getMarkdown();
    }
}
\ No newline at end of file
......@@ -45,7 +45,7 @@ public class KnowledgeDocumentServiceImpl extends ServiceImpl<KnowledgeDocumentM
private final SysFileService sysFileService;
private final AskVectorStoreService askVectorStoreService;
private final AsyncVectorizationService asyncVectorizationService;
private final DocumentParseService documentParseService;
/**
......@@ -78,8 +78,8 @@ public class KnowledgeDocumentServiceImpl extends ServiceImpl<KnowledgeDocumentM
* @param maxTokensPerSlice 每片最大token数(仅对CUSTOM策略有效)
* @return 文档片段列表,每个Document就是一片
*/
public List<Document> slicePdfDocument(String bucketName,String fileName , SliceStrategy sliceStrategy) {
InputStreamResource resource = new InputStreamResource(sysFileService.getFileStream(bucketName,fileName));
public List<Document> slicePdfDocument(String bucketName, String fileName, SliceStrategy sliceStrategy) {
InputStreamResource resource = new InputStreamResource(sysFileService.getFileStream(bucketName, fileName));
List<Document> documents = new ArrayList<>();
try {
......@@ -99,9 +99,9 @@ public class KnowledgeDocumentServiceImpl extends ServiceImpl<KnowledgeDocumentM
}
log.info("PDF切片完成,策略: {}, 切片数量: {}", sliceStrategy, documents.size());
}catch (Exception e) {
} catch (Exception e) {
log.error("PDF切片失败,策略: {}, 错误: {}", sliceStrategy, e.getMessage(), e);
documents = sliceByTokens(new InputStreamResource(sysFileService.getFileStream(bucketName,fileName)));
documents = sliceByTokens(new InputStreamResource(sysFileService.getFileStream(bucketName, fileName)));
}
return documents;
......@@ -348,6 +348,28 @@ public class KnowledgeDocumentServiceImpl extends ServiceImpl<KnowledgeDocumentM
return tokenDocuments;
}
/**
 * Splits plain text into token-based slices and annotates every slice with metadata.
 * Each returned {@link Document} gets a {@code size} entry (character length of its text)
 * and an empty {@code title} entry.
 *
 * @param text the full text to split
 * @return the slices produced by the splitter, in the order the splitter returned them
 */
private List<Document> sliceByTokens(String text) {
    // Splitter settings, in constructor order: 4096, 50, 50, 1000, keep separators.
    // NOTE(review): the original inline comments call the 4th argument a "max fragment size";
    // in Spring AI's TokenTextSplitter it is presumably the maximum number of chunks - confirm
    // against the library version in use.
    TokenTextSplitter splitter = new TokenTextSplitter(4096, 50, 50, 1000, true);

    List<Document> slices = splitter.apply(List.of(new Document(text)));
    for (Document slice : slices) {
        int sliceLength = Objects.requireNonNull(slice.getText()).length();
        slice.getMetadata().put("size", sliceLength);
        slice.getMetadata().put("title", "");
    }
    return slices;
}
/**
* PDF切片策略枚举
*/
......@@ -361,9 +383,12 @@ public class KnowledgeDocumentServiceImpl extends ServiceImpl<KnowledgeDocumentM
List<DocumentSegmentResult> results = new ArrayList<>();
for (SysFile file : request.getFiles()) {
// String docText = documentParseService.extractText(sysFileService.getFileStream(file.getBucketName(), file.getFileName()));
// List<Document> segments = sliceByTokens(docText);
SliceStrategy sliceStrategy = SliceStrategy.CUSTOM;
if("pdf".equals(file.getType())){
sliceStrategy =SliceStrategy.PARAGRAPH;
if ("pdf".equals(file.getType())) {
sliceStrategy = SliceStrategy.PARAGRAPH;
}
// 读取文档内容 - 使用新的PDF切片函数
List<Document> segments = slicePdfDocument(
......
......@@ -21,11 +21,9 @@ import java.time.format.DateTimeFormatter;
import java.time.format.DateTimeFormatterBuilder;
import java.time.format.SignStyle;
import java.time.temporal.ChronoField;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.*;
import java.util.concurrent.atomic.AtomicReference;
import java.util.stream.Collectors;
@Component
@Slf4j
......@@ -105,6 +103,28 @@ public class ExcelTools {
List<AskReportDict> askReportDicts = askReportDictMapper.selectList(
Wrappers.lambdaQuery(AskReportDict.class).eq(AskReportDict::getType, 1)
);
List<String> paramSer = new ArrayList<>();
for (AskReportDict askReportDict : askReportDicts) {
askReportDict.convertParamsToParamList();
askReportDict.getParamList().forEach(params -> {
if (params.getType() == 1) {
if (!paramSer.contains(params.getParam())) {
paramSer.add(params.getParam());
}
}
});
}
if (CollectionUtils.isEmpty(paramSer)) {
return paramMap;
}
List<AskHistoryCollectData> askHistoryCollectDataList = askHistoryCollectDataMapper.selectLastRecordForEachPath(paramSer, startTime, endTime);
Map<String, Double> result = askHistoryCollectDataList.stream()
.collect(Collectors.toMap(
AskHistoryCollectData::getPath, // 获取 path 作为键
AskHistoryCollectData::getValue, // 获取 value 作为值
(existingValue, newValue) -> existingValue // 如果有重复的键,这里决定如何处理。这里选择保留现有的值。
));
for (AskReportDict askReportDict : askReportDicts) {
String key = askReportDict.getKey();
......@@ -117,13 +137,13 @@ public class ExcelTools {
switch (askReportDict.getType()) {
case 1:
handleType1(askReportDict.getKey(), params, paramMap, startTime, endTime);
handleType1(askReportDict.getKey(), params, paramMap, result);
break;
case 2:
handleType2(askReportDict.getKey(), params, paramMap, startTime, endTime);
handleType2(askReportDict.getKey(), params, paramMap, result);
break;
case 3:
handleType3(askReportDict.getKey(), params, paramMap, startTime, endTime);
handleType3(askReportDict.getKey(), params, paramMap, result);
break;
default:
// Handle other types if necessary
......@@ -134,50 +154,50 @@ public class ExcelTools {
return paramMap;
}
private void handleType1(String key, List<AskReportDict.Params> params, Map<String, Object> paramMap, LocalDateTime startTime, LocalDateTime endTime) {
private void handleType1(String key, List<AskReportDict.Params> params, Map<String, Object> paramMap, Map<String, Double> result) {
AskReportDict.Params param = params.get(0);
if (param == null) {
return;
}
Double value = getLatestValue(param.getParam(), startTime, endTime);
Double value = result.get(param.getParam());
if (value != null) {
paramMap.put(key, value);
}
}
private void handleType2(String key, List<AskReportDict.Params> params, Map<String, Object> paramMap, LocalDateTime startTime, LocalDateTime endTime) {
private void handleType2(String key, List<AskReportDict.Params> params, Map<String, Object> paramMap, Map<String, Double> result) {
if (params.size() < 2) {
return;
}
Double numerator = getParamValue(params.get(0), startTime, endTime);
Double denominator = getParamValue(params.get(1), startTime, endTime);
Double numerator = result.get(params.get(0).getParam());
Double denominator = result.get(params.get(1).getParam());
if (numerator == null || denominator == null || denominator == 0) {
return;
}
double result = Math.round(numerator / denominator * 100.0) / 100.0; // 保留两位小数
paramMap.put(key, result);
Double value = Math.round(numerator / denominator * 100.0) / 100.0; // 保留两位小数
paramMap.put(key, value);
}
private void handleType3(String key, List<AskReportDict.Params> params, Map<String, Object> paramMap, LocalDateTime startTime, LocalDateTime endTime) {
private void handleType3(String key, List<AskReportDict.Params> params, Map<String, Object> paramMap, Map<String, Double> result) {
if (params.size() < 2) {
return;
}
Double numerator = getParamValue(params.get(0), startTime, endTime);
Double denominator = getParamValue(params.get(1), startTime, endTime);
Double numerator = result.get(params.get(0).getParam());
Double denominator = result.get(params.get(1).getParam());
if (numerator == null || denominator == null || denominator == 0) {
return;
}
double result = (numerator / denominator) * 100; // 计算百分比
result = Math.round(result * 100.0) / 100.0; // 保留两位小数
Double value = (numerator / denominator) * 100; // 计算百分比
value = Math.round(value * 100.0) / 100.0; // 保留两位小数
String percentageResult = String.format("%.2f%%", result); // 格式化为百分比字符串
String percentageResult = String.format("%.2f%%", value); // 格式化为百分比字符串
paramMap.put(key, percentageResult);
}
......
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN" "http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<mapper namespace="com.ask.mapper.AskHistoryCollectDataMapper">
<!--
  For each requested path, returns the row with the latest datetime inside
  [startTime, endTime] (BETWEEN is inclusive). Rows are ranked per path by datetime
  descending and only rank 1 is kept; rows sharing the same latest datetime are
  not ordered further, so which of them wins is up to the database.
  A null or empty "paths" list yields no rows instead of an invalid empty IN clause.
-->
<select id="selectLastRecordForEachPath" resultType="com.ask.api.entity.AskHistoryCollectData">
    SELECT *
    FROM (
        SELECT *,
               ROW_NUMBER() OVER (PARTITION BY path ORDER BY datetime DESC) AS rn
        FROM ask_history_collect_data
        WHERE
        <choose>
            <when test="paths != null and paths.size() > 0">
                path IN
                <foreach item="item" collection="paths" open="(" separator="," close=")">
                    #{item}
                </foreach>
            </when>
            <otherwise>
                1 = 0
            </otherwise>
        </choose>
        AND datetime BETWEEN #{startTime} AND #{endTime}
    ) t
    WHERE t.rn = 1
</select>
</mapper>
......@@ -15,7 +15,7 @@ spring:
datasource:
# NOTE(review): the database host, user name and password below are committed in plain text;
# consider supplying them from environment variables or a secret store and rotating the password.
# NOTE(review): two password entries appear here - presumably the removed and the added line of
# the diff view; the real file must contain only one.
url: jdbc:postgresql://8.152.98.45:5432/ask_data_ai_db?stringtype=unspecified
username: postgres
password: postgres123
password: e5d039e4ba5246068
driver-class-name: org.postgresql.Driver
ai:
vectorstore:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment