AI-accounting-soft/src/main/java/com/accounting/util/OcrAmountParser.java
2025-12-12 16:37:27 +08:00

346 lines
12 KiB
Java
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package com.accounting.util;
import com.alibaba.fastjson2.JSON;
import com.alibaba.fastjson2.JSONArray;
import com.alibaba.fastjson2.JSONObject;
import java.math.BigDecimal;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.time.format.DateTimeParseException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Extracts transaction entries (amount, merchant, date/time) from the JSON produced by an
 * OCR service.
 *
 * <p>The entry point is {@link #parse(String)}. It expects a JSON object whose {@code data}
 * field holds (or serializes to) another JSON object with a {@code content} string. That
 * string is split on single spaces; every token starting with {@code +} or {@code -} is
 * treated as an amount, the token before it as the merchant text, and the first of the
 * following three tokens that looks like a "M月D日" date as the transaction time.
 *
 * <p>All methods are static and the class keeps no mutable state.
 */
public class OcrAmountParser {

    /**
     * Amount pattern: optional currency sign, up to 10 integer digits, optional 1-2 digit
     * fraction, optional trailing "元" (e.g. ¥100.00, 100.00元, 100元, 100.00).
     * Group 1 is the bare number.
     *
     * <p>The two currency signs (U+00A5 and full-width U+FFE5) are written as escapes because
     * they are easy to confuse in an editor. The prefix and suffix are optional and the
     * pattern is only used with {@code find()}, so they never change what group 1 captures.
     */
    private static final Pattern AMOUNT_PATTERN = Pattern.compile(
            "[\\u00A5\\uFFE5]?\\s*(\\d{1,10}(\\.\\d{1,2})?)\\s*[元]?"
    );

    /**
     * A grouping comma (ASCII or full-width U+FF0C) that sits between a digit and a run of
     * exactly three digits, as in "1,234.56". Such commas are removed before an amount is
     * matched; otherwise {@link #AMOUNT_PATTERN} would stop at the comma and yield "1".
     */
    private static final Pattern THOUSANDS_SEPARATOR = Pattern.compile(
            "(?<=\\d)[,\\uFF0C](?=\\d{3}(?!\\d))"
    );

    /** Cheap date check: the whole string contains "月" followed, somewhere later, by "日". */
    private static final Pattern UNION_DATE_PATTERN = Pattern.compile(".*月.*日.*");

    /**
     * Date-time pattern such as "12月2日 13:14": groups are month, day, hour, minute.
     * The hour/minute separator may be an ASCII colon or a full-width colon (U+FF1A).
     */
    private static final Pattern DATE_PATTERN = Pattern.compile(
            "(\\d{1,2})月(\\d{1,2})日\\s*(\\d{1,2})[:\\uFF1A](\\d{1,2})"
    );

    /** Keywords (payment platforms, shop types) that are stripped from a merchant line. */
    private static final String[] MERCHANT_KEYWORDS = {
            "微信支付", "支付宝", "收款", "付款", "商户", "商家", "店铺", "超市", "餐厅", "饭店"
    };

    /**
     * One parsed transaction entry. Any field may be {@code null} when the corresponding
     * piece of text could not be recognized. {@link #parse(String)} only fills
     * {@code confidence} on the placeholder result it returns for an empty payload.
     */
    public static class ParseResult {
        private BigDecimal amount;
        private String merchant;
        // LocalDateTime (rather than LocalDate) so the time of day is kept as well.
        private LocalDateTime date;
        private BigDecimal confidence;

        public ParseResult(BigDecimal amount, String merchant, LocalDateTime date, BigDecimal confidence) {
            this.amount = amount;
            this.merchant = merchant;
            this.date = date;
            this.confidence = confidence;
        }

        public ParseResult() {
        }

        public void setAmount(BigDecimal amount) {
            this.amount = amount;
        }

        public void setMerchant(String merchant) {
            this.merchant = merchant;
        }

        public void setDate(LocalDateTime date) {
            this.date = date;
        }

        public void setConfidence(BigDecimal confidence) {
            this.confidence = confidence;
        }

        public BigDecimal getAmount() {
            return amount;
        }

        public String getMerchant() {
            return merchant;
        }

        public LocalDateTime getDate() {
            return date;
        }

        public BigDecimal getConfidence() {
            return confidence;
        }
    }

    /**
     * Tells whether a token looks like a date, i.e. contains "月" followed by "日".
     *
     * @param dateStr candidate token, may be {@code null}
     * @return {@code true} if the trimmed token matches {@link #UNION_DATE_PATTERN}
     */
    private static boolean isValidDate(String dateStr) {
        if (dateStr == null || dateStr.trim().isEmpty()) {
            return false;
        }
        return UNION_DATE_PATTERN.matcher(dateStr.trim()).matches();
    }

    /**
     * Converts a signed amount token such as "+12.50", "-1,234.56" or "-¥8" to a number.
     *
     * @param moneyStr amount token, may be {@code null}
     * @return the signed amount, or {@code null} if the token is blank or contains no number
     */
    private static BigDecimal parseMoneyString(String moneyStr) {
        if (moneyStr == null) {
            return null;
        }
        String cleaned = moneyStr.trim();
        if (cleaned.isEmpty()) {
            return null;
        }

        // Remember the sign, then drop a single leading '+' or '-'.
        boolean negative = cleaned.startsWith("-");
        if (negative || cleaned.startsWith("+")) {
            cleaned = cleaned.substring(1);
        }

        // "1,234.56" must be read as 1234.56, not as 1.
        cleaned = THOUSANDS_SEPARATOR.matcher(cleaned).replaceAll("");

        Matcher matcher = AMOUNT_PATTERN.matcher(cleaned);
        if (!matcher.find()) {
            return null;
        }
        // Group 1 is digits with an optional fraction, which BigDecimal always accepts.
        BigDecimal amount = new BigDecimal(matcher.group(1));
        return negative ? amount.negate() : amount;
    }

    /**
     * Parses a "M月D日 H:m" fragment into a date-time.
     *
     * <p>The text carries no year, so the current year is assumed. If that puts the result
     * more than one day in the future (for example a December entry read in January), the
     * previous year is used instead, since a recorded transaction cannot lie in the future.
     *
     * @param dateStr text containing the date, e.g. "12月2日 13:14"; may be {@code null}
     * @return the parsed date-time, or {@code null} if the text is blank, does not match
     *         {@link #DATE_PATTERN}, or holds out-of-range values (e.g. month 13)
     */
    private static LocalDateTime parseDateTimeString(String dateStr) {
        if (dateStr == null || dateStr.trim().isEmpty()) {
            return null;
        }
        Matcher matcher = DATE_PATTERN.matcher(dateStr.trim());
        if (!matcher.find()) {
            return null;
        }

        // Each group is one or two ASCII digits, so parseInt cannot fail here.
        int month = Integer.parseInt(matcher.group(1));
        int day = Integer.parseInt(matcher.group(2));
        int hour = Integer.parseInt(matcher.group(3));
        int minute = Integer.parseInt(matcher.group(4));

        try {
            LocalDateTime now = LocalDateTime.now();
            LocalDateTime candidate = LocalDateTime.of(now.getYear(), month, day, hour, minute);
            // One day of slack tolerates clock or time-zone differences.
            if (candidate.isAfter(now.plusDays(1))) {
                candidate = candidate.minusYears(1);
            }
            return candidate;
        } catch (java.time.DateTimeException ignored) {
            // Out-of-range field (month 13, day 32, hour 25, ...): treat as "no date".
            return null;
        }
    }

    /**
     * Parses an OCR response and extracts one {@link ParseResult} per signed amount found.
     *
     * @param ocrResultJson JSON text of the form {@code {"data": {"content": "..."}}}, where
     *                      {@code data} may also be a string holding the inner JSON
     * @return the entries in the order they appear; an empty list if the content holds no
     *         signed amount; a single placeholder result (all fields {@code null},
     *         confidence zero) if the payload, {@code data} or {@code content} is missing
     * @throws RuntimeException if the JSON cannot be parsed; the original exception is
     *                          attached as the cause
     */
    public static List<ParseResult> parse(String ocrResultJson) {
        try {
            JSONObject root = JSON.parseObject(ocrResultJson);
            String data = root == null ? null : root.getString("data");
            JSONObject dataObject = data == null ? null : JSON.parseObject(data);
            String content = dataObject == null ? null : dataObject.getString("content");
            if (content == null) {
                // Nothing recognizable: same placeholder as for a missing "data" field.
                return List.of(new ParseResult(null, null, null, BigDecimal.ZERO));
            }

            String[] tokens = content.split(" ");
            List<ParseResult> results = new ArrayList<>();
            // Start at 1: an amount needs a preceding token to serve as merchant text.
            for (int i = 1; i < tokens.length; i++) {
                String current = tokens[i].trim();
                if (!current.startsWith("+") && !current.startsWith("-")) {
                    continue;
                }

                ParseResult result = new ParseResult();
                result.setAmount(parseMoneyString(current));
                result.setMerchant(parseMerchant(tokens[i - 1].trim()));
                String dateToken = findDateToken(tokens, i);
                if (dateToken != null) {
                    result.setDate(parseDateTimeString(dateToken));
                }
                results.add(result);
            }
            return results;
        } catch (Exception e) {
            throw new RuntimeException("解析过程中出错,请重试或联系管理员,报错信息:" + e, e);
        }
    }

    /**
     * Returns the first of the (up to) three tokens after {@code index} that looks like a
     * date, trimmed, or {@code null} if none of them does.
     */
    private static String findDateToken(String[] tokens, int index) {
        int last = Math.min(index + 3, tokens.length - 1);
        for (int j = index + 1; j <= last; j++) {
            String candidate = tokens[j].trim();
            if (isValidDate(candidate)) {
                return candidate;
            }
        }
        return null;
    }

    /**
     * Collects the recognized text from an OCR data object: the {@code word} values of the
     * {@code prism_wordsInfo} array joined by spaces, or, if that yields nothing, the
     * {@code content} string.
     *
     * <p>Not referenced by the other methods visible in this class.
     *
     * @param data OCR data object
     * @return the trimmed text, possibly empty
     */
    private static String extractContent(JSONObject data) {
        StringBuilder text = new StringBuilder();

        JSONArray wordsInfo = data.getJSONArray("prism_wordsInfo");
        if (wordsInfo != null) {
            for (int i = 0; i < wordsInfo.size(); i++) {
                String word = wordsInfo.getJSONObject(i).getString("word");
                if (word != null) {
                    text.append(word).append(" ");
                }
            }
        }

        // Fall back to the plain "content" field when no words were collected.
        if (text.length() == 0) {
            String fallback = data.getString("content");
            if (fallback != null) {
                text.append(fallback);
            }
        }
        return text.toString().trim();
    }

    /**
     * Finds every amount in the text and returns the largest one.
     *
     * <p>Not referenced by the other methods visible in this class.
     *
     * @param content text to scan
     * @return the largest amount found, or {@code null} if there is none
     */
    private static BigDecimal parseAmount(String content) {
        Matcher matcher = AMOUNT_PATTERN.matcher(content);
        BigDecimal largest = null;
        while (matcher.find()) {
            try {
                BigDecimal amount = new BigDecimal(matcher.group(1));
                if (largest == null || amount.compareTo(largest) > 0) {
                    largest = amount;
                }
            } catch (NumberFormatException ignored) {
                // Skip a fragment that is not a valid number.
            }
        }
        return largest;
    }

    /**
     * Extracts the calendar date of the first "M月D日 H:m" fragment in the text.
     *
     * <p>Delegates to {@link #parseDateTimeString(String)} so both methods read the groups
     * of {@link #DATE_PATTERN} the same way (month, day, hour, minute) and apply the same
     * year rule. Not referenced by the other methods visible in this class.
     *
     * @param content text to scan, may be {@code null}
     * @return the date, or {@code null} if no valid date-time fragment is found
     */
    private static LocalDate parseDate(String content) {
        LocalDateTime dateTime = parseDateTimeString(content);
        return dateTime == null ? null : dateTime.toLocalDate();
    }

    /**
     * Derives a merchant name from a piece of text.
     *
     * <p>First pass: for each line containing a merchant keyword, remove every occurrence
     * of that keyword and return the remainder if it is non-empty and shorter than 50
     * characters. Second pass: return the first non-empty line shorter than 50 characters
     * in which neither {@link #AMOUNT_PATTERN} nor {@link #DATE_PATTERN} finds a match.
     * Note that any line containing a digit counts as an amount and is skipped.
     *
     * @param content text to inspect (one or more lines separated by '\n')
     * @return the merchant name, or {@code null} if no line qualifies
     */
    private static String parseMerchant(String content) {
        String[] lines = content.split("\n");

        for (String line : lines) {
            for (String keyword : MERCHANT_KEYWORDS) {
                if (!line.contains(keyword)) {
                    continue;
                }
                String stripped = line.replace(keyword, "").trim();
                if (!stripped.isEmpty() && stripped.length() < 50) {
                    return stripped;
                }
            }
        }

        for (String rawLine : lines) {
            String line = rawLine.trim();
            if (line.isEmpty() || line.length() >= 50) {
                continue;
            }
            boolean looksLikeAmount = AMOUNT_PATTERN.matcher(line).find();
            boolean looksLikeDate = DATE_PATTERN.matcher(line).find();
            if (!looksLikeAmount && !looksLikeDate) {
                return line;
            }
        }
        return null;
    }
}