346 lines
12 KiB
Java
346 lines
12 KiB
Java
package com.accounting.util;
|
||
|
||
import com.alibaba.fastjson2.JSON;
|
||
import com.alibaba.fastjson2.JSONArray;
|
||
import com.alibaba.fastjson2.JSONObject;
|
||
|
||
import java.math.BigDecimal;
|
||
import java.time.LocalDate;
|
||
import java.time.LocalDateTime;
|
||
import java.time.format.DateTimeFormatter;
|
||
import java.time.format.DateTimeParseException;
|
||
import java.util.ArrayList;
|
||
import java.util.HashMap;
|
||
import java.util.List;
|
||
import java.util.Map;
|
||
import java.util.regex.Matcher;
|
||
import java.util.regex.Pattern;
|
||
|
||
public class OcrAmountParser {
|
||
|
||
// 金额正则表达式:匹配 ¥100.00、100.00元、100元、100.00 等格式
|
||
private static final Pattern AMOUNT_PATTERN = Pattern.compile(
|
||
"[¥¥]?\\s*(\\d{1,10}(\\.\\d{1,2})?)\\s*[元]?"
|
||
);
|
||
|
||
// 日期正则表达式:匹配字符串中是否含有月或日
|
||
private static final Pattern UNION_DATE_PATTERN = Pattern.compile(".*月.*日.*");
|
||
|
||
|
||
// 日期正则表达式:匹配 12月2日13:14 这样的格式
|
||
private static final Pattern DATE_PATTERN = Pattern.compile(
|
||
"(\\d{1,2})月(\\d{1,2})日\\s*(\\d{1,2})[::](\\d{1,2})"
|
||
);
|
||
|
||
// 商户名称关键词(常见支付平台)
|
||
private static final String[] MERCHANT_KEYWORDS = {
|
||
"微信支付", "支付宝", "收款", "付款", "商户", "商家", "店铺", "超市", "餐厅", "饭店"
|
||
};
|
||
|
||
public static class ParseResult {
|
||
private BigDecimal amount;
|
||
private String merchant;
|
||
private LocalDateTime date; // 改为LocalDateTime以支持时间
|
||
private BigDecimal confidence;
|
||
|
||
public ParseResult(BigDecimal amount, String merchant, LocalDateTime date, BigDecimal confidence) {
|
||
this.amount = amount;
|
||
this.merchant = merchant;
|
||
this.date = date;
|
||
this.confidence = confidence;
|
||
}
|
||
|
||
public ParseResult() {
|
||
|
||
}
|
||
|
||
public void setAmount(BigDecimal amount) {
|
||
this.amount = amount;
|
||
}
|
||
|
||
public void setMerchant(String merchant) {
|
||
this.merchant = merchant;
|
||
}
|
||
|
||
public void setDate(LocalDateTime date) {
|
||
this.date = date;
|
||
}
|
||
|
||
public void setConfidence(BigDecimal confidence) {
|
||
this.confidence = confidence;
|
||
}
|
||
|
||
public BigDecimal getAmount() { return amount; }
|
||
public String getMerchant() { return merchant; }
|
||
public LocalDateTime getDate() { return date; }
|
||
public BigDecimal getConfidence() { return confidence; }
|
||
}
|
||
|
||
/**
|
||
* 验证字符串是否为有效日期格式
|
||
*/
|
||
private static boolean isValidDate(String dateStr) {
|
||
if (dateStr == null || dateStr.trim().isEmpty()) {
|
||
return false;
|
||
}
|
||
return UNION_DATE_PATTERN.matcher(dateStr.trim()).matches();
|
||
}
|
||
|
||
/**
|
||
* 将字符串转换为BigDecimal
|
||
* @param moneyStr 金额字符串
|
||
* @return BigDecimal对象,如果转换失败则返回null
|
||
*/
|
||
private static BigDecimal parseMoneyString(String moneyStr) {
|
||
if (moneyStr == null || moneyStr.trim().isEmpty()) {
|
||
return null;
|
||
}
|
||
|
||
try {
|
||
// 判断正负号
|
||
String cleanStr = moneyStr.trim();
|
||
boolean isNegative = cleanStr.startsWith("-");
|
||
|
||
// 移除可能的前缀符号 (+/-)
|
||
if (cleanStr.startsWith("+") || cleanStr.startsWith("-")) {
|
||
cleanStr = cleanStr.substring(1);
|
||
}
|
||
|
||
// 使用现有的金额正则表达式匹配
|
||
Matcher matcher = AMOUNT_PATTERN.matcher(cleanStr);
|
||
if (matcher.find()) {
|
||
String amountStr = matcher.group(1);
|
||
BigDecimal amount = new BigDecimal(amountStr);
|
||
// 应用正负号
|
||
return isNegative ? amount.negate() : amount;
|
||
}
|
||
} catch (Exception e) {
|
||
// 转换失败
|
||
e.printStackTrace();
|
||
}
|
||
return null;
|
||
}
|
||
|
||
/**
|
||
* 解析日期字符串为LocalDateTime对象
|
||
* @param dateStr 日期字符串,例如:"12月2日13:14"
|
||
* @return LocalDateTime对象
|
||
*/
|
||
private static LocalDateTime parseDateTimeString(String dateStr) {
|
||
if (dateStr == null || dateStr.trim().isEmpty()) {
|
||
return null;
|
||
}
|
||
|
||
try {
|
||
Matcher matcher = DATE_PATTERN.matcher(dateStr.trim());
|
||
if (matcher.find()) {
|
||
int month = Integer.parseInt(matcher.group(1));
|
||
int day = Integer.parseInt(matcher.group(2));
|
||
int hour = Integer.parseInt(matcher.group(3));
|
||
int minute = Integer.parseInt(matcher.group(4));
|
||
|
||
// 使用当前年份
|
||
int year = java.time.Year.now().getValue();
|
||
return LocalDateTime.of(year, month, day, hour, minute);
|
||
}
|
||
} catch (Exception e) {
|
||
e.printStackTrace();
|
||
}
|
||
return null;
|
||
}
|
||
|
||
/**
|
||
* 解析OCR识别结果,提取金额、商户名称、日期等信息
|
||
*/
|
||
public static List<ParseResult> parse(String ocrResultJson) {
|
||
try {
|
||
JSONObject jsonObject = JSON.parseObject(ocrResultJson);
|
||
System.out.println();
|
||
String data = jsonObject.getString("data");
|
||
if (data == null) {
|
||
|
||
return List.of(new ParseResult(null, null, null, BigDecimal.ZERO));
|
||
}
|
||
|
||
JSONObject dataObject = JSON.parseObject(data);
|
||
String content = dataObject.getString("content");
|
||
String[] split = content.split(" ");
|
||
|
||
ArrayList<Map<String,String>> signList = new ArrayList<>();
|
||
|
||
for (int i = 1; i < split.length; i++) {
|
||
String currentLine = split[i].trim();
|
||
String previousLine = split[i-1].trim();
|
||
|
||
// 安全地获取 i+1, i+2, i+3 位置的值
|
||
String iPlusOne = null;
|
||
String iPlusTwo = null;
|
||
String iPlusThree = null;
|
||
|
||
if (i + 1 < split.length) {
|
||
iPlusOne = split[i + 1].trim();
|
||
}
|
||
if (i + 2 < split.length) {
|
||
iPlusTwo = split[i + 2].trim();
|
||
}
|
||
if (i + 3 < split.length) {
|
||
iPlusThree = split[i + 3].trim();
|
||
}
|
||
|
||
// 检查当前行是否以+或-开头
|
||
if (currentLine.startsWith("+") || currentLine.startsWith("-")) {
|
||
Map<String, String> signMap = new HashMap<>();
|
||
signMap.put("money", currentLine);
|
||
signMap.put("content", previousLine);
|
||
|
||
// 检查 i+1, i+2, i+3 哪个是日期格式
|
||
String dateValue = null;
|
||
if (i + 1 < split.length && isValidDate(iPlusOne)) {
|
||
dateValue = iPlusOne;
|
||
} else if (i + 2 < split.length && isValidDate(iPlusTwo)) {
|
||
dateValue = iPlusTwo;
|
||
} else if (i + 3 < split.length && isValidDate(iPlusThree)) {
|
||
dateValue = iPlusThree;
|
||
}
|
||
|
||
if (dateValue != null) {
|
||
signMap.put("date", dateValue);
|
||
}
|
||
signList.add(signMap);
|
||
}
|
||
}
|
||
|
||
//识别完成,对识别结果进行处理
|
||
|
||
ArrayList<ParseResult> parseList = new ArrayList<>();
|
||
signList.forEach(signMap -> {
|
||
ParseResult result = new ParseResult();
|
||
if (signMap.containsKey("money")){
|
||
result.setAmount(parseMoneyString(signMap.get("money")));
|
||
}
|
||
if (signMap.containsKey("content")){
|
||
result.setMerchant(parseMerchant(signMap.get("content")));
|
||
}
|
||
if (signMap.containsKey("date")){
|
||
result.setDate(parseDateTimeString(signMap.get("date")));
|
||
}
|
||
|
||
parseList.add(result);
|
||
});
|
||
|
||
return parseList;
|
||
} catch (Exception e) {
|
||
throw new RuntimeException("解析过程中出错,请重试或联系管理员,报错信息:"+e);
|
||
}
|
||
}
|
||
|
||
/**
|
||
* 从OCR结果中提取文本内容
|
||
*/
|
||
private static String extractContent(JSONObject data) {
|
||
StringBuilder content = new StringBuilder();
|
||
|
||
// 尝试获取prism_wordsInfo字段(通用文字识别)
|
||
JSONArray wordsInfo = data.getJSONArray("prism_wordsInfo");
|
||
if (wordsInfo != null) {
|
||
for (int i = 0; i < wordsInfo.size(); i++) {
|
||
JSONObject word = wordsInfo.getJSONObject(i);
|
||
String wordStr = word.getString("word");
|
||
if (wordStr != null) {
|
||
content.append(wordStr).append(" ");
|
||
}
|
||
}
|
||
}
|
||
|
||
// 如果没有prism_wordsInfo,尝试获取content字段
|
||
if (content.length() == 0) {
|
||
String contentStr = data.getString("content");
|
||
if (contentStr != null) {
|
||
content.append(contentStr);
|
||
}
|
||
}
|
||
|
||
return content.toString().trim();
|
||
}
|
||
|
||
/**
|
||
* 解析金额
|
||
*/
|
||
private static BigDecimal parseAmount(String content) {
|
||
Matcher matcher = AMOUNT_PATTERN.matcher(content);
|
||
|
||
// 查找所有匹配的金额,取最大的(通常是实际支付金额)
|
||
BigDecimal maxAmount = null;
|
||
while (matcher.find()) {
|
||
String amountStr = matcher.group(1);
|
||
try {
|
||
BigDecimal amount = new BigDecimal(amountStr);
|
||
if (maxAmount == null || amount.compareTo(maxAmount) > 0) {
|
||
maxAmount = amount;
|
||
}
|
||
} catch (NumberFormatException e) {
|
||
// 忽略解析失败的金额
|
||
}
|
||
}
|
||
|
||
return maxAmount;
|
||
}
|
||
|
||
/**
|
||
* 解析日期
|
||
*/
|
||
private static LocalDate parseDate(String content) {
|
||
Matcher matcher = DATE_PATTERN.matcher(content);
|
||
if (matcher.find()) {
|
||
try {
|
||
int year = Integer.parseInt(matcher.group(1));
|
||
int month = Integer.parseInt(matcher.group(2));
|
||
int day = Integer.parseInt(matcher.group(3));
|
||
return LocalDate.of(year, month, day);
|
||
} catch (Exception e) {
|
||
// 解析失败,返回null
|
||
}
|
||
}
|
||
return null;
|
||
}
|
||
|
||
/**
|
||
* 解析商户名称
|
||
*/
|
||
private static String parseMerchant(String content) {
|
||
// 查找包含商户关键词的行
|
||
String[] lines = content.split("\n");
|
||
for (String line : lines) {
|
||
for (String keyword : MERCHANT_KEYWORDS) {
|
||
if (line.contains(keyword)) {
|
||
// 提取商户名称(去除关键词本身)
|
||
String merchant = line.replace(keyword, "").trim();
|
||
if (!merchant.isEmpty() && merchant.length() < 50) {
|
||
return merchant;
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
// 如果没有找到,返回第一行非金额非日期的文本
|
||
for (String line : lines) {
|
||
line = line.trim();
|
||
if (!line.isEmpty() && !AMOUNT_PATTERN.matcher(line).find() &&
|
||
!DATE_PATTERN.matcher(line).find() && line.length() < 50) {
|
||
return line;
|
||
}
|
||
}
|
||
|
||
return null;
|
||
}
|
||
}
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|