fix(ocr识别功能修复)
1、修复了ocr识别的问题,采用以时间为主的识别方式
This commit is contained in:
parent
2ca71b6982
commit
910fec8e65
@ -9,10 +9,7 @@ import java.time.LocalDate;
|
|||||||
import java.time.LocalDateTime;
|
import java.time.LocalDateTime;
|
||||||
import java.time.format.DateTimeFormatter;
|
import java.time.format.DateTimeFormatter;
|
||||||
import java.time.format.DateTimeParseException;
|
import java.time.format.DateTimeParseException;
|
||||||
import java.util.ArrayList;
|
import java.util.*;
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Map;
|
|
||||||
import java.util.regex.Matcher;
|
import java.util.regex.Matcher;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
@ -23,8 +20,10 @@ public class OcrAmountParser {
|
|||||||
"[¥¥]?\\s*(\\d{1,10}(\\.\\d{1,2})?)\\s*[元]?"
|
"[¥¥]?\\s*(\\d{1,10}(\\.\\d{1,2})?)\\s*[元]?"
|
||||||
);
|
);
|
||||||
|
|
||||||
// 日期正则表达式:匹配字符串中是否含有月或日
|
|
||||||
private static final Pattern UNION_DATE_PATTERN = Pattern.compile(".*月.*日.*");
|
|
||||||
|
// 日期正则表达式:匹配字符串中是否含有"今天"或"昨天"
|
||||||
|
private static final Pattern UNION_DATE_PATTERN = Pattern.compile(".*(今天|昨天).*");
|
||||||
|
|
||||||
|
|
||||||
// 日期正则表达式:匹配 12月2日13:14 这样的格式
|
// 日期正则表达式:匹配 12月2日13:14 这样的格式
|
||||||
@ -32,6 +31,11 @@ public class OcrAmountParser {
|
|||||||
"(\\d{1,2})月(\\d{1,2})日\\s*(\\d{1,2})[::](\\d{1,2})"
|
"(\\d{1,2})月(\\d{1,2})日\\s*(\\d{1,2})[::](\\d{1,2})"
|
||||||
);
|
);
|
||||||
|
|
||||||
|
// 日期正则表达式:匹配 12-11 13:14 这样的格式
|
||||||
|
private static final Pattern DATE_PATTERN_DASH = Pattern.compile(
|
||||||
|
"(\\d{1,2})-(\\d{1,2})\\s*(\\d{1,2})[::](\\d{1,2})"
|
||||||
|
);
|
||||||
|
|
||||||
// 商户名称关键词(常见支付平台)
|
// 商户名称关键词(常见支付平台)
|
||||||
private static final String[] MERCHANT_KEYWORDS = {
|
private static final String[] MERCHANT_KEYWORDS = {
|
||||||
"微信支付", "支付宝", "收款", "付款", "商户", "商家", "店铺", "超市", "餐厅", "饭店"
|
"微信支付", "支付宝", "收款", "付款", "商户", "商家", "店铺", "超市", "餐厅", "饭店"
|
||||||
@ -83,7 +87,19 @@ public class OcrAmountParser {
|
|||||||
if (dateStr == null || dateStr.trim().isEmpty()) {
|
if (dateStr == null || dateStr.trim().isEmpty()) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
return UNION_DATE_PATTERN.matcher(dateStr.trim()).matches();
|
// 匹配今天、昨天
|
||||||
|
if (UNION_DATE_PATTERN.matcher(dateStr.trim()).matches()){
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
// 匹配12-11 13:14
|
||||||
|
if (DATE_PATTERN_DASH.matcher(dateStr.trim()).matches()){
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
// 匹配12月11日 13:14
|
||||||
|
if (DATE_PATTERN.matcher(dateStr.trim()).matches()){
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -123,34 +139,90 @@ public class OcrAmountParser {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* 解析日期字符串为LocalDateTime对象
|
* 解析日期字符串为LocalDateTime对象
|
||||||
* @param dateStr 日期字符串,例如:"12月2日13:14"
|
* @param dateStr 日期字符串,支持3种格式,例如:"12月2日13:14" "12-11 13:14" "今天 13:14" "昨天 13:14"
|
||||||
* @return LocalDateTime对象
|
* @return LocalDateTime对象
|
||||||
*/
|
*/
|
||||||
private static LocalDateTime parseDateTimeString(String dateStr) {
|
private static LocalDateTime parseDateTimeString(String dateStr) {
|
||||||
if (dateStr == null || dateStr.trim().isEmpty()) {
|
if (dateStr == null || dateStr.trim().isEmpty()) {
|
||||||
return null;
|
return LocalDateTime.now();
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
//先处理今天、昨天
|
||||||
Matcher matcher = DATE_PATTERN.matcher(dateStr.trim());
|
if (UNION_DATE_PATTERN.matcher(dateStr.trim()).matches()){
|
||||||
if (matcher.find()) {
|
String trimmedDateStr = dateStr.trim();
|
||||||
int month = Integer.parseInt(matcher.group(1));
|
LocalDateTime baseDate;
|
||||||
int day = Integer.parseInt(matcher.group(2));
|
|
||||||
int hour = Integer.parseInt(matcher.group(3));
|
if (trimmedDateStr.startsWith("今天")) {
|
||||||
int minute = Integer.parseInt(matcher.group(4));
|
baseDate = LocalDateTime.now();
|
||||||
|
} else if (trimmedDateStr.startsWith("昨天")) {
|
||||||
// 使用当前年份
|
baseDate = LocalDateTime.now().minusDays(1);
|
||||||
int year = java.time.Year.now().getValue();
|
} else {
|
||||||
return LocalDateTime.of(year, month, day, hour, minute);
|
return null;
|
||||||
}
|
}
|
||||||
} catch (Exception e) {
|
|
||||||
e.printStackTrace();
|
// 提取时间部分
|
||||||
|
String timePart = trimmedDateStr.substring(2).trim(); // 去掉"今天"/"昨天"
|
||||||
|
if (!timePart.isEmpty()) {
|
||||||
|
String[] timeParts = timePart.split("[::]");
|
||||||
|
if (timeParts.length >= 2) {
|
||||||
|
try {
|
||||||
|
int hour = Integer.parseInt(timeParts[0]);
|
||||||
|
int minute = Integer.parseInt(timeParts[1]);
|
||||||
|
return baseDate.withHour(hour).withMinute(minute).withSecond(0).withNano(0);
|
||||||
|
} catch (NumberFormatException e) {
|
||||||
|
// 时间解析失败,返回基础日期
|
||||||
|
return baseDate;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return baseDate;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//处理 12月2日13:14
|
||||||
|
if (DATE_PATTERN.matcher(dateStr.trim()).matches()){
|
||||||
|
try {
|
||||||
|
Matcher matcher = DATE_PATTERN.matcher(dateStr.trim());
|
||||||
|
if (matcher.find()) {
|
||||||
|
int month = Integer.parseInt(matcher.group(1));
|
||||||
|
int day = Integer.parseInt(matcher.group(2));
|
||||||
|
int hour = Integer.parseInt(matcher.group(3));
|
||||||
|
int minute = Integer.parseInt(matcher.group(4));
|
||||||
|
|
||||||
|
// 使用当前年份
|
||||||
|
int year = java.time.Year.now().getValue();
|
||||||
|
return LocalDateTime.of(year, month, day, hour, minute);
|
||||||
|
}
|
||||||
|
} catch (Exception e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (DATE_PATTERN_DASH.matcher(dateStr.trim()).matches()){
|
||||||
|
try {
|
||||||
|
Matcher matcher = DATE_PATTERN_DASH.matcher(dateStr.trim());
|
||||||
|
if (matcher.find()) {
|
||||||
|
int month = Integer.parseInt(matcher.group(1));
|
||||||
|
int day = Integer.parseInt(matcher.group(2));
|
||||||
|
int hour = Integer.parseInt(matcher.group(3));
|
||||||
|
int minute = Integer.parseInt(matcher.group(4));
|
||||||
|
|
||||||
|
// 使用当前年份
|
||||||
|
int year = java.time.Year.now().getValue();
|
||||||
|
return LocalDateTime.of(year, month, day, hour, minute);
|
||||||
|
}
|
||||||
|
} catch (Exception e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 解析OCR识别结果,提取金额、商户名称、日期等信息
|
* 解析OCR识别结果,提取金额、商户名称、日期等信息
|
||||||
|
* 重构OCR识别结果的处理逻辑:大致思路是先识别时间。时间可能有多种格式:微信的格式为 12-12 13:14;支付宝的格式为 今天 13:14、昨天 13:14;
|
||||||
|
* 其他平台的格式为 12月2日13:14 等。识别出日期以后,日期之前的3-4个值就可能是此笔支付的其他数据,一般的格式为 [商户/描述] [金额] [分类(可能存在)] [时间]。
|
||||||
|
* 所以在识别出日期后 查看前3-4个值中是否有类似金额的值
|
||||||
*/
|
*/
|
||||||
public static List<ParseResult> parse(String ocrResultJson) {
|
public static List<ParseResult> parse(String ocrResultJson) {
|
||||||
try {
|
try {
|
||||||
@ -164,6 +236,7 @@ public class OcrAmountParser {
|
|||||||
|
|
||||||
JSONObject dataObject = JSON.parseObject(data);
|
JSONObject dataObject = JSON.parseObject(data);
|
||||||
String content = dataObject.getString("content");
|
String content = dataObject.getString("content");
|
||||||
|
// String content = "下午2:01 0.3K/s必 5G ra HD ID 4G 10 C 49 D < Q搜索交易记录 搜索 全部 支出 转账 退款 订单筛选 ¥198 ¥3,092.83 ¥0.00 收支分析 设置支出预算> C 五华区皓月千里便利店 -4.20 日用百货 今天 13:30 扫收钱码付款-给快乐 -11.00 餐饮美食 今天 12:22 余额宝-收益发放 0.19 投资理财 今天 04:47 2000406014951497 -20.00 餐饮美食 昨天 22:14 扫收钱码付款-给扫码点单店主 -5.00 公共服务 昨天 21:44 蜜雪冰城920749店 -2.18 餐饮美食 TA ";
|
||||||
String[] split = content.split(" ");
|
String[] split = content.split(" ");
|
||||||
|
|
||||||
ArrayList<Map<String,String>> signList = new ArrayList<>();
|
ArrayList<Map<String,String>> signList = new ArrayList<>();
|
||||||
@ -177,36 +250,59 @@ public class OcrAmountParser {
|
|||||||
String iPlusTwo = null;
|
String iPlusTwo = null;
|
||||||
String iPlusThree = null;
|
String iPlusThree = null;
|
||||||
|
|
||||||
if (i + 1 < split.length) {
|
iPlusOne = split[i - 1].trim();
|
||||||
iPlusOne = split[i + 1].trim();
|
if (i > 1) {
|
||||||
|
iPlusTwo = split[i - 2].trim();
|
||||||
}
|
}
|
||||||
if (i + 2 < split.length) {
|
if (i > 2) {
|
||||||
iPlusTwo = split[i + 2].trim();
|
iPlusThree = split[i - 3].trim();
|
||||||
}
|
|
||||||
if (i + 3 < split.length) {
|
|
||||||
iPlusThree = split[i + 3].trim();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// 检查当前行是否以+或-开头
|
// 检查当前行是否符合日期格式
|
||||||
if (currentLine.startsWith("+") || currentLine.startsWith("-")) {
|
if (isValidDate(currentLine)) {
|
||||||
Map<String, String> signMap = new HashMap<>();
|
Map<String, String> signMap = new HashMap<>();
|
||||||
signMap.put("money", currentLine);
|
|
||||||
signMap.put("content", previousLine);
|
|
||||||
|
|
||||||
// 检查 i+1, i+2, i+3 哪个是日期格式
|
//判断前三个值是否包含了金额
|
||||||
String dateValue = null;
|
|
||||||
if (i + 1 < split.length && isValidDate(iPlusOne)) {
|
//plusOne是金额
|
||||||
dateValue = iPlusOne;
|
if (iPlusOne != null && ((iPlusOne.startsWith("+") || iPlusOne.startsWith("-"))
|
||||||
} else if (i + 2 < split.length && isValidDate(iPlusTwo)) {
|
|| AMOUNT_PATTERN.matcher(Objects.requireNonNull(iPlusOne).trim()).matches())) {
|
||||||
dateValue = iPlusTwo;
|
if (UNION_DATE_PATTERN.matcher(currentLine).matches()){
|
||||||
} else if (i + 3 < split.length && isValidDate(iPlusThree)) {
|
signMap.put("data", currentLine + split[i+1].trim());
|
||||||
dateValue = iPlusThree;
|
}else {
|
||||||
|
signMap.put("data", currentLine);
|
||||||
|
}
|
||||||
|
signMap.put("money", iPlusOne);
|
||||||
|
signMap.put("content", iPlusTwo);
|
||||||
|
signList.add(signMap);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (dateValue != null) {
|
//plusTwo是金额
|
||||||
signMap.put("date", dateValue);
|
if (iPlusTwo != null && ((iPlusTwo.startsWith("+") || iPlusTwo.startsWith("-"))
|
||||||
|
|| AMOUNT_PATTERN.matcher(Objects.requireNonNull(iPlusTwo).trim()).matches())) {
|
||||||
|
if (UNION_DATE_PATTERN.matcher(currentLine).matches()){
|
||||||
|
signMap.put("data", currentLine + split[i+1].trim());
|
||||||
|
}else {
|
||||||
|
signMap.put("data", currentLine);
|
||||||
|
}
|
||||||
|
signMap.put("money", iPlusTwo);
|
||||||
|
signMap.put("content", iPlusThree);
|
||||||
|
signList.add(signMap);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
//plusThree是金额
|
||||||
|
if (iPlusThree != null && ((iPlusThree.startsWith("+") || iPlusThree.startsWith("-"))
|
||||||
|
|| AMOUNT_PATTERN.matcher(Objects.requireNonNull(iPlusThree).trim()).matches())) {
|
||||||
|
if (UNION_DATE_PATTERN.matcher(currentLine).matches()){
|
||||||
|
signMap.put("data", currentLine + split[i+1].trim());
|
||||||
|
}else {
|
||||||
|
signMap.put("data", currentLine);
|
||||||
|
}
|
||||||
|
signMap.put("money", iPlusThree);
|
||||||
|
signMap.put("content", split[i - 4].trim());
|
||||||
|
signList.add(signMap);
|
||||||
}
|
}
|
||||||
signList.add(signMap);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -219,10 +315,10 @@ public class OcrAmountParser {
|
|||||||
result.setAmount(parseMoneyString(signMap.get("money")));
|
result.setAmount(parseMoneyString(signMap.get("money")));
|
||||||
}
|
}
|
||||||
if (signMap.containsKey("content")){
|
if (signMap.containsKey("content")){
|
||||||
result.setMerchant(parseMerchant(signMap.get("content")));
|
result.setMerchant(signMap.get("content"));
|
||||||
}
|
}
|
||||||
if (signMap.containsKey("date")){
|
if (signMap.containsKey("data")){
|
||||||
result.setDate(parseDateTimeString(signMap.get("date")));
|
result.setDate(parseDateTimeString(signMap.get("data")));
|
||||||
}
|
}
|
||||||
|
|
||||||
parseList.add(result);
|
parseList.add(result);
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user