diff --git a/src/main/java/com/accounting/util/OcrAmountParser.java b/src/main/java/com/accounting/util/OcrAmountParser.java index ef39807..3e561cb 100644 --- a/src/main/java/com/accounting/util/OcrAmountParser.java +++ b/src/main/java/com/accounting/util/OcrAmountParser.java @@ -9,10 +9,7 @@ import java.time.LocalDate; import java.time.LocalDateTime; import java.time.format.DateTimeFormatter; import java.time.format.DateTimeParseException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; +import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -23,8 +20,10 @@ public class OcrAmountParser { "[¥¥]?\\s*(\\d{1,10}(\\.\\d{1,2})?)\\s*[元]?" ); - // 日期正则表达式:匹配字符串中是否含有月或日 - private static final Pattern UNION_DATE_PATTERN = Pattern.compile(".*月.*日.*"); + + + // 日期正则表达式:匹配字符串中是否含有月、日或天 + private static final Pattern UNION_DATE_PATTERN = Pattern.compile(".*(今天|昨天).*"); // 日期正则表达式:匹配 12月2日13:14 这样的格式 @@ -32,6 +31,11 @@ public class OcrAmountParser { "(\\d{1,2})月(\\d{1,2})日\\s*(\\d{1,2})[::](\\d{1,2})" ); + // 日期正则表达式:匹配 12-11 13:14 这样的格式 + private static final Pattern DATE_PATTERN_DASH = Pattern.compile( + "(\\d{1,2})-(\\d{1,2})\\s*(\\d{1,2})[::](\\d{1,2})" + ); + // 商户名称关键词(常见支付平台) private static final String[] MERCHANT_KEYWORDS = { "微信支付", "支付宝", "收款", "付款", "商户", "商家", "店铺", "超市", "餐厅", "饭店" @@ -83,7 +87,19 @@ public class OcrAmountParser { if (dateStr == null || dateStr.trim().isEmpty()) { return false; } - return UNION_DATE_PATTERN.matcher(dateStr.trim()).matches(); + // 匹配今天、昨天 + if (UNION_DATE_PATTERN.matcher(dateStr.trim()).matches()){ + return true; + } + // 匹配12-11 13:14 + if (DATE_PATTERN_DASH.matcher(dateStr.trim()).matches()){ + return true; + } + // 匹配12月11日 13:14 + if (DATE_PATTERN.matcher(dateStr.trim()).matches()){ + return true; + } + return false; } /** @@ -123,34 +139,90 @@ public class OcrAmountParser { /** * 解析日期字符串为LocalDateTime对象 - * @param dateStr 日期字符串,例如:"12月2日13:14" + * @param dateStr 解析3中日期字符串,例如:"12月2日13:14" "12-11 13:14" "今天 13:14" "昨天 13:14" * @return LocalDateTime对象 */ private static LocalDateTime parseDateTimeString(String dateStr) { if (dateStr == null || dateStr.trim().isEmpty()) { - return null; + return LocalDateTime.now(); } - - try { - Matcher matcher = DATE_PATTERN.matcher(dateStr.trim()); - if (matcher.find()) { - int month = Integer.parseInt(matcher.group(1)); - int day = Integer.parseInt(matcher.group(2)); - int hour = Integer.parseInt(matcher.group(3)); - int minute = Integer.parseInt(matcher.group(4)); - - // 使用当前年份 - int year = java.time.Year.now().getValue(); - return LocalDateTime.of(year, month, day, hour, minute); + + //先处理今天、昨天 + if (UNION_DATE_PATTERN.matcher(dateStr.trim()).matches()){ + String trimmedDateStr = dateStr.trim(); + LocalDateTime baseDate; + + if (trimmedDateStr.startsWith("今天")) { + baseDate = LocalDateTime.now(); + } else if (trimmedDateStr.startsWith("昨天")) { + baseDate = LocalDateTime.now().minusDays(1); + } else { + return null; } - } catch (Exception e) { - e.printStackTrace(); + + // 提取时间部分 + String timePart = trimmedDateStr.substring(2).trim(); // 去掉"今天"/"昨天" + if (!timePart.isEmpty()) { + String[] timeParts = timePart.split("[::]"); + if (timeParts.length >= 2) { + try { + int hour = Integer.parseInt(timeParts[0]); + int minute = Integer.parseInt(timeParts[1]); + return baseDate.withHour(hour).withMinute(minute).withSecond(0).withNano(0); + } catch (NumberFormatException e) { + // 时间解析失败,返回基础日期 + return baseDate; + } + } + } + return baseDate; } + + //处理 12月2日13:14 + if (DATE_PATTERN.matcher(dateStr.trim()).matches()){ + try { + Matcher matcher = DATE_PATTERN.matcher(dateStr.trim()); + if (matcher.find()) { + int month = Integer.parseInt(matcher.group(1)); + int day = Integer.parseInt(matcher.group(2)); + int hour = Integer.parseInt(matcher.group(3)); + int minute = Integer.parseInt(matcher.group(4)); + + // 使用当前年份 + int year = java.time.Year.now().getValue(); + return LocalDateTime.of(year, month, day, hour, minute); + } + } catch (Exception e) { + e.printStackTrace(); + } + } + + if (DATE_PATTERN_DASH.matcher(dateStr.trim()).matches()){ + try { + Matcher matcher = DATE_PATTERN_DASH.matcher(dateStr.trim()); + if (matcher.find()) { + int month = Integer.parseInt(matcher.group(1)); + int day = Integer.parseInt(matcher.group(2)); + int hour = Integer.parseInt(matcher.group(3)); + int minute = Integer.parseInt(matcher.group(4)); + + // 使用当前年份 + int year = java.time.Year.now().getValue(); + return LocalDateTime.of(year, month, day, hour, minute); + } + } catch (Exception e) { + e.printStackTrace(); + } + } + return null; } /** * 解析OCR识别结果,提取金额、商户名称、日期等信息 + * 重构OCR识别结果的处理逻辑,大概思想是先识别时间 时间可能有多种格式 微信的格式为 12-12 13:14 支付宝的格式为 今天 13:14 昨天 13:14 + * 其他平台的格式为 12月2日13:14 等等 识别出日期以后 日期之前的3-4个值就可能是此笔支付的其他数据 一般的格式为 [商户/描述] [金额] 可能存在的[分类] [时间] + * 所以在识别出日期后 查看前3-4个值中是否有类似金额的值 */ public static List parse(String ocrResultJson) { try { @@ -164,6 +236,7 @@ public class OcrAmountParser { JSONObject dataObject = JSON.parseObject(data); String content = dataObject.getString("content"); +// String content = "下午2:01 0.3K/s必 5G ra HD ID 4G 10 C 49 D < Q搜索交易记录 搜索 全部 支出 转账 退款 订单筛选 ¥198 ¥3,092.83 ¥0.00 收支分析 设置支出预算> C 五华区皓月千里便利店 -4.20 日用百货 今天 13:30 扫收钱码付款-给快乐 -11.00 餐饮美食 今天 12:22 余额宝-收益发放 0.19 投资理财 今天 04:47 2000406014951497 -20.00 餐饮美食 昨天 22:14 扫收钱码付款-给扫码点单店主 -5.00 公共服务 昨天 21:44 蜜雪冰城920749店 -2.18 餐饮美食 TA "; String[] split = content.split(" "); ArrayList> signList = new ArrayList<>(); @@ -177,36 +250,59 @@ public class OcrAmountParser { String iPlusTwo = null; String iPlusThree = null; - if (i + 1 < split.length) { - iPlusOne = split[i + 1].trim(); + iPlusOne = split[i - 1].trim(); + if (i > 1) { + iPlusTwo = split[i - 2].trim(); } - if (i + 2 < split.length) { - iPlusTwo = split[i + 2].trim(); - } - if (i + 3 < split.length) { - iPlusThree = split[i + 3].trim(); + if (i > 2) { + iPlusThree = split[i - 3].trim(); } - // 检查当前行是否以+或-开头 - if (currentLine.startsWith("+") || currentLine.startsWith("-")) { + // 检查当前行是否符合日期格式 + if (isValidDate(currentLine)) { Map signMap = new HashMap<>(); - signMap.put("money", currentLine); - signMap.put("content", previousLine); - // 检查 i+1, i+2, i+3 哪个是日期格式 - String dateValue = null; - if (i + 1 < split.length && isValidDate(iPlusOne)) { - dateValue = iPlusOne; - } else if (i + 2 < split.length && isValidDate(iPlusTwo)) { - dateValue = iPlusTwo; - } else if (i + 3 < split.length && isValidDate(iPlusThree)) { - dateValue = iPlusThree; + //判断前三个值是否包含了金额 + + //plusOne是金额 + if (iPlusOne != null && ((iPlusOne.startsWith("+") || iPlusOne.startsWith("-")) + || AMOUNT_PATTERN.matcher(Objects.requireNonNull(iPlusOne).trim()).matches())) { + if (UNION_DATE_PATTERN.matcher(currentLine).matches()){ + signMap.put("data", currentLine + split[i+1].trim()); + }else { + signMap.put("data", currentLine); + } + signMap.put("money", iPlusOne); + signMap.put("content", iPlusTwo); + signList.add(signMap); } - if (dateValue != null) { - signMap.put("date", dateValue); + //plusTwo是金额 + if (iPlusTwo != null && ((iPlusTwo.startsWith("+") || iPlusTwo.startsWith("-")) + || AMOUNT_PATTERN.matcher(Objects.requireNonNull(iPlusTwo).trim()).matches())) { + if (UNION_DATE_PATTERN.matcher(currentLine).matches()){ + signMap.put("data", currentLine + split[i+1].trim()); + }else { + signMap.put("data", currentLine); + } + signMap.put("money", iPlusTwo); + signMap.put("content", iPlusThree); + signList.add(signMap); + + } + + //plusThree是金额 + if (iPlusThree != null && ((iPlusThree.startsWith("+") || iPlusThree.startsWith("-")) + || AMOUNT_PATTERN.matcher(Objects.requireNonNull(iPlusThree).trim()).matches())) { + if (UNION_DATE_PATTERN.matcher(currentLine).matches()){ + signMap.put("data", currentLine + split[i+1].trim()); + }else { + signMap.put("data", currentLine); + } + signMap.put("money", iPlusThree); + signMap.put("content", split[i - 4].trim()); + signList.add(signMap); } - signList.add(signMap); } } @@ -219,10 +315,10 @@ public class OcrAmountParser { result.setAmount(parseMoneyString(signMap.get("money"))); } if (signMap.containsKey("content")){ - result.setMerchant(parseMerchant(signMap.get("content"))); + result.setMerchant(signMap.get("content")); } - if (signMap.containsKey("date")){ - result.setDate(parseDateTimeString(signMap.get("date"))); + if (signMap.containsKey("data")){ + result.setDate(parseDateTimeString(signMap.get("data"))); } parseList.add(result);