养乐多滋滋滋 2025-10-23 09:23 采纳率: 76%
浏览 2
已结题

java识别发票保存到表格

今天写个代码,用java扫描发票,抓取数据,保存到表格里,
问题:
1:为什么销售方的名称抓取的也是,购买方的名称效果图如下:

img


以下是核心代码:
ExcelExporter

package com.jl.liandian.FaPiao;

import org.apache.poi.ss.usermodel.*;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.List;
import java.util.Map;

/**
 * @Author Jin
 * @Description
 * @Date 2025/10/23 8:05
 **/
public class ExcelExporter {

    public static void exportToExcel(List<Map<String, String>> invoiceList, String excelPath) throws IOException {
        File excelFile = new File(excelPath);
        if (!excelFile.getParentFile().exists()) {
            excelFile.getParentFile().mkdirs();
        }

        try (Workbook workbook = new XSSFWorkbook()) {
            Sheet sheet = workbook.createSheet("发票信息汇总");
            sheet.setDefaultColumnWidth(25);

            // 表头
            String[] headers = {
                    "文件名", "发票号码", "开票日期", "价税合计",
                    "购买方名称", "购买方税号", "销售方名称", "销售方税号"
            };
            Row headerRow = sheet.createRow(0);
            CellStyle headerStyle = createHeaderStyle(workbook);
            for (int i = 0; i < headers.length; i++) {
                Cell cell = headerRow.createCell(i);
                cell.setCellValue(headers[i]);
                cell.setCellStyle(headerStyle);
            }

            // 数据行
            CellStyle dataStyle = workbook.createCellStyle();
            dataStyle.setWrapText(true);
            for (int i = 0; i < invoiceList.size(); i++) {
                Map<String, String> invoice = invoiceList.get(i);
                Row row = sheet.createRow(i + 1);
                row.createCell(0).setCellValue(invoice.get("文件名"));
                row.createCell(1).setCellValue(invoice.get("发票号码"));
                row.createCell(2).setCellValue(invoice.get("开票日期"));
                row.createCell(3).setCellValue(invoice.get("价税合计"));
                row.createCell(4).setCellValue(invoice.get("购买方名称"));
                row.createCell(5).setCellValue(invoice.get("购买方税号"));
                row.createCell(6).setCellValue(invoice.get("销售方名称"));
                row.createCell(7).setCellValue(invoice.get("销售方税号"));
            }

            // 写入文件
            try (FileOutputStream fos = new FileOutputStream(excelPath)) {
                workbook.write(fos);
            }
        }
    }

    private static CellStyle createHeaderStyle(Workbook workbook) {
        CellStyle style = workbook.createCellStyle();
        Font font = workbook.createFont();
        font.setBold(true);
        style.setFont(font);
        style.setFillForegroundColor(IndexedColors.GREY_25_PERCENT.getIndex());
        style.setFillPattern(FillPatternType.SOLID_FOREGROUND);
        return style;
    }
}

InvoiceParser

package com.jl.liandian.FaPiao;

import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * 发票常用字段
 * @Author Jin
 * @Description
 * @Date 2025/10/23 8:04
 **/
public class InvoiceParser {

    public static Map<String, String> parseInvoice(String text, String fileName) {
        Map<String, String> invoiceInfo = new HashMap<>();
        invoiceInfo.put("文件名", fileName);
        String[] fields = {"发票号码", "开票日期", "价税合计", "购买方名称", "购买方税号", "销售方名称", "销售方税号"};
        for (String field : fields) {
            invoiceInfo.put(field, "未识别");
        }

        // 1. 发票号码(强制匹配8-20位数字)
        Matcher invNoMatcher = Pattern.compile("(\\d{8,20})", Pattern.CASE_INSENSITIVE).matcher(text);
        if (invNoMatcher.find()) {
            invoiceInfo.put("发票号码", invNoMatcher.group(1));
        }

        // 2. 开票日期(强制匹配日期格式)
        Matcher dateMatcher = Pattern.compile("(\\d{4}[-/年]\\d{1,2}[-/月]\\d{1,2})", Pattern.CASE_INSENSITIVE).matcher(text);
        if (dateMatcher.find()) {
            String date = dateMatcher.group(1)
                    .replaceAll("[年月]", "-")
                    .replace("/", "-")
                    .trim();
            invoiceInfo.put("开票日期", date);
        }

        // 3. 价税合计(强制匹配金额格式)
        Matcher amountMatcher = Pattern.compile("(\\d+\\.\\d{2})", Pattern.CASE_INSENSITIVE).matcher(text);
        if (amountMatcher.find()) {
            invoiceInfo.put("价税合计", amountMatcher.group(1) + "元");
        }

        // 4. 购买方名称(强制匹配中文企业名)
        Matcher buyerNameMatcher = Pattern.compile("([\\u4e00-\\u9fa5]+[有限公司|公司|企业|个人]+)", Pattern.CASE_INSENSITIVE).matcher(text);
        if (buyerNameMatcher.find()) {
            invoiceInfo.put("购买方名称", buyerNameMatcher.group(1));
        }

        // 5. 销售方名称(强化专属关键词 + 严格排除购买方)
        List<String> sellerExclusiveKeywords = Arrays.asList(
                "销售方", "卖方", "商家", "销方", "收款方", "提供方",
                "出票方", "航空公司", "旅行社" // 新增行业专属关键词
        );
        for (String keyword : sellerExclusiveKeywords) {
            Matcher sellerNameMatcher = Pattern.compile(
                    keyword + "[::]?\\s*([^\\d\\n]+?)(?=\\s*税号|\\s*识别号|\\s*$)",
                    Pattern.CASE_INSENSITIVE | Pattern.DOTALL
            ).matcher(text);
            if (sellerNameMatcher.find()) {
                String sellerName = cleanText(sellerNameMatcher.group(1));
                // 严格排除购买方名称(包含、完全相同均排除)
                if (!sellerName.contains(invoiceInfo.get("购买方名称")) &&
                        !invoiceInfo.get("购买方名称").contains(sellerName)) {
                    invoiceInfo.put("销售方名称", sellerName);
                    break;
                }
            }
        }
        // 补充:若仍未识别,强制匹配与购买方不同的企业名
        if ("未识别".equals(invoiceInfo.get("销售方名称")) && !"未识别".equals(invoiceInfo.get("购买方名称"))) {
            Matcher fallbackSeller = Pattern.compile(
                    "([\\u4e00-\\u9fa5]+[有限公司|公司|企业|商家|航空|旅行社]+)(?!" + invoiceInfo.get("购买方名称") + ")",
                    Pattern.CASE_INSENSITIVE
            ).matcher(text);
            if (fallbackSeller.find()) {
                invoiceInfo.put("销售方名称", fallbackSeller.group(1));
            }
        }

        // 6. 购买方税号、销售方税号(强制匹配18位税号)
        List<String> taxNos = new ArrayList<>();
        Matcher taxMatcher = Pattern.compile("([\\dA-Z]{18})", Pattern.CASE_INSENSITIVE).matcher(text);
        while (taxMatcher.find()) {
            taxNos.add(taxMatcher.group(1));
        }
        if (taxNos.size() >= 1) invoiceInfo.put("购买方税号", taxNos.get(0));
        if (taxNos.size() >= 2) invoiceInfo.put("销售方税号", taxNos.get(1));

        return invoiceInfo;
    }

    private static String cleanText(String text) {
        return text.replaceAll("[^\\u4e00-\\u9fa5a-zA-Z0-9]", "").trim();
    }
}

PdfOcrUtil

package com.jl.liandian.FaPiao;

import net.sourceforge.tess4j.ITesseract;
import net.sourceforge.tess4j.Tesseract;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.apache.pdfbox.text.PDFTextStripper;

import java.awt.*;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;

/**
 * @Author Jin
 * @Description
 * @Date 2025/10/23 8:08
 **/
public class PdfOcrUtil {

    /**
     * 提取可复制PDF文本
     */
    public static String extractTextFromEditablePdf(File pdfFile) throws IOException {
        try (PDDocument document = PDDocument.load(pdfFile)) {
            PDFTextStripper stripper = new PDFTextStripper();
            stripper.setSortByPosition(true);
            return stripper.getText(document).replaceAll("\\s+", " ").trim();
        }
    }

    /**
     * 扫描件PDF识别(极端增强预处理)
     */
    public static String extractTextFromScannedPdf(File pdfFile, String tessDataPath) throws Exception {
        try (PDDocument document = PDDocument.load(pdfFile)) {
            PDFRenderer renderer = new PDFRenderer(document);
            StringBuilder text = new StringBuilder();
            ITesseract tesseract = new Tesseract();
            tesseract.setDatapath(tessDataPath);
            tesseract.setLanguage("chi_sim+eng");
            tesseract.setOcrEngineMode(1);

            for (int i = 0; i < document.getNumberOfPages(); i++) {
                // 400DPI高清渲染
                BufferedImage image = renderer.renderImageWithDPI(i, 400);
                // 极端增强预处理(对比度+二值化)
                BufferedImage processedImage = preprocessImage(image);
                text.append(tesseract.doOCR(processedImage)).append(" ");
            }
            return text.toString().replaceAll("\\s+", " ").trim();
        }
    }

    /**
     * 极端增强图片预处理(解决极低清晰度问题)
     */
    private static BufferedImage preprocessImage(BufferedImage image) {
        // 转为灰度图
        BufferedImage grayImage = new BufferedImage(
                image.getWidth(), image.getHeight(), BufferedImage.TYPE_BYTE_GRAY
        );
        Graphics g = grayImage.getGraphics();
        g.drawImage(image, 0, 0, null);
        g.dispose();

        // 超增强对比度(暗部更暗,亮部更亮)
        int contrast = 60;
        BufferedImage contrastImage = new BufferedImage(
                grayImage.getWidth(), grayImage.getHeight(), BufferedImage.TYPE_BYTE_GRAY
        );
        for (int y = 0; y < grayImage.getHeight(); y++) {
            for (int x = 0; x < grayImage.getWidth(); x++) {
                int pixel = grayImage.getRGB(x, y) & 0xFF;
                int newPixel = pixel + contrast;
                newPixel = Math.min(255, Math.max(0, newPixel));
                contrastImage.setRGB(x, y, (newPixel << 16) | (newPixel << 8) | newPixel);
            }
        }

        // 低阈值二值化(确保文字不被过滤)
        BufferedImage binaryImage = new BufferedImage(
                contrastImage.getWidth(), contrastImage.getHeight(), BufferedImage.TYPE_BYTE_BINARY
        );
        for (int y = 0; y < contrastImage.getHeight(); y++) {
            for (int x = 0; x < contrastImage.getWidth(); x++) {
                int pixel = contrastImage.getRGB(x, y) & 0xFF;
                binaryImage.setRGB(x, y, pixel > 80 ? 0xFFFFFF : 0x000000); // 阈值降至80
            }
        }
        return binaryImage;
    }

    /**
     * 判断是否为扫描件
     */
    public static boolean isScannedPdf(File pdfFile) {
        try {
            String text = extractTextFromEditablePdf(pdfFile);
            return text.length() < 100;
        } catch (Exception e) {
            return true;
        }
    }
}


LiandianApplication

package com.jl.liandian;

import com.jl.liandian.FaPiao.ExcelExporter;
import com.jl.liandian.FaPiao.InvoiceParser;
import com.jl.liandian.FaPiao.PdfOcrUtil;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;

import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

@SpringBootApplication
public class LiandianApplication {
    // 配置参数(请根据实际情况修改)
    private static final String PDF_DIR = "E:\\在线赛思\\报销\\发票\\scanned"; // 发票PDF所在目录
    private static final String EXCEL_OUTPUT_PATH = "E:\\在线赛思\\报销\\发票\\scanned\\发票信息汇总.xlsx"; // 导出Excel路径
    private static final String TESS_DATA_PATH = "D:/tessdata"; // Tesseract语言包路径

    public static void main(String[] args) {
        System.out.println("===== 终极发票识别工具 =====");
        System.out.println("扫描目录:" + PDF_DIR);
        System.out.println("输出路径:" + EXCEL_OUTPUT_PATH);

        try {
            // 收集所有PDF发票
            List<File> pdfFiles = new ArrayList<>();
            traversePdfFiles(new File(PDF_DIR), pdfFiles);
            System.out.println("发现PDF文件:" + pdfFiles.size() + " 个");
            if (pdfFiles.isEmpty()) {
                System.out.println("未找到PDF文件,程序退出");
                return;
            }

            // 批量识别
            List<Map<String, String>> invoiceList = new ArrayList<>();
            for (int i = 0; i < pdfFiles.size(); i++) {
                File pdfFile = pdfFiles.get(i);
                System.out.printf("处理中 (%d/%d):%s%n", i + 1, pdfFiles.size(), pdfFile.getName());

                try {
                    // 提取文本
                    String pdfText = PdfOcrUtil.isScannedPdf(pdfFile)
                            ? PdfOcrUtil.extractTextFromScannedPdf(pdfFile, TESS_DATA_PATH)
                            : PdfOcrUtil.extractTextFromEditablePdf(pdfFile);

                    // 解析发票
                    Map<String, String> info = InvoiceParser.parseInvoice(pdfText, pdfFile.getName());
                    invoiceList.add(info);
                } catch (Exception e) {
                    System.err.println("处理失败 " + pdfFile.getName() + ":" + e.getMessage());
                }
            }

            // 导出到Excel
            ExcelExporter.exportToExcel(invoiceList, EXCEL_OUTPUT_PATH);
            System.out.println("===== 处理完成!结果已保存至:" + EXCEL_OUTPUT_PATH + " =====");

        } catch (Exception e) {
            System.err.println("程序异常:" + e.getMessage());
            e.printStackTrace();
        }
    }

    // 递归遍历所有PDF
    private static void traversePdfFiles(File dir, List<File> pdfFiles) {
        if (dir == null || !dir.isDirectory()) return;
        File[] files = dir.listFiles();
        if (files == null) return;
        for (File file : files) {
            if (file.isDirectory()) {
                traversePdfFiles(file, pdfFiles);
            } else if (file.getName().toLowerCase().endsWith(".pdf")) {
                pdfFiles.add(file);
            }
        }
    }
}


这是控制台运行后的截图:

img

  • 写回答

0条回答 默认 最新

    报告相同问题?

    问题事件

    • 已结题 (查看结题原因) 10月23日
    • 创建了问题 10月23日