java使用POI读取内存溢出 5C

问题:
我们使用POI导出xlsx格式的excel后,用Microsoft Excel打开并保存,发现文件变大了很多。进一步排查发现是sharedStrings.xml这个文件变大了很多:Microsoft Excel会将所有的共享字符串都存放在这个xml文件中。此后我们通过POI去读取这个由Microsoft Excel保存过的文件时,Java内存溢出了。是否有好的方法去读取这种大文件的excel?

5个回答

试试导出xls格式的呢,还是必须导出xlsx格式的

qq_36939564
qq_36939564 xls能存放的数量太少,数量在百万级别,用xlsx比较好
一年多之前 回复

/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */

import java.io.IOException;
import java.io.InputStream;
import java.io.PrintStream;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.apache.poi.hssf.usermodel.HSSFDateUtil;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackageAccess;
import org.apache.poi.ss.usermodel.BuiltinFormats;
import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.xssf.eventusermodel.ReadOnlySharedStringsTable;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.model.StylesTable;
import org.apache.poi.xssf.usermodel.XSSFCellStyle;
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;

/**

  • 使用CVS模式解决XLSX文件,可以有效解决用户模式内存溢出的问题
  • 该模式是POI官方推荐的读取大数据的模式,在用户模式下,数据量较大、Sheet较多、或者是有很多无用的空行的情况
  • ,容易出现内存溢出,用户模式读取Excel的典型代码如下: FileInputStream file=new
  • FileInputStream("c:\test.xlsx"); Workbook wb=new XSSFWorkbook(file);
  • @author 山人
    */
    public class XLSXCovertCSVReader {

    /**

    • The type of the data value is indicated by an attribute on the cell. The
    • value is usually in a "v" element within the cell. */ enum xssfDataType { BOOL, ERROR, FORMULA, INLINESTR, SSTINDEX, NUMBER, }

    /**

    • 使用xssf_sax_API处理Excel,请参考: http://poi.apache.org/spreadsheet/how-to.html#xssf_sax_api
    • Also see Standard ECMA-376, 1st edition, part 4, pages 1928ff, at
    • http://www.ecma-international.org/publications/standards/Ecma-376.htm
    • A web-friendly version is http://openiso.org/Ecma/376/Part4
      */
      class MyXSSFSheetHandler extends DefaultHandler {

      /**

      • Table with styles */ private StylesTable stylesTable;

      /**

      • Table with unique strings */ private ReadOnlySharedStringsTable sharedStringsTable;

      /**

      • Destination for data */ private final PrintStream output;

      /**

      • Number of columns to read starting with leftmost */ private final int minColumnCount;

      // Set when V start element is seen
      private boolean vIsOpen;

      // Set when cell start element is seen;
      // used when cell close element is seen.
      private xssfDataType nextDataType;

      // Used to format numeric cell values.
      private short formatIndex;
      private String formatString;
      private final DataFormatter formatter;

      private int thisColumn = -1;
      // The last column printed to the output stream
      private int lastColumnNumber = -1;

      // Gathers characters as they are seen.
      private StringBuffer value;
      private String[] record;
      private List rows = new ArrayList();
      private boolean isCellNull = false;

      /**

      • Accepts objects needed while parsing.
      • @param styles
      • Table of styles
      • @param strings
      • Table of shared strings
      • @param cols
      • Minimum number of columns to show
      • @param target
      • Sink for output */ public MyXSSFSheetHandler(StylesTable styles, ReadOnlySharedStringsTable strings, int cols, PrintStream target) { this.stylesTable = styles; this.sharedStringsTable = strings; this.minColumnCount = cols; this.output = target; this.value = new StringBuffer(); this.nextDataType = xssfDataType.NUMBER; this.formatter = new DataFormatter(); record = new String[this.minColumnCount]; rows.clear();// 每次读取都清空行集合 }

      /*

      • (non-Javadoc)
      • @see
      • org.xml.sax.helpers.DefaultHandler#startElement(java.lang.String,
      • java.lang.String, java.lang.String, org.xml.sax.Attributes)
        */
        public void startElement(String uri, String localName, String name,
        Attributes attributes) throws SAXException {

        if ("inlineStr".equals(name) || "v".equals(name)) {
        vIsOpen = true;
        // Clear contents cache
        value.setLength(0);
        }
        // c => cell
        else if ("c".equals(name)) {
        // Get the cell reference
        String r = attributes.getValue("r");
        int firstDigit = -1;
        for (int c = 0; c < r.length(); ++c) {
        if (Character.isDigit(r.charAt(c))) {
        firstDigit = c;
        break;
        }
        }
        thisColumn = nameToColumn(r.substring(0, firstDigit));

        // Set up defaults.
        this.nextDataType = xssfDataType.NUMBER;
        this.formatIndex = -1;
        this.formatString = null;
        String cellType = attributes.getValue("t");
        String cellStyleStr = attributes.getValue("s");
        if ("b".equals(cellType))
            nextDataType = xssfDataType.BOOL;
        else if ("e".equals(cellType))
            nextDataType = xssfDataType.ERROR;
        else if ("inlineStr".equals(cellType))
            nextDataType = xssfDataType.INLINESTR;
        else if ("s".equals(cellType))
            nextDataType = xssfDataType.SSTINDEX;
        else if ("str".equals(cellType))
            nextDataType = xssfDataType.FORMULA;
        else if (cellStyleStr != null) {
            // It's a number, but almost certainly one
            // with a special style or format
            int styleIndex = Integer.parseInt(cellStyleStr);
            XSSFCellStyle style = stylesTable.getStyleAt(styleIndex);
            this.formatIndex = style.getDataFormat();
            this.formatString = style.getDataFormatString();
            if (this.formatString == null)
                this.formatString = BuiltinFormats
                        .getBuiltinFormat(this.formatIndex);
        }
        

        }

      }

      /*

      • (non-Javadoc)
      • @see org.xml.sax.helpers.DefaultHandler#endElement(java.lang.String,
      • java.lang.String, java.lang.String)
        */
        public void endElement(String uri, String localName, String name)
        throws SAXException {

        String thisStr = null;

        // v => contents of a cell
        if ("v".equals(name)) {
        // Process the value contents as required.
        // Do now, as characters() may be called more than once
        switch (nextDataType) {

        case BOOL:
            char first = value.charAt(0);
            thisStr = first == '0' ? "FALSE" : "TRUE";
            break;
        
        case ERROR:
            thisStr = "\"ERROR:" + value.toString() + '"';
            break;
        
        case FORMULA:
            // A formula could result in a string value,
            // so always add double-quote characters.
            thisStr = '"' + value.toString() + '"';
            break;
        
        case INLINESTR:
            // TODO: have seen an example of this, so it's untested.
            XSSFRichTextString rtsi = new XSSFRichTextString(
                    value.toString());
            thisStr = '"' + rtsi.toString() + '"';
            break;
        
        case SSTINDEX:
            String sstIndex = value.toString();
            try {
                int idx = Integer.parseInt(sstIndex);
                XSSFRichTextString rtss = new XSSFRichTextString(
                        sharedStringsTable.getEntryAt(idx));
                thisStr = '"' + rtss.toString() + '"';
            } catch (NumberFormatException ex) {
                output.println("Failed to parse SST index '" + sstIndex
                        + "': " + ex.toString());
            }
            break;
        
        case NUMBER:
            String n = value.toString();
            // 判断是否是日期格式
            if (HSSFDateUtil.isADateFormat(this.formatIndex, n)) {
                Double d = Double.parseDouble(n);
                Date date=HSSFDateUtil.getJavaDate(d);
                thisStr=formateDateToString(date);
            } else if (this.formatString != null)
                thisStr = formatter.formatRawCellContents(
                        Double.parseDouble(n), this.formatIndex,
                        this.formatString);
            else
                thisStr = n;
            break;
        
        default:
            thisStr = "(TODO: Unexpected type: " + nextDataType + ")";
            break;
        }
        
        // Output after we've seen the string contents
        // Emit commas for any fields that were missing on this row
        if (lastColumnNumber == -1) {
            lastColumnNumber = 0;
        }
        //判断单元格的值是否为空
        if (thisStr == null || "".equals(isCellNull)) {
            isCellNull = true;// 设置单元格是否为空值
        }
        record[thisColumn] = thisStr;
        // Update column
        if (thisColumn > -1)
            lastColumnNumber = thisColumn;
        

        } else if ("row".equals(name)) {

        // Print out any missing commas if needed
        if (minColumns > 0) {
            // Columns are 0 based
            if (lastColumnNumber == -1) {
                lastColumnNumber = 0;
            }
            if (isCellNull == false && record[0] != null
                    && record[1] != null)// 判断是否空行
            {
                rows.add(record.clone());
                isCellNull = false;
                for (int i = 0; i < record.length; i++) {
                    record[i] = null;
                }
            }
        }
        lastColumnNumber = -1;
        

        }

      }

      public List getRows() {
      return rows;
      }

      public void setRows(List rows) {
      this.rows = rows;
      }

      /**

      • Captures characters only if a suitable element is open. Originally
      • was just "v"; extended for inlineStr also. */ public void characters(char[] ch, int start, int length) throws SAXException { if (vIsOpen) value.append(ch, start, length); }

      /**

      • Converts an Excel column name like "C" to a zero-based index.
      • @param name
      • @return Index corresponding to the specified name */ private int nameToColumn(String name) { int column = -1; for (int i = 0; i < name.length(); ++i) { int c = name.charAt(i); column = (column + 1) * 26 + c - 'A'; } return column; }

      private String formateDateToString(Date date) {
      SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");//格式化日期
      return sdf.format(date);

      }

    }

    // /////////////////////////////////////

    private OPCPackage xlsxPackage;
    private int minColumns;
    private PrintStream output;
    private String sheetName;

    /**

    • Creates a new XLSX -> CSV converter
    • @param pkg
    • The XLSX package to process
    • @param output
    • The PrintStream to output the CSV to
    • @param minColumns
    • The minimum number of columns to output, or -1 for no minimum */ public XLSXCovertCSVReader(OPCPackage pkg, PrintStream output, String sheetName, int minColumns) { this.xlsxPackage = pkg; this.output = output; this.minColumns = minColumns; this.sheetName = sheetName; }

    /**

    • Parses and shows the content of one sheet using the specified styles and
    • shared-strings tables.
    • @param styles
    • @param strings
    • @param sheetInputStream
      */
      public List processSheet(StylesTable styles,
      ReadOnlySharedStringsTable strings, InputStream sheetInputStream)
      throws IOException, ParserConfigurationException, SAXException {

      InputSource sheetSource = new InputSource(sheetInputStream);
      SAXParserFactory saxFactory = SAXParserFactory.newInstance();
      SAXParser saxParser = saxFactory.newSAXParser();
      XMLReader sheetParser = saxParser.getXMLReader();
      MyXSSFSheetHandler handler = new MyXSSFSheetHandler(styles, strings,
      this.minColumns, this.output);
      sheetParser.setContentHandler(handler);
      sheetParser.parse(sheetSource);
      return handler.getRows();
      }

    /**

    • 初始化这个处理程序 将
    • @throws IOException
    • @throws OpenXML4JException
    • @throws ParserConfigurationException
    • @throws SAXException
      */
      public List process() throws IOException, OpenXML4JException,
      ParserConfigurationException, SAXException {

      ReadOnlySharedStringsTable strings = new ReadOnlySharedStringsTable(
      this.xlsxPackage);
      XSSFReader xssfReader = new XSSFReader(this.xlsxPackage);
      List list = null;
      StylesTable styles = xssfReader.getStylesTable();
      XSSFReader.SheetIterator iter = (XSSFReader.SheetIterator) xssfReader
      .getSheetsData();
      int index = 0;
      while (iter.hasNext()) {
      InputStream stream = iter.next();
      String sheetNameTemp = iter.getSheetName();
      if (this.sheetName.equals(sheetNameTemp)) {
      list = processSheet(styles, strings, stream);
      stream.close();
      ++index;
      }
      }
      return list;
      }

    /**

    • 读取Excel
    • @param path
    • 文件路径
    • @param sheetName
    • sheet名称
    • @param minColumns
    • 列总数
    • @return
    • @throws SAXException
    • @throws ParserConfigurationException
    • @throws OpenXML4JException
    • @throws IOException */ private static List readerExcel(String path, String sheetName, int minColumns) throws IOException, OpenXML4JException, ParserConfigurationException, SAXException { OPCPackage p = OPCPackage.open(path, PackageAccess.READ); XLSXCovertCSVReader xlsx2csv = new XLSXCovertCSVReader(p, System.out, sheetName, minColumns); List list = xlsx2csv.process(); p.close(); return list; }

    public static void main(String[] args) throws Exception {
    List list = XLSXCovertCSVReader
    .readerExcel(
    "F:\test.xlsx",
    "Sheet1", 17);
    for (String[] record : list) {
    for (String cell : record) {
    System.out.print(cell + " ");
    }
    System.out.println();
    }
    }

}

qq_36939564
qq_36939564 就是在这个ReadOnlySharedStringsTable处内存溢出了:在解析xlsx文件的时候,首先会将sharedStrings.xml文件加载到内存,此时内存溢出。上面的那个方法不行,已经测试过了。
一年多之前 回复

多次调用IO,都会报这个错

用EEplus 试试

qq_36939564
qq_36939564 java语言不能使用这个EEplus 吧.
一年多之前 回复

文件太大了,虚拟机承受不了,自然内存会溢出呀。。。。要么加大虚拟机的内存空间,要么把文件分割。

huang931027
IAmObject 回复qq_36939564: 那你可以不用生成呀,自己做一个模板,把数据导出到模板
一年多之前 回复
qq_36939564
qq_36939564 回复huang931027: 少量的数据没有问题,只有当数据量大的时候才会出现问题。现在有个疑问:使用poi导出的xlsx文件为130M,没有使用到sharedStrings.xml,而使用Microsoft Excel打开之后保存(不做任何修改,只是打开再保存),文件变大为180M。我在想是不是poi生成xlsx文件的方式和Microsoft Excel的保存方式不一样。
一年多之前 回复
huang931027
IAmObject 回复qq_36939564: 你可以先试一下,excel里面只存上几条数据,看看代码执行的时候有没有报错。
一年多之前 回复
qq_36939564
qq_36939564 我将虚拟机的内存调到了1G,依旧这样。
一年多之前 回复
Csdn user default icon
上传中...
上传图片
插入图片
抄袭、复制答案,以达到刷声望分或其他目的的行为,在CSDN问答是严格禁止的,一经发现立刻封号。是时候展现真正的技术了!
立即提问