龙空技术网

每天一个入坑小技巧:poi读取excel导致的内存溢出问题

云溪影视基地 108

前言:

眼前我们对“apache poi读取excel”大致比较注重,兄弟们都需要剖析一些“apache poi读取excel”的相关资讯。那么小编同时在网络上搜集了一些对于“apache poi读取excel””的相关内容,希望大家能喜欢,姐妹们一起来学习一下吧!

文件拆分后读取poi sax读取excel,解决大数据量导致的内存溢出问题

sax事件驱动解析与原来的解析方式最大的区别就是,sax将excel转换成xml格式然后一行去读取,而不是一下将excel解析到内存中,这样就可以有效的解决由数据量大导致的内存溢出问题了,sax解析时会忽略掉空格,这一点要单独去处理

1.sax将excel转换成xml

  为了更好的使用sax首先看看被转化成xml是什么样子的。

public static void main(String[] args) {        String filename ="D:\\8月\\otc\\sax解析测试\\0221otcposdata20190909.xlsm";        OPCPackage pkg;        try {            pkg = OPCPackage.open(filename);            XSSFReader r = new XSSFReader(pkg);            //查看转换的xml原始文件,方便理解后面解析时的处理,            InputStream in = r.getSheet("rId1");            byte[] buf = new byte[1024];            int len;            while ((len = in.read(buf)) != -1) {                System.out.write(buf, 0, len);            }        } catch (Exception e) {            e.printStackTrace();        }    }

解析后的样子,发现有好多标签,cols、col、row、 c 、v等等这些就是excel中的元素,后面的解析会用到。

2.解析excel2007

  通过继承DefaultHandler类,重写process(),startElement(),characters(),endElement()这四个方法。

import java.io.IOException;import java.io.InputStream;import java.util.ArrayList;import java.util.List;import org.apache.poi.openxml4j.exceptions.OpenXML4JException;import org.apache.poi.openxml4j.opc.OPCPackage;import org.apache.poi.ss.usermodel.BuiltinFormats;import org.apache.poi.ss.usermodel.DataFormatter;import org.apache.poi.xssf.eventusermodel.XSSFReader;import org.apache.poi.xssf.eventusermodel.XSSFReader.SheetIterator;import org.apache.poi.xssf.model.SharedStringsTable;import org.apache.poi.xssf.model.StylesTable;import org.apache.poi.xssf.usermodel.XSSFCellStyle;import org.apache.poi.xssf.usermodel.XSSFRichTextString;import org.xml.sax.Attributes;import org.xml.sax.InputSource;import org.xml.sax.SAXException;import org.xml.sax.XMLReader;import org.xml.sax.helpers.DefaultHandler;import org.xml.sax.helpers.XMLReaderFactory;public class ExcelXlsxReader extends DefaultHandler {    private IExcelRowReader rowReader;    public void setRowReader(IExcelRowReader rowReader) {        this.rowReader = rowReader;    }    private SharedStringsTable sst;    private StylesTable stylesTable;    private String lastContents;    private int sheetIndex = -1;    private CellDataType nextDataType = CellDataType.SSTINDEX;    private final DataFormatter formatter = new DataFormatter();    private short formatIndex;    private String formatString;    private List<List<String>> templist = new ArrayList<List<String>>();    private List<String> rowList = null;    private boolean allowNullRow = true;    private String dimension;    private int longest;    private List<String> currentRow;    private boolean isSSTIndex = false;    private String sheetName = null;    /**     * 遍历工作簿中所有的电子表格     *      * @param filename     * @throws IOException     * @throws OpenXML4JException     * @throws SAXException     * @throws Exception     */    public void process(String filename,String newFileName,String fileType) throws IOException,            OpenXML4JException, SAXException {        OPCPackage pkg = OPCPackage.open(filename);        XSSFReader r = new XSSFReader(pkg);        stylesTable = r.getStylesTable();        SharedStringsTable sst = r.getSharedStringsTable();        XMLReader parser = fetchSheetParser(sst);        SheetIterator sheets = (SheetIterator) r.getSheetsData();        while (sheets.hasNext()) {            InputStream sheet = sheets.next();            sheetName = sheets.getSheetName();            sheetIndex++;            InputSource sheetSource = new InputSource(sheet);            parser.parse(sheetSource);            sheet.close();        }        //写文件        rowReader.writeExcel(templist,sheetName,newFileName,fileType);        templist = null;    }    public XMLReader fetchSheetParser(SharedStringsTable sst)            throws SAXException {        XMLReader parser = XMLReaderFactory                .createXMLReader("org.apache.xerces.parsers.SAXParser");        this.sst = sst;        parser.setContentHandler(this);        return parser;    }    @Override    public void characters(char[] ch, int start, int length)            throws SAXException {        // 得到单元格内容的值        lastContents += new String(ch, start, length);    }    @Override    public void startElement(String uri, String localName, String qName,            Attributes attributes) throws SAXException {                if (qName.equals("dimension")) {            dimension = attributes.getValue("ref");            longest = covertRowIdtoInt(dimension.substring(dimension                    .indexOf(":") + 1));        }        //行        if (qName.equals("row")) {            currentRow = new ArrayList<String>();        }        //单元格         if (qName.equals("c")) {            // 判断单元格的值是SST的索引            if (attributes.getValue("t") != null                    && attributes.getValue("t").equals("s")) {                isSSTIndex = true;                nextDataType = CellDataType.SSTINDEX;            } else {                isSSTIndex = false;                // 当单元格的值不是SST的索引放一个空值占位。                currentRow.add("");                // 判断单元格格式类型                setNextDataType(attributes);            }        }        lastContents = "";    }    @Override    public void endElement(String uri, String localName, String qName)            throws SAXException {        // 行结束,存储一行数据        if (qName.equals("row")) {            if (isWriteRow(currentRow)) {                rowList = new ArrayList<String>();                rowList.addAll(currentRow);                templist.add(rowList);                rowList = null;            }        }        // 取值        if (qName.equals("v")) {            // 单元格的值是SST的索引            if (isSSTIndex) {                lastContents = getDataValue(lastContents);            } else {                // 单元格的值不是SST的索引,删除占位空格,再取值                currentRow.remove(currentRow.size() - 1);                lastContents = getDataValue(lastContents);            }            currentRow.add(lastContents);        }    }    /**     * 单元格中的数据类型枚举     */    enum CellDataType {        SSTINDEX, NUMBER, DATE, NULL    }        /**     * 在取值前处理单元格数据类型,目前只对日期格式进行处理,可扩展。     */    public void setNextDataType(Attributes attributes) {        nextDataType = CellDataType.NUMBER;        formatIndex = -1;        formatString = null;        String dateFormat = "m/d/yy,yyyy\\-mm\\-dd;@,yyyy/m/d;@,yyyy/m/d\\ h:mm;@,mm/dd/yy;@,m/d;@,"                + "yy/m/d;@,m/d/yy;@,[$-409]yyyy/m/d\\ h:mm\\ AM/PM;@,[$-F800]dddd\\,\\ mmmm\\ dd\\,\\ yyyy";        String cellStyleStr = attributes.getValue("s");        if (cellStyleStr != null) {            int styleIndex = Integer.parseInt(cellStyleStr);            XSSFCellStyle style = stylesTable.getStyleAt(styleIndex);            formatIndex = style.getDataFormat();            formatString = style.getDataFormatString();            // 对日期类型进行处理            if (dateFormat.contains(formatString)) {                nextDataType = CellDataType.DATE;                formatString = "yyyy-MM-dd";            }            if (formatString == null) {                nextDataType = CellDataType.NULL;                formatString = BuiltinFormats.getBuiltinFormat(formatIndex);            }        }    }    private boolean isWriteRow(List<String> list) {        boolean flag = false;        for (int i = 0; i < list.size(); i++) {            if (list.get(i) != null && !list.get(i).equals("")) {                flag = true;                break;            }        }        if (allowNullRow) {            if (!flag) {                flag = true;// 允许一次空行                allowNullRow = false;            }        }        return flag;    }    public static int covertRowIdtoInt(String rowId) {        int firstDigit = -1;        for (int c = 0; c < rowId.length(); ++c) {            if (Character.isDigit(rowId.charAt(c))) {                firstDigit = c;                break;            }        }        String newRowId = rowId.substring(0, firstDigit);        int num = 0;        int result = 0;        int length = newRowId.length();        for (int i = 0; i < length; i++) {            char ch = newRowId.charAt(length - i - 1);            num = (int) (ch - 'A' + 1);            num *= Math.pow(26, i);            result += num;        }        return result;    }    /**     * 对解析出来的数据进行类型处理     */    public String getDataValue(String value) {        String thisStr = null;        switch (nextDataType) {        case SSTINDEX:            String sstIndex = value.toString();            try {                int idx = Integer.parseInt(sstIndex);                XSSFRichTextString rtss = new XSSFRichTextString(sst                        .getEntryAt(idx));                thisStr = rtss.toString();                rtss = null;            } catch (NumberFormatException ex) {                thisStr = value.toString();            }            break;        case DATE:            thisStr = formatter.formatRawCellContents(                    Double.parseDouble(value), formatIndex, formatString);            break;        default:            thisStr = value;            break;        }        return thisStr;    }    }

3.解析excel2003

  与读取excel2007有所区别,Excel2003需要通过继承HSSFListener类来解析,重写process()、processRecord()两个方法,在processRecord中进行读取操作。

import java.io.FileInputStream;import java.io.IOException;import java.util.ArrayList;import java.util.List;import org.apache.poi.hssf.eventusermodel.EventWorkbookBuilder.SheetRecordCollectingListener;import org.apache.poi.hssf.eventusermodel.FormatTrackingHSSFListener;import org.apache.poi.hssf.eventusermodel.HSSFEventFactory;import org.apache.poi.hssf.eventusermodel.HSSFListener;import org.apache.poi.hssf.eventusermodel.HSSFRequest;import org.apache.poi.hssf.eventusermodel.MissingRecordAwareHSSFListener;import org.apache.poi.hssf.eventusermodel.dummyrecord.LastCellOfRowDummyRecord;import org.apache.poi.hssf.eventusermodel.dummyrecord.MissingCellDummyRecord;import org.apache.poi.hssf.model.HSSFFormulaParser;import org.apache.poi.hssf.record.BOFRecord;import org.apache.poi.hssf.record.BlankRecord;import org.apache.poi.hssf.record.BoolErrRecord;import org.apache.poi.hssf.record.BoundSheetRecord;import org.apache.poi.hssf.record.FormulaRecord;import org.apache.poi.hssf.record.LabelRecord;import org.apache.poi.hssf.record.LabelSSTRecord;import org.apache.poi.hssf.record.NumberRecord;import org.apache.poi.hssf.record.Record;import org.apache.poi.hssf.record.SSTRecord;import org.apache.poi.hssf.record.StringRecord;import org.apache.poi.hssf.usermodel.HSSFWorkbook;import org.apache.poi.poifs.filesystem.POIFSFileSystem;public class ExcelXlsReader implements HSSFListener {    private int minColumns = -1;    private POIFSFileSystem fs;    private int lastRowNumber;    private int lastColumnNumber;    /** Should we output the formula, or the value it has? */    private boolean outputFormulaValues = true;    /** For parsing Formulas */    private SheetRecordCollectingListener workbookBuildingListener;    // excel2003工作薄    private HSSFWorkbook stubWorkbook;    // Records we pick up as we process    private SSTRecord sstRecord;    private FormatTrackingHSSFListener formatListener;    // 表索引    private int sheetIndex = -1;    private BoundSheetRecord[] orderedBSRs;    @SuppressWarnings("unchecked")    private ArrayList boundSheetRecords = new ArrayList();    // For handling formulas with string results    private int nextRow;    private int nextColumn;    private boolean outputNextStringRecord;    // 当前行    private int curRow = 0;    // 存储行记录的容器    private List<String> rowlist = new ArrayList<String>();;    private boolean allowNullRow = true;    @SuppressWarnings("unused")    private String sheetName;    private List<List<String>> templist = new ArrayList<List<String>>();    private List<String> r = null;    private IExcelRowReader rowReader;    public void setRowReader(IExcelRowReader rowReader) {        this.rowReader = rowReader;    }    /**     * 遍历excel下所有的sheet     *      * @throws IOException     */    public void process(String fileName,String newFileName,String fileType) throws IOException {        this.fs = new POIFSFileSystem(new FileInputStream(fileName));        MissingRecordAwareHSSFListener listener = new MissingRecordAwareHSSFListener(this);        formatListener = new FormatTrackingHSSFListener(listener);        HSSFEventFactory factory = new HSSFEventFactory();             HSSFRequest request = new HSSFRequest();        if (outputFormulaValues) {            request.addListenerForAllRecords(formatListener);                  } else {            workbookBuildingListener = new SheetRecordCollectingListener(formatListener);            request.addListenerForAllRecords(workbookBuildingListener);        }        factory.processWorkbookEvents(request, fs);        rowReader.writeExcel(templist,sheetName,newFileName,fileType);               templist = null;    }    /**     * HSSFListener 监听方法,处理 Record     */    @SuppressWarnings("unchecked")    public void processRecord(Record record) {        int thisRow = -1;        int thisColumn = -1;        String thisStr = null;        String value = null;        switch (record.getSid()) {        case BoundSheetRecord.sid:            boundSheetRecords.add(record);            break;        case BOFRecord.sid:            BOFRecord br = (BOFRecord) record;            if (br.getType() == BOFRecord.TYPE_WORKSHEET) {                // 如果有需要,则建立子工作薄                if (workbookBuildingListener != null && stubWorkbook == null) {                    stubWorkbook = workbookBuildingListener.getStubHSSFWorkbook();                }                sheetIndex++;                if (orderedBSRs == null) {                    orderedBSRs = BoundSheetRecord.orderByBofPosition(boundSheetRecords);                }                sheetName = orderedBSRs[sheetIndex].getSheetname();                //ExcelRowReader.SHEETNAME = sheetName;            }            break;        case SSTRecord.sid:            sstRecord = (SSTRecord) record;            break;        case BlankRecord.sid:            BlankRecord brec = (BlankRecord) record;            thisRow = brec.getRow();            thisColumn = brec.getColumn();            thisStr = "";            rowlist.add(thisColumn, thisStr);            break;        case BoolErrRecord.sid: // 单元格为布尔类型            BoolErrRecord berec = (BoolErrRecord) record;            thisRow = berec.getRow();            thisColumn = berec.getColumn();            thisStr = berec.getBooleanValue() + "";            rowlist.add(thisColumn, thisStr);            break;        case FormulaRecord.sid: // 单元格为公式类型            FormulaRecord frec = (FormulaRecord) record;            thisRow = frec.getRow();            thisColumn = frec.getColumn();            if (outputFormulaValues) {                if (Double.isNaN(frec.getValue())) {                    outputNextStringRecord = true;                    nextRow = frec.getRow();                    nextColumn = frec.getColumn();                } else {                    thisStr = formatListener.formatNumberDateCell(frec);                }            } else {                thisStr = '"' + HSSFFormulaParser.toFormulaString(stubWorkbook, frec.getParsedExpression()) + '"';            }            rowlist.add(thisColumn, thisStr);            break;        case StringRecord.sid:// 单元格中公式的字符串            if (outputNextStringRecord) {                StringRecord srec = (StringRecord) record;                thisStr = srec.getString();                thisRow = nextRow;                thisColumn = nextColumn;                outputNextStringRecord = false;            }            break;        case LabelRecord.sid:            LabelRecord lrec = (LabelRecord) record;            curRow = thisRow = lrec.getRow();            thisColumn = lrec.getColumn();            value = lrec.getValue().trim();            value = value.equals("") ? " " : value;            this.rowlist.add(thisColumn, value);            break;        case LabelSSTRecord.sid: // 单元格为字符串类型            LabelSSTRecord lsrec = (LabelSSTRecord) record;            curRow = thisRow = lsrec.getRow();            thisColumn = lsrec.getColumn();            if (sstRecord == null) {                rowlist.add(thisColumn, " ");            } else {                value = sstRecord.getString(lsrec.getSSTIndex()).toString().trim();                value = value.equals("") ? " " : value;                rowlist.add(thisColumn, value);            }            break;        case NumberRecord.sid: // 单元格为数字类型            NumberRecord numrec = (NumberRecord) record;            curRow = thisRow = numrec.getRow();            thisColumn = numrec.getColumn();            value = formatListener.formatNumberDateCell(numrec).trim();            value = value.equals("") ? " " : value;            // 向容器加入列值            rowlist.add(thisColumn, value);            break;        default:            break;        }        // 遇到新行的操作        if (thisRow != -1 && thisRow != lastRowNumber) {            lastColumnNumber = -1;        }        // 空值的操作        if (record instanceof MissingCellDummyRecord) {            MissingCellDummyRecord mc = (MissingCellDummyRecord) record;            curRow = thisRow = mc.getRow();            thisColumn = mc.getColumn();            rowlist.add(thisColumn, " ");        }        // 更新行和列的值        if (thisRow > -1)            lastRowNumber = thisRow;        if (thisColumn > -1)            lastColumnNumber = thisColumn;        // 行结束时的操作        if (record instanceof LastCellOfRowDummyRecord) {            if (minColumns > 0) {                // 列值重新置空                if (lastColumnNumber == -1) {                    lastColumnNumber = 0;                }            }            lastColumnNumber = -1;            // 每行结束时, 调用getRows() 方法            if(isWriteRow(rowlist)){                //rowReader.getRows(sheetIndex, curRow, rowlist);                r = new ArrayList<String>();                r.addAll(rowlist);                templist.add(r);                r = null;            }            // 清空容器            rowlist.clear();        }    }    private boolean isWriteRow(List list){        boolean flag = false;         for (int i = 0; i < list.size(); i++) {            if (list.get(i)!= null && !list.get(i).equals("")) {                flag = true;                break;            }        }        if (allowNullRow) {            if (!flag) {                flag = true;//允许一次空行                allowNullRow = false;            }                    }        return flag;           }}

标签: #apache poi读取excel