前言:
眼前我们对“apache poi读取excel”大致比较注重,兄弟们都需要剖析一些“apache poi读取excel”的相关资讯。那么小编同时在网络上搜集了一些对于“apache poi读取excel””的相关内容,希望大家能喜欢,姐妹们一起来学习一下吧!文件拆分后读取poi sax读取excel,解决大数据量导致的内存溢出问题
sax事件驱动解析与原来的解析方式最大的区别就是,sax将excel转换成xml格式然后一行去读取,而不是一下将excel解析到内存中,这样就可以有效的解决由数据量大导致的内存溢出问题了,sax解析时会忽略掉空格,这一点要单独去处理
1.sax将excel转换成xml
为了更好的使用sax首先看看被转化成xml是什么样子的。
public static void main(String[] args) { String filename ="D:\\8月\\otc\\sax解析测试\\0221otcposdata20190909.xlsm"; OPCPackage pkg; try { pkg = OPCPackage.open(filename); XSSFReader r = new XSSFReader(pkg); //查看转换的xml原始文件,方便理解后面解析时的处理, InputStream in = r.getSheet("rId1"); byte[] buf = new byte[1024]; int len; while ((len = in.read(buf)) != -1) { System.out.write(buf, 0, len); } } catch (Exception e) { e.printStackTrace(); } }
解析后的样子,发现有好多标签,cols、col、row、 c 、v等等这些就是excel中的元素,后面的解析会用到。
2.解析excel2007
通过继承DefaultHandler类,重写process(),startElement(),characters(),endElement()这四个方法。
import java.io.IOException;import java.io.InputStream;import java.util.ArrayList;import java.util.List;import org.apache.poi.openxml4j.exceptions.OpenXML4JException;import org.apache.poi.openxml4j.opc.OPCPackage;import org.apache.poi.ss.usermodel.BuiltinFormats;import org.apache.poi.ss.usermodel.DataFormatter;import org.apache.poi.xssf.eventusermodel.XSSFReader;import org.apache.poi.xssf.eventusermodel.XSSFReader.SheetIterator;import org.apache.poi.xssf.model.SharedStringsTable;import org.apache.poi.xssf.model.StylesTable;import org.apache.poi.xssf.usermodel.XSSFCellStyle;import org.apache.poi.xssf.usermodel.XSSFRichTextString;import org.xml.sax.Attributes;import org.xml.sax.InputSource;import org.xml.sax.SAXException;import org.xml.sax.XMLReader;import org.xml.sax.helpers.DefaultHandler;import org.xml.sax.helpers.XMLReaderFactory;public class ExcelXlsxReader extends DefaultHandler { private IExcelRowReader rowReader; public void setRowReader(IExcelRowReader rowReader) { this.rowReader = rowReader; } private SharedStringsTable sst; private StylesTable stylesTable; private String lastContents; private int sheetIndex = -1; private CellDataType nextDataType = CellDataType.SSTINDEX; private final DataFormatter formatter = new DataFormatter(); private short formatIndex; private String formatString; private List<List<String>> templist = new ArrayList<List<String>>(); private List<String> rowList = null; private boolean allowNullRow = true; private String dimension; private int longest; private List<String> currentRow; private boolean isSSTIndex = false; private String sheetName = null; /** * 遍历工作簿中所有的电子表格 * * @param filename * @throws IOException * @throws OpenXML4JException * @throws SAXException * @throws Exception */ public void process(String filename,String newFileName,String fileType) throws IOException, OpenXML4JException, SAXException { OPCPackage pkg = OPCPackage.open(filename); XSSFReader r = new XSSFReader(pkg); stylesTable = r.getStylesTable(); SharedStringsTable sst = r.getSharedStringsTable(); XMLReader parser = fetchSheetParser(sst); SheetIterator sheets = (SheetIterator) r.getSheetsData(); while (sheets.hasNext()) { InputStream sheet = sheets.next(); sheetName = sheets.getSheetName(); sheetIndex++; InputSource sheetSource = new InputSource(sheet); parser.parse(sheetSource); sheet.close(); } //写文件 rowReader.writeExcel(templist,sheetName,newFileName,fileType); templist = null; } public XMLReader fetchSheetParser(SharedStringsTable sst) throws SAXException { XMLReader parser = XMLReaderFactory .createXMLReader("org.apache.xerces.parsers.SAXParser"); this.sst = sst; parser.setContentHandler(this); return parser; } @Override public void characters(char[] ch, int start, int length) throws SAXException { // 得到单元格内容的值 lastContents += new String(ch, start, length); } @Override public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException { if (qName.equals("dimension")) { dimension = attributes.getValue("ref"); longest = covertRowIdtoInt(dimension.substring(dimension .indexOf(":") + 1)); } //行 if (qName.equals("row")) { currentRow = new ArrayList<String>(); } //单元格 if (qName.equals("c")) { // 判断单元格的值是SST的索引 if (attributes.getValue("t") != null && attributes.getValue("t").equals("s")) { isSSTIndex = true; nextDataType = CellDataType.SSTINDEX; } else { isSSTIndex = false; // 当单元格的值不是SST的索引放一个空值占位。 currentRow.add(""); // 判断单元格格式类型 setNextDataType(attributes); } } lastContents = ""; } @Override public void endElement(String uri, String localName, String qName) throws SAXException { // 行结束,存储一行数据 if (qName.equals("row")) { if (isWriteRow(currentRow)) { rowList = new ArrayList<String>(); rowList.addAll(currentRow); templist.add(rowList); rowList = null; } } // 取值 if (qName.equals("v")) { // 单元格的值是SST的索引 if (isSSTIndex) { lastContents = getDataValue(lastContents); } else { // 单元格的值不是SST的索引,删除占位空格,再取值 currentRow.remove(currentRow.size() - 1); lastContents = getDataValue(lastContents); } currentRow.add(lastContents); } } /** * 单元格中的数据类型枚举 */ enum CellDataType { SSTINDEX, NUMBER, DATE, NULL } /** * 在取值前处理单元格数据类型,目前只对日期格式进行处理,可扩展。 */ public void setNextDataType(Attributes attributes) { nextDataType = CellDataType.NUMBER; formatIndex = -1; formatString = null; String dateFormat = "m/d/yy,yyyy\\-mm\\-dd;@,yyyy/m/d;@,yyyy/m/d\\ h:mm;@,mm/dd/yy;@,m/d;@," + "yy/m/d;@,m/d/yy;@,[$-409]yyyy/m/d\\ h:mm\\ AM/PM;@,[$-F800]dddd\\,\\ mmmm\\ dd\\,\\ yyyy"; String cellStyleStr = attributes.getValue("s"); if (cellStyleStr != null) { int styleIndex = Integer.parseInt(cellStyleStr); XSSFCellStyle style = stylesTable.getStyleAt(styleIndex); formatIndex = style.getDataFormat(); formatString = style.getDataFormatString(); // 对日期类型进行处理 if (dateFormat.contains(formatString)) { nextDataType = CellDataType.DATE; formatString = "yyyy-MM-dd"; } if (formatString == null) { nextDataType = CellDataType.NULL; formatString = BuiltinFormats.getBuiltinFormat(formatIndex); } } } private boolean isWriteRow(List<String> list) { boolean flag = false; for (int i = 0; i < list.size(); i++) { if (list.get(i) != null && !list.get(i).equals("")) { flag = true; break; } } if (allowNullRow) { if (!flag) { flag = true;// 允许一次空行 allowNullRow = false; } } return flag; } public static int covertRowIdtoInt(String rowId) { int firstDigit = -1; for (int c = 0; c < rowId.length(); ++c) { if (Character.isDigit(rowId.charAt(c))) { firstDigit = c; break; } } String newRowId = rowId.substring(0, firstDigit); int num = 0; int result = 0; int length = newRowId.length(); for (int i = 0; i < length; i++) { char ch = newRowId.charAt(length - i - 1); num = (int) (ch - 'A' + 1); num *= Math.pow(26, i); result += num; } return result; } /** * 对解析出来的数据进行类型处理 */ public String getDataValue(String value) { String thisStr = null; switch (nextDataType) { case SSTINDEX: String sstIndex = value.toString(); try { int idx = Integer.parseInt(sstIndex); XSSFRichTextString rtss = new XSSFRichTextString(sst .getEntryAt(idx)); thisStr = rtss.toString(); rtss = null; } catch (NumberFormatException ex) { thisStr = value.toString(); } break; case DATE: thisStr = formatter.formatRawCellContents( Double.parseDouble(value), formatIndex, formatString); break; default: thisStr = value; break; } return thisStr; } }
3.解析excel2003
与读取excel2007有所区别,Excel2003需要通过继承HSSFListener类来解析,重写process()、processRecord()两个方法,在processRecord中进行读取操作。
import java.io.FileInputStream;import java.io.IOException;import java.util.ArrayList;import java.util.List;import org.apache.poi.hssf.eventusermodel.EventWorkbookBuilder.SheetRecordCollectingListener;import org.apache.poi.hssf.eventusermodel.FormatTrackingHSSFListener;import org.apache.poi.hssf.eventusermodel.HSSFEventFactory;import org.apache.poi.hssf.eventusermodel.HSSFListener;import org.apache.poi.hssf.eventusermodel.HSSFRequest;import org.apache.poi.hssf.eventusermodel.MissingRecordAwareHSSFListener;import org.apache.poi.hssf.eventusermodel.dummyrecord.LastCellOfRowDummyRecord;import org.apache.poi.hssf.eventusermodel.dummyrecord.MissingCellDummyRecord;import org.apache.poi.hssf.model.HSSFFormulaParser;import org.apache.poi.hssf.record.BOFRecord;import org.apache.poi.hssf.record.BlankRecord;import org.apache.poi.hssf.record.BoolErrRecord;import org.apache.poi.hssf.record.BoundSheetRecord;import org.apache.poi.hssf.record.FormulaRecord;import org.apache.poi.hssf.record.LabelRecord;import org.apache.poi.hssf.record.LabelSSTRecord;import org.apache.poi.hssf.record.NumberRecord;import org.apache.poi.hssf.record.Record;import org.apache.poi.hssf.record.SSTRecord;import org.apache.poi.hssf.record.StringRecord;import org.apache.poi.hssf.usermodel.HSSFWorkbook;import org.apache.poi.poifs.filesystem.POIFSFileSystem;public class ExcelXlsReader implements HSSFListener { private int minColumns = -1; private POIFSFileSystem fs; private int lastRowNumber; private int lastColumnNumber; /** Should we output the formula, or the value it has? */ private boolean outputFormulaValues = true; /** For parsing Formulas */ private SheetRecordCollectingListener workbookBuildingListener; // excel2003工作薄 private HSSFWorkbook stubWorkbook; // Records we pick up as we process private SSTRecord sstRecord; private FormatTrackingHSSFListener formatListener; // 表索引 private int sheetIndex = -1; private BoundSheetRecord[] orderedBSRs; @SuppressWarnings("unchecked") private ArrayList boundSheetRecords = new ArrayList(); // For handling formulas with string results private int nextRow; private int nextColumn; private boolean outputNextStringRecord; // 当前行 private int curRow = 0; // 存储行记录的容器 private List<String> rowlist = new ArrayList<String>();; private boolean allowNullRow = true; @SuppressWarnings("unused") private String sheetName; private List<List<String>> templist = new ArrayList<List<String>>(); private List<String> r = null; private IExcelRowReader rowReader; public void setRowReader(IExcelRowReader rowReader) { this.rowReader = rowReader; } /** * 遍历excel下所有的sheet * * @throws IOException */ public void process(String fileName,String newFileName,String fileType) throws IOException { this.fs = new POIFSFileSystem(new FileInputStream(fileName)); MissingRecordAwareHSSFListener listener = new MissingRecordAwareHSSFListener(this); formatListener = new FormatTrackingHSSFListener(listener); HSSFEventFactory factory = new HSSFEventFactory(); HSSFRequest request = new HSSFRequest(); if (outputFormulaValues) { request.addListenerForAllRecords(formatListener); } else { workbookBuildingListener = new SheetRecordCollectingListener(formatListener); request.addListenerForAllRecords(workbookBuildingListener); } factory.processWorkbookEvents(request, fs); rowReader.writeExcel(templist,sheetName,newFileName,fileType); templist = null; } /** * HSSFListener 监听方法,处理 Record */ @SuppressWarnings("unchecked") public void processRecord(Record record) { int thisRow = -1; int thisColumn = -1; String thisStr = null; String value = null; switch (record.getSid()) { case BoundSheetRecord.sid: boundSheetRecords.add(record); break; case BOFRecord.sid: BOFRecord br = (BOFRecord) record; if (br.getType() == BOFRecord.TYPE_WORKSHEET) { // 如果有需要,则建立子工作薄 if (workbookBuildingListener != null && stubWorkbook == null) { stubWorkbook = workbookBuildingListener.getStubHSSFWorkbook(); } sheetIndex++; if (orderedBSRs == null) { orderedBSRs = BoundSheetRecord.orderByBofPosition(boundSheetRecords); } sheetName = orderedBSRs[sheetIndex].getSheetname(); //ExcelRowReader.SHEETNAME = sheetName; } break; case SSTRecord.sid: sstRecord = (SSTRecord) record; break; case BlankRecord.sid: BlankRecord brec = (BlankRecord) record; thisRow = brec.getRow(); thisColumn = brec.getColumn(); thisStr = ""; rowlist.add(thisColumn, thisStr); break; case BoolErrRecord.sid: // 单元格为布尔类型 BoolErrRecord berec = (BoolErrRecord) record; thisRow = berec.getRow(); thisColumn = berec.getColumn(); thisStr = berec.getBooleanValue() + ""; rowlist.add(thisColumn, thisStr); break; case FormulaRecord.sid: // 单元格为公式类型 FormulaRecord frec = (FormulaRecord) record; thisRow = frec.getRow(); thisColumn = frec.getColumn(); if (outputFormulaValues) { if (Double.isNaN(frec.getValue())) { outputNextStringRecord = true; nextRow = frec.getRow(); nextColumn = frec.getColumn(); } else { thisStr = formatListener.formatNumberDateCell(frec); } } else { thisStr = '"' + HSSFFormulaParser.toFormulaString(stubWorkbook, frec.getParsedExpression()) + '"'; } rowlist.add(thisColumn, thisStr); break; case StringRecord.sid:// 单元格中公式的字符串 if (outputNextStringRecord) { StringRecord srec = (StringRecord) record; thisStr = srec.getString(); thisRow = nextRow; thisColumn = nextColumn; outputNextStringRecord = false; } break; case LabelRecord.sid: LabelRecord lrec = (LabelRecord) record; curRow = thisRow = lrec.getRow(); thisColumn = lrec.getColumn(); value = lrec.getValue().trim(); value = value.equals("") ? " " : value; this.rowlist.add(thisColumn, value); break; case LabelSSTRecord.sid: // 单元格为字符串类型 LabelSSTRecord lsrec = (LabelSSTRecord) record; curRow = thisRow = lsrec.getRow(); thisColumn = lsrec.getColumn(); if (sstRecord == null) { rowlist.add(thisColumn, " "); } else { value = sstRecord.getString(lsrec.getSSTIndex()).toString().trim(); value = value.equals("") ? " " : value; rowlist.add(thisColumn, value); } break; case NumberRecord.sid: // 单元格为数字类型 NumberRecord numrec = (NumberRecord) record; curRow = thisRow = numrec.getRow(); thisColumn = numrec.getColumn(); value = formatListener.formatNumberDateCell(numrec).trim(); value = value.equals("") ? " " : value; // 向容器加入列值 rowlist.add(thisColumn, value); break; default: break; } // 遇到新行的操作 if (thisRow != -1 && thisRow != lastRowNumber) { lastColumnNumber = -1; } // 空值的操作 if (record instanceof MissingCellDummyRecord) { MissingCellDummyRecord mc = (MissingCellDummyRecord) record; curRow = thisRow = mc.getRow(); thisColumn = mc.getColumn(); rowlist.add(thisColumn, " "); } // 更新行和列的值 if (thisRow > -1) lastRowNumber = thisRow; if (thisColumn > -1) lastColumnNumber = thisColumn; // 行结束时的操作 if (record instanceof LastCellOfRowDummyRecord) { if (minColumns > 0) { // 列值重新置空 if (lastColumnNumber == -1) { lastColumnNumber = 0; } } lastColumnNumber = -1; // 每行结束时, 调用getRows() 方法 if(isWriteRow(rowlist)){ //rowReader.getRows(sheetIndex, curRow, rowlist); r = new ArrayList<String>(); r.addAll(rowlist); templist.add(r); r = null; } // 清空容器 rowlist.clear(); } } private boolean isWriteRow(List list){ boolean flag = false; for (int i = 0; i < list.size(); i++) { if (list.get(i)!= null && !list.get(i).equals("")) { flag = true; break; } } if (allowNullRow) { if (!flag) { flag = true;//允许一次空行 allowNullRow = false; } } return flag; }}
版权声明:
本站文章均来自互联网搜集,如有侵犯您的权益,请联系我们删除,谢谢。
标签: #apache poi读取excel