关于在POI以SAX方式解析,会导出拼音(音标)的问题解决
生活随笔
收集整理的這篇文章主要介紹了
关于在POI以SAX方式解析,会导出拼音(音标)的问题解决
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
參考文章:
POI以SAX方式解析Excel2007大文件(包含空單元格的處理)?? ? ? ?
這個片文章中,POI以SAX方式來讀取excel,解決讀大文件的問題,但是文章中創建ReadOnlySharedStringsTable
對象時,使用的是ReadOnlySharedStringsTable strings = new ReadOnlySharedStringsTable(opcPackage)
會使得讀取音標串也被讀取出來。這個解決方案是使用另外的構造方法,
public?ReadOnlySharedStringsTable(OPCPackage?pkg,boolean?includePhoneticRuns)throws java.io.IOException,org.xml.sax.SAXExceptionpkg?- The?OPCPackage?to use as basis for the shared-strings table.
includePhoneticRuns?- whether or not to concatenate phoneticRuns onto the shared string
ReadOnlySharedStringsTable strings = new ReadOnlySharedStringsTable(opcPackage,false);
import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Date; import java.util.List;import javax.xml.parsers.ParserConfigurationException; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory;import org.apache.poi.hssf.eventusermodel.EventWorkbookBuilder.SheetRecordCollectingListener; import org.apache.poi.hssf.eventusermodel.FormatTrackingHSSFListener; import org.apache.poi.hssf.eventusermodel.HSSFEventFactory; import org.apache.poi.hssf.eventusermodel.HSSFListener; import org.apache.poi.hssf.eventusermodel.HSSFRequest; import org.apache.poi.hssf.eventusermodel.MissingRecordAwareHSSFListener; import org.apache.poi.hssf.eventusermodel.dummyrecord.LastCellOfRowDummyRecord; import org.apache.poi.hssf.eventusermodel.dummyrecord.MissingCellDummyRecord; import org.apache.poi.hssf.model.HSSFFormulaParser; import org.apache.poi.hssf.record.BOFRecord; import org.apache.poi.hssf.record.BlankRecord; import org.apache.poi.hssf.record.BoolErrRecord; import org.apache.poi.hssf.record.BoundSheetRecord; import org.apache.poi.hssf.record.FormulaRecord; import org.apache.poi.hssf.record.LabelRecord; import org.apache.poi.hssf.record.LabelSSTRecord; import org.apache.poi.hssf.record.NumberRecord; import org.apache.poi.hssf.record.Record; import org.apache.poi.hssf.record.SSTRecord; import org.apache.poi.hssf.record.StringRecord; import org.apache.poi.hssf.usermodel.HSSFDateUtil; import org.apache.poi.hssf.usermodel.HSSFWorkbook; import org.apache.poi.openxml4j.exceptions.OpenXML4JException; import org.apache.poi.openxml4j.opc.OPCPackage; import org.apache.poi.openxml4j.opc.PackageAccess; import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.ss.usermodel.BuiltinFormats; import org.apache.poi.ss.usermodel.DataFormatter; import org.apache.poi.xssf.eventusermodel.ReadOnlySharedStringsTable; import org.apache.poi.xssf.eventusermodel.XSSFReader; import org.apache.poi.xssf.model.StylesTable; import org.apache.poi.xssf.usermodel.XSSFCellStyle; import org.apache.poi.xssf.usermodel.XSSFRichTextString; import org.xml.sax.Attributes; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.XMLReader; import org.xml.sax.helpers.DefaultHandler;public class ExcelEventParser {enum xssfDataType {BOOL, ERROR, FORMULA, INLINESTR, SSTINDEX, NUMBER,}class XLSReader implements HSSFListener {//POIFS文件流private POIFSFileSystem fs;//當前行數private int lastRowNumber;//當前列數private int lastColumnNumber;//是否計算公式的值private boolean outputFormulaValues = true;private SheetRecordCollectingListener workbookBuildingListener;private HSSFWorkbook stubWorkbook;//共享字符集private SSTRecord sstRecord;private FormatTrackingHSSFListener formatListener;private int sheetIndex = -1;private List<Record> boundSheetRecords = new ArrayList<Record>();//公式單元格結果所在行private int nextRow;//公式單元格結果所在列private int nextColumn;//公式單元格是否存在計算值private boolean outputNextStringRecord;//當前行數private int curRow = 0;//當前最大列數private int maxRef = 0;//空單元格使用的填充字符private String NULL = "-";private List<String> row = new ArrayList<String>();private List<List<String>> rows = new ArrayList<List<String>>();/*** 讀取Excel2003文件* @param filePath 文件路徑* @return* @throws IOException*/public List<List<String>> process(String filePath) throws IOException {//使用POIFS讀取文件流this.fs = new POIFSFileSystem(new FileInputStream(filePath));MissingRecordAwareHSSFListener listener = new MissingRecordAwareHSSFListener(this);formatListener = new FormatTrackingHSSFListener(listener);HSSFEventFactory factory = new HSSFEventFactory();HSSFRequest request = new HSSFRequest();if (outputFormulaValues) {//添加全部監聽器request.addListenerForAllRecords(formatListener);} else {workbookBuildingListener = new SheetRecordCollectingListener(formatListener);request.addListenerForAllRecords(workbookBuildingListener);}//處理文件流factory.processWorkbookEvents(request, fs);return rows;}/*** 依次處理每個監聽的record* @param record* @return* @throws IOException*/public void processRecord(Record record) {//只處理第一個sheet,其他全部忽略if (sheetIndex > 0) {return;}if (maxRef > MAX_COLUMN) {return;}int thisRow = -1;int thisColumn = -1;String thisStr = null;String value = null;//根據record的sid匹配類型分別進行處理//可以在這里添加業務邏輯switch (record.getSid()) {// 記錄了sheetNamecase BoundSheetRecord.sid:boundSheetRecords.add(record);break;// Workbook、Sheet的開始case BOFRecord.sid:BOFRecord br = (BOFRecord) record;if (br.getType() == BOFRecord.TYPE_WORKSHEET) {if (workbookBuildingListener != null && stubWorkbook == null) {stubWorkbook = workbookBuildingListener.getStubHSSFWorkbook();}sheetIndex++;}break;//共享字符集case SSTRecord.sid:sstRecord = (SSTRecord) record;break;// 存在單元格樣式的空單元格case BlankRecord.sid:BlankRecord brec = (BlankRecord) record;thisRow = brec.getRow();thisColumn = brec.getColumn();thisStr = NULL;row.add(thisColumn, thisStr);break;// 布爾或錯誤單元格case BoolErrRecord.sid:BoolErrRecord berec = (BoolErrRecord) record;thisRow = berec.getRow();thisColumn = berec.getColumn();thisStr = berec.getBooleanValue() + "";row.add(thisColumn, thisStr);break;// 公式單元格case FormulaRecord.sid://調用方法計算單元格的值FormulaRecord frec = (FormulaRecord) record;thisRow = frec.getRow();thisColumn = frec.getColumn();if (outputFormulaValues) {if (Double.isNaN(frec.getValue())) {outputNextStringRecord = true;nextRow = frec.getRow();nextColumn = frec.getColumn();} else {thisStr = formatListener.formatNumberDateCell(frec);}} else {thisStr = '"' + HSSFFormulaParser.toFormulaString(stubWorkbook, frec.getParsedExpression()) + '"';}row.add(thisColumn, thisStr);break;// 公式的計算結果單元格case StringRecord.sid:if (outputNextStringRecord) {StringRecord srec = (StringRecord) record;thisStr = srec.getString();thisRow = nextRow;thisColumn = nextColumn;outputNextStringRecord = false;}break;// 文本單元格case LabelRecord.sid:LabelRecord lrec = (LabelRecord) record;curRow = thisRow = lrec.getRow();thisColumn = lrec.getColumn();value = lrec.getValue().trim();value = value.equals("") ? NULL : value;this.row.add(thisColumn, value);break;// 共用的文本單元格case LabelSSTRecord.sid:LabelSSTRecord lsrec = (LabelSSTRecord) record;curRow = thisRow = lsrec.getRow();thisColumn = lsrec.getColumn();if (sstRecord == null) {row.add(thisColumn, NULL);} else {value = sstRecord.getString(lsrec.getSSTIndex()).toString().trim();value = value.equals("") ? NULL : value;row.add(thisColumn, value);}break;// 數值單元格:數字單元格和日期單元格case NumberRecord.sid:NumberRecord numrec = (NumberRecord) record;curRow = thisRow = numrec.getRow();thisColumn = numrec.getColumn();//處理時間類型的數字value = formatListener.formatNumberDateCell(numrec).trim();value = value.equals("") ? NULL : value;row.add(thisColumn, value);break;default:break;}if (thisRow != -1 && thisRow != lastRowNumber) {lastColumnNumber = -1;}if (record instanceof MissingCellDummyRecord) {MissingCellDummyRecord mc = (MissingCellDummyRecord) record;curRow = thisRow = mc.getRow();thisColumn = mc.getColumn();row.add(thisColumn, NULL);}if (thisRow > -1)lastRowNumber = thisRow;if (thisColumn > -1)lastColumnNumber = thisColumn;//進入行結束處理if (record instanceof LastCellOfRowDummyRecord) {if (maxRef > row.size()) {int len = maxRef - row.size();for (int i = 0; i < len; i++) {lastColumnNumber++;row.add(lastColumnNumber, NULL);}}if (curRow == 0) {maxRef = row.size();}rows.add(row);lastColumnNumber = -1;row = new ArrayList<String>();}}}class XLSXReader extends DefaultHandler {//樣式數據private StylesTable stylesTable;//共享字符集private ReadOnlySharedStringsTable sharedStringsTable;//是否存在內聯字符private boolean vIsOpen;//記錄數據類型private xssfDataType nextDataType;private short formatIndex;private String formatString;//格式化類private final DataFormatter formatter;//當前所在列private int thisColumn = -1;//當前單元格值private StringBuffer value;//儲存一列的數據private List<String> record = new ArrayList<String>();//儲存全部數據private List<List<String>> rows = new ArrayList<List<String>>();//判斷當前行有沒有空單元格private boolean isCellNull = false;//前一個單元格和當前單元格private String preRef = null, ref = null;//避免同一個單元格重復計數private String Newcell = null;//計數當前行中的空格private int rowNull = 0;//當前單元格對應的序號private int curCol = 0;//當前行對應的序號private int curRow = 0;//空單元格使用的填充字符private String NULL = "-";/*** XLSXReader構造函數* @param styles 單元格樣式數據* @param strings 共享字符集*/public XLSXReader(StylesTable styles, ReadOnlySharedStringsTable strings) {this.stylesTable = styles;this.sharedStringsTable = strings;this.value = new StringBuffer();this.nextDataType = xssfDataType.NUMBER;this.formatter = new DataFormatter();record = new ArrayList<String>();rows.clear();}/*** 每當遇到開始標簽時進入* @param uri 命名空間路徑或空* @param localName 本地名稱或空* @param name 標簽名稱* @param attributes 標簽內容* @return* @throws SAXException*/public void startElement(String uri, String localName, String name, Attributes attributes) throws SAXException {if ("inlineStr".equals(name) || "v".equals(name)) {vIsOpen = true;value.setLength(0);//每當遇到<c>表示新的單元格 } else if ("c".equals(name)) {//記錄上一個單元格和當前單元格用于填充空格if (preRef == null) {preRef = attributes.getValue("r");} else {preRef = ref;}ref = attributes.getValue("r");String r = ref;int firstDigit = -1;for (int c = 0; c < r.length(); ++c) {if (Character.isDigit(r.charAt(c))) {firstDigit = c;break;}}//獲取當前列數thisColumn = nameToColumn(r.substring(0, firstDigit));//判斷是否超過最大列數if (thisColumn > MAX_COLUMN) {return;}//將單元格類型翻譯成設定名稱this.nextDataType = xssfDataType.NUMBER;this.formatIndex = -1;this.formatString = null;String cellTpye = attributes.getValue("t");String cellStyleStr = attributes.getValue("s");if ("b".equals(cellTpye))nextDataType = xssfDataType.BOOL;else if ("e".equals(cellTpye))nextDataType = xssfDataType.ERROR;else if ("inlineStr".equals(cellTpye))nextDataType = xssfDataType.INLINESTR;else if ("s".equals(cellTpye))nextDataType = xssfDataType.SSTINDEX;else if ("str".equals(cellTpye))nextDataType = xssfDataType.FORMULA;else if (cellStyleStr != null) {int styleIndex = Integer.parseInt(cellStyleStr);XSSFCellStyle style = stylesTable.getStyleAt(styleIndex);this.formatIndex = style.getDataFormat();this.formatString = style.getDataFormatString();if (this.formatString == null) {this.formatString = BuiltinFormats.getBuiltinFormat(this.formatIndex);}}}}/*** 每當遇到結束標簽時進入* @param uri 命名空間路徑或空* @param localName 本地名稱或空* @param name 標簽名稱* @return* @throws SAXException*/public void endElement(String uri, String localName, String name) throws SAXException {//當遇到</v>標簽時,表面存在一個需要處理的值if ("v".equals(name)) {if (thisColumn > MAX_COLUMN) {return;}String thisStr = NULL;//根據當前單元格和上一個單元格判斷需要補充的空單元格if (null != ref && !ref.equals(Newcell)) {if (!ref.equals(preRef)) {int num = Integer.parseInt(ref.replaceAll("[A-Z]", ""));int num_1 = Integer.parseInt(preRef.replaceAll("[A-Z]", ""));int len = 0;//取出兩個單元格所在行數,如果行數不同則進行換行,從A1開始填充if (num > num_1) {len = countNullCell(ref, "A1")+1;}else{len = countNullCell(ref, preRef);}//填充空單元格for (int i = 0; i < len; i++) {record.add(curCol, thisStr);curCol++;Newcell = ref;rowNull++;}}}//選擇合適的方式處理單元格switch (nextDataType) {case BOOL:char first = value.charAt(0);thisStr = first == '0' ? "否" : "是";break;case ERROR:thisStr = value.toString();break;case FORMULA:thisStr = value.toString();break;case INLINESTR:XSSFRichTextString rtsi = new XSSFRichTextString(value.toString());thisStr = rtsi.toString();break;case SSTINDEX:String sstIndex = value.toString();try {//查詢字符索引對應的字符值int idx = Integer.parseInt(sstIndex);XSSFRichTextString rtss = new XSSFRichTextString(sharedStringsTable.getEntryAt(idx));thisStr = rtss.toString();} catch (NumberFormatException ex) {System.out.println("未找到對應的SST索引 '" + sstIndex + "': " + ex.toString());}break;case NUMBER:String n = value.toString();//判斷該數字是不是時間if (HSSFDateUtil.isADateFormat(this.formatIndex, n)) {Double d = Double.parseDouble(n);Date date = HSSFDateUtil.getJavaDate(d);thisStr = formatDateToString(date);} else if (this.formatString != null)thisStr = formatter.formatRawCellContents(Double.parseDouble(n), this.formatIndex,this.formatString);elsethisStr = n;break;//其他特殊類型可以自己擴展default:thisStr = "(尚未支持的類型: " + nextDataType + ")";break;}if (thisStr == null || thisStr.equals(NULL)) {isCellNull = true;}//這里可以根據實際業務進行改造,以單元格為單位處理record.add(curCol, thisStr);curCol++;} else if ("row".equals(name)) {if (isCellNull == false && record.size() != rowNull) {if (MAX_COLUMN >= record.size()) {int len = MAX_COLUMN - record.size();for (int i = 0; i < len; i++) {record.add(curCol, NULL);curCol++;}}//這里可以根據實際業務進行改造,以行為單位處理rows.add(curRow, record);isCellNull = false;//這里必須新建對象,不能為了節約清空之前的對象record = new ArrayList<String>();curRow++;curCol = 0;rowNull = 0;}}}/*** 計算兩個單元格中間的空單元格數量* <p>Excel2007文件最大行數為1048576,最大列數為16284,列名為XFD</P>* @param ref 當前單元格名稱* @param preRef 之前一個單元格名稱* @return*/private int countNullCell(String ref, String preRef) {String xfd = ref.replaceAll("\\d+", "");String xfd_1 = preRef.replaceAll("\\d+", "");xfd = fillChar(xfd, 3, '@', true);xfd_1 = fillChar(xfd_1, 3, '@', true);char[] letter = xfd.toCharArray();char[] letter_1 = xfd_1.toCharArray();int res = (letter[0] - letter_1[0]) * 26 * 26 + (letter[1] - letter_1[1]) * 26 + (letter[2] - letter_1[2]);return res - 1;}/*** 用特定字符填充字符串到指定長度* @param str 原字符串* @param len 目標串的長度* @param let 填充使用的字符* @param isPre 是否在前面填充字符* @return*/private String fillChar(String str, int len, char let, boolean isPre) {int len_1 = str.length();if (len_1 < len) {if (isPre) {for (int i = 0; i < (len - len_1); i++) {str = let + str;}} else {for (int i = 0; i < (len - len_1); i++) {str = str + let;}}}return str;}/*** 每遇到一個標簽調用,用于記錄標簽類型* @param ch 文件流字符數組* @param start 開始位置* @param length 讀取長度* @return* @throws SAXException*/@Overridepublic void characters(char[] ch, int start, int length) throws SAXException {if (vIsOpen)value.append(ch, start, length);}/*** 根據單元格名稱計算列數* @param name 單元格名稱(如:A1)* @return*/private int nameToColumn(String name) {int column = -1;for (int i = 0; i < name.length(); ++i) {int c = name.charAt(i);column = (column + 1) * 26 + c - 'A';}return column;}/*** 將日期類型轉變為String格式* @param date 日期* @return*/private String formatDateToString(Date date) {SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");return sdf.format(date);}}//事先定義的最大列數private int MAX_COLUMN = 7;private static String XLS = ".xls";private static String XLSX = ".xlsx";public ExcelEventParser() {}/*** 構造XLSXReader讀取Excel內容* @param styles 樣式信息* @param strings 共享字符集* @param sheetInputStream sheet輸入流* @return* @throws IOException* @throws ParserConfigurationException* @throws SAXException*/private List<List<String>> XLSXProcessSheet(StylesTable styles, ReadOnlySharedStringsTable strings,InputStream sheetInputStream) throws IOException, ParserConfigurationException, SAXException {//將輸入流包裝成XML源InputSource sheetSource = new InputSource(sheetInputStream);//構造基于標簽回調的XMLReaderSAXParserFactory saxFactory = SAXParserFactory.newInstance();SAXParser saxParser = saxFactory.newSAXParser();XMLReader sheetParser = saxParser.getXMLReader();//構造XLSXReaderXLSXReader handler = new XLSXReader(styles, strings);//添加管理者sheetParser.setContentHandler(handler);//讀入XML源sheetParser.parse(sheetSource);return handler.rows;}/*** 讀取Excel2007文件* @param filePath 文件路徑* @return* @throws IOException* @throws OpenXML4JException* @throws ParserConfigurationException* @throws SAXException*/private List<List<String>> XLSXprocess(String filePath)throws IOException, OpenXML4JException, ParserConfigurationException, SAXException {//將文件以壓縮包的形式讀入OPCPackage opcPackage = OPCPackage.open(filePath, PackageAccess.READ);ReadOnlySharedStringsTable strings = new ReadOnlySharedStringsTable(opcPackage,false);XSSFReader xssfReader = new XSSFReader(opcPackage);List<List<String>> sheet = null;//讀入樣式信息StylesTable styles = xssfReader.getStylesTable();//構造sheet迭代器XSSFReader.SheetIterator iter = (XSSFReader.SheetIterator) xssfReader.getSheetsData();//這里只讀入了第一個sheet,如果需要讀取多個可以遍歷這個迭代器if (iter.hasNext()) {InputStream stream = iter.next();//構造XLSXReader讀取內容sheet = XLSXProcessSheet(styles, strings, stream);//關閉流stream.close();}opcPackage.close();return sheet;}/*** 讀取Excel2003文件* @param filePath 文件路徑* @return* @throws IOException* @throws OpenXML4JException* @throws ParserConfigurationException* @throws SAXException*/private List<List<String>> XLSprocess(String filePath)throws IOException, OpenXML4JException, ParserConfigurationException, SAXException {XLSReader xls = new XLSReader();List<List<String>> sheet = xls.process(filePath);return sheet;}/*** 讀取Excel文件內容并返回包含文件內容的List* @param filePath 文件路徑* @return* @throws IOException* @throws OpenXML4JException* @throws ParserConfigurationException* @throws SAXException*/public static List<List<String>> readerExcel(String filePath)throws IOException, OpenXML4JException, ParserConfigurationException, SAXException {// 將文件數據儲存起來List<List<String>> sheet = null;// 獲取文件后綴名,判斷文件類型int index = filePath.lastIndexOf(".");String excelVer = filePath.substring(index, filePath.length());ExcelEventParser csvReader = new ExcelEventParser();if (XLS.equals(excelVer)) {sheet = csvReader.XLSprocess(filePath);} else if (XLSX.equals(excelVer)) {sheet = csvReader.XLSXprocess(filePath);}return sheet;}public static void main(String[] args) {try {// 要導入的文件地址(例:D:/新建 Microsoft Excel 工作表 .xlsx)String filePath = "D:/新建 Microsoft Excel 工作表 .xls";List<List<String>> sheet = readerExcel(filePath);for (List<String> row : sheet) {for (String cell : row) {System.out.print(cell + ",");}System.out.println();}} catch (Exception e) {e.printStackTrace();}} }?
總結
以上是生活随笔為你收集整理的关于在POI以SAX方式解析,会导出拼音(音标)的问题解决的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: Swagger如何屏蔽某些接口显示
- 下一篇: DDTW 导数动态时间规整算法