
package comtest;
/*
 * Required jars:
 * poi-3.0.2-FINAL-20080204.jar
 * poi-contrib-3.0.2-FINAL-20080204.jar
 * poi-scratchpad-3.0.2-FINAL-20080204.jar
 * poi-3.5-beta6-20090622.jar
 * geronimo-stax-api_1.0_spec-1.0.jar
 * ooxml-schemas-1.0.jar
 * openxml4j-bin-beta.jar
 * poi-ooxml-3.5-beta6-20090622.jar
 * xmlbeans-2.3.0.jar
 * dom4j-1.6.1.jar
 */
import java.io.ByteArrayInputStream;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.poi.POIOLE2TextExtractor;
import org.apache.poi.POITextExtractor;
import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.extractor.ExtractorFactory;
import org.apache.poi.hslf.extractor.PowerPointExtractor;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
import org.apache.poi.xssf.usermodel.XSSFCell;
import org.apache.poi.xssf.usermodel.XSSFRow;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.xmlbeans.XmlException;
public class WordAndExcelExtractor {
public static void main(String[] args) {
try {
// 读取word
String wordFile = "D:/1doc";
//String wordText2007 = WordAndExcelExtractorextractTextFromDOC2007(wordFile);
//Systemoutprintln("wordText2007=======" + wordText2007);
InputStream isword = new FileInputStream(wordFile);
WordExtractor wordExtractor = new WordExtractor(isword);
Systemoutprintln("word========" + wordExtractorgetText());
// 读取 Excel
InputStream is = new FileInputStream("D:/测试xls");
String excelText = WordAndExcelExtractorextractTextFromXLS(is);
Systemoutprintln("text2003==========" + excelText);
String excelFile = "D:/test2xlsx";
String excelText2007 = WordAndExcelExtractor
extractTextFromXLS2007(excelFile);
Systemoutprintln("excelText2007==========" + excelText2007);
// 读取 PPT
PowerPointExtractor ppe = new PowerPointExtractor("D:/testppt");
Systemoutprintln("ppt2003===============" + ppegetText());
// Systemoutprintln("###############################");
// Systemoutprintln(ppegetText(true, true, true, true));
//
// InputStream is = new FileInputStream("D:/testppt");
// PowerPointExtractor ppt2003 = new PowerPointExtractor(is);
// Systemoutprintln(ppt2003getText());
Systemoutprintln("");
XSLFPowerPointExtractor ppt = new XSLFPowerPointExtractor(
POIXMLDocumentopenPackage("D:/test2pptx"));
Systemoutprintln("ppt2007============================="
+ pptgetText());
/ 读取PDF /
InputStream in = new FileInputStream("D:/testpdf");
PDDocument pdfDocument = PDDocumentload(in);
if (pdfDocumentisEncrypted()) {
// 仅仅尝试使用默认密码打开加密的PDF
pdfDocumentdecrypt("");
}
PDFTextStripper stripper = null;
// 创建一个writer用来作来存储文件正文
StringWriter writer = new StringWriter();
if (stripper == null) {
stripper = new PDFTextStripper();
} else {
stripperresetEngine();
}
stripperwriteText(pdfDocument, writer);
String contents = writergetBuffer()toString();
Systemoutprintln("pdfd===" + contents);
/ 向Word中写入数据 /
byte[] a = contentsgetBytes();
ByteArrayInputStream bs = new ByteArrayInputStream(a);
POIFSFileSystem fs = new POIFSFileSystem();
// /////////////////////////////////
DirectoryEntry directory = fsgetRoot();
DocumentEntry de = directorycreateDocument("WordDocument", bs);
// 以上两句代码不能省略,否则输出的是乱码
FileOutputStream fos = new FileOutputStream("D:\\dddoc");
fswriteFilesystem(fos);
bsclose();
fosflush();
fosclose();
Systemoutprintln("写入成功");
} catch (Exception e) {
eprintStackTrace();
}
}
/
@Method: extractTextFromXLS
@Description: 从excel 2003档中提取纯文本
@param
@return String
@throws
/
@SuppressWarnings("deprecation")
private static String extractTextFromXLS(InputStream is) throws IOException {
StringBuffer content = new StringBuffer();
HSSFWorkbook workbook = new HSSFWorkbook(is); // 创建对Excel工作簿文件的引用
for (int numSheets = 0; numSheets < workbookgetNumberOfSheets(); numSheets++) {
if (null != workbookgetSheetAt(numSheets)) {
HSSFSheet aSheet = workbookgetSheetAt(numSheets); // 获得一个sheet
contentappend(aSheetgetSheetName());
contentappend("\r\n-----------------------\r\n");
for (int rowNumOfSheet = 0; rowNumOfSheet <= aSheet
getLastRowNum(); rowNumOfSheet++) {
if (null != aSheetgetRow(rowNumOfSheet)) {
HSSFRow aRow = aSheetgetRow(rowNumOfSheet); // 获得一行
for (short cellNumOfRow = 0; cellNumOfRow <= aRow
getLastCellNum(); cellNumOfRow++) {
if (null != aRowgetCell(cellNumOfRow)) {
HSSFCell aCell = aRowgetCell(cellNumOfRow); // 获得列值
if (aCellgetCellType() == HSSFCellCELL_TYPE_NUMERIC) {
contentappend(aCellgetNumericCellValue());
} else if (aCellgetCellType() == HSSFCellCELL_TYPE_BOOLEAN) {
contentappend(aCellgetBooleanCellValue());
} else {
contentappend(aCellgetStringCellValue());
}
contentappend("\t");
}
}
contentappend("\r\n");
}
}
}
}
return contenttoString();
}
/
@Method: extractTextFromXLS2007
@Description: 从excel 2007文档中提取纯文本
@param
@return String
@throws
/
private static String extractTextFromXLS2007(String fileName)
throws Exception {
StringBuffer content = new StringBuffer();
// 构造 XSSFWorkbook 对象,strPath 传入文件路径
XSSFWorkbook xwb = new XSSFWorkbook(fileName);
// 循环工作表Sheet
for (int numSheet = 0; numSheet < xwbgetNumberOfSheets(); numSheet++) {
XSSFSheet xSheet = xwbgetSheetAt(numSheet);
if (xSheet == null) {
continue;
}
// 循环行Row
for (int rowNum = 0; rowNum <= xSheetgetLastRowNum(); rowNum++) {
XSSFRow xRow = xSheetgetRow(rowNum);
if (xRow == null) {
continue;
}
// 循环列Cell
for (int cellNum = 0; cellNum <= xRowgetLastCellNum(); cellNum++) {
XSSFCell xCell = xRowgetCell(cellNum);
if (xCell == null) {
continue;
}
if (xCellgetCellType() == XSSFCellCELL_TYPE_BOOLEAN) {
contentappend(xCellgetBooleanCellValue());
} else if (xCellgetCellType() == XSSFCellCELL_TYPE_NUMERIC) {
contentappend(xCellgetNumericCellValue());
} else {
contentappend(xCellgetStringCellValue());
}
}
}
}
return contenttoString();
}
}
这是POI jar包的下载地址,我下载的是3.9版本的
http://poi.apache.org/download.html
欢迎分享,转载请注明来源:内存溢出
微信扫一扫
支付宝扫一扫
评论列表(0条)