java读取doc,pdf问题。_框架

PDFBox是一个开源的对pdf文件进行操作的库。将PDFBox-0.7.3.jar加入classpath，同时将FontBox1.0.jar加入classpath，否则会报错。

import javaioFileInputStream;

import javaioFileNotFoundException;

import javaioIOException;

import orgpdfboxpdfparserPDFParser;

import orgpdfboxpdmodelPDDocument;

import orgpdfboxutilPDFTextStripper;

public class PdfReader {

simply reader all the text from a pdf file

You have to deal with the format of the output text by yourself

2008-2-25

@param pdfFilePath file path

@return all text in the pdf file

public static String getTextFromPDF(String pdfFilePath)

{

String result = null;

FileInputStream is = null;

PDDocument document = null;

try {

is = new FileInputStream(pdfFilePath);

PDFParser parser = new PDFParser(is);

parserparse();

document = parsergetPDDocument();

PDFTextStripper stripper = new PDFTextStripper();

result = strippergetText(document);

} catch (FileNotFoundException e) {

// TODO Auto-generated catch block

eprintStackTrace();

} catch (IOException e) {

// TODO Auto-generated catch block

eprintStackTrace();

} finally {

if (is != null) {

try {

isclose();

} catch (IOException e) {

// TODO Auto-generated catch block

eprintStackTrace();

}

if (document != null) {

try {

documentclose();

} catch (IOException e) {

// TODO Auto-generated catch block

eprintStackTrace();

}

return result;

}

public static void main(String[] args)

{

String str=PdfReadergetTextFromPDF("C:\\Readpdf");

Systemoutprintln(str);

}

代码2：

import javaioFile;

import javaioFileOutputStream;

import javaioOutputStreamWriter;

import javaioWriter;

import javanetMalformedURLException;

import javanetURL;

import orgpdfboxpdmodelPDDocument;

import orgpdfboxutilPDFTextStripper;

public class PDFReader {

public void readFdf(String file) throws Exception {

boolean sort = false;

String pdfFile = file;

String textFile = null;

String encoding = "UTF-8";

int startPage = 1;

int endPage = IntegerMAX_VALUE;

Writer output = null;

PDDocument document = null;

try {

// 首先当作一个URL来装载文件，如果得到异常再从本地文件系统//去装载文件

URL url = new URL(pdfFile);

//注意参数已不是以前版本中的URL而是File。

document = PDDocumentload(pdfFile);

// 获取PDF的文件名

String fileName = urlgetFile();

// 以原来PDF的名称来命名新产生的txt文件

if (fileNamelength() > 4) {

File outputFile = new File(fileNamesubstring(0, fileName

length() - 4)

+ "txt");

textFile = outputFilegetName();

}

} catch (MalformedURLException e) {

// 如果作为URL装载得到异常则从文件系统装载

//注意参数已不是以前版本中的URL而是File。

document = PDDocumentload(pdfFile);

if (pdfFilelength() > 4) {

textFile = pdfFilesubstring(0, pdfFilelength() - 4)

+ "txt";

}

output = new OutputStreamWriter(new FileOutputStream(textFile),

encoding);

PDFTextStripper stripper = null;

stripper = new PDFTextStripper();

// 设置是否排序

strippersetSortByPosition(sort);

// 设置起始页

strippersetStartPage(startPage);

// 设置结束页

strippersetEndPage(endPage);

// 调用PDFTextStripper的writeText提取并输出文本

stripperwriteText(document, output);

} finally {

if (output != null) {

// 关闭输出流

outputclose();

}

if (document != null) {

// 关闭PDF Document

documentclose();

}

@param args

public static void main(String[] args) {

// TODO Auto-generated method stub

PDFReader pdfReader = new PDFReader();

try {

// 取得E盘下的SpringGuidepdf的内容

pdfReaderreadFdf("C:\\Readpdf");

} catch (Exception e) {

eprintStackTrace();

}

2、抽取支持中文的pdf文件－xpdf

xpdf是一个开源项目，我们可以调用他的本地方法来实现抽取中文pdf文件。

import java.io.*;

/**
 * Title: pdf extraction
 * Description: email:chris@matrix.org.cn
 * Company: Matrix.org.cn
 *
 * Runs the external xpdf "pdftotext" program and captures the text it
 * writes to standard output.
 *
 * @author chris
 * @version 1.0, who use this example pls remain the declare
 */
public class PdfWin {

    public PdfWin() {
    }

    /**
     * Converts a sample PDF with pdftotext and prints the extracted text.
     *
     * @param args unused
     * @throws Exception if the process cannot be started or its output
     *         cannot be read
     */
    public static void main(String args[]) throws Exception {
        String PATH_TO_XPDF = "C:\\Program Files\\xpdf\\pdftotext.exe";
        String filename = "c:\\a.pdf";
        // A trailing "-" is passed as the output file argument.
        String[] cmd = new String[] { PATH_TO_XPDF, "-enc", "UTF-8", "-q", filename, "-" };
        Process p = Runtime.getRuntime().exec(cmd);
        BufferedInputStream bis = new BufferedInputStream(p.getInputStream());
        InputStreamReader reader = new InputStreamReader(bis, "UTF-8");
        String ts;
        try {
            ts = readAll(reader);
        } finally {
            reader.close();
        }
        System.out.println("the length is" + ts.length());
        System.out.println("the str is" + ts);
    }

    /**
     * Reads every character from the reader until end of stream.
     * <p>
     * The text is accumulated chunk by chunk; building the result from the
     * read buffer itself would keep only the last chunk plus stale characters
     * left over from earlier reads.
     *
     * @param reader source of characters; not closed by this method
     * @return everything that was read, possibly empty
     * @throws IOException if reading fails
     */
    static String readAll(Reader reader) throws IOException {
        StringWriter out = new StringWriter();
        char[] buf = new char[10000];
        int len;
        while ((len = reader.read(buf)) >= 0) {
            out.write(buf, 0, len);
        }
        return out.toString();
    }
}

为程序的注析用的：

static OPCPackage open(java.lang.String path)

打开具有读／写权限的包。

访问在读写API文档中找到的文件。

＊／

Publicvoid进程（字符串文件名）抛出异常｛

PKG＝OPCPackage。Open（filename）；

XSSFReaderr＝新XSSFReader（PKG）；

R．gutierrezetSharedStringsTableSST＝（）；

XMLReader解析器＝fetchSheetParser（SST）；

迭代器< InputStream > r gutierrez etSheetsData sheets = ();

而（表。HasNext（））｛

克鲁＝0；

SheetIndex＋＋；

InputStream工作表＝工作表。下一个（）；

InputSourcesheetSource＝新的InputSource（sheet）；

解析器。解析（sheetSource）；

表。关闭（）；

扩展资料：

注意事项：

函数中的这些语句用于做一些有意义的事情——通常是处理文本、控制输入或计算值。通过在程序代码中引入函数名和所需的参数，可以在程序中执行（或调用）函数。

类似的过程，但函数通常有一个返回值。它们都可以在自己的结构中调用自己，称为递归。

大多数编程语言在它们的函数构建方法中都包含函数关键字（或保留字）。

package comtest;

需要的jar包：

poi-3.0.2-FINAL-20080204.jar

poi-contrib-3.0.2-FINAL-20080204.jar

poi-scratchpad-3.0.2-FINAL-20080204.jar

poi-3.5-beta6-20090622.jar

geronimo-stax-api_1.0_spec-1.0.jar

ooxml-schemas-1.0.jar

openxml4j-bin-beta.jar

poi-ooxml-3.5-beta6-20090622.jar

xmlbeans-2.3.0.jar

dom4j-1.6.1.jar

import javaioByteArrayInputStream;

import javaioFileInputStream;

import javaioFileOutputStream;

import javaioIOException;

import javaioInputStream;

import javaioStringWriter;

import orgapachepdfboxpdmodelPDDocument;

import orgapachepdfboxutilPDFTextStripper;

import orgapachepoiPOIOLE2TextExtractor;

import orgapachepoiPOITextExtractor;

import orgapachepoiPOIXMLDocument;

import orgapachepoiPOIXMLTextExtractor;

import orgapachepoiextractorExtractorFactory;

import orgapachepoihssfusermodelHSSFCell;

import orgapachepoihssfusermodelHSSFRow;

import orgapachepoihssfusermodelHSSFSheet;

import orgapachepoihssfusermodelHSSFWorkbook;

import orgapachepoihwpfextractorWordExtractor;

import orgapachepoiopenxml4jexceptionsOpenXML4JException;

import orgapachepoiopenxml4jopcOPCPackage;

import orgapachepoipoifsfilesystemDirectoryEntry;

import orgapachepoipoifsfilesystemDocumentEntry;

import orgapachepoipoifsfilesystemPOIFSFileSystem;

import orgapachepoixslfextractorXSLFPowerPointExtractor;

import orgapachepoixssfusermodelXSSFCell;

import orgapachepoixssfusermodelXSSFRow;

import orgapachepoixssfusermodelXSSFSheet;

import orgapachepoixssfusermodelXSSFWorkbook;

import orgapachepoixwpfextractorXWPFWordExtractor;

import orgapachexmlbeansXmlException;

public class WordAndExcelExtractor {

public static void main(String[] args) {

try {

// 读取word

String wordFile = "D:/1doc";

//String wordText2007 = WordAndExcelExtractorextractTextFromDOC2007(wordFile);

//Systemoutprintln("wordText2007=======" + wordText2007);

InputStream isword = new FileInputStream(wordFile);

WordExtractor wordExtractor = new WordExtractor(isword);

Systemoutprintln("word========" + wordExtractorgetText());

// 读取 Excel

InputStream is = new FileInputStream("D:/测试xls");

String excelText = WordAndExcelExtractorextractTextFromXLS(is);

Systemoutprintln("text2003==========" + excelText);

String excelFile = "D:/test2xlsx";

String excelText2007 = WordAndExcelExtractor

extractTextFromXLS2007(excelFile);

Systemoutprintln("excelText2007==========" + excelText2007);

// 读取 PPT

PowerPointExtractor ppe = new PowerPointExtractor("D:/testppt");

Systemoutprintln("ppt2003===============" + ppegetText());

// Systemoutprintln("###############################");

// Systemoutprintln(ppegetText(true, true, true, true));

// InputStream is = new FileInputStream("D:/testppt");

// PowerPointExtractor ppt2003 = new PowerPointExtractor(is);

// Systemoutprintln(ppt2003getText());

Systemoutprintln("");

XSLFPowerPointExtractor ppt = new XSLFPowerPointExtractor(

POIXMLDocumentopenPackage("D:/test2pptx"));

Systemoutprintln("ppt2007============================="

+ pptgetText());

/ 读取PDF /

InputStream in = new FileInputStream("D:/testpdf");

PDDocument pdfDocument = PDDocumentload(in);

if (pdfDocumentisEncrypted()) {

// 仅仅尝试使用默认密码打开加密的PDF

pdfDocumentdecrypt("");

}

PDFTextStripper stripper = null;

// 创建一个writer用来作来存储文件正文

StringWriter writer = new StringWriter();

if (stripper == null) {

stripper = new PDFTextStripper();

} else {

stripperresetEngine();

}

stripperwriteText(pdfDocument, writer);

String contents = writergetBuffer()toString();

Systemoutprintln("pdfd===" + contents);

/ 向Word中写入数据 /

byte[] a = contentsgetBytes();

ByteArrayInputStream bs = new ByteArrayInputStream(a);

POIFSFileSystem fs = new POIFSFileSystem();

// /////////////////////////////////

DirectoryEntry directory = fsgetRoot();

DocumentEntry de = directorycreateDocument("WordDocument", bs);

// 以上两句代码不能省略，否则输出的是乱码

FileOutputStream fos = new FileOutputStream("D:\\dddoc");

fswriteFilesystem(fos);

bsclose();

fosflush();

fosclose();

Systemoutprintln("写入成功");

} catch (Exception e) {

eprintStackTrace();

}

@Method: extractTextFromXLS

@Description: 从excel 2003档中提取纯文本

@param

@return String

@throws

@SuppressWarnings("deprecation")

private static String extractTextFromXLS(InputStream is) throws IOException {

StringBuffer content = new StringBuffer();

HSSFWorkbook workbook = new HSSFWorkbook(is); // 创建对Excel工作簿文件的引用

for (int numSheets = 0; numSheets < workbookgetNumberOfSheets(); numSheets++) {

if (null != workbookgetSheetAt(numSheets)) {

HSSFSheet aSheet = workbookgetSheetAt(numSheets); // 获得一个sheet

contentappend(aSheetgetSheetName());

contentappend("\r\n-----------------------\r\n");

for (int rowNumOfSheet = 0; rowNumOfSheet <= aSheet

getLastRowNum(); rowNumOfSheet++) {

if (null != aSheetgetRow(rowNumOfSheet)) {

HSSFRow aRow = aSheetgetRow(rowNumOfSheet); // 获得一行

for (short cellNumOfRow = 0; cellNumOfRow <= aRow

getLastCellNum(); cellNumOfRow++) {

if (null != aRowgetCell(cellNumOfRow)) {

HSSFCell aCell = aRowgetCell(cellNumOfRow); // 获得列值

if (aCellgetCellType() == HSSFCellCELL_TYPE_NUMERIC) {

contentappend(aCellgetNumericCellValue());

} else if (aCellgetCellType() == HSSFCellCELL_TYPE_BOOLEAN) {

contentappend(aCellgetBooleanCellValue());

} else {

contentappend(aCellgetStringCellValue());

}

contentappend("\t");

}

contentappend("\r\n");

}

return contenttoString();

}

@Method: extractTextFromXLS2007

@Description: 从excel 2007文档中提取纯文本

@param

@return String

@throws

private static String extractTextFromXLS2007(String fileName)

throws Exception {

StringBuffer content = new StringBuffer();

// 构造 XSSFWorkbook 对象，strPath 传入文件路径

XSSFWorkbook xwb = new XSSFWorkbook(fileName);

// 循环工作表Sheet

for (int numSheet = 0; numSheet < xwbgetNumberOfSheets(); numSheet++) {

XSSFSheet xSheet = xwbgetSheetAt(numSheet);

if (xSheet == null) {

continue;

}

// 循环行Row

for (int rowNum = 0; rowNum <= xSheetgetLastRowNum(); rowNum++) {

XSSFRow xRow = xSheetgetRow(rowNum);

if (xRow == null) {

continue;

}

// 循环列Cell

for (int cellNum = 0; cellNum <= xRowgetLastCellNum(); cellNum++) {

XSSFCell xCell = xRowgetCell(cellNum);

if (xCell == null) {

continue;

}

if (xCellgetCellType() == XSSFCellCELL_TYPE_BOOLEAN) {

contentappend(xCellgetBooleanCellValue());

} else if (xCellgetCellType() == XSSFCellCELL_TYPE_NUMERIC) {

contentappend(xCellgetNumericCellValue());

} else {

contentappend(xCellgetStringCellValue());

}

return contenttoString();

}

这是POI jar包的下载地址，我下载的是3.9版本的

// Open an Excel file whose format is not known in advance: try the OLE2
// (.xls) container first and fall back to the OOXML (.xlsx) package when POI
// reports that the file is actually Office XML.
// NOTE(review): the listing was truncated after OPCPackage.open(f); the rest
// follows the Apache POI documentation - confirm against the original.
File f = new File("/path/to/excel/file");
Workbook wb = null;
NPOIFSFileSystem npoifs = null;
OPCPackage pkg = null;
try {
    npoifs = new NPOIFSFileSystem(f);
    wb = WorkbookFactory.create(npoifs);
} catch (OfficeXmlFileException ofe) {
    pkg = OPCPackage.open(f);
    wb = WorkbookFactory.create(pkg);
}

// ... use wb here ...

// Release whichever container was actually opened.
if (npoifs != null) {
    npoifs.close();
}
if (pkg != null) {
    pkg.close();
}

以上就是关于java读取doc,pdf问题。全部的内容，包括:java读取doc,pdf问题。、在poi 中OPCPackage.open() 是什么函数OPCPackage这个类用来干嘛、谁能给我一个详细的Java通过Apache POI导出Excel方法，最好能给完整代码等相关内容解答，如果想了解更多相关内容，可以关注我们，你们的支持是我们更新的动力！

欢迎分享，转载请注明来源：内存溢出

原文地址: http://outofmemory.cn/web/9346782.html

java读取doc,pdf问题。

发表评论

评论列表（0条）