PDFBox是一个开源的对pdf文件进行操作的库。将PDFBox-0.7.3.jar加入classpath,同时将FontBox1.0.jar加入classpath,否则会报错。
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.util.PDFTextStripper;
public class PdfReader {
/
simply reader all the text from a pdf file
You have to deal with the format of the output text by yourself
2008-2-25
@param pdfFilePath file path
@return all text in the pdf file
/
public static String getTextFromPDF(String pdfFilePath)
{
String result = null;
FileInputStream is = null;
PDDocument document = null;
try {
is = new FileInputStream(pdfFilePath);
PDFParser parser = new PDFParser(is);
parserparse();
document = parsergetPDDocument();
PDFTextStripper stripper = new PDFTextStripper();
result = strippergetText(document);
} catch (FileNotFoundException e) {
// TODO Auto-generated catch block
eprintStackTrace();
} catch (IOException e) {
// TODO Auto-generated catch block
eprintStackTrace();
} finally {
if (is != null) {
try {
isclose();
} catch (IOException e) {
// TODO Auto-generated catch block
eprintStackTrace();
}
}
if (document != null) {
try {
documentclose();
} catch (IOException e) {
// TODO Auto-generated catch block
eprintStackTrace();
}
}
}
return result;
}
public static void main(String[] args)
{
String str=PdfReadergetTextFromPDF("C:\\Readpdf");
Systemoutprintln(str);
}
}
代码2:
import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.net.MalformedURLException;
import java.net.URL;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.util.PDFTextStripper;
public class PDFReader {
public void readFdf(String file) throws Exception {
boolean sort = false;
String pdfFile = file;
String textFile = null;
String encoding = "UTF-8";
int startPage = 1;
int endPage = IntegerMAX_VALUE;
Writer output = null;
PDDocument document = null;
try {
try {
// 首先当作一个URL来装载文件,如果得到异常再从本地文件系统//去装载文件
URL url = new URL(pdfFile);
//注意参数已不是以前版本中的URL而是File。
document = PDDocumentload(pdfFile);
// 获取PDF的文件名
String fileName = urlgetFile();
// 以原来PDF的名称来命名新产生的txt文件
if (fileNamelength() > 4) {
File outputFile = new File(fileNamesubstring(0, fileName
length() - 4)
+ "txt");
textFile = outputFilegetName();
}
} catch (MalformedURLException e) {
// 如果作为URL装载得到异常则从文件系统装载
//注意参数已不是以前版本中的URL而是File。
document = PDDocumentload(pdfFile);
if (pdfFilelength() > 4) {
textFile = pdfFilesubstring(0, pdfFilelength() - 4)
+ "txt";
}
}
output = new OutputStreamWriter(new FileOutputStream(textFile),
encoding);
PDFTextStripper stripper = null;
stripper = new PDFTextStripper();
// 设置是否排序
strippersetSortByPosition(sort);
// 设置起始页
strippersetStartPage(startPage);
// 设置结束页
strippersetEndPage(endPage);
// 调用PDFTextStripper的writeText提取并输出文本
stripperwriteText(document, output);
} finally {
if (output != null) {
// 关闭输出流
outputclose();
}
if (document != null) {
// 关闭PDF Document
documentclose();
}
}
}
/
@param args
/
public static void main(String[] args) {
// TODO Auto-generated method stub
PDFReader pdfReader = new PDFReader();
try {
// 取得E盘下的SpringGuidepdf的内容
pdfReaderreadFdf("C:\\Readpdf");
} catch (Exception e) {
eprintStackTrace();
}
}
}
2、抽取支持中文的pdf文件-xpdf
xpdf是一个开源项目,我们可以调用它的本地程序(pdftotext)来实现抽取中文pdf文件。
import java.io.*;
/**
 * <p>Title: pdf extraction</p>
 * <p>Description: email:chris@matrix.org.cn</p>
 * <p>Copyright: Matrix Copyright (c) 2003</p>
 * <p>Company: Matrix.org.cn</p>
 * @author chris
 * @version 1.0, anyone who uses this example please retain this declaration
 */
public class PdfWin {
    public PdfWin() {
    }

    /**
     * Runs the xpdf {@code pdftotext} executable on a hard-coded PDF, reads the
     * UTF-8 text it writes to standard output, and prints that text.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if the process cannot be started or its output cannot be read
     */
    public static void main(String args[]) throws Exception
    {
        String PATH_TO_XPDF = "C:\\Program Files\\xpdf\\pdftotext.exe";
        String filename = "c:\\a.pdf";
        // A trailing "-" makes pdftotext write to stdout instead of a file.
        String[] cmd = new String[] { PATH_TO_XPDF, "-enc", "UTF-8", "-q", filename, "-"};
        Process p = Runtime.getRuntime().exec(cmd);
        BufferedInputStream bis = new BufferedInputStream(p.getInputStream());
        InputStreamReader reader = new InputStreamReader(bis, "UTF-8");
        StringWriter out = new StringWriter();
        char[] buf = new char[10000];
        int len;
        try {
            while ((len = reader.read(buf)) >= 0) {
                // Accumulate every chunk; the buffer itself only ever holds
                // the most recent read (plus stale characters beyond len).
                out.write(buf, 0, len);
                System.out.println("the length is" + len);
            }
        } finally {
            reader.close();
        }
        String ts = out.toString();
        System.out.println("the str is" + ts);
    }
}
为程序的注析用的:
/*
 * static OPCPackage open(java.lang.String path)
 * 以读/写权限打开一个包(package)。
 * 参数 path 为要打开的文档的路径。详见 POI 的 API 文档。
 */
Publicvoid进程(字符串文件名)抛出异常{
PKG=OPCPackage。Open(filename);
XSSFReaderr=新XSSFReader(PKG);
R.gutierrezetSharedStringsTableSST=();
XMLReader解析器=fetchSheetParser(SST);
迭代器< InputStream > r gutierrez etSheetsData sheets = ();
而(表。HasNext()){
克鲁=0;
SheetIndex++;
InputStream工作表=工作表。下一个();
InputSourcesheetSource=新的InputSource(sheet);
解析器。解析(sheetSource);
表。关闭();
扩展资料:
注意事项:
函数中的这些语句用于做一些有意义的事情——通常是处理文本、控制输入或计算值。通过在程序代码中引入函数名和所需的参数,可以在程序中执行(或调用)函数。
类似的过程,但函数通常有一个返回值。它们都可以在自己的结构中调用自己,称为递归。
大多数编程语言在它们的函数构建方法中都包含函数关键字(或保留字)。
package com.test;
/*
 需要的jar包:
 poi-3.0.2-FINAL-20080204.jar
 poi-contrib-3.0.2-FINAL-20080204.jar
 poi-scratchpad-3.0.2-FINAL-20080204.jar
 poi-3.5-beta6-20090622.jar
 geronimo-stax-api_1.0_spec-1.0.jar
 ooxml-schemas-1.0.jar
 openxml4j-bin-beta.jar
 poi-ooxml-3.5-beta6-20090622.jar
 xmlbeans-2.3.0.jar
 dom4j-1.6.1.jar
 */
import java.io.ByteArrayInputStream;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.poi.POIOLE2TextExtractor;
import org.apache.poi.POITextExtractor;
import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.extractor.ExtractorFactory;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
import org.apache.poi.xssf.usermodel.XSSFCell;
import org.apache.poi.xssf.usermodel.XSSFRow;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.xmlbeans.XmlException;
public class WordAndExcelExtractor {
public static void main(String[] args) {
try {
// 读取word
String wordFile = "D:/1doc";
//String wordText2007 = WordAndExcelExtractorextractTextFromDOC2007(wordFile);
//Systemoutprintln("wordText2007=======" + wordText2007);
InputStream isword = new FileInputStream(wordFile);
WordExtractor wordExtractor = new WordExtractor(isword);
Systemoutprintln("word========" + wordExtractorgetText());
// 读取 Excel
InputStream is = new FileInputStream("D:/测试xls");
String excelText = WordAndExcelExtractorextractTextFromXLS(is);
Systemoutprintln("text2003==========" + excelText);
String excelFile = "D:/test2xlsx";
String excelText2007 = WordAndExcelExtractor
extractTextFromXLS2007(excelFile);
Systemoutprintln("excelText2007==========" + excelText2007);
// 读取 PPT
PowerPointExtractor ppe = new PowerPointExtractor("D:/testppt");
Systemoutprintln("ppt2003===============" + ppegetText());
// Systemoutprintln("###############################");
// Systemoutprintln(ppegetText(true, true, true, true));
//
// InputStream is = new FileInputStream("D:/testppt");
// PowerPointExtractor ppt2003 = new PowerPointExtractor(is);
// Systemoutprintln(ppt2003getText());
Systemoutprintln("");
XSLFPowerPointExtractor ppt = new XSLFPowerPointExtractor(
POIXMLDocumentopenPackage("D:/test2pptx"));
Systemoutprintln("ppt2007============================="
+ pptgetText());
/ 读取PDF /
InputStream in = new FileInputStream("D:/testpdf");
PDDocument pdfDocument = PDDocumentload(in);
if (pdfDocumentisEncrypted()) {
// 仅仅尝试使用默认密码打开加密的PDF
pdfDocumentdecrypt("");
}
PDFTextStripper stripper = null;
// 创建一个writer用来作来存储文件正文
StringWriter writer = new StringWriter();
if (stripper == null) {
stripper = new PDFTextStripper();
} else {
stripperresetEngine();
}
stripperwriteText(pdfDocument, writer);
String contents = writergetBuffer()toString();
Systemoutprintln("pdfd===" + contents);
/ 向Word中写入数据 /
byte[] a = contentsgetBytes();
ByteArrayInputStream bs = new ByteArrayInputStream(a);
POIFSFileSystem fs = new POIFSFileSystem();
// /////////////////////////////////
DirectoryEntry directory = fsgetRoot();
DocumentEntry de = directorycreateDocument("WordDocument", bs);
// 以上两句代码不能省略,否则输出的是乱码
FileOutputStream fos = new FileOutputStream("D:\\dddoc");
fswriteFilesystem(fos);
bsclose();
fosflush();
fosclose();
Systemoutprintln("写入成功");
} catch (Exception e) {
eprintStackTrace();
}
}
/
@Method: extractTextFromXLS
@Description: 从excel 2003档中提取纯文本
@param
@return String
@throws
/
@SuppressWarnings("deprecation")
private static String extractTextFromXLS(InputStream is) throws IOException {
StringBuffer content = new StringBuffer();
HSSFWorkbook workbook = new HSSFWorkbook(is); // 创建对Excel工作簿文件的引用
for (int numSheets = 0; numSheets < workbookgetNumberOfSheets(); numSheets++) {
if (null != workbookgetSheetAt(numSheets)) {
HSSFSheet aSheet = workbookgetSheetAt(numSheets); // 获得一个sheet
contentappend(aSheetgetSheetName());
contentappend("\r\n-----------------------\r\n");
for (int rowNumOfSheet = 0; rowNumOfSheet <= aSheet
getLastRowNum(); rowNumOfSheet++) {
if (null != aSheetgetRow(rowNumOfSheet)) {
HSSFRow aRow = aSheetgetRow(rowNumOfSheet); // 获得一行
for (short cellNumOfRow = 0; cellNumOfRow <= aRow
getLastCellNum(); cellNumOfRow++) {
if (null != aRowgetCell(cellNumOfRow)) {
HSSFCell aCell = aRowgetCell(cellNumOfRow); // 获得列值
if (aCellgetCellType() == HSSFCellCELL_TYPE_NUMERIC) {
contentappend(aCellgetNumericCellValue());
} else if (aCellgetCellType() == HSSFCellCELL_TYPE_BOOLEAN) {
contentappend(aCellgetBooleanCellValue());
} else {
contentappend(aCellgetStringCellValue());
}
contentappend("\t");
}
}
contentappend("\r\n");
}
}
}
}
return contenttoString();
}
/
@Method: extractTextFromXLS2007
@Description: 从excel 2007文档中提取纯文本
@param
@return String
@throws
/
private static String extractTextFromXLS2007(String fileName)
throws Exception {
StringBuffer content = new StringBuffer();
// 构造 XSSFWorkbook 对象,strPath 传入文件路径
XSSFWorkbook xwb = new XSSFWorkbook(fileName);
// 循环工作表Sheet
for (int numSheet = 0; numSheet < xwbgetNumberOfSheets(); numSheet++) {
XSSFSheet xSheet = xwbgetSheetAt(numSheet);
if (xSheet == null) {
continue;
}
// 循环行Row
for (int rowNum = 0; rowNum <= xSheetgetLastRowNum(); rowNum++) {
XSSFRow xRow = xSheetgetRow(rowNum);
if (xRow == null) {
continue;
}
// 循环列Cell
for (int cellNum = 0; cellNum <= xRowgetLastCellNum(); cellNum++) {
XSSFCell xCell = xRowgetCell(cellNum);
if (xCell == null) {
continue;
}
if (xCellgetCellType() == XSSFCellCELL_TYPE_BOOLEAN) {
contentappend(xCellgetBooleanCellValue());
} else if (xCellgetCellType() == XSSFCellCELL_TYPE_NUMERIC) {
contentappend(xCellgetNumericCellValue());
} else {
contentappend(xCellgetStringCellValue());
}
}
}
}
return contenttoString();
}
}
这是POI jar包的下载地址,我下载的是3.9版本的
>
File f = new File("/path/to/excel/file");
Workbook wb = null;
NPOIFSFileSystem npoifs = null;
OPCPackage pkg = null;
try {
    // Try the OLE2 container first.
    npoifs = new NPOIFSFileSystem(f);
    wb = WorkbookFactory.create(npoifs);
} catch (OfficeXmlFileException ofe) {
    // The file is OOXML instead, so open it as an OPC package.
    pkg = OPCPackage.open(f);
    // NOTE(review): the published listing is cut off after the line above;
    // creating the workbook from the package is the presumed continuation.
    wb = WorkbookFactory.create(pkg);
}
以上就是关于java读取doc,pdf问题。全部的内容,包括:java读取doc,pdf问题。、在poi 中OPCPackage.open() 是什么函数OPCPackage这个类用来干嘛、谁能给我一个详细的Java通过Apache POI导出Excel方法,最好能给完整代码等相关内容解答,如果想了解更多相关内容,可以关注我们,你们的支持是我们更新的动力!
欢迎分享,转载请注明来源:内存溢出
评论列表(0条)