通过打印机将一本纸质书转为pdf的格式,以下所有操作都是在这个基础上进行的,最终目的是生成n篇txt文件,每个txt的名字规则是:起始页_篇&章&节,内容是对应切割出来的内容;
简介:这是一款基于OCR框架的解析工具,拥有比较完整的Java类库及完善的API文档,不但具备pdf的读写,还支持文本和图片的提取、水印的添加、书签的增删改、表格的操作,同时还支持将pdf转化成Word、HTML、XPS、SVG等多种格式;当然目前这类工具市面上有很多,不过通过比较最后选择了spire.pdf;目前这款产品共有两个版本,一个是免费版本,一个是付费版本,免费版本如果只是处理简单的pdf是没问题的,但是如果涉及到输出为pdf,则只会显示前10页,第十一页则是预设的购买介绍页,不过鉴于spire.pdf的完善性,我最后还是选择了它;至于10页的问题,后面会给出我的处理办法;如果你使用的是其他语言或平台,如.NET、Android,也可以使用这款产品;
相似产品及特点:PDFBox、iText、百度、Tesseract;只能用于识别图片,如需识别pdf,需要先将pdf转为图片。官网地址:https://www.e-iceblue.cn/Introduce/Free-Spire-PDF-JAVA.html
功能实现:前往官网下载jar包,或者直接在maven上通过坐标引入也可实现,不过maven上的版本不一定有网页下载的新;
<repositories>
    <repository>
        <id>com.e-iceblue</id>
        <url>http://repo.e-iceblue.cn/repository/maven-public/</url>
    </repository>
</repositories>
<dependencies>
    <dependency>
        <groupId>e-iceblue</groupId>
        <artifactId>spire.pdf.free</artifactId>
        <version>3.11.6</version>
    </dependency>
</dependencies>
切割pdf
按每页切割
/**
*每一页生成一个pdf文档
*/
public static void splitPdfOneByOne(){
PdfDocument pdf = new PdfDocument();
int count = pdf.getPages().getCount();
System.out.println(count);
pdf.loadFromFile("C:\Users\wangchenchen\Desktop\boot-structure\book.pdf");
pdf.split("C:\Users\wangchenchen\Desktop\boot-structure\output\surgery_{0}.pdf",0);
pdf.close();
}
按指定页切割
package com.wangcc.wangccocrdemo001.ocr;
import com.spire.pdf.PdfDocument;
import com.spire.pdf.PdfPageBase;
import com.spire.pdf.graphics.PdfMargins;
import com.wangcc.wangccocrdemo001.util.FileUtil;
import java.awt.geom.Point2D;
import java.io.File;
/**
* 拆分文档
* 拆成每页一个
* 拆成每九页一个(免费版最多只能处理九页)
* @author wangcc
* @createTime 2021年08月31日 23:25:00
*/
public class SubPDF {
/**
* 每九页生成一个pdf
**/
public static void splitPdfMoreByOne(){
String fileName = "C:\Users\wangchenchen\Desktop\boot-structure\book.pdf";
String outPath = "C:\Users\wangchenchen\Desktop\boot-structure\outFile\outPDFByMore";
PdfDocument pdf = new PdfDocument();
pdf.loadFromFile(fileName);
int totalCount = pdf.getPages().getCount();
PdfPageBase pageBase;
PdfDocument document = new PdfDocument();
int count = 1;
for(int i = 41; i< 822;i++){
System.out.println(i+"/"+822);
pageBase = document.getPages().add(pdf.getPages().get(i).getSize(),new PdfMargins(0));
pdf.getPages().get(i).createTemplate().draw(pageBase, new Point2D.Float(0,0));
if(count % 9 == 0){
String path = "\splitPdf-"+i+".pdf";
document.saveToFile(outPath+path);
document = new PdfDocument();
}
count++;
}
if(document.getPages().getCount() >0){
String path = "\splitPdf-999999.pdf";
document.saveToFile(outPath+path);
}
}
/**
* @Description //
* @return void
**/
public static void splitPdfByNumber(Integer begin, Integer end,String filePath,String pdfOutPath){
if(begin.equals("") || end.equals("") || filePath.equals("") || pdfOutPath.equals("")){
System.out.println("传入参数有空.......");
return;
}
if(begin >= end){
System.out.println("截止页数不能小于或等于开始页数.......");
return;
}
if(end-begin > 9){
System.out.println(" *** 作页数最多为9页");
return;
}
File file = new File(pdfOutPath);
if (file.exists()){
FileUtil fileUtil = new FileUtil();
fileUtil.DeleteFolder(pdfOutPath);
}
PdfDocument pdf = new PdfDocument();
pdf.loadFromFile(filePath);
int totalCount = pdf.getPages().getCount();
PdfPageBase pageBase;
PdfDocument document = new PdfDocument();
for(int i = begin; i< end;i++){
System.out.println(i+"/"+end);
pageBase = document.getPages().add(pdf.getPages().get(i).getSize(),new PdfMargins(0));
pdf.getPages().get(i).createTemplate().draw(pageBase, new Point2D.Float(0,0));
String path = "\surgery_"+i+".pdf";
document.saveToFile(pdfOutPath+path);
document = new PdfDocument();
}
}
}
将pdf转成txt
package com.wangcc.wangccocrdemo001.ocr;
import com.spire.pdf.PdfDocument;
import com.spire.pdf.PdfPageBase;
import java.io.*;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* 读取所有拆分文件生成txt文件
* @author wangcc
* @createTime 2021年08月31日 23:51:00
*/
public class ReadAllSplitFile {
public static String fileName = "C:\Users\wangchenchen\Desktop\boot-structure\outFile\outPDFByMore\";
public static String outPath = "C:\Users\wangchenchen\Desktop\boot-structure\outFile\readPdfFile.txt";
public static void main(String[] args) {
List<File> fileList = readAllFile();
List<String> pdfFileNameList = new ArrayList<>();
for (File file:fileList) {
pdfFileNameList.add(file.getName());
}
Collections.sort(pdfFileNameList, new Comparator<String>() {
@Override
public int compare(String o1, String o2) {
int n1 = extractNumber(o1);
int n2 = extractNumber(o2);
return n1 - n2;
}
});
File file = new File(outPath);
if(file.exists()){
file.delete();
}
for (String s:pdfFileNameList) {
try {
readFile(s);
} catch (IOException e) {
e.printStackTrace();
}
}
}
public static List<File> readAllFile(){
String filePath = "C:\Users\wangchenchen\Desktop\boot-structure\outFile\outPDFByMore";
ArrayList<File> fileList = new ArrayList<>();
File file = new File(filePath);
File[] files = file.listFiles();
if(Objects.isNull(files)){
return null;
}
for (File f:files) {
if(f.isFile()){
fileList.add(f);
}
}
return fileList;
}
/**
* @Param orderStr 排序:asc,des,不区分大小写
**/
public static List<File> sortFileByName(List<File> fileList, final String orderStr){
if(!orderStr.equalsIgnoreCase("asc") && !orderStr.equalsIgnoreCase("desc")){
return fileList;
}
File[] files = fileList.toArray(new File[0]);
Arrays.sort(files, new Comparator<File>() {
@Override
public int compare(File o1, File o2) {
int n1 = extractNumber(o1.getName());
int n2 = extractNumber(o2.getName());
if(orderStr == null || orderStr.length() < 1 || orderStr.equalsIgnoreCase("asc")){
return n1 - n2;
}else {
//降序
return n2 - n1;
}
}
});
return new ArrayList<File>(Arrays.asList(files));
}
public static int extractNumber(String name){
int i;
try {
String s = name.replaceAll("[^\d]", "");
i = Integer.parseInt(s);
}catch (Exception e){
i = 0;
}
return i;
}
public static void readFile(String path) throws IOException {
PdfDocument pdf = new PdfDocument();
pdf.loadFromFile(fileName+path);
PdfPageBase page;
StringBuilder sb = new StringBuilder();
Pattern pattern = Pattern.compile("(^(\s*)第)(.{1,9})[章节卷集部篇回](\s{1,10})(.{1,20})(\s{1,10})");
Pattern pattern1 = Pattern.compile("(\s{0,10})([0-9][0-9]?[0-9]?[0-9]?)");
//遍历PDF页面,获取每个页面的文本并添加到StringBuilder对象
for(int i = 0;i < pdf.getPages().getCount();i++) {
//System.out.println("循环遍历pdf页数:当前" + i + "页/" + pdf.getPages().getCount() + "页");
page = pdf.getPages().get(i);
int count = 0;
String extractText = null;
BufferedReader br = new BufferedReader(new InputStreamReader(new ByteArrayInputStream(page.extractText(true).getBytes())));
while ((extractText = br.readLine())!= null){
Matcher matcher = pattern.matcher(extractText);
Matcher matcher1 = pattern1.matcher(extractText);
/*末尾包含数字的*/
if (count != 0 || matcher.find()){
//System.out.println(extractText);
if(!extractText.equals("") && !matcher1.find()){
String s = extractText.replaceAll("\s{5,9}", " ");
sb.append(s+"\n");
}
}
count++;
}
br.close();
}
FileWriter writer;
try {
//将StringBuilder对象中的文本写入到文本文件
writer = new FileWriter(outPath,true);
System.out.println(sb.toString());
writer.write(sb.toString());
writer.flush();
writer.close();
sb.delete(0,sb.length());
} catch (IOException e) {
e.printStackTrace();
}
pdf.close();
}
}
识别pdf中的篇章节生成对应文本
去除水印
package com.wangcc.wangccocrdemo001.ocr;
import com.spire.pdf.PdfDocument;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
/**
* 去除水印(未必都有用,主要是看水印的类型)
* 水印有两种
* 1.文字,放在对应的位置
* 2.大图片,png 背景透明
* 一些特殊的水印原则上是去不掉的
* @author wangcc
* @createTime 2021年08月31日 23:46:00
*/
public class ClearWaterMark {
public static void main(String[] args) throws IOException {
PdfDocument pdf = new PdfDocument();
pdf.loadFromFile("C:\Users\wangchenchen\Desktop\boot-structure\out\splitPdf-18.pdf");
BufferedImage bufferedImage = null;
for (int i = 0; i<pdf.getPages().getCount();i++){
bufferedImage = pdf.saveAsImage(i);
bufferedImage.getSubimage(bufferedImage.getMinX(),15,bufferedImage.getWidth(),bufferedImage.getHeight()-15);
File saveFile = new File("C:\Users\wangchenchen\Desktop\boot-structure\outImg\" + i + ".png");
if(!saveFile.exists()){
saveFile.mkdirs();
}
ImageIO.write(bufferedImage,"PNG",saveFile);
bufferedImage.flush();
}
}
}
获取书签
package com.wangcc.wangccocrdemo001.ocr;
import com.spire.pdf.PdfDocument;
import com.spire.pdf.bookmarks.PdfBookmark;
import com.spire.pdf.bookmarks.PdfBookmarkCollection;
import java.io.FileWriter;
import java.io.IOException;
/**
* 获取书签
* @author wangcc
* @createTime 2021年08月31日 23:47:00
*/
public class getBookMake {
public static void main(String[] args) {
PdfDocument pdf = new PdfDocument();
pdf.loadFromFile("C:\Users\wangchenchen\Desktop\boot-structure\out\splitPdf-18.pdf");
PdfBookmarkCollection bookmarkCollection = pdf.getBookmarks();
StringBuilder stringBuilder = new StringBuilder();
/*获取书签*/
GetBookMakeTitle(bookmarkCollection,stringBuilder);
FileWriter writer;
try {
writer = new FileWriter("C:\Users\wangchenchen\Desktop\boot-structure\读取的书签.txt");
writer.write(stringBuilder.toString());
writer.flush();
} catch (IOException e) {
e.printStackTrace();
}
pdf.dispose();
}
public static void GetBookMakeTitle(PdfBookmarkCollection bookmarkCollection,StringBuilder stringBuilder){
if(bookmarkCollection.getCount() > 0){
for (int i = 0; i < bookmarkCollection.getCount(); i++){
PdfBookmark bookmark = bookmarkCollection.get(i);
stringBuilder.append(bookmark.getTitle());
GetBookMakeTitle(bookmark,stringBuilder);
}
}
}
}
获取每页中图片
package com.wangcc.wangccocrdemo001.ocr;
import com.spire.pdf.PdfDocument;
import com.spire.pdf.PdfPageBase;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.util.Objects;
/**
* 提取每页pdf中的图片
* @author wangcc
* @createTime 2021年08月31日 23:41:00
*/
public class ExtractImg {
public static void main(String[] args) throws Exception{
//加载测试文档
InputStream inputStream = new BufferedInputStream(new FileInputStream(new File("C:\Users\wangchenchen\Desktop\boot-structure\demo.pdf")));
PdfDocument pdf = new PdfDocument();
//pdf.loadFromFile("C:\Users\wangchenchen\Desktop\boot-structure\demo.pdf");
pdf.loadFromStream(inputStream);
//定义一个int型变量
int index = 0;
//遍历PDF每一页
for (int i= 0;i< pdf.getPages().getCount(); i ++){
System.out.println(i+"/"+pdf.getPages().getCount());
//获取PDF页面
PdfPageBase page = pdf.getPages().get(i);
//使用extractImages方法获取页面上图片
for (BufferedImage image : page.extractImages()) {
if(Objects.nonNull(image)){
//指定输出图片名称
File output = new File( String.format("C:\Users\wangchenchen\Desktop\boot-structure\Img\Image_%d.pdf", index++));
//将图片保存为PNG格式文件
ImageIO.write(image, "PNG", output);
}
}
}
}
}
项目地址
spire.pdf 下载
欢迎分享,转载请注明来源:内存溢出
评论列表(0条)