场景:防止抄袭!word对比就很容易,所以这里介绍pdf内容对比技术。
一、环境- win10
- python3.9.6
pdf转图片jpg,图片转文字,文字进行对比。
- 将 PDF 文件的每一页转换为一个图片
- 图片转为文字,对文字进行比较,再得到一个差异图。
- 将所有生成的差异图像拼接成一个 PDF 文件
使用方法:python diff-pdf.py test1.pdf test2.pdf
结果如下:
主要使用模块:pdfminer,io中的StringIO。
4.1 读取pdf def read_pdf(self, file_name):
rsrcmgr = PDFResourceManager()
laparams = LAParams()
fp = open(file_name, 'rb')
parser = PDFParser(fp)
document = PDFDocument(parser)
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
pages = {}
for page in PDFPage.create_pages(document):
dictionary = {}
dictionary['textbox'] = []
dictionary['textline'] = []
interpreter.process_page(page)
layout = device.get_result()
for item in layout:
if isinstance(item, LTTextBox):
dictionary['textbox'].append(item)
for child in item:
if isinstance(child, LTTextLine):
dictionary['textline'].append(child)
pages[layout.pageid] = dictionary
return pages
4.2 pdf转文本
def convert_pdf_to_txt(self, path, page_no=-1):
rsrcmgr = PDFResourceManager()
retstr = StringIO()
laparams = LAParams()
device = TextConverter(rsrcmgr, retstr, laparams=laparams)
fp = open(path, 'rb')
interpreter = PDFPageInterpreter(rsrcmgr, device)
password = ""
maxpages = 0
caching = True
pagenos = set()
i = 0
for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password, caching=caching,
check_extractable=True):
i += 1
if i != page_no:
continue
interpreter.process_page(page)
fp.close()
device.close()
str = retstr.getvalue()
retstr.close()
return str
4.3 pdf比较
def compare_pdf(self, file1, file2, header_text, x_margin=10, compare_margin=0.2):
rsrcmgr = PDFResourceManager()
retstr = StringIO()
laparams = LAParams()
fp = open(file1, 'rb')
parser = PDFParser(fp)
document = PDFDocument(parser)
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
out = StringIO()
layoutmode = 'normal'
scale = 1.3
fontscale = 1
html_coverter = HTMLPrivateConverter(rsrcmgr, out, scale=scale,
layoutmode=layoutmode, laparams=laparams, fontscale=fontscale,
imagewriter=None, header_text=header_text, x_margin=x_margin)
interpreter = PDFPageInterpreter(rsrcmgr, device)
testpages = PDFPage.create_pages(document)
file_dict = self.read_pdf(file2)
for page in testpages:
interpreter.process_page(page)
layout = device.get_result()
html_coverter.page_begin(layout)
if file_dict.get(layout.pageid) == None:
break
compare_page = file_dict[layout.pageid]
for item in layout:
if isinstance(item, LTTextBox):
html_coverter.begin_div('textbox', 1, item.x0 + html_coverter.x_margin, item.y1, item.width,
item.height,
item.get_writing_mode())
for child in item:
if isinstance(child, LTTextLine):
self.compare_textline(child, compare_page, html_coverter, compare_margin)
html_coverter.put_newline()
html_coverter.end_div()
html_coverter.page_end()
fp.close()
device.close()
retstr.close()
return out.getvalue()
4.4 完整程序
点击:完整源码
欢迎分享,转载请注明来源:内存溢出
评论列表(0条)