最终输出中的换行符会被替换为下划线。这是我找到的最小可行方案。
"""Print the position and text of every horizontal text box in a PDF.

Each text box is printed as ``x, y, text`` using the first two values of
its bounding box; newlines inside the text are replaced with underscores
so that every box occupies exactly one output line.
"""
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice  # kept from the original import list; unused below
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
import pdfminer


def parse_obj(lt_objs):
    """Print every horizontal text box found in *lt_objs*, recursing into figures.

    Args:
        lt_objs: An iterable of pdfminer layout objects (for example a page
            layout or the contents of a figure).
    """
    for obj in lt_objs:
        if isinstance(obj, pdfminer.layout.LTTextBoxHorizontal):
            # Replace real newlines (not the letter "n") so that a
            # multi-line text box still prints on a single line.
            text = obj.get_text().replace('\n', '_')
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print("%6d, %6d, %s" % (obj.bbox[0], obj.bbox[1], text))
        elif isinstance(obj, pdfminer.layout.LTFigure):
            # Figures are containers; iterate them directly rather than
            # reaching into the private ``_objs`` attribute.
            parse_obj(obj)


# Keep the file open for the whole run: pages are read lazily from it while
# iterating, and the context manager guarantees the handle is closed.
with open('/Users/me/Downloads/test.pdf', 'rb') as fp:
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fp)

    # Create a PDF document object that stores the document structure.
    # A password, if needed, is passed as the second argument.
    document = PDFDocument(parser)

    # Check if the document allows text extraction. If not, abort.
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed

    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()

    # Set parameters for layout analysis.
    laparams = LAParams()

    # Create a PDF page aggregator object; it collects the layout of each
    # processed page so it can be fetched with get_result().
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)

    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Loop over all pages in the document.
    for page in PDFPage.create_pages(document):
        # Read the page into a layout object.
        interpreter.process_page(page)
        layout = device.get_result()

        # Extract and print the text boxes of this page.
        parse_obj(layout)
欢迎分享,转载请注明来源:内存溢出
评论列表(0条)