导言
使用PDFMiner提取文本
GitHub - https://github.com/euske/pdfminer PyPI - https://pypi.python.org/pypi/pdfminer/ 网页 - https://euske.github.io/pdfminer/
python -m pip install pdfminer
python -m pip install pdfminer.six
提取所有文本
import io
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF as a single string.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The extracted text, or None when no text was extracted.
    """
    resource_manager = PDFResourceManager()
    fake_file_handle = io.StringIO()
    converter = TextConverter(resource_manager, fake_file_handle)
    try:
        page_interpreter = PDFPageInterpreter(resource_manager, converter)
        with open(pdf_path, 'rb') as fh:
            for page in PDFPage.get_pages(fh,
                                          caching=True,
                                          check_extractable=True):
                page_interpreter.process_page(page)
        text = fake_file_handle.getvalue()
    finally:
        # Release the converter and the buffer even if a page fails to parse.
        converter.close()
        fake_file_handle.close()
    # An empty result is reported as None rather than as an empty string.
    return text or None
if __name__ == '__main__':
print(extract_text_from_pdf('w9.pdf'))
我们使用 Python 的 io 模块创建一个类文件对象。如果您使用的是 Python 2,则需要改用 StringIO 模块。下一步是创建一个转换器。在这个例子中我们选择了 TextConverter,但如果需要,您也可以使用 HTMLConverter 或 XMLConverter。最后,我们创建一个 PDF 解释器对象,它接收我们的资源管理器和转换器对象并提取文本,提取出的文本最终会打印到 stdout。
按页面提取文本
# miner_text_generator.py
import io
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage
def extract_text_by_page(pdf_path):
    """Yield the extracted text of a PDF one page at a time.

    A fresh resource manager, buffer and converter are created for every
    page so that each yielded string holds the text of that page only.

    Args:
        pdf_path: Path of the PDF file to read.

    Yields:
        The text of each page, in document order.
    """
    with open(pdf_path, 'rb') as fh:
        for page in PDFPage.get_pages(fh,
                                      caching=True,
                                      check_extractable=True):
            resource_manager = PDFResourceManager()
            fake_file_handle = io.StringIO()
            converter = TextConverter(resource_manager, fake_file_handle)
            try:
                page_interpreter = PDFPageInterpreter(resource_manager,
                                                      converter)
                page_interpreter.process_page(page)
                text = fake_file_handle.getvalue()
            finally:
                # Close before yielding: cleanup placed after the yield is
                # skipped when the consumer stops iterating early.
                converter.close()
                fake_file_handle.close()
            yield text
def extract_text(pdf_path):
    """Print the text of each page of a PDF, each followed by a blank line.

    Args:
        pdf_path: Path of the PDF file to read.
    """
    for page in extract_text_by_page(pdf_path):
        print(page)
        print()


if __name__ == '__main__':
    # extract_text prints the pages itself and returns None, so wrapping the
    # call in print() would emit a stray "None" after the last page.
    extract_text('w9.pdf')
extract_text 函数会打印出每页的文本。我们可以在这里添加一些解析逻辑来提取我们想要的内容;也可以将文本(或 HTML、XML)保存为单独的文件,以供将来解析。
通过pdf2txt.py导出文本
默认情况下,pdf2txt.py 会将提取的文本输出到 stdout。它不会识别以图像形式存在的文字,因为PDFMiner不支持光学字符识别(OCR)。让我们尝试它最简单的用法,即只把PDF文件的路径传给它。我们将使用w9.pdf。打开终端并导航到已保存PDF的位置,或者修改下面的命令使其指向该文件:
pdf2txt.py w9.pdf
pdf2txt.py -o w9.html w9.pdf
pdf2txt.py -o w9.xml w9.pdf
<pages>
<page id="1" bbox="0.000,0.000,611.976,791.968" rotate="0">
<textbox id="0" bbox="36.000,732.312,100.106,761.160">
<textline bbox="36.000,732.312,100.106,761.160">
<text font="JYMPLA+HelveticaNeueLTStd-Roman" bbox="36.000,736.334,40.018,744.496" size="8.162">F</text>
<text font="JYMPLA+HelveticaNeueLTStd-Roman" bbox="40.018,736.334,44.036,744.496" size="8.162">o</text>
<text font="JYMPLA+HelveticaNeueLTStd-Roman" bbox="44.036,736.334,46.367,744.496" size="8.162">r</text>
<text font="JYMPLA+HelveticaNeueLTStd-Roman" bbox="46.367,736.334,52.338,744.496" size="8.162">m</text>
<text font="JYMPLA+HelveticaNeueLTStd-Roman" bbox="52.338,736.334,54.284,744.496" size="8.162"> </text>
<text font="JYMPLA+HelveticaNeueLTStd-Roman" bbox="54.284,736.334,56.230,744.496" size="8.162"> </text>
<text font="JYMPLA+HelveticaNeueLTStd-Roman" bbox="56.230,736.334,58.176,744.496" size="8.162"> </text
><text font="JYMPLA+HelveticaNeueLTStd-Roman" bbox="58.176,736.334,60.122,744.496" size="8.162"> </text>
<text font="ZWOHBU+HelveticaNeueLTStd-BlkCn" bbox="60.122,732.312,78.794,761.160" size="28.848">W</text>
<text font="ZWOHBU+HelveticaNeueLTStd-BlkCn" bbox="78.794,732.312,87.626,761.160" size="28.848">-</text>
<text font="ZWOHBU+HelveticaNeueLTStd-BlkCn" bbox="87.626,732.312,100.106,761.160" size="28.848">9</text>
<text></text>
</textline>
用Slate提取文本
easy_install distribute
python -m pip install slate
python -m pip install git+https://github.com/timClicks/slate
# slate_text_extraction.py
import slate
def extract_text_from_pdf(pdf_path):
    """Print the text of each page of a PDF using slate.

    Args:
        pdf_path: Path of the PDF file to read.
    """
    # A PDF is binary data: text mode would try to decode the bytes (and may
    # translate line endings), so the file must be opened with 'rb'.
    with open(pdf_path, 'rb') as fh:
        document = slate.PDF(fh, password='', just_text=1)
        for page in document:
            print(page)
if __name__ == '__main__':
extract_text_from_pdf('w9.pdf')
导出数据
XML JSON CSV
导出到XML
# xml_exporter.py
import os
import xml.etree.ElementTree as xml
from miner_text_generator import extract_text_by_page
from xml.dom import minidom
def export_as_xml(pdf_path, xml_path):
    """Write the first 100 characters of each PDF page to an XML file.

    The root element is named after the PDF file (without its extension) and
    holds a ``Pages`` element with one ``Page_N`` child per page.

    Args:
        pdf_path: Path of the PDF file to read.
        xml_path: Path of the XML file to create or overwrite.
    """
    filename = os.path.splitext(os.path.basename(pdf_path))[0]
    root = xml.Element(filename)
    pages = xml.SubElement(root, 'Pages')
    for counter, page in enumerate(extract_text_by_page(pdf_path), start=1):
        text = xml.SubElement(pages, 'Page_{}'.format(counter))
        text.text = page[0:100]
    xml_string = xml.tostring(root, 'utf-8')
    # Round-trip through minidom only to get indented output.
    pretty_string = minidom.parseString(xml_string).toprettyxml(indent=' ')
    # The generated XML declaration names no encoding, which XML readers
    # interpret as UTF-8, so the file must not use the platform default.
    with open(xml_path, 'w', encoding='utf-8') as fh:
        fh.write(pretty_string)
if __name__ == '__main__':
pdf_path = 'w9.pdf'
xml_path = 'w9.xml'
export_as_xml(pdf_path, xml_path)
<?xml version="1.0" ?>
<w9>
<Pages>
<Page_1>Form W-9(Rev. November 2017)Department of the Treasury Internal Revenue Service Request for Taxp</Page_1>
<Page_2>Form W-9 (Rev. 11-2017)Page 2 By signing the filled-out form, you: 1. Certify that the TIN you are g</Page_2>
<Page_3>Form W-9 (Rev. 11-2017)Page 3 Criminal penalty for falsifying information. Willfully falsifying cert</Page_3>
<Page_4>Form W-9 (Rev. 11-2017)Page 4 The following chart shows types of payments that may be exempt from ba</Page_4>
<Page_5>Form W-9 (Rev. 11-2017)Page 5 1. Interest, dividend, and barter exchange accounts opened before 1984</Page_5>
<Page_6>Form W-9 (Rev. 11-2017)Page 6 The IRS does not initiate contacts with taxpayers via emails. Also, th</Page_6>
</Pages>
</w9>
导出为JSON
Python 在其标准库中包含一个 json 模块,允许您以编程方式读取和写入JSON。让我们运用上一节中学到的内容,创建一个输出JSON而不是XML的导出器脚本:
# json_exporter.py
import json
import os
from miner_text_generator import extract_text_by_page
def export_as_json(pdf_path, json_path):
    """Dump the first 100 characters of each PDF page to a JSON file.

    The written object has a ``Filename`` key (the PDF name without its
    extension) and a ``Pages`` list of single-entry ``{"Page_N": text}``
    dictionaries.

    Args:
        pdf_path: Path of the PDF file to read.
        json_path: Path of the JSON file to create or overwrite.
    """
    stem, _extension = os.path.splitext(os.path.basename(pdf_path))
    data = {'Filename': stem, 'Pages': []}
    for number, page_text in enumerate(extract_text_by_page(pdf_path), 1):
        entry = {'Page_{}'.format(number): page_text[0:100]}
        data['Pages'].append(entry)
    with open(json_path, 'w') as fh:
        json.dump(data, fh)
if __name__ == '__main__':
pdf_path = 'w9.pdf'
json_path = 'w9.json'
export_as_json(pdf_path, json_path)
在导出函数中,我们创建一个顶层字典,它有两个键:Filename 和 Pages。Pages 键映射到一个空列表。接下来,我们遍历PDF的每个页面并提取每个页面的前100个字符。然后我们创建一个字典,以页码作为键、这100个字符作为值,并将其追加到顶层的 Pages 列表中。最后,我们使用 json 模块的 dump 函数写入文件。
{'Filename': 'w9',
'Pages': [{'Page_1': 'Form W-9(Rev. November 2017)Department of the Treasury Internal Revenue Service Request for Taxp'},
{'Page_2': 'Form W-9 (Rev. 11-2017)Page 2 By signing the filled-out form, you: 1. Certify that the TIN you are g'},
{'Page_3': 'Form W-9 (Rev. 11-2017)Page 3 Criminal penalty for falsifying information. Willfully falsifying cert'},
{'Page_4': 'Form W-9 (Rev. 11-2017)Page 4 The following chart shows types of payments that may be exempt from ba'},
{'Page_5': 'Form W-9 (Rev. 11-2017)Page 5 1. Interest, dividend, and barter exchange accounts opened before 1984'},
{'Page_6': 'Form W-9 (Rev. 11-2017)Page 6 The IRS does not initiate contacts with taxpayers via emails. Also, th'}]}
导出为CSV
Python 内置了 csv 模块,可用于读取和写入CSV文件。我们将在此处使用它,把从PDF中提取的文本写成CSV。我们来看看一些代码:
# csv_exporter.py
import csv
import os
from miner_text_generator import extract_text_by_page
def export_as_csv(pdf_path, csv_path):
    """Write the first 100 characters of each PDF page as one CSV row.

    Each row holds the whitespace-separated words of that 100-character
    prefix, one word per column.

    Args:
        pdf_path: Path of the PDF file to read.
        csv_path: Path of the CSV file to create or overwrite.
    """
    # newline='' lets the csv module control line endings itself; without it
    # an extra carriage return is written per row on some platforms.
    with open(csv_path, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        for page in extract_text_by_page(pdf_path):
            writer.writerow(page[0:100].split())
if __name__ == '__main__':
pdf_path = 'w9.pdf'
csv_path = 'w9.csv'
export_as_csv(pdf_path, csv_path)
这一次我们导入了 csv 库。除此之外,导入部分与前一个示例相同。在我们的函数中,我们使用CSV文件路径创建CSV文件句柄。然后我们初始化一个CSV写入器对象,并将该文件句柄作为其唯一参数。接下来,我们像以前一样循环遍历PDF的页面。这里唯一的区别是我们将前100个字符拆分成单个单词。这样我们就能向CSV中写入一些实际数据。如果我们不这样做,那么每一行中只有一个元素,那样它就算不上真正的CSV文件了。最后,我们将单词列表写入CSV文件。
Form,W-9(Rev.,November,2017)Department,of,the,Treasury,Internal,Revenue,Service,Request,for,Taxp
Form,W-9,(Rev.,11-2017)Page,2,By,signing,the,filled-out,"form,",you:,1.,Certify,that,the,TIN,you,are,g
Form,W-9,(Rev.,11-2017)Page,3,Criminal,penalty,for,falsifying,information.,Willfully,falsifying,cert
Form,W-9,(Rev.,11-2017)Page,4,The,following,chart,shows,types,of,payments,that,may,be,exempt,from,ba
Form,W-9,(Rev.,11-2017)Page,5,1.,"Interest,","dividend,",and,barter,exchange,accounts,opened,before,1984
Form,W-9,(Rev.,11-2017)Page,6,The,IRS,does,not,initiate,contacts,with,taxpayers,via,emails.,"Also,",th
从PDF中提取图像
# Extract jpg's from pdf's. Quick and dirty.
import sys
# Scan the raw bytes of the PDF named on the command line for streams that
# start with a JPEG marker and dump each one to jpg<N>.jpg in the current
# directory.
with open(sys.argv[1], "rb") as pdf_file:
    pdf = pdf_file.read()

# The file is read as bytes, so every search pattern must be bytes as well.
startmark = b"\xff\xd8"  # JPEG start-of-image marker
startfix = 0
endmark = b"\xff\xd9"  # JPEG end-of-image marker
endfix = 2  # length of endmark, so the slice below includes it
i = 0
njpg = 0
while True:
    istream = pdf.find(b"stream", i)
    if istream < 0:
        break
    # Only treat the stream as a JPEG when the start marker appears within
    # 20 bytes of the "stream" keyword.
    istart = pdf.find(startmark, istream, istream + 20)
    if istart < 0:
        i = istream + 20
        continue
    iend = pdf.find(b"endstream", istart)
    if iend < 0:
        raise Exception("Didn't find end of stream!")
    # Look for the end marker starting shortly before "endstream".
    iend = pdf.find(endmark, iend - 20)
    if iend < 0:
        raise Exception("Didn't find end of JPG!")
    istart += startfix
    iend += endfix
    print("JPG %d from %d to %d" % (njpg, istart, iend))
    jpg = pdf[istart:iend]
    with open("jpg%d.jpg" % njpg, "wb") as jpgfile:
        jpgfile.write(jpg)
    njpg += 1
    i = iend
pdfimages -all reportlab-sample.pdf images/prefix-jpg
请确保 images 文件夹(或您要使用的任何输出文件夹)已经创建,因为 pdfimages 不会为您创建它。
# image_exporter.py
import os
import subprocess
def image_exporter(pdf_path, output_dir):
    """Extract the images of a PDF into a directory with ``pdfimages``.

    Runs the external ``pdfimages`` command, waits for it to finish, then
    prints the contents of the output directory.

    Args:
        pdf_path: Path of the PDF file to read.
        output_dir: Directory that receives the images; created if missing.
    """
    # exist_ok avoids the check-then-create race of testing os.path.exists
    # before calling makedirs.
    os.makedirs(output_dir, exist_ok=True)
    cmd = ['pdfimages', '-all', pdf_path,
           '{}/prefix'.format(output_dir)]
    subprocess.call(cmd)
    print('Images extracted:')
    print(os.listdir(output_dir))
if __name__ == '__main__':
pdf_path = 'reportlab-sample.pdf'
image_exporter(pdf_path, output_dir='images')
在这个例子中,我们导入了 subprocess 和 os 模块。如果输出目录不存在,我们尝试创建它。然后我们使用 subprocess 的 call 方法来执行 pdfimages。之所以使用 call,是因为它会等待 pdfimages 运行完成。您也可以使用 Popen,但那基本上会让该进程在后台运行。最后,我们打印出输出目录的文件列表,以确认图像是否已被提取到其中。
结语
pdfimage 实用程序)解决此问题。长按订阅更多精彩▼

如有收获,点个在看,诚挚感谢