《Python大法好》之一:批量PDF转Word

所需工具

  • python 3.6.1

  • pdfminer3k

  • python-docx

整体思路

  • 单个文件处理
  • 读取PDF文件
  • 写入Word文件
  • 批量处理
  • 多进程提高效率

代码实现

1.读取PDF

from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import process_pdf
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams

def read_from_pdf(file_path):
    """Extract the text of a PDF file with pdfminer.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        Everything the TextConverter wrote for the document, as one string.
    """
    # Imported locally so this snippet runs on its own: the import lines
    # listed above it do not bring StringIO into scope.
    from io import StringIO

    with open(file_path, 'rb') as file, StringIO() as return_str:
        resource_manager = PDFResourceManager()
        device = TextConverter(
            resource_manager, return_str, laparams=LAParams())
        try:
            process_pdf(resource_manager, device, file)
        finally:
            # Close the converter even when parsing the PDF fails.
            device.close()
        # Read the buffer before the with-block closes it.
        return return_str.getvalue()

2.写入Word

from docx import Document
def save_text_to_word(content, file_path):
    """Write text to a Word document, one paragraph per line of text.

    Args:
        content: Text to store; it is split on line-feed characters.
        file_path: Destination path passed to Document.save().
    """
    doc = Document()
    for line in content.split('\n'):
        paragraph = doc.add_paragraph()
        # Control characters are stripped before the text is added as a run
        # (presumably because the document XML cannot hold them - confirm).
        paragraph.add_run(remove_control_characters(line))
    doc.save(file_path)

def remove_control_characters(content):
    """Return content without the characters whose code points are 0-31.

    Args:
        content: String to clean.

    Returns:
        A copy of content with every character below U+0020 removed
        (this includes tab, line feed and carriage return).
    """
    # Mapping a code point to None makes str.translate delete that character.
    control_char_table = dict.fromkeys(range(32))
    return content.translate(control_char_table)

3.PDF转换为Word

def pdf_to_word(pdf_file_path, word_file_path):
    """Convert one PDF file into a Word document.

    Args:
        pdf_file_path: Path of the PDF file to read.
        word_file_path: Path of the .docx file to write.
    """
    content = read_from_pdf(pdf_file_path)
    save_text_to_word(content, word_file_path)

4.多进程提高效率

from concurrent.futures import ProcessPoolExecutor
# Convert every PDF found in config['pdf_folder'] in parallel worker processes.
tasks = []
task_files = []
with ProcessPoolExecutor(max_workers=int(config['max_worker'])) as executor:
    for file in os.listdir(config['pdf_folder']):
        file_name, extension_name = os.path.splitext(file)
        # Compare case-insensitively so that e.g. 'REPORT.PDF' is picked up too.
        if extension_name.lower() != '.pdf':
            continue
        pdf_file = os.path.join(config['pdf_folder'], file)
        word_file = os.path.join(config['word_folder'], file_name + '.docx')
        print('正在处理: ', file)
        tasks.append(executor.submit(pdf_to_word, pdf_file, word_file))
        task_files.append(file)

# Leaving the with-block shuts the executor down and waits for all submitted
# tasks, so every future is finished here and no polling loop is needed.
failed = False
for file, task in zip(task_files, tasks):
    error = task.exception()
    if error is not None:
        # Report the failure instead of silently dropping it.
        failed = True
        print('处理失败: ', file, error)
print('完成')
# SystemExit does not depend on the site-provided exit() helper.
raise SystemExit(1 if failed else 0)

5.完整实现代码

    import os
from configparser import ConfigParser
from io import StringIO
from io import open
from concurrent.futures import ProcessPoolExecutor

from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import process_pdf
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from docx import Document


def read_from_pdf(file_path):
    """Extract the text of a PDF file with pdfminer.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        Everything the TextConverter wrote for the document, as one string.
    """
    with open(file_path, 'rb') as file, StringIO() as return_str:
        resource_manager = PDFResourceManager()
        device = TextConverter(
            resource_manager, return_str, laparams=LAParams())
        try:
            process_pdf(resource_manager, device, file)
        finally:
            # Close the converter even when parsing the PDF fails.
            device.close()
        # Read the buffer before the with-block closes it.
        return return_str.getvalue()


def save_text_to_word(content, file_path):
    """Write text to a Word document, one paragraph per line of text.

    Args:
        content: Text to store; it is split on line-feed characters.
        file_path: Destination path passed to Document.save().
    """
    doc = Document()
    for line in content.split('\n'):
        paragraph = doc.add_paragraph()
        # Control characters are stripped before the text is added as a run
        # (presumably because the document XML cannot hold them - confirm).
        paragraph.add_run(remove_control_characters(line))
    doc.save(file_path)


def remove_control_characters(content):
    """Return content without the characters whose code points are 0-31.

    Args:
        content: String to clean.

    Returns:
        A copy of content with every character below U+0020 removed
        (this includes tab, line feed and carriage return).
    """
    # Mapping a code point to None makes str.translate delete that character.
    control_char_table = dict.fromkeys(range(32))
    return content.translate(control_char_table)


def pdf_to_word(pdf_file_path, word_file_path):
    """Convert one PDF file into a Word document.

    Args:
        pdf_file_path: Path of the PDF file to read.
        word_file_path: Path of the .docx file to write.
    """
    content = read_from_pdf(pdf_file_path)
    save_text_to_word(content, word_file_path)


def main():
    """Convert every PDF in the configured folder to a .docx file in parallel.

    Reads the 'default' section of config.cfg and uses its keys
    'max_worker' (number of worker processes), 'pdf_folder' (input folder)
    and 'word_folder' (output folder). Each PDF is handed to pdf_to_word
    in a process pool.

    Raises:
        SystemExit: Always, once all files are processed; the exit status is
            0 when every conversion succeeded and 1 when at least one failed.
    """
    config_parser = ConfigParser()
    config_parser.read('config.cfg')
    config = config_parser['default']

    tasks = []
    task_files = []
    with ProcessPoolExecutor(max_workers=int(config['max_worker'])) as executor:
        for file in os.listdir(config['pdf_folder']):
            file_name, extension_name = os.path.splitext(file)
            # Compare case-insensitively so that e.g. 'REPORT.PDF' is
            # picked up too.
            if extension_name.lower() != '.pdf':
                continue
            pdf_file = os.path.join(config['pdf_folder'], file)
            word_file = os.path.join(config['word_folder'], file_name + '.docx')
            print('正在处理: ', file)
            tasks.append(executor.submit(pdf_to_word, pdf_file, word_file))
            task_files.append(file)

    # Leaving the with-block shuts the executor down and waits for all
    # submitted tasks, so every future is finished here and no polling
    # loop is needed.
    failed = False
    for file, task in zip(task_files, tasks):
        error = task.exception()
        if error is not None:
            # Report the failure instead of silently dropping it.
            failed = True
            print('处理失败: ', file, error)
    print('完成')
    # SystemExit does not depend on the site-provided exit() helper.
    raise SystemExit(1 if failed else 0)


# Run the conversion only when executed as a script, not when the module is
# imported.
if __name__ == '__main__':
    main()

具体使用可参考 GitHub

特别注意

1.部分PDF转换Word不成功是因为缺少字体,需要下载相应的字体包并放到相应位置

2.PDF中的图片不能转换

踩了许多坑,十分感谢各位前辈的探索,站在巨人的肩膀上才能看得更远!

参考资料

1.60行Python代码,实现多线程PDF转Word

2.手把手|20行Python代码教你批量将PDF文件转为Word格式(包教包会)

3.pdfminer3k 使用文档


------------------本文结束感谢您的阅读------------------
坚持原创技术分享,您的支持将鼓励我继续创作!