告别复制粘贴,Python 实现 PDF 转文本
↑ 关注 + 星标 ,每天学Python新技能
后台回复【大礼包】送你Python自学大礼包
机器之心编译
将 pdf 转换为图片;
检测和识别图像中的文本;
展示示例输出。
from pdf2image import convert_from_path
from pdf2image.exceptions import (
PDFInfoNotInstalledError,
PDFPageCountError,
PDFSyntaxError
)
pdf_path = "path/to/file/intro_RL_Lecture1.pdf"
images = convert_from_path(pdf_path)
for i, image in enumerate(images):
fname = "image" + str(i) + ".png"
image.save(fname, "PNG")
# adapted from this source: https://github.com/courao/ocr.pytorch
%load_ext autoreload
%autoreload 2
import os
from ocr import ocr
import time
import shutil
import numpy as np
import pathlib
from PIL import Image
from glob import glob
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import pytesseract
def single_pic_proc(image_file):
image = np.array(Image.open(image_file).convert('RGB'))
result, image_framed = ocr(image)
return result,image_framed
image_files = glob('./input_images/*.*')
result_dir = './output_images_with_boxes/'
# If the output folder exists we will remove it and redo it.
if os.path.exists(result_dir):
shutil.rmtree(result_dir)
os.mkdir(result_dir)
for image_file in sorted(image_files):
result, image_framed = single_pic_proc(image_file) # detecting and recognizing the text
filename = pathlib.Path(image_file).name
output_file = os.path.join(result_dir, image_file.split('/')[-1])
txt_file = os.path.join(result_dir, image_file.split('/')[-1].split('.')[0]+'.txt')
txt_f = open(txt_file, 'w')
Image.fromarray(image_framed).save(output_file)
for key in result:
txt_f.write(result[key][1]+'\n')
txt_f.close()
import cv2 as cv
output_dir = pathlib.Path("./output_images_with_boxes")
# image = cv.imread(str(np.random.choice(list(output_dir.iterdir()),1)[0]))
image = cv.imread(f"{output_dir}/image7.png")
size_reshaped = (int(image.shape[1]),int(image.shape[0]))
image = cv.resize(image, size_reshaped)
cv.imshow("image", image)
cv.waitKey(0)
cv.destroyAllWindows()
filename = f"{output_dir}/image7.txt"
with open(filename, "r") as text:
for line in text.readlines():
print(line.strip("\n"))
原文链接:https://towardsdatascience.com/faster-notes-with-python-and-deep-learning-b713bbb3c186
推荐阅读
您看此文用 分 秒,转发只需1秒哦
评论