告别「复制+粘贴」,基于深度学习的 OCR,实现 PDF 转文本
Python学习与数据挖掘
共 4700字,需浏览 10分钟
·
2021-06-04 19:23
将 pdf 转换为图片;
检测和识别图像中的文本;
展示示例输出。
from pdf2image import convert_from_path
from pdf2image.exceptions import (
PDFInfoNotInstalledError,
PDFPageCountError,
PDFSyntaxError
)
pdf_path = "path/to/file/intro_RL_Lecture1.pdf"
images = convert_from_path(pdf_path)
for i, image in enumerate(images):
fname = "image" + str(i) + ".png"
image.save(fname, "PNG")
# adapted from this source: https://github.com/courao/ocr.pytorch
%load_ext autoreload
%autoreload 2
import os
from ocr import ocr
import time
import shutil
import numpy as np
import pathlib
from PIL import Image
from glob import glob
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import pytesseract
def single_pic_proc(image_file):
image = np.array(Image.open(image_file).convert('RGB'))
result, image_framed = ocr(image)
return result,image_framed
image_files = glob('./input_images/*.*')
result_dir = './output_images_with_boxes/'
# If the output folder exists we will remove it and redo it.
if os.path.exists(result_dir):
shutil.rmtree(result_dir)
os.mkdir(result_dir)
for image_file in sorted(image_files):
result, image_framed = single_pic_proc(image_file) # detecting and recognizing the text
filename = pathlib.Path(image_file).name
output_file = os.path.join(result_dir, image_file.split('/')[-1])
txt_file = os.path.join(result_dir, image_file.split('/')[-1].split('.')[0]+'.txt')
txt_f = open(txt_file, 'w')
Image.fromarray(image_framed).save(output_file)
for key in result:
txt_f.write(result[key][1]+'\n')
txt_f.close()
import cv2 as cv
output_dir = pathlib.Path("./output_images_with_boxes")
# image = cv.imread(str(np.random.choice(list(output_dir.iterdir()),1)[0]))
image = cv.imread(f"{output_dir}/image7.png")
size_reshaped = (int(image.shape[1]),int(image.shape[0]))
image = cv.resize(image, size_reshaped)
cv.imshow("image", image)
cv.waitKey(0)
cv.destroyAllWindows()
filename = f"{output_dir}/image7.txt"
with open(filename, "r") as text:
for line in text.readlines():
print(line.strip("\n"))
福利时间
近日吴恩达新书《Machine Learning Yearning》中文版开放下载!
《Machine Learning Yearning》是吴恩达历时两年,根据自己多年实践经验整理出来的一本机器学习、深度学习实践经验宝典。里面讲的机器学习课程非常棒,很适合数学基础不是很好的人自学,最近中文版也开放下载阅读了!
如何下载?
1. 识别并关注下方公众号;
2. 在下面公众号(非本号)后台回复关键字「吴恩达」。
评论