import PyPDF2 import os from urllib.request import urlopen
# import fitz from pdf2image import convert_from_path, convert_from_bytes import tempfile from pdf2image.exceptions import PDFInfoNotInstalledError, PDFPageCountError, PDFSyntaxError
from pdfminer.converter import PDFPageAggregator from pdfminer.layout import LAParams, LTImage, LTTextBoxHorizontal from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter from pdfminer.pdfparser import PDFParser, PDFDocument, PDFPage
# Source PDF processed by the helpers below and by the __main__ script.
filename = '/Users/wangchuanli/Downloads/2022_PDF.pdf'

# Password passed to PdfFileWriter.encrypt() and PdfFileReader.decrypt().
# NOTE(review): the credential is hard-coded in source — confirm this is
# acceptable for how the script is used.
password = 'iceman'

# Output path for the watermarked copy: the stem and extension of `filename`
# with '_with watermark' inserted between them.
watermarkpdf = '_with watermark'.join(os.path.splitext(filename))

# Scratch directory (relative path) that receives the per-page PDFs.
tempdir = 'tempdir'
def print_hi(name):
    """Print the greeting ``Hi, <name>`` to standard output.

    Args:
        name: Value interpolated into the greeting.
    """
    greeting = f'Hi, {name}'
    print(greeting)
def getPdfFileText(pdfReader):
    """Print the text extracted from the first page of ``pdfReader``.

    Args:
        pdfReader: Reader object exposing ``getPage(index)``; the returned
            page must expose ``extractText()``.
    """
    # Page 0 is the first page; its extracted text is printed as-is.
    content = pdfReader.getPage(0).extractText()
    print('The content: %s' % content)
def addWatermark(pdfReader):
    """Stamp every page of ``pdfReader`` with a watermark and save the result.

    The first page of ``watermark.pdf`` (relative path) is merged onto each
    page of ``pdfReader``. The stamped pages are then encrypted with the
    module-level ``password`` and written to the module-level
    ``watermarkpdf`` path.

    Args:
        pdfReader: PyPDF2 reader for the source document. ``mergePage`` is
            called on the page objects it returns.
    """
    # Keep the watermark stream open until the output has been written.
    # NOTE(review): presumably the reader pulls page data from the stream
    # lazily — confirm; closing only after write() is safe either way.
    with open('watermark.pdf', 'rb') as watermarkFile:
        watermarkPage = PyPDF2.PdfFileReader(watermarkFile).getPage(0)

        # Fresh writer that collects the watermarked pages.
        pdfWriter = PyPDF2.PdfFileWriter()
        for pageNum in range(pdfReader.numPages):
            page = pdfReader.getPage(pageNum)
            page.mergePage(watermarkPage)
            pdfWriter.addPage(page)

        # Encrypt so the output can be fed to the decryption step afterwards.
        pdfWriter.encrypt(password)

        # `with` closes the output file even if write() raises.
        with open(watermarkpdf, 'wb') as savePdfFile:
            pdfWriter.write(savePdfFile)

    print('==> add water mark finished')
def createPdf(filename, pageObj):
    """Write ``pageObj`` to ``filename`` as a single-page PDF.

    Args:
        filename: Destination path; opened in binary write mode.
        pageObj: Page object added to a new ``PyPDF2.PdfFileWriter``.
    """
    pdfWriter = PyPDF2.PdfFileWriter()
    pdfWriter.addPage(pageObj)
    # `with` closes the file even when write() raises, which the manual
    # open()/close() pair did not.
    with open(filename, 'wb') as savePdfFile:
        pdfWriter.write(savePdfFile)
def _naturalSortKey(name):
    """Return a sort key that compares digit runs in ``name`` numerically."""
    import re  # local import: `re` is not among the file's top-level imports

    # Splitting on a captured digit group alternates text and digit runs, so
    # odd positions are always digits and the keys stay comparable.
    return [int(part) if index % 2 else part
            for index, part in enumerate(re.split(r'(\d+)', name))]


def splitPdfAndMergePdf(pdfReader):
    """Split ``pdfReader`` into one-page PDFs, then merge them back together.

    Each page is written to ``<tempdir>/temp_<n>.pdf``. Afterwards the first
    page of every ``.pdf`` file found in ``tempdir`` is appended to a new
    document saved as ``MergePdf.pdf`` (relative path).

    Args:
        pdfReader: PyPDF2 reader exposing ``numPages`` and ``getPage``.
    """
    from contextlib import ExitStack  # local import, see _naturalSortKey

    # Single call instead of exists() + mkdir(), which races with other
    # processes creating the directory.
    os.makedirs(tempdir, exist_ok=True)

    # Split: one file per page of the source document.
    for pageNum in range(pdfReader.numPages):
        createPdf(os.path.join(tempdir, 'temp_' + str(pageNum) + '.pdf'),
                  pdfReader.getPage(pageNum))
    print('==>split file finshed, then merge file')

    # Merge: os.listdir() returns entries in arbitrary order, so sort them
    # numerically (temp_2 before temp_10) to keep the original page order.
    pageFiles = sorted(
        (entry for entry in os.listdir(tempdir)
         if os.path.splitext(entry)[1] == '.pdf'),
        key=_naturalSortKey,
    )

    pdfWriter = PyPDF2.PdfFileWriter()
    # The page streams must stay open until write() finishes; ExitStack then
    # closes all of them, including on error.
    with ExitStack() as openFiles:
        for entry in pageFiles:
            pageFile = openFiles.enter_context(
                open(os.path.join(tempdir, entry), 'rb'))
            pdfWriter.addPage(PyPDF2.PdfFileReader(pageFile).getPage(0))
        with open('MergePdf.pdf', 'wb') as savePdfFile:
            pdfWriter.write(savePdfFile)
def decryptPdf(filename):
    """Open ``filename``, decrypt it if needed, and print its document info.

    Decryption uses the module-level ``password``. A message is printed and
    nothing else happens when the file does not exist or the password is
    rejected.

    Args:
        filename: Path of the PDF to open.
    """
    if not os.path.exists(filename):
        # The separating space was missing, giving 'x.pdfis not exist'.
        print(filename + ' is not exist')
        return

    with open(filename, 'rb') as fileObj:
        pdfReader = PyPDF2.PdfFileReader(fileObj)
        if pdfReader.isEncrypted:
            print('==> this pdf is encryped...')
            # decrypt() reports failure through a falsy return value; stop
            # here instead of failing later when the metadata is read.
            if not pdfReader.decrypt(password):
                print('==> decrypt failed: wrong password')
                return
        # NOTE(review): the original layout does not show whether this print
        # was nested under the encrypted branch; it runs for every readable
        # file here — confirm.
        print(pdfReader.documentInfo)
# Press the green button in the gutter to run the script.
if __name__ == '__main__':
    print_hi('PyCharm')

    # Check for the input file before doing any work. Previously the image
    # export ran first and crashed on a missing file, so this message was
    # never reached.
    if not os.path.exists(filename):
        print('the %s not exist' % filename)
    else:
        # --- Render every page of the PDF to a PNG image ---
        image_dir = r'/Users/wangchuanli/Downloads/temp/'
        with open(filename, 'rb') as pdf_file:
            images = convert_from_bytes(pdf_file.read())
        os.makedirs(image_dir, exist_ok=True)
        # enumerate() gives each page a unique index; images.index(image)
        # rescanned the list and returns the first match for equal pages.
        # os.path.join also replaces the '\img_' prefix, whose '\i' is an
        # invalid escape and left a literal backslash in the file name.
        for index, image in enumerate(images):
            image.save(os.path.join(image_dir, f'img_{index}.png'), 'PNG')

        # --- Inspect the document with PyPDF2 ---
        with open(filename, 'rb') as fileObj:
            pdfReader = PyPDF2.PdfFileReader(fileObj)
            print('the pdf info: %s\n' % pdfReader.documentInfo)
            # Number of pages in the PDF
            page_count = pdfReader.getNumPages()
            print(page_count)
            # Optional steps, disabled:
            # Read the document content
            # pdf_output = pdfReader.getPage(0)
            # getPdfFileText(pdfReader)
            # Add a watermark
            # addWatermark(pdfReader)
            # Create the encrypted pdf, then decrypt and open it
            # decryptPdf(watermarkpdf)
            # Split the pdf and merge it again
            # splitPdfAndMergePdf(pdfReader)

        # --- Set up the pdfminer pipeline ---
        # `with` closes the stream that was previously left open.
        with open(filename, 'rb') as fp:
            # Parser bound to the document stream
            parser = PDFParser(fp)
            # PDF document object that stores the document structure
            doc = PDFDocument()
            # Second document object; only the disabled loop below uses it
            document = PDFDocument()
            # document = PDFDocument(parser, password)
            # Link the parser and the document object to each other
            parser.set_document(doc)
            doc.set_parser(parser)
            # Initialise the document with an empty password
            doc.initialize("")
            # Resource manager
            resource = PDFResourceManager()
            # Layout analysis parameters
            laparam = LAParams()
            # Aggregator device
            device = PDFPageAggregator(resource, laparams=laparam)
            # Page interpreter
            interpreter = PDFPageInterpreter(resource, device)

            # Read the content through the document object (disabled):
            # for page in doc.get_pages():
            #     # Run the page through the interpreter
            #     interpreter.process_page(page)
            #     # Fetch the layout from the aggregator
            #     layout = device.get_result()
            #     for text_obj in layout:
            #         # Check whether the object has a get_text attribute
            #         if hasattr(text_obj, 'get_text'):
            #             print(text_obj.get_text())
            #         if isinstance(text_obj, LTImage):  # image object
            #             print(text_obj.get_)
            #         if isinstance(text_obj, LTTextBoxHorizontal):  # horizontal text box
            #             print(text_obj.get_)  # text content

            # Process every page contained in the document (disabled):
            # for page in PDFPage.create_pages(document):
            #     interpreter.process_page(page)
            #     layout = device.get_result()
            #     for x in layout:
            #         # Text object
            #         if isinstance(x, LTTextBox):
            #             print(x.get_text().strip())
            #         # Image object
            #         if isinstance(x, LTImage):
            #             print('found an image here')
            #         # Figure object
            #         if isinstance(x, LTFigure):
            #             print('found a figure object here')
# See PyCharm help at https://www.jetbrains.com/help/pycharm/