import os,sys
import img2pdf
'''
程序功能: 用python將小冊子打印掃描的A3幅面雙頁亂碼的PDF文件轉換成A4幅面順碼的PDF文件
問題: A3幅面的雙面打印的在中間用騎馬釘裝訂成小冊子, 拆開中間裝訂的訂書釘由復印機一次性掃描成PDF文件。閱讀不方便:頁碼錯亂,A3幅面,左右兩頁。
本程序將掃描的A3幅面的PDF文件轉換成理順頁碼單面的A4幅面的PDF文件。
編程思路: PDF文件==>頁面png(圖片文件)(用到:pdf2image,poppler中的 pdftoppm.exe )==>圖片文件A3大小切分成2個A4幅面的圖片文件 (用到:PIL中的Image)==>圖片文件組成PDF文件(用到:img2pdf中的img2pdf.convert(pngList))
附注:實測中使用png格式的形成的A4幅面的PDF較小。
重要事項:
(1) 程序安裝在d:盤(或e:)的d:\leader
(2) 解壓poppler-0.68.0_x86后得到的bin目錄下的文件安裝到 d:\leader\bin;并將d:\leader\bin加入到windows的path變量中。(下面的批處理文件已經解決這個問題)
(3) 批處理文件可以如下:
rem main.cmd
path d:\leader\bin;%path%
d:
cd \leader
python main.prg %1
(4) 運行時,可以: main d:\A3pdf目錄
也可以: 直接main運行。不添加命令行參數使用缺省A3PDF目錄為: d:\leader\pdf
使用時只需將要轉換的A3pdf文件copy到 d:\leader\pdf之下即可。
轉換完成的文件在 d:\leader\pdf\A4子目錄下。
中間轉換時產生的圖片文件在 d:\leader\pdf\PNG 子目錄下。這些圖片文件可以刪除。
版本 0.1
版本 01,使用全局變量,各個函數均在一個文件中。
程序使用了幾個庫:
pip install pillow
pip install PyPdf3
pip install pdf2image
pip install img2pdf
程序還使用了 poppler-0.68.0_x86
pdf2image是包裝器,poppler是轉換過程真正需要的。
編程 葉照清 363992124@qq.com
日期 2021.01.25
=============
Poppler for Windows
I have been using the Poppler library for some time, over a series of various projects. It’s an open source set of libraries and command line tools, very useful for dealing with PDF files. Poppler is targeted primarily for the Linux environment, but the developers have included Windows support as well in the source code. Getting the executables (exe) and/or dlls for the latest version however is very difficult on Windows. So after years of pain, I jumped on oDesk and contracted Ilya Kitaev, to both compile with Microsoft Visual Studio, and also prepare automated tools for easy compiling in the future. Update: MSVC isn’t very well supported, these days the download is based off MinGW.
So now, you can run the following utilities from Windows!
PDFToText – Extract all the text from PDF document. I suggest you use the -Layout option for getting the content in the right order.
PDFToHTML – Which I use with the -xml option to get an XML file listing all of the text segments’ text, position and size, very handy for processing in C#
PDFToCairo – For exporting to images types, including SVG!
Many more smaller utilities
Download
Latest binary : poppler-0.68.0_x86
http://blog.alivate.com.au/wp-content/uploads/2018/10/poppler-0.68.0_x86.7z
'''from pdf2image.exceptions import (PDFInfoNotInstalledError,PDFPageCountError,PDFSyntaxError
)
from pdf2image import convert_from_path
import os,sys,PyPDF3
from PIL import Image


def pdf2img(pdf_file):
    """Render every page of *pdf_file* to a PNG inside ``path_png``.

    Pages are written as ``<pdf stem>_<idx>.png`` where ``idx`` is the
    zero-based page index formatted with at least two digits.

    Reads the module-level global ``path_png`` (output directory).

    Any exception raised while rendering or saving is printed and swallowed,
    so the caller is not interrupted here.
    """
    basename = os.path.splitext(os.path.basename(pdf_file))[0]
    try:
        images = convert_from_path(pdf_file)
        for idx, img in enumerate(images):
            img.save(os.path.join(path_png, f'{basename}_{idx:02d}.png'))
    except Exception as e:
        # Best-effort reporting kept from the original script.
        print(e)


def pic_half(filename1, No, MaxPage):
    """Split one scanned double-page image into its left and right halves.

    Args:
        filename1: Path of the scanned page image *without* the ``.png``
            extension; its base name is expected to end in ``_<index>``.
        No: 1-based number of the scanned sheet side.
        MaxPage: Total number of single pages in the finished booklet.

    The two halves are saved into the global ``path_png`` directory as
    ``<stem>_A4_<page>.png``. Page numbering follows the booklet layout
    encoded here: on odd sheet sides the left half is page
    ``MaxPage + 1 - No`` and the right half is page ``No``; on even sheet
    sides the two are swapped.
    """
    scan_name = os.path.basename(filename1)
    # Drop the trailing "_<index>" by splitting on the last underscore rather
    # than slicing a fixed three characters, so indexes >= 100 still work.
    stem, sep, _ = scan_name.rpartition('_')
    if not sep:
        stem = scan_name
    out_prefix = os.path.join(path_png, stem)

    far_page = MaxPage + 1 - No
    if No % 2 == 1:
        page_numbers = (far_page, No)   # (left half, right half)
    else:
        page_numbers = (No, far_page)

    # The context manager closes the image even if crop/save raises.
    with Image.open(filename1 + '.png') as img:
        half_width = img.size[0] // 2
        height = img.size[1]
        for side, page in enumerate(page_numbers):
            box = (half_width * side, 0, half_width * (side + 1), height)
            img.crop(box).save(f'{out_prefix}_A4_{page:02d}.png')


def one_pdf(pdf_file1):
    """Render *pdf_file1* to PNGs and split every scanned page in two.

    Reads the module-level globals ``maxP`` (number of pages in the scanned
    PDF) and ``path_png`` (directory holding the intermediate PNG files).
    """
    pdf2img(pdf_file1)
    basename = os.path.splitext(os.path.basename(pdf_file1))[0]
    for No in range(1, maxP + 1):
        # pdf2img numbers its output from 0, sheet sides are counted from 1.
        A3_png = os.path.join(path_png, f'{basename}_{No - 1:02d}')
        print(A3_png)
        pic_half(A3_png, No, maxP * 2)


def doImg2Pdf(fileName):
    """Assemble the split page images into ``<stem>_A4.pdf``.

    Args:
        fileName: Directory that contains the ``<stem>_A4_<page>.png`` files.
            An empty value falls back to the current working directory.

    Reads the module-level globals ``pdf_b_name`` (file name of the source
    PDF), ``maxP`` (number of pages in the scanned PDF) and ``A4_dir``
    (output directory).
    """
    bb = os.path.splitext(pdf_b_name)[0]
    png_dir = fileName or ''
    pngList = []
    for ii in range(1, maxP * 2 + 1):
        pngName = f'{bb}_A4_{ii:02d}.png'
        print('\t' + pngName)
        pngList.append(os.path.join(png_dir, pngName))

    # Convert before opening the output so that a failed conversion does not
    # leave an empty PDF behind.
    pfn_bytes = img2pdf.convert(pngList)
    out_pdf = os.path.join(A4_dir, f'{bb}_A4.pdf')
    with open(out_pdf, "wb") as f:
        f.write(pfn_bytes)
    print(f"{out_pdf} 轉換完成。\n")
################################
# Script entry: resolve the working directories, list the PDF files and
# convert them one by one. The helper functions in this file read path_png,
# maxP, A4_dir and pdf_b_name as module-level globals, so these names are
# assigned here at module level.
root = os.path.abspath(os.path.dirname(__file__))
dd = ''
if root.find(":") == 1:
    # The script path starts with a drive letter ("d:"); reuse that drive
    # for the default input directory.
    print(__file__)
    dd = root[:2]
path = dd + r'\LEADER\PDF'
PDF_list = []
maxP = 10  # overwritten with the real page count of each PDF below

# An optional first command-line argument overrides the default directory.
if len(sys.argv) > 1:
    path = sys.argv[1]

if not os.path.isdir(path):
    print(f'{path} :非法目錄')
    sys.exit(1)

# Work with an absolute path: the loop below changes the working directory,
# which would invalidate a relative path after the first file.
path = os.path.abspath(path)
path_a4 = os.path.join(path, 'A4')
path_png = os.path.join(path, 'PNG')
os.makedirs(path_a4, exist_ok=True)
os.makedirs(path_png, exist_ok=True)

# Keep only regular files whose extension is .pdf (case-insensitive).
PDF_list = sorted(
    entry for entry in os.listdir(path)
    if entry.lower().endswith('.pdf')
    and os.path.isfile(os.path.join(path, entry))
)
print("需轉換的文件列表:")
for idx, entry in enumerate(PDF_list):
    print(f'{idx:04d}\t{entry}')
print(f"\n需轉換的文件總數:{len(PDF_list):04d}")

if not PDF_list:
    print("無PDF文件!")
    sys.exit(0)

#Main_loop
for pdf_file in PDF_list:
    full_fileName = os.path.join(path, pdf_file)
    print(f'{full_fileName} 轉換中。。。')
    try:
        # The with-block closes the stream even when parsing fails.
        with open(full_fileName, 'rb') as pdf_stream:
            maxP = PyPDF3.PdfFileReader(pdf_stream).numPages
    except Exception:
        print(f"{full_fileName} 不是合法的PDF文件!")
        sys.exit(1)

    one_pdf(full_fileName)

    png_dir = path_png
    A4_dir = path_a4
    pdf_b_name = pdf_file
    # doImg2Pdf may look the PNG files up relative to the working directory.
    os.chdir(png_dir)
    doImg2Pdf(png_dir)
#eof