澄清:我不想将页面添加到PDF文件中。我想将内容添加到一个非常大的PDF页面。页面有时会发生变化,每次内容都不同。
我正在使用 PyPDF2 和 reportlab 向一个很大的 PDF 页面(约 10MB)添加少量内容。这需要 30 秒甚至更长时间,其中大部分时间花在解析原始文件上。
通常还需要使用 mergeRotatedTranslatedPage 对页面进行变换。
我的想法是只生成一次原始内容数组,之后每次要添加内容时复制它。为此我修改了 PageObject._merge。这种做法……勉强可行:耗时现在已降到 18 秒。
有没有更好的方法来加快这个过程?一页18秒仍然很慢。
答案 0 :(得分:1)
如果您想使用处理器所有核心的100%容量,您可以使用“多处理”来实现,如下所示:
我们先统计 PDF 的页数和处理器的核心数,据此计算每个核心需要处理多少页。
然后把各自负责的页面范围分发给每个核心,最后把所有生成的 PDF 合并成一个文件。
# -*- coding: utf-8 -*-
from io import BytesIO
from PyPDF2 import PdfFileWriter, PdfFileReader, PdfFileMerger
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import A4
from reportlab.lib.colors import Color
from webcolors import hex_to_rgb
import base64
import multiprocessing
import math
class SkyMark:
    """Stamp a repeated text watermark on every page of a PDF, one process per page slice.

    The input PDF is split into contiguous page ranges, each range is
    watermarked in its own ``multiprocessing.Process``, and the partial PDFs
    are merged back together in page order.
    """

    # Class-level defaults; workMultiprocess() overwrites both on the instance.
    # They must be plain class attributes: writing ``self.mainPdf = ''`` in the
    # class body raises NameError because ``self`` does not exist there.
    mainPdf = ''
    mask = ''

    def begin(self, request):
        """Entry point meant to be called from a REST handler.

        Reads the base64 PDF from ``request.json``, watermarks it with the
        fixed text ``'MyWaterMark text'`` and returns the result as a base64
        ``str``.

        Raises:
            AttributeError: if workMultiprocess() failed and returned ``None``
                (the ``.decode`` call below then has nothing to decode).
        """
        # NOTE(review): the key is spelled 'pdfBase4' (not 'pdfBase64'); kept
        # as-is because clients may depend on it -- confirm against the caller.
        stringPdfBase64 = str(request.json['pdfBase4'])
        blob = self.workMultiprocess(stringPdfBase64, 'MyWaterMark text')
        return blob.decode('utf-8')

    def workMultiprocess(self, stringPdfBase64, message='SkyMark'):
        """Watermark a base64-encoded PDF using one process per page slice.

        Args:
            stringPdfBase64: the PDF as base64 text; anything up to and
                including the last comma (e.g. a data-URI prefix) is dropped.
            message: text repeated across the watermark page.

        Returns:
            The merged, watermarked PDF as base64 ``bytes``, or ``None`` when
            any step fails (the error is printed, not raised).
        """
        try:
            # One-page PDF holding the watermark, merged onto every page.
            self.mask = self.getMaskWaterMark(message)

            # Decode the main PDF into a reader shared with the workers.
            sanitizedPdf = stringPdfBase64.rsplit(',', 1)[-1]
            pdfBytes = base64.b64decode(sanitizedPdf)
            self.mainPdf = PdfFileReader(BytesIO(pdfBytes), strict=False)
            numPaginas = self.mainPdf.getNumPages()

            coresProcessor = int(multiprocessing.cpu_count()) or 22
            pageRanges = self._page_ranges(numPaginas, coresProcessor)

            # The context manager shuts the manager's server process down on
            # exit; without it that helper process is left running.
            with multiprocessing.Manager() as manager:
                return_dict = manager.dict()
                jobs = []
                for procnum, (pagesFrom, pagesTo) in enumerate(pageRanges):
                    p = multiprocessing.Process(
                        target=self.doByPage,
                        args=(pagesFrom, pagesTo, procnum, return_dict),
                    )
                    jobs.append(p)
                    p.start()
                for proc in jobs:
                    proc.join()
                # Copy the results out before the manager goes away.
                results = return_dict.values()

            # A worker that died never stored its chunk; merging anyway would
            # silently produce a PDF with missing pages.
            if len(results) != len(jobs):
                raise RuntimeError(
                    f'{len(jobs) - len(results)} of {len(jobs)} watermark workers failed'
                )

            # Workers finish in arbitrary order; restore page order for the merge.
            ordered = sorted(results, key=lambda k: k['procnum'])
            return self.mergePdfsArray([chunk['dataB64'] for chunk in ordered])
        except Exception as e:
            print(f'Error {e}')
            return None

    @staticmethod
    def _page_ranges(num_pages, workers):
        """Split ``num_pages`` into at most ``workers`` contiguous ``(start, stop)`` ranges.

        Each range holds ``ceil(num_pages / workers)`` pages except possibly
        the last one. ``stop`` is exclusive. A zero-page input yields the
        single empty range ``(0, 0)``.
        """
        per_worker = int(math.ceil(num_pages / workers))
        ranges = []
        start = 0
        for _ in range(workers):
            stop = min(start + per_worker, num_pages)
            ranges.append((start, stop))
            if stop >= num_pages:
                break
            start = stop
        return ranges

    def doByPage(self, fromPage, toPage, procnum, return_dict):
        """Worker body: watermark pages ``[fromPage, toPage)`` of ``self.mainPdf``.

        The resulting partial PDF is base64-encoded and stored in
        ``return_dict[procnum]`` as ``{'procnum': ..., 'dataB64': ...}``.
        Nothing is returned; the shared dict is the only output.
        """
        output = PdfFileWriter()
        waterMark = self.mask.getPage(0)
        for i in range(fromPage, toPage):
            page = self.mainPdf.getPage(i)
            page.mergePage(waterMark)
            page.compressContentStreams()
            output.addPage(page)
        letter_data = BytesIO()
        output.write(letter_data)
        dataB64 = base64.b64encode(letter_data.getvalue())
        return_dict[procnum] = {'procnum': procnum, 'dataB64': dataB64}

    def getMaskWaterMark(self, texto, *, font_name='Helvetica', font_size=22,
                         color='#000000', opacity=0.08):
        """Build a single A4 page filled with rows of ``texto`` and return it as a reader.

        Args:
            texto: watermark text; it is repeated five times per row.
            font_name: font passed to the canvas ``setFont`` call.
            font_size: font size passed to the canvas ``setFont`` call.
            color: fill colour as a hex string such as ``'#000000'``.
            opacity: alpha value given to the fill colour.

        Returns:
            A ``PdfFileReader`` over the in-memory watermark PDF.
        """
        x = 1
        y = 840
        bgTexto = (' ' + texto) * 5
        cantidadRenglones = 100

        mask_stream = BytesIO()
        watermark_canvas = canvas.Canvas(mask_stream, pagesize=A4)
        watermark_canvas.setFont(font_name, font_size)

        # hex_to_rgb yields 0-255 integers while reportlab's Color expects
        # 0-1 floats, so scale each channel (black is unaffected either way).
        r, g, b = (channel / 255.0 for channel in hex_to_rgb(color))
        watermark_canvas.setFillColor(Color(r, g, b, alpha=opacity))

        # Rows are drawn 25 units apart, starting just below y.
        for i in range(1, cantidadRenglones):
            watermark_canvas.drawString(x, y - (i * 25), bgTexto)
        watermark_canvas.save()

        mask_stream.seek(0)
        return PdfFileReader(mask_stream, strict=False)

    def mergePdfsArray(self, arrayPdfsBase64):
        """Concatenate base64-encoded PDFs, in the given order, into one PDF.

        Returns:
            The merged PDF as base64 ``bytes``.
        """
        merge = PdfFileMerger()
        for encoded in arrayPdfsBase64:
            src = PdfFileReader(BytesIO(base64.b64decode(encoded)), strict=False)
            merge.append(src)
        letter_data = BytesIO()
        merge.write(letter_data)
        return base64.b64encode(letter_data.getvalue())