我一直在寻找一种从 .docx 和 .xlsx 这类文件中提取图片的方法,下面是我想出的解决方案。它遍历给定的目录,把每个具有相应扩展名的文件复制一份,并将副本重命名为 filename.zip。然后它遍历 zip 内部的文件结构,提取所有带有图片扩展名的文件,并将它们重命名为“原始文件名 + 唯一编号”的形式。最后,它删除解压过程中创建的目录树。
从文本文档中提取图片是我工作的一部分,所以从长远来看,这实际上可以为我的公司节省数千小时。
所有代码都在下面。我真正想问的是:有更好的方法吗?有更高效的做法吗?能否扩展以支持其他格式?是否可以把文本也提取到 .txt 文件中——毕竟用记事本打开文件比用 Word 打开要快得多?
此解决方案适用于我的Linux机器,我可以提取图片,但我还没有在Windows系统上进行测试。
#!/usr/bin/python3
import shutil
import os
import zipfile
def zipDoc(aFile, dirPath):
    """Copy a document to a sibling ``.zip`` file and open that copy.

    Args:
        aFile: File name of the document (e.g. ``report.docx``) inside dirPath.
        dirPath: Directory that contains the document; a trailing path
            separator is optional.

    Returns:
        An open ``zipfile.ZipFile`` for the copy. The caller must close it and
        delete the copy (its path is available as ``.filename``).

    Raises:
        FileNotFoundError: If the document does not exist.
        zipfile.BadZipFile: If the document is not a valid zip archive; the
            copy is removed before the error propagates.
    """
    # splitext strips only the final extension, so "my.report.docx" becomes
    # "my.report.zip". Slicing at the first "." truncated such names and
    # raised ValueError for names without any dot.
    shortFN = os.path.splitext(aFile)[0]
    # os.path.join works whether or not dirPath ends with a separator.
    zipName = os.path.join(dirPath, shortFN + ".zip")
    shutil.copy2(os.path.join(dirPath, aFile), zipName)
    try:
        return zipfile.ZipFile(zipName)
    except zipfile.BadZipFile:
        # Do not leave the temporary copy behind when it cannot be opened.
        os.remove(zipName)
        raise
def hasPicExtension(aFile):
    """Return True if aFile ends with a known picture extension.

    The comparison is case-insensitive. The previous explicit list was missing
    a comma between ".JPEG" and ".JPG", which fused them into the single
    suffix ".JPEG.JPG", so upper-case JPEG files were never recognised.

    Args:
        aFile: File name or archive member name to test.

    Returns:
        True for names ending in .jpeg, .jpg, .png or .bmp (any letter case),
        otherwise False.
    """
    picEndings = (".jpeg", ".jpg", ".png", ".bmp")
    # str.endswith accepts a tuple of candidate suffixes.
    return aFile.lower().endswith(picEndings)
def delDOCXEvidence(somePath):
    """Remove the ``word/media`` and ``word`` directories left by a .docx extraction.

    Paths are built with os.path.join, so the same code runs on Linux and
    Windows. Directories that do not exist are skipped, which happens when the
    document contained no pictures and nothing was extracted.

    Args:
        somePath: Directory the archive was extracted into; a trailing
            separator is optional.

    Raises:
        OSError: If a directory exists but is not empty (os.rmdir only removes
            empty directories).
    """
    # Innermost directory first, because os.rmdir refuses non-empty directories.
    for parts in (("word", "media"), ("word",)):
        try:
            os.rmdir(os.path.join(somePath, *parts))
        except FileNotFoundError:
            pass  # nothing was extracted there
def delXLSXEvidence(somePath):
    """Remove the ``xl/media`` and ``xl`` directories left by an .xlsx extraction.

    Paths are built with os.path.join, so the same code runs on Linux and
    Windows. Directories that do not exist are skipped, which happens when the
    workbook contained no pictures and nothing was extracted.

    Args:
        somePath: Directory the archive was extracted into; a trailing
            separator is optional.

    Raises:
        OSError: If a directory exists but is not empty (os.rmdir only removes
            empty directories).
    """
    # Innermost directory first, because os.rmdir refuses non-empty directories.
    for parts in (("xl", "media"), ("xl",)):
        try:
            os.rmdir(os.path.join(somePath, *parts))
        except FileNotFoundError:
            pass  # nothing was extracted there
def extractPicsFromDir(dirPath=""):
    """Extract every embedded picture from the .docx/.xlsx files in a directory.

    Each matching document is opened directly as a zip archive and its picture
    members are streamed to disk. No temporary ``<name>.zip`` copy and no
    extracted ``word/`` or ``xl/`` tree is created, so an existing file or
    folder with one of those names is never overwritten or deleted, and a
    document without pictures needs no cleanup.

    Each picture is written to ``<document name without extension> - <n>`` in
    dirPath, numbered from 1 per document in archive order. No image extension
    is appended to the output name.

    Args:
        dirPath: Directory to scan (not recursive); a trailing separator is
            optional.

    Raises:
        SystemExit: With code 1, after printing a message, when dirPath is not
            a directory.
    """
    if not os.path.isdir(dirPath):
        print("Not a directory path!")
        raise SystemExit(1)

    for dirFile in os.listdir(dirPath):
        dirFileName = os.fsdecode(dirFile)
        if not dirFileName.endswith((".docx", ".xlsx")):
            continue

        try:
            useZIP = zipfile.ZipFile(os.path.join(dirPath, dirFileName))
        except zipfile.BadZipFile:
            # E.g. Office "~$name.docx" lock files: skip them rather than
            # abort the whole batch.
            print("Skipping " + dirFileName + ": not a valid zip archive")
            continue

        # splitext keeps multi-dot names distinct ("a.b.docx" -> "a.b");
        # cutting at the first dot made such documents overwrite each other.
        shortFN = os.path.splitext(dirFileName)[0]
        picNum = 1  # running picture number within this document
        with useZIP:  # guarantees the archive handle is closed
            for zippedFile in useZIP.namelist():
                if not hasPicExtension(zippedFile):
                    continue
                target = os.path.join(dirPath, shortFN + " - " + str(picNum))
                with useZIP.open(zippedFile) as src, open(target, "wb") as dst:
                    shutil.copyfileobj(src, dst)
                picNum += 1
# Script entry point: prompt for a directory on stdin, then extract the
# pictures from the .docx/.xlsx files directly inside it (not recursive).
uDir = input("Enter your directory: ")
extractPicsFromDir(uDir)
答案 0 :(得分:0)
Excel 文件本质上是 zip 压缩包,因此很容易从 Excel 或 docx 文件中提取图像:
import zipfile
from PIL import Image, ImageFilter
import io
# Gaussian blur filter (radius 40); not referenced elsewhere in the visible code.
blur = ImageFilter.GaussianBlur(40)
def redact_images(filename, FilePath):
    """Copy an Office zip archive and save each embedded image to disk.

    A new archive named ``<name>_redacted<ext>`` is written next to the input.
    Every member is copied into it; members ending in .png, .jpeg or .gif are
    also decoded with PIL, saved under FilePath, and re-encoded before being
    stored in the new archive.

    Args:
        filename: Path of the input archive (.xlsx, .docx, ...).
        FilePath: Where extracted images are saved. If it is an existing
            directory the image name is joined onto it; otherwise it is used
            as a plain string prefix, as before.
    """
    import os  # local import so the block does not depend on the file's import list

    # Build the output name from the real extension. str.replace(".xlsx", ...)
    # returned the input name unchanged for e.g. .docx, so the output archive
    # truncated the input while it was still being read.
    root, ext = os.path.splitext(filename)
    outfile = root + "_redacted" + ext

    with zipfile.ZipFile(filename) as inzip, zipfile.ZipFile(outfile, "w") as outzip:
        for info in inzip.infolist():
            name = info.filename
            content = inzip.read(info)
            # NOTE(review): ".jpg" is not in this list, so such members are
            # copied through untouched - confirm whether that is intended.
            if name.endswith((".png", ".jpeg", ".gif")):
                fmt = name.split(".")[-1]  # PIL format name taken from the suffix
                Name = name.split("/")[-1]  # member name without its folders
                if os.path.isdir(FilePath):
                    # A directory given without a trailing separator used to
                    # yield "<dir><name>" next to the directory instead.
                    target = os.path.join(FilePath, Name)
                else:
                    target = FilePath + str(Name)
                img = Image.open(io.BytesIO(content))
                img.save(target)
                outb = io.BytesIO()
                img.save(outb, fmt)
                content = outb.getvalue()
                # Keep the member header consistent with the re-encoded bytes.
                info.file_size = len(content)
                info.CRC = zipfile.crc32(content)
            outzip.writestr(info, content)
filename:输入的 Excel 文件的路径
FilePath:保存提取出的图像的位置