如何使用Python获取网站的屏幕截图/图像?

时间:2009-07-28 22:48:14

标签: python screenshot webpage backend

我想要实现的是从python中的任何网站获取网站截图。

环境:Linux

14 个答案:

答案 0 :(得分:43)

以下是使用webkit的简单解决方案: http://webscraping.com/blog/Webpage-screenshots-with-webkit/

import sys
import time
from PyQt4.QtCore import *
from PyQt4.QtGui import *
from PyQt4.QtWebKit import *

class Screenshot(QWebView):
    """Web view that loads a page and saves a full-page rendering of it.

    Constructing an instance also constructs the ``QApplication`` (kept on
    ``self.app``) whose event queue is pumped while a page is loading.
    """

    def __init__(self):
        # Created before QWebView.__init__: Qt widgets need an application
        # object to exist first.
        self.app = QApplication(sys.argv)
        QWebView.__init__(self)
        self._loaded = False
        # Success flag reported by the most recent loadFinished signal.
        self._load_ok = False
        self.loadFinished.connect(self._loadFinished)

    def capture(self, url, output_file):
        """Load *url* and write a rendering of the whole page to *output_file*.

        Returns:
            True when the page loaded and the image was saved, False when
            the load failed or the image could not be written.
        """
        self.load(QUrl(url))
        if not self.wait_load():
            # Previously a failed load was rendered and saved anyway.
            print('failed to load %s' % url)
            return False
        # set to webpage size
        frame = self.page().mainFrame()
        self.page().setViewportSize(frame.contentsSize())
        # render image
        image = QImage(self.page().viewportSize(), QImage.Format_ARGB32)
        painter = QPainter(image)
        try:
            frame.render(painter)
        finally:
            # Always release the painter, even if rendering raises.
            painter.end()
        print('saving %s' % output_file)
        return image.save(output_file)

    def wait_load(self, delay=0):
        """Process application events until the current load finishes.

        Args:
            delay: Seconds to sleep between event-processing passes.

        Returns:
            The success flag delivered with the ``loadFinished`` signal.
        """
        # process app events until page loaded
        while not self._loaded:
            self.app.processEvents()
            time.sleep(delay)
        # Re-arm the flag so the next capture() waits for its own load.
        self._loaded = False
        return self._load_ok

    def _loadFinished(self, result):
        """Slot for ``loadFinished``: record completion and its outcome."""
        self._load_ok = bool(result)
        self._loaded = True

# Capture two pages, in order, with a single Screenshot instance.
shot = Screenshot()
for page_url, image_file in (
        ('http://webscraping.com', 'website.png'),
        ('http://webscraping.com/blog', 'blog.png')):
    shot.capture(page_url, image_file)

答案 1 :(得分:35)

这是我的解决方案,它参考了多个来源。它会捕获完整的网页截图,(可选地)对其进行裁剪,并从裁剪后的图像生成缩略图。以下是要求:

要求:

  1. 安装NodeJS
  2. 使用Node的软件包管理器安装phantomjs:npm -g install phantomjs
  3. 安装selenium(在你的virtualenv中,如果你使用的话)
  4. 安装imageMagick
  5. 将phantomjs添加到系统路径(在Windows上)

    import os
    from subprocess import Popen, PIPE
    from selenium import webdriver
    
    def abspath(*p):
        """Join the given path components and return the absolute result."""
        joined = os.path.join(*p)
        return os.path.abspath(joined)

    # Absolute path of the directory that contains this file.
    ROOT = abspath(os.path.dirname(__file__))
    
    
    def execute_command(command):
        """Run *command* through the shell and fail if it prints anything.

        Args:
            command: The shell command line, as a single string.

        Raises:
            Exception: If the command writes anything other than whitespace
                to standard output; the captured output is the exception
                argument.
        """
        process = Popen(command, shell=True, stdout=PIPE)
        # communicate() reads stdout to EOF and also waits for the child, so
        # the pipe is closed and the process is reaped instead of leaking.
        result, _ = process.communicate()
        if result and not result.isspace():
            raise Exception(result)
    
    
    def do_screen_capturing(url, screen_path, width, height):
        """Open *url* in PhantomJS and save a screenshot to *screen_path*.

        Args:
            url: Address of the page to capture.
            screen_path: File the screenshot is written to.
            width: Browser window width; applied only when both width and
                height are truthy.
            height: Browser window height; see *width*.
        """
        print("Capturing screen..")
        driver = webdriver.PhantomJS()
        # it save service log file in same directory
        # if you want to have log file stored else where
        # initialize the webdriver.PhantomJS() as
        # driver = webdriver.PhantomJS(service_log_path='/var/log/phantomjs/ghostdriver.log')
        try:
            driver.set_script_timeout(30)
            if width and height:
                driver.set_window_size(width, height)
            driver.get(url)
            driver.save_screenshot(screen_path)
        finally:
            # Shut the browser down even when the capture fails; otherwise
            # every call leaves a PhantomJS process behind.
            driver.quit()
    
    
    def do_crop(params):
        """Crop the captured image with ImageMagick's ``convert``.

        Reads ``screen_path`` (input), ``width`` / ``height`` (size of the
        crop, anchored at offset +0+0) and ``crop_path`` (output) from
        *params*.
        """
        print("Croping captured image..")
        source = params['screen_path']
        geometry = '%sx%s+0+0' % (params['width'], params['height'])
        target = params['crop_path']
        # NOTE(review): the pieces are joined with plain spaces and handed to
        # the shell, so paths containing spaces would be split - confirm
        # whether callers can pass such paths.
        execute_command(' '.join(('convert', source, '-crop', geometry, target)))
    
    
    def do_thumbnail(params):
        """Create a thumbnail of the cropped image with ``convert``.

        Reads ``crop_path`` (input), ``width`` / ``height`` (thumbnail size)
        and ``thumbnail_path`` (output) from *params*; the Lanczos filter is
        used for resizing.
        """
        print("Generating thumbnail from croped captured image..")
        source = params['crop_path']
        size = '%sx%s' % (params['width'], params['height'])
        target = params['thumbnail_path']
        pieces = ('convert', source, '-filter', 'Lanczos',
                  '-thumbnail', size, target)
        execute_command(' '.join(pieces))
    
    
    def get_screen_shot(**kwargs):
        """Capture a page and optionally derive a cropped image and thumbnail.

        Keyword Args:
            url: Address of the page to capture (required).
            width: Browser window width for the capture (default 1024).
            height: Browser window height for the capture (default 768).
            filename: Name of the capture file (default 'screen.png').
            path: Directory the files are written to (default ROOT).
            crop: Crop the captured image (default False).
            crop_width: Crop width (defaults to *width*).
            crop_height: Crop height (defaults to *height*).
            crop_replace: When true the crop overwrites the capture;
                otherwise it is written to 'crop_' + filename.
            thumbnail: Generate a thumbnail from the crop; requires crop.
            thumbnail_width: Thumbnail width (defaults to *width*).
            thumbnail_height: Thumbnail height (defaults to *height*).
            thumbnail_replace: When true the thumbnail overwrites the crop;
                otherwise it is written to 'thumbnail_' + filename.

        Returns:
            Tuple ``(screen_path, crop_path, thumbnail_path)``. A path whose
            step was not requested equals ``screen_path``.

        Raises:
            KeyError: If ``url`` is not supplied.
            Exception: If a thumbnail is requested without ``crop=True``.
        """
        url = kwargs['url']
        crop = kwargs.get('crop', False) # crop the captured screen
        thumbnail = kwargs.get('thumbnail', False) # generate thumbnail from screen, requires crop=True

        # Reject an impossible combination before doing any other work.
        # (Call form of raise: the old "raise Exception, msg" statement is a
        # syntax error on Python 3.)
        if thumbnail and not crop:
            raise Exception('Thumnail generation requires crop image, set crop=True')

        width = int(kwargs.get('width', 1024)) # screen width to capture
        height = int(kwargs.get('height', 768)) # screen height to capture
        filename = kwargs.get('filename', 'screen.png') # file name e.g. screen.png
        path = kwargs.get('path', ROOT) # directory path to store screen

        crop_width = int(kwargs.get('crop_width', width)) # the width of crop screen
        crop_height = int(kwargs.get('crop_height', height)) # the height of crop screen
        crop_replace = kwargs.get('crop_replace', False) # does crop image replace original screen capture?

        thumbnail_width = int(kwargs.get('thumbnail_width', width)) # the width of thumbnail
        thumbnail_height = int(kwargs.get('thumbnail_height', height)) # the height of thumbnail
        thumbnail_replace = kwargs.get('thumbnail_replace', False) # does thumbnail image replace crop image?

        screen_path = abspath(path, filename)
        crop_path = thumbnail_path = screen_path

        do_screen_capturing(url, screen_path, width, height)

        if crop:
            if not crop_replace:
                crop_path = abspath(path, 'crop_' + filename)
            do_crop({
                'width': crop_width, 'height': crop_height,
                'crop_path': crop_path, 'screen_path': screen_path})

            if thumbnail:
                if thumbnail_replace:
                    # Replace the *crop*, as the option promises; this used
                    # to overwrite the original capture instead.
                    thumbnail_path = crop_path
                else:
                    thumbnail_path = abspath(path, 'thumbnail_' + filename)
                do_thumbnail({
                    'width': thumbnail_width, 'height': thumbnail_height,
                    'thumbnail_path': thumbnail_path, 'crop_path': crop_path})
        return screen_path, crop_path, thumbnail_path
    
    
    if __name__ == '__main__':
        # Requirements:
        #   - Install NodeJS
        #   - Using Node's package manager install phantomjs:
        #       npm -g install phantomjs
        #   - install selenium (in your virtualenv, if you are using that)
        #   - install imageMagick
        #   - add phantomjs to system path (on windows)

        # Capture this question's page, then crop it and build a 200x150
        # thumbnail, keeping every intermediate file.
        capture_options = dict(
            url='http://stackoverflow.com/questions/1197172/how-can-i-take-a-screenshot-image-of-a-website-using-python',
            filename='sof.png',
            crop=True,
            crop_replace=False,
            thumbnail=True,
            thumbnail_replace=False,
            thumbnail_width=200,
            thumbnail_height=150,
        )
        screen_path, crop_path, thumbnail_path = get_screen_shot(**capture_options)
    

    这些是生成的图像:

答案 2 :(得分:10)

在Mac上有webkit2png,在Linux + KDE上可以使用khtml2png。我试过前者,效果很好;也听说有人在使用后者。

我最近发现了QtWebKit,它声称是跨平台的(我猜是Qt把WebKit集成进了他们的库)。但我从未尝试过,所以无法告诉你更多。

QtWebKit链接展示了如何从Python中访问它。对于其他工具,您至少应该可以通过subprocess调用它们来达到同样的效果。

答案 3 :(得分:5)

我无法评论ars的回答,但实际上我使用QtWebkit运行Roland Tapken's code并且它运行良好。

只是想确认一下Roland在他的博客上发布的帖子在Ubuntu上运行得很好。我们的生产版本最终没有使用他写的任何内容,但我们使用PyQt / QtWebKit绑定取得了很大的成功。

答案 4 :(得分:4)

可以使用Selenium

from selenium import webdriver

# Path (or name on PATH) of the chromedriver executable.
DRIVER = 'chromedriver'
driver = webdriver.Chrome(DRIVER)
try:
    driver.get('https://www.spotify.com')
    screenshot = driver.save_screenshot('my_screenshot.png')
finally:
    # Close the browser even if loading or saving raised, so no Chrome
    # process is left running.
    driver.quit()

https://sites.google.com/a/chromium.org/chromedriver/getting-started

答案 5 :(得分:3)

11年后...
使用 Python 3.6 和 Google PageSpeed Insights API v5 拍摄网站截图:

import base64
import requests
import traceback
import urllib.parse as ul

# It's possible to make requests without the api key, but the number of requests is very limited  

url = "https://duckgo.com"
urle = ul.quote_plus(url)
image_path = "duckgo.jpg"

key = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
strategy = "desktop" # "mobile"
u = f"https://www.googleapis.com/pagespeedonline/v5/runPagespeed?key={key}&strategy={strategy}&url={urle}"

try:
    # A timeout keeps the script from hanging forever on a stalled request.
    response = requests.get(u, timeout=120)
    # Surface HTTP errors (bad key, quota exceeded) directly instead of
    # failing later with a KeyError on the error payload.
    response.raise_for_status()
    j = response.json()
    # The screenshot is delivered as a data URI; strip the prefix to get the
    # bare base64 payload.
    ss_encoded = j['lighthouseResult']['audits']['final-screenshot']['details']['data'].replace("data:image/jpeg;base64,", "")
    ss_decoded = base64.b64decode(ss_encoded)
    with open(image_path, 'wb') as f:
        f.write(ss_decoded)
except Exception:
    # Top-level boundary: report any failure and exit non-zero. Unlike a bare
    # "except:", this lets Ctrl-C (KeyboardInterrupt) through.
    print(traceback.format_exc())
    raise SystemExit(1)

注释:

  • Live Demo
  • 优点:免费
  • 缺点:低分辨率
  • Get API Key
  • Docs
  • 限制:
    • 每天查询= 25,000
    • 每100秒的查询次数= 400

答案 6 :(得分:1)

使用Rendertron是一种选择。在后台,这是一个无头的Chrome,它暴露了以下端点:

  • /render/:url:如果您对DOM感兴趣,请访问此路由,例如使用requests.get
  • /screenshot/:url:如果您对屏幕截图感兴趣,请访问此路线。

通常您需要用npm安装rendertron,在终端中运行rendertron,然后访问http://localhost:3000/screenshot/:url并保存文件;不过render-tron.appspot.com上有一个演示实例,因此无需安装npm软件包即可在本地运行下面这段Python3代码:

import requests

BASE = 'https://render-tron.appspot.com/screenshot/'
url = 'https://google.com'
path = 'target.jpg'
# The "with" block closes the streamed response in every case, including
# when the body is never read because the status is not 200.
with requests.get(BASE + url, stream=True) as response:
    # save file, see https://stackoverflow.com/a/13137873/7665691
    if response.status_code == 200:
        with open(path, 'wb') as image_file:
            # Explicit chunk size: iterating the response object directly
            # yields very small chunks.
            for chunk in response.iter_content(chunk_size=8192):
                image_file.write(chunk)
    else:
        # Report the failure instead of silently writing nothing.
        print('screenshot request failed with HTTP status %d' % response.status_code)

答案 7 :(得分:1)

我创建了一个名为pywebcapture的库,它对Selenium进行了封装:

pip install pywebcapture

使用pip安装后,您可以执行以下操作轻松获得完整尺寸的屏幕截图:

# import modules
from pywebcapture import loader, driver

# Read the URLs to capture from a CSV file. has_header_bool, url_column and
# optional_filename_column are placeholders to be supplied by the reader.
url_loader = loader.CSVLoader(
    "csv_file_with_urls.csv",
    has_header_bool,
    url_column,
    optional_filename_column,
)
uri_dict = url_loader.get_uri_dict()

# Build the driver and run it; output_filepath and delay are placeholders.
driver.Driver("path/to/webdriver/", output_filepath, delay, uri_dict).run()

享受!

https://pypi.org/project/pywebcapture/

答案 8 :(得分:1)

这是一个古老的问题,大多数答案都已过时。目前,我会从以下两种做法中选择一种。

1。创建一个可以截取屏幕截图的程序

我会使用Pyppeteer来拍摄网站的屏幕截图。它基于Puppeteer软件包。Puppeteer会启动一个无头的Chrome浏览器,因此屏幕截图看起来与在普通浏览器中看到的一样。

这取自pyppeteer文档:

import asyncio
from pyppeteer import launch

async def main():
    """Open https://example.com in a launched browser and save example.png."""
    browser = await launch()
    try:
        page = await browser.newPage()
        await page.goto('https://example.com')
        await page.screenshot({'path': 'example.png'})
    finally:
        # Close the browser even if navigation or the screenshot fails, so
        # the launched browser process is not left running.
        await browser.close()

# Drive the coroutine to completion on the current event loop.
# NOTE(review): on Python 3.7+ asyncio.run(main()) is the documented entry
# point, and newer versions deprecate get_event_loop() when no loop is
# running - consider switching once the minimum Python version allows it.
event_loop = asyncio.get_event_loop()
event_loop.run_until_complete(main())

2。使用屏幕截图API

您还可以使用屏幕截图API,例如this one。 令人高兴的是,您不必自己进行所有设置,而只需调用API端点即可。

这取自截图API的文档:

import urllib.parse
import urllib.request
import ssl

# TLS certificate verification is left at its secure default. The previous
# override (ssl._create_default_https_context = ssl._create_unverified_context)
# disabled verification for every HTTPS request in the process, which would
# send the API token over an unauthenticated connection.

# The parameters.
token = "YOUR_API_TOKEN"
url = urllib.parse.quote_plus("https://example.com")
width = 1920
height = 1080
output = "image"

# Create the query URL.
query = "https://screenshotapi.net/api/v1/screenshot"
query += "?token=%s&url=%s&width=%d&height=%d&output=%s" % (token, url, width, height, output)

# Call the API.
urllib.request.urlretrieve(query, "./example.png")

答案 9 :(得分:0)

您没有提到您正在运行的环境,这会产生很大的不同,因为没有一个能够呈现HTML的纯Python Web浏览器。

但如果您使用的是Mac,我已经使用webkit2png取得了巨大的成功。如果没有,正如其他人指出的那样,有很多选择。

答案 10 :(得分:0)

您可以使用Google Page Speed API轻松完成任务。在我当前的项目中,我使用了用Python编写的Google Page Speed API的查询来捕获所提供的任何Web URL的屏幕快照并将其保存到某个位置。看看。

import urllib2
import json
import base64
import sys
import requests
import os
import errno

#   The website's URL and the output file, taken from the command line
site = sys.argv[1]
imagePath = sys.argv[2]

#   The Google API.  Remove "&strategy=mobile" for a desktop screenshot
api = "https://www.googleapis.com/pagespeedonline/v1/runPagespeed?screenshot=true&strategy=mobile&url=" + urllib2.quote(site)

#   Get the results from Google
try:
    site_data = json.load(urllib2.urlopen(api))
except urllib2.URLError:
    print("Unable to retreive data")
    sys.exit(1)
except ValueError:
    #   json.load raises ValueError when the body is not valid JSON
    print("Invalid JSON encountered.")
    sys.exit(1)

#   A missing key raises KeyError (or TypeError if the payload is not a
#   dict), not ValueError
try:
    screenshot_encoded = site_data['screenshot']['data']
except (KeyError, TypeError):
    print("No screenshot data in the response.")
    sys.exit(1)

#   Google returns URL-safe Base64; map it back to the standard alphabet
screenshot_encoded = screenshot_encoded.replace("_", "/")
screenshot_encoded = screenshot_encoded.replace("-", "+")

#   Decode the Base64 data
screenshot_decoded = base64.b64decode(screenshot_encoded)

#   Create the target directory if needed.  dirname is '' for a bare file
#   name, in which case there is nothing to create.
image_dir = os.path.dirname(imagePath)
if image_dir and not os.path.exists(image_dir):
    try:
        os.makedirs(image_dir)
    except OSError as exc:
        #   Another process may have created it in the meantime
        if exc.errno != errno.EEXIST:
            raise

#   Save the file.  Binary mode: the decoded screenshot is raw image bytes.
with open(imagePath, 'wb') as file_:
    file_.write(screenshot_decoded)

不幸的是,以下是缺点。如果这些都不重要,则可以继续使用Google Page Speed API。效果很好。

  • 最大宽度为320px
  • 根据Google API配额,每天最多有25,000个请求

答案 11 :(得分:0)

使用Web服务s-shot.ru(因此速度不是很快),但通过链接配置轻松设置所需的内容。 而且,您可以轻松捕获整个页面的屏幕截图

import requests
import urllib.parse

BASE = 'https://mini.s-shot.ru/1024x0/JPEG/1024/Z100/?' # you can modify size, format, zoom
url = 'https://stackoverflow.com/'#or whatever link you need
url = urllib.parse.quote_plus(url) #service needs link to be joined in encoded format
print(url)

path = 'target1.jpg'
# The "with" block closes the streamed response in every case, including
# when the body is never read because the status is not 200.
with requests.get(BASE + url, stream=True) as response:
    if response.status_code == 200:
        with open(path, 'wb') as image_file:
            # Explicit chunk size: iterating the response object directly
            # yields very small chunks.
            for chunk in response.iter_content(chunk_size=8192):
                image_file.write(chunk)
    else:
        # Report the failure instead of silently writing nothing.
        print('screenshot request failed with HTTP status %d' % response.status_code)

答案 12 :(得分:0)

import subprocess

def screenshots(url, name):
    """Capture *url* with the ``webkit2png`` command-line tool.

    Runs ``webkit2png -F -o <name> <url> -D ./screens``.

    Args:
        url: Address of the page to capture.
        name: Value passed to webkit2png's ``-o`` option.

    Raises:
        OSError: If the ``webkit2png`` executable cannot be started.
    """
    # Pass an argument list and skip the shell: URLs routinely contain
    # characters such as "&", "?" and ";" that a shell would interpret, which
    # truncated the URL or ran unintended commands.
    subprocess.run(
        ['webkit2png', '-F', '-o', name, url, '-D', './screens'])

答案 13 :(得分:-2)

试试这个..

#!/usr/bin/env python

import gtk.gdk

import time

import random

# Endless loop: sleep for a random interval, then save a PNG of the default
# root window under a timestamped file name.
while True:
    # generate a random time between 120 and 300 sec
    random_time = random.randrange(120, 300)

    # wait between 120 and 300 seconds (or between 2 and 5 minutes)
    print("Next picture in: %.2f minutes" % (float(random_time) / 60))

    time.sleep(random_time)

    w = gtk.gdk.get_default_root_window()
    sz = w.get_size()

    print("The size of the window is %d x %d" % sz)

    pb = gtk.gdk.Pixbuf(gtk.gdk.COLORSPACE_RGB, False, 8, sz[0], sz[1])
    pb = pb.get_from_drawable(w, w.get_colormap(), 0, 0, 0, 0, sz[0], sz[1])

    # The timestamp keeps successive screenshots from overwriting each other.
    ts = time.time()
    filename = "screenshot" + str(ts) + ".png"

    # Identity test for None rather than "!= None".
    if pb is not None:
        pb.save(filename, "png")
        print("Screenshot saved to " + filename)
    else:
        print("Unable to get the screenshot.")