Python: Multiple simultaneous requests

Date: 2019-06-11 18:17:45

Tags: python multithreading python-requests

This Python script makes a GET request to each URL loaded from the websites.txt file, then checks the response for a "KEYWORD". If the keyword is found, the URL is saved to "WorkingSites.txt".

Everything works, but it is slow because it checks only one URL at a time. What is the best and simplest way to check 10 URLs at the same time?

Could you please provide an example based on the script below?

Thanks

import requests
import sys

if len(sys.argv) != 2:
    print "\n\033[34;1m[*]\033[0m python " + sys.argv[0] + ' websites.txt'
    exit(0)

targetfile = open(sys.argv[1], 'r')
success = open('WorkingSites.txt', 'a')  # open the output file once, not on every loop iteration
while True:
    host = targetfile.readline().replace('\n', '')
    if not host:
        break
    if not host.startswith('http'):
        host = 'http://' + host
    print '\033[34;1m[*]\033[0m Check        : ' + host
    try:
        r = requests.get(host, timeout=5, headers={
            'Content-Type': 'application/x-www-form-urlencoded',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3163.100 Safari/537.36',
        })
        text = 'KEYWORD'
    except requests.exceptions.RequestException:
        # a bare except would also swallow KeyboardInterrupt; catch only request errors
        print '\033[31;1m[-]\033[0m Failed        : No Response\n'
        continue
    if text in r.text:
        print '\033[32;1m[+]\033[0m success        : ' + host + '\n'
        success.write(host + '\n')
    else:
        print '\033[31;1m[-]\033[0m Failed        : ' + host + '\n'

print "\033[34;1m[*]\033[0m Output Saved On : WorkingSites.txt"

1 Answer:

Answer 0 (score: 0):

import asyncio  # used by get_data_asynchronous and main below; missing in the original
import concurrent.futures
import requests
from timeit import default_timer
import psutil

INPUT = 'websites.txt'
OUTPUT = 'WorkingSites.txt'
SUCCESS = open(OUTPUT, 'a')
START_TIME = default_timer()

def fetch(host):
  KEYWORD = 'KEYWORD'

  try:
    with requests.get(host, timeout=5, headers={
        'Content-Type': 'application/x-www-form-urlencoded',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3163.100 Safari/537.36'
        }) as response:

      # log host, CPU %, memory % and elapsed time for each completed request
      print('{0} {1} {2} {3}'.format(host, psutil.cpu_percent(),
                                     psutil.virtual_memory()[2],
                                     '{:5.2f}s'.format(default_timer() - START_TIME)))

      if response.status_code == 200 and KEYWORD in response.text:
        SUCCESS.write(host + '\n')

      return response
  except requests.exceptions.RequestException:
    # treat timeouts and connection errors as a non-match and move on
    pass

async def get_data_asynchronous():
  # declare as global so the timer reset below updates the START_TIME that
  # fetch() reads, instead of creating a function-local variable
  global START_TIME

  with open(INPUT) as fi:
      hosts = fi.read().splitlines()

  # rebinding the loop variable would not modify the list itself,
  # so build a new list with the scheme prepended where it is missing
  hosts = ['http://' + h if not h.startswith('http') else h for h in hosts]

  with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
      loop = asyncio.get_event_loop()

      START_TIME = default_timer()

      futures = [
          loop.run_in_executor(
              executor, 
              fetch, 
              host,
          )
          for host in hosts
      ]

      # wait for every fetch to finish before returning
      await asyncio.gather(*futures)

def main():
  loop = asyncio.get_event_loop()
  loop.run_until_complete(asyncio.ensure_future(get_data_asynchronous()))
  SUCCESS.close()  # flush buffered results to disk before reporting
  print("\033[34;1m[*]\033[0m Output Saved On : " + OUTPUT)

main()
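Note that the asyncio layer above is optional here: loop.run_in_executor simply submits fetch to the thread pool, so the ThreadPoolExecutor alone provides all of the concurrency. A minimal equivalent sketch, reusing the fetch function and INPUT constant from the answer:

import concurrent.futures

def get_data_threads_only():
    # same work as get_data_asynchronous, without an event loop:
    # the thread pool alone performs the 10-way concurrent fetching
    with open(INPUT) as fi:
        hosts = fi.read().splitlines()
    hosts = ['http://' + h if not h.startswith('http') else h for h in hosts]

    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        # executor.map blocks until every fetch has completed,
        # which replaces the asyncio.gather call
        list(executor.map(fetch, hosts))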

This is my script. At the moment it works very well for the first URLs, but for some reason it then slows down.

Is this a problem with my script or with my PC? Could someone test it with the thousands of URLs I uploaded to pastebin: https://pastebin.com/raw/5wtrpcDQ

Thank you!