我正在尝试使用Scrapy登录Github。
# -*- coding: utf-8 -*-
import scrapy
class AutoreplySpider(scrapy.Spider):
    """Spider that submits the GitHub login form with fixed credentials."""

    name = 'AutoLogin'
    allowed_domains = ['github.com']
    start_urls = ['https://github.com/login']

    def parse(self, response):
        """Build the login request from the form on the fetched login page.

        Args:
            response: The response for the login page in ``start_urls``.

        Returns:
            A ``scrapy.FormRequest`` carrying the credentials, whose response
            is delivered to ``after_login``.
        """
        return scrapy.FormRequest.from_response(
            response,
            formdata={
                'login': 'ac',
                'password': 'pw',
            },
            # The callback must be a method that exists on this spider;
            # ``after_login`` is the only handler defined here.
            callback=self.after_login,
        )

    def after_login(self, response):
        """Receive the page returned after the login form was submitted.

        Intentionally a no-op placeholder: inspect ``response`` here to
        decide whether the login succeeded.
        """
当我手动登录GitHub时，我勾选了“记住用户名和密码”这个选项。因此，只要我不注销，再次访问GitHub时就应该会自动登录。我在终端运行了脚本，没有出现任何错误。但是，当我访问GitHub时，它仍然要求我登录。我不确定我的代码是否有效。我有一段时间没有接触过Scrapy了。有没有快速的方法来检查我是否成功登录？谢谢！
答案 0 :(得分:0)
代码不正确。表单通常都带有隐藏字段，服务器在收到凭证数据时会校验这些字段。我添加了一个循环来收集所有input标签的字段。如果表单提交正确，就可以在响应页面中找到帐户名称；如果帐户名称存在，说明登录成功，你可以继续后续操作。
class AutologinSpider(scrapy.Spider):
    """Spider that submits a login form and checks the result page.

    The domain, form URL, credentials and expected account name are
    placeholders that must be replaced before the spider is run.
    """

    name = 'AutoLogin'
    allowed_domains = ['DOMAIN_TO_LOGIN_COM']
    start_urls = ['URP_OF_FORM_PAGE']
    custom_settings = {'ROBOTSTXT_OBEY': False}

    def parse(self, response):
        """Collect the form's input fields and submit them with credentials.

        Args:
            response: The response for the page that contains the login form.

        Returns:
            A ``scrapy.FormRequest`` whose response is handled by
            ``after_login``.
        """
        formdata = {}
        for field in response.css('form input'):
            field_name = field.css('::attr(name)').extract_first()
            field_value = field.css('::attr(value)').extract_first()
            # A nameless input is never submitted and has no usable key; a
            # ``None`` value would tell ``from_response`` to drop the field
            # entirely, so leave such inputs to its own defaults.
            if field_name is None or field_value is None:
                continue
            formdata[field_name] = field_value

        # Credentials are set last so they override any pre-filled values.
        formdata['login'] = 'YOUR_LOGIN'
        formdata['password'] = 'YOUR_PASSWORD'

        return scrapy.FormRequest.from_response(
            response,
            formdata=formdata,
            callback=self.after_login,
        )

    def after_login(self, response):
        """Verify the login by looking for the account name on the page.

        Args:
            response: The response returned after the form was submitted.

        Returns:
            ``None``. On failure an error is logged and processing stops, so
            the logged-in code path is never reached with an anonymous page.
        """
        account_name = response.css(
            'ul.dropdown-menu li strong::text'
        ).extract_first()
        if account_name != 'YOU_ACCOUNT_NAME':
            # Something went wrong: stop here instead of falling through to
            # the success path below.
            self.logger.error(
                'Login failed: expected account name not found (got %r)',
                account_name,
            )
            return

        # You have successfully logged in. Put your code here.