间歇性404错误,Selenium,Python

时间:2019-05-29 12:03:50

标签: http-status-code-404 intermittent

我正在抓取一个需要大量查询的网站(USGA GHIN 系统):查询 30 至 60 名高尔夫球手的差点(handicap),并存储到 Excel 表中。我的代码似乎没有问题——我认为是该网站检测到来自同一来源的大量请求后拒绝了服务。问题通常出现在第 32 个请求左右:前 31 个请求运行正常,然后出现 404。遇到这种情况时,我会关闭 Selenium,等待 30 秒,然后重新开始。这有时有效,有时无效。我希望有人能帮我让应用程序不再受此问题影响。我已经让程序在检测到三次错误后保存已检索的数据并退出。如果我稍等片刻再重新启动,它会从中断的地方继续运行,并且通常不会再遇到 404,一直运行到任务结束。

我是否可以更改发送的标头的一部分,以更改网站看到的身份,以便我的应用程序每次都能运行到工作终止?

我已经描述了我尝试过的解决方法。

#
#   This Module Retrieves and Stores Handicaps
#       First GHIN No is Passed
#
#   Helper: submit one GHIN number through the lookup form on the current page
def _submit_ghin_lookup(br, ghin_number):
    """Enter ``ghin_number`` in the single-lookup form and click Lookup.

    Switches the driver into the first iframe on the page, waits up to
    10 seconds for the GHIN entry field, types the number and clicks the
    submit button.  Any Selenium exception raised along the way (missing
    iframe, wait timeout, missing button) propagates to the caller.
    """
    #   Open Iframe
    br.switch_to.frame(br.find_element_by_tag_name("iframe"))
    #   Find Handicap entry field
    ghin_entry = WebDriverWait(br, 10).until(EC.presence_of_element_located(
        (By.XPATH, '//*[@id="ctl00_bodyMP_tcLookupModes_tpSingle_tbGHIN"]')))
    #   Enter GHIN No.
    ghin_entry.send_keys(ghin_number)
    #   Click on Lookup button
    br.find_element_by_css_selector(
        "#ctl00_bodyMP_tcLookupModes_tpSingle_btnSubmit1").click()


def Retrieve_Handicaps(br):
    """Look up every pending golfer's handicap and store it in the workbook.

    Layout of the "Handicaps" sheet as used by this function:
      * B1  - number of players (used only when blanking stored handicaps)
      * C3  - last revision date seen on the site
      * row 4 onward - column B holds the GHIN number, column C the handicap;
        the value 100 in column C marks "not retrieved yet", and a 0 or an
        empty cell in column B marks the end of the player list.

    The first golfer is looked up once to read the revision date.  When the
    date differs from the stored one, all stored handicaps are reset to 100
    and the workbook is saved.  The main loop then walks the rows, skipping
    golfers whose handicap is already stored and scraping the others.

    A lookup counts as failed when ``Look_With_Wait`` does not report
    ``success`` or when Selenium raises while the page is being driven.
    After each failure the workbook is saved, the browser is shut down, the
    function sleeps 30 seconds and a new browser is started; on the third
    failure the workbook is saved and the function returns.

    Args:
        br: Selenium WebDriver already showing the GHIN lookup page.

    Returns:
        None.  Results are written to the workbook file.

    Side effects:
        Rebinds the module globals ``next_row``, ``ghin_number`` and
        ``workbook``.  Reads the module-level ``retrieved_value``
        (presumably set by ``Look_With_Wait`` -- confirm against that
        helper).
    """
    global next_row, ghin_number, workbook
    # Function-scope import so this block does not depend on the file's
    # import header listing the exception class.
    from selenium.common.exceptions import WebDriverException

    #   Give up after this many failed lookups
    max_failures = 3
    #   Values in column B that mark the end of the player list; an empty
    #   cell reads back as None and must stop the loop just like 0 does.
    end_of_list = (None, 0)

    #   Set up Loop
    next_row = 4
    failures = 0
    #   Open Player/Handicap File
    path1 = "C:/Users/Steve/Desktop/SaturdayGolf.xlsx"
    workbook = openpyxl.load_workbook(path1)
    #   Open Handicaps Sheet
    sheet = workbook["Handicaps"]
    #   Get First GHIN No.
    ghin_number = sheet.cell(row=next_row, column=2).value
    print("Processsing", ghin_number)
    #   Look up the first golfer to obtain the current revision date
    _submit_ghin_lookup(br, ghin_number)
    #   Get revision Date
    result = Look_With_Wait(br, '//*[@id="ctl00_bodyMP_grdClubs"]/tbody/tr[1]/td[3]')
    if result == failure:
        print("Unable to retrieve revision date")
        return
    print("Revision date = ", retrieved_value.text)
    #   If new revision date found - store it and loop to clear previous handicaps
    if sheet.cell(row=3, column=3).value != retrieved_value.text:
        print("new date found")
        sheet.cell(row=3, column=3).value = retrieved_value.text
        no_of_players = int(sheet.cell(row=1, column=2).value)
        #   Blank out any stored handicaps (player rows start at row 4)
        for player_row in range(4, no_of_players + 4):
            sheet.cell(row=player_row, column=3).value = 100
        workbook.save(path1)
        print("Handicaps blanked out")
    #   Main Loop for processing all participants
    while ghin_number not in end_of_list:
        #   Get next ghin number from excel file
        ghin_number = sheet.cell(row=next_row, column=2).value
        #   Check for end of player list
        if ghin_number in end_of_list:
            workbook.save(path1)
            workbook.close()
            return
        #   If Handicap is already stored, no need to process this golfer
        if sheet.cell(row=next_row, column=3).value != 100:
            next_row += 1
            continue
        #   Blank handicap - process this golfer
        print("Going to GHIN page")
        try:
            #   Reload GHIN Entry page
            br.get(GHIN_URL)
            print("back from GHIN load")
            _submit_ghin_lookup(br, ghin_number)
            #   Scrape Handicap
            result = Look_With_Wait(br, '//*[@id="ctl00_bodyMP_grdClubs"]/tbody/tr/td[2]')
        except WebDriverException as err:
            #   An error page has no iframe / entry field, so Selenium raises
            #   instead of Look_With_Wait reporting a failure.  Count it as a
            #   failed lookup so the data gathered so far is still saved.
            print("Lookup raised", type(err).__name__)
            result = failure
        #   Test result and Store if found
        if result == success:
            print("Loop #", next_row - 3, "GHIN # =", ghin_number, "Handicap found = ", retrieved_value.text)
            #   Store handicap in worksheet for this golfer
            sheet.cell(row=next_row, column=3).value = retrieved_value.text
            #   Prepare for Next iteration
            next_row += 1
            continue
        failures += 1
        if failures == max_failures:
            #   Too many failed attempts - save what we have and shut down
            print(f"{max_failures} Failed attempts to work through the list")
            workbook.save(path1)
            workbook.close()
            return
        #   Save progress before restarting the browser.  The workbook is not
        #   closed here because the loop keeps writing to it after the restart.
        workbook.save(path1)
        print("workbook saved - returning to mainline")
        #   Close Chromedriver
        br.close()
        br.quit()
        sleep(30)
        print("restarting, failures =", failures)
        #   Restart Chromedriver and go to Log on Page
        #   NOTE(review): this rebinds only the local name; the caller's
        #   driver object is the one that was just quit.
        br = Initialize_Start_Chrome()
    return

0 个答案:

没有答案