Index out of range when generating a csv

Asked: 2021-01-21 21:49:08

Tags: python csv for-loop

I'm trying to build a sports betting program.

Right now I'm stuck on the part where it generates a csv containing all the box scores from the previous two college basketball seasons. It pulls the boxscore index from a csv I've already generated.

I keep getting an index out of range error once the progress bar hits 10653 iterations. I can't find anything unusual at row 10653 of the csv being read.

I know the iterations correspond to rows in the csv, because when I run everything above `_df = Boxscore(box_link).dataframe`, the progress bar finishes at 14980 iterations, which matches the number of rows in the csv it's reading.

Any help would be greatly appreciated. The code is below, along with the error message.

import pandas as pd
from tqdm import tqdm
from sportsreference.ncaab.boxscore import Boxscore

start_season = 2020  # pull data from all seasons starting from this year
box_df = None
schedule_df = pd.read_csv('ncaab - sheet81 - ncaab - sheet81.csv')  # a smaller csv may be used to speed things up when testing
season_df = schedule_df.loc[schedule_df.Season >= start_season]
for index, row in tqdm(season_df.iterrows()):
    box_link = row['BoxscoreIndex']
    _df = Boxscore(box_link).dataframe  # This is where the "list index out of range" error keeps coming in. Everything above this line runs fine.

    if box_df is not None:
        box_df = pd.concat([box_df, _df], axis=0)
    else:
        box_df = _df

box_df.to_csv('boxscores3.csv'.format(start_season), index=None)
IndexError                                Traceback (most recent call last)
<ipython-input-24-91c5b71b03e2> in <module>
      6 for index, row in tqdm(season_df.iterrows()):
      7     box_link = row['BoxscoreIndex']
----> 8     _df = Boxscore(box_link).dataframe #The line to left is where the error keeps coming in "list index out of range". I ran everything above this and it works fine.
      9 
     10     if box_df is not None:

~\Downloads\WPy64-3860\python-3.8.6.amd64\lib\site-packages\sportsreference\ncaab\boxscore.py in __init__(self, uri)
    223         self._home_defensive_rating = None
    224 
--> 225         self._parse_game_data(uri)
    226 
    227     def _retrieve_html_page(self, uri):

~\Downloads\WPy64-3860\python-3.8.6.amd64\lib\site-packages\sportsreference\ncaab\boxscore.py in _parse_game_data(self, uri)
    668             if short_field == 'away_record' or \
    669                short_field == 'home_record':
--> 670                 value = self._parse_record(short_field, boxscore, index)
    671                 setattr(self, field, value)
    672                 continue

~\Downloads\WPy64-3860\python-3.8.6.amd64\lib\site-packages\sportsreference\ncaab\boxscore.py in _parse_record(self, field, boxscore, index)
    375         records = boxscore(BOXSCORE_SCHEME[field]).items()
    376         records = [x.text() for x in records if x.text() != '']
--> 377         return records[index]
    378 
    379     def _find_boxscore_tables(self, boxscore):

IndexError: list index out of range

2 Answers:

Answer 0 (score: 0)

First, just want to point out that the .format() method here, 'boxscores3.csv'.format(start_season), isn't doing anything. It will still return 'boxscores3.csv'. You need a placeholder in the string for the value to end up in the filename:

e.g. if start_season = '2020', then 'boxscores3_{0}.csv'.format(start_season) gives you 'boxscores3_2020.csv'
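A quick sanity check in a REPL shows the no-op versus a working placeholder:

```python
start_season = 2020

# No placeholder in the string, so .format() has nothing to substitute:
print('boxscores3.csv'.format(start_season))      # boxscores3.csv

# With a {0} placeholder, the argument is inserted:
print('boxscores3_{0}.csv'.format(start_season))  # boxscores3_2020.csv
```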

So if you want that filename to be dynamic, change it to one of:

box_df.to_csv('boxscores3_{0}.csv'.format(start_season), index=None)

box_df.to_csv('boxscores3_{some_variable}.csv'.format(some_variable=start_season), index=None)

box_df.to_csv('boxscores3_%s.csv' % start_season, index=None)

Next, unless you can provide a sample of that csv file, specifically row 10653, there's not much anyone can do to help with the specific problem.
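In the meantime, one way to get past the crash and collect the offending rows is to wrap the failing call and record the link instead of stopping. This is a sketch of the general pattern; `fetch` stands in for something like `lambda link: Boxscore(link).dataframe`, and `fake_fetch` here is purely hypothetical, just to show the behaviour:

```python
failed_links = []

def try_fetch(box_link, fetch):
    """Attempt to fetch one boxscore; on IndexError, record the link and move on."""
    try:
        return fetch(box_link)
    except IndexError:
        failed_links.append(box_link)
        return None

# Hypothetical fetcher that fails on one specific link:
def fake_fetch(link):
    if link == 'bad-link':
        raise IndexError('list index out of range')
    return {'link': link}

results = [try_fetch(l, fake_fetch) for l in ['a', 'bad-link', 'b']]
print(failed_links)  # ['bad-link']
```

After the loop finishes, printing `failed_links` tells you exactly which BoxscoreIndex values to investigate.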

But until then, I can offer an alternative solution using the espn api.

You can get college basketball box scores provided you have the game ID. So this code iterates over each date (you need to give a start date) and gets the gameIds for every game on that date. Then, using the gameIds, the box score can be fetched from another api endpoint. Unfortunately, the box score is returned not as json but as html (which is fine, since we can use pandas to read the tables).

I don't know exactly what you need or want, but this might help you see other ways to get the data as you learn Python:

Code:

from tqdm import tqdm
import requests
import pandas as pd
import datetime


date_list = []
sdate = datetime.date(2021, 1, 1)   # start date
edate = datetime.date.today()  # end date

delta = edate - sdate       # as timedelta

for i in range(delta.days + 1):
    day = sdate + datetime.timedelta(days=i)
    date_list.append(day.strftime("%Y%m%d"))

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36'}
payload = {
    'xhr': '1',
    'device': 'desktop',
    'country': 'us',
    'lang': 'en',
    'region': 'us',
    'site': 'espn',
    'edition-host': 'espn.com',
    'site-type': 'full'}

# Get gameIds
gameId_dict = {}
for dateStr in tqdm(date_list):
    url = 'https://secure.espn.com/core/mens-college-basketball/schedule/_/date/{dateStr}/group/50'.format(dateStr=dateStr)
    games = requests.get(url, headers=headers, params=payload).json()['content']['schedule'][dateStr]['games']
    gameId_dict[dateStr] = []
    for game in games:
        # Check if game was postponed
        if game['status']['type']['name'] in ['STATUS_POSTPONED','STATUS_CANCELED','STATUS_SCHEDULED']:
            continue
        game_info = {}
        game_info[game['id']] = {}
        game_info[game['id']]['awayTeam'] = game['shortName'].split('@')[0].strip()
        game_info[game['id']]['homeTeam'] = game['shortName'].split('@')[1].strip()
        gameId_dict[dateStr].append(game_info)



full_df = pd.DataFrame()
# Box score - gameId needed
box_url = 'https://secure.espn.com/core/mens-college-basketball/boxscore'
for dateStr, games in tqdm(gameId_dict.items()):
    for game in tqdm(games):
        for gameId, teams in game.items():
            payload = {
                'gameId': gameId,
                'xhr': '1',
                'render': 'true',
                'device': 'desktop',
                'country': 'us',
                'lang': 'en',
                'region': 'us',
                'site': 'espn',
                'edition-host': 'espn.com',
                'site-type': 'full'}
            
            data = requests.get(box_url, headers=headers, params=payload).json()
            away_df = pd.read_html(data['content']['html'], header=1)[0].rename(columns={'Bench':'Player'})
            away_df = away_df[away_df['Player'] != 'TEAM']
            away_df = away_df[away_df['Player'].notna()]
            away_df['Team'] = teams['awayTeam']
            away_df['Home_Away'] = 'Away'
            away_df['Starter_Bench'] = 'Bench'
            away_df.loc[0:4, 'Starter_Bench'] = 'Starter'
            away_df['Player'] = away_df['Player'].str.split(r"([a-z]+)([A-Z].+)", expand=True)[2]
            away_df[['Player','Team']] = away_df['Player'].str.extract('^(.*?)([A-Z]+)$', expand=True)
                    
            home_df = pd.read_html(data['content']['html'], header=1)[1].rename(columns={'Bench':'Player'})
            home_df = home_df[home_df['Player'] != 'TEAM']
            home_df = home_df[home_df['Player'].notna()]
            home_df['Team'] = teams['homeTeam']
            home_df['Home_Away'] = 'Home'
            home_df['Starter_Bench'] = 'Bench'
            home_df.loc[0:4, 'Starter_Bench'] = 'Starter'
            home_df['Player'] = home_df['Player'].str.split(r"([a-z]+)([A-Z].+)", expand=True)[2]
            home_df[['Player','Team']] = home_df['Player'].str.extract('^(.*?)([A-Z]+)$', expand=True)
            
            game_df = away_df.append(home_df, sort = False)
            game_df['Date'] = datetime.datetime.strptime(dateStr, '%Y%m%d').strftime('%m/%d/%y')
            full_df = full_df.append(game_df, sort = False)

full_df = full_df.reset_index(drop=True)

Output:

print (full_df.head(30).to_string())
                Player MIN    FG  3PT   FT OREB DREB REB AST STL BLK TO PF PTS  Team Home_Away Starter_Bench Pos      Date
0             H. Drame  22   2-7  0-2  0-0    1    1   2   0   0   1  1  4   4   SPU      Away       Starter   F  01/01/21
1             F. Drame  20   2-3  0-1  0-0    1    5   6   0   3   1  1  4   4   SPU      Away       Starter   F  01/01/21
2               M. Lee  24  2-11  0-4  1-2    1    2   3   0   0   0  3  0   5   SPU      Away       Starter   G  01/01/21
3             D. Banks  26  4-12  1-6  2-4    0    5   5   6   1   0  1  1  11   SPU      Away       Starter   G  01/01/21
4             D. Edert  32  6-10  2-4  1-2    0    4   4   0   2   0  1  2  15   SPU      Away       Starter   G  01/01/21
5           O. Diahame   1   0-1  0-0  0-0    0    0   0   0   0   0  0  0   0   SPU      Away         Bench   F  01/01/21
6             K. Ndefo  23  7-10  0-0  3-3    1    6   7   2   1   5  1  4  17   SPU      Away         Bench   F  01/01/21
7            B. Diallo  14   0-2  0-0  0-0    1    1   2   0   0   0  0  0   0   SPU      Away         Bench   G  01/01/21
8             T. Brake  24   1-2  0-1  0-0    0    0   0   1   0   0  0  1   2   SPU      Away         Bench   G  01/01/21
9           M. Silvera   6   0-0  0-0  0-0    0    1   1   1   0   0  1  0   0   SPU      Away         Bench   G  01/01/21
10            N. Kamba   8   0-1  0-0  0-0    0    0   0   0   0   0  2  0   0   SPU      Away         Bench   G  01/01/21
11            J. Fritz  38   5-9  0-0  4-5    2    8  10   4   1   3  1  3  14   CAN      Home       Starter   F  01/01/21
12            J. White  17   4-7  1-2  0-0    1    4   5   2   0   0  5  2   9   CAN      Home       Starter   F  01/01/21
13           A. Fofana  20   1-7  1-4  1-2    0    1   1   1   0   0  1  2   4   CAN      Home       Starter   G  01/01/21
14          A. Harried  23  3-10  1-4  0-1    2    5   7   1   1   1  0  1   7   CAN      Home       Starter   G  01/01/21
15        J. Henderson  37   3-8  3-5  5-6    0    1   1   2   0   0  1  1  14   CAN      Home       Starter   G  01/01/21
16      G. Maslennikov   2   0-2  0-1  0-0    0    0   0   0   0   0  1  1   0   CAN      Home         Bench   F  01/01/21
17            M. Green  18   3-4  0-0  2-2    1    4   5   2   1   0  2  1   8   CAN      Home         Bench   F  01/01/21
18          S. Hitchon   3   0-0  0-0  0-0    0    0   0   1   0   0  0  0   0   CAN      Home         Bench   F  01/01/21
19       S. Uijtendaal  20   2-4  1-2  0-0    0    0   0   0   1   0  0  2   5   CAN      Home         Bench   G  01/01/21
20          M. Brandon  19   4-5  1-2  0-0    0    3   3   2   2   0  2  1   9   CAN      Home         Bench   G  01/01/21
21           A. Ahemed   3   0-0  0-0  0-0    0    1   1   1   0   0  0  1   0   CAN      Home         Bench   G  01/01/21
22           K. Nwandu  34  5-13  1-3  0-1    1    3   4   3   1   0  3  1  11  NIAG      Away       Starter   F  01/01/21
23      G. Kuakumensah  23   1-2  1-2  1-2    0    2   2   1   0   0  1  1   4  NIAG      Away       Starter   F  01/01/21
24         N. Kratholm  18   4-7  0-0  3-5    2    2   4   1   0   0  0  2  11  NIAG      Away       Starter   F  01/01/21
25          M. Hammond  33  7-14  3-6  0-0    0    4   4   1   1   0  2  2  17  NIAG      Away       Starter   G  01/01/21
26          J. Roberts  28   2-6  2-6  2-2    0    2   2   3   1   0  2  3   8  NIAG      Away       Starter   G  01/01/21
27          J. Cintron  14   0-2  0-0  0-0    1    3   4   0   0   1  2  1   0  NIAG      Away         Bench   F  01/01/21
28  DonaldN. MacDonald   9   0-1  0-1  0-0    0    3   3   0   0   0  0  0   0  NIAG      Away         Bench   G  01/01/21
29          R. Solomon  25  4-11  0-2  2-2    1    3   4   0   3   0  0  1  10  NIAG      Away         Bench   G  01/01/21

Answer 1 (score: 0)

You really don't need to use that library. Just use requests, beautifulsoup, and pandas. The library is probably using those under the hood anyway. This should work for you now. We may need to add something (such as a delay/sleep) for when you get blocked for making too many requests in a short time, which I suspect is what's causing your problem, but this will get you going. See what it gets you, and we can adjust it from there.

import pandas as pd
from bs4 import BeautifulSoup
import requests
import re
from tqdm import tqdm
import time

season_df = pd.read_csv('C:/test/ncaab.csv')
links_list = season_df['BoxscoreIndex'].to_list()
delay = 30


def parse_tables(idx,basic_tables,advanced_tables):    
    try:
        basic = basic_tables[idx]
        advanced = advanced_tables[idx]
        team = basic.find('caption').text.split('(')[0].strip()
        team = team.split('Table')[0].strip()
        
        df_basic = pd.read_html(str(basic), header=1)[0]
        df_basic = df_basic[df_basic['Starters'] == 'School Totals']
        df_basic = df_basic.reset_index(drop=True)
        
        df_advanced = pd.read_html(str(advanced), header=1)[0]
        df_advanced = df_advanced[df_advanced['Starters'] == 'School Totals']
        df_advanced = df_advanced.reset_index(drop=True)
        
        drop_cols = []
        for col in df_basic.columns:
            if col in df_advanced.columns:
                drop_cols.append(col)
                
        df = df_basic.drop(['Starters'], axis=1).join(df_advanced.drop(drop_cols, axis=1))
        df['Team'] = team
        return df
    except Exception:  # some boxscore pages are missing a table; skip those
        return None


def get_html(box_link):
    url = 'https://www.sports-reference.com/cbb/boxscores/%s.html' %box_link
    response = requests.get(url)
    
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup, True
    else:
        return None, False
    


visited_links = []
errors_list = []
box_df = pd.DataFrame()
for box_link in tqdm(links_list):
    resp_success = False

    # Skip links we've already processed
    if box_link in visited_links:
        continue
    
    while resp_success == False:
        soup, resp_success = get_html(box_link)
        if resp_success == False:
            print ('Will retry in %s seconds.' %delay)
            time.sleep(delay)
            
    basic_tables = soup.find_all('table', {'id': re.compile('.*box-score-basic.*')})
    advanced_tables = soup.find_all('table', {'id': re.compile('.*box-score-advanced.*')})
    
    if len(basic_tables) < 2:
        errors_list.append(box_link)
    
    for idx in range(0, len(basic_tables)):
        df = parse_tables(idx,basic_tables,advanced_tables)
        box_df = box_df.append(df, sort=False)
        
    visited_links.append(box_link)
box_df = box_df.reset_index(drop = True)
if len(errors_list) > 0:
    print('\n')
    print ('You may want to investigate the following.')
    for each in errors_list:
        print('Error with: %s' %each)
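The retry loop above waits a fixed 30 seconds on every failed request. If sports-reference starts blocking you more aggressively, a simple exponential backoff tends to behave better (the specific numbers below are just an assumption, not anything the site documents):

```python
def backoff_delays(base=5, factor=2, max_delay=300, attempts=6):
    """Yield increasing wait times: 5, 10, 20, ... seconds, capped at max_delay."""
    delay = base
    for _ in range(attempts):
        yield min(delay, max_delay)
        delay *= factor

print(list(backoff_delays()))  # [5, 10, 20, 40, 80, 160]
```

In the caller of `get_html`, you would `time.sleep(d)` for each `d` yielded by `backoff_delays()` until a response succeeds, instead of sleeping the fixed `delay`.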