Creating a large DataFrame from smaller DataFrames

Date: 2017-08-20 19:48:58

Tags: python pandas dataframe web-scraping

I am running into problems structuring the data I scrape from the PGA Tour website. I cannot get the scraped tables into DataFrames and merge them so that I can use the result for analysis later. The dimensions of the scraped data never come out right, and every run of the code produces a different error that I cannot reconcile.

I have tried both merging and concatenating the DataFrames, but nothing seems to work. Any help is appreciated.

What I am really after is one DataFrame that holds each player's individual statistics from the different stat pages, with the columns from the other pages lined up on the same row, keyed by year and player name.
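In other words, the target shape is an outer merge on those two keys. With made-up players and stat columns just to illustrate (none of this is my real data):

import functools
import pandas as pd

# toy per-stat tables; player names and stat columns are placeholders
dist = pd.DataFrame({'year': ['2017', '2017'],
                     'PLAYER NAME': ['Player A', 'Player B'],
                     'DRIVING DIST': [314.1, 295.7]})
scoring = pd.DataFrame({'year': ['2017', '2017'],
                        'PLAYER NAME': ['Player A', 'Player B'],
                        'SCORING AVG': [69.0, 68.8]})

# one row per (year, player), stat columns side by side
combined = functools.reduce(
    lambda left, right: pd.merge(left, right, on=['year', 'PLAYER NAME'],
                                 how='outer'),
    [dist, scoring])
print(combined)
#    year PLAYER NAME  DRIVING DIST  SCORING AVG
# 0  2017    Player A         314.1         69.0
# 1  2017    Player B         295.7         68.8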

from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
import functools

base = 'http://www.pgatour.com/'
inn = 'stats/stat'            # substring that marks a stat-detail link
end = '.html'
years = ['2017', '2016']


alpha = []
# all pages with links to stat tables
urls = ['http://www.pgatour.com/stats.html',
        'http://www.pgatour.com/stats/categories.ROTT_INQ.html',
        'http://www.pgatour.com/stats/categories.RAPP_INQ.html',
        'http://www.pgatour.com/stats/categories.RARG_INQ.html',
        'http://www.pgatour.com/stats/categories.RPUT_INQ.html',
        'http://www.pgatour.com/stats/categories.RSCR_INQ.html',
        'http://www.pgatour.com/stats/categories.RSTR_INQ.html',
        'http://www.pgatour.com/stats/categories.RMNY_INQ.html',
        'http://www.pgatour.com/stats/categories.RPTS_INQ.html']
for i in urls:
    data = urlopen(i)
    soup = BeautifulSoup(data, "html.parser")
    for link in soup.find_all('a'):
        if link.has_attr('href'):
            alpha.append(base + link['href'][17:]) #may need adjusting
# keep only the links that point at stat pages
beta = []
for i in alpha:
    if inn in i:
        beta.append(i)

# de-duplicate while preserving order
gamma = []
for i in beta:
    if i not in gamma:
        gamma.append(i)

jan = []
for i in gamma:
    try:
        data = urlopen(i)
        soup = BeautifulSoup(data, "html.parser")
        for table in soup.find_all('section', {'class': 'module-statistics-off-the-tee-details'}):
            for j in table.find_all('h3'):
                # strip characters that would be unsafe in a file name
                y = j.get_text()
                for ch in ' -:><()=+':
                    y = y.replace(ch, '')
                jan.append([i, y + '.csv'])
                print([i, y + '.csv'])
    except Exception as e:
        print(e)

#my problem starts here
#using a short urls list so that I can find errors faster
urls = [['http://www.pgatour.com/stats/stat.02356.html', 'd'],
        ['http://www.pgatour.com/stats/stat.02568.html', 'f'],
        ['http://www.pgatour.com/stats/stat.111.html', 'r']]
frames = []               # renamed from `list` so the builtin is not shadowed
#jan = [['http://www.pgatour.com/stats/stat.02356.html', 'Last15EventsScoring.csv']]
#make a list with url and title name and cleaned csv name
#write to csv
row_sp = []
rows_sp = []              # never reset, so rows accumulate across every url and year
title1 = []
title = []                # never reset either, so headers from earlier tables leak into later ones
for i in urls:
    try:
        for y in years:
            # year-specific page, e.g. .../stat.02356.2017.html
            data = urlopen(i[0][:-4] + y + end)
            soup = BeautifulSoup(data, "html.parser")
            # current page, used only for the column headers
            data1 = urlopen(i[0])
            soup1 = BeautifulSoup(data1, "html.parser")
            for table in soup1.find_all('table', {'id': 'statsTable'}):
                title.append('year')
                for k in table.find_all('tr'):
                    for n in k.find_all('th'):
                        title1.append(n.get_text())
                        for l in title1:
                            if l not in title:
                                title.append(l)
                rows_sp.append(title)
            for table in soup.find_all('table', {'id': 'statsTable'}):
                for h in table.find_all('tr'):
                    row_sp = [y]
                    for j in h.find_all('td'):
                        row_sp.append(j.get_text().replace(" ", "").replace("\n", "").replace("\xa0", " "))
                    rows_sp.append(row_sp)
            df = pd.DataFrame(rows_sp)
            df.columns = title      # this is where the length-mismatch errors surface
            df.drop(df.index[1], inplace=True)
            print(df)
            frames.append(df)
    except Exception as e:
        print(e)

df_merge = functools.reduce(
    lambda left, right: pd.merge(left, right, on=['year', 'PLAYER NAME'],
                                 how='outer'),
    frames)
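I suspect the root cause is that rows_sp and title are never reset, so headers and rows pile up across every url and year until df.columns = title no longer matches. Below is a rough sketch of the restructuring I have in mind: fresh header and row lists per table, years stacked with concat, and the different stats joined with merge. It is untested against the live site; statsTable and the ['year', 'PLAYER NAME'] keys are taken from the code above, and it assumes every data row has one <td> per <th>.

frames = []                                   # one combined DataFrame per stat page
for url, _tag in urls:
    year_frames = []
    for y in years:
        try:
            page = urlopen(url[:-4] + y + end)
        except Exception as e:
            print(e)
            continue
        soup = BeautifulSoup(page, "html.parser")
        table = soup.find('table', {'id': 'statsTable'})
        if table is None:
            continue
        # fresh header and row lists for this table only
        header = ['year'] + [th.get_text(strip=True) for th in table.find_all('th')]
        rows = [[y] + [td.get_text(strip=True).replace('\xa0', ' ')
                       for td in tr.find_all('td')]
                for tr in table.find_all('tr') if tr.find('td') is not None]
        # assumes each data row has as many <td> cells as there are headers;
        # the real pages may need padding or trimming here
        year_frames.append(pd.DataFrame(rows, columns=header))
    if year_frames:
        # stack the years of one stat vertically
        frames.append(pd.concat(year_frames, ignore_index=True))

# then join the different stats side by side on the shared keys
df_merge = functools.reduce(
    lambda left, right: pd.merge(left, right, on=['year', 'PLAYER NAME'],
                                 how='outer'),
    frames)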

0 Answers
