从网页中提取数据

时间:2015-11-09 17:58:43

标签: python extract

我有一个脚本从这里提取数据:http://espn.go.com/nba/statistics/player/_/stat/scoring-per-48-minutes/

在脚本中获取数据的一部分如下所示:

# Each stanza resumes scanning where the previous cell ended, finds the next
# opening marker ('">' or ' >'), skips its 2 characters, and keeps the text
# up to the following '<'.
pts_start = data.find('">',mpg_end) + 2
pts_end = data.find('<',pts_start)
store.append(data[pts_start:pts_end])

# Same pattern, continuing from the end of the previous cell.
mf_start = data.find(' >',pts_end) + 2
mf_end = data.find('<',mf_start)
store.append(data[mf_start:mf_end])

# Same pattern, continuing from the end of the previous cell.
fg_start = data.find(' >',mf_end) + 2
fg_end = data.find('<',fg_start)
store.append(data[fg_start:fg_end])

我看到像 fg、pts 这样的名称与表格的列标题相对应,但我不明白为什么脚本中要把某些名称缩写。

我想修改脚本以获取此表的标题:http://espn.go.com/nba/statistics/player/_/stat/rebounds。我尝试通过插入表格顶部的名称来完成此操作,但生成的CSV文件缺少信息。

完整代码:

import os
import csv
import time
import urllib2

# Page fetched by get_data(); main() rebinds this to each following results page.
uri = 'http://espn.go.com/nba/statistics/player/_/stat/scoring-per-48-minutes'

def get_data():
    """Download the page at the module-level ``uri`` and return its body.

    Returns:
        The raw response body as returned by ``response.read()``, or
        ``False`` when the request fails for any reason. The error is
        printed instead of raised so the caller can stop cleanly.
    """
    try:
        response = urllib2.urlopen(urllib2.Request(uri), timeout=600)
        try:
            return response.read()
        finally:
            # Release the connection even if read() fails part-way.
            response.close()
    except Exception as e:
        # Deliberately broad: any failure is reported and signalled to the
        # caller through the False return value.
        print("\n[!] Error: " + str(e))
        print('')
        return False

# Text that immediately precedes each value following the player name, in
# the column order of the CSV header row:
# TEAM, GP, MPG, PTS, FGM-FGA, FG%, 3PM-3PA, 3P%, FTM-FTA, FT%.
# To scrape a table with different columns, adjust this sequence (one entry
# per column) instead of adding another copy of the find/slice code.
_STAT_OPENERS = ('">', ' >', ' >', '">', ' >', ' >', ' >', ' >', ' >', ' >')


def _read_cell(data, opener, pos):
    """Return ``(text, end)`` for the next cell found at or after ``pos``.

    The text starts right after the next occurrence of ``opener`` and runs
    up to, but not including, the following ``'<'``; ``end`` is the index of
    that ``'<'`` (or -1 if there is none).
    """
    begin = data.find(opener, pos) + len(opener)
    end = data.find('<', begin)
    return data[begin:end], end


def extract(data,rk):
    """Scrape every player row out of ``data`` and append it to data.csv.

    Args:
        data: HTML text of one statistics page.
        rk: Rank number written in the first column of the first row found;
            it is incremented by one for each following row.

    Each occurrence of ``'nba/player/'`` marks one row. The player name is
    the text after the next ``'>'``; the remaining columns are read in
    sequence using ``_STAT_OPENERS``. Nothing is written (and the file is
    not created) when no row is found.
    """
    print('\n[+] Extracting data.')
    rows = []
    start = 0

    while True:
        anchor = data.find('nba/player/', start)
        if anchor == -1:
            break

        name, name_end = _read_cell(data, '>', anchor)
        row = [rk, name]

        pos = name_end
        for opener in _STAT_OPENERS:
            text, pos = _read_cell(data, opener, pos)
            row.append(text)

        rows.append(row)
        rk += 1
        # Always move past the current anchor: on malformed markup name_end
        # can be -1 or lie before the anchor, which would otherwise rescan
        # the same row forever.
        start = max(name_end, anchor + 1)

    if rows:
        # Open the file once for the whole page; the with-block closes it.
        with open("data.csv", "ab") as fcsv:
            csv.writer(fcsv).writerows(rows)

def main():
    """Scrape the scoring table page by page into data.csv.

    Writes the CSV header row if data.csv does not exist yet, then
    repeatedly downloads the page at the module-level ``uri`` and hands it to
    ``extract``. After each page the starting rank advances by
    ``page_size`` and ``uri`` is rebound to the URL whose ``count`` segment
    is that rank. The loop stops when a download fails or the rank exceeds
    ``last_rank``.
    """
    global uri

    page_size = 40   # rank step between consecutive result pages
    last_rank = 300  # stop once the next page would start beyond this rank

    print("\n[+] Initializing...")
    if not os.path.exists("data.csv"):
        # The with-block closes the file; no explicit close() is needed.
        with open("data.csv", "ab") as fcsv:
            csv.writer(fcsv).writerow(["RK","PLAYER","TEAM","GP", "MPG","PTS","FGM-FGA","FG%","3PM-3PA","3P%","FTM-FTA","FT%"])

    rk = 1
    while True:
        time.sleep(1)
        print("\n[+] Getting data, please wait.")
        data = get_data()
        if not data:
            break

        extract(data,rk)

        print("\n[+] Preparing for next page.")
        time.sleep(1.5)
        rk = rk + page_size
        if rk > last_rank:
            print("\n[+] All Done !\n")
            break

        uri = 'http://espn.go.com/nba/statistics/player/_/stat/scoring-per-48-minutes/sort/avg48Points/count/' + str(rk)

if __name__ == '__main__':
    main()

我特别想知道如何根据表头来获取信息,例如 TEAM GP MPG PTS FGM-FGA FG% 3PM-3PA 3P% FTM-FTA FT%。

这样,除了像 pts_start = data.find('">',mpg_end) + 2 中的 pts、mpg 这类名称之外,脚本就不需要做其他修改。

我不明白为什么我不能直接使用表格中列标题的名称。例如,脚本里写的是 ft,而不是 FTM-FTA。

1 个答案:

答案 0 :(得分:0)

使用BeautifulSoup可以轻松地提取html数据。下面的例子只是为了说明思路,并不是你问题的完整解决方案,但你可以很容易地对它进行扩展。

from bs4 import BeautifulSoup
import urllib2


def get_html_page_dom(url):
    """Fetch ``url`` and return its HTML parsed by BeautifulSoup with html5lib.

    Errors raised by the request or the read propagate to the caller.
    """
    response = urllib2.urlopen(url)
    try:
        html_doc = response.read()
    finally:
        # Release the connection whether or not the read succeeded.
        response.close()
    return BeautifulSoup(html_doc, 'html5lib')


def extract_rows(dom):
    """Yield one dict per data row matched by '.mod-content tbody tr'.

    Rows whose ``class`` attribute contains 'colhead' are header rows and
    are skipped. Each yielded dict maps the column names RK, PLAYER, TEAM
    and GP to the ``.string`` of the corresponding cell; PLAYER is taken
    from the first ``<a>`` inside the second cell.
    """
    for row in dom.select('.mod-content tbody tr'):
        css_classes = row.get('class')
        is_header = css_classes is not None and 'colhead' in css_classes
        if not is_header:
            cells = row.select('td')
            record = {}
            record['RK'] = cells[0].string
            record['PLAYER'] = cells[1].select('a')[0].string
            record['TEAM'] = cells[2].string
            record['GP'] = cells[3].string
            # the remaining columns can be added by index in the same way
            yield record

# Script entry point: parse the scoring page and print each extracted row dict.
if __name__ == '__main__':
    dom = get_html_page_dom('http://espn.go.com/nba/statistics/player/_/stat/scoring-per-48-minutes/')
    for data in extract_rows(dom):
        print(data)

您只需运行并查看结果;)。