BeautifulSoup数据抓取

时间:2014-10-27 23:02:55

标签: python web-scraping beautifulsoup

我正试图从wunderground.com获取BeautifulSoup的4年小时数据。但是,当我使用此代码时,我遇到了一些问题。问题是每小时只显示一行。它分离3行(如附件)。当我尝试过滤数据时,很难研究它。我希望将每个数据放在一行中。你能帮帮我吗?

我的代码:

import urllib.request as urllib2, re, time
# importing parser
from bs4 import BeautifulSoup
f = open('weather.txt', 'w')
# Start and end year of simulation
for y in range(2009, 2013):
    # Type the months that you want to extract
# For example for January and February use range(1,3)
    for m in range(1, 13):
#checking for leap years
        for d in range(1,32):
            if y%400 == 0:
                leap = True
            elif y%100 == 0:
                leap = False
            elif y%4 == 0:
                leap = True
            else:
                leap = False
            if (m == 2 and leap and d > 29):
                continue
            elif (m == 2 and d > 28):
                continue
            elif (m in [4, 6, 9, 10] and d > 30):
                continue

            url = "http://www.wunderground.com/history/airport/LTBA/" + str(y) + "/" + str(m) + "/" + str(d) + "/DailyHistory.html"
            page = urllib2.urlopen(url)

            #opening the website with Beautiful Soup
            soup = BeautifulSoup(page)

            # finding section with observation details
            paragList = soup.findAll(id="observations_details")
            counter = 0
            counter_max = 0     # maximum number of columns

            # adding a zero to one digit numbers
            string = ''
            if len(str(m)) < 2:
                mStamp = '0' + str(m)
            else:
                mStamp = str(m)
            if len(str(d)) < 2:
                dStamp = '0' + str(d)
            else:
                dStamp = str(d)

            # time stamp is four digit year, two digit month, and two digit day    
            timestamp = str(y) + mStamp + dStamp
            print(timestamp)

            for i in paragList:

                # writing in text file the header with the name of each column
                headList = i.findAll('th')
                f.write('DATE,')
                for k in headList:
                    h_element = k.text
                    s = str(h_element)
                    f.write(s)
                    f.write(',')
                    counter_max = counter_max + 1
                f.write(' \n')

                # writing in text file each row with data
                tableList = i.findAll('tbody')
                for l in tableList:
                    bodyList = l.findAll('td')
                    for j in bodyList:
                        if counter == 0:
                            f.write(timestamp + ',')
                        if j.string:
                            element = j.text
                            # print (element)
                            s = str(element)
                            f.write(s)
                            f.write(',')
                        else:
                            elementList = j.findAll(j.b) + j.findAll('b')
                            for k in elementList:
                                if k.string:
                                    element = k.text
                                    # print(element)
                                    s = str(element)
                                    f.write(s)
                                    f.write(',')
                        counter = counter + 1
                        if counter == counter_max:
                            # print
                            # print("************************")
                            # print("**** NEXT RECORD *******")
                            # print("**** *******************")
                            f.write('\n')
                            counter = 0
            # print ("\n")
            # print("**** NEXT YEAR *******")


f.close()

示例输出:

enter image description here

0 个答案:

没有答案