尝试使用 BeautifulSoup 完整遍历一个已售房产列表的表格(a table of sold property listings)时遇到了麻烦。
在此示例中,从表中仅获取已售物业所在行的最佳方法是什么?
最终目标是提取售价、售出日期、卧室/浴室/车位的数量以及土地面积,并将它们追加到 pandas 的 DataFrame 中。
from bs4 import BeautifulSoup
import requests
# Globals
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
}
url = 'http://house.ksou.cn/p.php?q=West+Footscray%2C+VIC'

# Fetch the search-results page once and parse the raw response bytes.
r = requests.get(url, headers=headers)
c = r.content
soup = BeautifulSoup(c, "html.parser")

# Look up the table whose id is "mainT".
prop_table = soup.find('table', id="mainT")
# Alternative lookups that were tried:
#prop_table = soup.find('table', {"font-size" : "13px"})
#prop_table = soup.select('.addr') # Pluck out the listings

# soup.find() returns None when nothing matches; fall back to an empty list
# so the loop below is skipped instead of raising AttributeError.
rows = prop_table.find_all('tr') if prop_table is not None else []
for row in rows:
    print(row.text)
答案 0 :(得分:1)
此 HTML 解析起来很棘手,因为它没有固定的结构。不幸的是,我没有安装 pandas,所以我只将数据打印到屏幕上。
import requests
from bs4 import BeautifulSoup
# Search-results URL template; {page} is substituted for each request.
url = 'http://house.ksou.cn/p.php?q=West+Footscray&p={page}&s=1&st=&type=&count=300&region=West+Footscray&lat=0&lng=0&sta=vic&htype=&agent=0&minprice=0&maxprice=0&minbed=0&maxbed=0&minland=0&maxland=0'

# One list per listing: [name, price, sold, beds, bath, car, land, building].
data = []
for page in range(0, 2):  # <-- increase to number of pages you want to crawl
    soup = BeautifulSoup(requests.get(url.format(page=page)).text, 'html.parser')

    # Every table whose id starts with "r" is treated as one listing.
    for table in soup.select('table[id^="r"]'):
        addr = table.select_one('span.addr')
        name = addr.text

        # The first <b> after the address; its last whitespace-separated
        # token is taken as the price.
        price_tag = addr.find_next('b')
        price = price_tag.get_text(strip=True).split()[-1]

        # The text node following that <b> is used as the sale date, with
        # the "in" and "(Auction)" substrings removed.
        sold = price_tag.find_next_sibling(text=True).replace('in', '').replace('(Auction)', '').strip()

        # Each count is the text node immediately before its icon image;
        # '-' is used when the icon is absent.
        beds = table.select_one('img[alt="Bed rooms"]')
        beds = beds.find_previous_sibling(text=True).strip() if beds else '-'
        bath = table.select_one('img[alt="Bath rooms"]')
        bath = bath.find_previous_sibling(text=True).strip() if bath else '-'
        car = table.select_one('img[alt="Car spaces"]')
        car = car.find_previous_sibling(text=True).strip() if car else '-'

        # Sizes are the first token of the text that follows the bold label.
        # NOTE(review): newer soupsieve releases prefer ':-soup-contains'
        # over ':contains' -- confirm against the installed version.
        land = table.select_one('b:contains("Land size:")')
        land = land.find_next_sibling(text=True).split()[0] if land else '-'
        building = table.select_one('b:contains("Building size:")')
        building = building.find_next_sibling(text=True).split()[0] if building else '-'

        data.append([name, price, sold, beds, bath, car, land, building])

# print the data
print('{:^25} {:^15} {:^15} {:^15} {:^15} {:^15} {:^15} {:^15}'.format('Name', 'Price', 'Sold', 'Beds', 'Bath', 'Car', 'Land', 'Building'))
for row in data:
    print('{:<25} {:^15} {:^15} {:^15} {:^15} {:^15} {:^15} {:^15}'.format(*row))
打印:
Name Price Sold Beds Bath Car Land Building
51 Fontein Street $770,000 07 Dec 2019 - - - - -
50 Fontein Street $751,000 07 Dec 2019 - - - - -
9 Wellington Street $1,024,999 Dec 2019 2 1 1 381 -
239 Essex Street $740,000 07 Dec 2019 2 1 1 358 101
677a Barkly Street $780,000 Dec 2019 4 1 - 380 -
23A Busch Street $800,000 30 Nov 2019 3 1 1 215 -
3/2-4 Dyson Street $858,000 Nov 2019 3 2 - 378 119
3/101 Stanhope Street $803,000 30 Nov 2019 2 2 2 168 113
2/4 Rondell Avenue $552,500 30 Nov 2019 2 - - 1,088 -
3/2 Dyson Street $858,000 30 Nov 2019 3 2 2 378 -
9 Vine Street $805,000 Nov 2019 2 1 2 318 -
39 Robbs Road $957,000 23 Nov 2019 2 2 - 231 100
29 Robbs Road $1,165,000 Nov 2019 2 1 1 266 -
5 Busch Street $700,000 Nov 2019 2 1 1 202 -
46 Indwe Street $730,000 16 Nov 2019 3 1 1 470 -
29/132 Rupert Street $216,000 16 Nov 2019 1 1 1 3,640 -
11/10 Carmichael Street $385,000 15 Nov 2019 2 1 1 1,005 -
2/16 Carmichael Street $515,000 14 Nov 2019 2 1 1 112 -
4/26 Beaumont Parade $410,000 Nov 2019 2 1 1 798 -
5/10 Carmichael Street $310,000 Nov 2019 1 1 1 1,004 -