Python - Why is this data being written to the file incorrectly?

Asked: 2015-09-28 10:23:30

Tags: python web-scraping

Only the first result is being written to the CSV, and it comes out as one letter of the URL per row. It isn't all of the URLs, written one per row, as I expected.

What am I doing in the last part of this code that causes the CSV to be written with only one of the results instead of all of them?

import requests
from bs4 import BeautifulSoup
import csv

def grab_listings():
    url = ("http://www.gym-directory.com/listing-category/gyms-fitness-centres/")
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    l_area = soup.find("div", {"class":"wlt_search_results"})
    for elem in l_area.findAll("a", {"class":"frame"}):
        return elem["href"]

    url = ("http://www.gym-directory.com/listing-category/gyms-fitness-centres/page/2/")
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    l_area = soup.find("div", {"class":"wlt_search_results"})
    for elem in l_area.findAll("a", {"class":"frame"}):
        return elem["href"]

    url = ("http://www.gym-directory.com/listing-category/gyms-fitness-centres/page/3/")
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    l_area = soup.find("div", {"class":"wlt_search_results"})
    for elem in l_area.findAll("a", {"class":"frame"}):
        return elem["href"]

    url = ("http://www.gym-directory.com/listing-category/gyms-fitness-centres/page/4/")
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    l_area = soup.find("div", {"class":"wlt_search_results"})
    for elem in l_area.findAll("a", {"class":"frame"}):
        return elem["href"]

    url = ("http://www.gym-directory.com/listing-category/gyms-fitness-centres/page/5/")
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    l_area = soup.find("div", {"class":"wlt_search_results"})
    for elem in l_area.findAll("a", {"class":"frame"}):
        return elem["href"]

    url = ("http://www.gym-directory.com/listing-category/gyms-fitness-centres/page/6/")
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    l_area = soup.find("div", {"class":"wlt_search_results"})
    for elem in l_area.findAll("a", {"class":"frame"}):
        return elem["href"]

    url = ("http://www.gym-directory.com/listing-category/gyms-fitness-centres/page/7/")
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    l_area = soup.find("div", {"class":"wlt_search_results"})
    for elem in l_area.findAll("a", {"class":"frame"}):
        return elem["href"]

    url = ("http://www.gym-directory.com/listing-category/gyms-fitness-centres/page/8/")
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    l_area = soup.find("div", {"class":"wlt_search_results"})
    for elem in l_area.findAll("a", {"class":"frame"}):
        return elem["href"]

    url = ("http://www.gym-directory.com/listing-category/gyms-fitness-centres/page/9/")
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    l_area = soup.find("div", {"class":"wlt_search_results"})
    for elem in l_area.findAll("a", {"class":"frame"}):
        return elem["href"]

l = grab_listings()


with open("gyms.csv", "wb") as file:
    writer = csv.writer(file)
    for row in l:
        writer.writerow(row)
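
The symptom can be reproduced in isolation: a Python string is a sequence of its characters, so both the for row in l loop and csv.writer walk it one letter at a time. A minimal sketch:

import csv
import sys

writer = csv.writer(sys.stdout)

# iterating a string visits one character per pass, so each writerow()
# call below receives a single letter and emits a one-letter row
for row in "http://example.com":
    writer.writerow(row)

writer.writerow(["http://example.com"])  # wrapped in a list: one clean field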

2 Answers:

Answer 0 (Score: 1)

So I refactored your code a bit, and I think it should now work as you expect:

import requests
from bs4 import BeautifulSoup
import csv


def grab_listings(page_idx):
    ret = []
    url = ("http://www.gym-directory.com/listing-category/gyms-fitness-centres/"
           "page/{}/").format(page_idx) # the index of the page will be inserted here
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    l_area = soup.find("div", {"class": "wlt_search_results"})
    for elem in l_area.findAll("a", {"class": "frame"}):
        # be sure to add all your results to a list and return it;
        # if you return here, you will only get the first result
        ret.append(elem["href"])
    return ret


def main():
    l = []  # this will be a list of lists
    # call the function 9 times here, with page_idx from 1 to 9
    for page_idx in range(1, 10):
        l.append(grab_listings(page_idx))
    print(l)

    with open("gyms.csv", "wb") as f:
        writer = csv.writer(f)
        for row in l:
            # be sure that your row is a list here; if it is only
            # a string, all its characters will be separated by commas
            writer.writerow(row)

# alternative: write each URL on its own line, with a comma at the end of each line
#    with open("gyms.csv", "wb") as f:
#        for row in l:
#            string_to_write = ',\n'.join(row)
#            f.write(string_to_write)

if __name__ == '__main__':
    main()

I added some comments to the code; hopefully they explain it well enough. If not, just ask :)
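
The return comment above is the crux of the whole question: a return inside a for loop exits the function on its very first iteration, so the original code produced a single href and the eight page blocks after the first return were never reached. A minimal sketch of the difference:

def first_only(items):
    for item in items:
        return item        # exits the function on the first iteration

def all_of_them(items):
    ret = []
    for item in items:
        ret.append(item)   # collect every element first...
    return ret             # ...then return the whole list

print(first_only(["a", "b", "c"]))   # a
print(all_of_them(["a", "b", "c"]))  # ['a', 'b', 'c']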

Answer 1 (Score: 0)

Simplified to:

import requests
from bs4 import BeautifulSoup
import csv


def grab_listings():
    for i in range(0, 9):  # pages 1 through 9, as in the original code
        url = "http://www.gym-directory.com/listing-category/gyms-fitness-centres/page/{}/"

        r = requests.get(url.format(i + 1))
        soup = BeautifulSoup(r.text, 'html.parser')
        l_area = soup.find("div", {"class": "wlt_search_results"})

        for elem in l_area.findAll("a", {"class": "frame"}):
            yield elem["href"]

l = grab_listings()


with open("gyms.csv", "w") as file:
    writer = csv.writer(file)
    for row in l:
        # wrap the single URL in a list; csv.writer would otherwise split
        # the bare string into one-character fields
        writer.writerow([row])
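
One further refinement (my own suggestion, not something either answer relies on): rather than hard-coding the page count, keep requesting pages until one comes back without a results block. A sketch, assuming the site simply omits the wlt_search_results div past the last page:

import itertools

import requests
from bs4 import BeautifulSoup


def grab_all_listings():
    # hypothetical variation: walk pages 1, 2, 3, ... until the results
    # div is missing, which we take to mean we are past the last page
    base = "http://www.gym-directory.com/listing-category/gyms-fitness-centres/page/{}/"
    for i in itertools.count(1):
        r = requests.get(base.format(i))
        soup = BeautifulSoup(r.text, 'html.parser')
        l_area = soup.find("div", {"class": "wlt_search_results"})
        if l_area is None:
            break
        for elem in l_area.findAll("a", {"class": "frame"}):
            yield elem["href"]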