I wrote a simple Python scraper to fetch some documents from a specific page on nytimes.com. It scrapes and formats all the URLs correctly, attempts to download the files, and formats the filenames correctly.
But all I get are 1 KB files, and I can't figure out why. Here is my code:
import urllib2
import urllib
from cookielib import CookieJar

files = 'http://www.nytimes.com/interactive/2014/11/25/us/evidence-released-in-michael-brown-case.html?_r=0'
slashpos = 0

def getLinks(url):
    cj = CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    p = opener.open(url)
    result = []
    for line in p:
        for element in line.split():
            if element.startswith('href="http://gr'):
                if element.endswith('pdf"') or element.endswith('png"') or element.endswith('jpg"'):
                    result.append(element[6:])
                else:
                    continue
    for char in result:
        slashpos = char.rfind('/') + 1
        urllib.urlretrieve(char, char[slashpos:-1])

getLinks(files)
Any and all help is appreciated. Thanks!
Answer 0 (score: 0)
1) Use result.append(element[6:-1]) instead of result.append(element[6:]). This strips the trailing double quote from the URL, which is what was making the downloads fail.
2) To save the file, use urllib.urlretrieve(char, char[slashpos:]) instead of urllib.urlretrieve(char, char[slashpos:-1]), so the last character of the filename is not cut off.
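Putting both corrections together, a minimal sketch of the fixed download loop could look like this (it keeps the urllib2/cookielib approach and the names from the question; the href filtering is unchanged):

import urllib
import urllib2
from cookielib import CookieJar

def getLinks(url):
    cj = CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    p = opener.open(url)
    result = []
    for line in p:
        for element in line.split():
            if element.startswith('href="http://gr'):
                if element.endswith('pdf"') or element.endswith('png"') or element.endswith('jpg"'):
                    # [6:-1] drops the leading 'href="' and the trailing '"'
                    result.append(element[6:-1])
    for link in result:
        slashpos = link.rfind('/') + 1
        # link[slashpos:] keeps the full filename, including its last character
        urllib.urlretrieve(link, link[slashpos:])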
Answer 1 (score: 0)
解决了!! :d
#!/usr/bin/env python
from bs4 import BeautifulSoup
import urllib2
import urlparse
from sys import argv
from cookielib import CookieJar

if len(argv) != 2:
    print "Usage:\n\tpython %s 'http://www.nytimes.com/interactive/2014/11/25/us/evidence-released-in-michael-brown-case.html?_r=0'" % argv[0]
    exit()

url = argv[1]
urls = []

# Open the page with a cookie-aware opener
try:
    cj = CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    html = opener.open(url)
except:
    print "[-] No such website"
    exit()

soup = BeautifulSoup(html)

# Walk every <a> tag, resolve relative hrefs, and download .png/.jpg links
for tag in soup.find_all('a'):
    try:
        tag['href'] = urlparse.urljoin(url, tag['href'])
        if tag['href'] not in urls and ('.png' in tag['href'] or '.jpg' in tag['href']):
            newpdf = tag['href'].split("/")
            name = newpdf[-1]
            resp = urllib2.urlopen(tag['href'])
            meta_data = resp.info()
            fsize = int(meta_data.getheaders("Content-Length")[0])
            print "Downloading --> %s \t size: %s " % (name, fsize)
            f = open(name, "wb")
            f.write(resp.read())
            f.close()
            urls.append(tag['href'])
        else:
            print tag['href']
    except KeyboardInterrupt:
        print " User hit CTRL+C"
        exit()
    except:
        pass
Hope it helps!