如何从URL中提取某些字符串

时间:2013-06-12 13:50:45

标签: python scrapy

我正在尝试从下面提到的URL中提取某些字符串:

示例网址:

http://www.ladyblush.com/buy-sarees-online.html?p=1
http://www.ladyblush.com/buy-ladies-suits-online.html?p=1
http://www.ladyblush.com/buy-women-fashion-accessories.html?p=1

我想提取:

productCategory = "sarees" productSubCategory = ""
productCategory = "ladies" productSubCategory = "suits"
productCategory = "women" productSubCategory = "fashion-accessories"

等等。实际上我正在写一个爬虫(spider),我需要如上所述从 URL 中提取 productCategory 和 productSubCategory。所以我试图在 parse 方法中从 response.url 提取这些字段。请大家帮帮我。

我的代码:

import re
from scrapy.http import Request
from eScraper.items import EscraperItem
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider

#------------------------------------------------------------------------------ 

class ESpider(CrawlSpider):
    """Spider for ladyblush.com category listing pages.

    Pre-generates paginated start URLs for each product category, scrapes
    the product tiles from every listing page in parse(), then follows each
    product's detail page in parsePage2() to fill availability, variants,
    description and extra images.

    NOTE(review): overriding parse() on a CrawlSpider disables its Rule
    machinery; since no rules are defined here this works, but a plain
    BaseSpider would express the intent better.
    """

    name = "ladyblushSpider"
    allowed_domains = ["ladyblush.com"]
    URLSList = []

    # The site paginates each category listing via '?p=<n>'; pre-generate
    # pages 1..99 for every category.
    for n in range(1, 100):

        URLSList.append('http://www.ladyblush.com/buy-sarees-online.html?p=' + str(n))
        URLSList.append('http://www.ladyblush.com/buy-ladies-suits-online.html?p=' + str(n))
        URLSList.append('http://www.ladyblush.com/buy-women-fashion-accessories.html?p=' + str(n))
        URLSList.append('http://www.ladyblush.com/buy-nightwear-lingerie-online.html?p=' + str(n))
        URLSList.append('http://www.ladyblush.com/buy-women-dress-online-skirts-suits-kurtis-tops.html?p=' + str(n))
        URLSList.append('http://www.ladyblush.com/buy-decor-online-wallclock-bedsheets-cushions-bedcovers.html?p=' + str(n))
        URLSList.append('http://www.ladyblush.com/buy-cosmetics-online-massage-oils-aromatherapy-perfumes-soaps.html?p=' + str(n))
        URLSList.append('http://www.ladyblush.com/buy-jewelery-online-art-fashion-semi-precious-antique-junk-jewellery.html?p=' + str(n))

    start_urls = URLSList

    def _extract_categories(self, url):
        """Derive (productCategory, productSubCategory) from a listing URL.

        Examples:
            .../buy-sarees-online.html?p=1             -> ('sarees', '')
            .../buy-ladies-suits-online.html?p=1       -> ('ladies', 'suits')
            .../buy-women-fashion-accessories.html?p=1 -> ('women', 'fashion-accessories')
        """
        # Last path segment, without the query string:
        # e.g. 'buy-women-fashion-accessories.html'.
        slug = url.rsplit('/', 1)[-1].split('?', 1)[0]
        if slug.endswith('.html'):
            slug = slug[:-len('.html')]
        # Drop the boilerplate 'buy'/'online' tokens; the first remaining
        # word is the category, the rest form the sub-category.
        words = [w for w in slug.split('-') if w not in ('buy', 'online')]
        if not words:
            return '', ''
        return words[0], '-'.join(words[1:])

    def parse(self, response):
        """Yield one detail-page Request per product tile on this listing
        page; the partially-filled item rides along in request.meta."""

        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//div[@class="third thumbnailSpillLarge"]')

        # Derive the category fields from the listing URL instead of
        # leaving them empty (this is what the question asks for).
        productCategory, productSubCategory = self._extract_categories(response.url)

        for site in sites:

            item = EscraperItem()

            item['currency'] = 'INR'
            item['productCategory'] = [productCategory]
            item['productSubCategory'] = [productSubCategory]
            item['productSite'] = ["http://ladyblush.com/"]
            item['productImage'] = site.select('./a/div/img/@src').extract()
            item['productTitle'] = site.select('./a/div/img/@title').extract()
            item['productURL'] = [site.select('./a/@href').extract()[0].replace(" ","%20")]
            productMRP = site.select('.//div[@class="salePrice"]//div[@class="price-box"]//p[@class="old-price"]//span[@class="price"]/text()').extract()
            productPrice = site.select('.//div[@class="salePrice"]//div[@class="price-box"]//p[@class="special-price"]//span[@class="price"]/text()').extract()

            if productMRP and productPrice:
                # NOTE(review): index [1] assumes each price node yields at
                # least two text chunks — confirm against the live markup,
                # otherwise this raises IndexError.
                price = [productMRP[1].strip()] + [productPrice[1].strip()]
            else:
                price = site.select('.//div[@class="salePrice"]//div[@class="price-box"]//span[@class="regular-price"]//span[@class="price"]/text()').extract()
            item['productPrice'] = price

            secondURL = item['productURL'][0]
            request = Request(secondURL, callback=self.parsePage2)
            request.meta['item'] = item

            yield request

    def parsePage2(self, response):
        """Complete the item carried in request.meta with detail-page
        fields (availability, variants, description, extra images) and
        emit it."""

        hxs = HtmlXPathSelector(response)

        item = response.meta['item']

        # An 'addtocart-container' message div is only present when the
        # product cannot be bought.
        if hxs.select('//div[@class="addtocart-container"]/div/text()').extract():
            item['availability'] = False
        else:
            item['availability'] = True

        # Required option labels indicate size/colour variants.
        if hxs.select('//label[@class="required"]/text()').extract():
            item['hasVariants'] = True
        else:
            item['hasVariants'] = False
        item['image_urls'] = list(set(item['productImage']))
        item['productDesc'] = [" ".join([re.sub(r'[\t\n\r]',"",i.strip()) for i in hxs.select('//div[@class="std"]/text()').extract()])]
        item['productImage'] = item['productImage'] + hxs.select('//div[@class="more-views"]/ul/li/a/img/@src').extract() + hxs.select('//div[@class="more-views"]/ul/li/a/@href').extract()

        return item

#------------------------------------------------------------------------------ 

1 个答案:

答案 0 :(得分:1)

你可以在 parse 方法中从 response.url 获取网址。然后,您可以解析它以获取 URL 路径:

import os

test = 'buy-women-fashion-accessories.html?p=1'
# Drop the query string first; otherwise splitext treats '?p=1' as part of
# the extension and returns ('buy-women-fashion-accessories', '.html?p=1').
stem = test.partition('?')[0]
parts = os.path.splitext(stem)
# ('buy-women-fashion-accessories', '.html')
words = parts[0].split('-')[1:]
# ['women', 'fashion', 'accessories']

但这是一个相当脆弱的解决方案。你确定数据没有存储在你正在解析的页面的html中,而不是查看网址吗?

相关问题