How to add new requests for my Scrapy Spider during crawling?
I am using XMLFeedSpider in Scrapy to scrape a real estate website.
Each URL request generated by my spider (via start_urls) returns a page of XML with a bunch of ads and a link to the next page (search results are limited to 50 ads per page).
I am therefore wondering how I can add that additional page as a new request in my spider?
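For reference, each response looks roughly like this (a simplified, hand-written sample; the element names are taken from the XPaths in my spider below, and the root element is my guess, so take the exact structure with a grain of salt):

<recherche>
    <annonce>
        <idannonce>12345</idannonce>
        <titre>...</titre>
        <prix>...</prix>
        <!-- more fields per ad -->
    </annonce>
    <!-- ... up to 50 <annonce> nodes per page ... -->
    <pagesuivante>http://ws.seloger.com/search.xml?cp=72&amp;idtt=2&amp;tri=d_dt_crea&amp;searchpg=2</pagesuivante>
</recherche>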
I've been searching through Stack Overflow for a while but I can't find a simple answer to my problem!
Below is the code I have in my spider. I have updated the parse_nodes() method as mentioned by Paul, but the next URL is not picked up for some reason.
Could I yield additional requests in the adapt_response method?
from scrapy import log
from scrapy.selector import XmlXPathSelector
from scrapy.contrib.spiders import XMLFeedSpider
from crawler.items import RefItem, PicItem
from crawler.seloger_helper import urlbuilder
from scrapy.http import Request

class Seloger_spider_xml(XMLFeedSpider):
    name = 'seloger_spider_xml'
    allowed_domains = ['seloger.com']
    iterator = 'iternodes'  # unnecessary, since it's the default value
    itertag = 'annonce'

    '''The spider is initialized with a departement argument'''
    def __init__(self, departement=None, *args, **kwargs):
        super(Seloger_spider_xml, self).__init__(*args, **kwargs)
        #self.start_urls = urlbuilder(departement)  # helper function to generate the start_urls
        self.start_urls = ['http://ws.seloger.com/search.xml?cp=72&idtt=2&tri=d_dt_crea&searchpg=1']

    def parse_node(self, response, node):
        items = []

        item = RefItem()
        item['ref'] = int(''.join(node.select('//annonce/idannonce/text()').extract()))
        item['desc'] = ''.join(node.select('//annonce/descriptif/text()').extract()).encode('utf-8')
        item['libelle'] = ''.join(node.select('//annonce/libelle/text()').extract()).encode('utf-8')
        item['titre'] = ''.join(node.select('//annonce/titre/text()').extract()).encode('utf-8')
        item['ville'] = ''.join(node.select('//annonce/ville/text()').extract()).encode('utf-8')
        item['url'] = ''.join(node.select('//annonce/permalien/text()').extract()).encode('utf-8')
        item['prix'] = ''.join(node.select('//annonce/prix/text()').extract())
        item['prixunite'] = ''.join(node.select('//annonce/prixunite/text()').extract())
        item['datemaj'] = ''.join(node.select('//annonce/dtfraicheur/text()').extract())[:10]
        item['datecrea'] = ''.join(node.select('//annonce/dtcreation/text()').extract())[:10]
        item['lati'] = ''.join(node.select('//annonce/latitude/text()').extract())
        item['longi'] = ''.join(node.select('//annonce/longitude/text()').extract())
        item['surface'] = ''.join(node.select('//annonce/surface/text()').extract())
        item['surfaceunite'] = ''.join(node.select('//annonce/surfaceunite/text()').extract())
        item['piece'] = ''.join(node.select('//annonce/nbpiece/text()').extract()).encode('utf-8')
        item['ce'] = ''.join(node.select('//annonce/dbilanemissionges/text()').extract()).encode('utf-8')
        items.append(item)

        for photos in node.select('//annonce/photos'):
            for link in photos.select('photo/thburl/text()').extract():
                pic = PicItem()
                pic['pic'] = link.encode('utf-8')
                pic['refpic'] = item['ref']
                items.append(pic)

        return items

    def parse_nodes(self, response, nodes):
        # call the built-in method that calls parse_node()
        # and yield whatever it returns
        for n in super(Seloger_spider_xml, self).parse_nodes(response, nodes):
            yield n

        # once you're done with the items/nodes,
        # get the next page link using XPath;
        # these lines are borrowed from
        # https://github.com/scrapy/scrapy/blob/master/scrapy/contrib/spiders/feed.py#L73
        selector = XmlXPathSelector(response)
        self._register_namespaces(selector)
        for link_url in selector.select('//pagesuivante/text()').extract():
            yield Request(link_url)
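To rule out the XPath itself, here is a minimal standalone check I put together (the XML body is made up to mirror the sample above, so it only proves the selector logic, not the real feed; note in particular that XPath is case-sensitive, so if the live feed spells the tag <pageSuivante>, '//pagesuivante/text()' will silently match nothing):

from scrapy.http import XmlResponse
from scrapy.selector import XmlXPathSelector

# hand-written body mirroring the assumed feed structure
body = """<recherche>
    <annonce><idannonce>1</idannonce></annonce>
    <pagesuivante>http://ws.seloger.com/search.xml?cp=72&amp;idtt=2&amp;tri=d_dt_crea&amp;searchpg=2</pagesuivante>
</recherche>"""

response = XmlResponse(url='http://ws.seloger.com/search.xml', body=body)
xxs = XmlXPathSelector(response)

# should print the URL of the next results page
print xxs.select('//pagesuivante/text()').extract()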
Thank you, Gilles
You can override the parse_nodes() method to hook in your "next page" URL extraction.
The example below is based on the XMLFeedSpider example from the Scrapy docs:
from scrapy import log
from scrapy.contrib.spiders import XMLFeedSpider
from myproject.items import TestItem
from scrapy.selector import XmlXPathSelector
from scrapy.http import Request

class MySpider(XMLFeedSpider):
    name = 'example.com'
    allowed_domains = ['example.com']
    start_urls = ['http://www.example.com/feed.xml']
    iterator = 'iternodes'  # unnecessary, since it's the default value
    itertag = 'item'

    def parse_node(self, response, node):
        log.msg('Hi, this is a <%s> node!: %s' % (self.itertag, ''.join(node.extract())))

        item = TestItem()
        item['id'] = node.select('@id').extract()
        item['name'] = node.select('name').extract()
        item['description'] = node.select('description').extract()
        return item

    def parse_nodes(self, response, nodes):
        # call the built-in method that calls parse_node()
        # and yield whatever it returns
        for n in super(MySpider, self).parse_nodes(response, nodes):
            yield n

        # once you're done with the items/nodes,
        # get the next page link using XPath;
        # these lines are borrowed from
        # https://github.com/scrapy/scrapy/blob/master/scrapy/contrib/spiders/feed.py#L73
        selector = XmlXPathSelector(response)
        self._register_namespaces(selector)
        for link_url in selector.select('//pagesuivante/text()').extract():
            print "link_url", link_url
            yield Request(link_url)
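One caveat worth adding (an assumption on my part, in case it applies to your feed): Request() needs an absolute URL and will raise an error on a relative one, so if <pagesuivante> ever contains a relative link, join it against the current response URL first:

import urlparse  # Python 2 stdlib

for link_url in selector.select('//pagesuivante/text()').extract():
    # make the URL absolute before scheduling the new request
    yield Request(urlparse.urljoin(response.url, link_url))

With that in place, every next-page URL that is found gets scheduled as a fresh request and flows through parse_node() just like the start URL did.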