python - How to add new requests for my Scrapy Spider during crawling?


I use XMLFeedSpider in Scrapy to scrape a real estate website.

Each URL request generated by the spider (via start_urls) returns a page of XML with a bunch of ads and a link to the next page (search results are limited to 50 ads).

I am therefore wondering how to add the additional pages as new requests in my spider?

I've been searching through Stack Overflow for a while but can't find a simple answer to my problem!

Below is the code I have in my spider. I have updated the parse_nodes() method as mentioned by Paul, but the next URL is not picked up for some reason.

Could I yield additional requests in the adapt_response method?
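For reference, adapt_response() is only a hook that receives the response before XMLFeedSpider parses the feed, and it must return a Response object, so it is not a place to yield new Requests. A minimal sketch of a typical override, where the body clean-up is purely a hypothetical example:

from scrapy.contrib.spiders import XMLFeedSpider

class Seloger_spider_XML(XMLFeedSpider):
    # ... existing attributes and methods ...

    def adapt_response(self, response):
        # Called with the raw response before the feed is parsed; it must
        # return a Response, so Requests cannot be yielded from here.
        # Hypothetical clean-up: strip leading whitespace from the XML body.
        return response.replace(body=response.body.lstrip())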

from scrapy.spider import log
from scrapy.selector import XmlXPathSelector
from scrapy.contrib.spiders import XMLFeedSpider
from crawler.items import RefItem, PicItem
from crawler.seloger_helper import urlbuilder
from scrapy.http import Request


class Seloger_spider_XML(XMLFeedSpider):
    name = 'seloger_spider_xml'
    allowed_domains = ['seloger.com']
    iterator = 'iternodes'  # unnecessary, since it's the default value
    itertag = 'annonce'

    '''Spider initialized with a departement argument'''
    def __init__(self, departement=None, *args, **kwargs):
        super(Seloger_spider_XML, self).__init__(*args, **kwargs)
        #self.start_urls = urlbuilder(departement)  # helper function to generate start_urls
        self.start_urls = ['http://ws.seloger.com/search.xml?cp=72&idtt=2&tri=d_dt_crea&searchpg=1']

    def parse_node(self, response, node):
        items = []
        item = RefItem()
        item['ref'] = int(''.join(node.select('//annonce/idannonce/text()').extract()))
        item['desc'] = ''.join(node.select('//annonce/descriptif/text()').extract()).encode('utf-8')
        item['libelle'] = ''.join(node.select('//annonce/libelle/text()').extract()).encode('utf-8')
        item['titre'] = ''.join(node.select('//annonce/titre/text()').extract()).encode('utf-8')
        item['ville'] = ''.join(node.select('//annonce/ville/text()').extract()).encode('utf-8')
        item['url'] = ''.join(node.select('//annonce/permalien/text()').extract()).encode('utf-8')
        item['prix'] = ''.join(node.select('//annonce/prix/text()').extract())
        item['prixunite'] = ''.join(node.select('//annonce/prixunite/text()').extract())
        item['datemaj'] = ''.join(node.select('//annonce/dtfraicheur/text()').extract())[:10]
        item['datecrea'] = ''.join(node.select('//annonce/dtcreation/text()').extract())[:10]
        item['lati'] = ''.join(node.select('//annonce/latitude/text()').extract())
        item['longi'] = ''.join(node.select('//annonce/longitude/text()').extract())
        item['surface'] = ''.join(node.select('//annonce/surface/text()').extract())
        item['surfaceunite'] = ''.join(node.select('//annonce/surfaceunite/text()').extract())
        item['piece'] = ''.join(node.select('//annonce/nbpiece/text()').extract()).encode('utf-8')
        item['ce'] = ''.join(node.select('//annonce/dbilanemissionges/text()').extract()).encode('utf-8')
        items.append(item)

        for photos in node.select('//annonce/photos'):
            for link in photos.select('photo/thburl/text()').extract():
                pic = PicItem()
                pic['pic'] = link.encode('utf-8')
                pic['refpic'] = item['ref']
                items.append(pic)

        return items

    def parse_nodes(self, response, nodes):
        for n in super(Seloger_spider_XML, self).parse_nodes(response, nodes):
            yield n

        # once you're done with the items/nodes,
        # extract the next page link using XPath;
        # these lines are borrowed from
        # https://github.com/scrapy/scrapy/blob/master/scrapy/contrib/spiders/feed.py#l73
        selector = XmlXPathSelector(response)
        self._register_namespaces(selector)
        for link_url in selector.select('//pagesuivante/text()').extract():
            yield Request(link_url)

Thank you, Gilles

You can override the parse_nodes() method to hook in your "next page" URL extraction.

The example below is based on the XMLFeedSpider example from the Scrapy docs:

from scrapy import log
from scrapy.contrib.spiders import XMLFeedSpider
from myproject.items import TestItem
from scrapy.selector import XmlXPathSelector
from scrapy.http import Request


class MySpider(XMLFeedSpider):
    name = 'example.com'
    allowed_domains = ['example.com']
    start_urls = ['http://www.example.com/feed.xml']
    iterator = 'iternodes'  # unnecessary, since it's the default value
    itertag = 'item'

    def parse_node(self, response, node):
        log.msg('Hi, this is a <%s> node!: %s' % (self.itertag, ''.join(node.extract())))

        item = TestItem()
        item['id'] = node.select('@id').extract()
        item['name'] = node.select('name').extract()
        item['description'] = node.select('description').extract()
        return item

    def parse_nodes(self, response, nodes):
        # call the built-in method that calls parse_node()
        # and yield whatever it returns
        for n in super(MySpider, self).parse_nodes(response, nodes):
            yield n

        # once you're done with the items/nodes,
        # look for the next page link using XPath;
        # these lines are borrowed from
        # https://github.com/scrapy/scrapy/blob/master/scrapy/contrib/spiders/feed.py#l73
        selector = XmlXPathSelector(response)
        self._register_namespaces(selector)
        for link_url in selector.select('//pagesuivante/text()').extract():
            print "link_url", link_url
            yield Request(link_url)
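A follow-up note: the Request yielded for the next page needs no explicit callback, because the spider's default parse() will run the new response through parse_node()/parse_nodes() again, so pagination repeats until no pagesuivante element is left. Also, XmlXPathSelector is deprecated in more recent Scrapy releases; below is a sketch of the same next-page extraction using the newer Selector API (the module paths assume Scrapy 1.x or later, which goes beyond the versions used in the original post):

from scrapy import Request, Selector
from scrapy.spiders import XMLFeedSpider


class MySpider(XMLFeedSpider):
    # ... same attributes and parse_node() as in the example above ...

    def parse_nodes(self, response, nodes):
        # yield the items produced by parse_node() for each <item> node
        for n in super(MySpider, self).parse_nodes(response, nodes):
            yield n

        # then follow the "next page" link, if the feed contains one
        for link_url in Selector(response).xpath('//pagesuivante/text()').extract():
            yield Request(link_url)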
