python - Gevent link crawler


I have written code in Python that uses BeautifulSoup to parse the links on a page into a repository of links. Next, it fetches the contents of any of the URLs from the repository just created, parses the links from this new content into the repository, and continues processing the links in the repository until stopped or until a given number of links has been fetched.

But the code is slow. How can I improve it using asynchronous programming with gevent in Python?


Code

import itertools
import random
import urllib2
import BeautifulSoup


class Crawler(object):

    def __init__(self):
        self.soup = None                                # Beautiful Soup object
        self.current_page = "http://www.python.org/"    # current page's address
        self.links = set()                              # queue of every link fetched
        self.visited_links = set()

        self.counter = 0  # simple counter for debug purposes

    def open(self):

        # open the url
        print self.counter, ":", self.current_page
        res = urllib2.urlopen(self.current_page)
        html_code = res.read()
        self.visited_links.add(self.current_page)

        # fetch every link
        self.soup = BeautifulSoup.BeautifulSoup(html_code)

        page_links = []
        try:
            page_links = itertools.ifilter(  # deal with absolute links only
                lambda href: 'http://' in href,
                (a.get('href') for a in self.soup.findAll('a')))
        except Exception as e:  # magnificent exception handling
            print 'Error: ', e
            pass

        # update the set of links
        self.links = self.links.union(set(page_links))

        # choose a random url from the non-visited set
        self.current_page = random.sample(self.links.difference(self.visited_links), 1)[0]
        self.counter += 1

    def run(self):

        # crawl 3 webpages (or stop if every url has been fetched)
        while len(self.visited_links) < 3 or (self.visited_links == self.links):
            self.open()

        for link in self.links:
            print link


if __name__ == '__main__':
    c = Crawler()
    c.run()

Update 1


import gevent.monkey; gevent.monkey.patch_thread()
from bs4 import BeautifulSoup
import urllib2
import itertools
import random
import urlparse
import sys

import gevent.monkey; gevent.monkey.patch_all(thread=False)


class Crawler(object):

    def __init__(self):
        self.soup = None                                # Beautiful Soup object
        self.current_page = "http://www.python.org/"    # current page's address
        self.links = set()                              # queue of every link fetched
        self.visited_links = set()

        self.counter = 0  # simple counter for debug purposes

    def open(self):

        # open the url
        print self.counter, ":", self.current_page
        res = urllib2.urlopen(self.current_page)
        html_code = res.read()
        self.visited_links.add(self.current_page)

        # fetch every link
        self.soup = BeautifulSoup(html_code)

        page_links = []
        try:
            for link in [h.get('href') for h in self.soup.find_all('a')]:
                print "Found link: '" + link + "'"
                if link.startswith('http'):
                    print 'entered in if link: ', link
                    page_links.append(link)
                    print "Adding link " + link + "\n"
                elif link.startswith('/'):
                    print 'entered in elif link: ', link
                    parts = urlparse.urlparse(self.current_page)
                    page_links.append(parts.scheme + '://' + parts.netloc + link)
                    print "Adding link " + parts.scheme + '://' + parts.netloc + link + "\n"
                else:
                    print 'entered in else link: ', link
                    page_links.append(self.current_page + link)
                    print "Adding link " + self.current_page + link + "\n"

        except Exception as ex:  # magnificent exception handling
            print ex

        # update the set of links
        self.links = self.links.union(set(page_links))

        # choose a random url from the non-visited set
        self.current_page = random.sample(self.links.difference(self.visited_links), 1)[0]
        self.counter += 1

    def run(self):

        # crawl 3 webpages (or stop if every url has been fetched)
        crawling_greenlets = []

        for i in range(3):
            crawling_greenlets.append(gevent.spawn(self.open))

        gevent.joinall(crawling_greenlets)

        # while len(self.visited_links) < 4 or (self.visited_links == self.links):
        #     self.open()

        for link in self.links:
            print link


if __name__ == '__main__':
    c = Crawler()
    c.run()

Answer

Import gevent and make sure monkey-patching is done, so that standard library calls become non-blocking and aware of gevent:

import gevent
from gevent import monkey; monkey.patch_all()

(You can selectively decide what has to be monkey-patched, but let's say that is not your problem at the moment.)
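For illustration only (this sketch is not from the original answer): once patch_all() has been applied, each urllib2.urlopen call blocks only its own greenlet, so several fetches can overlap instead of running one after another. The fetch helper and the python.org URLs below are placeholder assumptions.

# Minimal sketch of concurrent fetches with a monkey-patched standard library.
from gevent import monkey; monkey.patch_all()

import gevent
import urllib2

def fetch(url):
    # runs inside a greenlet; urlopen yields to other greenlets while it waits
    return url, len(urllib2.urlopen(url).read())

urls = ["http://www.python.org/",
        "http://www.python.org/about/",
        "http://www.python.org/doc/"]

jobs = [gevent.spawn(fetch, u) for u in urls]
gevent.joinall(jobs)

for job in jobs:
    print job.value      # (url, number of bytes read)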

Then, in run, make your open function be called inside a greenlet. run can then return the greenlet object, so you can wait for it whenever you need the results, using gevent.joinall for example. Something like this:

def run(self):
    return gevent.spawn(self.open)

c1 = Crawler()
c2 = Crawler()
c3 = Crawler()
crawling_tasks = [c.run() for c in (c1, c2, c3)]
gevent.joinall(crawling_tasks)

print [c.links for c in (c1, c2, c3)]
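As a hypothetical follow-up (not part of the original answer), each greenlet could also keep calling open until a page budget is reached, with gevent.pool.Pool capping how many crawlers run at once. The crawl_n helper, max_pages, and the pool size of 2 below are illustrative choices, and Crawler is the class defined above.

import gevent
from gevent.pool import Pool

def crawl_n(crawler, max_pages=3):
    # keep fetching pages with the Crawler defined above until the budget is hit
    while len(crawler.visited_links) < max_pages:
        crawler.open()

pool = Pool(2)                                    # at most 2 crawlers active at once
crawlers = [Crawler() for _ in range(3)]
tasks = [pool.spawn(crawl_n, c) for c in crawlers]
gevent.joinall(tasks)

A pool mainly pays off once the number of crawlers grows; for just three of them, plain gevent.spawn as in the answer above is enough.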
