Python - Gevent link crawler
Here I have written code in Python that uses Beautiful Soup to parse the links on a page into a repository of links. Next, it fetches the contents of any URL from the repository just created, parses the links out of this new content into the repository, and continues processing the links in the repository until stopped or until a given number of links has been fetched.
But the code is slow. How can I improve it using asynchronous programming with gevent in Python?
Code:
import urllib2
import itertools
import random
import BeautifulSoup


class Crawler(object):
    def __init__(self):
        self.soup = None                               # Beautiful Soup object
        self.current_page = "http://www.python.org/"   # Current page's address
        self.links = set()                             # Queue with every link fetched
        self.visited_links = set()
        self.counter = 0                               # Simple counter for debug purposes

    def open(self):
        # Open url
        print self.counter, ":", self.current_page
        res = urllib2.urlopen(self.current_page)
        html_code = res.read()
        self.visited_links.add(self.current_page)

        # Fetch every link
        self.soup = BeautifulSoup.BeautifulSoup(html_code)
        page_links = []
        try:
            page_links = itertools.ifilter(  # Only deal with absolute links
                lambda href: 'http://' in href,
                (a.get('href') for a in self.soup.findAll('a')))
        except Exception as e:  # Magnificent exception handling
            print 'Error: ', e

        # Update links
        self.links = self.links.union(set(page_links))

        # Choose a random url from the non-visited set
        self.current_page = random.sample(
            self.links.difference(self.visited_links), 1)[0]
        self.counter += 1

    def run(self):
        # Crawl 3 webpages (or stop if every url has been fetched)
        while len(self.visited_links) < 3 or (self.visited_links == self.links):
            self.open()

        for link in self.links:
            print link


if __name__ == '__main__':
    c = Crawler()
    c.run()
Update 1
import gevent.monkey; gevent.monkey.patch_thread()
from bs4 import BeautifulSoup
import urllib2
import itertools
import random
import urlparse
import sys

import gevent.monkey; gevent.monkey.patch_all(thread=False)


class Crawler(object):
    def __init__(self):
        self.soup = None                               # Beautiful Soup object
        self.current_page = "http://www.python.org/"   # Current page's address
        self.links = set()                             # Queue with every link fetched
        self.visited_links = set()
        self.counter = 0                               # Simple counter for debug purposes

    def open(self):
        # Open url
        print self.counter, ":", self.current_page
        res = urllib2.urlopen(self.current_page)
        html_code = res.read()
        self.visited_links.add(self.current_page)

        # Fetch every link
        self.soup = BeautifulSoup(html_code)
        page_links = []
        try:
            for link in [h.get('href') for h in self.soup.find_all('a')]:
                print "Found link: '" + link + "'"
                if link.startswith('http'):
                    print 'entered in if link: ', link
                    page_links.append(link)
                    print "Adding link" + link + "\n"
                elif link.startswith('/'):
                    print 'entered in elif link: ', link
                    parts = urlparse.urlparse(self.current_page)
                    page_links.append(parts.scheme + '://' + parts.netloc + link)
                    print "Adding link " + parts.scheme + '://' + parts.netloc + link + "\n"
                else:
                    print 'entered in else link: ', link
                    page_links.append(self.current_page + link)
                    print "Adding link " + self.current_page + link + "\n"
        except Exception as ex:  # Magnificent exception handling
            print ex

        # Update links
        self.links = self.links.union(set(page_links))

        # Choose a random url from the non-visited set
        self.current_page = random.sample(
            self.links.difference(self.visited_links), 1)[0]
        self.counter += 1

    def run(self):
        # Crawl 3 webpages (or stop if every url has been fetched)
        crawling_greenlets = []

        for i in range(3):
            crawling_greenlets.append(gevent.spawn(self.open))

        gevent.joinall(crawling_greenlets)

        # while len(self.visited_links) < 4 or (self.visited_links == self.links):
        #     self.open()

        for link in self.links:
            print link


if __name__ == '__main__':
    c = Crawler()
    c.run()
Import gevent and make sure the monkey-patching is done, so that standard library calls become non-blocking and gevent-aware:
import gevent
from gevent import monkey; monkey.patch_all()
(You can selectively decide what has to be monkey-patched, but let's say that is not your problem at the moment.)
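For example, here is a minimal sketch of selective patching (my illustration, not part of the original answer), assuming the crawler only needs the socket layer to be cooperative:

from gevent import monkey

# Sketch: patch only what urllib2 ultimately blocks on (an assumption here;
# monkey.patch_all() remains the simplest, safest choice if you are unsure).
monkey.patch_socket()   # make socket operations cooperative
monkey.patch_ssl()      # needed as soon as any crawled URL uses https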
Then, in run, make the open function be called inside a greenlet. run can return the greenlet object, so you can wait for it whenever you need the results, using gevent.joinall for example. Something like this:
def run(self):
    return gevent.spawn(self.open)


c1 = Crawler()
c2 = Crawler()
c3 = Crawler()
crawling_tasks = [c.run() for c in (c1, c2, c3)]
gevent.joinall(crawling_tasks)
print [c.links for c in (c1, c2, c3)]
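For completeness, here is a minimal sketch of the same pattern applied to a flat list of URLs once monkey-patching is in place; the fetch helper and the URL list are illustrative assumptions, not part of the original answer:

import gevent
from gevent import monkey; monkey.patch_all()
import urllib2

def fetch(url):
    # Runs inside a greenlet; urlopen yields to other greenlets while waiting on I/O.
    return url, urllib2.urlopen(url).read()

urls = ["http://www.python.org/", "http://www.python.org/about/"]
jobs = [gevent.spawn(fetch, u) for u in urls]
gevent.joinall(jobs)
# Each greenlet's return value is available on .value after joinall.
print [job.value[0] for job in jobs]

The greenlets run concurrently because, once patched, urllib2 no longer blocks the whole process while waiting for a response.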