[python] 一个简单的spider
Thursday, May 22nd, 2008 — 今天刚装了一个插件 Google Syntax Highlighter for WordPress，用来高亮显示各种代码。于是用 Python 写了一个非常简单的 spider，试验一下。:)
import random
import time

try:
    import urllib2  # Python 2
except ImportError:
    # Python 3 exposes the same Request/urlopen API under urllib.request.
    import urllib.request as urllib2


# #############################
# Spider
# get the page content of the URI
#
class Spider(object):
    """Fetch the content of a URL and keep running statistics.

    Every call to :meth:`get` sends one request with a User-Agent header
    picked at random from ``self._agents`` and updates four counters:
    number of requests, number of failed requests, accumulated wall-clock
    time and accumulated number of bytes read.
    """

    # User-Agent strings a request may be sent with.
    _DEFAULT_AGENTS = (
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.14) Gecko/20080404 Firefox/2.0.0.14',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.8.1.13) Gecko/20080311 Firefox/2.0.0.13',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
        'Mozilla/5.0 (compatible; YodaoBot/1.0; http://www.yodao.com/help/webmaster/spider/; )',
        'Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)',
    )

    def __init__(self):
        """Create a spider with zeroed counters and the default name."""
        self._count = 0    # requests attempted (successful or not)
        self._data = 0     # total length of all bodies read
        self._cost = 0     # total seconds spent inside get()
        # Per-instance copy so the list can be edited without touching
        # other Spider objects.
        self._agents = list(self._DEFAULT_AGENTS)
        self._failed = 0   # requests that raised an exception
        self._name = "G1029"

    def get(self, u):
        """Fetch URL ``u`` and return the response body.

        The request carries a User-Agent chosen at random from
        ``self._agents``. Any ordinary exception raised while building the
        request, opening it or reading it is reported on stdout as
        ``ERROR: <u>`` and counted as a failure; it is not propagated.
        ``KeyboardInterrupt`` and ``SystemExit`` are not caught, so a
        running crawl can still be stopped.

        Returns the body as returned by ``read()``, or an empty byte
        string when the request failed.
        """
        body = b""
        started = time.time()
        try:
            request = urllib2.Request(url=u)
            # choice() picks one agent without reordering the list.
            request.add_header('User-Agent', random.choice(self._agents))
            response = urllib2.urlopen(request)
            try:
                body = response.read()
            finally:
                # Release the connection even when read() fails.
                response.close()
        except Exception:
            self._failed += 1
            # %s formatting keeps the report working for non-string input.
            print("ERROR: %s" % (u,))
        self._count += 1
        self._cost += time.time() - started
        self._data += len(body)
        return body

    def set(self, item, value):
        """Set a configurable property of the spider.

        Only ``'name'`` is recognised. Returns ``1`` when the property was
        set and ``None`` for any other ``item``.
        """
        if item != 'name':
            return None
        self._name = value
        return 1

    def count(self):
        """Return the number of requests attempted so far."""
        return self._count

    def failed(self):
        """Return the number of requests that failed."""
        return self._failed

    def cost(self):
        """Return the total time, in seconds, spent in :meth:`get`."""
        return self._cost

    def data(self):
        """Return the total length of all response bodies read."""
        return self._data

    def dump(self):
        """Print a one-line summary of the spider's name and counters."""
        summary = 'Spider Name: %s Count: %d Failed: %d Cost: %d Data: %d' % (
            self._name, self._count, self._failed, self._cost, self._data)
        print(summary)
        return
似乎不错哦，呵呵。先用这个吧。
