#encoding: gb2312
import urllib2
import threading
import logging
import re
import sys
import os
from bs4 import BeautifulSoup
# NOTE(review): reload + setdefaultencoding is a Python 2 hack forcing
# implicit str<->unicode conversions to UTF-8; avoid in new code.
reload(sys)
sys.setdefaultencoding("utf-8")
# Logging setup: debug log written next to the current working directory.
FILE = os.getcwd()
logging.basicConfig(filename=os.path.join(FILE, 'log.txt'),level=logging.DEBUG)
# Pending crawl queue: list of (referrer, url) tuples.
url_new = [('none','http://www.111com.net/')]
# URLs already fetched successfully.
url_old = []
# Fetch results keyed by HTTP status code (200 pre-seeded).
url_err = {200:[]}
# Locks guarding the shared state above (lock2 appears unused here --
# possibly used elsewhere; verify before removing).
lock = threading.Lock()
lock2= threading.Lock()
# Worker main loop:
# pop one url from the task queue and fetch it,
# parse the page, de-duplicate, push discovered urls back onto the queue,
# and record each url's fetch status.
def geturl():
global url_new
try:
while True:
lock.acquire()
if len(url_new)<=0:
lock.release()
continue
url_t = url_new.pop(0)
url = url_t[1]
try:
req = urllib2.urlopen(url)
except urllib2.HTTPError, e:
#记录到对应的列表中
if url_err.has_key(e.code):
url_err[e.code].append((url,url_t[0]))
else:
url_err[e.code] = [(url,url_t[0])]
with open('log.html', 'a+') as f:
f.write(str(e.code)+':'+url+', 来路:'+url_t[0]+' ')
continue
else:
url_err[200].append(url)
with open('log.html', 'a+') as f:
f.write('200:'+url+', 来路:'+url_t[0]+' ')
#记录到已访问的列表中
url_old.append(url)
#开始提取页面url
soup = BeautifulSoup(req.read().decode('UTF-8', 'ignore'))
alink= soup.find_all('a', attrs={'href':re.compile(".*?xxxxxx.*?")})
tmp_url = []
for a in alink:
href = a.get('href')
tmp_url.append(a.get('href') if a.get('href').find('http:')>=0 else 'http://www.xxxxxx.com'+a.get('href'))
tmp_url= {}.fromkeys(tmp_url).keys()
for link in tmp_url:
if link not in url_old:
url_new.append((url, link))
tmp = []
for i in xrange(len(url_new)):
if url_new[i][1] not in tmp:
tmp.append(url_new[i][1])
else:
del url_new[i]
#url_new = {}.fromkeys(url_new).keys()
#输出一下状态信息
os.system('cls')
print threading.Thread().getName()+":当前线程数:"+str(threading.activeCount())+",当前剩余任务量:"+str(len(url_new))+", 已访问:"+str(len(url_old))
for k in url_err.keys():
print str(k)+':'+str(len(url_err[k]))
lock.release()
except Exception as e:
logging.debug(str(e))
lock.release()
# Thread-count watchdog: intended to keep checking the number of live
# worker threads and start new ones when the count drops below `num`.
def threadcheck(num):
    """Spawn `num` crawler worker threads and block until they all exit.

    The original ignored `num` entirely and started a single worker; this
    at least honours the argument.  A true watchdog loop that re-spawns
    dead workers (as the header comment envisions) is still a TODO.
    """
    workers = []
    for _ in xrange(num):
        t = threading.Thread(target=geturl)
        workers.append(t)
        t.start()
    for t in workers:
        t.join()
# Main entry point.
def main():
    """Entry point: hand off to threadcheck to run 10 crawler workers.

    (The original carried a block of commented-out thread-pool code inside
    its docstring; removed as dead code.)
    """
    t = threading.Thread(target=threadcheck, args=(10,))
    t.start()
    t.join()
# Script start.
if __name__ == '__main__':
    main()
    # raw_input, not input: Python 2's input() eval()s whatever the user
    # types.  NOTE(review): main() joins workers that loop forever, so this
    # prompt is normally unreachable -- confirm intended shutdown behaviour.
    raw_input('整站抓取已结束!')