Two examples of a multithreaded web crawler in Python

Date: 2015-09-07  Editor: 简简单单  Source: 一聚教程网

Generally speaking, there are two ways to use threads: one is to write the function the thread should execute and pass it into a Thread object, which then runs it; the other is to subclass Thread and put the thread's code inside the new class.
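
Here is a minimal sketch of both patterns (my own illustration, not from the original article), written in the same Python 2 dialect as the examples below; the fetch function, the FetchThread class, and the URL are only placeholders:

#!/usr/bin/env python
#coding=utf-8
import threading

# Pattern 1: write the function the thread should run and hand it to a Thread object
def fetch(url):
    print 'fetching', url

t1=threading.Thread(target=fetch,args=("http://www.111com.net/",))
t1.start()
t1.join()

# Pattern 2: subclass Thread and put the work in run()
class FetchThread(threading.Thread):
    def __init__(self,url):
        threading.Thread.__init__(self)
        self.url=url
    def run(self):
        print 'fetching', self.url

t2=FetchThread("http://www.111com.net/")
t2.start()
t2.join()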

The multithreaded web crawler below uses multiple threads together with a lock, and crawls pages in breadth-first order.

Here is a brief outline of the approach.

For a crawler that downloads pages breadth-first, the process is:

1. Download the first page from the given entry URL.

2. Extract all new page URLs from that first page and put them into the download list.

3. Download every page whose URL is in the download list.

4. From the newly downloaded pages, find the URLs that have not been downloaded yet and update the download list with them.

5. Repeat steps 3 and 4 until the updated download list is empty (a minimal single-threaded sketch of this loop follows below).
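
A minimal single-threaded sketch of this loop (my own illustration, not part of the original article; it omits error handling and uses the same urllib/regex approach as Example 1):

#!/usr/bin/env python
#coding=utf-8
import urllib
import re

def download_page(url):
    # steps 1 and 3: fetch one page (no error handling in this sketch)
    return urllib.urlopen(url).read()

def extract_links(html):
    # step 2: collect every quoted http:// URL in the page source
    return re.findall(r'"(http://.+?)"',html)

def bfs_crawl(entry_url):
    queue=[entry_url]   # the download list
    visited=[]          # URLs that have already been downloaded
    while len(queue)!=0:                       # step 5: stop when the list is empty
        pages=[]
        for url in queue:                      # steps 1 and 3: download everything queued
            pages.append(download_page(url))
            visited.append(url)
        newlist=[]
        for html in pages:                     # step 2: collect links from the new pages
            newlist+=extract_links(html)
        queue=list(set(newlist)-set(visited))  # step 4: keep only unseen URLs
    return visited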

Example 1

The code is as follows:

#!/usr/bin/env python
#coding=utf-8
import threading
import urllib
import re
import time
g_mutex=threading.Condition()   # used purely as a lock protecting the shared lists below
g_pages=[]      # downloaded page contents; new URLs are parsed from these
g_queueURL=[]   # URLs waiting to be crawled
g_existURL=[]   # URLs that have already been crawled
g_failedURL=[]  # URLs that failed to download
g_totalcount=0  # number of pages downloaded so far
class Crawler:
  def __init__(self,crawlername,url,threadnum):
    self.crawlername=crawlername
    self.url=url
    self.threadnum=threadnum
    self.threadpool=[]
    self.logfile=file("log.txt",'w')
  def craw(self):
    global g_queueURL
    g_queueURL.append(self.url)
    depth=0
    print self.crawlername+" started..."
    while(len(g_queueURL)!=0):
      depth+=1
      print 'Searching depth ',depth,'...\n\n'
      self.logfile.write("URL:"+g_queueURL[0]+"........")
      self.downloadAll()
      self.updateQueueURL()
      content='\n>>>Depth '+str(depth)+':\n'
      self.logfile.write(content)
      i=0
      while i<len(g_queueURL):
        content=str(g_totalcount+i)+'->'+g_queueURL[i]+'\n'
        self.logfile.write(content)
        i+=1
  def downloadAll(self):
    global g_queueURL
    global g_totalcount
    i=0
    while i<len(g_queueURL):
      j=0
      while j<self.threadnum and i+j<len(g_queueURL):
        g_totalcount+=1
        threadresult=self.download(g_queueURL[i+j],str(g_totalcount)+'.html',j)
        if threadresult!=None:
          print 'Thread started:',i+j,'--File number =',g_totalcount
        j+=1
      i+=j
      for thread in self.threadpool:
        thread.join(30)
      self.threadpool=[]
    g_queueURL=[]
  def download(self,url,filename,tid):
    crawthread=CrawlerThread(url,filename,tid)
    self.threadpool.append(crawthread)
    crawthread.start()
    return crawthread
  def updateQueueURL(self):
    global g_queueURL
    global g_existURL
    newUrlList=[]
    for content in g_pages:
      newUrlList+=self.getUrl(content)
    g_queueURL=list(set(newUrlList)-set(g_existURL)) 
  def getUrl(self,content):
    reg=r'"(http://.+?)"'
    regob=re.compile(reg,re.DOTALL)
    urllist=regob.findall(content)
    return urllist
class CrawlerThread(threading.Thread):
  def __init__(self,url,filename,tid):
    threading.Thread.__init__(self)
    self.url=url
    self.filename=filename
    self.tid=tid
  def run(self):
    global g_mutex
    global g_failedURL
    global g_queueURL
    try:
      page=urllib.urlopen(self.url)
      html=page.read()
      fout=file(self.filename,'w')
      fout.write(html)
      fout.close()
    except Exception,e:
      g_mutex.acquire()
      g_existURL.append(self.url)
      g_failedURL.append(self.url)
      g_mutex.release()
      print 'Failed downloading and saving',self.url
      print e
      return None
    g_mutex.acquire()
    g_pages.append(html)
    g_existURL.append(self.url)
    g_mutex.release()
if __name__=="__main__":
  url=raw_input("Enter the entry URL:\n")
  threadnum=int(raw_input("Number of threads: "))
  crawlername="little crawler"
  crawler=Crawler(crawlername,url,threadnum)
  crawler.craw()
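
A note on the locking in Example 1: g_mutex is created as threading.Condition(), but only its acquire()/release() methods are used, so it serves purely as a mutual-exclusion lock. A plain threading.Lock() does the same job and can be used as a context manager so that it is released even if an exception is raised. A small sketch of that alternative (not part of the original code; record_page is a made-up helper):

import threading

g_mutex=threading.Lock()   # enough here, since no thread ever wait()s or notify()s
def record_page(html,url,pages,existURL):
    # equivalent of the critical section at the end of CrawlerThread.run()
    with g_mutex:          # acquired on entry, released automatically on exit
        pages.append(html)
        existURL.append(url)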


Example 2

The code is as follows:

#!/usr/bin/env python
#coding=utf-8
import threading
import urllib
import re
import time

cur=0          # number of URLs processed so far
last=0         # URL count at which the current depth is finished
totalcount=0   # total number of URLs discovered so far
depth=0        # current crawl depth
t_mutex=threading.Condition()   # used as a lock around the shared counters and the queue

class Mycrawler:
    def __init__(self,crawlername,seeds,threadnum):
        self.crawlername=crawlername
        self.seeds=seeds
        self.crawqueue=CrawQueue()
        self.initQueue(self.seeds)
        self.threadnum=threadnum
        self.threadpools=[]
        self.logfile=file('log2.txt','w')
    def initQueue(self,seeds):
        if isinstance(seeds,str):
            self.crawqueue.push(seeds)
        elif isinstance(seeds,list):
            for seed in seeds:
                self.crawqueue.push(seed)
        global last
        global totalcount
        totalcount=self.crawqueue.getQueueCount()
        last=totalcount
    def crawling(self):
        global cur
        global depth
        global last
        global totalcount
        self.log(">>>Depth "+str(depth)+":\n")
        while self.crawqueue.getQueueCount()!=0:
            url=self.crawqueue.pop()
            if url==None:
                continue
            self.log(url)
            self.crawqueue.addToVisited(url)
            links=self.getLinks(url)
            if links==None:
                print 'None'
                self.crawqueue.failed.append(url)
                continue
            beforenum = self.crawqueue.getQueueCount()
            self.crawqueue.addLinks(links)
            afternum  = self.crawqueue.getQueueCount()
            totalcount+=afternum-beforenum
            cur+=1
            if cur==last:
                depth+=1
                self.log(">>>Depth "+str(depth)+":\n")
                last=totalcount
    def crawling2(self):
        global last
        global totalcount
        global depth
        self.log(">>>Depth "+str(depth)+":\n")
        totalcount=self.crawqueue.getQueueCount()
        last=totalcount
        while self.crawqueue.getQueueCount()!=0:
            for i in range(self.threadnum):
                url=self.crawqueue.pop()
                if url==None:
                    break
                crawthread=crawlerThread(url,i,self)
                self.threadpools.append(crawthread)
                crawthread.start()
            for crawthread in self.threadpools:
                crawthread.join(30)
            self.threadpools=[]   # this batch is done; the next batch starts with a fresh pool
    def getLinks(self,url):
        # link extraction for the single-threaded crawling() variant, which otherwise
        # has no getLinks of its own; same urllib + regex approach as crawlerThread.getLinks
        try:
            page=urllib.urlopen(url)
            html=page.read()
            reg=r'"(http://.+?)"'
            regob=re.compile(reg,re.DOTALL)
            return regob.findall(html)
        except:
            print 'Failed downloading',url
            return None
    def log(self,content):
        self.logfile.write(content+"\n")
class crawlerThread(threading.Thread):
    def __init__(self,url,tid,mycrawler):
        threading.Thread.__init__(self)
        self.url=url
        self.tid=tid
        self.mycrawler=mycrawler
    def run(self):
        global t_mutex
        global cur
        global last
        global totalcount
        global depth
        t_mutex.acquire()
        self.mycrawler.log(self.url)
        t_mutex.release()
        links=self.getLinks(self.url)
        if links==None:
            t_mutex.acquire()
            self.mycrawler.crawqueue.addToVisited(self.url)
            self.mycrawler.crawqueue.addToFailed(self.url)
            t_mutex.release()
        else:
            t_mutex.acquire()
            self.mycrawler.crawqueue.addToVisited(self.url)
            beforenum=self.mycrawler.crawqueue.getQueueCount()
            self.mycrawler.crawqueue.addLinks(links)
            afternum =self.mycrawler.crawqueue.getQueueCount()
            totalcount+=afternum-beforenum
            t_mutex.release()
        t_mutex.acquire()
        cur+=1
        if cur==last:
            depth+=1
            self.mycrawler.log(">>>Depth "+str(depth)+":\n")
            last=totalcount
        t_mutex.release()
    def getLinks(self,url):
        try:
            page=urllib.urlopen(url)   
            html=page.read()
            reg=r'"(http://.+?)"'
            regob=re.compile(reg,re.DOTALL)
            links=regob.findall(html)
            return links
        except:
            print 'Failed downloading and saving',url
            return None
class CrawQueue:
    def __init__(self):
        self.queue=[]
        self.visited=[]
        self.failed=[]
    def getQueue(self):
        return self.queue
    def getVisited(self):
        return self.visited
    def getFailed(self):
        return self.failed
    def push(self,url):
        if url!="" and url not in self.queue and url not in self.visited:
            self.queue.insert(0,url)
    def pop(self):
        if len(self.queue)==0:
            #print 'failed to pop: queue is empty'
            return None
        else:
            return self.queue.pop()
    def isEmpty(self):
        if len(self.queue)==0:
            return 1
        else:
            return 0
    def addToVisited(self,url):
        self.visited.append(url)
    def addToFailed(self,url):
        self.failed.append(url)
    def remove(self,url):
        self.queue.remove(url)
    def getVisitedCount(self):
        return len(self.visited)
    def getQueueCount(self):
        return len(self.queue)
    def addLinks(self,links):
        for link in links:
            self.push(link)

if __name__=="__main__":
    seeds="http://www.111com.net/"
    threadnum=int(raw_input("Number of threads: "))
    crawlername="little crawler"
    mycrawler=Mycrawler(crawlername,seeds,threadnum)
    mycrawler.crawling2()


That is the code for the two multithreaded web crawlers implemented in Python; I hope it is helpful.
