python多线程爬取斗图啦数据

发布于2019-08-05 19:03 阅读(908) 评论(0) 点赞(2) 收藏(2)

python多线程爬取斗图啦网的表情数据

使用到的技术点

requests 请求库：发送 HTTP 请求，获取页面和图片
re 正则表达式：去掉标题中不适合做文件名的特殊字符
pyquery 解析库：Python 实现的类 jQuery 语法的 HTML 解析库
threading 线程：实现生产者/消费者多线程
queue 队列：在线程之间安全地传递任务

'''
斗图啦多线程方式

'''

import os
import re
import threading  # Thread base class for the producer/consumer workers
import time
from queue import Empty, Queue  # thread-safe queue and its "nothing available" signal
from urllib import request

import requests
from pyquery import PyQuery as jq
from requests.exceptions import RequestException
# Default HTTP headers sent with page requests. The header name must be
# spelled "User-Agent" (with a hyphen); with an underscore it is sent as an
# unrelated custom header and the browser-like agent string is never applied.
head = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
}
# Output directory for downloaded images, created next to this script.
pt = os.path.dirname(os.path.abspath(__file__))
path = os.path.join(pt, "斗图啦")
# exist_ok=True avoids the check-then-create race of exists() + mkdir().
os.makedirs(path, exist_ok=True)

'''
生产者类
继承自多线程类threading.Thread
重写init方法和run方法
'''
class Producer(threading.Thread):
    """Worker thread that turns list-page URLs into image download jobs.

    Each instance repeatedly takes a page URL from ``url_queue``, parses the
    page, and puts one ``(filepath, src)`` tuple per image on ``img_queue``.

    Args:
        img_queue: Queue that receives ``(filepath, src)`` tuples.
        url_queue: Queue of page URLs that still have to be parsed.
        *args: Positional arguments forwarded to ``threading.Thread``.
        **kwargs: Keyword arguments forwarded to ``threading.Thread``.
    """

    # Seconds to wait for a list page before giving up on it.
    REQUEST_TIMEOUT = 10

    def __init__(self, img_queue, url_queue, *args, **kwargs):
        # kwargs must be expanded with ``**``; a single ``*`` would pass the
        # keyword *names* as positional arguments to Thread.__init__.
        super(Producer, self).__init__(*args, **kwargs)
        self.img_queue = img_queue
        self.url_queue = url_queue

    def run(self):
        """Parse pages until ``url_queue`` is drained, then return."""
        while True:
            # A separate empty() check followed by a blocking get() races with
            # the other producers: if another thread takes the last URL in
            # between, get() would block forever. A non-blocking get is atomic.
            try:
                url = self.url_queue.get_nowait()
            except Empty:
                break
            self.parse_page(url)

    def parse_page(self, url):
        """Fetch one list page and enqueue a download job for each image.

        A page that cannot be fetched is reported and skipped so that one bad
        request does not terminate the worker thread.

        Args:
            url: Address of the list page to parse.
        """
        try:
            res = requests.get(url, headers=head, timeout=self.REQUEST_TIMEOUT)
            res.raise_for_status()
        except RequestException as exc:
            print('%s请求失败,已跳过: %s' % (url, exc))
            return

        doc = jq(res.text)
        # Every anchor inside the page content is a candidate image entry.
        for a in doc.find(".page-content a").items():
            src = a.find("img.img-responsive").attr("data-original")
            if not src:
                # Anchor without an image link: nothing to download, and
                # splitext(None) below would raise.
                continue
            title = a.find("p").text()
            # Keep the extension of the remote file for the local file name.
            pathtype = os.path.splitext(src)[1]
            # Strip punctuation that is unsuitable in a file name.
            patitle = re.sub(r'[\.。，\?？\*!！\/~]', "", title)
            filepath = os.path.join(path, patitle + pathtype)
            # Hand the job over to the consumer threads.
            self.img_queue.put((filepath, src))





'''
消费者
和生产者一样的道理
'''
class Customer(threading.Thread):
    """Worker thread that downloads the images queued by the producers.

    Each instance repeatedly takes a ``(filepath, src)`` tuple from
    ``img_queue``, downloads ``src`` and writes the bytes to ``filepath``.

    Args:
        img_queue: Queue of ``(filepath, src)`` download jobs.
        url_queue: Queue of page URLs the producers still have to parse; used
            only to decide when no further jobs can arrive.
        *args: Positional arguments forwarded to ``threading.Thread``.
        **kwargs: Keyword arguments forwarded to ``threading.Thread``.
    """

    # Seconds to wait for a new job before re-checking the exit condition.
    POLL_INTERVAL = 1.0
    # Seconds to wait for an image download before giving up on it.
    REQUEST_TIMEOUT = 10

    def __init__(self, img_queue, url_queue, *args, **kwargs):
        super(Customer, self).__init__(*args, **kwargs)
        self.img_queue = img_queue
        self.url_queue = url_queue

    def run(self):
        """Download queued images until no more jobs can arrive."""
        while True:
            # A timed get replaces empty() + blocking get(): that pair could
            # block forever once another consumer took the last job.
            try:
                filepath, src = self.img_queue.get(timeout=self.POLL_INTERVAL)
            except Empty:
                # Stop only when there are no pages left AND no producer is
                # still parsing one; otherwise new jobs may still show up.
                if self.url_queue.empty() and not self._producers_alive():
                    break
                continue
            self._download(filepath, src)

    def _producers_alive(self):
        """Return True while a live non-consumer thread shares ``url_queue``."""
        # threading.enumerate() lists only threads that are currently alive.
        return any(
            getattr(t, "url_queue", None) is self.url_queue
            and not isinstance(t, Customer)
            for t in threading.enumerate()
        )

    def _download(self, filepath, src):
        """Fetch ``src`` and store its bytes at ``filepath``.

        Failures are reported and swallowed so that a single bad image does
        not terminate the worker thread.
        """
        print('%s开始下载,链接%s' % (filepath, src))
        try:
            img = requests.get(src, timeout=self.REQUEST_TIMEOUT)
            # Do not save an HTTP error page as if it were an image.
            img.raise_for_status()
            # ``content`` is the raw response body; ``text`` would decode it.
            with open(filepath, "wb") as f:
                f.write(img.content)
        except (RequestException, OSError) as exc:
            print('%s下载失败: %s' % (filepath, exc))
            return
        print('%s下载完成' % filepath)




def main(start_page=1, end_page=100, producer_count=5, consumer_count=3):
    """Crawl the list pages and download their images, then return.

    Fills a queue with the list-page URLs, starts the producer and consumer
    threads, and waits for all of them to finish so that the caller's
    completion message is only printed once the work is actually done.

    Args:
        start_page: First list page to crawl (inclusive).
        end_page: Last list page to crawl (inclusive).
        producer_count: Number of page-parsing threads.
        consumer_count: Number of image-downloading threads.
    """
    # All URLs are enqueued before any worker starts, so this queue is left
    # unbounded: a size limit here could only block the put() below.
    url_queue = Queue()
    img_queue = Queue(100000)

    for page in range(start_page, end_page + 1):
        url_queue.put("https://www.doutula.com/photo/list/?page=" + str(page))

    # Producers are started first so that jobs begin to flow before the
    # consumers start polling.
    workers = [Producer(img_queue, url_queue) for _ in range(producer_count)]
    workers += [Customer(img_queue, url_queue) for _ in range(consumer_count)]
    for worker in workers:
        worker.start()
    # Without join() this function would return while the crawl is running.
    for worker in workers:
        worker.join()


# Run the crawler only when this file is executed as a script; the two
# banners are printed immediately before and after the call to main().
if __name__ == '__main__':
    print("爬虫调度启动---------")
    main()
    print("爬虫调度完成---------")