python爬虫开发之“智联招聘”网页爬取-python黑洞网

本站消息

站长简介/公众号

出租广告位,需要合作请联系站长

慧雅

1090

文章

919521

访问

+关注

分类

*args和**kwargs(0)

debug(0)

日期归档

2023-05(1)

2023-06(2)

python爬虫开发之“智联招聘”网页爬取

发布于2019-08-05 19:02 阅读(2146) 评论(1) 点赞(4) 收藏(4)

先贴上需求：

 1. 输入起始页 和结束页 爬取智联招聘上 与python相关职业
 2. 爬取的信息包括 就业岗位名称 薪资 地区  公司名称  需求{包括学历和经验}
 3. 爬取的信息以字典形式保存到mongodb数据库中

附上url https://sou.zhaopin.com/?jl=681&kw=python&kt=3 点击 --->>>智联招聘

------------------------------------------分割线---------------------------------------------------------------

打开智联上述url 链接，发现会跳出以下画面：

能用 requests 当然用 requests 的啦！毕竟比较简洁嘛~ 但很可惜，使用 requests 库是爬取不到信息滴

我还真不信邪，把浏览器里的请求头全部复制下来，能带上的都带上了，结果……显然这是一次失败的尝试。

（当然，有朋友通过抓包，可以找到某个接口，通过对该接口使用 post 或 get 方法同样也可以获取到信息，这也是一种方法。）

这里，我就使用究极方法 selenium + chromedriver 来实现。

https://sou.zhaopin.com/?p=1&jl=489&kw=python&kt=3&sf=0&st=0

观察url 可以发现  p 表示页码  kw表示关键字 jl等其它参数 不知道干啥滴 不过对我们爬取网站并没有影响

接下来就可以大展拳脚咯！！

为了加快爬取速度我采用了多线程的方式进行爬取，使用 4个线程进行下载页面 3个线程进行解析页面并写入到Mongodb中。

最重要的当然是代码如何实现的啦。

代码中有详细的注释有兴趣的可以看看~

def main():
    """Prompt for a page range, then crawl those pages and store the results.

    Reads the first and last page numbers from stdin, builds the two work
    queues, and drives a ``ZhiLianSpider`` through its phases: enqueue the
    page URLs, start the producer threads, start the consumer threads, and
    finally block until both groups of threads have finished.

    :raises ValueError: if either input is not a valid integer literal.
    :return: None
    """
    # SECURITY: the original used eval(input(...)), which executes any
    # expression typed at the prompt. Page numbers are plain integers, so
    # parse them with int() instead.
    startpage = int(input('输入起始页码:'))
    endpage = int(input('输入结束页码:'))
    # Queue of page URLs still to be downloaded.
    url_queue = Queue()
    # Queue of downloaded HTML documents waiting to be parsed.
    data_queue = Queue()
    spider = ZhiLianSpider(startpage, endpage, url_queue, data_queue)
    # run() fills url_queue with one URL per requested page.
    spider.run()
    # Start the producer (download) threads.
    spider.create_producer()
    # Start the consumer (parse + store) threads.
    spider.create_customer()
    # Block until every consumer and every producer thread has exited.
    spider.wait_c()
    spider.wait_p()

class ZhiLianSpider(object):
    """Coordinator for the crawl.

    Fills the URL queue for the requested page range and owns the producer
    and consumer threads: it creates them, starts them, and can block until
    they have finished.
    """

    # Thread names; one producer / consumer thread is started per entry.
    pname = ['生产者1号', '生产者2号', '生产者3号', '生产者4号']
    cname = ['消费者1号', '消费者2号', '消费者3号']

    def __init__(self, start, end, urlqueue, dataqueue):
        """Store the inclusive page range and the two shared queues.

        :param start: first page number to crawl
        :param end: last page number to crawl (inclusive)
        :param urlqueue: queue that receives the page URLs
        :param dataqueue: queue handed to the producer and consumer threads
        """
        # URL template; the ``{}`` placeholder is filled with the page number.
        self.url = r'https://sou.zhaopin.com/?p={}&jl=489&kw=python&kt=3&sf=0&st=0'
        self.start, self.end = start, end
        self.urlqueue, self.dataqueue = urlqueue, dataqueue
        # Started producer threads, kept so wait_p() can join them.
        self.p_threadlst = []
        # Started consumer threads, kept so wait_c() can join them.
        self.c_threadlst = []

    def run(self) -> None:
        """Put one formatted URL per page of ``start..end`` onto the URL queue."""
        page_numbers = range(self.start, self.end + 1)
        for page_url in map(self.url.format, page_numbers):
            self.urlqueue.put(page_url)

    def create_producer(self):
        """Create, record and start one ``Producer`` thread per name in ``pname``.

        Kept here rather than in ``main`` so that ``main`` stays short.

        :return: None
        """
        for worker_name in self.pname:
            worker = Producer(url_queue=self.urlqueue, data_queue=self.dataqueue, name=worker_name)
            self.p_threadlst.append(worker)
            # Start the thread right after registering it.
            worker.start()

    def create_customer(self):
        """Create, record and start one ``Customer`` thread per name in ``cname``."""
        for worker_name in self.cname:
            worker = Customer(self.dataqueue, worker_name)
            self.c_threadlst.append(worker)
            worker.start()

    def wait_p(self):
        """Block until every recorded producer thread has terminated."""
        for worker in self.p_threadlst:
            worker.join()

    def wait_c(self):
        """Block until every recorded consumer thread has terminated."""
        for worker in self.c_threadlst:
            worker.join()

生产者代码：

class Producer(threading.Thread):
    """Producer thread: downloads result pages and queues their HTML.

    Each producer repeatedly takes a URL from ``url_queue``, renders it in a
    headless Chrome instance, and puts the rendered page source onto
    ``data_queue``. The thread ends once the URL queue is drained.
    """

    # Headless-Chrome options, built once and shared by all producers.
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')

    def __init__(self, data_queue, url_queue, name):
        """
        :param data_queue: queue that receives the downloaded page sources
        :param url_queue: queue of page URLs to download
        :param name: thread name, used in the progress messages
        """
        super(Producer, self).__init__()
        self.data_queue = data_queue
        self.url_queue = url_queue
        self.name = name

    def run(self) -> None:
        """Download pages until the URL queue is empty.

        :return: None
        """
        # Imported locally so the block does not depend on the file's imports.
        from queue import Empty

        while True:
            # Checking empty() and then calling a blocking get() is racy with
            # several producers: another thread can take the last URL between
            # the two calls and this one would block forever. A non-blocking
            # get makes "take one or stop" a single atomic step.
            try:
                url = self.url_queue.get_nowait()
            except Empty:
                break
            # First query parameter of the URL, e.g. "p=3" (the page number).
            page_tag = url.split('?')[1].split('&')[0]
            print('我是{}---->>>>正在下载页面{}'.format(self.name, page_tag))
            self.download_html(url)
            print('我是{}---->>>>已完成下载页面{}'.format(self.name, page_tag))

    def download_html(self, url):
        """Render ``url`` in a fresh browser and queue the resulting HTML.

        :param url: page URL to load
        :return: None
        """
        # A new browser instance is created for every page.
        browser = webdriver.Chrome('../chromedriver.exe', options=self.options)
        try:
            browser.get(url)
            # Give the page a moment to show its pop-up.
            time.sleep(1)
            # Locate and dismiss the risk-warning dialog.
            button = browser.find_element_by_css_selector('body > div.a-modal.risk-warning > div > div > button')
            button.click()
            browser.implicitly_wait(3)
            # Wait for the JavaScript-rendered content.
            time.sleep(2)
            # Hand the rendered page source to the consumers.
            self.data_queue.put(browser.page_source)
        finally:
            # This method runs once per page, so the browser must be closed
            # even when a step above raises; otherwise Chrome processes pile up.
            browser.quit()

消费者：

class Customer(threading.Thread):
    """Consumer thread: parses downloaded pages and stores the job postings.

    Each consumer repeatedly takes a page source from ``data_queue``, extracts
    one dictionary per job posting, and inserts the dictionaries into MongoDB.
    The thread ends after waiting 20 seconds without receiving a page.
    """

    # MongoDB handles, created once at class-definition time and shared by
    # every consumer thread.
    # Connect to the server.
    conn = MongoClient(host='localhost', port=27017)
    # Database handle.
    db = conn.zhaopin
    # Collection handle.
    collection = db.zhaopin_collection
    # Lock held by a thread while it writes to the database.
    lock = threading.Lock()

    def __init__(self, data_queue, name):
        """
        :param data_queue: queue of page sources to parse
        :param name: thread name, used in the progress messages
        """
        super(Customer, self).__init__()
        self.data_queue = data_queue
        self.name = name

    def run(self) -> None:
        """Parse queued pages until none arrives for 20 seconds.

        :return: None
        """
        # Imported locally so the block does not depend on the file's imports.
        from queue import Empty

        while True:
            try:
                # Block for at most 20 seconds waiting for the next page.
                content = self.data_queue.get(True, 20)
            except Empty:
                # Only a timeout means "no more work". Previously any parse or
                # database error was also reported as completion and silently
                # ended the thread.
                print('我是{},已经完成解析...'.format(self.name))
                break

            print('我是{},我正在解析...'.format(self.name))
            try:
                self.parse_content(content)
            except Exception as exc:
                # Thread boundary: report the failure and keep consuming, so
                # one bad page does not take the consumer down.
                print('我是{},解析出错: {!r}'.format(self.name, exc))

    def parse_content(self, content):
        """Extract the job postings from one page and insert them into MongoDB.

        All postings sit inside a single container,
        ``<div id="listContent" class="contentpile__content">``; its children
        are walked one by one. Children that lack any expected field are
        skipped. Nothing is written when the container is missing or no
        posting could be extracted.

        :param content: HTML source of one result page
        :return: None
        """
        soup = BeautifulSoup(content, 'lxml')
        container = soup.find('div', id='listContent')
        if container is None:
            # Page without a result list: nothing to parse.
            return

        # One dictionary per successfully parsed posting.
        info_list = []
        for item in container:
            try:
                # Job title.
                jobname = item.find('span', class_='contentpile__content__wrapper__item__info__box__jobname__title')[
                    'title']
                # Salary ("saray" is the class name used by the page itself).
                salary = item.find('p', class_='contentpile__content__wrapper__item__info__box__job__saray').text
                # Demand list: [0] is the area, [1] and [2] the requirements.
                demands = item.find_all('li', class_='contentpile__content__wrapper__item__info__box__job__demand__item')
                area = demands[0].text
                ex = demands[1].text.strip(), demands[2].text.strip()
                # Company name.
                company_name = item.find('a',
                                         class_='contentpile__content__wrapper__item__info__box__cname__title company_title').text
            except (AttributeError, TypeError, KeyError, IndexError):
                # Not a complete posting (text node, missing tag, missing
                # attribute, too few demand items): skip it.
                continue

            info_list.append({
                '岗位名称': jobname,
                '工资': salary,
                '地区': area,
                '经验': ex,
                '公司名': company_name
            })

        if not info_list:
            # insert_many() rejects an empty document list.
            return

        # "with" releases the lock even if the insert raises; otherwise the
        # other consumers would block on it forever.
        with self.lock:
            self.collection.insert_many(info_list)

最后的最后贴上我爬取的部分信息吧~