
Scraping Lagou company detail pages with selenium + lxml

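Before the full script, here is a minimal sketch of the pattern it relies on throughout: drive the browser with Selenium, block with WebDriverWait/expected_conditions until a key element is present, then hand driver.page_source to lxml for XPath extraction. The URL, XPath and chromedriver path below are placeholders, not Lagou's actual markup.

# Minimal sketch of the wait-then-parse pattern (placeholder URL/XPath/path).
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from lxml import etree

driver = webdriver.Chrome(executable_path="/path/to/chromedriver")  # placeholder path
driver.get("https://example.com/listing")  # placeholder URL
WebDriverWait(driver, timeout=10).until(
    EC.presence_of_element_located((By.XPATH, "//a"))  # placeholder locator
)
html = etree.HTML(driver.page_source)  # parse the rendered page with lxml
links = html.xpath("//a/@href")        # placeholder XPath
print(links)
driver.quit()

The complete spider follows.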
# encoding: utf-8
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from lxml import etree
import time
import csv

class LagouSpider(object):
    # absolute path to the chromedriver binary
    driver_path = "/Users/mac126/chromedriver"

    def __init__(self):
        # create a driver pointing at the chromedriver binary
        self.driver = webdriver.Chrome(executable_path=self.driver_path)
        self.company_lists = []
        # open the output CSV and write the header row
        self.fp = open("lago.csv", "a", encoding="utf-8", newline="")
        self.writer = csv.DictWriter(self.fp, ["company_name", "img", "scale", "address", "description"])
        self.writer.writeheader()

    def run(self):
        # entry point: open the listing page and walk through every result page
        # url = "https://www.lagou.com/jobs/list_java?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput="
        url = "https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput="
        self.driver.get(url)
        while True:
            # wait until the "next page" button is present before parsing
            WebDriverWait(driver=self.driver, timeout=10).until(
                EC.presence_of_element_located((By.XPATH, "//span[contains(@class,'pager_next')]"))
            )
            resource = self.driver.page_source
            self.parse_list_page(resource)
            next_btn = self.driver.find_element_by_xpath("//span[contains(@class,'pager_next')]")
            # stop once the "next page" button is disabled (last page reached)
            if "pager_next_disabled" in next_btn.get_attribute("class"):
                break
            next_btn.click()
            time.sleep(5)
        # flush any rows still buffered in memory and close the file
        if self.company_lists:
            self.writer.writerows(self.company_lists)
        self.fp.close()


    def parse_list_page(self, resource):
        """
        Parse a listing page and visit every job detail link on it.
        :param resource: page source of the listing page
        :return:
        """
        html = etree.HTML(resource)
        links = html.xpath("//a[@class='position_link']/@href")
        for link in links:
            self.parse_detail_page(link)
            time.sleep(1)

    def parse_detail_page(self, url):
        """
        Parse a job detail page in a new tab and follow the company link.
        :param url: url of the job detail page
        :return:
        """
        # open the detail page in a new tab and switch to it
        self.driver.execute_script("window.open('" + url + "')")
        self.driver.switch_to.window(self.driver.window_handles[1])
        WebDriverWait(self.driver, timeout=10).until(
            EC.presence_of_element_located((By.XPATH, "//dd[@class='job_bt']"))
        )
        resource = self.driver.page_source
        html = etree.HTML(resource)

        # extract the link to the company detail page
        self.third_url = html.xpath('//*[@id="job_company"]/dt/a/@href')[0]
        self.parse_three_page(self.third_url)
        self.driver.close()
        self.driver.switch_to.window(self.driver.window_handles[0])
    def parse_three_page(self, url):
        """
        Parse the company detail page and collect the company fields.
        :param url: url of the company detail page
        :return:
        """
        self.driver.get(url)
        resource = self.driver.page_source
        html = etree.HTML(resource)
        company_name = html.xpath('//div[@class="company_info"]/div[@class="company_main"]/h1/a/text()')[0]
        img = html.xpath('//div[@class="top_info"]/div[1]/img/@src')[0]
        scale = html.xpath('//*[@id="basic_container"]/div[2]/ul/li[3]/span/text()')[0]
        address = html.xpath('//*[@id="basic_container"]/div[2]/ul/li[4]/span/text()')[0]
        description = html.xpath('//div[@class="company_intro_text"]/span[@class="company_content"]/p/text()')
        if not description:
            # some pages keep the introduction directly in the span, not in <p> tags
            description = html.xpath('//div[@class="company_intro_text"]/span[@class="company_content"]/text()')
        self.company_list = {
            "company_name": company_name,
            "img": img,
            "scale": scale,
            "address": address,
            "description": "".join(description),
        }
        # print(company_name, img, scale, address, description)
        self.write_position(self.company_list)

    def write_position(self, company_list):
        """
        Buffer a company record and flush the buffer to the CSV every 100 rows.
        :param company_list: dict describing one company
        :return:
        """
        if len(self.company_lists) >= 100:
            self.writer.writerows(self.company_lists)
            self.company_lists.clear()
        self.company_lists.append(company_list)
        print(company_list)




def main():
    spider = LagouSpider()
    spider.run()

if __name__ == "__main__":
    main()

Sample crawl results (screenshot from the original post):
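The screenshot itself is not reproduced here. As a quick way to inspect what the spider wrote, the CSV opened in __init__ can be read back with csv.DictReader; this snippet is only an illustration, not part of the original post.

import csv

# Read back the CSV produced by the spider; the filename and field names
# match the ones used in LagouSpider.__init__ above.
with open("lago.csv", encoding="utf-8", newline="") as fp:
    for row in csv.DictReader(fp):
        print(row["company_name"], row["scale"], row["address"])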
