爬取百度图片
发布时间:2020-12-16 03:03:54 所属栏目:百科 来源:网络整理
导读:因为百度图片是瀑布流页面,数据通过 ajax 异步加载,所以这里用到抓包工具(fiddler)来抓取链接。好了,直接上代码:from selenium import webdriver; from selenium.webdriver.common.by import By; import requests, time; from queue import Queue; from url……
# Crawl Baidu Images.
#
# Baidu Images is a waterfall-layout page whose data is loaded asynchronously
# via ajax, so a packet-capture tool (Fiddler) is used here to capture the
# links. Straight to the code:

import os
import time
from queue import Empty, Queue
from urllib import request
from urllib.parse import urlsplit

import gevent
import requests  # noqa: F401  (imported by the original listing; kept)
from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By

# Directory the downloaded pictures are written to.
SAVE_DIR = './baidupic/'
# Baidu image search landing page.
BASE_URL = 'https://image.baidu.com/'
# Location of the chromedriver binary. The path separators were lost in the
# published listing and are restored here; adjust this to your own machine.
CHROMEDRIVER_PATH = r'C:\Users\zhaozhi\Desktop\chromedriver.exe'


def get_img(html):
    """Download the images referenced by one queued result page.

    Args:
        html: A ``queue.Queue`` of page-source strings. One page is taken
            per call; if the queue is already empty the call returns
            without doing anything.
    """
    try:
        # Non-blocking on purpose: nothing refills the queue, so a blocking
        # get() on an empty queue would hang the whole program.
        page_source = html.get_nowait()
    except Empty:
        return

    tree = etree.HTML(page_source)
    img_urls = tree.xpath('//div[@id="imgid"]/div[last()]//li/@data-objurl')

    os.makedirs(SAVE_DIR, exist_ok=True)

    for url in img_urls:
        print(url)
        # Use only the path component so a query string ("?w=100") does not
        # end up in the file name.
        fname = os.path.basename(urlsplit(url).path)
        if not fname:
            print('图片不存在')
            continue
        try:
            request.urlretrieve(url, os.path.join(SAVE_DIR, fname))
        except Exception as exc:
            # Best effort: one broken link must not stop the remaining
            # downloads, but Ctrl-C is no longer swallowed.
            print('图片不存在', exc)
        else:
            print('下载成功')


def get_page(keyword='美女', scroll_times=10, scroll_pause=2,
             driver_path=CHROMEDRIVER_PATH):
    """Search Baidu Images in Chrome and download the images that are found.

    Args:
        keyword: Text typed into the search box.
        scroll_times: How many times the page is scrolled to the bottom to
            trigger loading of further results.
        scroll_pause: Seconds to wait after each scroll so the ajax request
            can finish before the next scroll / before the page is read.
        driver_path: Path of the chromedriver executable.
    """
    # Queue that carries page sources from the browser to the downloaders.
    q = Queue()

    # NOTE(review): ``executable_path`` is the keyword used by the original
    # listing; recent Selenium releases expect a ``Service`` object instead.
    browser = webdriver.Chrome(executable_path=driver_path)
    try:
        browser.get(BASE_URL)
        # Type the keyword and press the search button.
        browser.find_element(By.ID, 'kw').send_keys(keyword)
        browser.find_element(By.CLASS_NAME, 's_search').click()

        for _ in range(scroll_times):
            browser.execute_script(
                'window.scrollTo(0,document.body.scrollHeight)')
            time.sleep(scroll_pause)

        q.put(browser.page_source)
    finally:
        # Always release the browser and its driver process.
        browser.quit()

    # One worker per queued page. No gevent monkey-patching is applied in
    # this script, so the downloads inside a worker run one after another.
    workers = [gevent.spawn(get_img, q) for _ in range(q.qsize())]
    gevent.joinall(workers)


if __name__ == '__main__':
    get_page()

# (Editor: Li Datong) [Disclaimer] The content of this site comes from the
# web; the opinions expressed are those of the author only and do not
# represent the position of this site. If your rights have been
# unintentionally infringed, please contact the webmaster to have the
# content removed.