128进程,图片爬虫,增量更新,可作为标准组件
发布时间:2020-12-17 17:18:58 所属栏目:Python 来源:网络整理
导读:今天PHP站长网 52php.cn把收集自互联网的代码分享给大家,仅供参考。 #!/usr/bin/env python import os import os.path import urllib import socket import imghdr from multiprocessing import Pool from urlparse import urlp…
# The following code was collected from the internet and shared by the PHP
# Webmaster Network (52php.cn); it is provided for reference only.
#!/usr/bin/env python
"""Multi-process, incremental image downloader.

Reads ``samples.log`` (one ``<count> <url>`` pair per line), downloads each
URL into ``./imgs`` as ``<line index>.<count>.<file name>`` and skips targets
that an earlier run already fetched.  Downloads run in a ``multiprocessing``
pool; outcomes are reported back to the parent process through ``callback``.
"""
import os
import os.path
import socket
from multiprocessing import Pool

try:  # Python 2 module layout
    from urllib import urlretrieve
    from urlparse import urlparse
except ImportError:  # Python 3 module layout
    from urllib.parse import urlparse
    from urllib.request import urlretrieve

try:
    import imghdr
except ImportError:  # imghdr was removed from the standard library in 3.13
    imghdr = None

IMG_DIR = './imgs'
SAMPLES_FILE = 'samples.log'
PROCESSES = 128
TIMEOUT = 3

# Prefix of the string that retrieve() returns when a download fails.
_EXCEPTION_PREFIX = 'exception: '

# Magic numbers used only when imghdr is unavailable; the names follow the
# ones imghdr.what() reports.
_MAGIC = (
    (b'\xff\xd8\xff', 'jpeg'),
    (b'\x89PNG\r\n\x1a\n', 'png'),
    (b'GIF87a', 'gif'),
    (b'GIF89a', 'gif'),
    (b'BM', 'bmp'),
    (b'MM', 'tiff'),
    (b'II', 'tiff'),
)

results = []
exceptions = []


def callback(result):
    """Record the outcome of one ``retrieve`` task.

    The pool invokes this in the parent process, so it is the only place
    where worker failures can be collected: a worker appending to
    ``exceptions`` only changes its own copy of the list.

    Args:
        result: Value returned by ``retrieve`` -- a status tuple or an
            ``'exception: ...'`` string.  Falsy values are ignored.
    """
    print('result: %s' % (result,))
    if not result:
        return
    results.append(result)
    is_failure = isinstance(result, str) and result.startswith(_EXCEPTION_PREFIX)
    # The membership test avoids a duplicate when retrieve() ran in this
    # same process and has already recorded the failure itself.
    if is_failure and result not in exceptions:
        exceptions.append(result)


def path_exists(path):
    """Return the already-downloaded file that corresponds to ``path``.

    ``retrieve`` renames downloads to ``<name>.<detected type>`` or
    ``<name>.none``, so a previous download of ``path`` is either the exact
    file name or that name followed by a ``.suffix``.  A bare prefix match
    would also accept unrelated names (``1.2.a`` vs ``1.2.ab.png``).

    Args:
        path: Target path using ``/`` separators.

    Returns:
        The matching entry of the module-level ``files`` list, or ``False``.
    """
    fname = path.split('/')[-1]
    renamed_prefix = fname + '.'
    for exist_fname in files:
        if exist_fname == fname or exist_fname.startswith(renamed_prefix):
            return exist_fname
    return False


def _image_type(path):
    """Return the image type name of the file at ``path``, or ``None``."""
    if imghdr is not None:
        return imghdr.what(path)
    with open(path, 'rb') as f:
        head = f.read(32)
    for magic, name in _MAGIC:
        if head.startswith(magic):
            return name
    if head[:4] == b'RIFF' and head[8:12] == b'WEBP':
        return 'webp'
    return None


def retrieve(url, path):
    """Download ``url`` to ``path`` unless it was fetched before.

    After a download the file content is sniffed: when the detected type
    differs from the extension (and the extension is not ``jpg``) the file is
    renamed to ``path.<type>``; when no image type is detected it is renamed
    to ``path.none``.

    Args:
        url: Address to download.
        path: Destination file path.

    Returns:
        ``('file exists:', path)``, ``('similar file:', path)`` or
        ``('success:', path, ftype)`` on the normal paths; on any error the
        string ``'exception: <url> <path> | <error>'``.
    """
    try:
        print('retrieve: %s  to %s' % (url, path))
        if os.path.exists(path):
            return 'file exists:', path
        elif path_exists(path):
            return 'similar file:', path
        urlretrieve(url, path)
        ftype = _image_type(path)
        extension = path.split('.')[-1]
        if ftype and ftype != extension and extension != 'jpg':
            os.rename(path, path + '.' + ftype)
        elif ftype is None:
            os.rename(path, path + '.none')
        return 'success:', path, ftype
    except Exception as e:
        # Reaching this point means `path` did not exist when the call
        # started, so anything there now is a partial download.  Remove it,
        # otherwise the next run would report it as 'file exists'.
        try:
            os.remove(path)
        except OSError:
            pass  # nothing was written
        exception = '%s%s %s | %s' % (_EXCEPTION_PREFIX, url, path, e)
        # Only visible to in-process callers; pool workers report the
        # failure through the return value, which callback() records.
        exceptions.append(exception)
        return exception


# Snapshot of earlier downloads, consulted by path_exists().
try:
    files = os.listdir(IMG_DIR)
except OSError:  # first run: the image directory does not exist yet
    files = []


def _init_worker(timeout):
    """Apply the socket timeout inside each worker process."""
    socket.setdefaulttimeout(timeout)


def main(samples_file=SAMPLES_FILE, processes=PROCESSES, timeout=TIMEOUT):
    """Schedule a download for every ``<count> <url>`` line of the sample file.

    Args:
        samples_file: Text file with one ``<count> <url>`` pair per line.
        processes: Number of worker processes in the pool.
        timeout: Default socket timeout, in seconds, set in every worker.
    """
    if not os.path.isdir(IMG_DIR):
        os.makedirs(IMG_DIR)
    # The timeout is installed by the initializer because the workers are
    # started by Pool() itself; setting it in the parent afterwards would
    # not reach them.
    pool = Pool(processes=processes, initializer=_init_worker,
                initargs=(timeout,))
    try:
        with open(samples_file) as f:
            for index, line in enumerate(f):
                try:
                    count, url = line.split()
                except ValueError:
                    print('exception: malformed line %d: %r' % (index, line))
                    continue
                fname = urlparse(url).path.split('/')[-1]
                path = IMG_DIR + '/' + str(index) + '.' + count + '.' + fname
                pool.apply_async(retrieve, args=(url, path), callback=callback)
        print('apply async done')
    finally:
        pool.close()
        pool.join()
    for e in exceptions:
        print(e)


if __name__ == '__main__':
    main()

# The content above was collected and compiled by the PHP Webmaster Network
# (52php.cn) for reference and study.  If it helped you, feel free to
# bookmark, like, recommend and share it.  (Editor: Li Datong)
# Disclaimer: all content on the site comes from the internet; the views
# expressed are the authors' own and do not represent the site.  If your
# rights have been infringed unintentionally, please contact the webmaster
# promptly to have the content removed.