网站搬运工
发布时间:2020-12-17 17:19:28 所属栏目:Python 来源:网络整理
导读:今天PHP站长网 52php.cn把收集自互联网的代码分享给大家,仅供参考。 # -*- encoding=utf8 -*-import urllib2import lxml.html,reimport os.path,stat,io,sys,glob,timeimport threading,Queuefrom bottle import route,r
# -*- encoding=utf8 -*-
# V2EX forum mirror: three daemon-style threads scrape the index page and
# topic/reply pages into a local SQLite db (peewee), while a bottle web app
# serves the mirrored content on localhost:8080.
#
# NOTE: this is Python 2 code (urllib2, Queue, peewee, lxml, bottle).
# Reconstructed from a scraped web page where all newlines/indentation and
# backslash escapes had been stripped; "\n", "\t" and the r"/t/(\d+)#"
# regex below are restored, and the unused urllib2.Request bug is fixed.
import urllib2
import lxml.html, re
import os.path, stat, io, sys, glob, time
import threading, Queue
from bottle import route, run, template, static_file
from peewee import *

db = SqliteDatabase('post.db')
# Work queue of topic ids awaiting a detail-page fetch; bounded so the
# index scraper blocks rather than growing without limit.
postlist = Queue.Queue(maxsize=200)


class User(Model):
    """A forum user (topic author or reply poster)."""
    uid = IntegerField(primary_key=True)
    name = CharField()
    password = FixedCharField()

    class Meta:
        database = db

# Let SQLite auto-assign uid on User.create().
User._meta.auto_increment = True


class Post(Model):
    """A mirrored topic; post_id is the upstream V2EX topic id."""
    post_id = IntegerField(primary_key=True)
    node = CharField()
    title = CharField()
    content = CharField()
    author = ForeignKeyField(User, related_name='author')

    class Meta:
        database = db


class Remark(Model):
    """A reply on a topic; remark_id is the upstream reply id."""
    remark_id = IntegerField(primary_key=True)
    post_id = IntegerField()
    content = CharField()
    user_id = ForeignKeyField(User, related_name='poster')

    class Meta:
        database = db


db.connect()


def fetchHtml(url, options):
    """Fetch *url*, retrying up to 3 times with a 10s pause.

    options supplies 'user_agent', 'domain' and 'url' (used as Referer).
    Returns the page body as a string, or '' if every attempt failed.
    """
    headers = {
        'User-Agent': options['user_agent'],
        'Host': 'www.' + options['domain'],
        'Connection': "keep-alive",
        # Fixed typo: the original sent a bogus 'Refer' header that
        # servers ignore.
        'Referer': options['url'],
    }
    page = ''
    retry = 0
    req = urllib2.Request(url)
    for header in headers:
        req.add_header(header, headers[header])
    while not page and retry < 3:
        try:
            # Bug fix: the original called urlopen(url), discarding the
            # Request object — the headers above were never sent.
            page = urllib2.urlopen(req).read()
        except Exception:
            retry = retry + 1
            print(retry)
            time.sleep(10)
    return page


class ScrapIndex(threading.Thread):
    """Polls the index page and enqueues newly-seen topic ids.

    A zero-byte marker file per topic id in config['save_dir'] records
    which ids have already been queued.
    """

    def __init__(self, config):
        threading.Thread.__init__(self)
        self.config = config

    def run(self):
        print("\n run....")
        config = self.config
        url = config['url']
        while True:
            page = ''
            try:
                page = fetchHtml(url, config)
            except Exception:
                print("error", url)
            if not page:
                # Bug fix: original `continue`d before the sleep,
                # busy-looping on persistent fetch failure.
                time.sleep(config['refresh_fruiqence'])
                continue
            doc = lxml.html.document_fromstring(page)
            for elem in doc.cssselect(config['links_css']):
                id = re.search(config['href_patten'], elem.get("href")).group(1)
                filename = config['save_dir'] + '//' + id
                if not os.path.exists(filename):
                    print(filename)
                    self.touch(filename)
                    postlist.put(id)
            time.sleep(config['refresh_fruiqence'])

    def touch(self, fname, times=None):
        """Unix touch: create *fname* if absent and bump its times."""
        with open(fname, 'a'):
            os.utime(fname, times)


class Refresh(threading.Thread):
    """Ages the marker files: re-queues topics older than 1h for a
    refresh, deletes markers older than 24h entirely."""

    def __init__(self, config):
        threading.Thread.__init__(self)
        self.config = config

    def run(self):
        dir = self.config['save_dir']
        while True:
            now = time.time()
            for path_and_filename in glob.iglob(dir + "/*"):
                ctime = os.stat(path_and_filename)[stat.ST_CTIME]
                elapse = now - ctime
                if elapse > 86400:
                    print("\t" * 3, ctime, path_and_filename, elapse)
                    os.remove(path_and_filename)
                elif elapse > 3600:
                    print(postlist.qsize())
                    postlist.put(os.path.basename(path_and_filename))
            time.sleep(20)


class ScrapPage(threading.Thread):
    """Consumes topic ids from the queue, scrapes the detail page, and
    stores the topic (first visit) plus any replies not seen before."""

    def __init__(self, config):
        threading.Thread.__init__(self)
        self.config = config

    def run(self):
        config = self.config
        while True:
            print(postlist.qsize())
            id = postlist.get()
            url = config['detail_url'] % id
            print(url)
            filename = config['save_dir'] + "//" + id
            page = ''
            try:
                page = fetchHtml(url, config)
            except Exception:
                print("ERROR:", url)
            if not page:
                continue
            doc = lxml.html.document_fromstring(page)
            size = 0
            if os.path.exists(filename):
                size = os.path.getsize(filename)
            print('size=', size)
            if 0 == size:
                # Marker file is empty -> first visit: store the topic itself.
                header = doc.cssselect("#Main .box .header")
                node = header[0].findall("./a")[1].get("href").replace('/go/', '')
                title = header[0].find("./h1").text_content()
                user = header[0].find("./small/a").text_content()
                content = doc.find_class("topic_content")
                if content:
                    content = content[0].text_content()
                # EAFP user lookup: create the author on first sight.
                try:
                    user = User.get(User.name == user)
                    user_id = user.uid
                except Exception:
                    created = User.create(name=user, password='xx')
                    user_id = created.uid
                post, created = Post.create_or_get(
                    post_id=int(id), node=node, title=title,
                    author=user_id, content=content)
                # Write a single space so size > 0 marks "topic stored".
                with open(filename, 'r+') as f:
                    f.write(chr(32))
            idlist = ''
            if 0 < size:
                # Marker body holds comma-separated reply ids already stored.
                # NOTE(review): new reply ids are never written back here, so
                # the dedup set only reflects what a previous run persisted —
                # looks like an upstream omission; confirm before relying on it.
                with open(filename, 'r+') as f:
                    idlist = f.read().strip()
            uniq = set()
            if 0 != len(idlist):
                uniq = set(idlist.split(','))
            print("\n-----------------------------")
            for elem in doc.cssselect('#Main div.box:nth-child(4) div[id^="r_"]'):
                user = elem.find(".//strong/a").text_content()
                try:
                    user = User.get(User.name == user)
                    user_id = user.uid
                except Exception:
                    created = User.create(name=user, password='xx')
                    user_id = created.uid
                rid = elem.get('id').replace('r_', '')
                td = elem.find_class("reply_content")
                content = td[0].text_content()
                if rid not in uniq:
                    uniq.add(rid)
                    Remark.create(content=content, user_id=user_id, post_id=id)
            time.sleep(10)


# --- script entry point -----------------------------------------------------
config = {
    'url': 'http://v2ex.com/?tab=all',
    'domain': 'v2ex.com',
    'user_agent': 'Mozilla/5.0 (Windows NT 6.3; rv:38.0) Gecko/20100101 Firefox/38.0',
    'links_css': "div.box:nth-child(2) table td:nth-child(3) .item_title a",
    # Restored escape: the scraped copy had r"/t/(d+)#", which never matches.
    'href_patten': r"/t/(\d+)#",
    'save_dir': 'tmp',
    "detail_url": "http://v2ex.com/t/%s",
    'refresh_fruiqence': 20,  # index poll interval, seconds (sic)
}

savedir = config['save_dir']
if not os.path.exists(savedir):
    os.mkdir(savedir)
db.create_tables([User, Post, Remark])

threads = [ScrapIndex(config), Refresh(config), ScrapPage(config)]
for t in threads:
    t.start()


# --- bottle web front-end ---------------------------------------------------
@route('/static/<filepath:path>')
def server_static(filepath):
    """Serve static assets from the current directory."""
    return static_file(filepath, root='.')


@route('/')
def index():
    """Front page: first 30 mirrored topics."""
    posts = Post.select().paginate(0, 30)
    return template('index', page=1, posts=posts)


@route('/recent/:page')
def recent(page):
    """Paginated topic listing; links to page+1."""
    page = int(page)
    posts = Post.select().paginate((page - 1) * 30, 30)
    page = page + 1
    return template('index', page=page, posts=posts)


@route('/t/:id')
def remark(id):
    """Topic detail: the post plus all of its stored replies."""
    id = int(id)
    post = Post.get(Post.post_id == id)
    remarks = Remark.select().where(Remark.post_id == id)
    return template('post', post=post, remarks=remarks)


run(host='localhost', port=8080, debug=True)
sys.exit()

# The bottle template files ('index', 'post') were distributed separately at
# http://git.oschina.net/yaky/movesite (download link in the original page).
# Run with `python app.py`; install any missing package via
# `python -m pip install <name>`.