练练正则
发布时间:2020-12-14 01:54:13 所属栏目:百科 来源:网络整理
导读:#!user/bin/python# coding: utf-8from bs4 import BeautifulSoupimport urllibimport redef get_html(url): req = urllib.urlopen(url).read() return reqdef handle_html(html): reg1 = re.compile(r'span class="atc_title".*?/span',re.S) reg2 = re.com
#!/usr/bin/python
# coding: utf-8
"""Regex practice: list the articles on a Sina blog index page and save them.

The script downloads one article-list page, extracts every article title and
link with regular expressions, prints them, and then downloads each linked
article to a local directory as ``1.html``, ``2.html``, ...

The code runs on both Python 2 and Python 3.
"""
import os
import re
from contextlib import closing

try:  # Python 2
    from urllib import urlopen, urlretrieve
except ImportError:  # Python 3
    from urllib.request import urlopen, urlretrieve

try:
    # Imported by the original script but not used by any code below;
    # guarded so the script still runs when bs4 is not installed.
    from bs4 import BeautifulSoup
except ImportError:
    BeautifulSoup = None

# One title cell of the article list. re.S lets '.' cross line breaks because
# the markup inside the span may be spread over several lines.
TITLE_SPAN_RE = re.compile(r'<span class="atc_title">.*?</span>', re.S)

# The anchor inside a title cell. The captures are bounded ([^"]* and .*?)
# so they cannot run past the attribute or element they belong to.
TITLE_LINK_RE = re.compile(
    r'<span class="atc_title">\s*'
    r'<a title="[^"]*" target="_blank" href="(?P<link>[^"]*)">'
    r'(?P<title>.*?)</a></span>'
)


def get_html(url):
    """Download *url* and return the page body as a native ``str``.

    On Python 3 the raw bytes are decoded as UTF-8 (undecodable bytes are
    replaced) so the text patterns above can be applied to the result; on
    Python 2 the byte string is returned unchanged, as before.
    NOTE(review): UTF-8 is assumed for the target page -- confirm.
    """
    # closing() guarantees the connection is released even if read() fails.
    with closing(urlopen(url)) as response:
        data = response.read()
    if not isinstance(data, str):
        data = data.decode('utf-8', 'replace')
    return data


def handle_html(html):
    """Extract article links from the list page markup.

    Prints one ``title<TAB>link`` line per article found and returns the
    links in document order. Title cells that do not contain the expected
    anchor markup are skipped instead of raising ``AttributeError``.
    """
    links = []
    for snippet in TITLE_SPAN_RE.findall(html):
        match = TITLE_LINK_RE.search(snippet)
        if match is None:
            continue
        link = match.group('link')
        # Single parenthesised argument: valid on Python 2 and Python 3.
        print(match.group('title') + '\t' + link)
        links.append(link)
    return links


def load_html(result, dest_dir='D:\\Documents'):
    """Download every link in *result* into *dest_dir*.

    Files are named ``1.html``, ``2.html``, ... following the order of
    *result*. *dest_dir* is created when it does not exist yet.
    """
    if not os.path.isdir(dest_dir):
        os.makedirs(dest_dir)
    for count, link in enumerate(result, 1):
        urlretrieve(link, os.path.join(dest_dir, '%s.html' % count))


if __name__ == '__main__':
    url = 'http://blog.sina.com.cn/s/articlelist_1191258123_0_1.html'
    html = get_html(url)
    result = handle_html(html)
    load_html(result)

# Page footer carried over from the source article:
# (Editor: Li Datong) [Disclaimer] All content on this site comes from the
# internet; the views expressed are solely those of the author and do not
# represent this site's position. If your rights have been inadvertently
# infringed, please contact the webmaster promptly to have the content removed.