python bs4+requests 实现大学排名爬取
发布时间:2020-12-20 10:42:49 所属栏目:Python 来源:网络整理
导读:# -*- coding: utf-8 -*- """ Created on Mon Sep 9 10:55:55 2019@author: Administrator """ import os import requests import bs4 from bs4 import BeautifulSoup def main(): url = "http://www.zuihaodaxue.com/zuihaodaxuepaiming2019.html" root =
# -*- coding: utf-8 -*-
"""
Created on Mon Sep 9 10:55:55 2019

Scrape the 2019 "Best Chinese Universities" ranking page, cache a
prettified copy of the HTML to disk, parse the ranking table, and print
the top entries as an aligned three-column table.

@author: Administrator
"""
import os

import requests
import bs4
from bs4 import BeautifulSoup


def main():
    """Entry point: cache the page, fetch it, then parse and print the top 50 rows."""
    url = "http://www.zuihaodaxue.com/zuihaodaxuepaiming2019.html"
    root = "d:/pictures//"
    path = root + url.split("/")[-1]  # cache file named after the page
    writehtml(url, path)
    uinfo = []
    html = gethtmltext(url)
    fillunivlist(uinfo, html)
    printunivlist(uinfo, 50)


def writehtml(url, path):
    """Download *url* and save a prettified copy of its HTML to *path*.

    The download is skipped when *path* already exists and is non-empty,
    so repeated runs reuse the cached copy.
    """
    if (not os.path.exists(path)) or os.path.getsize(path) <= 0:
        r = requests.get(url, timeout=30)
        # Fail loudly on HTTP errors so an error page is never cached.
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        soup = BeautifulSoup(r.text, "html.parser")
        with open(path, "w", encoding="utf-8") as f1:
            f1.write(soup.prettify())
        print("保存成功")
    else:
        print("文件已存在")


def gethtmltext(url):
    """Return the decoded HTML text of *url*, or "" on any request failure."""
    try:
        r = requests.get(url, timeout=30)
        # Check the status code BEFORE decoding/using the body.
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Narrow catch: only network/HTTP failures yield the empty-string
        # sentinel; programming errors still propagate.
        return ""


def fillunivlist(ulist, html):
    """Parse the ranking table in *html*, appending [rank, name, score] rows to *ulist*."""
    soup = BeautifulSoup(html, "html.parser")
    for tr in soup.find("tbody").children:
        # Keep only real <tr> Tag nodes; skips the NavigableString
        # whitespace that bs4 yields between rows.
        if isinstance(tr, bs4.element.Tag):
            tds = tr("td")  # all <td> cells of this row, as a list
            ulist.append([tds[0].string, tds[1].string, tds[3].string])


def printunivlist(ulist, num):
    """Print the first *num* rows of *ulist* as a tab-separated aligned table.

    Format-spec notes: ``{:^10}`` centers within 10 characters; the fill
    character for the school-name column is chr(12288), the full-width
    (ideographic) space, so columns of Chinese text line up visually.
    """
    tl = "{0:^10}\t{1:{3}^10}\t{2:<4}"
    print(tl.format("排名", "学校名称", "分数", chr(12288)))
    for ul in ulist[:num]:
        print(tl.format(ul[0], ul[1], ul[2], chr(12288)))


if __name__ == "__main__":
    main()