加入收藏 | 设为首页 | 会员中心 | 我要投稿 李大同 (https://www.lidatong.com.cn/)- 科技、建站、经验、云计算、5G、大数据,站长网!
当前位置: 首页 > 编程开发 > Python > 正文

python bs4+requests 实现大学排名爬取

发布时间:2020-12-20 10:42:49 所属栏目:Python 来源:网络整理
导读:# -*- coding: utf-8 -*- """ Created on Mon Sep 9 10:55:55 2019 @author: Administrator """ import os import requests import bs4 from bs4 import BeautifulSoup def main(): url = "http://www.zuihaodaxue.com/zuihaodaxuepaiming2019.html" root = …
# -*- coding: utf-8 -*-
"""
Created on Mon Sep  9 10:55:55 2019

@author: Administrator
"""
import os
import requests
import bs4
from bs4 import BeautifulSoup


def main():
    """Cache the 2019 ranking page on disk, then print the top 50 rows.

    The page is saved under ``d:/pictures//`` using the last path segment
    of the URL as the file name. The ranking rows are then parsed from a
    separate download of the same URL and printed as a table.
    """
    page_url = "http://www.zuihaodaxue.com/zuihaodaxuepaiming2019.html"
    save_dir = "d:/pictures//"
    file_name = page_url.rsplit("/", 1)[-1]
    writehtml(page_url, save_dir + file_name)

    rankings = []
    fillunivlist(rankings, gethtmltext(page_url))
    printunivlist(rankings, 50)
    
    
def writehtml(url,path):
    """Download ``url`` and save its prettified HTML to ``path``.

    Nothing is downloaded when ``path`` already exists and is non-empty.
    A failed request (network error, timeout or HTTP error status) is
    reported on stdout and leaves no file behind, so an error page is never
    cached and later mistaken for the real document.

    Args:
        url: Address of the page to fetch.
        path: Destination file; written as UTF-8. Its parent directory is
            created if it does not exist yet.
    """
    if os.path.exists(path) and os.path.getsize(path) > 0:
        print("文件已存在")
        return

    try:
        # A timeout keeps the script from hanging on an unresponsive server.
        r = requests.get(url, timeout=30)
        r.raise_for_status()
    except requests.RequestException as err:
        print("保存失败: {}".format(err))
        return

    # Let requests guess the charset from the body; the declared header
    # encoding is not trusted here.
    r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, "html.parser")

    folder = os.path.dirname(path)
    if folder:
        os.makedirs(folder, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f1:
        f1.write(soup.prettify())
    print("保存成功")

    
def gethtmltext(url):
    """Return the decoded body of ``url``, or ``""`` if the request fails.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response text decoded with the encoding detected from the body,
        or an empty string on a network error, timeout or HTTP error status.
    """
    try:
        # A timeout keeps the script from hanging on an unresponsive server.
        r = requests.get(url, timeout=30)
        # Check the status before spending time on charset detection.
        r.raise_for_status()
    except requests.RequestException:
        # Only request failures are turned into ""; programming errors and
        # KeyboardInterrupt are no longer swallowed.
        return ""
    r.encoding = r.apparent_encoding
    return r.text


def fillunivlist(ulist,html):
    """Append ``[rank, school, score]`` to ``ulist`` for each table row.

    The rows are the tag children of the first ``<tbody>`` in ``html``; the
    values are the ``.string`` of the 1st, 2nd and 4th ``<td>`` of each row.

    Args:
        ulist: List that is extended in place.
        html: HTML source of the ranking page. Empty or unexpected markup
            (for example after a failed download) adds nothing.
    """
    soup = BeautifulSoup(html, "html.parser")
    tbody = soup.find("tbody")
    if tbody is None:
        # No table body at all: nothing to extract.
        return

    for tr in tbody.children:
        # .children also yields whitespace text nodes; keep real tags only.
        if not isinstance(tr, bs4.element.Tag):
            continue
        # Calling a tag is shorthand for find_all(): all <td> cells of the row.
        tds = tr("td")
        if len(tds) < 4:
            # Malformed or header-like row without the columns we need.
            continue
        ulist.append([tds[0].string, tds[1].string, tds[3].string])


def printunivlist(ulist,num):
    """Print a header and the first ``num`` rows of ``ulist`` as a table.

    Args:
        ulist: Sequence of rows; items 0, 1 and 2 of each row are printed as
            rank, school name and score. A ``None`` item is printed as an
            empty cell.
        num: Maximum number of rows to print.
    """
    # Format-spec reminders for str.format():
    #   {:>10}   right-align, pad to 10 characters with ASCII spaces
    #   {:0>10}  right-align, pad with "0"
    #   {:^10}   centre, pad with ASCII spaces
    #   {:<10}   left-align
    #   {1:{3}^10} centre, pad with the fill character given as argument 3
    #
    # chr(12288) is the full-width (CJK) space. Padding the Chinese column
    # with it keeps the columns visually aligned.
    fill = chr(12288)
    # Columns are separated by real tab characters.
    tl = "{0:^10}\t{1:{3}^10}\t{2:<4}"
    print(tl.format("排名","学校名称","分数",fill))
    for ul in ulist[:num]:
        # A None cell would raise TypeError under a format spec, so it is
        # normalised to an empty string first.
        rank, name, score = (
            "" if cell is None else str(cell)
            for cell in (ul[0], ul[1], ul[2])
        )
        print(tl.format(rank, name, score, fill))

    
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

(编辑:李大同)

【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时联系站长删除相关内容!

    推荐文章
      热点阅读