下载全国城市空气质量历史数据

发布时间：2020-12-17 17:29:54 所属栏目：Python 来源：网络整理

导读：今天PHP站长网 52php.cn把收集自互联网的代码分享给大家，仅供参考。下面的 Python 脚本使用 urllib 从天气后报（http://www.tianqihoubao.com）抓取各城市的空气质量历史页面，并把页面中的表格数据保存为文本文件。

以下代码由PHP站长网 52php.cn收集自互联网

现在PHP站长网小编把它分享给大家，仅供参考。

import os
import shutil
import re
import urllib.request

home = "http://www.tianqihoubao.com"

def down2txt(code, tit, url):
    """Download one page and save the text of its first table to a file.

    The page at ``url`` is fetched and decoded (GBK first, UTF-8 as a
    fallback). The text found between the tags of the first ``<table>`` is
    collected, and every nine non-empty cells are written as one line to
    ``<cwd>/data/<code>/<tit>.txt``. Each cell is followed by a single
    space, so lines end with ``" \\n"``. A trailing group of fewer than
    nine cells is discarded.

    If the target file already exists, a message is printed and nothing is
    downloaded. If no complete row can be parsed, a message is printed and
    no file is created, so a later run can retry the page.

    Args:
        code: Name of the sub-directory under ``data/``.
        tit: Output file name without the ``.txt`` extension.
        url: Address of the page to download.

    Returns:
        None.

    Raises:
        Errors raised by ``urllib.request.urlopen``, by decoding the page,
        or by writing the file are propagated to the caller.
    """
    # Number of cells that make up one output line.
    columns = 9

    folder = os.path.join(os.getcwd(), "data", code)
    out_path = os.path.join(folder, tit + ".txt")

    # Skip the download entirely when the file is already there.
    if os.path.exists(out_path):
        print("文件已存在：" + tit + ".txt")
        return

    with urllib.request.urlopen(url) as response:
        raw = response.read()
    try:
        page = raw.decode("gbk")
    except UnicodeDecodeError:
        page = raw.decode("utf-8")

    # Cut out the first table, dropping the opening tag's attributes.
    cells = []
    table_start = page.find("<table")
    table_end = page.find("</table>", table_start) if table_start != -1 else -1
    if table_start != -1 and table_end != -1:
        table = page[table_start:table_end]
        table = table[table.find(">"):]
        # Line breaks inside the markup would otherwise end up in the cells.
        table = table.replace("\r", "").replace("\n", "")
        table = table.replace("</b>", "").replace("<b>", "")
        # [^<>]+ only captures text between tags; adjacent tags such as
        # "</td><td>" produce no match instead of capturing a tag.
        found = re.findall(r">([^<>]+)<", table)
        cells = [text.strip() for text in found]
        cells = [text for text in cells if text]

    lines = [
        " ".join(cells[start:start + columns]) + " \n"
        for start in range(0, len(cells) - columns + 1, columns)
    ]
    if not lines:
        # Do not leave an empty file behind: it would block future retries.
        print("未解析到表格数据：" + url)
        return

    os.makedirs(folder, exist_ok=True)
    # NOTE(review): the file is written with the platform default encoding,
    # as before; pass an explicit encoding if consumers need a fixed one.
    with open(out_path, "w") as out_file:
        out_file.writelines(lines)

def down_city(name, code):
    """Download every linked data page of one city.

    The city overview page ``<home>/aqi/<code>.html`` is fetched, all links
    of the form ``/aqi/<code>-....html`` (single-quoted ``href``) are
    collected, and each one is passed to ``down2txt``. The output file name
    is the link with the ``/aqi/`` prefix and the ``.html`` suffix removed.

    Args:
        name: City display name. Not used by this function; kept so
            existing callers keep working.
        code: City code as used in the site's URLs.

    Returns:
        None.

    Raises:
        Errors from fetching or decoding the overview page, and any error
        raised by ``down2txt``, are propagated to the caller.
    """
    url = home + "/aqi/" + code + ".html"
    print(url)
    with urllib.request.urlopen(url) as response:
        raw = response.read()
    # Same decoding strategy as the rest of the file: GBK, then UTF-8.
    try:
        page = raw.decode("gbk")
    except UnicodeDecodeError:
        page = raw.decode("utf-8")

    # re.escape keeps characters in the city code from acting as regex syntax.
    link_pattern = re.compile(r"href='(/aqi/" + re.escape(code) + r"-[^']+?\.html)'")
    for link in link_pattern.findall(page):
        page_url = home + link
        tit = link.replace("/aqi/", "").replace(".html", "")
        print(page_url)
        # down2txt takes (code, tit, url); the title argument was missing.
        down2txt(code, tit, page_url)

if __name__ == "__main__":
    # Script entry point: read the city list from <home>/aqi/, download each
    # city with down_city(), and log finished cities in dataindex.txt so an
    # interrupted run can resume without downloading them again.
    index_file = "dataindex.txt"

    url = home + "/aqi/"
    with urllib.request.urlopen(url) as response:
        page = response.read()
    try:
        page = page.decode("gbk")
    except UnicodeDecodeError:
        page = page.decode("utf-8")

    # Load the download log once; each line is "<code> <name>".
    done = set()
    if os.path.exists(index_file):
        with open(index_file, "r") as f:
            done = {line.rstrip("\r\n") for line in f}

    ls = re.findall(re.compile('href="/aqi/(.+?)</a>'), page)
    index = 0
    for l in ls:
        try:
            # 'beijing.html">Name' -> ["beijing", "Name"]
            ls2 = l.replace(" ", "").replace('.html">', " ").strip(" ").split(" ")
            if len(ls2) != 2:
                continue

            index += 1
            print(str(index) + "/" + str(len(ls)) + ": " + ls2[0] + " " + ls2[1])

            # Skip cities already present in the download log.
            record = ls2[0] + " " + ls2[1]
            if record in done:
                print(ls2[1] + " 已下载")
                continue

            down_city(ls2[1], ls2[0])

            # Record the city only after its download finished without error.
            with open(index_file, "a") as f:
                f.write(record + "\n")
            done.add(record)
        except Exception as exc:
            # Keep going with the next city, but show what went wrong.
            # Catching Exception (not a bare except) leaves Ctrl-C working.
            print("error!", exc)

    print("finished!")

以上内容由PHP站长网【52php.cn】收集整理供大家参考研究

如果以上内容对您有帮助，欢迎收藏、点赞、推荐、分享。

（编辑：李大同）

【声明】本站内容均来自网络，其相关言论仅代表作者个人观点，不代表本站立场。若无意侵犯到您的权利，请及时联系站长删除相关内容！