国家统计局区划码爬取

发布时间：2020-12-12 13:44:55　所属栏目：百科　来源：网络整理

导读：目标数据；oracle存储表格：-- Create table create table VILLAGE_CODE (id INTEGER, area_code VARCHAR2(500), city_village_code VARCHAR2(500), area_name VARCHAR2(500)) tablespace SYSTEM pctfree 10 pctused 40 initrans 1 maxtrans 255 storage (……

目标数据

oracle存储表格

-- Create table
-- Holds one row per administrative division scraped by the crawler:
-- the division code, its urban/rural classification code and its name.
-- NOTE(review): the table is placed in the SYSTEM tablespace exactly as in
-- the original script; confirm whether a user tablespace should be used.
create table VILLAGE_CODE
(
  id                INTEGER,
  area_code         VARCHAR2(500),
  city_village_code VARCHAR2(500),
  area_name         VARCHAR2(500)
)
tablespace SYSTEM
  pctfree 10
  pctused 40
  initrans 1
  maxtrans 255
  storage
  (
    initial 64K
    next 1M
    minextents 1
    maxextents unlimited
  );
-- Add comments to the columns 
-- (string literals must use straight single quotes; typographic quotes are
-- not valid SQL text-literal delimiters)
comment on column VILLAGE_CODE.id
  is '自增ID';
comment on column VILLAGE_CODE.area_code
  is '统计用区划代码';
comment on column VILLAGE_CODE.city_village_code
  is '城乡分类代码    ';
comment on column VILLAGE_CODE.area_name
  is '名称';

爬取代码

#!/usr/bin/env python
# encoding: utf-8
'''
@author: lurenjia
@contact: [email protected]
@file: areacode.py
@time: 2018/9/29 14:40
@desc:
'''

import urllib2,re
from time import sleep
from random import random
from config import DBSession


# HTTP request headers sent with every page fetch.  The value must be the
# bare browser string: the header name is already supplied by the dict key,
# so repeating "User-Agent: " inside the value would send a malformed
# user-agent to the server.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/69.0.3497.100 Safari/537.36"
    ),
}
# Module-wide database session used by insertVillage for every insert and
# commit; DBSession comes from the project's config module.
session = DBSession()


def insertVillage(code,name,city_village_code='-1'):
    """Insert one row into ``village_code`` and commit it immediately.

    Args:
        code: value stored in the ``area_code`` column.
        name: value stored in the ``area_name`` column.
        city_village_code: value stored in the ``city_village_code`` column;
            defaults to '-1' for callers that have no such code.
    """
    def _sql_literal(value):
        # The statement is assembled as text, so double any embedded single
        # quote; otherwise a scraped name containing "'" would break (or
        # alter) the statement.
        # NOTE(review): if DBSession supports bind parameters, prefer those.
        return ('%s' % value).replace("'", "''")

    # Progress output; formatted so it prints "code city_village_code" under
    # both the print statement and the print function.
    print('%s %s' % (code, city_village_code))
    # All three placeholders must be filled, in column order
    # (area_code, area_name, city_village_code).
    session.execute(
        "insert into village_code(area_code,area_name,city_village_code) "
        "VALUES ('%s','%s','%s')"
        % (_sql_literal(code), _sql_literal(name), _sql_literal(city_village_code))
    )
    session.commit()


def openUrl(url,type):
    """Fetch *url* and return the response body decoded as GBK.

    Args:
        url: absolute URL of the page to download.
        type: crawl-level tag supplied by the caller (the parseCodeN
            functions pass their level number); it is only written to the
            error log so failed pages can be retried at the right level.

    Returns:
        The decoded page text, or None when the request or decoding failed.
        Failures are appended to ``error.txt`` as "<url><spaces><type>".
    """
    try:
        # Random pause of up to 0.5 s before each request.
        sleep(random()*0.5)
        request = urllib2.Request(url,headers=headers)
        response = urllib2.urlopen(request,timeout=10)
        try:
            return response.read().decode('gbk')
        finally:
            # Always release the connection, even if read/decode fails.
            response.close()
    except Exception:
        # Best-effort crawl: record the failed page and carry on.  Catching
        # Exception (rather than a bare except with a return in finally)
        # lets KeyboardInterrupt/SystemExit stop the crawler.
        with open('error.txt','a') as f:
            f.write(url+'                   '+str(type)+'\n')
        return None
    
    
def parseCode1(baseUrl,lastUrl):
    """Level 1: read the index page and descend into every province link.

    Args:
        baseUrl: directory URL (ending in '/') that the links are relative to.
        lastUrl: page name relative to *baseUrl*.

    Nothing is inserted at this level; each link found in a ``provincetr``
    row is handed to parseCode2.  A failed download (openUrl returned None)
    is skipped silently here because openUrl has already logged it.
    """
    html = openUrl(baseUrl+lastUrl,1)
    if not html:
        return
    for row in re.findall("<tr class='provincetr'>.+?</tr>",html):
        # Each match is (href, link text); only the href is needed.
        for link in re.findall("<a href='(.+?html)'>(.+?)<br/>",row):
            parseCode2(baseUrl,link[0])
        

def parseCode2(baseUrl,lastUrl):
    """Level 2: store every linked row of a province page, then descend.

    Args:
        baseUrl: directory URL (ending in '/') that the links are relative to.
        lastUrl: page path relative to *baseUrl*.

    For each ``citytr`` row whose two cells are both links, the code and
    name are inserted via insertVillage and the link is followed with
    parseCode3.  Rows without links do not match and are skipped.
    """
    html = openUrl(baseUrl + lastUrl,2)
    if not html:
        return
    for row in re.findall("<tr class='citytr'>.+?</tr>",html):
        # Groups: (href, code, name).
        for cell in re.findall("<a href='(.+?html)'>(.+?)</a></td><td><a href='.+?'>(.+?)</a>",row):
            insertVillage(cell[1],cell[2])
            parseCode3(baseUrl,cell[0])
        

def parseCode3(baseUrl,lastUrl):
    """Level 3: store every linked row of a city page, then descend.

    Args:
        baseUrl: directory URL (ending in '/') that *lastUrl* is relative to.
        lastUrl: link of the form '<dir>/<page>.html'.

    The leading directory of *lastUrl* is folded into the base URL so that
    the relative links found on the fetched page resolve against the page's
    own directory when they are passed one level down.

    NOTE(review): the published listing of this function was truncated (the
    loop header had no colon and searched an undefined ``td[2]``).  The
    row/link loop below is reconstructed to mirror parseCode2 -- confirm
    against the original script.
    """
    parts = lastUrl.split('/')
    baseUrl = baseUrl + parts[0] + '/'
    lastUrl = '/'.join(parts[1:])
    html = openUrl(baseUrl + lastUrl,3)
    if not html:
        return
    for row in re.findall("<tr class='countytr'>.+?</tr>",html):
        # Groups: (href, code, name).  Rows without links do not match.
        for cell in re.findall("<a href='(.+?html)'>(.+?)</a></td><td><a href='.+?'>(.+?)</a>",row):
            insertVillage(cell[1],cell[2])
            parseCode4(baseUrl,cell[0])
        

def parseCode4(baseUrl,lastUrl):
    """Level 4: store every linked row of a county page, then descend.

    Args:
        baseUrl: directory URL (ending in '/') that *lastUrl* is relative to.
        lastUrl: link of the form '<dir>/<page>.html'.

    NOTE(review): the published listing of this function was truncated (the
    signature read ``(baseUrl,4)`` and the fetch and inner loop were
    missing).  The body is reconstructed from the pattern visible in
    parseCode3: fold the link's directory into the base URL, fetch with
    level tag 4, insert each linked ``towntr`` row and follow it with
    parseCode5 -- confirm against the original script.
    """
    parts = lastUrl.split('/')
    baseUrl = baseUrl + parts[0] + '/'
    lastUrl = '/'.join(parts[1:])
    html = openUrl(baseUrl + lastUrl,4)
    if not html:
        return
    for row in re.findall("<tr class='towntr'>.+?</tr>",html):
        # Groups: (href, code, name).  Rows without links do not match.
        for cell in re.findall("<a href='(.+?html)'>(.+?)</a></td><td><a href='.+?'>(.+?)</a>",row):
            insertVillage(cell[1],cell[2])
            parseCode5(baseUrl,cell[0])
        

def parseCode5(baseUrl,lastUrl):
    """Level 5: store every row of a town page (leaf level, no descent).

    Args:
        baseUrl: directory URL (ending in '/') that *lastUrl* is relative to.
        lastUrl: link of the form '<dir>/<page>.html'.

    Each ``villagetr`` row has three plain cells; they are passed to
    insertVillage as (code=cell 1, name=cell 3, city_village_code=cell 2).

    NOTE(review): the published listing of this function was truncated (the
    signature read ``(baseUrl,5)`` and the fetch was missing).  The URL
    handling and fetch are reconstructed from the pattern visible in
    parseCode3 -- confirm against the original script.
    """
    parts = lastUrl.split('/')
    baseUrl = baseUrl + parts[0] + '/'
    lastUrl = '/'.join(parts[1:])
    html = openUrl(baseUrl + lastUrl,5)
    if not html:
        return
    for row in re.findall("<tr class='villagetr'>.+?</tr>",html):
        for cell in re.findall("<td>(.+?)</td><td>(.+?)</td><td>(.+?)</td>",row):
            insertVillage(cell[0],cell[2],cell[1])
        

if __name__=="__main__":
    # Root directory of the 2017 edition of the code tables; every level's
    # links are resolved relative to it, starting from the index page.
    baseUrl = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/'
    parseCode1(baseUrl,'index.html')

（编辑：李大同）

【声明】本站内容均来自网络，其相关言论仅代表作者个人观点，不代表本站立场。若无意侵犯到您的权利，请及时联系站长删除相关内容！