加入收藏 | 设为首页 | 会员中心 | 我要投稿 李大同 (https://www.lidatong.com.cn/)- 科技、建站、经验、云计算、5G、大数据,站长网!
当前位置: 首页 > 百科 > 正文

国家统计局区划码爬取

发布时间:2020-12-12 13:44:55 所属栏目:百科 来源:网络整理
导读:目标数据 ? oracle存储表格 -- Create tablecreate table VILLAGE_CODE( id INTEGER,area_code VARCHAR2( 500 ),city_village_code VARCHAR2( 500 ),area_name VARCHAR2( 500 ))tablespace SYSTEM pctfree 10 pctused 40 initrans 1 maxtrans 255 storage (

目标数据

?

oracle存储表格

-- Create table
-- Target table for the scraped statistical division codes.
-- NOTE(review): id is described below as an auto-increment ID, but no
-- sequence, trigger or identity clause is defined in this script -- confirm
-- how it is meant to be populated.
create table VILLAGE_CODE
(
  id                INTEGER,
  area_code         VARCHAR2(500),
  city_village_code VARCHAR2(500),
  area_name         VARCHAR2(500)
)
-- NOTE(review): SYSTEM is normally reserved for the data dictionary;
-- consider a dedicated user tablespace.
tablespace SYSTEM
  pctfree 10
  pctused 40
  initrans 1
  maxtrans 255
  storage
  (
    initial 64K
    next 1M
    minextents 1
    maxextents unlimited
  );
-- Add comments to the columns
-- The comment text must be a quoted string literal.
comment on column VILLAGE_CODE.id
  is '自增ID';
comment on column VILLAGE_CODE.area_code
  is '统计用区划代码';
comment on column VILLAGE_CODE.city_village_code
  is '城乡分类代码';
comment on column VILLAGE_CODE.area_name
  is '名称';

?

爬取代码

#!/usr/bin/env python
# encoding: utf-8
'''
@author: lurenjia
@contact: [email protected]
@file: areacode.py
@time: 2018/9/29 14:40
@desc:
'''

import urllib2,re
from time import sleep
from random import random
from config import DBSession


# HTTP headers sent with every request. The value must be the bare
# user-agent string: the header name is supplied by the dict key, so it must
# not be repeated inside the value.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/69.0.3497.100 Safari/537.36"
    )
}
# Database session shared by all inserts (created once at import time).
session = DBSession()


def insertVillage(code, name, city_village_code=-1):
    """Insert one division row into ``village_code`` and commit it.

    Args:
        code: Value stored in the ``area_code`` column.
        name: Value stored in the ``area_name`` column.
        city_village_code: Value stored in ``city_village_code``. Callers
            that have no classification code omit it and ``-1`` is stored.
    """
    # Single formatted argument so the output is the same on Python 2 and 3.
    print("%s %s" % (code, city_village_code))
    # Bind parameters instead of interpolating scraped text into the SQL:
    # a name containing a quote would otherwise break (or alter) the statement.
    # NOTE(review): assumes DBSession is a SQLAlchemy session whose execute()
    # accepts ``:name`` bind parameters with a dict -- confirm against config.
    session.execute(
        "insert into village_code(area_code,area_name,city_village_code) "
        "VALUES (:area_code,:area_name,:city_village_code)",
        {
            "area_code": code,
            "area_name": name,
            # Keep storing the textual form (e.g. '-1'), as the original
            # quoted '%s' formatting did.
            "city_village_code": "%s" % (city_village_code,),
        }
    )
    session.commit()


def openUrl(url, type):
    """Download ``url`` and return its body decoded from GBK.

    Args:
        url: Address of the page to fetch.
        type: Crawl level passed by the caller (1-5); it is only written to
            the error log so a failed page can be attributed to its level.

    Returns:
        The decoded page text, or ``None`` when the request or the decoding
        failed. Failures are appended to ``error.txt`` instead of being raised.
    """
    # Random pause of up to half a second before each request.
    sleep(random() * 0.5)
    try:
        request = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(request, timeout=10)
        try:
            return response.read().decode("gbk")
        finally:
            # Always release the connection, even if read/decode fails.
            response.close()
    except Exception:
        # Best-effort crawl: record the failed URL and carry on. Catching
        # Exception (not a bare except) lets Ctrl-C and SystemExit propagate.
        with open("error.txt", "a+") as f:
            f.write(url + "                   " + str(type) + "\n")
        return None
    
    
def parseCode1(baseUrl, lastUrl):
    """Crawl the index page and descend into every province link found.

    Province rows themselves are not inserted; only their links are followed.

    Args:
        baseUrl: URL prefix that ``lastUrl`` and the found links are joined to.
        lastUrl: Page name of the index, relative to ``baseUrl``.
    """
    html = openUrl(baseUrl + lastUrl, 1)
    if not html:
        return
    # The patterns use plain ASCII single quotes around attribute values.
    for tr in re.findall(r"<tr class='provincetr'>.+?</tr>", html):
        for td in re.findall(r"<a href='(.+?html)'>(.+?)<br/>", tr):
            parseCode2(baseUrl, td[0])
        

def parseCode2(baseUrl, lastUrl):
    """Crawl one province page: store each city row and descend into it.

    Args:
        baseUrl: URL prefix that ``lastUrl`` is joined to.
        lastUrl: Province page path, relative to ``baseUrl``.
    """
    html = openUrl(baseUrl + lastUrl, 2)
    if not html:
        return
    # The patterns use plain ASCII single quotes around attribute values.
    for tr in re.findall(r"<tr class='citytr'>.+?</tr>", html):
        # Groups: (link to next level, code, name).
        for td in re.findall(
                r"<a href='(.+?html)'>(.+?)</a></td><td><a href='.+?'>(.+?)</a>",
                tr):
            insertVillage(td[1], td[2])
            parseCode3(baseUrl, td[0])
        

def parseCode3(baseUrl, lastUrl):
    """Crawl one city page: store each county row and descend into it.

    Args:
        baseUrl: URL prefix of the page that contained the link.
        lastUrl: Link to the city page, of the form ``<dir>/<page>.html``.
    """
    # Fold the leading directory of the link into the base so links found on
    # this page are resolved against this page's own directory.
    # NOTE(review): presumably hrefs are relative to the containing page.
    parts = lastUrl.split("/")
    baseUrl = baseUrl + parts[0] + "/"
    lastUrl = "/".join(parts[1:])
    html = openUrl(baseUrl + lastUrl, 3)
    if not html:
        return
    for tr in re.findall(r"<tr class='countytr'>.+?</tr>", html):
        # Groups: (link to next level, code, name). Rows without a link do
        # not match and are skipped.
        for td in re.findall(
                r"<a href='(.+?html)'>(.+?)</a></td><td><a href='.+?'>(.+?)</a>",
                tr):
            insertVillage(td[1], td[2])
            parseCode4(baseUrl, td[0])
        

def parseCode4(baseUrl, lastUrl):
    """Crawl one county page: store each town row and descend into it.

    Args:
        baseUrl: URL prefix of the page that contained the link.
        lastUrl: Link to the county page, of the form ``<dir>/<page>.html``.
    """
    # Fold the leading directory of the link into the base so links found on
    # this page are resolved against this page's own directory.
    # NOTE(review): presumably hrefs are relative to the containing page.
    parts = lastUrl.split("/")
    baseUrl = baseUrl + parts[0] + "/"
    lastUrl = "/".join(parts[1:])
    html = openUrl(baseUrl + lastUrl, 4)
    if not html:
        return
    for tr in re.findall(r"<tr class='towntr'>.+?</tr>", html):
        # Groups: (link to next level, code, name).
        for td in re.findall(
                r"<a href='(.+?html)'>(.+?)</a></td><td><a href='.+?'>(.+?)</a>",
                tr):
            insertVillage(td[1], td[2])
            parseCode5(baseUrl, td[0])
        

def parseCode5(baseUrl, lastUrl):
    """Crawl one town page and store every village row (the last level).

    Args:
        baseUrl: URL prefix of the page that contained the link.
        lastUrl: Link to the town page, of the form ``<dir>/<page>.html``.
    """
    # Fold the leading directory of the link into the base before fetching.
    parts = lastUrl.split("/")
    baseUrl = baseUrl + parts[0] + "/"
    lastUrl = "/".join(parts[1:])
    html = openUrl(baseUrl + lastUrl, 5)
    if not html:
        return
    for tr in re.findall(r"<tr class='villagetr'>.+?</tr>", html):
        # Three plain cells per row; stored as code=td[0], name=td[2] and
        # classification code=td[1].
        for td in re.findall(r"<td>(.+?)</td><td>(.+?)</td><td>(.+?)</td>", tr):
            insertVillage(td[0], td[2], td[1])
        

if __name__ == "__main__":
    # Root of the 2017 edition of the division-code pages; every deeper page
    # is reached by following links starting from index.html.
    baseUrl = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/"
    parseCode1(baseUrl, "index.html")

(编辑:李大同)

【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时联系站长删除相关内容!

    推荐文章
      热点阅读