国家统计局区划码爬取
发布时间:2020-12-12 13:44:55 所属栏目:百科 来源:网络整理
导读:目标数据;Oracle 存储表格:-- Create table create table VILLAGE_CODE ( id INTEGER, area_code VARCHAR2(500), city_village_code VARCHAR2(500), area_name VARCHAR2(500) ) tablespace SYSTEM pctfree 10 pctused 40 initrans 1 maxtrans 255 storage ( ……
#!/usr/bin/env python
# encoding: utf-8
"""Crawl the National Bureau of Statistics administrative-division codes.

@author: lurenjia
@contact: [email protected]
@file: areacode.py
@time: 2018/9/29 14:40

Target data: Oracle storage table
---------------------------------
The crawler inserts rows into the table below (DDL as published with the
script; string literals are kept verbatim, English glosses added)::

    -- Create table
    create table VILLAGE_CODE
    (
      id                INTEGER,
      area_code         VARCHAR2(500),
      city_village_code VARCHAR2(500),
      area_name         VARCHAR2(500)
    )
    tablespace SYSTEM
      pctfree 10
      pctused 40
      initrans 1
      maxtrans 255
      storage
      (
        initial 64K
        next 1M
        minextents 1
        maxextents unlimited
      );
    -- Add comments to the columns
    comment on column VILLAGE_CODE.id
      is '自增ID';            -- auto-increment ID
    comment on column VILLAGE_CODE.area_code
      is '统计用区划代码';     -- statistical division code
    comment on column VILLAGE_CODE.city_village_code
      is '城乡分类代码 ';      -- urban/rural classification code
    comment on column VILLAGE_CODE.area_name
      is '名称';              -- name

Crawler code
------------
The site is organised as five nested page levels (province -> city ->
county -> town -> village). ``parseCode1`` .. ``parseCode5`` each handle one
level; levels 2-4 store the rows they find and recurse into the next level,
level 5 stores the leaf rows.

NOTE(review): the published copy of this script was damaged (typographic
quotes, dropped repeated fragments). ``parseCode3`` .. ``parseCode5`` and the
``name`` argument of ``insertVillage`` were reconstructed by analogy with the
intact ``parseCode2``; confirm against the original if it is available.
"""
import re
from random import random
from time import sleep

try:  # Python 2, as originally written
    import urllib2
except ImportError:  # Python 3: same Request/urlopen API under a new name
    import urllib.request as urllib2

from config import DBSession

# The header *value* must not repeat the "User-Agent: " field name.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/69.0.3497.100 Safari/537.36"
    )
}
session = DBSession()

# Named bind parameters keep scraped text (which may contain quotes) out of
# the SQL string itself.
_INSERT_SQL = (
    "insert into village_code(area_code,area_name,city_village_code) "
    "VALUES (:area_code,:area_name,:city_village_code)"
)

# Row/cell patterns, compiled once because they run for every fetched page.
_PROVINCE_ROW = re.compile(r"<tr class='provincetr'>.+?</tr>")
_PROVINCE_LINK = re.compile(r"<a href='(.+?html)'>(.+?)<br/>")
_CITY_ROW = re.compile(r"<tr class='citytr'>.+?</tr>")
_COUNTY_ROW = re.compile(r"<tr class='countytr'>.+?</tr>")
_TOWN_ROW = re.compile(r"<tr class='towntr'>.+?</tr>")
_VILLAGE_ROW = re.compile(r"<tr class='villagetr'>.+?</tr>")
# (href, code, name) of a row whose code and name cells are both links.
_LINKED_CELLS = re.compile(
    r"<a href='(.+?html)'>(.+?)</a></td><td><a href='.+?'>(.+?)</a>"
)
# (code, urban/rural class code, name) of a leaf row with plain cells.
_VILLAGE_CELLS = re.compile(r"<td>(.+?)</td><td>(.+?)</td><td>(.+?)</td>")


def insertVillage(code, name, city_village_code='-1'):
    """Insert one division row into ``village_code`` and commit.

    :param code: statistical division code (``area_code`` column).
    :param name: division name (``area_name`` column).
    :param city_village_code: urban/rural classification code; defaults to
        ``'-1'`` for levels whose pages do not carry one.
    """
    print("%s %s %s" % (code, name, city_village_code))
    # NOTE(review): assumes DBSession yields a SQLAlchemy-style session whose
    # execute() accepts a statement plus a dict of named bind parameters --
    # confirm against config.py.
    session.execute(
        _INSERT_SQL,
        {
            "area_code": code,
            "area_name": name,
            "city_village_code": city_village_code,
        },
    )
    session.commit()


def openUrl(url, type):
    """Fetch ``url`` and return its body decoded as GBK, or ``None``.

    A random pause of up to 0.5 s precedes every request. On any ordinary
    failure (network error, timeout, decode error) the URL and the page
    level are appended to ``error.txt`` and ``None`` is returned, so one bad
    page does not abort the crawl.

    :param url: absolute page URL.
    :param type: page level (1-5); only recorded in the error log. The name
        shadows the builtin but is kept for caller compatibility.
    :return: decoded HTML text, or ``None`` when the fetch failed.
    """
    html = None
    try:
        sleep(random() * 0.5)
        request = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(request, timeout=10)
        try:
            html = response.read().decode('gbk')
        finally:
            response.close()
    except Exception:
        # Deliberately broad (best-effort crawl), but unlike a bare
        # ``except`` with ``return`` in ``finally`` it lets
        # KeyboardInterrupt/SystemExit stop the program.
        html = None
        with open('error.txt', 'a') as f:
            f.write(url + ' ' + str(type) + '\n')
    return html


def _enterSubdir(baseUrl, lastUrl):
    """Split ``'dir/page.html'`` into (``baseUrl + 'dir/'``, ``'page.html'``).

    Links on a page are relative to that page's directory, so each level
    below the city level moves the base URL one directory deeper.
    """
    subdir, _, page = lastUrl.partition('/')
    return baseUrl + subdir + '/', page


def _storeAndDescend(html, rowPattern, baseUrl, descend):
    """Store every linked row matched in ``html`` and crawl its child page."""
    for tr in rowPattern.findall(html):
        for href, code, name in _LINKED_CELLS.findall(tr):
            insertVillage(code, name)
            descend(baseUrl, href)


def parseCode1(baseUrl, lastUrl):
    """Level 1: follow every province link on the index page.

    Province rows themselves are not inserted; only their child pages are
    crawled.
    """
    html = openUrl(baseUrl + lastUrl, 1)
    if html:
        for tr in _PROVINCE_ROW.findall(html):
            for td in _PROVINCE_LINK.findall(tr):
                parseCode2(baseUrl, td[0])


def parseCode2(baseUrl, lastUrl):
    """Level 2: store the cities of one province and crawl each of them."""
    html = openUrl(baseUrl + lastUrl, 2)
    if html:
        _storeAndDescend(html, _CITY_ROW, baseUrl, parseCode3)


def parseCode3(baseUrl, lastUrl):
    """Level 3: store the counties of one city and crawl each of them."""
    baseUrl, lastUrl = _enterSubdir(baseUrl, lastUrl)
    html = openUrl(baseUrl + lastUrl, 3)
    if html:
        _storeAndDescend(html, _COUNTY_ROW, baseUrl, parseCode4)


def parseCode4(baseUrl, lastUrl):
    """Level 4: store the towns of one county and crawl each of them."""
    baseUrl, lastUrl = _enterSubdir(baseUrl, lastUrl)
    html = openUrl(baseUrl + lastUrl, 4)
    if html:
        _storeAndDescend(html, _TOWN_ROW, baseUrl, parseCode5)


def parseCode5(baseUrl, lastUrl):
    """Level 5: store the villages of one town (leaf level, no recursion)."""
    baseUrl, lastUrl = _enterSubdir(baseUrl, lastUrl)
    html = openUrl(baseUrl + lastUrl, 5)
    if html:
        for tr in _VILLAGE_ROW.findall(html):
            # Cell order on the page is code, classification code, name.
            for td in _VILLAGE_CELLS.findall(tr):
                insertVillage(td[0], td[2], td[1])


if __name__ == "__main__":
    baseUrl = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/'
    parseCode1(baseUrl, 'index.html')

# (Editor: Li Datong)
# [Disclaimer] All content on this site comes from the Internet; the views
# expressed are those of the author alone and do not represent the position
# of this site. If your rights have been unintentionally infringed, please
# contact the webmaster promptly to have the content removed.