How to Crawl a Foreign Weather Forecast Site with Python
This article demonstrates a method for crawling a foreign weather forecast site (AccuWeather) with Python, shared here for your reference. The details are as follows.

crawl_weather.py is as follows:

#encoding=utf-8
import httplib
import urllib2
import time
from threading import Thread
import threading
from Queue import Queue
from time import sleep
import re
import copy

lang = "fr"
count = 0

class Location:
    # Example instances: Location(False,"中国","北京","zh")
    #                    Location(True,"","亚洲","zh")
    def __init__(self,is_beyond_country,country_name,loc_name,lang):
        self.country_name = country_name
        self.loc_name = loc_name
        self.lang = lang
        self.is_beyond_country = is_beyond_country

prn_lock = threading.RLock()

def GetLocationURLs(url,recursive):
    # Recursively walk the browse-locations tree; a URL containing
    # "weather-forecast" is a leaf (an actual location page).
    global count
    if url.find("weather-forecast") != -1:
        count = count + 1
        if count % 500 == 0:
            prn_lock.acquire()
            print "count:%d" % (count)
            prn_lock.release()
        return [url]
    page = urllib2.urlopen(url).read()
    time.sleep(0.01)
    # Matches lines such as:
    # <h6><a href="http://www.accuweather.com/zh/browse-locations/afr"><em>Africa</em></a></h6>
    pattern = '<h6><a href="(.*)"><em>(.*)</em></a></h6>'
    locs = re.findall(pattern,page)
    locs = [(url,name) for url,name in locs if url.find("browse-locations") != -1 or url.find("weather-forecast") != -1]
    if not recursive:
        urls = [url for url,name in locs]
        return urls
    urls = []
    for _url,_name in locs:
        lst = GetLocationURLs(_url,True)
        urls.extend(lst)
    return urls

#entry_url = "http://www.accuweather.com/zh/browse-locations"
entry_url = "http://www.accuweather.com/%s/browse-locations/eur/fr" % (lang)
#regions = ["afr","ant","arc","asi","cac","eur","mea","nam","ocn","sam"]
#regions = ["eur"]
#region_urls = [ "%s/%s" % (entry_url,reg) for reg in regions]
#region_urls = ["http://www.accuweather.com/zh/browse-locations/eur/fr"]

# First collect the immediate sub-pages, then let worker threads
# expand each of them recursively.
sub_urls = GetLocationURLs(entry_url,False)
print len(sub_urls)
print sub_urls

q = Queue()
location_urls = []
ThreadNum = 5
lock = threading.RLock()

for url in sub_urls:
    q.put(url)

def working():
    while True:
        url = q.get()
        lst = GetLocationURLs(url,True)
        print "%s %d urls " % (url,len(lst))
        lock.acquire()
        location_urls.extend(lst)
        lock.release()
        q.task_done()

for i in range(ThreadNum):
    t = Thread(target=working)
    t.setDaemon(True)
    t.start()

q.join()

fp = open('locations.txt',"w")
fp.write("\n".join(location_urls))
fp.close()

#for url in location_urls:
#    print url
#location_urls = GetLocationURLs(entry_url)

# Leftover, commented-out code from an earlier download script:
'''
def Fetch(url):
    try:
        print url
        web_path = url[0]
        local_name = url[1]
        print "web_path:",web_path
        print "local_name:",local_name
        sContent = urllib2.urlopen(web_path).read()
        savePath = "D:\\Course\\NLP_Manning\\%s" % (local_name)
        print savePath
        file = open(savePath,'wb')
        file.write(sContent)
        file.close()
        print savePath + " saved"
    except:
        pass

def working():
    while True:
        url = q.get()
        Fetch(url)
        sleep(10)
        q.task_done()

root_url = "https://class.coursera.org/nlp/lecture/index?lecture_player=flash"
page = urllib2.urlopen(root_url).read()

for i in range(NUM):
    t = Thread(target=working)
    t.setDaemon(True)
    t.start()

urls = copy.deepcopy(ppt_urls)
urls.extend(srt_urls)
urls.extend(video_urls)
print len(ppt_urls)
print len(srt_urls)
print len(video_urls)
print len(urls)
for url in urls:
    q.put(url)

q.join()
'''

'''
root_url = "http://www.accuweather.com/zh/cn/andingmen/57494/weather-forecast/57494"
page = urllib2.urlopen(root_url).read()
print page
'''
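The script above is Python 2 (urllib2, Queue, print statements). As a rough illustration of the same recursive link-extraction idea on Python 3, here is a minimal sketch; it assumes the AccuWeather markup still matches the original regular expression, which it very likely no longer does, so treat it as a pattern illustration rather than a working crawler:

# Minimal Python 3 sketch of GetLocationURLs; the regex and URL
# conventions are taken from the original script and are assumed,
# not verified against the current site.
import re
import urllib.request

LINK_PATTERN = re.compile(r'<h6><a href="(.*?)"><em>(.*?)</em></a></h6>')

def get_location_urls(url, recursive=True):
    # URLs containing "weather-forecast" are leaf location pages.
    if "weather-forecast" in url:
        return [url]
    with urllib.request.urlopen(url) as resp:
        page = resp.read().decode("utf-8", errors="replace")
    links = [u for u, _name in LINK_PATTERN.findall(page)
             if "browse-locations" in u or "weather-forecast" in u]
    if not recursive:
        return links
    urls = []
    for link in links:
        urls.extend(get_location_urls(link, True))
    return urls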
FetchLocation.py is as follows:

#encoding=utf-8
import sys
import httplib
import urllib2
import time
from threading import Thread
import threading
from Queue import Queue
from time import sleep
import re
import copy
from xml.dom import minidom
import HTMLParser
import datetime

q = Queue()
locks = [threading.RLock() for i in range(2)]
ThreadNumber = 20
locations = {}
conds = {}

def FindCountryBreadCrumbs(page):
    # Cut out the <ul id="country-breadcrumbs"> ... </ul> block,
    # scanning the page line by line.
    lines = page.splitlines()
    count = 0
    start = -1
    end = -1
    opened = False
    for line in lines:
        if line.find('<ul id="country-breadcrumbs">') != -1:
            start = count
            opened = True
        if opened and line.find("</ul>") != -1:
            end = count
            opened = False
        count = count + 1
    if start == -1 or end == -1:
        return ""
    return "\n".join(lines[start:(end + 1)])

def GetText(nodelist):
    # Concatenate the text children of a DOM node, unescaping entities.
    rc = []
    for node in nodelist:
        if node.nodeType == node.TEXT_NODE:
            rc.append(HTMLParser.HTMLParser().unescape(node.data))
    return ''.join(rc)

def FindCondition(page):
    # Weather-condition strings live in <span class="cond">...</span>.
    pat = '<span class="cond">(.*?)</span>'
    cds = re.findall(pat,page)
    cds = [HTMLParser.HTMLParser().unescape(cd).encode("utf-8") for cd in cds]
    return cds

def ExtractInfo(url):
    # Returns (location names from the breadcrumb, condition strings).
    try:
        page = urllib2.urlopen(url).read()
    except Exception,e:
        return [],[]  # empty pair so callers can still unpack
    text = FindCountryBreadCrumbs(page)
    text = HTMLParser.HTMLParser().unescape(text)
    dom = minidom.parseString(text.encode("utf-8"))
    locs = []
    lis = dom.getElementsByTagName("li")
    for li in lis:
        adr_list = li.getElementsByTagName("a")
        if adr_list:
            locs.append(GetText(adr_list[0].childNodes).encode("utf-8"))
        strs = li.getElementsByTagName("strong")
        if strs:
            locs.append(GetText(strs[0].childNodes).encode("utf-8"))
    cds = FindCondition(page)
    return locs,cds

def AddMap(lst,m):
    # Use the dict as a set: record each item once.
    for x in lst:
        if m.get(x) == None:
            m[x] = 1

def working():
    while True:
        urls = q.get()  # each queue item is a batch of URLs
        #print len(urls)
        m = {}
        m2 = {}
        count = 0
        for url in urls:
            count = count + 1
            #print "%d/%d" % (count,len(urls))
            locs,cds = ExtractInfo(url)
            AddMap(locs,m)
            AddMap(cds,m2)
        # Merge this batch's results into the global maps.
        locks[1].acquire()
        AddMap(m.keys(),locations)
        AddMap(m2.keys(),conds)
        locks[1].release()
        q.task_done()

def main():
    if len(sys.argv) < 2:
        exit()
    loc_path = sys.argv[1]
    fp = open(loc_path,"r")
    urls = [line.strip() for line in fp]
    fp.close()
    #urls = urls[0:1000]
    # Split the URL list into one batch per worker thread.
    blocks = len(urls) / ThreadNumber + 1
    for start in range(0,len(urls),blocks):
        end = start + blocks
        if end > len(urls):
            end = len(urls)
        q.put(urls[start:end])
    for i in range(ThreadNumber):
        t = Thread(target=working)
        t.setDaemon(True)
        t.start()
    q.join()
    fp = open("location_name.fr","w")
    fp.write("\n".join(locations.keys()))
    fp.close()
    fp = open("conditions.fr","w")
    fp.write("\n".join(conds.keys()))
    fp.close()

if __name__ == '__main__':
    main()
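Taken together, the pipeline runs in two steps: python crawl_weather.py walks the browse-locations tree and writes one forecast URL per line to locations.txt, then python FetchLocation.py locations.txt reads that file, splits it into ThreadNumber batches, and writes the de-duplicated location names and condition strings to location_name.fr and conditions.fr. On Python 3 the same batch-and-merge pattern can be expressed with concurrent.futures, which removes the explicit lock because merging happens in the main thread; the sketch below uses a hypothetical extract_info() as a stand-in for the original ExtractInfo():

# Python 3 sketch of FetchLocation.py's batching pattern using a
# thread pool; extract_info() is a hypothetical placeholder for the
# original page-fetching and parsing logic.
import sys
from concurrent.futures import ThreadPoolExecutor

THREAD_NUMBER = 20

def extract_info(url):
    # Placeholder: fetch `url` and return (location names, conditions).
    return [], []

def process_batch(urls):
    # Each worker de-duplicates its own batch with local sets.
    locs, conds = set(), set()
    for url in urls:
        l, c = extract_info(url)
        locs.update(l)
        conds.update(c)
    return locs, conds

def main(loc_path):
    with open(loc_path) as fp:
        urls = [line.strip() for line in fp if line.strip()]
    block = len(urls) // THREAD_NUMBER + 1
    batches = [urls[i:i + block] for i in range(0, len(urls), block)]
    locations, conditions = set(), set()
    with ThreadPoolExecutor(max_workers=THREAD_NUMBER) as pool:
        # Merging in the main thread makes an explicit lock unnecessary.
        for locs, conds in pool.map(process_batch, batches):
            locations |= locs
            conditions |= conds
    with open("location_name.fr", "w") as fp:
        fp.write("\n".join(sorted(locations)))
    with open("conditions.fr", "w") as fp:
        fp.write("\n".join(sorted(conditions)))

if __name__ == "__main__":
    main(sys.argv[1])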
Hopefully what this article describes is helpful for your Python programming.