加入收藏 | 设为首页 | 会员中心 | 我要投稿 李大同 (https://www.lidatong.com.cn/)- 科技、建站、经验、云计算、5G、大数据,站长网!
当前位置: 首页 > 大数据 > 正文

抓取历年天气

发布时间:2020-12-16 18:47:54 所属栏目:大数据 来源:网络整理
导读:使用goquery抓取天气的demo。数据量有点多。目前按省份存储天气数据,并存储到csv文件中。代码开头: package main; import ("code.google.com/p/mahonia"; "encoding/csv"; "fmt"; "github.com/PuerkitoBio/goquery"; "net/http"; "os"; "strings"; "time"); var log = loger.Loger{Level: ……

使用goquery抓取天气的demo。数据量有点多。目前按省份存储天气数据。存储到csv文件中。


package main

import (
	"code.google.com/p/mahonia"
	"encoding/csv"
	"fmt"
	"github.com/PuerkitoBio/goquery"
	"net/http"
	"os"
	"strings"
	"time"
)

// levelDebug is the most verbose level understood by stderrLogger.
const levelDebug = 0

// stderrLogger is a minimal standard-library stand-in for the third-party
// loger.Loger this file originally used. That package is not imported here,
// so the original declaration could not compile. Only the two methods this
// file calls, Debug and Log, are provided.
// NOTE(review): the original library's output format and level handling are
// unknown; this stand-in simply prints to standard error.
type stderrLogger struct {
	// Level is the verbosity threshold; Debug prints only at levelDebug.
	Level int
}

// Debug writes args, space-separated and newline-terminated, to standard
// error when the logger is at debug level.
func (l stderrLogger) Debug(args ...interface{}) {
	if l.Level > levelDebug {
		return
	}
	fmt.Fprintln(os.Stderr, args...)
}

// Log writes args, space-separated and newline-terminated, to standard error
// regardless of Level.
func (l stderrLogger) Log(args ...interface{}) {
	fmt.Fprintln(os.Stderr, args...)
}

// log is the logger shared by every function in this file.
var log = stderrLogger{Level: levelDebug}

// YEAR is the year that main puts at the front of each output CSV file name.
const YEAR = 2013

// SleepTime is the pause between two monthly page requests, in milliseconds.
const SleepTime = 100

// main scrapes the weather history of every city returned by GetCity, one
// province at a time, and writes each province's records to its own CSV file
// via SaveToCSV. A province whose "<YEAR><province>.csv" file already exists
// is skipped.
func main() {
	sc, cc := GetCity()
	// A single client is enough for the whole run. The timeout keeps one
	// stalled request from hanging the program forever.
	client := &http.Client{Timeout: 30 * time.Second}
	for key, value := range sc {
		filePath := fmt.Sprintf("%d%s.csv", YEAR, key)
		if _, err := os.Stat(filePath); err == nil {
			continue
		}
		// Length 0 with a capacity hint: a non-zero length would put nil
		// entries ahead of the real records, and SaveToCSV dereferences
		// every entry.
		weatherInfoAll := make([]*WeaterInfo, 0, len(value)*366)
		for _, city := range value {
			name, ok := cc[city]
			if !ok {
				// No link is known for this name, so there is nothing to fetch.
				continue
			}
			log.Debug("get ", key, city)
			for _, info := range GetWeather(client, city, name) {
				if info == nil {
					continue
				}
				// The page scraper only receives a URL, so the province and
				// city are filled in here, where both are known.
				info.Province = key
				info.City = city
				weatherInfoAll = append(weatherInfoAll, info)
			}
		}
		if err := SaveToCSV(key, weatherInfoAll); err != nil {
			log.Log(err)
		}
	}
}

// GetCity downloads the history index page of tianqihoubao.com and extracts
// the province/city listing from it.
//
// It returns two maps: sc maps a province name to the names of its cities,
// and cc maps a city name to that city's link with ".html" removed. If the
// page cannot be requested or parsed, the error is logged and both maps are
// returned nil.
func GetCity() (sc map[string][]string, cc map[string]string) {
	url := "http://www.tianqihoubao.com/lishi/"
	request, err := http.NewRequest("GET", url, nil)
	if err != nil {
		log.Log(err)
		return
	}
	request.Header.Add("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML,like Gecko) Ubuntu Chromium/39.0.2171.65 Chrome/39.0.2171.65 Safari/537.36")
	request.Header.Add("referer", "http://www.tianqihoubao.com/")
	resp, err := http.DefaultClient.Do(request)
	if err != nil {
		log.Log(err)
		return
	}
	document, err := goquery.NewDocumentFromResponse(resp)
	if err != nil {
		log.Log(err)
		return
	}
	// Text read from the page is converted from GBK before use.
	gbk := mahonia.NewDecoder("gbk")
	sc = make(map[string][]string)
	cc = make(map[string]string)
	document.Find(".citychk").Find("dl").Each(func(index int, s *goquery.Selection) {
		province := gbk.ConvertString(s.Find("dt").Find("b").Text())
		// Length 0 with a capacity hint: make([]string, 20) would give every
		// province 20 empty city names ahead of the real ones.
		cities := make([]string, 0, 20)
		s.Find("dd").Find("a").Each(func(index int, se *goquery.Selection) {
			uri, exists := se.Attr("href")
			if !exists {
				return
			}
			name := gbk.ConvertString(se.Text())
			uri = strings.Replace(uri, ".html", "", -1)
			cities = append(cities, name)
			cc[name] = uri
		})
		sc[province] = cities
	})
	return
}

// WeaterInfo is one row of scraped weather data. All fields are plain
// strings, listed in the order SaveToCSV writes them as columns.
type WeaterInfo struct {
	// Province and City identify the place the record belongs to.
	Province, City string
	// Date is the day the record describes.
	Date string
	// Info is the weather condition, Temp the temperature and Wind the wind
	// force and direction.
	Info, Temp, Wind string
}

// GetWeather fetches the twelve monthly history pages of YEAR for one city
// and returns every record found, in month order.
//
// name is the city's path on the site; it is placed directly after the host
// name, so it is expected to start with "/". province is not used by this
// function and is kept only so existing callers keep compiling. The function
// pauses SleepTime milliseconds after each page request.
func GetWeather(client *http.Client, province, name string) []*WeaterInfo {
	// Length 0 with a capacity hint: a non-zero length would put nil entries
	// ahead of the real records.
	weaterInfoYear := make([]*WeaterInfo, 0, 380)
	for month := 1; month <= 12; month++ {
		// The page name is "<year><two-digit month>.html". The format has two
		// verbs, so the year must be passed along with the month.
		url := fmt.Sprintf("http://www.tianqihoubao.com%s/month/%d%02d.html", name, YEAR, month)
		for _, info := range GetWeatherInfo(client, url) {
			if info != nil {
				weaterInfoYear = append(weaterInfoYear, info)
			}
		}
		time.Sleep(time.Millisecond * SleepTime)
	}
	return weaterInfoYear
}

// GetWeatherInfo downloads one monthly history page and returns one record
// per table row after the first.
//
// Only Date, Info, Temp and Wind are filled in. Province and City are left
// empty because neither can be derived from the arguments; the caller must
// set them. If the page cannot be requested or parsed, the error is logged
// and nil is returned. A nil client falls back to http.DefaultClient.
func GetWeatherInfo(client *http.Client, url string) (weaterInfos []*WeaterInfo) {
	if client == nil {
		client = http.DefaultClient
	}
	request, err := http.NewRequest("GET", url, nil)
	if err != nil {
		log.Log(err)
		return
	}
	// Same browser User-Agent that GetCity sends to this site.
	request.Header.Add("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML,like Gecko) Ubuntu Chromium/39.0.2171.65 Chrome/39.0.2171.65 Safari/537.36")
	resp, err := client.Do(request)
	if err != nil {
		log.Log(err)
		return
	}
	document, err := goquery.NewDocumentFromResponse(resp)
	if err != nil {
		log.Log(err)
		return
	}
	// Text read from the page is converted from GBK before use.
	gbk := mahonia.NewDecoder("gbk")
	// Length 0 with a capacity hint: a non-zero length would put nil entries
	// ahead of the real records.
	weaterInfos = make([]*WeaterInfo, 0, 31)
	document.Find("#content").Find("tbody").Find("tr").Each(func(row int, s *goquery.Selection) {
		// Skip the first row (presumably the table header).
		if row == 0 {
			return
		}
		var date, info, temp, wind string
		s.Find("td").Each(func(col int, se *goquery.Selection) {
			switch col {
			case 0:
				// The date is read from the link inside the first cell.
				date = gbk.ConvertString(se.Find("a").Text())
			case 1:
				info = gbk.ConvertString(se.Text())
			case 2:
				temp = gbk.ConvertString(se.Text())
			case 3:
				wind = gbk.ConvertString(se.Text())
			}
		})
		weaterInfos = append(weaterInfos, &WeaterInfo{
			Date: date,
			Info: info,
			Temp: temp,
			Wind: wind,
		})
	})
	return
}

// SaveToCSV writes weatherInfos to "<YEAR><file>.csv" in the current
// directory: a UTF-8 byte order mark, a header row, then one row per record
// with newlines and spaces stripped from every field by TrimSpace.
//
// If the target file already exists, nothing is written and nil is returned.
// Nil entries in weatherInfos are skipped. The first error from creating,
// writing, flushing or closing the file is returned.
func SaveToCSV(file string, weatherInfos []*WeaterInfo) (err error) {
	// The name must carry the year: the format has two verbs, and main looks
	// for the file under this same "<YEAR><province>.csv" name.
	filePath := fmt.Sprintf("%d%s.csv", YEAR, file)
	if _, statErr := os.Stat(filePath); statErr == nil {
		return nil
	}
	f, err := os.Create(filePath)
	if err != nil {
		log.Log(err)
		return err
	}
	defer func() {
		// Report a failed close only when nothing else went wrong earlier.
		if closeErr := f.Close(); err == nil {
			err = closeErr
		}
	}()
	// UTF-8 byte order mark (BOM).
	if _, err = f.WriteString("\xEF\xBB\xBF"); err != nil {
		return err
	}
	w := csv.NewWriter(f)
	if err = w.Write([]string{"省份", "城市", "日期", "天气状况", "气温", "风力风向"}); err != nil {
		return err
	}
	for _, weatherInfo := range weatherInfos {
		if weatherInfo == nil {
			continue
		}
		record := []string{
			TrimSpace(weatherInfo.Province),
			TrimSpace(weatherInfo.City),
			TrimSpace(weatherInfo.Date),
			TrimSpace(weatherInfo.Info),
			TrimSpace(weatherInfo.Temp),
			TrimSpace(weatherInfo.Wind),
		}
		if err = w.Write(record); err != nil {
			return err
		}
	}
	// The writer is buffered, so one final flush is enough. Error reports
	// any failure from the flush or an earlier buffered write.
	w.Flush()
	return w.Error()
}

// TrimSpace returns value with every newline and every space character
// removed. Unlike strings.TrimSpace, it also removes them from the middle of
// the string, not just from the ends.
func TrimSpace(value string) string {
	// strings.Replace takes four arguments (s, old, new, n); both characters
	// are replaced with the empty string here in a single pass.
	return strings.NewReplacer("\n", "", " ", "").Replace(value)
}

日志库删掉了,因为看起来不是很好。当然,这并不是说这段代码就写得好,它只是临时写的东西。

(编辑:李大同)

【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意中侵犯到您的权利,请及时联系站长删除相关内容!

    推荐文章
      热点阅读