加入收藏 | 设为首页 | 会员中心 | 我要投稿 李大同 (https://www.lidatong.com.cn/)- 科技、建站、经验、云计算、5G、大数据,站长网!
当前位置: 首页 > 大数据 > 正文

golang 采集图片

发布时间:2020-12-16 18:50:44 所属栏目:大数据 来源:网络整理
导读:这是帮朋友采集图片时写的:看他打开韩国网站,一张一张点开再保存,速度太慢,实在看不下去了。原本想用 PHP 写,刚好看了无闻大大的 Go 采集示例,就厚着脸皮改了改直接用了。代码开头:package main import ( "fmt" "io" "io/ioutil" "log" "net/http" "os" "path" "regexp" "strings" )

这是帮朋友采集图片时写的:看他打开韩国网站,一张一张点开再保存,速度太慢,实在看不下去了。原本想用 PHP 写,刚好看了无闻大大的 Go 采集示例,就厚着脸皮改了改直接用了。

package main

import (
	"fmt"
	"io"
	"io/ioutil"
	"log"
	"net/http"
	"os"
	"path"
	"regexp"
	"strings"
	"sync"
	"time"
)

// NotFoundError is the error returned by HttpGet when the server responds
// with status 404.
type NotFoundError struct {
	Message string
}

// Error implements the error interface.
func (e NotFoundError) Error() string {
	return e.Message
}

// RemoteError is the error returned by HttpGet when the request could not be
// performed or the server answered with a status other than 200 or 404.
type RemoteError struct {
	Host string
	Err  error
}

// Error implements the error interface by returning the wrapped error's text.
func (e *RemoteError) Error() string {
	return e.Err.Error()
}

// UserAgent is the User-Agent header value sent with every request.
var UserAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML,like Gecko) Chrome/29.0.1541.0 Safari/537.36"

// httpClient is shared by main and download instead of allocating a client per
// request. The timeout keeps a stalled server from blocking a worker, and
// therefore program exit, forever.
var httpClient = &http.Client{Timeout: 60 * time.Second}

// HttpGet requests url with the GET method and returns the response body.
// Entries in header are copied onto the request after User-Agent is set, so a
// caller-supplied User-Agent overrides the default.
//
// On status 200 the caller owns the returned body and must close it. Status
// 404 yields a NotFoundError; a transport failure or any other status yields
// a *RemoteError.
func HttpGet(client *http.Client, url string, header http.Header) (io.ReadCloser, error) {
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return nil, err
	}
	req.Header.Set("User-Agent", UserAgent)
	for k, vs := range header {
		req.Header[k] = vs
	}
	resp, err := client.Do(req)
	if err != nil {
		return nil, &RemoteError{Host: req.URL.Host, Err: err}
	}
	if resp.StatusCode == 200 {
		return resp.Body, nil
	}
	resp.Body.Close()
	// 403 is deliberately not treated as "not found": it can be a rate-limit
	// response.
	if resp.StatusCode == 404 {
		return nil, NotFoundError{"Resource not found: " + url}
	}
	return nil, &RemoteError{
		Host: req.URL.Host,
		Err:  fmt.Errorf("get %s -> %d", url, resp.StatusCode),
	}
}

// HttpGetBytes requests url and returns the whole response body. Errors are
// those of HttpGet, plus any error encountered while reading the body.
func HttpGetBytes(client *http.Client, url string, header http.Header) ([]byte, error) {
	rc, err := HttpGet(client, url, header)
	if err != nil {
		return nil, err
	}
	defer rc.Close()
	return ioutil.ReadAll(rc)
}

// HttpGetToFile requests url and streams the response body into fileName,
// creating the parent directory if needed. Errors are those of HttpGet, plus
// any error from creating the directory or from creating, writing or closing
// the file.
func HttpGetToFile(client *http.Client, url string, header http.Header, fileName string) error {
	rc, err := HttpGet(client, url, header)
	if err != nil {
		return err
	}
	defer rc.Close()

	if err := os.MkdirAll(path.Dir(fileName), os.ModePerm); err != nil {
		return err
	}
	f, err := os.Create(fileName)
	if err != nil {
		return err
	}
	if _, err := io.Copy(f, rc); err != nil {
		f.Close()
		return err
	}
	// Close explicitly so that a failure to flush the data is reported.
	return f.Close()
}

// img matches the goView(<digits> link on a listing page; the capture group
// is the numeric idx of the detail page.
var img = regexp.MustCompile(`href="javascript:goView\((\d+)`)

// imgPattern matches the main image tag on a detail page, up to and including
// the .jpg extension of its relative ../upload path.
var imgPattern = regexp.MustCompile(`id="mainImage" src="\.\./upload(.*?)\.jpg`)

// totalTask counts the downloads that have been scheduled and not yet
// finished. It is touched by main and by every worker, so all access goes
// through addTask and pendingTasks.
var (
	totalTask   int
	totalTaskMu sync.Mutex
)

// addTask adds delta to totalTask under the lock.
func addTask(delta int) {
	totalTaskMu.Lock()
	totalTask += delta
	totalTaskMu.Unlock()
}

// pendingTasks returns the current value of totalTask under the lock.
func pendingTasks() int {
	totalTaskMu.Lock()
	defer totalTaskMu.Unlock()
	return totalTask
}

// download fetches one detail page and saves every image matched by
// imgPattern under pics/. url is a raw match of img (the href prefix followed
// by the idx). num is the semaphore that limits concurrency: the caller sends
// on it before starting download, and download receives from it when done.
func download(url string, num chan bool) {
	// Always release the slot and decrement the counter, including on the
	// early-return path below, so main can never block on a leaked slot.
	defer func() {
		addTask(-1)
		<-num
	}()

	idx := strings.TrimPrefix(url, `href="javascript:goView(`)
	page := "http://www.gdweb.co.kr/main/koreaWebView.asp?idx=%s&url=koreaWeb.asp"
	t, err := HttpGetBytes(httpClient, fmt.Sprintf(page, idx), nil)
	if err != nil {
		// One unreachable detail page must not terminate the process while
		// other workers are in the middle of writing files.
		log.Printf("获取页面失败:%v", err)
		return
	}

	for _, match := range imgPattern.FindAll(t, -1) {
		// Turn the matched relative ../upload/... path into an absolute URL.
		imgURL := "http://www.gdweb.co.kr" + strings.TrimPrefix(string(match), `id="mainImage" src="..`)
		log.Printf("正在下载:%s", imgURL)
		if err := HttpGetToFile(httpClient, imgURL, nil, "pics/"+path.Base(imgURL)); err != nil {
			log.Printf("图片下载失败(%s):%v", imgURL, err)
		}
	}
}

// main crawls the listing pages and starts one download goroutine per detail
// link found, with at most cap(num) downloads running at once.
func main() {
	// Limit the number of simultaneous downloads.
	num := make(chan bool, 5)

	// The main goroutine crawls the listing pages; worker goroutines download
	// the images.
	//baseUrl := "http://nvmingxing.net/hotness/%d/"
	//abaseUrl := "http://www.gdweb.co.kr/main/koreaWebView.asp?idx=8200&url=koreaWeb.asp"
	baseUrl := "http://www.gdweb.co.kr/main/koreaWeb.asp?idx=&url=index.asp&lpage=124&page=%d"
	for i := 2; i < 124; i++ {
		page := i + 1 // the page number actually requested
		// NOTE(review): the value logged here is the outstanding task count,
		// not the page number, as in the original code.
		log.Printf("抓取页面:%d", pendingTasks())
		data, err := HttpGetBytes(httpClient, fmt.Sprintf(baseUrl, page), nil)
		if err != nil {
			log.Fatalf("获取页面失败(%d):%v", page, err)
		}

		for _, match := range img.FindAll(data, -1) {
			addTask(1)
			num <- true // blocks while cap(num) downloads are in flight
			go download(string(match), num)
		}
	}

	// Wait for the remaining workers: once every slot can be taken here, each
	// download has released its own slot and therefore finished.
	for i := 0; i < cap(num); i++ {
		num <- true
	}
}

(编辑:李大同)

【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时与站长联系删除相关内容!

    推荐文章
      热点阅读