go语言 grequests+goquery 简单爬虫,使用多协程并发爬取
发布时间:2020-12-16 19:15:42 所属栏目:大数据 来源:网络整理
导读:/*下载工具*/ package main import ( "fmt" //go语言版本的jquery "github.com/PuerkitoBio/goquery" "os" "sync" "strings" //go语言版本的request "github.com/levigross/grequests" "time" "strconv" ) var wg sync.WaitGroup func main() { now := time.Now() initalUrls …
/*下载工具*/ package main import ( "fmt" //go语言版本的jquery "github.com/PuerkitoBio/goquery" "os" "sync" "strings" //go语言版本的request "github.com/levigross/grequests" "time" "strconv" ) var wg sync.WaitGroup func main() { now := time.Now() initalUrls := []string{"http://www.zngirls.com/girl/18071/album/",} for _,url := range initalUrls { doc,err := goquery.NewDocument(url) if err != nil { fmt.Errorf("下载错误:%#v",err) os.Exit(-1) } doc.Find(".igalleryli_link").Each(func(i int,s *goquery.Selection) { src,exists := s.Find("img").Attr("src") fmt.Printf("开始下载影集图片:%vn",src) if (exists) { wg.Add(1) go func(src string) { defer wg.Done() //下载图片 //tryTimes := map[int]int n := 0 s := strings.Replace(src,"cover/","",1) ss := strings.Split(s,"/") fm := strings.Join(ss[:len(ss) - 1],"/") sf0 := fm + "/%d.jpg" sfn := fm + "/%03d.jpg" for { //持续下载 s := "" if n == 0 { s = fmt.Sprintf(sf0,n) } else { s = fmt.Sprintf(sfn,n) } fmt.Printf("准备下载: %vn",s) res,_ := grequests.Get(s,&grequests.RequestOptions{ //结构体可以对指定的类型给值,而不一定都赋值 Headers:map[string]string{ "Referer":"http://www.zngirls.com","User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML,like Gecko) Chrome/50.0.2661.102 Safari/537.36"}}) //条件需要修改,如果没有图片,返回的是盗链,图片4kb if res.StatusCode != 200 { fmt.Printf("下载失败,退出影集下载:%sn",src) break } //图片可能是该网站,返回的盗链图片(4kb左右) length := res.Header.Get("Content-Length") slen,_ := strconv.Atoi(length) if slen < 4100{ fmt.Printf("下载内容失败,退出影集下载:%sn",src) break } index := strings.Index(s,"gallery") if index == -1 { fmt.Errorf("无效地址,找不到gallery关键词,解析失败:%sn",src) return } ss2 := strings.Split(string(s[index:]),"/") dirname := strings.Join(ss2[:len(ss2) - 1],"/") if _,err := os.Stat(dirname); err != nil { fmt.Printf("创建下载文件夹:%sn",dirname) os.MkdirAll(dirname,0666) } filename := strings.Join(ss2,"/") res.DownloadToFile(filename) fmt.Printf("成功下载图片到:%sn",filename) n++ } }(src) } }) } wg.Wait() //4M的带宽下载,需要16m36s,总大小202M,10个文件夹,560个文件 fmt.Printf("下载任务完成,耗时:%#vn",time.Now().Sub(now)) } (编辑:李大同) 
【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意中侵犯到您的权利,请及时联系站长删除相关内容!