
[Daily] The Go Programming Language: a concurrent web crawler


Two versions, both bounding concurrency at 20 HTTP fetches: the first uses a buffered-channel counting semaphore, the second a fixed pool of long-lived crawler goroutines.

crawler.go:

package main

import (
    "fmt"
    "links" // the book's gopl.io/ch5/links; a sketch appears at the end
    //"log"
    "os"
)

func main() {
    worklist := make(chan []string)

    // Start with the command-line arguments.
    go func() { worklist <- os.Args[1:] }()

    // Crawl the web concurrently.
    seen := make(map[string]bool)
    for list := range worklist {
        for _, link := range list {
            if !seen[link] {
                seen[link] = true
                go func(link string) {
                    worklist <- crawl(link)
                }(link)
            }
        }
    }
}

// tokens is a counting semaphore that limits the number of
// concurrent calls to links.Extract to 20.
var tokens = make(chan struct{}, 20)

// crawl prints a URL and extracts all links found on its page.
func crawl(url string) []string {
    fmt.Println(url)
    tokens <- struct{}{} // acquire a token
    list, err := links.Extract(url)
    <-tokens // release the token
    if err != nil {
        //log.Print(err)
    }
    return list
}
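
A design note on this first version: the buffered channel tokens caps the number of simultaneous links.Extract calls at 20, so the crawler cannot exhaust file descriptors by opening unbounded HTTP connections. One caveat: the main loop never exits, because nothing ever stops feeding worklist; once every reachable link has been seen, the program simply blocks. A minimal termination sketch (my addition, not part of the original article) counts pending sends to worklist and stops when the count drops to zero:

// Replacement for the main loop in crawler.go: n counts how many
// sends to worklist are still outstanding, so the loop can end.
n := 1 // one send is pending: the initial os.Args[1:] list
seen := make(map[string]bool)
for ; n > 0; n-- {
    list := <-worklist
    for _, link := range list {
        if !seen[link] {
            seen[link] = true
            n++ // the goroutine below will send to worklist once more
            go func(link string) {
                worklist <- crawl(link)
            }(link)
        }
    }
}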

crawler2.go:

package main

import (
    "fmt"
    "links" // the book's gopl.io/ch5/links; a sketch appears at the end
    //"log"
    "os"
    //"strings" // only needed if the HasPrefix filter below is enabled
)

func main() {
    worklist := make(chan []string)
    unseenLinks := make(chan string)

    // Start with the command-line arguments.
    go func() { worklist <- os.Args[1:] }()

    // Create 20 crawler goroutines to fetch each unseen link.
    for i := 0; i < 20; i++ {
        go func() {
            for link := range unseenLinks {
                //if strings.HasPrefix(link, "http://www.lypeng.com") {
                foundLinks := crawl(link)
                go func() { worklist <- foundLinks }()
                //}
            }
        }()
    }

    // The main goroutine de-duplicates worklist items
    // and sends the unseen ones to the crawlers.
    seen := make(map[string]bool)
    for list := range worklist {
        for _, link := range list {
            if !seen[link] {
                seen[link] = true
                unseenLinks <- link
            }
        }
    }
}

// crawl prints a URL and extracts all links found on its page.
// No semaphore is needed here: the fixed pool of 20 goroutines
// already bounds the number of concurrent calls to links.Extract.
func crawl(url string) []string {
    fmt.Println(url)
    list, err := links.Extract(url)
    if err != nil {
        //log.Print(err)
    }
    return list
}
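
Both files import a package named links that the article never shows. In the book this helper is gopl.io/ch5/links; assuming the same signature, Extract(url string) ([]string, error), a minimal reconstruction built on golang.org/x/net/html could look like this (a sketch of mine, not the article's code):

package links

import (
    "fmt"
    "net/http"

    "golang.org/x/net/html"
)

// Extract performs an HTTP GET on url, parses the response as HTML,
// and returns the links found in the document's <a href="..."> elements.
func Extract(url string) ([]string, error) {
    resp, err := http.Get(url)
    if err != nil {
        return nil, err
    }
    defer resp.Body.Close()
    if resp.StatusCode != http.StatusOK {
        return nil, fmt.Errorf("getting %s: %s", url, resp.Status)
    }
    doc, err := html.Parse(resp.Body)
    if err != nil {
        return nil, fmt.Errorf("parsing %s as HTML: %v", url, err)
    }

    var links []string
    var visit func(n *html.Node)
    visit = func(n *html.Node) {
        if n.Type == html.ElementNode && n.Data == "a" {
            for _, a := range n.Attr {
                if a.Key != "href" {
                    continue
                }
                // Resolve relative references against the page URL so
                // the crawler receives absolute, fetchable links.
                link, err := resp.Request.URL.Parse(a.Val)
                if err != nil {
                    continue // ignore malformed URLs
                }
                links = append(links, link.String())
            }
        }
        for c := n.FirstChild; c != nil; c = c.NextSibling {
            visit(c)
        }
    }
    visit(doc)
    return links, nil
}

With this package in place (under an import path your build can resolve), either crawler starts from a seed URL on the command line, for example: go run crawler.go http://gopl.io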
