Trouble with the Go Tour web crawler exercise

Date: 2012-11-04 09:49:23

Tags: go

I'm going through the Go Tour, and I feel like I have a good grasp of the language except for concurrency.

Slide 72 has an exercise that asks the reader to parallelize a web crawler (and to keep it from fetching duplicates, but I haven't gotten that far yet).

Here is what I have so far:

func Crawl(url string, depth int, fetcher Fetcher, ch chan string) {
    if depth <= 0 {
        return
    }

    body, urls, err := fetcher.Fetch(url)
    if err != nil {
        ch <- fmt.Sprintln(err)
        return
    }

    ch <- fmt.Sprintf("found: %s %q\n", url, body)
    for _, u := range urls {
        go Crawl(u, depth-1, fetcher, ch)
    }
}

func main() {
    ch := make(chan string, 100)
    go Crawl("http://golang.org/", 4, fetcher, ch)

    for i := range ch {
        fmt.Println(i)
    }
}

The problem I'm having is where to put the close(ch) call. If I put a defer close(ch) somewhere in the Crawl method, I end up writing to a closed channel from one of the spawned goroutines, because the method finishes executing before the goroutines it spawned do.

If I omit the call to close(ch), as in my example code, the program deadlocks: all the goroutines finish executing, but the main goroutine is still waiting on the channel in its for loop, because the channel is never closed.
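
For reference, the pattern several of the answers below converge on is to pair the channel with a sync.WaitGroup and close the channel from a dedicated goroutine once every sender has finished. A minimal, generic sketch of just that pattern, independent of the crawler code:

package main

import (
    "fmt"
    "sync"
)

func main() {
    ch := make(chan string)
    var wg sync.WaitGroup

    for i := 0; i < 3; i++ {
        wg.Add(1) // register the sender before starting it
        go func(n int) {
            defer wg.Done()
            ch <- fmt.Sprintf("result %d", n)
        }(i)
    }

    // Close the channel only after every sender is done, so the range
    // below terminates and nobody ever writes to a closed channel.
    go func() {
        wg.Wait()
        close(ch)
    }()

    for s := range ch {
        fmt.Println(s)
    }
}

This avoids both failure modes described above: the channel is closed exactly once, and only after the last send has completed.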

18 Answers:

Answer 0 (score: 16)

A look at the Parallelization section of Effective Go leads to ideas for the solution. You have to close the channel on every return path of the function. This is actually a nice use case for the defer statement:

func Crawl(url string, depth int, fetcher Fetcher, ret chan string) {
    defer close(ret)
    if depth <= 0 {
        return
    }

    body, urls, err := fetcher.Fetch(url)
    if err != nil {
        ret <- err.Error()
        return
    }

    ret <- fmt.Sprintf("found: %s %q", url, body)

    result := make([]chan string, len(urls))
    for i, u := range urls {
        result[i] = make(chan string)
        go Crawl(u, depth-1, fetcher, result[i])
    }

    for i := range result {
        for s := range result[i] {
            ret <- s
        }
    }

    return
}

func main() {
    result := make(chan string)
    go Crawl("http://golang.org/", 4, fetcher, result)

    for s := range result {
        fmt.Println(s)
    }
}

The essential difference from your code is that every instance of Crawl gets its own return channel, and the calling function collects the results from those return channels.

Answer 1 (score: 8)

I went in a completely different direction with this one. I may have been misled by the hint about using a map.

// SafeUrlMap is safe to use concurrently.
type SafeUrlMap struct {
    v   map[string]string
    mux sync.Mutex
}

func (c *SafeUrlMap) Set(key string, body string) {
    c.mux.Lock()
    // Lock so only one goroutine at a time can access the map c.v.
    c.v[key] = body
    c.mux.Unlock()
}

// Value returns mapped value for the given key.
func (c *SafeUrlMap) Value(key string) (string, bool) {
    c.mux.Lock()
    // Lock so only one goroutine at a time can access the map c.v.
    defer c.mux.Unlock()
    val, ok := c.v[key]
    return val, ok
}

// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher, urlMap *SafeUrlMap) {
    defer wg.Done()

    if depth <= 0 {
        return
    }

    body, urls, err := fetcher.Fetch(url)
    if err != nil {
        fmt.Println(err)
        return
    }

    // Record the fetched page, keyed by URL.
    urlMap.Set(url, body)

    for _, u := range urls {
        if _, ok := urlMap.Value(u); !ok {
            wg.Add(1)
            go Crawl(u, depth-1, fetcher, urlMap)
        }
    }

    return
}

var wg sync.WaitGroup

func main() {
    urlMap := &SafeUrlMap{v: make(map[string]string)}

    wg.Add(1)
    go Crawl("http://golang.org/", 4, fetcher, urlMap)
    wg.Wait()

    for url := range urlMap.v {
        body, _ := urlMap.Value(url)
        fmt.Printf("found: %s %q\n", url, body)
    }
}

Answer 2 (score: 7)

An O(1) map lookup of a URL, rather than an O(n) search over a slice of all visited URLs, should help minimize the time spent inside the critical section. That time is trivial for this example, but it would become relevant at scale.

The WaitGroup is used to keep the top-level Crawl() function from returning until all child goroutines have completed.

func Crawl(url string, depth int, fetcher Fetcher) {
    var str_map = make(map[string]bool)
    var mux sync.Mutex
    var wg sync.WaitGroup

    var crawler func(string,int)
    crawler = func(url string, depth int) {
        defer wg.Done()

        if depth <= 0 {
            return
        }   

        mux.Lock()
        if _, ok := str_map[url]; ok {
            mux.Unlock()
            return
        }
        str_map[url] = true
        mux.Unlock()

        body, urls, err := fetcher.Fetch(url)
        if err != nil {
            fmt.Println(err)
            return
        }
        fmt.Printf("found: %s %q %q\n", url, body, urls)

        for _, u := range urls {
            wg.Add(1)
            go crawler(u, depth-1)          
        }       
    }
    wg.Add(1)
    crawler(url,depth)
    wg.Wait()   
}

func main() {
    Crawl("http://golang.org/", 4, fetcher)
}

Answer 3 (score: 1)

Here is my solution. I have a "master" routine that listens on a channel of URLs and starts new crawling routines (which put the crawled URLs into that channel) whenever it finds new URLs to crawl.

Instead of closing the channel explicitly, I keep a counter of unfinished crawling goroutines; when the counter reaches 0 the program exits, because there is nothing left to wait for.

func doCrawl(url string, fetcher Fetcher, results chan []string) {
    body, urls, err := fetcher.Fetch(url)
    results <- urls

    if err != nil {
        fmt.Println(err)
    } else {
        fmt.Printf("found: %s %q\n", url, body)
    }
}

func Crawl(url string, depth int, fetcher Fetcher) {
    results := make(chan []string)
    crawled := make(map[string]bool)
    go doCrawl(url, fetcher, results)
    // counter for unfinished crawling goroutines
    toWait := 1

    for urls := range results {
        toWait--

        for _, u := range urls {
            if !crawled[u] {
                crawled[u] = true
                go doCrawl(u, fetcher, results)
                toWait++
            }
        }

        if toWait == 0 {
            break
        }
    }
}

Answer 4 (score: 1)

Here is my version (inspired by @fasmat). It prevents fetching the same URL twice by using a custom cache backed by a sync.RWMutex.

type Cache struct {
    data map[string]fakeResult
    mux sync.RWMutex
}

var cache = Cache{data: make(map[string]fakeResult)}

//cache adds new page to the global cache
func (c *Cache) cache(url string) fakeResult {
    c.mux.Lock()

    body, urls, err := fetcher.Fetch(url)
    if err != nil {
        body = err.Error()
    }

    data := fakeResult{body, urls}
    c.data[url] = data

    c.mux.Unlock()

    return data
}

// Visit visits the page at the given url and caches it if needed.
func (c *Cache) Visit(url string) (data fakeResult, alreadyCached bool) {
    c.mux.RLock()
    data, alreadyCached = c.data[url]
    c.mux.RUnlock()

    if !alreadyCached {
        data = c.cache(url)
    }

    return data, alreadyCached
}

/*
Crawl crawls all pages reachable from url within the depth given by the arguments.
It fetches pages using the given fetcher, caches them in the global cache, and
continuously sends newly discovered pages to the out channel.
*/
func Crawl(url string, depth int, fetcher Fetcher, out chan string) {
    defer close(out)

    if depth <= 0 {
        return
    }

    data, alreadyCached := cache.Visit(url)
    if alreadyCached {
        return
    }

    //send newly discovered page to out channel
    out <- fmt.Sprintf("found: %s %q", url, data.body)

    //visit linked pages
    res := make([]chan string, len(data.urls))

    for i, link := range data.urls {
        res[i] = make(chan string)
        go Crawl(link, depth-1, fetcher, res[i])
    }

    //send newly discovered pages from links to out channel
    for i := range res {
        for s := range res[i] {
            out <- s
        }
    }
}

func main() {
    res := make(chan string)
    go Crawl("https://golang.org/", 4, fetcher, res)

    for page := range res {
        fmt.Println(page)   
    }
}

Apart from not fetching any URL twice, this solution does not rely on knowing the total number of pages in advance (it works for any number of pages), and it does not artificially bound the running time with a timer.

Answer 5 (score: 1)

I passed a SafeCounter and a sync.WaitGroup into the Crawl function, using the SafeCounter to skip URLs that have already been visited and the WaitGroup to keep the current goroutine from exiting early.

func Crawl(url string, depth int, fetcher Fetcher, c *SafeCounter, wg *sync.WaitGroup) {
    defer wg.Done()

    if depth <= 0 {
        return
    }

    c.mux.Lock()
    c.v[url]++
    c.mux.Unlock()

    body, urls, err := fetcher.Fetch(url)
    if err != nil {
        fmt.Println(err)
        return
    }
    fmt.Printf("found: %s %q\n", url, body)
    for _, u := range urls {
        c.mux.Lock()
        i := c.v[u]
        c.mux.Unlock()
        if i == 1 {
            continue
        }
        wg.Add(1)
        go Crawl(u, depth-1, fetcher, c, wg)
    }
    return
}

func main() {
    c := SafeCounter{v: make(map[string]int)}
    var wg sync.WaitGroup
    wg.Add(1)
    Crawl("https://golang.org/", 4, fetcher, &c, &wg)
    wg.Wait()
}

Answer 6 (score: 1)

I think using a map (the same way we would use a set in other languages) together with a mutex is the easiest approach:

func Crawl(url string, depth int, fetcher Fetcher) {
    mux.Lock()
    defer mux.Unlock()
    if depth <= 0 || IsVisited(url) {
        return
    }
    visit[url] = true
    body, urls, err := fetcher.Fetch(url)
    if err != nil {
        fmt.Println(err)
        return
    }
    fmt.Printf("found: %s %q\n", url, body)
    for _, u := range urls {
        go Crawl(u, depth-1, fetcher)
    }
    return
}

func IsVisited(s string) bool {
    _, ok := visit[s]
    return ok
}

var mux sync.Mutex

var visit = make(map[string]bool)

func main() {
    Crawl("https://golang.org/", 4, fetcher)
    time.Sleep(time.Second)
}

Answer 7 (score: 0)

Here is my solution. I had a problem in that the main function does not wait for the goroutines to print their status and finish. I checked that the previous slide used the approach of waiting one second before exiting, so I decided to do the same. In practice, though, I believe some coordination mechanism is better (see the sketch after the code).

import (
    "fmt"
    "sync"
    "time"
)

type SafeMap struct {
    mu sync.Mutex
    v  map[string]bool
}

// Sets the given key to true.
func (sm *SafeMap) Set(key string) {
    sm.mu.Lock()
    sm.v[key] = true
    sm.mu.Unlock()
}

// Get returns the current value for the given key.
func (sm *SafeMap) Get(key string) bool {
    sm.mu.Lock()
    defer sm.mu.Unlock()
    return sm.v[key]
}

var safeMap = SafeMap{v: make(map[string]bool)}

type Fetcher interface {
    // Fetch returns the body of URL and
    // a slice of URLs found on that page.
    Fetch(url string) (body string, urls []string, err error)
}

// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher) {
    if depth <= 0 {
        return
    }
    
    // if the value exists, don't fetch it twice
    if safeMap.Get(url) {
        return  
    }
    
    // check if there is an error fetching
    body, urls, err := fetcher.Fetch(url)
    safeMap.Set(url)
    if err != nil {
        fmt.Println(err)
        return
    }
    
    // list contents and crawl recursively
    fmt.Printf("found: %s %q\n", url, body)
    for _, u := range urls {
        go Crawl(u, depth-1, fetcher)
    }
}

func main() {
    go Crawl("https://golang.org/", 4, fetcher)
    time.Sleep(time.Second)
}
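
Not part of the original answer: a minimal sketch of that coordination, keeping the SafeMap, Fetcher, and fetcher defined above but replacing the final time.Sleep with a sync.WaitGroup so main waits for the whole crawl tree.

var wg sync.WaitGroup

// Crawl is the same as above, except every call is registered with the
// WaitGroup before it starts and deregisters itself when it returns.
func Crawl(url string, depth int, fetcher Fetcher) {
    defer wg.Done()
    if depth <= 0 || safeMap.Get(url) {
        return
    }
    body, urls, err := fetcher.Fetch(url)
    safeMap.Set(url)
    if err != nil {
        fmt.Println(err)
        return
    }
    fmt.Printf("found: %s %q\n", url, body)
    for _, u := range urls {
        wg.Add(1) // register the child before spawning it
        go Crawl(u, depth-1, fetcher)
    }
}

func main() {
    wg.Add(1)
    go Crawl("https://golang.org/", 4, fetcher)
    wg.Wait()
}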

Answer 8 (score: 0)

I'm new here, so take this with a grain of salt, but this solution seems more idiomatic to me. It uses a single channel for all of the results, a single channel for all of the crawl requests (an attempt to crawl a specific URL), and a wait group to keep track of completion. The main Crawl call acts as the distributor of crawl requests to the worker goroutines (while handling deduplication) and as the tracker of how many crawl requests are still pending.

package main

import (
    "fmt"
    "sync"
)

type Fetcher interface {
    // Fetch returns the body of URL and
    // a slice of URLs found on that page.
    Fetch(url string) (body string, urls []string, err error)
}

type FetchResult struct {
    url  string
    body string
    err  error
}

type CrawlRequest struct {
    url   string
    depth int
}

type Crawler struct {
    depth           int
    fetcher         Fetcher
    results         chan FetchResult
    crawlRequests   chan CrawlRequest
    urlReservations map[string]bool
    waitGroup       *sync.WaitGroup
}

func (crawler Crawler) Crawl(url string, depth int) {
    defer crawler.waitGroup.Done()

    if depth <= 0 {
        return
    }

    body, urls, err := crawler.fetcher.Fetch(url)
    crawler.results <- FetchResult{url, body, err}

    if len(urls) == 0 {
        return
    }

    crawler.waitGroup.Add(len(urls))
    for _, url := range urls {
        crawler.crawlRequests <- CrawlRequest{url, depth - 1}
    }
}

// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher) (results chan FetchResult) {
    results = make(chan FetchResult)
    urlReservations := make(map[string]bool)
    crawler := Crawler{
        crawlRequests: make(chan CrawlRequest),
        depth:         depth,
        fetcher:       fetcher,
        results:       results,
        waitGroup:     &sync.WaitGroup{},
    }

    crawler.waitGroup.Add(1)

    // Listen for crawlRequests, pass them through to the caller if they aren't duplicates.
    go func() {
        for crawlRequest := range crawler.crawlRequests {
            if _, isReserved := urlReservations[crawlRequest.url]; isReserved {
                crawler.waitGroup.Done()
                continue
            }
            urlReservations[crawlRequest.url] = true
            go crawler.Crawl(crawlRequest.url, crawlRequest.depth)
        }
    }()

    // Wait for the wait group to finish, and then close the channel
    go func() {
        crawler.waitGroup.Wait()
        close(results)
    }()

    // Send the first crawl request to the channel
    crawler.crawlRequests <- CrawlRequest{url, depth}

    return
}

func main() {
    results := Crawl("https://golang.org/", 4, fetcher)
    for result := range results {
        if result.err != nil {
            fmt.Println(result.err)
            continue
        }
        fmt.Printf("found: %s %q\n", result.url, result.body)
    }

    fmt.Printf("done!")
}

// fakeFetcher is Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult

type fakeResult struct {
    body string
    urls []string
}

func (f fakeFetcher) Fetch(url string) (string, []string, error) {
    if res, ok := f[url]; ok {
        return res.body, res.urls, nil
    }
    return "", nil, fmt.Errorf("not found: %s", url)
}

// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
    "https://golang.org/": &fakeResult{
        "The Go Programming Language",
        []string{
            "https://golang.org/pkg/",
            "https://golang.org/cmd/",
        },
    },
    "https://golang.org/pkg/": &fakeResult{
        "Packages",
        []string{
            "https://golang.org/",
            "https://golang.org/cmd/",
            "https://golang.org/pkg/fmt/",
            "https://golang.org/pkg/os/",
        },
    },
    "https://golang.org/pkg/fmt/": &fakeResult{
        "Package fmt",
        []string{
            "https://golang.org/",
            "https://golang.org/pkg/",
        },
    },
    "https://golang.org/pkg/os/": &fakeResult{
        "Package os",
        []string{
            "https://golang.org/",
            "https://golang.org/pkg/",
        },
    },
}

Answer 9 (score: 0)

/*
Exercise: Web Crawler
In this exercise you'll use Go's concurrency features to parallelize a web crawler.

Modify the Crawl function to fetch URLs in parallel without fetching the same URL twice.

Hint: you can keep a cache of the URLs that have been fetched on a map, but maps alone are not safe for concurrent use!
*/

package main

import (
    "fmt"
    "sync"
    "time"
)

type Fetcher interface {
    // Fetch returns the body of URL and
    // a slice of URLs found on that page.
    Fetch(url string) (body string, urls []string, err error)
}

type Response struct {
    url  string
    urls []string
    body string
    err  error
}

var ch chan Response = make(chan Response)
var fetched map[string]bool = make(map[string]bool)
var wg sync.WaitGroup
var mu sync.Mutex

// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher) {
    // TODO: Fetch URLs in parallel.
    // TODO: Don't fetch the same URL twice.
    // This implementation doesn't do either:
    var fetch func(url string, depth int, fetcher Fetcher)
    wg.Add(1)
    recv := func() {
        for res := range ch {
            body, err := res.body, res.err
            if err != nil {
                fmt.Println(err)
                continue
            }
            fmt.Printf("found: %s %q\n", res.url, body)
        }
    }

    fetch = func(url string, depth int, fetcher Fetcher) {
        time.Sleep(time.Second / 2)
        defer wg.Done()
        if depth <= 0 {
            return
        }
        // Check and mark the URL under the same lock so two goroutines
        // cannot both decide to fetch it.
        mu.Lock()
        if fetched[url] {
            mu.Unlock()
            return
        }
        fetched[url] = true
        mu.Unlock()
        body, urls, err := fetcher.Fetch(url)
        for _, u := range urls {
            wg.Add(1)
            go fetch(u, depth-1, fetcher)
        }
        ch <- Response{url, urls, body, err}
    }

    go fetch(url, depth, fetcher)
    go recv()
    return
}

func main() {
    Crawl("https://golang.org/", 4, fetcher)
    wg.Wait()
}

// fakeFetcher is Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult

type fakeResult struct {
    body string
    urls []string
}

func (f fakeFetcher) Fetch(url string) (string, []string, error) {
    if res, ok := f[url]; ok {
        return res.body, res.urls, nil
    }
    return "", nil, fmt.Errorf("not found: %s", url)
}

// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
    "https://golang.org/": &fakeResult{
        "The Go Programming Language",
        []string{
            "https://golang.org/pkg/",
            "https://golang.org/cmd1/",
        },
    },
    "https://golang.org/pkg/": &fakeResult{
        "Packages",
        []string{
            "https://golang.org/",
            "https://golang.org/cmd2/",
            "https://golang.org/pkg/fmt/",
            "https://golang.org/pkg/os/",
        },
    },
    "https://golang.org/pkg/fmt/": &fakeResult{
        "Package fmt",
        []string{
            "https://golang.org/",
            "https://golang.org/pkg/",
        },
    },
    "https://golang.org/pkg/os/": &fakeResult{
        "Package os",
        []string{
            "https://golang.org/",
            "https://golang.org/pkg/",
        },
    },
}

https://gist.github.com/gaogao1030/5d63ed925534f3610ccb7e25ed46992a

Answer 10 (score: 0)

Since most of the solutions here don't work for me (including the accepted answer), I'll upload my own, inspired by Kamil (special thanks :)). No duplicates, all valid URLs.

package main

import (
    "fmt"
    "runtime"
    "sync"
)

type Fetcher interface {
    // Fetch returns the body of URL and
    // a slice of URLs found on that page.
    Fetch(url string) (body string, urls []string, err error)
}

// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher, set map[string]bool) {
    // TODO: Fetch URLs in parallel.
    // TODO: Don't fetch the same URL twice.
    defer wg.Done()
    if depth <= 0 {
        return
    }
    // use a set to identify if the URL should be traversed or not
    fmt.Println(runtime.NumGoroutine())
    set[url] = true
    body, urls, err := fetcher.Fetch(url)
    if err != nil {
        fmt.Println(err)
        return
    }
    fmt.Printf("found: %s %q\n", url, body)
    for _, u := range urls {
        if set[u] == false {
            wg.Add(1)
            go Crawl(u, depth-1, fetcher, set)
        }
    }
}

var wg sync.WaitGroup

func main() {
    collectedURLs := make(map[string]bool)
    Crawl("https://golang.org/", 4, fetcher, collectedURLs)
    wg.Wait()
}

Answer 11 (score: 0)

Here is my solution :)

package main

import (
    "fmt"
    "runtime"
    "sync"
)

type Fetcher interface {
    // Fetch returns the body of URL and
    // a slice of URLs found on that page.
    Fetch(url string) (body string, urls []string, err error)
}

// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher, set map[string]bool) {
    // TODO: Fetch URLs in parallel.
    // TODO: Don't fetch the same URL twice.
    // This implementation doesn't do either:
    if depth <= 0 {
        return
    }
    // use a set to identify if the URL should be traversed or not
    if set[url] == true {
        wg.Done()
        return
    } else {
        fmt.Println(runtime.NumGoroutine())
        set[url] = true
        body, urls, err := fetcher.Fetch(url)
        if err != nil {
            fmt.Println(err)
            return
        }
        fmt.Printf("found: %s %q\n", url, body)
        for _, u := range urls {
            Crawl(u, depth-1, fetcher, set)
        }

    }

}

var wg sync.WaitGroup

func main() {
    wg.Add(6)
    collectedURLs := make(map[string]bool)
    go Crawl("https://golang.org/", 4, fetcher, collectedURLs)
    wg.Wait()
}

// fakeFetcher is Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult

type fakeResult struct {
    body string
    urls []string
}

func (f fakeFetcher) Fetch(url string) (string, []string, error) {
    if res, ok := f[url]; ok {
        return res.body, res.urls, nil
    }
    return "", nil, fmt.Errorf("not found: %s", url)
}

// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
    "https://golang.org/": &fakeResult{
        "The Go Programming Language",
        []string{
            "https://golang.org/pkg/",
            "https://golang.org/cmd/",
        },
    },
    "https://golang.org/pkg/": &fakeResult{
        "Packages",
        []string{
            "https://golang.org/",
            "https://golang.org/cmd/",
            "https://golang.org/pkg/fmt/",
            "https://golang.org/pkg/os/",
        },
    },
    "https://golang.org/pkg/fmt/": &fakeResult{
        "Package fmt",
        []string{
            "https://golang.org/",
            "https://golang.org/pkg/",
        },
    },
    "https://golang.org/pkg/os/": &fakeResult{
        "Package os",
        []string{
            "https://golang.org/",
            "https://golang.org/pkg/",
        },
    },
}

Answer 12 (score: 0)

Below is my solution. Apart from the global map, I only had to change the contents of Crawl. Like the other solutions, I used sync.Map and sync.WaitGroup. I've boxed off the important parts.

var m sync.Map

// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher) {
    // This implementation doesn't do either:
    if depth <= 0 {
        return
    }
    // Don't fetch the same URL twice.
    /////////////////////////////////////
    _, ok := m.LoadOrStore(url, url)   //
    if ok {                            //
        return                         //
    }                                  //
    /////////////////////////////////////
    body, urls, err := fetcher.Fetch(url)
    if err != nil {
        fmt.Println(err)
        return
    }
    fmt.Printf("found: %s %q\n", url, body)
    // Fetch URLs in parallel.
    /////////////////////////////////////
    var wg sync.WaitGroup              //
    defer wg.Wait()                    //
    for _, u := range urls {           //
        wg.Add(1)                      //
        go func(u string) {            //
            defer wg.Done()            //
            Crawl(u, depth-1, fetcher) //
        }(u)                           //
    }                                  //
    /////////////////////////////////////
    return
}

Answer 13 (score: 0)

Similar idea to the accepted answer, but without fetching duplicate URLs, and printing directly to the console. defer() is not used either. We use channels to signal when a goroutine completes. The SafeMap idea is lifted from the SafeCounter given earlier in the tour.

For the child goroutines we create an array of channels, and wait for every child to return by waiting on its channel.

package main

import (
    "fmt"
    "sync"
)

// SafeMap is safe to use concurrently.
type SafeMap struct {
    v   map[string] bool
    mux sync.Mutex
}

// SetVal sets the value for the given key.
func (m *SafeMap) SetVal(key string, val bool) {
    m.mux.Lock()
    // Lock so only one goroutine at a time can access the map c.v.
    m.v[key] = val
    m.mux.Unlock()
}

// Value returns the current value of the counter for the given key.
func (m *SafeMap) GetVal(key string) bool {
    m.mux.Lock()
    // Lock so only one goroutine at a time can access the map c.v.
    defer m.mux.Unlock()
    return m.v[key]
}

type Fetcher interface {
    // Fetch returns the body of URL and
    // a slice of URLs found on that page.
    Fetch(url string) (body string, urls []string, err error)
}

// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher, status chan bool, urlMap *SafeMap) {

    // Check if we fetched this url previously.
    if ok := urlMap.GetVal(url); ok {
        //fmt.Println("Already fetched url!")
        status <- true
        return 
    }

    // Marking this url as fetched already.
    urlMap.SetVal(url, true)

    if depth <= 0 {
        status <- false
        return
    }

    body, urls, err := fetcher.Fetch(url)
    if err != nil {
        fmt.Println(err)
        status <- false
        return
    }

    fmt.Printf("found: %s %q\n", url, body)

    statuses := make ([]chan bool, len(urls)) 
    for index, u := range urls {
        statuses[index] = make (chan bool)
        go Crawl(u, depth-1, fetcher, statuses[index], urlMap)
    }

    // Wait for child goroutines.
    for _, childstatus := range(statuses) {
        <- childstatus
    }

    // And now this goroutine can finish.
    status <- true

    return
}

func main() {
    urlMap := SafeMap{v: make(map[string] bool)}
    status := make(chan bool)
    go Crawl("https://golang.org/", 4, fetcher, status, urlMap)
    <- status
}

Answer 14 (score: 0)

Below is a simple solution for parallelization using only sync.WaitGroup.

var fetchedUrlMap = make(map[string]bool)
var mutex sync.Mutex

func Crawl(url string, depth int, fetcher Fetcher) {
	if depth <= 0 {
		return
	}

	// Check and mark the URL under one lock so concurrent goroutines
	// cannot both decide to fetch it.
	mutex.Lock()
	if _, ok := fetchedUrlMap[url]; ok {
		mutex.Unlock()
		return
	}
	fetchedUrlMap[url] = true
	mutex.Unlock()

	body, urls, err := fetcher.Fetch(url)
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Printf("found: %s %q\n", url, body)

	var wg sync.WaitGroup
	for _, u := range urls {
		//	fmt.Println("Solving for ", u)
		wg.Add(1)
		go func(uv string) {
			Crawl(uv, depth-1, fetcher)
			wg.Done()
		}(u)
	}
	wg.Wait()
}

Answer 15 (score: 0)

Here is my solution, using sync.WaitGroup and a SafeCache of fetched URLs:

package main

import (
    "fmt"
    "sync"
)

type Fetcher interface {
    // Fetch returns the body of URL and
    // a slice of URLs found on that page.
    Fetch(url string) (body string, urls []string, err error)
}

// Safe to use concurrently
type SafeCache struct {
    fetched map[string]string
    mux     sync.Mutex
}

func (c *SafeCache) Add(url, body string) {
    c.mux.Lock()
    defer c.mux.Unlock()

    if _, ok := c.fetched[url]; !ok {
        c.fetched[url] = body
    }
}

func (c *SafeCache) Contains(url string) bool {
    c.mux.Lock()
    defer c.mux.Unlock()

    _, ok := c.fetched[url]
    return ok
}

// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher, cache *SafeCache,
    wg *sync.WaitGroup) {

    defer wg.Done()
    if depth <= 0 {
        return
    }
    body, urls, err := fetcher.Fetch(url)
    if err != nil {
        fmt.Println(err)
        return
    }
    fmt.Printf("found: %s %q\n", url, body)
    cache.Add(url, body)
    for _, u := range urls {
        if !cache.Contains(u) {
            wg.Add(1)
            go Crawl(u, depth-1, fetcher, cache, wg)
        }
    }
    return
}

func main() {
    cache := SafeCache{fetched: make(map[string]string)}
    var wg sync.WaitGroup

    wg.Add(1)
    Crawl("http://golang.org/", 4, fetcher, cache, &wg)
    wg.Wait()
}

Answer 16 (score: 0)

I used a slice to avoid crawling the same URL twice. The recursive version without concurrency is fine, but I'm not sure about this concurrent version.

func Crawl(url string, depth int, fetcher Fetcher) {
    var str_arrs []string
    var mux sync.Mutex

    var crawl func(string, int)
    crawl = func(url string, depth int) {
        if depth <= 0 {
            return
        }

        mux.Lock()
        for _, v := range str_arrs {
            if url == v {
                mux.Unlock()
                return
            }
        }
        str_arrs = append(str_arrs, url)
        mux.Unlock()

        body, urls, err := fetcher.Fetch(url)
        if err != nil {
            fmt.Println(err)
            return
        }
        fmt.Printf("found: %s %q\n", url, body)
        for _, u := range urls {
            go crawl(u, depth-1) // could delete "go", then it is purely recursive
        }
    }

    crawl(url, depth)
    return
}

func main() {
    Crawl("http://golang.org/", 4, fetcher)
}

Answer 17 (score: 0)

I implemented it with a simple channel to which all the goroutines send their messages. To make sure the channel is closed when there are no goroutines left, I use a safe counter that closes the channel when the count reaches 0.

type Msg struct {
    url string
    body string
}

type SafeCounter struct {
    v int
    mux sync.Mutex
}

func (c *SafeCounter) inc() {
    c.mux.Lock()
    defer c.mux.Unlock()
    c.v++   
}

func (c *SafeCounter) dec(ch chan Msg) {
    c.mux.Lock()
    defer c.mux.Unlock()
    c.v--
    if c.v == 0 {
        close(ch)
    }
}

var goes SafeCounter = SafeCounter{v: 0}

// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher, ch chan Msg) {
    defer goes.dec(ch)
    // TODO: Fetch URLs in parallel.
    // TODO: Don't fetch the same URL twice.
    // This implementation doesn't do either:
    if depth <= 0 {
        return
    }
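    // cache is assumed to be a concurrency-safe set of already-seen URLs
    // (its definition is not shown in this answer).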
    if !cache.existsAndRegister(url) {
        body, urls, err :=  fetcher.Fetch(url)
        if err != nil {
            fmt.Println(err)
            return
        }
        ch <- Msg{url, body}
        for _, u := range urls {
            goes.inc()
            go Crawl(u, depth-1, fetcher, ch)
        }
    }
    return
}

func main() {
    ch := make(chan Msg, 100)
    goes.inc()
    go Crawl("http://golang.org/", 4, fetcher, ch)
    for m := range ch {
        fmt.Printf("found: %s %q\n", m.url, m.body)
    }
}

Note that the safe counter must be incremented outside of the goroutine.
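
To make that remark concrete against the code above: inc() must run in the parent, before the go statement, so the counter already accounts for the child when the parent's deferred dec() fires. A sketch of the correct placement next to a hypothetical racy variant:

for _, u := range urls {
    goes.inc() // counted by the parent before the child starts
    go Crawl(u, depth-1, fetcher, ch)
}

// Racy variant (do not do this): the parent may return, dec() the counter
// to zero and close ch before any of these goroutines is scheduled and
// gets a chance to call inc().
for _, u := range urls {
    go func(u string) {
        goes.inc()
        Crawl(u, depth-1, fetcher, ch)
    }(u)
}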