A Tour of Go Exercise: Web Crawler - all goroutines are asleep - deadlock


Exercise source:

Instructions:

In this exercise you'll use Go's concurrency features to parallelize a web crawler.

Modify the Crawl function to fetch URLs in parallel without fetching the same URL twice.

Hint: you can keep a cache of the URLs that have been fetched on a map, but maps alone are not safe for concurrent use!
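The usual way to make such a cache safe is to guard the map with a mutex; my answer below follows this pattern with the package-level crawledURLs map and mux. As a minimal standalone sketch of the pattern (the safeSet name and visit method are illustrative, not part of the exercise):

package main

import (
    "fmt"
    "sync"
)

// safeSet is a mutex-guarded set of visited URLs.
type safeSet struct {
    mu   sync.Mutex
    seen map[string]bool
}

// visit marks url as seen and reports whether it was new.
func (s *safeSet) visit(url string) bool {
    s.mu.Lock()
    defer s.mu.Unlock()
    if s.seen[url] {
        return false
    }
    s.seen[url] = true
    return true
}

func main() {
    s := &safeSet{seen: make(map[string]bool)}
    fmt.Println(s.visit("https://golang.org/")) // true: first visit
    fmt.Println(s.visit("https://golang.org/")) // false: already seen
}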

Here is my answer:

package main

import (
    "fmt"
    "sync"
)

type Fetcher interface {
    // Fetch returns the body of URL and
    // a slice of URLs found on that page.
    Fetch(url string) (body string, urls []string, err error)
}

var crawledURLs = make(map[string]bool)
var mux sync.Mutex

func CrawlURL(url string, depth int, fetcher Fetcher, quit chan bool) {
    defer func() { quit <- true }()
    if depth <= 0 {
        return
    }

    mux.Lock()
    _, isCrawled := crawledURLs[url]
    if isCrawled {
        return
    }
    crawledURLs[url] = true
    mux.Unlock()

    body, urls, err := fetcher.Fetch(url)
    if err != nil {
        fmt.Println(err)
        return
    }
    fmt.Printf("found: %s %q\n", url, body)
    quitThis := make(chan bool)
    for _, u := range urls {
        go CrawlURL(u, depth-1, fetcher, quitThis)
    }
    for range urls {
        <-quitThis
    }
    return
}

// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher) {
    CrawlURL(url, depth, fetcher, make(chan bool))
    return
}

func main() {
    Crawl("https://golang.org/", 4, fetcher)
}

// fakeFetcher is Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult

type fakeResult struct {
    body string
    urls []string
}

func (f fakeFetcher) Fetch(url string) (string, []string, error) {
    if res, ok := f[url]; ok {
        return res.body, res.urls, nil
    }
    return "", nil, fmt.Errorf("not found: %s", url)
}

// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
    "https://golang.org/": &fakeResult{
        "The Go Programming Language",
        []string{
            "https://golang.org/pkg/",
            "https://golang.org/cmd/",
        },
    },
    "https://golang.org/pkg/": &fakeResult{
        "Packages",
        []string{
            "https://golang.org/",
            "https://golang.org/cmd/",
            "https://golang.org/pkg/fmt/",
            "https://golang.org/pkg/os/",
        },
    },
    "https://golang.org/pkg/fmt/": &fakeResult{
        "Package fmt",
        []string{
            "https://golang.org/",
            "https://golang.org/pkg/",
        },
    },
    "https://golang.org/pkg/os/": &fakeResult{
        "Package os",
        []string{
            "https://golang.org/",
            "https://golang.org/pkg/",
        },
    },
}
I would like to know why the deadlock happens. Is it because I am using the channel incorrectly?


I then noticed that I forgot to release the mutex in the if isCrawled {} branch, so I edited the code as follows:

...
    if isCrawled {
        mux.Unlock() // added this line
        return
    }
...
But the deadlock was still there, and the output was different:

found: https://golang.org/ "The Go Programming Language"
not found: https://golang.org/cmd/
found: https://golang.org/pkg/ "Packages"
found: https://golang.org/pkg/os/ "Package os"
found: https://golang.org/pkg/fmt/ "Package fmt"
fatal error: all goroutines are asleep - deadlock!

The main problem was forgetting to release the mutex before returning from the if isCrawled {} branch.

Besides, if you really need to synchronize goroutines, I would suggest using the sync API. Channels are better used for communicating and sharing data.

Here is the solution using sync.WaitGroup:
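A minimal sketch of what such a WaitGroup-based version could look like; it reuses the Fetcher interface and the crawledURLs/mux globals from the question, so only the two crawl functions are shown as drop-in replacements:

func CrawlURL(url string, depth int, fetcher Fetcher, wg *sync.WaitGroup) {
    // Done is deferred so the parent's Wait observes this
    // goroutine's completion on every return path.
    defer wg.Done()
    if depth <= 0 {
        return
    }

    mux.Lock()
    if crawledURLs[url] {
        mux.Unlock()
        return
    }
    crawledURLs[url] = true
    mux.Unlock()

    body, urls, err := fetcher.Fetch(url)
    if err != nil {
        fmt.Println(err)
        return
    }
    fmt.Printf("found: %s %q\n", url, body)
    for _, u := range urls {
        wg.Add(1) // register the child before it starts
        go CrawlURL(u, depth-1, fetcher, wg)
    }
}

// Crawl blocks until the whole crawl tree has finished.
func Crawl(url string, depth int, fetcher Fetcher) {
    var wg sync.WaitGroup
    wg.Add(1)
    go CrawlURL(url, depth, fetcher, &wg)
    wg.Wait()
}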

And here is your solution using only channels (the full program appears at the end of this post):


The problem is that the first time you call CrawlURL(), you are not reading from the channel passed as an argument. So as soon as that function tries to send on it via defer func() { quit <- true }(), it blocks forever.
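In isolation, that failure mode looks like this minimal sketch:

package main

func main() {
    quit := make(chan bool)
    // Unbuffered send with no receiver anywhere: the main
    // goroutine blocks here, no other goroutine exists to
    // receive, and the runtime aborts with
    // "fatal error: all goroutines are asleep - deadlock!"
    quit <- true
}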
Thanks for your help. However, the Go Tour did not introduce the sync API, so I prefer the solution that uses only channels. Besides, even when I add mux.Unlock() in the if isCrawled {} branch, the deadlock still shows up in the output:
found: https://golang.org/ "The Go Programming Language"
not found: https://golang.org/cmd/
found: https://golang.org/pkg/ "Packages"
found: https://golang.org/pkg/os/ "Package os"
found: https://golang.org/pkg/fmt/ "Package fmt"
fatal error: all goroutines are asleep - deadlock!
package main

import (
    "fmt"
    "sync"
)

type Fetcher interface {
    // Fetch returns the body of URL and
    // a slice of URLs found on that page.
    Fetch(url string) (body string, urls []string, err error)
}

var crawledURLs = make(map[string]bool)
var mux sync.Mutex

func CrawlURL(url string, depth int, fetcher Fetcher, quit chan bool) {
    // For the very first invocation, this deferred send would block
    // forever if nobody were receiving from the other end of the channel.
    defer func() { quit <- true }()

    if depth <= 0 {
        return
    }

    mux.Lock()
    _, isCrawled := crawledURLs[url]
    if isCrawled {
        mux.Unlock()
        return
    }
    crawledURLs[url] = true
    mux.Unlock()

    body, urls, err := fetcher.Fetch(url)
    if err != nil {
        fmt.Println(err)
        return
    }
    fmt.Printf("found: %s %q\n", url, body)
    quitThis := make(chan bool)
    for _, u := range urls {
        go CrawlURL(u, depth-1, fetcher, quitThis)
    }
    for range urls {
        <-quitThis
    }
    return
}

// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher) {
    lastQuit := make(chan bool)
    go CrawlURL(url, depth, fetcher, lastQuit)
    // You need to receive from this channel in order to
    // unblock the deferred send in the goroutine above.
    <-lastQuit
    return
}

func main() {
    Crawl("https://golang.org/", 10, fetcher)
}

// fakeFetcher is Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult

type fakeResult struct {
    body string
    urls []string
}

func (f fakeFetcher) Fetch(url string) (string, []string, error) {
    if res, ok := f[url]; ok {
        return res.body, res.urls, nil
    }
    return "", nil, fmt.Errorf("not found: %s", url)
}

// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
    "https://golang.org/": &fakeResult{
        "The Go Programming Language",
        []string{
            "https://golang.org/pkg/",
            "https://golang.org/cmd/",
        },
    },
    "https://golang.org/pkg/": &fakeResult{
        "Packages",
        []string{
            "https://golang.org/",
            "https://golang.org/cmd/",
            "https://golang.org/pkg/fmt/",
            "https://golang.org/pkg/os/",
        },
    },
    "https://golang.org/pkg/fmt/": &fakeResult{
        "Package fmt",
        []string{
            "https://golang.org/",
            "https://golang.org/pkg/",
        },
    },
    "https://golang.org/pkg/os/": &fakeResult{
        "Package os",
        []string{
            "https://golang.org/",
            "https://golang.org/pkg/",
        },
    },
}