Warning: file_get_contents(/data/phpspider/zhask/data//catemap/2/spring/12.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
go中的对象正在被替换_Go - Fatal编程技术网

go中的对象正在被替换

go中的对象正在被替换,go,Go,我正在学习写蜘蛛。我正在尝试从allpages.com获取所有业务类别的列表 下面是我的整个计划。不幸的是,我无法隔离问题,因此我已将其全部粘贴 如果你运行这个程序,你会看到它首先正确地下载了第一页,并将所有提取的类别添加到类别列表中 然而,当它随后下载后续页面时,似乎会弄乱对父类别的引用。例如,它错误地计算URLhttp://www.allpages.com/travel-tourism/political-ideological-organizations/,而事实上政治意识形态组织/不是旅

我正在学习写蜘蛛。我正在尝试从
allpages.com
获取所有业务类别的列表

下面是我的整个计划。不幸的是,我无法隔离问题,因此我已将其全部粘贴

如果你运行这个程序,你会看到它首先正确地下载了第一页,并将所有提取的类别添加到类别列表中

然而,当它随后下载后续页面时,似乎会弄乱对父类别的引用。例如,它错误地计算URL
http://www.allpages.com/travel-tourism/political-ideological-organizations/
,而事实上
政治意识形态组织/
不是
旅游/
的子类别。通过挖掘日志,它似乎覆盖了
父对象中的数据。工人越多,错误就越明显

在我开始通过引用goroutine来传递数据之前,这个方法工作得稍微好一点,但我基本上遇到了相同的问题

我有几个问题:

  • 如何在不通过日志行拾取的情况下调试它
  • 出了什么问题/为什么不工作?如何修复

    package main
    
    import (
            "fmt"
            "github.com/PuerkitoBio/goquery"
            "log"
            "strconv"
            "strings"
            "regexp"
    )
    
    const domain = "http://www.allpages.com/"
    const categoryPage = "category.html"
    
    type Category struct {
            url string
            level uint
            name string
            entries int
            parent *Category
    }
    
    type DownloadResult struct {
            doc *goquery.Document
            category *Category
    }
    
    const WORKERS = 2
    const SEPARATOR = "§§§"
    
    func main() {
    
            allCategories := make([]Category, 0)
    
            downloadChannel := make(chan *Category)
            resultsChannel := make(chan *DownloadResult, 100)
    
            for w := 1; w <= WORKERS; w++ {
                    go worker(downloadChannel, resultsChannel)
            }
    
            numRequests := 1
            downloadChannel <- &Category{ domain + categoryPage, 0, "root", 0, nil }
    
            for result := range resultsChannel {
                    var extractor func(doc *goquery.Document) []string
    
                    if result.category.level == 0 {
                            extractor = topLevelExtractor
                    } else if result.category.level == 1 {
                            extractor = secondLevelExtractor
                    } else {
                            extractor = thirdLevelExtractor
                    }
    
                    categories := extractCategories(result.doc, result.category, extractor)
                    allCategories = append(allCategories, *categories...)
    
                    //fmt.Printf("Appending categories: %v", *categories)
    
                    fmt.Printf("total categories = %d, total requests = %d\n", len(allCategories), numRequests)
    
                    for _, category := range *categories {
                            numRequests += 1
                            downloadChannel <- &category
                    }
    
                    // close the channels when there are no more jobs
                    if len(allCategories) > numRequests {
                            close(downloadChannel)
                            close(resultsChannel)
                    }
            }
    
            fmt.Println("Done")
    }
    
    func worker(downloadChannel <-chan *Category, results chan<- *DownloadResult) {
            for target := range downloadChannel {
                    fmt.Printf("Downloading %v (addr %p) ...", target, &target)
    
                    doc, err := goquery.NewDocument(target.url)
                    if err != nil {
                            log.Fatal(err)
                            panic(err)
                    }
    
                    fmt.Print("done \n")
    
                    results <- &DownloadResult{doc, target}
            }
    }
    
    func extractCategories(doc *goquery.Document, parent *Category, extractor func(doc *goquery.Document) []string) *[]Category {
    
            numberRegex, _ := regexp.Compile("[0-9,]+")
    
            log.Printf("Extracting subcategories for page %s\n", parent)
    
            subCategories := extractor(doc)
    
            categories := make([]Category, 0)
    
            for _, subCategory := range subCategories {
                    log.Printf("Got subcategory=%s from parent=%s", subCategory, parent)
                    extracted := strings.Split(subCategory, SEPARATOR)
    
                    numberWithComma := numberRegex.FindString(extracted[2])
                    number := strings.Replace(numberWithComma, ",", "", -1)
    
                    numRecords, err := strconv.Atoi(number)
                    if err != nil {
                            log.Fatal(err)
                            panic(err)
                    }
    
                    var category Category
    
                    level := parent.level + 1
    
                    if parent.level == 0 {
                            category = Category{ domain + extracted[1], level, extracted[0], numRecords, parent }
                    } else {
                            log.Printf("category URL=%s, parent=%s, parent=%v", extracted[1], parent.url, parent)
                            category = Category{ parent.url + extracted[1], level, extracted[0], numRecords, parent }
                    }
    
                    log.Printf("Appending category=%v (pointer=%p)", category, &category)
    
                    categories = append(categories, category)
            }
    
            return &categories
    }
    
    func topLevelExtractor(doc *goquery.Document) []string {
            return doc.Find(".cat-listings-td .c-1s-2m-1-td1").Map(func(i int, s *goquery.Selection) string {
                    title := s.Find("a").Text()
                    url := s.Find("a").Map(func(x int, a *goquery.Selection) string {
                            v, _ := a.Attr("href")
                            return v
                    })
                    records := s.Clone().Children().Remove().End().Text()
    
                    //log.Printf("Item %d: %s, %s - %s\n", i, title, records, url)
    
                    res := []string{title, url[0], records}
                    return strings.Join(res, SEPARATOR)
            })
    }
    
    func secondLevelExtractor(doc *goquery.Document) []string {
            return doc.Find(".c-2m-3c-1-table .c-2m-3c-1-td1").Map(func(i int, s *goquery.Selection) string {
                    title := s.Find("a").Text()
                    url := s.Find("a").Map(func(x int, a *goquery.Selection) string {
                            v, _ := a.Attr("href")
                            return v
                    })
                    records := s.Clone().Children().Remove().End().Text()
    
                    //log.Printf("Item %d: %s, %s - %s\n", i, title, records, url)
    
                    res := []string{title, url[0], records}
                    return strings.Join(res, SEPARATOR)
            })
    }
    
    func thirdLevelExtractor(doc *goquery.Document) []string {
            return doc.Find(".c-2m-3c-1-table .c-2m-3c-1-td1").Map(func(i int, s *goquery.Selection) string {
                    title := s.Find("a").Text()
                    url := s.Find("a").Map(func(x int, a *goquery.Selection) string {
                            v, _ := a.Attr("href")
                            return v
                    })
                    records := s.Clone().Children().Remove().End().Text()
    
                    //log.Printf("Item %d: %s, %s - %s\n", i, title, records, url)
    
                    res := []string{title, url[0], records}
                    return strings.Join(res, SEPARATOR)
            })
    }
    
    主程序包
    进口(
    “fmt”
    “github.com/purkitobio/goquery”
    “日志”
    “strconv”
    “字符串”
    “regexp”
    )
    常量域=”http://www.allpages.com/"
    const categoryPage=“category.html”
    类型类别结构{
    url字符串
    水平井
    名称字符串
    输入整型
    父*类别
    }
    类型DownloadResult结构{
    doc*goquery.Document
    类别*类别
    }
    施工工人=2
    常量分隔符=“§§§”
    func main(){
    allCategories:=make([]类别,0)
    下载频道:=制作(成龙*类别)
    结果频道:=make(chan*下载结果,100)
    对于w:=1;w循环:

                for _, category := range *categories {
                        numRequests += 1
                        downloadChannel <- &category
                }
    

    你能提供一个小得多的独立片段来演示这个问题吗?很可能这个问题隐藏在大量的噪音中。@Volker不幸的是,正如我解释的,我不知道发生了什么,所以我不能浓缩它。你认为这是一个好问题吗?答案对其他人也有帮助吗?@Volker肯定。这个问题正在对u,category:=range*categories
    使用
    循环,并传递对
    &category
    的引用。这是传递对生成器中使用的变量的引用,而不是底层元素。将其更改为对i:=0;i循环;i++
    已修复该问题。请注意,潜在的问题是使用“指向切片的指针”,这在大多数情况下是错误的,在您的情况下也是错误的:不要这样做。
        for i := 0; i < len(*categories); i++ {
            fmt.Printf("Queuing category: %v (%p)", categoriesValues[i], categoriesValues[i])
    
            downloadChannel <- &categoriesValues[i]
        }