go中的对象正在被替换_Go - Fatal编程技术网

go中的对象正在被替换

go中的对象正在被替换,go,Go,我正在学习写蜘蛛。我正在尝试从allpages.com获取所有业务类别的列表下面是我的整个计划。不幸的是，我无法隔离问题，因此我已将其全部粘贴如果你运行这个程序，你会看到它首先正确地下载了第一页，并将所有提取的类别添加到类别列表中然而，当它随后下载后续页面时，似乎会弄乱对父类别的引用。例如，它错误地计算URLhttp://www.allpages.com/travel-tourism/political-ideological-organizations/，而事实上政治意识形态组织/不是旅

我正在学习写蜘蛛。我正在尝试从

allpages.com

获取所有业务类别的列表

下面是我的整个计划。不幸的是，我无法隔离问题，因此我已将其全部粘贴

如果你运行这个程序，你会看到它首先正确地下载了第一页，并将所有提取的类别添加到类别列表中

然而，当它随后下载后续页面时，似乎会弄乱对父类别的引用。例如，它错误地计算URL

http://www.allpages.com/travel-tourism/political-ideological-organizations/

，而事实上

政治意识形态组织/

不是

旅游/

的子类别。通过挖掘日志，它似乎覆盖了

父对象中的数据。工人越多，错误就越明显
在我开始通过引用goroutine来传递数据之前，这个方法工作得稍微好一点，但我基本上遇到了相同的问题
我有几个问题：
如何在不通过日志行拾取的情况下调试它
出了什么问题/为什么不工作？如何修复
package main

import (
        "fmt"
        "github.com/PuerkitoBio/goquery"
        "log"
        "strconv"
        "strings"
        "regexp"
)

const domain = "http://www.allpages.com/"
const categoryPage = "category.html"

type Category struct {
        url string
        level uint
        name string
        entries int
        parent *Category
}

type DownloadResult struct {
        doc *goquery.Document
        category *Category
}

const WORKERS = 2
const SEPARATOR = "§§§"

func main() {

        allCategories := make([]Category, 0)

        downloadChannel := make(chan *Category)
        resultsChannel := make(chan *DownloadResult, 100)

        for w := 1; w <= WORKERS; w++ {
                go worker(downloadChannel, resultsChannel)
        }

        numRequests := 1
        downloadChannel <- &Category{ domain + categoryPage, 0, "root", 0, nil }

        for result := range resultsChannel {
                var extractor func(doc *goquery.Document) []string

                if result.category.level == 0 {
                        extractor = topLevelExtractor
                } else if result.category.level == 1 {
                        extractor = secondLevelExtractor
                } else {
                        extractor = thirdLevelExtractor
                }

                categories := extractCategories(result.doc, result.category, extractor)
                allCategories = append(allCategories, *categories...)

                //fmt.Printf("Appending categories: %v", *categories)

                fmt.Printf("total categories = %d, total requests = %d\n", len(allCategories), numRequests)

                for _, category := range *categories {
                        numRequests += 1
                        downloadChannel <- &category
                }

                // close the channels when there are no more jobs
                if len(allCategories) > numRequests {
                        close(downloadChannel)
                        close(resultsChannel)
                }
        }

        fmt.Println("Done")
}

func worker(downloadChannel <-chan *Category, results chan<- *DownloadResult) {
        for target := range downloadChannel {
                fmt.Printf("Downloading %v (addr %p) ...", target, &target)

                doc, err := goquery.NewDocument(target.url)
                if err != nil {
                        log.Fatal(err)
                        panic(err)
                }

                fmt.Print("done \n")

                results <- &DownloadResult{doc, target}
        }
}

func extractCategories(doc *goquery.Document, parent *Category, extractor func(doc *goquery.Document) []string) *[]Category {

        numberRegex, _ := regexp.Compile("[0-9,]+")

        log.Printf("Extracting subcategories for page %s\n", parent)

        subCategories := extractor(doc)

        categories := make([]Category, 0)

        for _, subCategory := range subCategories {
                log.Printf("Got subcategory=%s from parent=%s", subCategory, parent)
                extracted := strings.Split(subCategory, SEPARATOR)

                numberWithComma := numberRegex.FindString(extracted[2])
                number := strings.Replace(numberWithComma, ",", "", -1)

                numRecords, err := strconv.Atoi(number)
                if err != nil {
                        log.Fatal(err)
                        panic(err)
                }

                var category Category

                level := parent.level + 1

                if parent.level == 0 {
                        category = Category{ domain + extracted[1], level, extracted[0], numRecords, parent }
                } else {
                        log.Printf("category URL=%s, parent=%s, parent=%v", extracted[1], parent.url, parent)
                        category = Category{ parent.url + extracted[1], level, extracted[0], numRecords, parent }
                }

                log.Printf("Appending category=%v (pointer=%p)", category, &category)

                categories = append(categories, category)
        }

        return &categories
}

func topLevelExtractor(doc *goquery.Document) []string {
        return doc.Find(".cat-listings-td .c-1s-2m-1-td1").Map(func(i int, s *goquery.Selection) string {
                title := s.Find("a").Text()
                url := s.Find("a").Map(func(x int, a *goquery.Selection) string {
                        v, _ := a.Attr("href")
                        return v
                })
                records := s.Clone().Children().Remove().End().Text()

                //log.Printf("Item %d: %s, %s - %s\n", i, title, records, url)

                res := []string{title, url[0], records}
                return strings.Join(res, SEPARATOR)
        })
}

func secondLevelExtractor(doc *goquery.Document) []string {
        return doc.Find(".c-2m-3c-1-table .c-2m-3c-1-td1").Map(func(i int, s *goquery.Selection) string {
                title := s.Find("a").Text()
                url := s.Find("a").Map(func(x int, a *goquery.Selection) string {
                        v, _ := a.Attr("href")
                        return v
                })
                records := s.Clone().Children().Remove().End().Text()

                //log.Printf("Item %d: %s, %s - %s\n", i, title, records, url)

                res := []string{title, url[0], records}
                return strings.Join(res, SEPARATOR)
        })
}

func thirdLevelExtractor(doc *goquery.Document) []string {
        return doc.Find(".c-2m-3c-1-table .c-2m-3c-1-td1").Map(func(i int, s *goquery.Selection) string {
                title := s.Find("a").Text()
                url := s.Find("a").Map(func(x int, a *goquery.Selection) string {
                        v, _ := a.Attr("href")
                        return v
                })
                records := s.Clone().Children().Remove().End().Text()

                //log.Printf("Item %d: %s, %s - %s\n", i, title, records, url)

                res := []string{title, url[0], records}
                return strings.Join(res, SEPARATOR)
        })
}

主程序包
进口(
“fmt”
“github.com/purkitobio/goquery”
“日志”
“strconv”
“字符串”
“regexp”
)
常量域=”http://www.allpages.com/"
const categoryPage=“category.html”
类型类别结构{
url字符串
水平井
名称字符串
输入整型
父*类别
}
类型DownloadResult结构{
doc*goquery.Document
类别*类别
}
施工工人=2
常量分隔符=“§§§”
func main（）{
allCategories:=make（[]类别，0）
下载频道：=制作（成龙*类别）
结果频道：=make（chan*下载结果，100）
对于w:=1；w循环：
            for _, category := range *categories {
                    numRequests += 1
                    downloadChannel <- &category
            }

你能提供一个小得多的独立片段来演示这个问题吗？很可能这个问题隐藏在大量的噪音中。@Volker不幸的是，正如我解释的，我不知道发生了什么，所以我不能浓缩它。你认为这是一个好问题吗？答案对其他人也有帮助吗？@Volker肯定。这个问题正在对u，category:=range*categories
使用循环，并传递对&category
的引用。这是传递对生成器中使用的变量的引用，而不是底层元素。将其更改为对i:=0；i循环；i++
已修复该问题。请注意，潜在的问题是使用“指向切片的指针”，这在大多数情况下是错误的，在您的情况下也是错误的：不要这样做。
    for i := 0; i < len(*categories); i++ {
        fmt.Printf("Queuing category: %v (%p)", categoriesValues[i], categoriesValues[i])

        downloadChannel <- &categoriesValues[i]
    }