使用和不使用goroutine填充map_Go

使用和不使用goroutine填充map

加载带有和不带go例程的地图,go,Go,这是我遇到的一个有趣的情况。在使用go例程进行一些数据操作之后，我需要从一个文件中读取数据，并根据我们发现的内容填充一个映射。以下是简化的问题陈述和示例：运行gen_data.sh #!/bin/bash rm some.dat || : for i in `seq 1 10000`; do echo "$i `date` tx: $RANDOM rx:$RANDOM" >> some.dat done 如果我使用loadtoDict.go将some.dat中的那

这是我遇到的一个有趣的情况。我需要从文件中读取数据，用goroutine对数据做一些处理，然后根据处理结果填充一个map。以下是简化后的问题描述和示例：

运行

gen_data.sh

#!/bin/bash
# Generate some.dat: 10000 lines of the form
#   "<n> <date output> tx: <random> rx:<random>"

# -f keeps rm silent (and successful) when the file does not exist yet.
rm -f some.dat
# Redirect the whole loop once instead of reopening the file for every line.
for i in $(seq 1 10000); do
    echo "$i $(date) tx: $RANDOM rx:$RANDOM"
done > some.dat

如果我使用

loadtoDict.go

将

some.dat

中的那些行读入一个

map[int]string

而不使用goroutine，键和内容会保持对齐。（每行的第一个和第二个字段相同，见下面的输出。）

在实际场景中，我确实需要在把这些行加载到map之前先对其进行处理（开销很大），使用goroutine可以加快字典的创建，这是解决实际问题的一个重要要求。

loadtoDict.go

package main

import (
    "bufio"
    "fmt"
    "log"
    "os"
)

// fileName is the path of the data file that main opens.
var fileName = "some.dat"

// checkerr does nothing when err is nil. Otherwise it prints err with
// fmt.Println and then hands it to log.Fatal, which logs it and exits.
func checkerr(err error) {
    if err == nil {
        return
    }
    fmt.Println(err)
    log.Fatal(err)
}

// main reads fileName line by line, stores each line in a map keyed by its
// 1-based line number, and prints every entry as "<key>: <line>".
func main() {
    ourDict := make(map[int]string)
    f, err := os.Open(fileName)
    checkerr(err)
    defer f.Close()

    fscanner := bufio.NewScanner(f)

    indexPos := 1

    for fscanner.Scan() {
        ourDict[indexPos] = fscanner.Text()
        indexPos++
    }
    // Scan returns false on a read error as well as at end of input; without
    // this check a failed read would silently leave a truncated map.
    checkerr(fscanner.Err())

    // Map iteration order is unspecified, so entries are not printed in file order.
    for i, v := range ourDict {
        fmt.Printf("%d: %s\n", i, v)
    }
}

运行：

$ ./loadtoDict
...
8676: 8676 Mon Dec 23 15:52:24 PST 2019 tx: 17718 rx:1133
2234: 2234 Mon Dec 23 15:52:20 PST 2019 tx: 13170 rx:15962
3436: 3436 Mon Dec 23 15:52:21 PST 2019 tx: 17519 rx:5419
6177: 6177 Mon Dec 23 15:52:23 PST 2019 tx: 5731 rx:5449

注意第一个和第二个字段是如何“对齐”的。但是，如果使用goroutine来填充map，两者就会错位：

async_loadtoDict.go

package main

import (
    "bufio"
    "fmt"
    "log"
    "os"
    "sync"
)

// Package-level settings and shared state used by main.
var (
    // MAX is the capacity of the semaphore channel main creates.
    MAX = 9000
    // fileName is the path of the data file that main opens.
    fileName = "some.dat"
    // mu is the lock the goroutines take around each write to the map.
    mu = &sync.RWMutex{}
)

// checkerr does nothing when err is nil. Otherwise it prints err with
// fmt.Println and then hands it to log.Fatal, which logs it and exits.
func checkerr(err error) {
    if err == nil {
        return
    }
    fmt.Println(err)
    log.Fatal(err)
}

// main reads fileName line by line and starts one goroutine per line that
// stores the line in ourDict, then prints the map once all goroutines are done.
//
// NOTE: the map key is taken from the shared indexPos counter at the moment a
// goroutine acquires mu, not at the moment its line was read. Goroutines are
// not guaranteed to run in creation order, so a key can end up paired with a
// different line than the one at that position in the file.
func main() {
    ourDict := make(map[int]string)
    f, err := os.Open(fileName)
    checkerr(err)
    defer f.Close()

    fscanner := bufio.NewScanner(f)

    // Shared by every goroutine below; only ever touched while holding mu.
    indexPos := 1
    var wg sync.WaitGroup
    // Buffered channel used as a counting semaphore: at most MAX goroutines
    // can be outstanding before the send in the loop blocks.
    sem := make(chan int, MAX)
    defer close(sem)

    for fscanner.Scan() {
        // text is declared anew on each iteration, so each closure captures its own line.
        text := fscanner.Text()
        wg.Add(1)
        sem <- 1
        go func() {
            mu.Lock()
            defer mu.Unlock()
            // The lock prevents concurrent map writes, but it does not impose
            // an order: whichever goroutine gets here first takes the next index.
            ourDict[indexPos] = text
            indexPos++
            <- sem
            wg.Done()
        }()

    }

    wg.Wait()

    for i, v := range ourDict {
        fmt.Printf("%d: %s\n", i, v)
    }

}

尽管使用了互斥锁来保护对 ourDict[indexPos] 的写入，这个问题仍然存在。我希望map的索引与读入各行的顺序保持一致。

谢谢

您的信号量 sem 不起作用，因为您给它设置了很深的缓冲。

一般来说，对于这类任务，这种填充map的方式并不合适，因为读取文件才是慢的部分。如果您的任务复杂得多——例如读取一行、做大量计算、再设置某个值——那么您需要的伪代码结构如下：

// workType is one unit of work handed to a worker: a line of input together
// with the index assigned to it by the reading loop.
type workType struct {
    line  string // text of the line
    index int    // index chosen by the sender
}

// Start a fixed pool of nWorkers goroutines. Each one is handed the work
// channel and the WaitGroup (presumably it calls wg.Done when it finishes;
// readAndDoWork is not shown here).
var wg sync.WaitGroup
wg.Add(nWorkers)
// The channel is unbuffered: each send below blocks until a worker receives
// the item. (An earlier revision of this snippet used a buffered channel.)
work := make(chan workType)
for i := 0; i < nWorkers; i++ {
    go readAndDoWork(work, &wg)
}

// The reading loop numbers the lines itself, starting at 1, so every index is
// fixed before any worker sees the item.
for i := 1; fscanner.Scan(); i++ {
    work <- workType{index: i, line: fscanner.Text()}
}
// Closing the channel signals end of input; then wait on the WaitGroup.
close(work)
wg.Wait()

... now your dictionary is ready ...

其中 insertIntoDict 先获取互斥锁（用来保护从索引到结果的map），然后写入字典。（如果愿意，您可以将其内联。）

这里的想法是根据可用CPU的数量启动一定数量的worker，每个worker抓取下一个工作项并处理它。主goroutine只负责分发工作，然后关闭工作通道，这会让所有worker看到输入结束，接着主goroutine等待它们发出计算完成的信号。

（如果您愿意，您可以再创建一个goroutine来读取工人计算的结果并将其放入映射。这样，映射本身就不需要互斥。）

正如我在评论中提到的，您无法控制goroutine的执行顺序，因此不应该在goroutine内部修改索引。

下面是一个示例，其中对map的操作集中在单个goroutine中，而您的数据处理在其他goroutine中进行：

package main

import (
    "bufio"
    "fmt"
    "log"
    "os"
    "sync"
)

// Package-level settings used by main.
var (
    // MAX is the capacity of the semaphore channel created in main.
    MAX = 9000
    // fileName is the path of the data file that main opens.
    fileName = "some.dat"
)

// checkerr does nothing when err is nil. Otherwise it prints err with
// fmt.Println and then hands it to log.Fatal, which logs it and exits.
func checkerr(err error) {
    if err == nil {
        return
    }
    fmt.Println(err)
    log.Fatal(err)
}

// result pairs an index with a line of text. main builds it with a positional
// literal (result{index, data}), so the field order matters.
type result struct {
    index int // index assigned by main's read loop
    data string // line text passed to the goroutine
}

// main reads fileName line by line, hands each line (with its 1-based index)
// to its own goroutine for processing, and collects the results into a map
// that is written by exactly one goroutine. It then prints every entry.
func main() {
    ourDict := make(map[int]string)
    f, err := os.Open(fileName)
    checkerr(err)
    defer f.Close()

    fscanner := bufio.NewScanner(f)

    var wg sync.WaitGroup
    // Counting semaphore: at most MAX processing goroutines exist at once.
    // struct{} carries no data, only the slot.
    sem := make(chan struct{}, MAX)
    out := make(chan result)

    // The collector is the only goroutine that writes to the map, so the map
    // needs no lock. It must be running BEFORE the first send on the unbuffered
    // out channel: otherwise every sender blocks while holding a semaphore slot
    // and the read loop stalls for good once MAX senders are waiting.
    done := make(chan struct{})
    go func() {
        defer close(done)
        for entry := range out {
            ourDict[entry.index] = entry.data
        }
    }()

    indexPos := 1
    for fscanner.Scan() {
        text := fscanner.Text()
        wg.Add(1)
        sem <- struct{}{}

        // The index and the line are passed as arguments so that each goroutine
        // works on its own copies, fixed at creation time.
        go func(index int, data string) {
            // Deferred so the slot and the WaitGroup count are released on
            // every return path. Deferred calls run last-in first-out, so the
            // slot is freed before wg.Done is called.
            defer wg.Done()
            defer func() { <-sem }()
            // Process your data
            out <- result{index, data}
        }(indexPos, text)

        indexPos++
    }
    // Scan returns false on a read error as well as at end of input.
    checkerr(fscanner.Err())

    // All sends have completed once wg.Wait returns. Closing out ends the
    // collector's range loop, and waiting on done guarantees its final map
    // write has happened before the map is read below.
    wg.Wait()
    close(out)
    <-done

    for i, v := range ourDict {
        fmt.Printf("%d: %s\n", i, v)
    }
}

主程序包
进口(
“布菲奥”
“fmt”
“日志”
“操作系统”
“同步”
)
变量(
fileName=“some.dat”
最大值=9000
)
func checkerr（错误错误）{
如果错误！=零{
fmt.Println（错误）
log.Fatal（错误）
}
}
类型结果结构{
索引整数
数据串
}
func main（）{
ourDict:=make（映射[int]字符串）
f、 错误：=os.Open（文件名）
checker（err）
延迟f.关闭（）
fscanner:=bufio.NewScanner（f）
var wg sync.WaitGroup
sem:=make（chan struct{}，MAX）//对信号量使用空结构，因为它们没有分配
延迟关闭（sem）
out:=制造（结果）
延迟结束（关闭）
indexPos:=1
对于fscanner.Scan（）{
text:=fscanner.text（）
工作组.添加（1）
sem

好的，我已经解决了这个问题。把索引值复制一份传给goroutine，使其保持不变，这样似乎可行。
更改：
// Original loop: every goroutine reads and increments the shared indexPos
// while holding mu, so a line's key depends on the order in which the
// goroutines acquire the lock rather than on the order the lines were read.
for fscanner.Scan() {
    text := fscanner.Text()
    wg.Add(1)
    sem <- 1
    go func() {
        mu.Lock()
        defer mu.Unlock()
        // indexPos here is whatever value the counter has when this goroutine runs.
        ourDict[indexPos] = text
        indexPos++
        <- sem
        wg.Done()
    }()

}

fscanner.Scan（）的{
text:=fscanner.text（）
工作组.添加（1）
sem有多么不必要的复杂…索引不匹配的原因是，即使您以相同的顺序创建goroutine并防止并发，您仍然有MAX
（9000）goroutines正在等待，您无法控制它们恢复的顺序，索引代表的是执行的顺序，而不是创建。顺便说一句，您的代码是完全顺序的，只是不确定。除非我保持MAX=1
，否则我会观察我上面所报告的内容-这会挫败go routines prep并填充我的map
。我确实需要要在将行加载到映射之前对其进行处理，使用go例程可以加快字典的创建速度，这是解决实际问题的一个重要要求。在单个goroutine中添加对映射的访问，而其他goroutine只准备数据（我假设存在一些数据操作，因为if只是一个传递，就像示例goroutines那样，它实际上会降低开销）。我将举一个例子作为回答。正如我在回答中所说，你的信号量sem
不起作用，因为你对它进行了深度缓冲。当你设置MAX=1
时，你将它设为一个条目深度，然后它就起作用了：它迫使你的每个派生goroutine等待前一个完成后才能开始。传递索引会有所帮助，是的，因为您使用互斥锁来保护它（仅在使用和增量期间，也就是说，其他goroutine在您之前增加了它）但是现在你有了一个每goroutine索引。但是你认为变量sem
对你有什么作用呢？sem是为了将并发限制在最大goroutine。对-所以当它设置为9000时，你可以剥离多达9000个并行goroutine。这并不是真正的生产性：你可用的CPU数量将限制实际goroutine的数量您可以做的工作。当您将其设置为1时，您将自己限制为1个goroutine，然后所有goroutine之间共享indexPos
本身，只有1个gorouting使用它。使用更新的代码，您可以在每个goroutine中复制indexPos。请注意，这是有成本的（很小）创建一个新的goroutine和一个成本（很小）在一个通道上发送和接收。通常，最好将n-CPU可用的工作线程旋转一次，然后通过一个通道为每个工作线程提供数据，而不是为每个工作线程旋转一个工作线程，然后让它通过一个信号通道说话，以限制一次可以实际运行的线程数量。参考torek答案，这将为您提供更好的结果这是一个复杂的代码，在最后一次尝试中
package main

import (
    "bufio"
    "fmt"
    "log"
    "os"
    "sync"
)

// Package-level settings used by main.
var (
    // MAX is the capacity of the semaphore channel created in main.
    MAX = 9000
    // fileName is the path of the data file that main opens.
    fileName = "some.dat"
)

// checkerr does nothing when err is nil. Otherwise it prints err with
// fmt.Println and then hands it to log.Fatal, which logs it and exits.
func checkerr(err error) {
    if err == nil {
        return
    }
    fmt.Println(err)
    log.Fatal(err)
}

// result pairs an index with a line of text. main builds it with a positional
// literal (result{index, data}), so the field order matters.
type result struct {
    index int // index assigned by main's read loop
    data string // line text passed to the goroutine
}

// main reads fileName line by line, hands each line (with its 1-based index)
// to its own goroutine for processing, and collects the results into a map
// that is written by exactly one goroutine. It then prints every entry.
func main() {
    ourDict := make(map[int]string)
    f, err := os.Open(fileName)
    checkerr(err)
    defer f.Close()

    fscanner := bufio.NewScanner(f)

    var wg sync.WaitGroup
    // Counting semaphore: at most MAX processing goroutines exist at once.
    // struct{} carries no data, only the slot.
    sem := make(chan struct{}, MAX)
    out := make(chan result)

    // The collector is the only goroutine that writes to the map, so the map
    // needs no lock. It must be running BEFORE the first send on the unbuffered
    // out channel: otherwise every sender blocks while holding a semaphore slot
    // and the read loop stalls for good once MAX senders are waiting.
    done := make(chan struct{})
    go func() {
        defer close(done)
        for entry := range out {
            ourDict[entry.index] = entry.data
        }
    }()

    indexPos := 1
    for fscanner.Scan() {
        text := fscanner.Text()
        wg.Add(1)
        sem <- struct{}{}

        // The index and the line are passed as arguments so that each goroutine
        // works on its own copies, fixed at creation time.
        go func(index int, data string) {
            // Deferred so the slot and the WaitGroup count are released on
            // every return path. Deferred calls run last-in first-out, so the
            // slot is freed before wg.Done is called.
            defer wg.Done()
            defer func() { <-sem }()
            // Process your data
            out <- result{index, data}
        }(indexPos, text)

        indexPos++
    }
    // Scan returns false on a read error as well as at end of input.
    checkerr(fscanner.Err())

    // All sends have completed once wg.Wait returns. Closing out ends the
    // collector's range loop, and waiting on done guarantees its final map
    // write has happened before the map is read below.
    wg.Wait()
    close(out)
    <-done

    for i, v := range ourDict {
        fmt.Printf("%d: %s\n", i, v)
    }
}

// Original loop: every goroutine reads and increments the shared indexPos
// while holding mu, so a line's key depends on the order in which the
// goroutines acquire the lock rather than on the order the lines were read.
for fscanner.Scan() {
    text := fscanner.Text()
    wg.Add(1)
    sem <- 1
    go func() {
        mu.Lock()
        defer mu.Unlock()
        // indexPos here is whatever value the counter has when this goroutine runs.
        ourDict[indexPos] = text
        indexPos++
        <- sem
        wg.Done()
    }()

}

// Revised loop: the index is passed to the goroutine by value, and only the
// loop itself advances the counter.
for fscanner.Scan() {
        text := fscanner.Text()
        wg.Add(1)
        sem <- 1
        // mypos is this goroutine's private copy of indexPos as it was when
        // the goroutine was created.
        go func(mypos int) {
                mu.Lock()
                defer mu.Unlock()
                ourDict[mypos] = text
                <-sem
                wg.Done()
        }(indexPos)
        indexPos++
}

package main

import (
    "bufio"
    "fmt"
    "log"
    "os"
    "sync"
)

// Compile-time settings used by main.
const (
    // fileName is the path of the data file that main opens.
    fileName = "some.dat"
    // MAX is the number of worker goroutines main starts.
    MAX = 10
)

// gunk is the item sent over the channel to the workers: a line of input and
// the map key under which it is stored.
type gunk struct {
    id   int    // map key for this line
    line string // text of the line
}

// main loads fileName into a map keyed by 1-based line number using a fixed
// pool of MAX worker goroutines, then prints every entry as "<key>: <line>".
func main() {
    ourDict := make(map[int]string)
    var (
        wg sync.WaitGroup
        // mu guards ourDict. Only exclusive access is ever taken, so a plain
        // Mutex is enough.
        mu sync.Mutex
    )

    cha := make(chan gunk)

    for i := 0; i < MAX; i++ {
        wg.Add(1)
        go func() {
            defer wg.Done()
            // The range loop ends once cha has been closed and drained.
            for textin := range cha {
                mu.Lock()
                ourDict[textin.id] = textin.line
                mu.Unlock()
            }
        }()
    }

    f, err := os.Open(fileName)
    checkerr(err)
    defer f.Close()
    fscanner := bufio.NewScanner(f)

    // The index is assigned here, by the single reading goroutine, so it always
    // matches the order in which the lines were read.
    indexPos := 1
    for fscanner.Scan() {
        cha <- gunk{line: fscanner.Text(), id: indexPos}
        indexPos++
    }

    // Closing cha tells the workers there is no more input; wg.Wait returns
    // after every worker has stored its last item.
    close(cha)
    wg.Wait()
    // Scan returns false on a read error as well as at end of input.
    checkerr(fscanner.Err())

    for i, v := range ourDict {
        fmt.Printf("%d: %s\n", i, v)
    }
}

// checkerr does nothing when err is nil. Otherwise it prints err with
// fmt.Println and then hands it to log.Fatal, which logs it and exits.
func checkerr(err error) {
    if err == nil {
        return
    }
    fmt.Println(err)
    log.Fatal(err)
}