C# IEnumerable<T>，Parallel.ForEach与内存管理_C#_Sql Server_Memory Management_Task Parallel Library

C# IEnumerable<T>，Parallel.ForEach与内存管理

c# sql-server memory-management

C# IEnumerable<T>，Parallel.ForEach与内存管理,c#,sql-server,memory-management,task-parallel-library,C#,Sql Server,Memory Management,Task Parallel Library,我正在读取和处理大量的Sql Server数据（输入的行数为100多万，输出的行数为100多万）。对每个源行执行的处理都很重要。单线程版本的性能不符合预期。我当前的并行处理版本在一些较小的批处理（300000个源行，1M个输出行）上运行得非常好，但在非常大的运行中，我遇到了一些内存不足的异常。这段代码的灵感来源于以下提供的答案：以下是总体思路：获取源数据（数据太大，无法读入内存，因此我们将“流式”处理）公共静态IEnumerable ReadData（） { 使用（SqlConnectio

我正在读取和处理大量的Sql Server数据（输入的行数为100多万，输出的行数为100多万）。对每个源行执行的处理都相当繁重。单线程版本的性能不符合预期。我当前的并行处理版本在一些较小的批处理（300000个源行，1M个输出行）上运行得非常好，但在非常大的运行中，我遇到了一些内存不足的异常。

这段代码的灵感来源于以下提供的答案：

以下是总体思路：

获取源数据（数据太大，无法读入内存，因此我们将“流式”处理）

/// <summary>
/// Streams the source rows from SQL Server as <c>MyObject</c> instances, one at a time,
/// so the full result set never has to be loaded into memory.
/// </summary>
/// <returns>
/// A lazily evaluated sequence. The connection, command and reader are disposed
/// when enumeration finishes or the enumerator is disposed.
/// </returns>
public static IEnumerable<MyObject> ReadData()
{
    using (SqlConnection con = new SqlConnection(Settings.ConnectionString))
    using (SqlCommand cmd = new SqlCommand(selectionSql, con))
    {
        con.Open();
        using (SqlDataReader dr = cmd.ExecuteReader(CommandBehavior.CloseConnection))
        {
            while (dr.Read())
            {
                // Make some decisions here – 1 to n source rows are used
                // to create one MyObject instance.
                yield return new MyObject(/* some parameters */);
            }
        }
    }
}

一旦我们到达并行处理的点，我们希望使用SqlBulkCopy对象来写入数据。因此，我们不希望并行处理单个MyObject，因为我们希望每个线程执行一次大容量复制。因此，我们会在上面的IEnumerable之上再封装一个IEnumerable，由它返回一“批”MyObject。

/// <summary>
/// A group of <c>MyObject</c> instances that is handed to a worker as a single unit.
/// </summary>
class MyObjectBatch 
{
    /// <summary>The objects that make up this batch.</summary>
    public List<MyObject> Items { get; set; }

    /// <summary>Wraps an already populated list as a batch (the list is not copied).</summary>
    public MyObjectBatch(List<MyObject> items)
    {
        this.Items = items;
    }

    /// <summary>
    /// Lazily groups the sequence returned by <c>DataAccessLayer.ReadData()</c> into batches.
    /// </summary>
    /// <param name="batchSize">
    /// Number of items per batch. The final batch may be smaller; values below 1
    /// result in one item per batch.
    /// </param>
    /// <returns>A deferred sequence of batches; nothing is read until it is enumerated.</returns>
    public static IEnumerable<MyObjectBatch> Read(int batchSize)
    {
        // Fixed: this was "new List<MyObjectBatch>()", which cannot be assigned
        // to a List<MyObject> variable and did not compile.
        List<MyObject> items = new List<MyObject>();
        foreach (MyObject o in DataAccessLayer.ReadData())
        {
            items.Add(o);
            if (items.Count >= batchSize)
            {
                yield return new MyObjectBatch(items);
                // Start a fresh list: the yielded batch keeps the previous one.
                items = new List<MyObject>();
            }
        }

        // Emit whatever is left over as a final, smaller batch.
        if (items.Count > 0)
        {
            yield return new MyObjectBatch(items);
        }
    }
}

/// <summary>
/// A group of <c>MyObject</c> instances that is handed to a worker as a single unit.
/// </summary>
class MyObjectBatch
{
    /// <summary>The objects that make up this batch.</summary>
    public List<MyObject> Items { get; set; }

    /// <summary>Wraps an already populated list as a batch (the list is not copied).</summary>
    public MyObjectBatch(List<MyObject> items)
    {
        this.Items = items;
    }

    /// <summary>
    /// Lazily groups the sequence returned by <c>DataAccessLayer.ReadData()</c> into batches.
    /// </summary>
    /// <param name="batchSize">
    /// Number of items per batch. The final batch may be smaller; values below 1
    /// result in one item per batch.
    /// </param>
    /// <returns>A deferred sequence of batches; nothing is read until it is enumerated.</returns>
    public static IEnumerable<MyObjectBatch> Read(int batchSize)
    {
        List<MyObject> items = new List<MyObject>();
        foreach (MyObject o in DataAccessLayer.ReadData())
        {
            items.Add(o);
            if (items.Count >= batchSize)
            {
                yield return new MyObjectBatch(items);
                items = new List<MyObject>(); // reset
            }
        }

        // Emit whatever is left over as a final, smaller batch.
        if (items.Count > 0)
        {
            yield return new MyObjectBatch(items);
        }
    }
}

最后，我们到达并行处理“批”的点

// Process the batches in parallel: each batch is transformed into a ProcessedData
// container and then bulk-copied to the target table.
// NOTE(review): the single ObjectProcessor instance is shared by all parallel
// iterations – confirm that Process() is safe to call concurrently.
ObjectProcessor processor = new ObjectProcessor();

ParallelOptions options = new ParallelOptions { MaxDegreeOfParallelism = Settings.MaxThreads };
Parallel.ForEach(MyObjectBatch.Read(Settings.BatchSize), options, batch =>
{
    // Create a container for data processed by this thread
    // the container implements IDataReader
    // Fixed: removed the stray extra ')' that made this statement unparseable.
    ProcessedData targetData = new ProcessedData(/* some params */);

    // process the batch… for each MyObject in MyObjectBatch – 
    // results are collected in targetData
    // Fixed: the property is "Items", not "Item".
    foreach (MyObject item in batch.Items)
    {
        processor.Process(item, targetData);
    }

    // bulk copy the data – this creates a SqlBulkCopy instance
    // and loads the data to the target table
    DataAccessLayer.BulkCopyData(targetData);

    // explicitly set the batch and targetData to null to try to free resources
    // NOTE(review): the statements this comment refers to are missing from the snippet.

});

ObjectProcessor处理器=新的ObjectProcessor（）；
ParallelOptions=new ParallelOptions{MaxDegreeOfParallelism=Settings.MaxThreads}；
Parallel.ForEach（MyObjectBatch.Read（Settings.BatchSize），选项，批=>
{
//为该线程处理的数据创建容器
//容器实现IDataReader
ProcessedData targetData=新的ProcessedData（某些参数））；
//为MyObject batch中的每个MyObject处理批次–
//结果收集在targetData中
对于（int index=0；index


上面的所有内容都大大简化了，但我相信它包含了所有重要的概念。以下是我看到的行为：
性能非常好–对于大小合理的数据集，我得到了非常好的结果
然而，随着它的处理，所消耗的内存将继续增长。对于较大的数据集，这会导致异常
我已经通过日志记录证明，如果我减慢了从数据库读取的速度，它会减慢批处理读取的速度，并随后减慢正在创建的并行线程的速度（特别是如果我设置了MaxDegreeOfParallelism）。我担心我的读取速度超过了我的处理速度，但是如果我限制线程，它应该只读取每个线程可以处理的内容
较小或较大的批处理大小会对性能产生一定影响，但使用的内存量会随着批处理的大小而持续增长
这里哪里有机会恢复一些内存？当我的“批处理”超出范围时，是否应该恢复该内存？在前两层我可以做些什么来释放一些资源吗
要回答一些问题：
1.它是否可以完全用SQL完成？不，处理逻辑非常复杂（而且是动态的）。一般来说，它是做低级二进制解码。
2.我们尝试了SSIS（取得了一些成功）。问题是源数据和输出的定义都是动态的。SSIS似乎需要非常严格的输入和输出列定义，在这种情况下，这是行不通的
有人还询问了ProcessedData对象-这实际上相当简单：
/// <summary>
/// In-memory row buffer exposed through <see cref="IDataReader"/> so the collected
/// rows can be handed to a bulk copy. Rows are appended to <see cref="Values"/> and
/// then consumed forward-only via <see cref="Read"/> and <see cref="GetValue"/>.
/// </summary>
class ProcessedData : IDataReader 
{
    // Index of the row most recently made current by Read(); -1 until the first Read().
    private int _currentIndex = -1;

    // Column names, in ordinal order. Was a private auto-property named like a field;
    // a readonly field expresses the same thing directly.
    private readonly string[] _fieldNames;

    /// <summary>Bracket-quoted target name in the form <c>[schema].[table]</c>.</summary>
    public string TechnicalTableName { get; set; }

    /// <summary>The buffered rows; each array holds one row's column values.</summary>
    public List<object[]> Values { get; set; }

    /// <param name="schemaName">Schema part of the target table name.</param>
    /// <param name="tableName">Table part of the target table name.</param>
    /// <param name="fieldNames">Column names; their positions define the ordinals.</param>
    public ProcessedData(string schemaName, string tableName, string[] fieldNames)
    {            
        this.TechnicalTableName = "[" + schemaName + "].[" + tableName + "]";
        _fieldNames = fieldNames;            
        this.Values = new List<object[]>();
    }

    #region IDataReader Implementation

    /// <summary>Number of columns, taken from the field-name array.</summary>
    public int FieldCount
    {
        get { return _fieldNames.Length; }
    }

    /// <summary>Returns the column name at ordinal <paramref name="i"/>.</summary>
    public string GetName(int i)
    {
        return _fieldNames[i];
    }

    /// <summary>
    /// Returns the ordinal of the column whose name equals <paramref name="name"/>
    /// (case-sensitive), or -1 when there is no such column.
    /// </summary>
    public int GetOrdinal(string name)
    {
        for (int i = 0; i < _fieldNames.Length; i++)
        {
            if (_fieldNames[i] == name)
            {
                return i;
            }
        }
        return -1;
    }

    /// <summary>
    /// Returns column <paramref name="i"/> of the current row, or <c>null</c> when the
    /// current row has fewer columns than <paramref name="i"/> + 1.
    /// Only valid after <see cref="Read"/> has returned <c>true</c>; before that the
    /// current index is -1 and the row lookup throws.
    /// </summary>
    public object GetValue(int i)
    {
        object[] row = Values[_currentIndex];
        if (i >= row.Length)
        {
            // Short rows are padded with nulls rather than treated as an error.
            return null;
        }
        return row[i];
    }

    /// <summary>
    /// Advances to the next buffered row.
    /// </summary>
    /// <returns><c>true</c> if a row is now current; <c>false</c> once all rows are consumed.</returns>
    public bool Read()
    {
        int next = _currentIndex + 1;
        if (next >= Values.Count)
        {
            return false;
        }
        _currentIndex = next;
        return true;
    }

    // Other IDataReader things not used by SqlBulkCopy not implemented

    // Fixed: the region was never closed, which is a compile error.
    #endregion
}

类处理数据：IDataReader
{
私有int_currentIndex=-1；
私有字符串[]_字段名{get；set；}
公共字符串TechnicalTableName{get；set；}
公共列表值{get；set；}
公共处理数据（字符串模式名、字符串表名、字符串[]字段名）
{            
this.TechnicalTableName=“[”+schemaName+”]。“+tableName+”]；
_字段名=字段名；
this.Values=新列表（）；
}
#区域IDataReader实现
公共整型字段计数
{
获取{return\u fieldNames.Length；}
}
公共字符串GetName（int i）
{
返回_fieldNames[i]；
}
public int GetOrdinal（字符串名称）
{
int指数=-1；
对于（int i=0；i<\u fieldNames.Length；i++）
{
if（_fieldNames[i]==名称）
{
指数=i；
打破
}
}
收益指数；
}
公共对象GetValue（int i）
{
如果（i>（值[_currentIndex].Length-1））
{
返回null；
}
/// <summary>
/// In-memory row buffer exposed through <see cref="IDataReader"/> so the collected
/// rows can be handed to a bulk copy. Rows are appended to <see cref="Values"/> and
/// then consumed forward-only via <see cref="Read"/> and <see cref="GetValue"/>.
/// </summary>
class ProcessedData : IDataReader 
{
    // Index of the row most recently made current by Read(); -1 until the first Read().
    private int _currentIndex = -1;

    // Column names, in ordinal order. Was a private auto-property named like a field;
    // a readonly field expresses the same thing directly.
    private readonly string[] _fieldNames;

    /// <summary>Bracket-quoted target name in the form <c>[schema].[table]</c>.</summary>
    public string TechnicalTableName { get; set; }

    /// <summary>The buffered rows; each array holds one row's column values.</summary>
    public List<object[]> Values { get; set; }

    /// <param name="schemaName">Schema part of the target table name.</param>
    /// <param name="tableName">Table part of the target table name.</param>
    /// <param name="fieldNames">Column names; their positions define the ordinals.</param>
    public ProcessedData(string schemaName, string tableName, string[] fieldNames)
    {            
        this.TechnicalTableName = "[" + schemaName + "].[" + tableName + "]";
        _fieldNames = fieldNames;            
        this.Values = new List<object[]>();
    }

    #region IDataReader Implementation

    /// <summary>Number of columns, taken from the field-name array.</summary>
    public int FieldCount
    {
        get { return _fieldNames.Length; }
    }

    /// <summary>Returns the column name at ordinal <paramref name="i"/>.</summary>
    public string GetName(int i)
    {
        return _fieldNames[i];
    }

    /// <summary>
    /// Returns the ordinal of the column whose name equals <paramref name="name"/>
    /// (case-sensitive), or -1 when there is no such column.
    /// </summary>
    public int GetOrdinal(string name)
    {
        for (int i = 0; i < _fieldNames.Length; i++)
        {
            if (_fieldNames[i] == name)
            {
                return i;
            }
        }
        return -1;
    }

    /// <summary>
    /// Returns column <paramref name="i"/> of the current row, or <c>null</c> when the
    /// current row has fewer columns than <paramref name="i"/> + 1.
    /// Only valid after <see cref="Read"/> has returned <c>true</c>; before that the
    /// current index is -1 and the row lookup throws.
    /// </summary>
    public object GetValue(int i)
    {
        object[] row = Values[_currentIndex];
        if (i >= row.Length)
        {
            // Short rows are padded with nulls rather than treated as an error.
            return null;
        }
        return row[i];
    }

    /// <summary>
    /// Advances to the next buffered row.
    /// </summary>
    /// <returns><c>true</c> if a row is now current; <c>false</c> once all rows are consumed.</returns>
    public bool Read()
    {
        int next = _currentIndex + 1;
        if (next >= Values.Count)
        {
            return false;
        }
        _currentIndex = next;
        return true;
    }

    // Other IDataReader things not used by SqlBulkCopy not implemented

    // Fixed: the region was never closed, which is a compile error.
    #endregion
}

// Stage 1: pull the source rows and group them into batches of 10,000.
var inputBatches = GetRows().AsBatches(10000);

// Stage 2: transform the batches with PLINQ (degree of parallelism 8);
// TransformItems generates the rows to write.
var transformedBatches = inputBatches
    .AsParallel().WithDegreeOfParallelism(8)
    .Select(TransformItems);

// Stage 3: leave PLINQ, flatten the transformed batches into a single sequence,
// then re-batch with a different size (1,000,000) for writing.
var outputBatches = transformedBatches
    .AsEnumerable()
    .SelectMany(x => x)
    .AsBatches(1000000);

// Stage 4: write to the DB with a second PLINQ query that uses a different
// degree of parallelism (2).
outputBatches
    .AsParallel().WithDegreeOfParallelism(2)
    .ForEach(WriteBatchToDB);