C# 解析和上传 >1GB 的数据
标签:c#, sql-server, performance, parsing, file-upload
我写了一个程序来解析大量数据并上传到数据库。问题是解析太慢了。我的程序的工作方式是使用 Parser 类(并行地)解析每个文件,并为每个文件中解析出的每个条目引发一个事件:
// Parse all input files, processing at most maxParallelism files at a time.
Parallel.ForEach<FileInfo>(
    files,
    new ParallelOptions { MaxDegreeOfParallelism = maxParallelism },
    (inputFile, loopState) =>
    {
        // The reader is opened from the file's full path and disposed once the file is done.
        using (var reader = new StreamReader(inputFile.FullName))
        {
            // ReadLine() yields null exactly when the stream is exhausted,
            // so every line is parsed and reported once.
            string line;
            while ((line = reader.ReadLine()) != null)
            {
                RaiseEntryParsed(ParseCity(line));
            }

            // NOTE(review): this increment runs inside the parallel loop body;
            // if ParsedFiles is a plain field/property it is not atomic — confirm.
            ParsedFiles++;
            RaiseFileParsed(inputFile);
        }
    });

// Reached only after every file has been processed.
RaiseDirectoryParsed(Directory);
每次提交后清除表的行的原因是为了节省内存,并防止内存中存在太多实体时出现OutOfMemoryException
我怎样才能让它更快?它目前的速度慢得令人无法接受。我分析了应用程序,结果表明大部分时间都花在 EntryParsed 事件上。
谢谢!
虽然我同意其他一些评论和答案,但您是否尝试过:
cityTable.BeginLoadData()
在将第一项添加到城市表之前
然后调用:
cityTable.EndLoadData()
在 FileParsed 事件处理程序中。
我做了一个简短的测试项目,并尝试了几种不同的方法。我的目标是使用顺序代码,尽可能快地构建一个包含 27 列(id, A, B, C, …, Z)和 NumOfRows 行(约 300000 行)的数据表(每行填充一个 id,其余列填充随机的 5 个字母的单词)。
在第四次尝试中,我偶然发现了一种不同的语法,可以根据 Object 类型的值数组将行添加到表中(见下文的测试项目)。在您的情况下,它将类似于:
// Add the whole row in a single call: the array supplies one value per
// column, listed in the table's column order.
cityTable.Rows.Add( new Object[] {
    ((City)e.DatabaseEntry).Id ,
    ObjectThatGoesInColumn2 ,
    ObjectThatGoesInColumn3 ,
    ObjectThatGoesInLastColumn
} );
而不是:
// Slower alternative: create an empty row, then assign each column one at a
// time (by ordinal or by column name) before adding it to the table.
DataRow row = cityTable.NewRow();
row[0] = 100;
row["City Name"] = "Anaheim";
row["Column 7"] = ...
...
row["Column 26"] = checksum;
// A row must be added to the table that created it.
cityTable.Rows.Add( row );
这将给你一个加速,因为你不会一次一列地单独设置每一列,并且根据你的剖析器图片,你至少有12列是单独设置的
这还避免了对列名字符串进行散列以查看所处理的数组位置,然后再次检查数据类型是否正确
如果您感兴趣,以下是我的测试项目:
/// <summary>
/// Micro-benchmark that times several ways of filling a DataTable with
/// NumOfRows rows of 27 columns: an integer "id" plus 26 string columns
/// named "5-Letter Word A" .. "5-Letter Word Z".
/// </summary>
class Program
{
    public static System.Data.DataSet dataSet;
    public static System.Data.DataSet dataSet2;
    public static System.Data.DataSet dataSet3;
    public static System.Data.DataSet dataSet4;
    public static Random rand = new Random();
    public static int NumOfRows = 300000;

    // Number of "5-Letter Word X" columns (one per letter A..Z).
    private const int WordColumnCount = 26;

    /// <summary>
    /// Runs the enabled tests, printing the elapsed time of each, then waits
    /// for Enter. Tests 2 and 3 are kept commented out, as in the original.
    /// </summary>
    static void Main(string[] args)
    {
        #region test1
        Console.WriteLine("Starting");
        Console.WriteLine("");
        Stopwatch watch = new Stopwatch();
        watch.Start();
        MakeTable();
        watch.Stop();
        Console.WriteLine("Elapsed Time was: " + watch.ElapsedMilliseconds + " milliseconds.");
        dataSet = null; // release the table before the next test
        Console.WriteLine("");
        Console.WriteLine("Completed.");
        Console.WriteLine("");
        #endregion
        /*
        #region test2
        Console.WriteLine("Starting Test 2");
        Console.WriteLine("");
        watch.Reset();
        watch.Start();
        MakeTable2();
        watch.Stop();
        Console.WriteLine("Elapsed Time was: " + watch.ElapsedMilliseconds + " milliseconds.");
        dataSet2 = null;
        Console.WriteLine("");
        Console.WriteLine("Completed Test 2.");
        #endregion
        #region test3
        Console.WriteLine("");
        Console.WriteLine("Starting Test 3");
        Console.WriteLine("");
        watch.Reset();
        watch.Start();
        MakeTable3();
        watch.Stop();
        Console.WriteLine("Elapsed Time was: " + watch.ElapsedMilliseconds + " milliseconds.");
        dataSet3 = null;
        Console.WriteLine("");
        Console.WriteLine("Completed Test 3.");
        #endregion
        */
        #region test4
        Console.WriteLine("Starting Test 4");
        Console.WriteLine("");
        watch.Reset();
        watch.Start();
        MakeTable4();
        watch.Stop();
        Console.WriteLine("Elapsed Time was: " + watch.ElapsedMilliseconds + " milliseconds.");
        dataSet4 = null;
        Console.WriteLine("");
        Console.WriteLine("Completed Test 4.");
        #endregion
        //printTable();
        Console.WriteLine("");
        Console.WriteLine("Press Enter to Exit...");
        Console.ReadLine();
    }

    /// <summary>
    /// Builds an empty table with the benchmark schema: a read-only, unique
    /// Int32 "id" primary key followed by one string column per letter A..Z.
    /// </summary>
    /// <param name="tableName">Name given to the new table.</param>
    /// <returns>The table, with columns and primary key set and no rows.</returns>
    private static DataTable CreateWordTable(string tableName)
    {
        DataTable table = new DataTable(tableName);

        DataColumn idColumn = new DataColumn();
        idColumn.DataType = typeof(int);
        idColumn.ColumnName = "id";
        idColumn.ReadOnly = true;
        idColumn.Unique = true;
        table.Columns.Add(idColumn);

        for (char letter = 'A'; letter <= 'Z'; letter++)
        {
            DataColumn wordColumn = new DataColumn();
            wordColumn.DataType = typeof(string);
            wordColumn.ColumnName = "5-Letter Word " + letter;
            wordColumn.AutoIncrement = false;
            wordColumn.Caption = "Random Word " + letter;
            wordColumn.ReadOnly = false;
            wordColumn.Unique = false;
            table.Columns.Add(wordColumn);
        }

        table.PrimaryKey = new DataColumn[] { table.Columns["id"] };
        return table;
    }

    /// <summary>
    /// Test 1: NewRow(), then set every column by name, then Rows.Add(row).
    /// The result is stored in <see cref="dataSet"/>.
    /// </summary>
    private static void MakeTable()
    {
        DataTable table = CreateWordTable("Table 1");
        dataSet = new DataSet();
        dataSet.Tables.Add(table);

        for (int i = 0; i < NumOfRows; i++)
        {
            DataRow row = table.NewRow();
            row["id"] = i;
            for (int j = 65; j <= 90; j++)
            {
                row["5-Letter Word " + (char)j] = getRandomWord();
            }
            table.Rows.Add(row);
        }
    }

    /// <summary>
    /// Test 2: same as test 1, but the column assignments are wrapped in
    /// row.BeginEdit()/row.EndEdit(). The result is stored in <see cref="dataSet2"/>.
    /// </summary>
    private static void MakeTable2()
    {
        DataTable table = CreateWordTable("Table 2");
        dataSet2 = new DataSet();
        dataSet2.Tables.Add(table);

        for (int i = 0; i < NumOfRows; i++)
        {
            DataRow row = table.NewRow();
            row.BeginEdit();
            row["id"] = i;
            for (int j = 65; j <= 90; j++)
            {
                row["5-Letter Word " + (char)j] = getRandomWord();
            }
            row.EndEdit();
            table.Rows.Add(row);
        }
    }

    /// <summary>
    /// Test 3: allocate all rows with NewRow() first, then fill and add them
    /// in a second pass. The result is stored in <see cref="dataSet3"/>.
    /// </summary>
    private static void MakeTable3()
    {
        DataTable table = CreateWordTable("Table 3");
        dataSet3 = new DataSet();
        dataSet3.Tables.Add(table);

        DataRow[] newRows = new DataRow[NumOfRows];
        for (int i = 0; i < NumOfRows; i++)
        {
            newRows[i] = table.NewRow();
        }

        for (int i = 0; i < NumOfRows; i++)
        {
            newRows[i]["id"] = i;
            for (int j = 65; j <= 90; j++)
            {
                newRows[i]["5-Letter Word " + (char)j] = getRandomWord();
            }
            table.Rows.Add(newRows[i]);
        }
    }

    /// <summary>
    /// Test 4: add each row in one call by passing an object[] holding the
    /// values in column order (id first, then the 26 words).
    /// The result is stored in <see cref="dataSet4"/>.
    /// </summary>
    private static void MakeTable4()
    {
        // The original named this table "Table 2", duplicating test 2's name.
        DataTable table = CreateWordTable("Table 4");
        dataSet4 = new DataSet();
        dataSet4.Tables.Add(table);

        for (int i = 0; i < NumOfRows; i++)
        {
            // A fresh array per row, as in the original hand-written initializer.
            object[] values = new object[1 + WordColumnCount];
            values[0] = i;
            for (int c = 1; c < values.Length; c++)
            {
                values[c] = getRandomWord();
            }
            table.Rows.Add(values);
        }
    }

    /// <summary>
    /// Returns five random upper-case letters drawn from 'A'..'Z' inclusive.
    /// </summary>
    private static string getRandomWord()
    {
        char[] letters = new char[5];
        for (int i = 0; i < letters.Length; i++)
        {
            // Random.Next's upper bound is exclusive, so use 'Z' + 1;
            // the original Next(65, 90) could never produce 'Z'.
            letters[i] = (char)rand.Next('A', 'Z' + 1);
        }
        return new string(letters);
    }

    /// <summary>
    /// Prints the id and the first and last word of every row of the test 1
    /// table. Main sets dataSet to null after test 1, so this reports and
    /// returns when there is nothing to print.
    /// </summary>
    private static void printTable()
    {
        if (dataSet == null || dataSet.Tables.Count == 0)
        {
            Console.WriteLine("No table to print.");
            return;
        }

        foreach (DataRow row in dataSet.Tables[0].Rows)
        {
            Console.WriteLine( row["id"] + "--" + row["5-Letter Word A"] + " - " + row["5-Letter Word Z"] );
        }
    }
}
类程序
{
公共静态System.Data.DataSet数据集;
公共静态System.Data.DataSet dataSet2;
公共静态System.Data.DataSet dataSet3;
公共静态System.Data.DataSet数据集4;
public static Random rand=new Random();
公共静态整数numorrows=300000;
静态void Main(字符串[]参数)
{
#区域测试1
控制台写入线(“启动”);
控制台。写线(“”);
秒表=新秒表();
watch.Start();
MakeTable();
看,停;
WriteLine(“经过的时间是:“+watch.elapsedmillisons+”毫秒”);
数据集=空;
控制台。写线(“”);
Console.WriteLine(“已完成”);
控制台。写线(“”);
#端区
/*
#区域测试2
控制台写入线(“启动测试2”);
控制台。写线(“”);
watch.Reset();
watch.Start();
MakeTable2();
看,停;
WriteLine(“经过的时间是:“+watch.elapsedmillisons+”毫秒”);
dataSet2=null;
控制台。写线(“”);
控制台写入线(“完成测试2”);
#端区
#区域测试3
控制台。写线(“”);
控制台写入线(“启动测试3”);
控制台。写线(“”);
watch.Reset();
watch.Start();
MakeTable3();
看,停;
WriteLine(“经过的时间是:“+watch.elapsedmillisons+”毫秒”);
dataSet3=null;
控制台。写线(“”);
控制台写入线(“完成测试3”);
#端区
*/
#区域测试4
控制台写入线(“启动测试4”);
控制台。写线(“”);
watch.Reset();
watch.Start();
MakeTable4();
看,停;
WriteLine(“经过的时间是:“+watch.elapsedmillisons+”毫秒”);
dataSet4=null;
控制台。写线(“”);
Console.WriteLine(“完成测试4”);
#端区
//printTable();
控制台。写线(“”);
Console.WriteLine(“按Enter键退出…”);
Console.ReadLine();
}
私有静态void MakeTable()
{
DataTable=新的DataTable(“表1”);
数据列;
数据行;
column=newdatacolumn();
column.DataType=System.Type.GetType(“System.Int32”);
column.ColumnName=“id”;
column.ReadOnly=true;
column.Unique=true;
表.列.添加(列);
对于(int i=65;i
如果您正在寻找原始性能,这样的东西不是最好的选择吗?它完全绕过了 DataTable 代码,而后者似乎是一个不必要的步骤:
/// <summary>
/// Copies a data file to a share on the SQL Server machine and loads it
/// into a table with a T-SQL BULK INSERT, bypassing DataTable entirely.
/// </summary>
/// <param name="fileName">Path of the local file to upload.</param>
/// <param name="tableName">
/// Destination table. It is inserted into the statement text as-is, so it
/// must come from trusted code, never from user input.
/// </param>
void BulkInsertFile(string fileName, string tableName)
{
    FileInfo info = new FileInfo(fileName);
    string name = info.Name;
    string shareDirectory = ""; //the path of the share: \\servername\shareName\
    string serverDirectory = ""; //the local path of the share on the server: C:\shareName\
    File.Copy(fileName, shareDirectory + name);
    // or you could call your method to parse the file and write it to the share directory.

    // The file path has to be embedded in the statement as a string literal,
    // so double any single quotes to keep a name like O'Brien.csv from
    // terminating the literal early.
    string serverPath = (serverDirectory + name).Replace("'", "''");

    using (SqlConnection cnn = new SqlConnection("connectionString"))
    {
        cnn.Open();
        using (SqlCommand cmd = cnn.CreateCommand())
        {
            cmd.CommandText = string.Format("bulk insert {0} from '{1}' with (fieldterminator = ',', rowterminator = '\n')", tableName, serverPath);
            try
            {
                // BULK INSERT returns no result set, so ExecuteNonQuery is the
                // matching call (the original used ExecuteScalar).
                cmd.ExecuteNonQuery();
            }
            catch (SqlException ex)
            {
                MessageBox.Show(ex.Message);
            }
        }
    }
}
是有关批量插入
命令的一些信息。您是否先在没有并行化的情况下尝试了它?绑定IO的并行化操作可能根本没有帮助(并且可能会使事情变得更糟)。但我们离题了;您说的是“解析”太慢了,但您实际上没有显示任何解析。此外,我认为这段代码风险极高:您可能有两个线程同时与同一个SqlBulkCopy对话-这是无效的。这里的底层数据是什么?我认为您可能应该完全忽略DataTable,直接在非缓冲输入和那么:这里的输入是什么?除了考虑@MarcGravel刚才说的内容外,还可以尝试分析应用程序(或者,如果手头没有,请暂停调试器10次,看看它在哪里停止得最多).Profile到事件中以查看更多细节。Marc,我尝试了没有并行化的方式查看此帖子:并行化解析提供了巨大的性能提升。该程序目前实际上没有IO限制,是我的CPU以100%的利用率运行。磁盘驱动器和网络几乎没有得到利用(在任务管理器中).我不认为这里的问题是解析本身,它很简单(只是类型转换)。我可以用锁修复有风险的线程问题,但这会减慢进程
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Collections.Concurrent;
using System.Threading.Tasks;
using System.IO;
using System.Data;
namespace dataTableTesting2
{
    /// <summary>
    /// Four-stage producer/consumer pipeline connected by bounded
    /// BlockingCollections: list files, read them into blocks of lines,
    /// parse the blocks into rows, upload the rows.
    /// </summary>
    class Program
    {
        // const members are implicitly static; "static const" does not compile (CS0504).
        private const int BufferSize = 20; //Each buffer can only contain this many elements at a time
                                           //This limits the total amount of memory
        private const int MaxBlockSize = 100; //Maximum number of lines per block
        private static readonly BlockingCollection<string> buffer1 = new BlockingCollection<string>(BufferSize);
        private static readonly BlockingCollection<string[]> buffer2 = new BlockingCollection<string[]>(BufferSize);
        private static readonly BlockingCollection<Object[][]> buffer3 = new BlockingCollection<Object[][]>(BufferSize);

        /// <summary>
        /// Start Pipelines and wait for them to finish.
        /// </summary>
        static void Main(string[] args)
        {
            TaskFactory f = new TaskFactory(TaskCreationOptions.LongRunning, TaskContinuationOptions.None);
            Task stage0 = f.StartNew(() => PopulateFilesList(buffer1));
            Task stage1 = f.StartNew(() => ReadFiles(buffer1, buffer2));
            Task stage2 = f.StartNew(() => ParseStringBlocks(buffer2, buffer3));
            Task stage3 = f.StartNew(() => UploadBlocks(buffer3) );
            Task.WaitAll(stage0, stage1, stage2, stage3);
            /*
            // Note for more workers on particular stages you can make more tasks for each stage, like the following
            // which populates the file list in 1 task, reads the files into string[] blocks in 1 task,
            // then parses the string[] blocks in 4 concurrent tasks
            // and lastly uploads the info in 2 tasks
            //
            // CAUTION: each stage method calls output.CompleteAdding() in its finally block,
            // so with several workers on one stage the first worker to finish closes the
            // buffer while the others may still be adding. Before enabling this, move the
            // CompleteAdding() call to a point where all workers of that stage have finished.
            TaskFactory f = new TaskFactory(TaskCreationOptions.LongRunning, TaskContinuationOptions.None);
            Task stage0 = f.StartNew(() => PopulateFilesList(buffer1));
            Task stage1 = f.StartNew(() => ReadFiles(buffer1, buffer2));
            Task stage2a = f.StartNew(() => ParseStringBlocks(buffer2, buffer3));
            Task stage2b = f.StartNew(() => ParseStringBlocks(buffer2, buffer3));
            Task stage2c = f.StartNew(() => ParseStringBlocks(buffer2, buffer3));
            Task stage2d = f.StartNew(() => ParseStringBlocks(buffer2, buffer3));
            Task stage3a = f.StartNew(() => UploadBlocks(buffer3) );
            Task stage3b = f.StartNew(() => UploadBlocks(buffer3) );
            Task.WaitAll(stage0, stage1, stage2a, stage2b, stage2c, stage2d, stage3a, stage3b);
            */
        }

        /// <summary>
        /// Adds the filenames to process into the first pipeline
        /// </summary>
        /// <param name="output">Buffer that receives the file names; marked complete when done.</param>
        private static void PopulateFilesList( BlockingCollection<string> output )
        {
            try
            {
                // Write to the parameter, not to the buffer1 field directly, so the
                // method fills whichever buffer the caller passed in.
                output.Add("file1.txt");
                output.Add("file2.txt");
                //...
                output.Add("lastFile.txt");
            }
            finally
            {
                output.CompleteAdding();
            }
        }

        /// <summary>
        /// Takes filenames out of the first pipeline, reads them into string[] blocks
        /// of at most MaxBlockSize lines, and puts them in the second pipeline
        /// </summary>
        /// <param name="input">Buffer of file names to read.</param>
        /// <param name="output">Buffer that receives the line blocks; marked complete when done.</param>
        private static void ReadFiles( BlockingCollection<string> input, BlockingCollection<string[]> output)
        {
            try
            {
                foreach (string file in input.GetConsumingEnumerable())
                {
                    List<string> block = new List<string>(MaxBlockSize);
                    using (StreamReader sr = new StreamReader(file))
                    {
                        string line;
                        while ((line = sr.ReadLine()) != null)
                        {
                            block.Add(line);
                            // Flush at exactly MaxBlockSize lines; the original
                            // "> MaxBlockSize" test produced blocks of MaxBlockSize + 1.
                            if (block.Count >= MaxBlockSize)
                            {
                                output.Add(block.ToArray());
                                block.Clear(); // ToArray() copied the lines, so the list can be reused
                            }
                        }
                    }
                    // Remaining lines of this file (blocks never span two files).
                    if (block.Count > 0)
                    {
                        output.Add(block.ToArray());
                    }
                }
            }
            finally
            {
                output.CompleteAdding();
            }
        }

        /// <summary>
        /// Takes string[] blocks from the second pipeline, for each line, splits them by tabs, and parses
        /// the data, storing each line as an object array into the third pipeline.
        /// </summary>
        /// <param name="input">Buffer of line blocks to parse.</param>
        /// <param name="output">Buffer that receives one Object[][] per input block; marked complete when done.</param>
        private static void ParseStringBlocks( BlockingCollection<string[]> input, BlockingCollection< Object[][] > output)
        {
            try
            {
                foreach (string[] block in input.GetConsumingEnumerable())
                {
                    // A fresh list per block: the original reused a single list without
                    // clearing it, so every output block also contained all rows of the
                    // previous blocks (duplicate uploads and unbounded growth).
                    List<Object[]> result = new List<Object[]>(block.Length);
                    foreach (string line in block)
                    {
                        string[] splitLine = line.Split('\t'); //split line on tab
                        string cityName = splitLine[0];
                        int cityPop = Int32.Parse( splitLine[1] );
                        int cityElevation = Int32.Parse(splitLine[2]);
                        //...
                        result.Add(new Object[] { cityName, cityPop, cityElevation });
                    }
                    output.Add( result.ToArray() );
                }
            }
            finally
            {
                output.CompleteAdding();
            }
        }

        /// <summary>
        /// Takes the data blocks from the third pipeline, and uploads each row to SQL Database
        /// </summary>
        /// <param name="input">Buffer of parsed blocks; each Object[] is one row in stage-2 order.</param>
        private static void UploadBlocks(BlockingCollection<Object[][]> input)
        {
            /*
             * At this point 'block' is an array of object arrays.
             *
             * The block contains up to MaxBlockSize cities.
             *
             * There is one object array for each city.
             *
             * The object array for the city is in the pre-defined order from pipeline stage2
             *
             * You could do a couple of things at this point:
             *
             * 1. declare and initialize a DataTable with the correct column types
             *    then, do the dataTable.Rows.Add( rowValues )
             *    then, use a Bulk Copy Operation to upload the dataTable to SQL
             *    http://msdn.microsoft.com/en-us/library/7ek5da1a
             *
             * 2. Manually perform the sql commands/transactions similar to what
             *    Kevin recommends in this suggestion:
             *    http://stackoverflow.com/questions/1024123/sql-insert-one-row-or-multiple-rows-data/1024195#1024195
             *
             * I've demonstrated the first approach with this code.
             *
             * */
            using (DataTable dataTable = new DataTable())
            {
                // Rows.Add(values) throws when the array has more values than the table
                // has columns, so the columns must exist before any row is added.
                // These three mirror the values produced by ParseStringBlocks; the names
                // are placeholders — map them to the destination table and extend the
                // list together with the parser.
                dataTable.Columns.Add("CityName", typeof(string));
                dataTable.Columns.Add("CityPopulation", typeof(int));
                dataTable.Columns.Add("CityElevation", typeof(int));

                foreach (Object[][] block in input.GetConsumingEnumerable())
                {
                    foreach (Object[] rowValues in block)
                    {
                        dataTable.Rows.Add(rowValues);
                    }
                    //do bulkCopy to upload table containing up to MaxBlockSize cities right here.
                    dataTable.Rows.Clear(); //Remove the rows when you are done uploading, but not the dataTable.
                }
            }
        }
    }
}
/// <summary>
/// Copies a data file to a share on the SQL Server machine and loads it
/// into a table with a T-SQL BULK INSERT, bypassing DataTable entirely.
/// </summary>
/// <param name="fileName">Path of the local file to upload.</param>
/// <param name="tableName">
/// Destination table. It is inserted into the statement text as-is, so it
/// must come from trusted code, never from user input.
/// </param>
void BulkInsertFile(string fileName, string tableName)
{
    FileInfo info = new FileInfo(fileName);
    string name = info.Name;
    string shareDirectory = ""; //the path of the share: \\servername\shareName\
    string serverDirectory = ""; //the local path of the share on the server: C:\shareName\
    File.Copy(fileName, shareDirectory + name);
    // or you could call your method to parse the file and write it to the share directory.

    // The file path has to be embedded in the statement as a string literal,
    // so double any single quotes to keep a name like O'Brien.csv from
    // terminating the literal early.
    string serverPath = (serverDirectory + name).Replace("'", "''");

    using (SqlConnection cnn = new SqlConnection("connectionString"))
    {
        cnn.Open();
        using (SqlCommand cmd = cnn.CreateCommand())
        {
            cmd.CommandText = string.Format("bulk insert {0} from '{1}' with (fieldterminator = ',', rowterminator = '\n')", tableName, serverPath);
            try
            {
                // BULK INSERT returns no result set, so ExecuteNonQuery is the
                // matching call (the original used ExecuteScalar).
                cmd.ExecuteNonQuery();
            }
            catch (SqlException ex)
            {
                MessageBox.Show(ex.Message);
            }
        }
    }
}