使用fscanf读取整数的变量数
我有超过100000个以下格式的csv文件:使用fscanf读取整数的变量数,c,performance,scanf,strsep,C,Performance,Scanf,Strsep,我有超过100000个以下格式的csv文件: 1,1,5,1,1,1,0,0,6,6,1,1,1,0,1,0,13,4,7,8,18,20,,,,,,,,,,,,,,,,,,,,,, 1,1,5,1,1,1,0,1,6,5,1,1,1,0,1,0,4,7,8,18,20,,,,,,,,,,,,,,,,,,,,,,, 1,1,5,1,1,1,0,2,6,5,1,1,1,0,1,0,4,7,8,18,20,,,,,,,,,,,,,,,,,,,,,,, 1,1,5,1,1,1,0,3,6,5,1,1,
1,1,5,1,1,1,0,0,6,6,1,1,1,0,1,0,13,4,7,8,18,20,,,,,,,,,,,,,,,,,,,,,,
1,1,5,1,1,1,0,1,6,5,1,1,1,0,1,0,4,7,8,18,20,,,,,,,,,,,,,,,,,,,,,,,
1,1,5,1,1,1,0,2,6,5,1,1,1,0,1,0,4,7,8,18,20,,,,,,,,,,,,,,,,,,,,,,,
1,1,5,1,1,1,0,3,6,5,1,1,1,0,1,0,13,4,7,8,20,,,,,,,,,,,,,,,,,,,,,,,
1,1,5,1,1,1,0,4,6,5,1,1,1,0,1,0,13,4,7,8,20,,,,,,,,,,,,,,,,,,,,,,,
1,1,5,1,1,1,0,5,6,4,1,0,1,0,1,0,4,8,18,20,,,,,,,,,,,,,,,,,,,,,,,,
1,1,5,1,1,1,0,6,6,5,1,1,1,0,1,0,4,7,8,18,20,,,,,,,,,,,,,,,,,,,,,,,
1,1,5,1,1,1,0,7,6,5,1,1,1,0,1,0,13,4,7,8,20,,,,,,,,,,,,,,,,,,,,,,,
1,1,5,1,1,1,0,8,6,5,1,1,1,0,1,0,13,4,7,8,20,,,,,,,,,,,,,,,,,,,,,,,
1,1,5,1,1,2,0,0,12,12,1,2,4,1,1,0,13,4,7,8,18,20,21,25,27,29,31,32,,,,,,,,,,,,,,,,
我只需要10号字段和17号字段,10号字段是计数器,指示有多少个字段
从字段17开始存储整数,即我需要的是:
6,13,4,7,8,18,20
5,4,7,8,18,20
5,4,7,8,18,20
5,13,4,7,8,20
5,13,4,7,8,20
4,4,8,18,20
5,4,7,8,18,20
5,13,4,7,8,20
5,13,4,7,8,20
12,13,4,7,8,18,20,21,25,27,29,31,32
需要读取的整数个数最多为28。在C++中,用getline可以很容易地实现这一点,但是,从我以前的经验来看,
我需要处理100000多个这样的文件,每个文件可能有300000~400000行这样的行。
因此,使用getline读入数据并构建vector<vector<int>>对我来说可能会有严重的性能问题。我尝试使用fscanf来实现这一点:
// Asker's original attempt: for every line, read the element count stored in
// field 10, skip ahead to field 17 and read that many integers.
// NOTE(review): the loop condition tests `stream`, while every call in the body
// reads from `fstream`. feof() only reports end-of-file after a read has
// already hit it, and none of the fscanf() return values are checked.
while (!feof(stream)){
// Skip fields 1-9 and store field 10 (the element count) in MyCounter.
fscanf(fstream,"%*d,%*d,%*d,%*d,%*d,%*d,%*d,%*d,%*d,%d",&MyCounter);
// NOTE(review): the previous format ends with "%d", so the ',' that follows
// field 10 is still unread when this call begins with "%*d".
fscanf(fstream,"%*d,%*d,%*d,%*d,%*d,%*d"); // skip to column 17
// Read MyCounter integers into MyIntArr.
// NOTE(review): a bare "%d" does not consume the ',' separating the values.
for (int i=0;i<MyCounter;i++){
fscanf(fstream,"%d",&MyIntArr[i]);
}
fscanf(fstream,"%*s"); // to finish the line
}
while (!feof(stream)){
fscanf(fstream,"%*d,%*d,%*d,%*d,%*d,%*d,%*d,%*d,%*d,%d",&MyCounter);
fscanf(fstream,"%*d,%*d,%*d,%*d,%*d,%*d"); // 跳到第17列
for (int i=0;i<MyCounter;i++){ …
因此,每行最多有43个数字。即使是64位整数,每个数字最多也只占21个字符,因此1024字节的缓冲区对于最长946字节的一行来说是足够的(只要没有空格)
用于跳到所需列的辅助函数
/**
 * Locates the n-th ',' in the NUL-terminated string s.
 *
 * Returns a pointer to the n-th comma when the string contains at least n
 * commas, otherwise a pointer to the terminating NUL. When s is null or n is 0
 * the argument s is returned unchanged.
 */
const char *find_nth_comma(const char *s, int n) {
    /* Nothing to search for: hand the input back as-is. */
    if (!s || !n) {
        return s;
    }
    const char *cur = s;
    for (; *cur != '\0'; ++cur) {
        /* Count down on every comma; stop on the one that reaches zero. */
        if (*cur == ',' && --n == 0) {
            return cur;
        }
    }
    /* Ran out of string first: cur addresses the terminating NUL. */
    return cur;
}
因此,在循环中,跳到第10列查找第一个感兴趣的数字,然后跳到第17列开始读取其余数字。完成的循环如下所示:
/* Process stdin one line at a time: field 10 holds the number of values to
 * read, and the values themselves start at field 17. */
while (fgets(line, sizeof(line), stdin) != NULL) {
    /* The 9th comma is the one immediately before field 10. */
    const char *cursor = find_nth_comma(line, 9);
    assert(cursor && *cursor);

    char *stop;
    MyCounter = strtol(cursor + 1, &stop, 10);
    assert(*stop == ',');

    /* Six more commas, counted from the start of field 11, lead to the comma
     * immediately before field 17. */
    cursor = find_nth_comma(stop + 1, 6);
    assert(cursor && *cursor);

    int idx = 0;
    while (idx < MyCounter) {
        MyIntArray[idx] = strtol(cursor + 1, &stop, 10);
        /* Every value must end at a comma; only the last requested value may
         * instead end at the end of the string or at whitespace. */
        assert((*stop == ',') ||
               ((idx == MyCounter - 1) &&
                (*stop == '\0' || isspace(*stop & 0xFF))));
        ++idx;
        cursor = stop;  /* the next value starts right after this delimiter */
    }
}
while (fgets(line, sizeof(line), stdin) != NULL) {
const char *p = find_nth_comma(line, 9);
char *end;
assert(p && *p);
MyCounter = strtol(p+1, &end, 10);
assert(*end == ',');
p = find_nth_comma(end+1, 6);
assert(p && *p);
for (int i = 0; i < MyCounter; ++i, p = end) { …
这种方法也适用于mmap
解决方案。fgets
将被一个指向文件中要处理的下一行的函数所取代。find_nth_comma
需要修改以检测行尾/文件尾,而不是依赖NUL终止的字符串。strtol
将改用一个同样能检测行尾或文件结尾的自定义函数。(此类更改的目的是删除任何需要复制数据的代码,而避免复制正是采用mmap
方法的动机。)
通过并行处理,可以同时解析文件的多个部分。但是,让不同的线程处理不同的文件,然后在处理完所有文件后整理结果就足够了。为了最大限度地提高性能,您应该使用mmap
或等效方法将文件映射到内存中,并用专门编写的代码解析文件,通常是用指针逐个字符扫描,检查'\n'
和/或'\r'
以判断记录是否结束,并即时转换数字、存入数组。棘手的部分包括:
- 如何分配或以其他方式处理目标阵列
- 这些字段都是数字吗?是整数吗
- 最后一条记录是否由换行符终止?您可以在
mmap
调用后轻松检查此情况。优点是,您只需要在遇到换行符序列时检查文件结尾
读取运行时确定的整数数的最简单方法可能是指向较长格式字符串的右部分。换句话说,我们可以有一个包含28个%d,
说明符的格式字符串,但指向字符串末尾之前的第n个,并将该指针作为scanf()的格式字符串传递
作为一个简单的例子,考虑从6的最大值接受3个整数:
"%d,%d,%d,%d,%d,%d,"
          ^
箭头显示用作模式参数的字符串指针
这是一个完整的示例;使用gcc -O3
构建时,迭代100万次(共1000万行)大约需要8秒。更新输入字符串指针的机制有点复杂,而从文件流读取时显然不需要这样做。我跳过了对nfields的检查。
最终我使用内存映射文件解决了我的问题(此解决方案是我上一个问题“读取大CSV文件时的性能问题”的副产品)
自从我在MS Windows上工作,所以我使用Stephan Brumme的“便携式内存映射C++类”
因为我不需要处理大于2GB的文件,所以我的实现更简单。
对于超过2GB的文件,请访问web以了解如何处理
请在下面找到我的代码:
// may tried RandomAccess/SequentialScan
MemoryMapped MemFile(FilterBase.BaseFileName, MemoryMapped::WholeFile, MemoryMapped::RandomAccess);
// point to start of memory file
char* start = (char*)MemFile.getData();
// dummy in my case
char* tmpBuffer = start;
// looping counter
uint64_t i = 0;
// pre-allocate result vector
MyVector.resize(300000);
// Line counter
int LnCnt = 0;
//no. of field
int NumOfField=43;
//delimiter count, num of field + 1 since the leading and trailing delimiter are virtual
int DelimCnt=NoOfField+1;
//Delimiter position. May use new to allocate at run time
// or even use vector of integer
// This is to store the delimiter position in each line
// since the position is relative to start of file. if file is extremely
// large, may need to change from int to unsigner, long or even unsigned long long
static int DelimPos[DelimCnt];
// Max number of field need to read usually equal to NumOfField, can be smaller, eg in my case, I only need 4 fields
// from first 15 field, in this case, can assign 15 to MaxFieldNeed
int MaxFieldNeed=NumOfField;
// keep track how many comma read each line
int DelimCounter=0;
// define field and line seperator
char FieldDelim=',';
char LineSep='\n';
// 1st field, "virtual Delimiter" position
DelimPos[CommaCounter]=-1
DelimCounter++;
// loop through the whole memory field, 1 and only once
for (i = 0; i < MemFile.size();i++)
{
// grab all position of delimiter in each line
if ((MemFile[i] == FieldDelim) && (DelimCounter<=MaxFieldNeed)){
DelimPos[DelimCounter] = i;
DelimCounter++;
};
// grab all values when end of line hit
if (MemFile[i] == LineSep) {
// no need to use if (DelimCounter==NumOfField) just assign anyway, waste a little bit
// memory in integer array but gain performance
DelimPos[DelimCounter] = i;
// I know exactly what the format is and what field(s) I want
// a more general approach (as a CSV reader) may put all fields
// into vector of vector of string
// With *EFFORT* one may modify this piece of code so that it can parse
// different format at run time eg similar to:
// fscanf(fstream,"%d,%f....
// also, this piece of code cannot handle complex CSV e.g.
// Peter,28,157CM
// John,26,167CM
// "Mary,Brown",25,150CM
MyVector.StrField = string(strat+DelimPos[0] + 1, strat+DelimPos[1] - 1);
MyVector.IntField = strtol(strat+DelimPos[3] + 1,&tmpBuffer,10);
MyVector.IntField2 = strtol(strat+DelimPos[8] + 1,&tmpBuffer,10);
MyVector.FloatField = strtof(start + DelimPos[14] + 1,&tmpBuffer);
// reset Delim counter each line
DelimCounter=0
// previous line seperator treat as first delimiter of next line
DelimPos[DelimCounter] = i;
DelimCounter++
LnCnt++;
}
}
MyVector.resize(LnCnt);
MyVector.shrink_to_fit();
MemFile.close();
};
例如处理空字段、执行计算等。
使用这段代码,我可以在57秒内处理2100个文件(6.3 GB)!!!
(我在其中硬编码了CSV格式,在上面的例子中每行仅获取4个值)。
稍后我会修改此代码以处理本问题中的格式。
Thx所有在这个问题上帮助我的人。“因此,使用Getline读取数据并构建向量>可能会给我带来严重的性能问题。我尝试使用fscanf来实现这一点”——假设fscanf()
可以做一些std::getline
可以防止的事情。你为什么这么认为?另外,你最后的问题是征求意见,这通常被认为是离题的。fscanf(fstream,"%*d",&MyIntArr[i]);
是错误的。删除*。
我会用fgets读取一行,然后跳过n个逗号(这样就不需要扫描整数),再从那里开始扫描或分词。while (!feof(stream)) {…} 这是什么环境?文件是固定的吗?还是需要不止一次这样做?我会考虑Perl或mmap(2)。无论如何,我都可能会手动遍历这一行,并使用strtol(3)在正确的位置执行转换。为了保证正确性,您应该先对传给isspace()的char
参数进行强制转换
,以避免未定义的行为:isspace((unsigned char)*p)
。还有一个潜在问题:find_nth_comma…
/* Sample data: ten CSV lines in the format under discussion. */
char const *const input =
    "1,1,5,1,1,1,0,0,6,6,1,1,1,0,1,0,13,4,7,8,18,20,,,,,,,,,,,,,,,,,,,,,,\n"
    "1,1,5,1,1,1,0,1,6,5,1,1,1,0,1,0,4,7,8,18,20,,,,,,,,,,,,,,,,,,,,,,,\n"
    "1,1,5,1,1,1,0,2,6,5,1,1,1,0,1,0,4,7,8,18,20,,,,,,,,,,,,,,,,,,,,,,,\n"
    "1,1,5,1,1,1,0,3,6,5,1,1,1,0,1,0,13,4,7,8,20,,,,,,,,,,,,,,,,,,,,,,,\n"
    "1,1,5,1,1,1,0,4,6,5,1,1,1,0,1,0,13,4,7,8,20,,,,,,,,,,,,,,,,,,,,,,,\n"
    "1,1,5,1,1,1,0,5,6,4,1,0,1,0,1,0,4,8,18,20,,,,,,,,,,,,,,,,,,,,,,,,\n"
    "1,1,5,1,1,1,0,6,6,5,1,1,1,0,1,0,4,7,8,18,20,,,,,,,,,,,,,,,,,,,,,,,\n"
    "1,1,5,1,1,1,0,7,6,5,1,1,1,0,1,0,13,4,7,8,20,,,,,,,,,,,,,,,,,,,,,,,\n"
    "1,1,5,1,1,1,0,8,6,5,1,1,1,0,1,0,13,4,7,8,20,,,,,,,,,,,,,,,,,,,,,,,\n"
    "1,1,5,1,1,2,0,0,12,12,1,2,4,1,1,0,13,4,7,8,18,20,21,25,27,29,31,32,,,,,,,,,,,,,,,,\n";

#include <stdio.h>

#define SKIP_FIELD "%*[^,],"
#define DECIMAL_FIELD "%d,"

/**
 * Parses every line of `input` and adds up the values it extracts.
 *
 * For each line, field 10 gives the number of integers to read, and that many
 * integers are then read starting at field 17. A run-time number of values is
 * read with a single sscanf() call by pointing into the tail of a format
 * string that holds the maximum number of "%d," specifiers.
 *
 * Returns the sum of all values read, or -1 as soon as a line cannot be parsed
 * (including a count in field 10 outside 0..28).
 */
int read()
{
    enum { kMaxValues = 28 };   /* must match the number of specifiers below */
    int sum = 0;                /* just to make sure results are used */

    for (char const *s = input; *s; ) {
        int nfields = 0;
        int n = -1;             /* stays -1 if "%n" is never reached */
        int array[kMaxValues];

        int m = sscanf(s,
                       /* field 0 is missing */
                       SKIP_FIELD SKIP_FIELD SKIP_FIELD
                       SKIP_FIELD SKIP_FIELD SKIP_FIELD
                       SKIP_FIELD SKIP_FIELD SKIP_FIELD
                       DECIMAL_FIELD /* field 10 */
                       SKIP_FIELD SKIP_FIELD SKIP_FIELD
                       SKIP_FIELD SKIP_FIELD SKIP_FIELD
                       "%n",
                       &nfields,
                       &n);
        /* m == 1 only proves field 10 was converted; n >= 0 proves the skips
         * up to field 17 matched as well. The count must also fit both the
         * array and the format string before it is used as an offset. */
        if (m != 1 || n < 0 || nfields < 0 || nfields > kMaxValues) {
            return -1;
        }
        s += n;

        static const char fieldchars[] = DECIMAL_FIELD;
        static const size_t fieldsize = sizeof fieldchars - 1; /* ignore terminating null */
        static const char *const parse_entries =
            DECIMAL_FIELD DECIMAL_FIELD DECIMAL_FIELD DECIMAL_FIELD
            DECIMAL_FIELD DECIMAL_FIELD DECIMAL_FIELD DECIMAL_FIELD
            DECIMAL_FIELD DECIMAL_FIELD DECIMAL_FIELD DECIMAL_FIELD
            DECIMAL_FIELD DECIMAL_FIELD DECIMAL_FIELD DECIMAL_FIELD
            DECIMAL_FIELD DECIMAL_FIELD DECIMAL_FIELD DECIMAL_FIELD
            DECIMAL_FIELD DECIMAL_FIELD DECIMAL_FIELD DECIMAL_FIELD
            DECIMAL_FIELD DECIMAL_FIELD DECIMAL_FIELD DECIMAL_FIELD;
        /* Skip the leading specifiers so that exactly nfields of them remain. */
        const char *const line_parse = parse_entries + (kMaxValues - nfields) * fieldsize;

        /* now read nfields (max 28) */
        m = sscanf(s,
                   line_parse,
                   &array[0], &array[1], &array[2], &array[3],
                   &array[4], &array[5], &array[6], &array[7],
                   &array[8], &array[9], &array[10], &array[11],
                   &array[12], &array[13], &array[14], &array[15],
                   &array[16], &array[17], &array[18], &array[19],
                   &array[20], &array[21], &array[22], &array[23],
                   &array[24], &array[25], &array[26], &array[27]);
        if (m != nfields) {
            return -1;
        }

        /* Advance to the next line: skip the rest of this one, then the line
         * break and any other whitespace. Scanning by hand cannot fail, so no
         * stale byte count is ever added to s. */
        while (*s != '\0' && *s != '\n') {
            ++s;
        }
        while (*s == ' ' || (*s >= '\t' && *s <= '\r')) {
            ++s;
        }

        /* use the results */
        for (int i = 0; i < nfields; ++i) {
            sum += array[i];
        }
    }
    return sum;
}

#undef SKIP_FIELD
#undef DECIMAL_FIELD
/* Benchmark driver: parses the sample data one million times. The results are
 * added and subtracted alternately so that they are actually used; the exit
 * status is non-zero when the accumulated total is not zero. */
int main()
{
    int total = 0;
    for (int iter = 0; iter < 1000000; ++iter) {
        /* odd iterations add, even iterations subtract */
        const int sign = (iter & 1) ? 1 : -1;
        total += read() * sign;
    }
    return total != 0;
}
// may tried RandomAccess/SequentialScan
MemoryMapped MemFile(FilterBase.BaseFileName, MemoryMapped::WholeFile, MemoryMapped::RandomAccess);
// point to start of memory file
char* start = (char*)MemFile.getData();
// dummy in my case
char* tmpBuffer = start;
// looping counter
uint64_t i = 0;
// pre-allocate result vector
MyVector.resize(300000);
// Line counter
int LnCnt = 0;
//no. of field
int NumOfField=43;
//delimiter count, num of field + 1 since the leading and trailing delimiter are virtual
int DelimCnt=NoOfField+1;
//Delimiter position. May use new to allocate at run time
// or even use vector of integer
// This is to store the delimiter position in each line
// since the position is relative to start of file. if file is extremely
// large, may need to change from int to unsigner, long or even unsigned long long
static int DelimPos[DelimCnt];
// Max number of field need to read usually equal to NumOfField, can be smaller, eg in my case, I only need 4 fields
// from first 15 field, in this case, can assign 15 to MaxFieldNeed
int MaxFieldNeed=NumOfField;
// keep track how many comma read each line
int DelimCounter=0;
// define field and line seperator
char FieldDelim=',';
char LineSep='\n';
// 1st field, "virtual Delimiter" position
DelimPos[CommaCounter]=-1
DelimCounter++;
// loop through the whole memory field, 1 and only once
for (i = 0; i < MemFile.size();i++)
{
// grab all position of delimiter in each line
if ((MemFile[i] == FieldDelim) && (DelimCounter<=MaxFieldNeed)){
DelimPos[DelimCounter] = i;
DelimCounter++;
};
// grab all values when end of line hit
if (MemFile[i] == LineSep) {
// no need to use if (DelimCounter==NumOfField) just assign anyway, waste a little bit
// memory in integer array but gain performance
DelimPos[DelimCounter] = i;
// I know exactly what the format is and what field(s) I want
// a more general approach (as a CSV reader) may put all fields
// into vector of vector of string
// With *EFFORT* one may modify this piece of code so that it can parse
// different format at run time eg similar to:
// fscanf(fstream,"%d,%f....
// also, this piece of code cannot handle complex CSV e.g.
// Peter,28,157CM
// John,26,167CM
// "Mary,Brown",25,150CM
MyVector.StrField = string(strat+DelimPos[0] + 1, strat+DelimPos[1] - 1);
MyVector.IntField = strtol(strat+DelimPos[3] + 1,&tmpBuffer,10);
MyVector.IntField2 = strtol(strat+DelimPos[8] + 1,&tmpBuffer,10);
MyVector.FloatField = strtof(start + DelimPos[14] + 1,&tmpBuffer);
// reset Delim counter each line
DelimCounter=0
// previous line seperator treat as first delimiter of next line
DelimPos[DelimCounter] = i;
DelimCounter++
LnCnt++;
}
}
MyVector.resize(LnCnt);
MyVector.shrink_to_fit();
MemFile.close();
};
if (MemFile[i] == LineSep) {
}