Linux:在 CentOS 上,用 stringstream 序列化大型浮点数集合,比用 4 个 pthread 线程分块序列化更快;而在 Windows 上,多线程版本更快
我的任务是优化硬盘上大型浮点数集的序列化。 我最初的做法如下:Linux 在centOS上,大型浮点数集的stringstream序列化比序列化块的4个pthread ts快。在Windows上线程速度更快,linux,multithreading,pthreads,Linux,Multithreading,Pthreads,我的任务是优化硬盘上大型浮点数集的序列化。 我最初的做法如下: class StringStreamDataSerializer { public: void serializeRawData(const vector<float>& data); void saveToFileStream(std::fstream& file); private: stringstream _stringStream; }; void StringStreamDataSeria
// Accumulates floats as decimal text in an in-memory stream and later writes
// that text to a file stream.
//
// Values are inserted back to back with no separator, using the stream's
// current formatting state, so 1.5f followed by 2.25f yields "1.52.25".
class StringStreamDataSerializer
{
public:
    // Appends the text form of every element of `data` to the internal buffer.
    // May be called repeatedly; earlier content is kept.
    void serializeRawData(const std::vector<float>& data);

    // Writes everything buffered so far to `file`, then closes `file`.
    // The internal buffer is left unchanged.
    void saveToFileStream(std::fstream& file);

private:
    std::stringstream _stringStream;
};

void StringStreamDataSerializer::serializeRawData(const std::vector<float>& data)
{
    for (const float value : data)
        _stringStream << value;
}

void StringStreamDataSerializer::saveToFileStream(std::fstream& file)
{
    // Stream the std::string itself: going through c_str() made operator<<
    // rescan the entire buffer with strlen() and would stop at an embedded NUL.
    file << _stringStream.str();
    file.close();
}
class StringStreamDataSerializer
{
public:
void serializeRawData(const vector<float>& data);
void saveToFileStream(std::fstream& file);
private:
stringstream _stringStream;
};
void StringStreamDataSerializer::serializeRawData(const vector<float>& data)
{
for (float currentFloat : data)
_stringStream << currentFloat;
}
二进制浮点数与十进制文本输出之间的转换开销非常大。如果性能是关键问题,应当以二进制形式序列化数据(可以先做字节序(endianness)转换,这样至少能在使用 IEEE 754 的系统之间保持互操作性)。
至于多线程版本在 GNU/Linux 上性能不佳,这看起来是一个已知问题:在多线程模式下,stringstream
目前使用一个进程范围的、竞争激烈的引用计数器来处理区域设置(locale)。二进制浮点数与十进制文本输出之间的转换开销非常大。如果性能是关键问题,应当以二进制形式序列化数据(可以先做字节序(endianness)转换,这样至少能在使用 IEEE 754 的系统之间保持互操作性)。
至于多线程版本在 GNU/Linux 上性能不佳,这看起来是一个已知问题:在多线程模式下,stringstream
目前使用一个进程范围的、竞争激烈的引用计数器来处理区域设置(locale)。谢谢你的回复!我想我理解这个问题了。我会尝试实现并使用一个 ftoa 方法:它要么接收一个 locale 对象(可能需要对其加锁),要么完全不使用 locale。感谢您的回复!我想我理解这个问题了。我会尝试实现并使用一个 ftoa 方法:它要么接收一个 locale 对象(可能需要对其加锁),要么完全不使用 locale。
// Work description handed to one serializeLocal() call through its void*
// context argument. SImplePThreadedSerializer::serializeRawData allocates
// these with malloc(), so the struct is kept a plain aggregate.
struct st_args
{
// Input values; serializeLocal() only reads through this pointer.
const vector<float>* data;
// Index of the first element to convert.
size_t from;
// One past the index of the last element to convert.
size_t to;
// Index into the global `outputs` array that receives the generated text.
size_t segment;
} ;
// One text buffer per worker segment; serializeLocal() appends to
// outputs[segment]. The size 4 matches N in
// SImplePThreadedSerializer::serializeRawData.
string outputs[4];
// Not referenced by any code visible in this listing.
// NOTE(review): confirm whether this mutex is still needed.
std::mutex g_display_mutex;
void serializeLocal(void *context)
{
struct st_args *readParams = (st_args*)context;
for (auto i = readParams->from; i < readParams->to; i++)
{
string currentFloat = std::to_string( readParams->data->at(i));
currentFloat.erase(currentFloat.find_last_not_of('0') + 1,
std::string::npos);
outputs[readParams->segment] += currentFloat;
}
}
void SImplePThreadedSerializer::serializeRawData(const vector<float>& data)
{
const int N = 4;
size_t totalFloats = data.size();
st_args* seg;
pthread_t* chunk;
chunk = (pthread_t *) malloc(N*sizeof(pthread_t));
seg = (st_args *) malloc(N*sizeof(st_args));
size_t from = 0;
for(int i = 0; i < N; i++)
{
seg[i].from = 0;
seg[i].data = &data;
}
int i = 0;
for (; i < N - 1; ++i)
{
seg[i].from = from;
seg[i].to = seg[i].from + totalFloats / N;
seg[i].segment = i;
pthread_create(&chunk[i], NULL, (void *(*)(void *)) serializeLocal,
(void *) &(seg[i]));
from += totalFloats / N;
}
seg[i].from = from;
seg[i].to = totalFloats;
seg[i].segment = i;
pthread_create(&chunk[i], NULL, (void *(*)(void *)) serializeLocal, (void *)
&(seg[i]));
size_t totalBuffered = 0;
for (int k = 0; k < N; k++)
{
pthread_join(chunk[k], NULL);
totalBuffered += outputs[k].size();
}
str.reserve(totalBuffered);
for (int k = 0; k < N; k++)
{
str+= outputs[k];
}
free(chunk);
free(seg);
}
// Appends the text form of data[from .. to) to `output`.
//
// Each value is formatted with std::to_string and stripped of trailing '0'
// characters, so 1.5f becomes "1.5" and 2.0f becomes "2." (the decimal point
// is kept). Values are appended back to back with no separator. `to` is
// clamped to data.size(), so an oversized range cannot read past the end of
// the vector. Existing content of `output` is preserved.
static void serializeChunk(std::string& output, const std::vector<float>& data,
                           size_t from, size_t to)
{
    const size_t last = to < data.size() ? to : data.size();

    // Work on a string object local to this call. Callers keep one output
    // string per worker in a small array, and adjacent string objects can
    // share a cache line, which makes concurrent appends slow each other down.
    std::string buffer;
    buffer.swap(output);

    for (size_t i = from; i < last; ++i)
    {
        std::string text = std::to_string(data[i]);
        // Trim the zero padding that std::to_string adds after the decimals.
        text.erase(text.find_last_not_of('0') + 1);
        buffer += text;
    }

    output.swap(buffer);
}
// Converts `data` to text with N = 4 std::thread workers and appends the
// result to `str`.
//
// The input is split into N contiguous ranges; the last range also takes the
// remainder of data.size() / N. Each worker runs serializeChunk() on its range
// into its own string, and the strings are appended to `str` in order, so the
// text keeps the element order of `data`.
void SimpleMultiThreadedSerializer::serializeRawData(const std::vector<float>&
data)
{
    const int N = 4; // say, 4 CPUs.
    std::thread workers[N];
    std::string chunkText[N];
    const size_t totalFloats = data.size();
    const size_t perThread = totalFloats / N;

    try
    {
        for (int i = 0; i < N; ++i)
        {
            const size_t from = static_cast<size_t>(i) * perThread;
            const size_t to = (i == N - 1) ? totalFloats : from + perThread;
            // std::thread copies its arguments. Passing `data` directly would
            // duplicate the whole vector once per worker, so hand over a
            // reference wrapper instead.
            workers[i] = std::thread(serializeChunk, std::ref(chunkText[i]),
                                     std::cref(data), from, to);
        }
    }
    catch (...)
    {
        // Destroying a joinable std::thread calls std::terminate, so wait for
        // the workers that did start before reporting the failure.
        for (int i = 0; i < N; ++i)
        {
            if (workers[i].joinable())
                workers[i].join();
        }
        throw;
    }

    for (int i = 0; i < N; ++i)
        workers[i].join();

    size_t totalBuffered = 0;
    for (int i = 0; i < N; ++i)
        totalBuffered += chunkText[i].size();

    // `str` is appended to, so leave room for what it already holds as well.
    str.reserve(str.size() + totalBuffered);
    for (int i = 0; i < N; ++i)
        str += chunkText[i];
}