C 优化O(n^2)到O(n)(未排序的字符串)
我这里有一个优化问题。我想让这段代码在O(n)中运行,我已经尝试了几个小时了 字节数组c包含一个字符串,e包含相同的字符串,但已排序。Int数组nc和ne包含字符串中的索引,例如C 优化O(n^2)到O(n)(未排序的字符串),c,arrays,sorting,optimization,C,Arrays,Sorting,Optimization,我这里有一个优化问题。我想让这段代码在O(n)中运行,我已经尝试了几个小时了 字节数组c包含一个字符串,e包含相同的字符串,但已排序。Int数组nc和ne包含字符串中的索引,例如 c: s l e e p i n g nc: 0 0 0 1 0 0 0 0 e: e e g i l n p s ne: 0 1 0 0 0 0 0 0 现在的问题是get_next_索引是线性的——有没有办法解决这个问题 void decode_block(int p) { BYTE xj = c[p]
c:
s l e e p i n g
nc:
0 0 0 1 0 0 0 0
e:
e e g i l n p s
ne:
0 1 0 0 0 0 0 0
现在的问题是get_next_index是线性的——有没有办法解决这个问题?
/*
 * Decodes one block and writes it to stdout.
 *
 * Starting from the symbol c[p] and its occurrence number nc[p], each
 * iteration stores the current symbol in result, locates the position q in
 * c/nc that holds the same (symbol, occurrence number) pair, and continues
 * with the pair found at e[q]/ne[q].
 *
 * Reads the file-level c, nc, e, ne, result and block_size.
 * Every call to get_next_index is a linear scan, so the whole decode is
 * quadratic in block_size.
 */
void decode_block(int p) {
    BYTE symbol = c[p];
    int rank = nc[p];
    int written = 0;

    while (written < block_size) {
        result[written] = symbol;

        /* Position of the (symbol, rank) pair inside c/nc. */
        const int q = get_next_index(symbol, rank, c, nc);
        symbol = e[q];
        rank = ne[q];

        written++;
    }

    fwrite(result, sizeof (BYTE), block_size, stdout);
    fflush(stdout);
}
/*
 * Returns the first index i for which c[i] == xj and nc[i] == nxj.
 *
 * The scan has no upper bound: the pair must be present in the arrays,
 * otherwise the loop walks past their end.
 */
int get_next_index(BYTE xj, int nxj, BYTE* c, int* nc) {
    int pos;
    for (pos = 0; !(c[pos] == xj && nc[pos] == nxj); ++pos) {
        /* keep scanning until both the symbol and its rank match */
    }
    return pos;
}
接下来,我必须重复以下步骤block_size次(block_size = c的长度 = nc的长度 = e的长度 = ne的长度):
- 将当前的xj存储在result中
- 查找满足c[i]==xj(且nc[i]==nxj)的索引i
- 令xj=e[i](nxj=ne[i])
首先,遍历已排序的字符串并填充一个查找表,该表允许您查找给定字符的第一个列表元素。例如,您的查找表可能看起来像
std::array<std::list<size_t>, 256>。
扫描xj和nxj一次,然后构建一个查找表。
这是两次O(n)操作。
最合理的方法是使用二叉树,根据xj或nxj的值进行排序。
节点中存放您要查找的索引。这会将每次查找减少到O(lg n)。
以下是我对Burrows-Wheeler变换的完整实现:
// State shared with bwtCompare: the comparator signature expected by qsort
// has no user-data argument, so the buffer and length are passed via globals.
u8* bwtCompareBuf;
u32 bwtCompareLen;

// qsort comparator. v1 and v2 each point to a u32 offset into bwtCompareBuf;
// the bwtCompareLen bytes starting at those offsets are compared
// lexicographically. Returns -1, +1 or 0.
s32 bwtCompare( const void* v1, const void* v2 )
{
    const u32 lhsStart = *static_cast<const u32*>( v1 );
    const u32 rhsStart = *static_cast<const u32*>( v2 );
    const u8* lhs = bwtCompareBuf + lhsStart;
    const u8* rhs = bwtCompareBuf + rhsStart;

    for ( u32 remaining = bwtCompareLen; remaining != 0; --remaining, ++lhs, ++rhs )
    {
        if ( *lhs != *rhs )
            return ( *lhs < *rhs ) ? -1 : +1;
    }
    return 0;
}
// Burrows-Wheeler transform of inputBuffer[0..len), performed in place.
//
// inputBuffer  in: the block to transform; out: the last column of the
//              sorted rotation matrix.
// len          number of bytes in the block.
// first        out: row of the sorted matrix that holds the rotation
//              starting at input offset 1 (offset 0 when len == 1). This is
//              the start index bwtDecode expects. Left untouched when
//              len == 0.
//
// Uses the file-level bwtCompareBuf / bwtCompareLen to hand the work buffer
// to bwtCompare; they are only meaningful while the sort is running.
void bwtEncode( u8* inputBuffer, u32 len, u32& first )
{
    if ( len == 0 ) return;

    // Two back-to-back copies of the input, so that every rotation is a
    // contiguous run of len bytes starting at its offset. Heap-allocated:
    // a block-sized alloca can overflow the stack.
    u8* doubled = new u8[static_cast<size_t>( len ) * 2];
    memcpy( doubled, inputBuffer, len );
    memcpy( doubled + len, inputBuffer, len );

    u32* indices = new u32[len];
    for ( u32 i = 0; i < len; i++ ) indices[i] = i;

    bwtCompareBuf = doubled;
    bwtCompareLen = len;
    qsort( indices, len, sizeof( u32 ), bwtCompare );

    // With a single byte the only rotation starts at 0, so wrap the offset.
    const u32 firstRotation = 1 % len;
    for ( u32 i = 0; i < len; i++ )
    {
        const u32 start = indices[i];
        // Last column = the byte cyclically preceding the rotation start.
        // start + len - 1 is that byte's position in the second copy.
        inputBuffer[i] = doubled[static_cast<size_t>( start ) + len - 1];
        if ( start == firstRotation ) first = i;
    }

    delete[] indices;
    delete[] doubled;
}
// Inverse Burrows-Wheeler transform of inputBuffer[0..len), in place.
//
// inputBuffer  in: the transformed block; out: the decoded bytes.
// len          number of bytes in the block.
// first        start row as produced by bwtEncode; must be < len.
//
// If len is 0 or first is out of range the buffer is left unchanged.
// Assumes u8 is an unsigned 8-bit type, so its values index the 256-entry
// tables directly.
void bwtDecode( u8* inputBuffer, u32 len, u32 first )
{
    // An out-of-range start row would read past the end of the block.
    if ( len == 0 || first >= len ) return;

    // The sorted column holds the same bytes as the input, in order. A byte
    // 'c' at input position i therefore sits in the sorted column at:
    //   (number of input bytes smaller than c)
    // + (number of occurrences of c before position i in the input).

    // Count the occurrences of each byte value.
    u32 freq[256] = { 0 };
    u32 count[256] = { 0 };
    for ( u32 i = 0; i < len; i++ )
        freq[inputBuffer[i]]++;

    // Exclusive prefix sum: afterwards freq[v] is the number of input bytes
    // whose value is smaller than v.
    u32 sum = 0;
    for ( u32 i = 0; i < 256; i++ )
    {
        u32 tmp = sum;
        sum += freq[i];
        freq[i] = tmp;
    }

    // count[v] tracks how many times v has been seen so far, which gives the
    // second term above. trans[] maps each sorted-column position back to
    // the input position holding the same byte.
    u32* trans = new u32[len];
    for ( u32 i = 0; i < len; i++ )
    {
        u32 ch = inputBuffer[i];
        trans[count[ch] + freq[ch]] = i;
        count[ch]++;
    }

    // The output overwrites inputBuffer, so read from a copy. Heap-allocated:
    // a block-sized alloca can overflow the stack.
    u8* srcBuf = new u8[len];
    memcpy( srcBuf, inputBuffer, len );

    u32 idx = first;
    for ( u32 i = 0; i < len; i++ )
    {
        inputBuffer[i] = srcBuf[idx];
        idx = trans[idx];
    }

    delete[] srcBuf;
    delete[] trans;
}
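u8* bwtCompareBuf;
u32 bwtCompareLen;
s32 bwtCompare( const void* v1, const void* v2 )
{
u8* c1 = bwtCompareBuf + ((u32*)v1)[0];
u8* c2 = bwtCompareBuf + ((u32*)v2)[0];
for ( u32 i = 0; i < bwtCompareLen; i++ )
{
if ( c1[i] < c2[i] ) return -1;
if ( c1[i] > c2[i] ) return +1;
}
return 0;
}
void bwtEncode( u8* inputBuffer, u32 len, u32& first )
{
s8* tmpBuf = alloca( len * 2 );
u32* indices = new u32[len];
for ( u32 i = 0; i < len; i++ ) indices[i] = i;
// ……(此处原文被截断)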
用O(n)进行解码。
你能再明确一点吗?您是在尝试对未排序的数组进行排序,还是什么?
我添加了更多信息。
hmmm……是要根据给定的e、ne和nc恢复c?(在示例数据中似乎不可能)还是根据给定的c构造e和nc?(一点也不难,但是O(N ln N)。)
只给出了c,e是通过排序c来构造的,ne和nc是通过在e和c上的简单循环来构造的。
我并没有真正弄懂……我在查找表中到底存储了什么?我应该使用什么样的迭代器?
@user720491:你看过最新的编辑了吗?第一个版本是乱码。
@user720491:请看实现草图,它应该会使方法更清晰一些。
代码是C,所以我不知道如何实现它。这没有考虑索引(nc和ne),是吗?
@user720491:Oops,对不起,我忘了这个问题标记的是C而不是C++。但算法保持不变,只是在C++中明显更短。
你能提供更多关于查找表的信息吗?扫描xj和nxj:假设p=2,那么xj=e,nxj=0。现在如何构建查找表?
// Instead of a list, a deque will likely perform better,
// but you have to test this yourself in your particular case.
std::array<std::list<size_t>,(1<<sizeof(char))> lookup;
for (size_t i = 0; i < sortedLength; i++) {
lookup[sorted[i]].push_back(i);
}
size_t const j = lookup[unsorted[i]].front();
lookup[unsorted[i]].pop_front();
return j;
// State shared with bwtCompare: the comparator signature expected by qsort
// has no user-data argument, so the buffer and length are passed via globals.
u8* bwtCompareBuf;
u32 bwtCompareLen;

// qsort comparator. v1 and v2 each point to a u32 offset into bwtCompareBuf;
// the bwtCompareLen bytes starting at those offsets are compared
// lexicographically. Returns -1, +1 or 0.
s32 bwtCompare( const void* v1, const void* v2 )
{
    const u32 lhsStart = *static_cast<const u32*>( v1 );
    const u32 rhsStart = *static_cast<const u32*>( v2 );
    const u8* lhs = bwtCompareBuf + lhsStart;
    const u8* rhs = bwtCompareBuf + rhsStart;

    for ( u32 remaining = bwtCompareLen; remaining != 0; --remaining, ++lhs, ++rhs )
    {
        if ( *lhs != *rhs )
            return ( *lhs < *rhs ) ? -1 : +1;
    }
    return 0;
}
// Burrows-Wheeler transform of inputBuffer[0..len), performed in place.
//
// inputBuffer  in: the block to transform; out: the last column of the
//              sorted rotation matrix.
// len          number of bytes in the block.
// first        out: row of the sorted matrix that holds the rotation
//              starting at input offset 1 (offset 0 when len == 1). This is
//              the start index bwtDecode expects. Left untouched when
//              len == 0.
//
// Uses the file-level bwtCompareBuf / bwtCompareLen to hand the work buffer
// to bwtCompare; they are only meaningful while the sort is running.
void bwtEncode( u8* inputBuffer, u32 len, u32& first )
{
    if ( len == 0 ) return;

    // Two back-to-back copies of the input, so that every rotation is a
    // contiguous run of len bytes starting at its offset. Heap-allocated:
    // a block-sized alloca can overflow the stack.
    u8* doubled = new u8[static_cast<size_t>( len ) * 2];
    memcpy( doubled, inputBuffer, len );
    memcpy( doubled + len, inputBuffer, len );

    u32* indices = new u32[len];
    for ( u32 i = 0; i < len; i++ ) indices[i] = i;

    bwtCompareBuf = doubled;
    bwtCompareLen = len;
    qsort( indices, len, sizeof( u32 ), bwtCompare );

    // With a single byte the only rotation starts at 0, so wrap the offset.
    const u32 firstRotation = 1 % len;
    for ( u32 i = 0; i < len; i++ )
    {
        const u32 start = indices[i];
        // Last column = the byte cyclically preceding the rotation start.
        // start + len - 1 is that byte's position in the second copy.
        inputBuffer[i] = doubled[static_cast<size_t>( start ) + len - 1];
        if ( start == firstRotation ) first = i;
    }

    delete[] indices;
    delete[] doubled;
}
// Inverse Burrows-Wheeler transform of inputBuffer[0..len), in place.
//
// inputBuffer  in: the transformed block; out: the decoded bytes.
// len          number of bytes in the block.
// first        start row as produced by bwtEncode; must be < len.
//
// If len is 0 or first is out of range the buffer is left unchanged.
// Assumes u8 is an unsigned 8-bit type, so its values index the 256-entry
// tables directly.
void bwtDecode( u8* inputBuffer, u32 len, u32 first )
{
    // An out-of-range start row would read past the end of the block.
    if ( len == 0 || first >= len ) return;

    // The sorted column holds the same bytes as the input, in order. A byte
    // 'c' at input position i therefore sits in the sorted column at:
    //   (number of input bytes smaller than c)
    // + (number of occurrences of c before position i in the input).

    // Count the occurrences of each byte value.
    u32 freq[256] = { 0 };
    u32 count[256] = { 0 };
    for ( u32 i = 0; i < len; i++ )
        freq[inputBuffer[i]]++;

    // Exclusive prefix sum: afterwards freq[v] is the number of input bytes
    // whose value is smaller than v.
    u32 sum = 0;
    for ( u32 i = 0; i < 256; i++ )
    {
        u32 tmp = sum;
        sum += freq[i];
        freq[i] = tmp;
    }

    // count[v] tracks how many times v has been seen so far, which gives the
    // second term above. trans[] maps each sorted-column position back to
    // the input position holding the same byte.
    u32* trans = new u32[len];
    for ( u32 i = 0; i < len; i++ )
    {
        u32 ch = inputBuffer[i];
        trans[count[ch] + freq[ch]] = i;
        count[ch]++;
    }

    // The output overwrites inputBuffer, so read from a copy. Heap-allocated:
    // a block-sized alloca can overflow the stack.
    u8* srcBuf = new u8[len];
    memcpy( srcBuf, inputBuffer, len );

    u32 idx = first;
    for ( u32 i = 0; i < len; i++ )
    {
        inputBuffer[i] = srcBuf[idx];
        idx = trans[idx];
    }

    delete[] srcBuf;
    delete[] trans;
}