Algorithm 简单字符串压缩算法_Algorithm_Encoding_Compression

Algorithm 简单字符串压缩算法

algorithm encoding compression

Algorithm 简单字符串压缩算法,algorithm,encoding,compression,Algorithm,Encoding,Compression,我希望找到以下形式的字符串的最短编码： abbcccc = a2b4c [注意：此贪婪算法不保证最短解] 通过记住以前出现的所有字符，可以直接找到重复字符串的第一个匹配项（包括所有重复的最小结束索引=所有重复后的最大剩余字符串），并将其替换为RLE（Python3代码）：为了使其对迭代应用具有鲁棒性，我们必须排除一些可能不应用RLE的情况（例如“11”或“）”），我们还必须确保RLE不会使字符串变长（两个字符的子字符串的出现时间可能是“abab”中的两倍）：通过上述插入的print语句，我

我希望找到以下形式的字符串的最短编码：

abbcccc = a2b4c

[注意：此贪婪算法不保证最短解]

通过记住以前出现的所有字符，可以直接找到重复字符串的第一个匹配项（包括所有重复的最小结束索引=所有重复后的最大剩余字符串），并将其替换为RLE（Python3代码）：

为了使其在迭代应用时保持鲁棒，我们必须排除一些不能应用RLE的情况（例如“11”或“))”），我们还必须确保RLE不会使字符串变长（当一个两字符的子字符串只出现两次时就可能发生，例如“abab”）：

通过上述插入的

print

语句，我们可以得到如下跟踪和结果：

>>> iterativeRLE('xyabcdefdefabcdefdef')
found 2*'def'
xyabc2(def)abcdefdef
found 2*'def'
xyabc2(def)abc2(def)
found 2*'abc2(def)'
xy2(abc2(def))
'xy2(abc2(def))'

但是这个贪婪算法对这个输入失败：

>>> iterativeRLE('abaaabaaabaa')
found 3*'a'
ab3abaaabaa
found 3*'a'
ab3ab3abaa
found 2*'b3a'
a2(b3a)baa
found 2*'a'
a2(b3a)b2a
'a2(b3a)b2a'

而最短解之一是

3（ab2a）

。由于贪婪算法在这里行不通，因此需要进行搜索。下面是一个带有若干剪枝的深度优先搜索（如果在某个分支中未触及字符串的前

idx0

字符，则不尝试在这些字符中查找重复的子字符串；如果替换子字符串的多次出现，则对所有连续出现的子字符串执行此操作）：

以下是我的C++实现，它的时间复杂度为 O(n)，空间复杂度为 O(1)。

/// In-place run-length compression of a character array.
class Solution {
public:
    /**
     * @brief Run-length encodes `chars` in place.
     *
     * Every maximal run of equal characters is rewritten as the character
     * itself, followed by the decimal run length when the run is longer
     * than one (e.g. "aabbccc" -> "a2b2c3", "abbb" -> "ab3").
     *
     * The vector is NOT resized; only the first `return value` entries hold
     * the compressed form, anything after them is leftover input.
     *
     * @param chars Characters to compress; overwritten from the front.
     * @return Number of leading entries of `chars` that form the result
     *         (0 for an empty vector).
     */
    int compress(std::vector<char>& chars) {
        const int n = static_cast<int>(chars.size());
        int write = 0;     // next free slot of the compressed output
        int runStart = 0;  // first index of the run currently being scanned

        while (runStart < n) {
            // Advance to one past the last character equal to chars[runStart].
            int runEnd = runStart + 1;
            while (runEnd < n && chars[runEnd] == chars[runStart]) {
                ++runEnd;
            }
            write = emitRun(chars, write, chars[runStart], runEnd - runStart);
            runStart = runEnd;
        }
        return write;
    }

private:
    /// Writes `symbol` (plus the digits of `runLength` when it exceeds 1) at
    /// `write` and returns the advanced write position.
    /// A run of length L takes at most L output slots, so the output never
    /// overtakes the unread input; `symbol` is taken by value so overwriting
    /// the run's own cells is harmless.
    static int emitRun(std::vector<char>& chars, int write, char symbol, int runLength) {
        chars[write++] = symbol;
        if (runLength > 1) {
            for (const char digit : std::to_string(runLength)) {
                chars[write++] = digit;
            }
        }
        return write;
    }
};

类解决方案{
公众：
整数压缩（向量和字符）{
int n=（int）chars.size（）；
if（chars.empty（））返回0；
int left=0，right=0，currCharIndx=left；
while（右1）{
字符串频率=到字符串（len）；
对于（int i=0；i<（int）频率长度（）；i++）{
chars[left++]=freq[i]；
}
}
currCharIndx=右；
}
右++；
}
int len=右-currCharIndx；
chars[left++]=chars[currCharIndx]；
如果（len>1）{
字符串频率=到字符串（len）；
对于（int i=0；i


您需要跟踪三个指针-right
用于迭代，currCharIndx
用于跟踪当前字符的第一个位置，left
用于跟踪压缩字符串的写入位置。
如果有一个字符串，比如tctttttttttttcttttttttttctttttttttttct，那么该代码将返回长度为15的tc11tc10tc11tct。但是，有一种更好的编码tc11tc2(t9tct)，其长度为14。@q85ts说得对。请参见我在这个答案和我的另一个答案中的注释。这不是从一个a
开始的吗。继续重复下一个角色两倍的次数。停在c。？对于最多2**26个字符的字符串，“唯一需要的信息”是“停止字符”-禁用解压缩器/扩展器：。
>>> iterativeRLE('xyabcdefdefabcdefdef')
found 2*'def'
xyabc2(def)abcdefdef
found 2*'def'
xyabc2(def)abc2(def)
found 2*'abc2(def)'
xy2(abc2(def))
'xy2(abc2(def))'

>>> iterativeRLE('abaaabaaabaa')
found 3*'a'
ab3abaaabaa
found 3*'a'
ab3ab3abaa
found 2*'b3a'
a2(b3a)baa
found 2*'a'
a2(b3a)b2a
'a2(b3a)b2a'

def isRLE(s):
    """Return True when the parentheses in ``s`` are balanced and well nested.

    Only such strings can safely be wrapped in a further RLE group. Characters
    other than '(' and ')' are ignored.
    """
    depth = 0
    for ch in s:
        if ch == ')':
            # A closing parenthesis with nothing open makes the string ill-formed.
            if depth == 0:
                return False
            depth -= 1
        elif ch == '(':
            depth += 1
    # Every opened group must have been closed again.
    return depth == 0

def singleRLE_gen(s,idx0=0):
    """Yield every string obtained from ``s`` by run-length encoding one repeated substring.

    For each position the function looks back at all earlier positions holding
    the same character; the text between the two positions is the candidate
    unit. If that unit is immediately repeated, all consecutive repetitions
    are replaced by ``<count><unit>`` (single character) or ``<count>(<unit>)``
    (longer unit), provided the replacement is not longer than the text it
    replaces.

    Yields ``(encoded_string, start_index)`` tuples, where ``start_index`` is
    the index at which the replaced repetition began. Candidates whose second
    occurrence ends before ``idx0`` are skipped. Prints one trace line when
    iteration starts.
    """
    print("looking for repeated substrings in '%s', first rep. not ending before index %d" % (s,idx0))
    seen_at = {}  # character -> indices of its earlier (eligible) occurrences
    for pos, ch in enumerate(s):
        # A unit may not start right after a digit (it would split a count).
        if pos > 0 and s[pos-1] in '0123456789':
            continue

        starts = seen_at.setdefault(ch, [])
        for start in starts:
            unit = s[start:pos]
            width = pos - start  # == len(unit), because start < pos <= len(s)
            # Units with unbalanced parentheses (e.g. '))...)') cannot be wrapped.
            if not isRLE(unit):
                continue
            # Pruning: this substring has been tried before.
            if pos + width < idx0:
                continue
            # Pruning: always take all repetitions, so skip a start that is
            # itself preceded by another copy of the unit.
            if start - width >= 0 and s[start-width:start] == unit:
                continue

            # Count consecutive copies; the copy at [start:pos] is number one.
            reps = 1
            while s[pos+(reps-1)*width : pos+reps*width] == unit:
                reps += 1
            if reps == 1:
                continue

            if width > 1:
                encoded = ('%d' % reps) + '(' + unit + ')'
            else:
                encoded = ('%d' % reps) + unit
            # In case of a tie prefer the RLE; never make the string longer.
            if len(encoded) > reps*width:
                continue
            yield s[:start] + encoded + s[pos+(reps-1)*width:], start
        starts.append(pos)

def iterativeRLE_depthFirstSearch(s):
    """Search the nested run-length encodings reachable from ``s`` and return a shortest one.

    Starting from ``s``, every string produced by ``singleRLE_gen`` is queued
    and expanded in turn until no candidate is left. Each queue entry carries
    the index returned by ``singleRLE_gen``, which is passed back in as
    ``idx0`` to prune substrings that were already tried.

    Note: despite the name, candidates are processed first-in-first-out, i.e.
    in breadth-first order.

    Returns the last encountered candidate whose length is not greater than
    any seen before (``s`` itself if nothing shorter or equal is found).
    Prints a trace line for every new optimum.
    """
    # Local import keeps the snippet self-contained; deque gives O(1) pops
    # from the left, whereas list.pop(0) shifts the whole list on every call.
    from collections import deque

    shortestRLE = s
    candidatesRLE = deque([(s,0)])
    while candidatesRLE:
        candidateRLE,idx0 = candidatesRLE.popleft()
        for rle,idx in singleRLE_gen(candidateRLE,idx0):
            # '<=' so that, among equally short encodings, the latest one wins.
            if len(rle) <= len(shortestRLE):
                shortestRLE = rle
                print("new optimum: '%s'" % shortestRLE)
            candidatesRLE.append((rle,idx))
    return shortestRLE

>>> iterativeRLE_depthFirstSearch('tctttttttttttcttttttttttctttttttttttct')
looking for repeated substrings in 'tctttttttttttcttttttttttctttttttttttct', first rep. not ending before index 0
new optimum: 'tc11tcttttttttttctttttttttttct'
new optimum: '2(tctttttttttt)ctttttttttttct'
new optimum: 'tctttttttttttc2(ttttttttttct)'
looking for repeated substrings in 'tc11tcttttttttttctttttttttttct', first rep. not ending before index 2
new optimum: 'tc11tc10tctttttttttttct'
new optimum: 'tc11t2(ctttttttttt)tct'
new optimum: 'tc11tc2(ttttttttttct)'
looking for repeated substrings in 'tc5(tt)tcttttttttttctttttttttttct', first rep. not ending before index 2
...
new optimum: '2(tctttttttttt)c11tct'
...
new optimum: 'tc11tc10tc11tct'
...
new optimum: 'tc11t2(c10t)tct'
looking for repeated substrings in 'tc11tc2(ttttttttttct)', first rep. not ending before index 6
new optimum: 'tc11tc2(10tct)'
...    
new optimum: '2(tc10t)c11tct'
...    
'2(tc10t)c11tct'

/// In-place run-length compression of a character array.
class Solution {
public:
    /**
     * @brief Run-length encodes `chars` in place.
     *
     * Every maximal run of equal characters is rewritten as the character
     * itself, followed by the decimal run length when the run is longer
     * than one (e.g. "aabbccc" -> "a2b2c3", "abbb" -> "ab3").
     *
     * The vector is NOT resized; only the first `return value` entries hold
     * the compressed form, anything after them is leftover input.
     *
     * @param chars Characters to compress; overwritten from the front.
     * @return Number of leading entries of `chars` that form the result
     *         (0 for an empty vector).
     */
    int compress(std::vector<char>& chars) {
        const int n = static_cast<int>(chars.size());
        int write = 0;     // next free slot of the compressed output
        int runStart = 0;  // first index of the run currently being scanned

        while (runStart < n) {
            // Advance to one past the last character equal to chars[runStart].
            int runEnd = runStart + 1;
            while (runEnd < n && chars[runEnd] == chars[runStart]) {
                ++runEnd;
            }
            write = emitRun(chars, write, chars[runStart], runEnd - runStart);
            runStart = runEnd;
        }
        return write;
    }

private:
    /// Writes `symbol` (plus the digits of `runLength` when it exceeds 1) at
    /// `write` and returns the advanced write position.
    /// A run of length L takes at most L output slots, so the output never
    /// overtakes the unread input; `symbol` is taken by value so overwriting
    /// the run's own cells is harmless.
    static int emitRun(std::vector<char>& chars, int write, char symbol, int runLength) {
        chars[write++] = symbol;
        if (runLength > 1) {
            for (const char digit : std::to_string(runLength)) {
                chars[write++] = digit;
            }
        }
        return write;
    }
};