LZW压缩算法

LZW压缩算法是一种压缩方法,由Lemple-Ziv-Welch 三人共同创造,用他们的名字命名。

算法概念

  • LZW逐个地输入源数据,使用贪婪分析算法来形成词典的每一个条目。
  • 每一次都力图从输入串中分解出已经识别的最长的字串,即把已在词典中出现的最长字串作为前缀Prefix,然后加上下一个输入字符C,组成新的扩展字符串Prefix.C。
  • 这个新的字符串是否要加到词典中,要看词典中是否存有和它相同的字符串。如有,那么这个字符串就变成新的前缀,继续输入新的字符,否则就把这个字符串加到词典中生成一个新的词条,并给它分配一个代码。

    LZW算法编码流程

    1)初始化:将所有的单字符串放入词典
    2)读第一个输入字符给前缀串P
    3)Step: 读下一个输入字符C;
       if 没有这样的C(输入已穷尽):
          码字(P) 输出;结束。
       if P+C 已存在于词典中:
        P:=P+C;repeat Step;
       else P+C不在于词典中:
         P+C加进词典;
         码字(P) 输出;
        P:=C;repeat Step.
    ##流程图

LZW算法译码流程

1)初始化词典
2)初始化P=空
3)CW=读入码字
4)CW是否在词典中
是:P+CW对应的串中的第一个字符(C1)放入词典(注:如果已经在串表中了就无需放入)
否:P+P中的第一个字符(P1)放入词典
5):P:=CW对应的串
6):输出CW对应的串
7):是否还有下一个码字,如果有跳转到步骤3,否则结束

流程图

代码

C

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/stat.h>

/* -------- aux stuff ---------- */
void* mem_alloc(size_t item_size, size_t n_item)
{
size_t *x = calloc(1, sizeof(size_t)*2 + n_item * item_size);
x[0] = item_size;
x[1] = n_item;
return x + 2;
}

void* mem_extend(void *m, size_t new_n)
{
size_t *x = (size_t*)m - 2;
x = realloc(x, sizeof(size_t) * 2 + *x * new_n);
if (new_n > x[1])
memset((char*)(x + 2) + x[0] * x[1], 0, x[0] * (new_n - x[1]));
x[1] = new_n;
return x + 2;
}

inline void _clear(void *m)
{
size_t *x = (size_t*)m - 2;
memset(m, 0, x[0] * x[1]);
}

#define _new(type, n) mem_alloc(sizeof(type), n)
#define _del(m) { free((size_t*)(m) - 2); m = 0; }
#define _len(m) *((size_t*)m - 1)
#define _setsize(m, n) m = mem_extend(m, n)
#define _extend(m) m = mem_extend(m, _len(m) * 2)


/* ----------- LZW stuff -------------- */
typedef uint8_t byte;
typedef uint16_t ushort;

#define M_CLR 256 /* clear table marker */
#define M_EOD 257 /* end-of-data marker */
#define M_NEW 258 /* new code index */

/* encode and decode dictionary structures.
for encoding, entry at code index is a list of indices that follow current one,
i.e. if code 97 is 'a', code 387 is 'ab', and code 1022 is 'abc',
then dict[97].next['b'] = 387, dict[387].next['c'] = 1022, etc. */
typedef struct {
ushort next[256];
} lzw_enc_t;

/* for decoding, dictionary contains index of whatever prefix index plus trailing
byte. i.e. like previous example,
dict[1022] = { c: 'c', prev: 387 },
dict[387] = { c: 'b', prev: 97 },
dict[97] = { c: 'a', prev: 0 }
the "back" element is used for temporarily chaining indices when resolving
a code to bytes
*/
typedef struct {
ushort prev, back;
byte c;
} lzw_dec_t;

byte* lzw_encode(byte *in, int max_bits)
{
int len = _len(in), bits = 9, next_shift = 512;
ushort code, c, nc, next_code = M_NEW;
lzw_enc_t *d = _new(lzw_enc_t, 512);

if (max_bits > 15) max_bits = 15;
if (max_bits < 9 ) max_bits = 12;

byte *out = _new(ushort, 4);
int out_len = 0, o_bits = 0;
uint32_t tmp = 0;

inline void write_bits(ushort x) {
tmp = (tmp << bits) | x;
o_bits += bits;
if (_len(out) <= out_len) _extend(out);
while (o_bits >= 8) {
o_bits -= 8;
out[out_len++] = tmp >> o_bits;
tmp &= (1 << o_bits) - 1;
}
}

//write_bits(M_CLR);
for (code = *(in++); --len; ) {
c = *(in++);
if ((nc = d[code].next[c]))
code = nc;
else {
write_bits(code);
nc = d[code].next[c] = next_code++;
code = c;
}

/* next new code would be too long for current table */
if (next_code == next_shift) {
/* either reset table back to 9 bits */
if (++bits > max_bits) {
/* table clear marker must occur before bit reset */
write_bits(M_CLR);

bits = 9;
next_shift = 512;
next_code = M_NEW;
_clear(d);
} else /* or extend table */
_setsize(d, next_shift *= 2);
}
}

write_bits(code);
write_bits(M_EOD);
if (tmp) write_bits(tmp);

_del(d);

_setsize(out, out_len);
return out;
}

byte* lzw_decode(byte *in)
{
byte *out = _new(byte, 4);
int out_len = 0;

inline void write_out(byte c)
{
while (out_len >= _len(out)) _extend(out);
out[out_len++] = c;
}

lzw_dec_t *d = _new(lzw_dec_t, 512);
int len, j, next_shift = 512, bits = 9, n_bits = 0;
ushort code, c, t, next_code = M_NEW;

uint32_t tmp = 0;
inline void get_code() {
while(n_bits < bits) {
if (len > 0) {
len --;
tmp = (tmp << 8) | *(in++);
n_bits += 8;
} else {
tmp = tmp << (bits - n_bits);
n_bits = bits;
}
}
n_bits -= bits;
code = tmp >> n_bits;
tmp &= (1 << n_bits) - 1;
}

inline void clear_table() {
_clear(d);
for (j = 0; j < 256; j++) d[j].c = j;
next_code = M_NEW;
next_shift = 512;
bits = 9;
};

clear_table(); /* in case encoded bits didn't start with M_CLR */
for (len = _len(in); len;) {
get_code();
if (code == M_EOD) break;
if (code == M_CLR) {
clear_table();
continue;
}

if (code >= next_code) {
fprintf(stderr, "Bad sequence\n");
_del(out);
goto bail;
}

d[next_code].prev = c = code;
while (c > 255) {
t = d[c].prev; d[t].back = c; c = t;
}

d[next_code - 1].c = c;

while (d[c].back) {
write_out(d[c].c);
t = d[c].back; d[c].back = 0; c = t;
}
write_out(d[c].c);

if (++next_code >= next_shift) {
if (++bits > 16) {
/* if input was correct, we'd have hit M_CLR before this */
fprintf(stderr, "Too many bits\n");
_del(out);
goto bail;
}
_setsize(d, next_shift *= 2);
}
}

/* might be ok, so just whine, don't be drastic */
if (code != M_EOD) fputs("Bits did not end in EOD\n", stderr);

_setsize(out, out_len);
bail: _del(d);
return out;
}

int main()
{
int i, fd = open("unixdict.txt", O_RDONLY);

if (fd == -1) {
fprintf(stderr, "Can't read file\n");
return 1;
};

struct stat st;
fstat(fd, &st);

byte *in = _new(char, st.st_size);
read(fd, in, st.st_size);
_setsize(in, st.st_size);
close(fd);

printf("input size: %d\n", _len(in));

byte *enc = lzw_encode(in, 9);
printf("encoded size: %d\n", _len(enc));

byte *dec = lzw_decode(enc);
printf("decoded size: %d\n", _len(dec));

for (i = 0; i < _len(dec); i++)
if (dec[i] != in[i]) {
printf("bad decode at %d\n", i);
break;
}

if (i == _len(dec)) printf("Decoded ok\n");


_del(in);
_del(enc);
_del(dec);

return 0;
}

C++

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
#include <string>
#include <map>

// Compress a string to a list of output symbols.
// The result will be written to the output iterator
// starting at "result"; the final iterator is returned.
template <typename Iterator>
Iterator compress(const std::string &uncompressed, Iterator result) {
// Build the dictionary.
int dictSize = 256;
std::map<std::string,int> dictionary;
for (int i = 0; i < 256; i++)
dictionary[std::string(1, i)] = i;

std::string w;
for (std::string::const_iterator it = uncompressed.begin();
it != uncompressed.end(); ++it) {
char c = *it;
std::string wc = w + c;
if (dictionary.count(wc))
w = wc;
else {
*result++ = dictionary[w];
// Add wc to the dictionary.
dictionary[wc] = dictSize++;
w = std::string(1, c);
}
}

// Output the code for w.
if (!w.empty())
*result++ = dictionary[w];
return result;
}

// Decompress a list of output ks to a string.
// "begin" and "end" must form a valid range of ints
template <typename Iterator>
std::string decompress(Iterator begin, Iterator end) {
// Build the dictionary.
int dictSize = 256;
std::map<int,std::string> dictionary;
for (int i = 0; i < 256; i++)
dictionary[i] = std::string(1, i);

std::string w(1, *begin++);
std::string result = w;
std::string entry;
for ( ; begin != end; begin++) {
int k = *begin;
if (dictionary.count(k))
entry = dictionary[k];
else if (k == dictSize)
entry = w + w[0];
else
throw "Bad compressed k";

result += entry;

// Add w+entry[0] to the dictionary.
dictionary[dictSize++] = w + entry[0];

w = entry;
}
return result;
}

#include <iostream>
#include <iterator>
#include <vector>

int main() {
std::vector<int> compressed;
compress("TOBEORNOTTOBEORTOBEORNOT", std::back_inserter(compressed));
copy(compressed.begin(), compressed.end(), std::ostream_iterator<int>(std::cout, ", "));
std::cout << std::endl;
std::string decompressed = decompress(compressed.begin(), compressed.end());
std::cout << decompressed << std::endl;

return 0;
}

Java

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import java.util.*;

public class LZW {
/** Compress a string to a list of output symbols. */
public static List<Integer> compress(String uncompressed) {
// Build the dictionary.
int dictSize = 256;
Map<String,Integer> dictionary = new HashMap<String,Integer>();
for (int i = 0; i < 256; i++)
dictionary.put("" + (char)i, i);

String w = "";
List<Integer> result = new ArrayList<Integer>();
for (char c : uncompressed.toCharArray()) {
String wc = w + c;
if (dictionary.containsKey(wc))
w = wc;
else {
result.add(dictionary.get(w));
// Add wc to the dictionary.
dictionary.put(wc, dictSize++);
w = "" + c;
}
}

// Output the code for w.
if (!w.equals(""))
result.add(dictionary.get(w));
return result;
}

/** Decompress a list of output ks to a string. */
public static String decompress(List<Integer> compressed) {
// Build the dictionary.
int dictSize = 256;
Map<Integer,String> dictionary = new HashMap<Integer,String>();
for (int i = 0; i < 256; i++)
dictionary.put(i, "" + (char)i);

String w = "" + (char)(int)compressed.remove(0);
StringBuffer result = new StringBuffer(w);
for (int k : compressed) {
String entry;
if (dictionary.containsKey(k))
entry = dictionary.get(k);
else if (k == dictSize)
entry = w + w.charAt(0);
else
throw new IllegalArgumentException("Bad compressed k: " + k);

result.append(entry);

// Add w+entry[0] to the dictionary.
dictionary.put(dictSize++, w + entry.charAt(0));

w = entry;
}
return result.toString();
}

public static void main(String[] args) {
List<Integer> compressed = compress("TOBEORNOTTOBEORTOBEORNOT");
System.out.println(compressed);
String decompressed = decompress(compressed);
System.out.println(decompressed);
}
}

输出:

[84, 79, 66, 69, 79, 82, 78, 79, 84, 256, 258, 260, 265, 259, 261, 263]
TOBEORNOTTOBEORTOBEORNOT


参考资料
  1. 多媒体技术基础(第三版)