LZW压缩算法

2018-06-27 22:34:45

LZW压缩算法是一种压缩方法，由Lemple-Ziv-Welch 三人共同创造，用他们的名字命名。

算法概念

LZW逐个地输入源数据，使用贪婪分析算法来形成词典的每一个条目。
每一次都力图从输入串中分解出已经识别的最长的字串，即把已在词典中出现的最长字串作为前缀Prefix，然后加上下一个输入字符C，组成新的扩展字符串Prefix.C。
这个新的字符串是否要加到词典中，要看词典中是否存有和它相同的字符串。如有，那么这个字符串就变成新的前缀，继续输入新的字符，否则就把这个字符串加到词典中生成一个新的词条，并给它分配一个代码。
LZW算法编码流程
1)初始化：将所有的单字符串放入词典
2)读第一个输入字符给前缀串P
3)Step: 读下一个输入字符C;
　　　if 没有这样的C(输入已穷尽)：
　　　　码字(P) 输出；结束。
　　　if P+C 已存在于词典中：
　　　　P:=P+C；repeat Step；
　　　else P+C不在于词典中:
　　　　　P+C加进词典；
　　　　　码字(P) 输出；
　　　 P:=C；repeat Step.
##流程图

LZW算法译码流程

1）初始化词典
2）初始化P=空
3）CW=读入码字
4）CW是否在词典中
是：P+CW对应的串中的第一个字符(C1)放入词典（注：如果已经在串表中了就无需放入）
否：P+P中的第一个字符(P1)放入词典
5）：P:=CW对应的串
6）：输出CW对应的串
7）：是否还有下一个码字，如果有跳转到步骤3，否则结束

流程图

代码

C

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/stat.h>
 
/* -------- aux stuff ---------- */
void* mem_alloc(size_t item_size, size_t n_item)
{
  size_t *x = calloc(1, sizeof(size_t)*2 + n_item * item_size);
  x[0] = item_size;
  x[1] = n_item;
  return x + 2;
}
 
void* mem_extend(void *m, size_t new_n)
{
  size_t *x = (size_t*)m - 2;
  x = realloc(x, sizeof(size_t) * 2 + *x * new_n);
  if (new_n > x[1])
    memset((char*)(x + 2) + x[0] * x[1], 0, x[0] * (new_n - x[1]));
  x[1] = new_n;
  return x + 2;
}
 
inline void _clear(void *m)
{
  size_t *x = (size_t*)m - 2;
  memset(m, 0, x[0] * x[1]);
}
 
#define _new(type, n) mem_alloc(sizeof(type), n)
#define _del(m)   { free((size_t*)(m) - 2); m = 0; }
#define _len(m)   *((size_t*)m - 1)
#define _setsize(m, n)  m = mem_extend(m, n)
#define _extend(m)  m = mem_extend(m, _len(m) * 2)
 
 
/* ----------- LZW stuff -------------- */
typedef uint8_t byte;
typedef uint16_t ushort;
 
#define M_CLR 256 /* clear table marker */
#define M_EOD 257 /* end-of-data marker */
#define M_NEW 258 /* new code index */
 
/* encode and decode dictionary structures.
   for encoding, entry at code index is a list of indices that follow current one,
   i.e. if code 97 is 'a', code 387 is 'ab', and code 1022 is 'abc',
   then dict[97].next['b'] = 387, dict[387].next['c'] = 1022, etc. */
typedef struct {
  ushort next[256];
} lzw_enc_t;
 
/* for decoding, dictionary contains index of whatever prefix index plus trailing
   byte.  i.e. like previous example,
    dict[1022] = { c: 'c', prev: 387 },
    dict[387]  = { c: 'b', prev: 97 },
    dict[97]   = { c: 'a', prev: 0 }
   the "back" element is used for temporarily chaining indices when resolving
   a code to bytes
 */
typedef struct {
  ushort prev, back;
  byte c;
} lzw_dec_t;
 
byte* lzw_encode(byte *in, int max_bits)
{
  int len = _len(in), bits = 9, next_shift = 512;
  ushort code, c, nc, next_code = M_NEW;
  lzw_enc_t *d = _new(lzw_enc_t, 512);
 
  if (max_bits > 15) max_bits = 15;
  if (max_bits < 9 ) max_bits = 12;
 
  byte *out = _new(ushort, 4);
  int out_len = 0, o_bits = 0;
  uint32_t tmp = 0;
 
  inline void write_bits(ushort x) {
    tmp = (tmp << bits) | x;
    o_bits += bits;
    if (_len(out) <= out_len) _extend(out);
    while (o_bits >= 8) {
      o_bits -= 8;
      out[out_len++] = tmp >> o_bits;
      tmp &= (1 << o_bits) - 1;
    }
  }
 
  //write_bits(M_CLR);
  for (code = *(in++); --len; ) {
    c = *(in++);
    if ((nc = d[code].next[c]))
      code = nc;
    else {
      write_bits(code);
      nc = d[code].next[c] = next_code++;
      code = c;
    }
 
    /* next new code would be too long for current table */
    if (next_code == next_shift) {
      /* either reset table back to 9 bits */
      if (++bits > max_bits) {
        /* table clear marker must occur before bit reset */
        write_bits(M_CLR);
 
        bits = 9;
        next_shift = 512;
        next_code = M_NEW;
        _clear(d);
      } else  /* or extend table */
        _setsize(d, next_shift *= 2);
    }
  }
 
  write_bits(code);
  write_bits(M_EOD);
  if (tmp) write_bits(tmp);
 
  _del(d);
 
  _setsize(out, out_len);
  return out;
}
 
byte* lzw_decode(byte *in)
{
  byte *out = _new(byte, 4);
  int out_len = 0;
 
  inline void write_out(byte c)
  {
    while (out_len >= _len(out)) _extend(out);
    out[out_len++] = c;
  }
 
  lzw_dec_t *d = _new(lzw_dec_t, 512);
  int len, j, next_shift = 512, bits = 9, n_bits = 0;
  ushort code, c, t, next_code = M_NEW;
 
  uint32_t tmp = 0;
  inline void get_code() {
    while(n_bits < bits) {
      if (len > 0) {
        len --;
        tmp = (tmp << 8) | *(in++);
        n_bits += 8;
      } else {
        tmp = tmp << (bits - n_bits);
        n_bits = bits;
      }
    }
    n_bits -= bits;
    code = tmp >> n_bits;
    tmp &= (1 << n_bits) - 1;
  }
 
  inline void clear_table() {
    _clear(d);
    for (j = 0; j < 256; j++) d[j].c = j;
    next_code = M_NEW;
    next_shift = 512;
    bits = 9;
  };
 
  clear_table(); /* in case encoded bits didn't start with M_CLR */
  for (len = _len(in); len;) {
    get_code();
    if (code == M_EOD) break;
    if (code == M_CLR) {
      clear_table();
      continue;
    }
 
    if (code >= next_code) {
      fprintf(stderr, "Bad sequence\n");
      _del(out);
      goto bail;
    }
 
    d[next_code].prev = c = code;
    while (c > 255) {
      t = d[c].prev; d[t].back = c; c = t;
    }
 
    d[next_code - 1].c = c;
 
    while (d[c].back) {
      write_out(d[c].c);
      t = d[c].back; d[c].back = 0; c = t;
    }
    write_out(d[c].c);
 
    if (++next_code >= next_shift) {
      if (++bits > 16) {
        /* if input was correct, we'd have hit M_CLR before this */
        fprintf(stderr, "Too many bits\n");
        _del(out);
        goto bail;
      }
      _setsize(d, next_shift *= 2);
    }
  }
 
  /* might be ok, so just whine, don't be drastic */
  if (code != M_EOD) fputs("Bits did not end in EOD\n", stderr);
 
  _setsize(out, out_len);
bail: _del(d);
  return out;
}
 
int main()
{
  int i, fd = open("unixdict.txt", O_RDONLY);
 
  if (fd == -1) {
    fprintf(stderr, "Can't read file\n");
    return 1;
  };
 
  struct stat st;
  fstat(fd, &st);
 
  byte *in = _new(char, st.st_size);
  read(fd, in, st.st_size);
  _setsize(in, st.st_size);
  close(fd);
 
  printf("input size:   %d\n", _len(in));
 
  byte *enc = lzw_encode(in, 9);
  printf("encoded size: %d\n", _len(enc));
 
  byte *dec = lzw_decode(enc);
  printf("decoded size: %d\n", _len(dec));
 
  for (i = 0; i < _len(dec); i++)
    if (dec[i] != in[i]) {
      printf("bad decode at %d\n", i);
      break;
    }
 
  if (i == _len(dec)) printf("Decoded ok\n");
 
 
  _del(in);
  _del(enc);
  _del(dec);
 
  return 0;
}

C++

#include <string>
#include <map>
 
// Compress a string to a list of output symbols.
// The result will be written to the output iterator
// starting at "result"; the final iterator is returned.
template <typename Iterator>
Iterator compress(const std::string &uncompressed, Iterator result) {
  // Build the dictionary.
  int dictSize = 256;
  std::map<std::string,int> dictionary;
  for (int i = 0; i < 256; i++)
    dictionary[std::string(1, i)] = i;
 
  std::string w;
  for (std::string::const_iterator it = uncompressed.begin();
       it != uncompressed.end(); ++it) {
    char c = *it;
    std::string wc = w + c;
    if (dictionary.count(wc))
      w = wc;
    else {
      *result++ = dictionary[w];
      // Add wc to the dictionary.
      dictionary[wc] = dictSize++;
      w = std::string(1, c);
    }
  }
 
  // Output the code for w.
  if (!w.empty())
    *result++ = dictionary[w];
  return result;
}
 
// Decompress a list of output ks to a string.
// "begin" and "end" must form a valid range of ints
template <typename Iterator>
std::string decompress(Iterator begin, Iterator end) {
  // Build the dictionary.
  int dictSize = 256;
  std::map<int,std::string> dictionary;
  for (int i = 0; i < 256; i++)
    dictionary[i] = std::string(1, i);
 
  std::string w(1, *begin++);
  std::string result = w;
  std::string entry;
  for ( ; begin != end; begin++) {
    int k = *begin;
    if (dictionary.count(k))
      entry = dictionary[k];
    else if (k == dictSize)
      entry = w + w[0];
    else
      throw "Bad compressed k";
 
    result += entry;
 
    // Add w+entry[0] to the dictionary.
    dictionary[dictSize++] = w + entry[0];
 
    w = entry;
  }
  return result;
}
 
#include <iostream>
#include <iterator>
#include <vector>
 
int main() {
  std::vector<int> compressed;
  compress("TOBEORNOTTOBEORTOBEORNOT", std::back_inserter(compressed));
  copy(compressed.begin(), compressed.end(), std::ostream_iterator<int>(std::cout, ", "));
  std::cout << std::endl;
  std::string decompressed = decompress(compressed.begin(), compressed.end());
  std::cout << decompressed << std::endl;
 
  return 0;
}

Java

import java.util.*;
 
public class LZW {
    /** Compress a string to a list of output symbols. */
    public static List<Integer> compress(String uncompressed) {
        // Build the dictionary.
        int dictSize = 256;
        Map<String,Integer> dictionary = new HashMap<String,Integer>();
        for (int i = 0; i < 256; i++)
            dictionary.put("" + (char)i, i);
 
        String w = "";
        List<Integer> result = new ArrayList<Integer>();
        for (char c : uncompressed.toCharArray()) {
            String wc = w + c;
            if (dictionary.containsKey(wc))
                w = wc;
            else {
                result.add(dictionary.get(w));
                // Add wc to the dictionary.
                dictionary.put(wc, dictSize++);
                w = "" + c;
            }
        }
 
        // Output the code for w.
        if (!w.equals(""))
            result.add(dictionary.get(w));
        return result;
    }
 
    /** Decompress a list of output ks to a string. */
    public static String decompress(List<Integer> compressed) {
        // Build the dictionary.
        int dictSize = 256;
        Map<Integer,String> dictionary = new HashMap<Integer,String>();
        for (int i = 0; i < 256; i++)
            dictionary.put(i, "" + (char)i);
 
        String w = "" + (char)(int)compressed.remove(0);
        StringBuffer result = new StringBuffer(w);
        for (int k : compressed) {
            String entry;
            if (dictionary.containsKey(k))
                entry = dictionary.get(k);
            else if (k == dictSize)
                entry = w + w.charAt(0);
            else
                throw new IllegalArgumentException("Bad compressed k: " + k);
 
            result.append(entry);
 
            // Add w+entry[0] to the dictionary.
            dictionary.put(dictSize++, w + entry.charAt(0));
 
            w = entry;
        }
        return result.toString();
    }
 
    public static void main(String[] args) {
        List<Integer> compressed = compress("TOBEORNOTTOBEORTOBEORNOT");
        System.out.println(compressed);
        String decompressed = decompress(compressed);
        System.out.println(decompressed);
    }
}

输出：

[84, 79, 66, 69, 79, 82, 78, 79, 84, 256, 258, 260, 265, 259, 261, 263]
TOBEORNOTTOBEORTOBEORNOT

参考资料

多媒体技术基础（第三版）