LZW compression: Difference between revisions

Content deleted Content added

Inline

Revision as of 09:58, 13 August 2009

The Lempel-Ziv-Welch (LZW) algorithm provides lossless data compression. You can read a complete description of it in the Wikipedia article on the subject. It was patented, but it fell in the public domain in 2004.

C

See Bit oriented IO for bitio.h and Basic string manipulation functions for estrings.h.

<lang c>#include <stdio.h>

include <stdlib.h>
include <stdbool.h>
include <string.h>
include "bitio.h"
include "estrings.h"

define MIN(A,B) ( ((A)<(B)) ? (A) : (B) )
define MAX(A,B) ( ((A)>(B)) ? (A) : (B) )

define DEBUG

if defined(DEBUG)
define Debug(...) fprintf(stderr, __VA_ARGS__)
else
define Debug(...)
endif</lang>

We need a dictionary and so we implement one according to our need.

<lang c>struct DictionaryNode {

  String string;
  unsigned int code;
  struct DictionaryNode *next;

}; typedef struct DictionaryNode DN;

define NODES_PER_ALLOC 512

struct DictionaryStruct {

  DN *first;
  DN **nodes;
  size_t num_of_nodes;
  size_t num_of_alloc;
  bool sorted;

}; typedef struct DictionaryStruct *Dictionary;

Dictionary newDictionary() {

   Dictionary t;
   
   t = malloc(sizeof(struct DictionaryStruct));
   if ( t==NULL ) return NULL;
   t->num_of_nodes = 0;
   t->num_of_alloc = 1;
   t->nodes = malloc(t->num_of_alloc * NODES_PER_ALLOC * sizeof(void *));
   t->first = NULL;
   t->sorted = false;
   return t;

}

bool _expandDictionary(Dictionary d) {

  d->nodes = realloc(d->nodes, (d->num_of_alloc + 1) * NODES_PER_ALLOC * sizeof(void *));
  if ( d->nodes == NULL ) return false;
  d->num_of_alloc++;
  return true;

}

bool dictionaryPut_ns(Dictionary d, String s, unsigned int c) {

  DN *newdw;
  
  if ( (d->num_of_nodes + 1) > (d->num_of_alloc * NODES_PER_ALLOC) )
  {
     if ( !_expandDictionary(d) ) return false;
  }
  newdw = malloc(sizeof(DN));
  if ( newdw == NULL ) return false;
  newdw->string = cloneString(s);
  newdw->code = c;
  newdw->next = d->first;
  d->first = newdw;
  d->nodes[d->num_of_nodes] = newdw;
  d->num_of_nodes++;
  d->sorted = false;
  return true;

}

int _dict_strcompare(const void *av, const void *bv) {

  DN *a = *((DN **)av);
  DN *b = *((DN **)bv);
  return compareStrings(a->string, b->string);

}

void dictionarySort(Dictionary d) {

   if ( ! (d->sorted) ) {
     qsort(d->nodes, d->num_of_nodes, sizeof(void *), _dict_strcompare);
     d->sorted = true;
   }

}

bool dictionaryPut(Dictionary d, String s, unsigned int c) {

  bool added = dictionaryPut_ns(d, s, c);
  if ( added )
  {
      dictionarySort(d);
  }
  return added;

}

DN *dictionaryLookUp(Dictionary d, String s) {

  if ( d->num_of_nodes == 0 ) return NULL;
  int low, high, med, p;
  /* perform a "binary" search ... */
  for(low=-1, high = d->num_of_nodes-1; (high-low)>1 ; )
  {
     med = (high+low) >> 1;
     p = compareStrings(s, d->nodes[med]->string);
     if ( p <= 0 )
     {
        high = med;
     } else {
        low = med;
     }
  }
  if ( compareStrings(s, d->nodes[high]->string) == 0 ) return d->nodes[high];
  return NULL;

}

void destroyDictionary(Dictionary d) {

  int i;
  if ( d==NULL ) return;
  if ( d->nodes != NULL )
  {
    for(i=0; i < d->num_of_nodes; i++)
    {
       destroyString(d->nodes[i]->string);
       free(d->nodes[i]);
    }
    free(d->nodes);
  }
  free(d);

}</lang>

LZW code:

<lang c>#define BITS_PER_WORD 9 unsigned int codemark = 256; unsigned char num_of_bits = BITS_PER_WORD;

struct LZWHandleStruct {

  unsigned int codemark;
  unsigned char num_of_bits;
  Dictionary dict;
  size_t bytes_written;
  FILE *iostream;

}; typedef struct LZWHandleStruct LZWHandle;

LZWHandle *lzw_begin(FILE *o) {

  LZWHandle *lh;
  
  lh = malloc(sizeof(LZWHandle));
  if ( lh != NULL )
  {
    lh->bytes_written = 0;
    lh->codemark = 256;
    lh->num_of_bits = BITS_PER_WORD;
    lh->dict = newDictionary();
    lh->iostream = o;
    if ( lh->dict == NULL ) { free(lh); return NULL; }
  }
  return lh;

}

bool lzw_compress(LZWHandle *lh, const char *stream, size_t len) {

  unsigned int i;
  char tc;
  Dictionary d = lh->dict;
  DN *oc;
  String w = NULL;
  String wc = NULL;
  String t = NULL;
  size_t wbits = 0;
  
  t = newString();
  for(i=0; i < 256; i++)
  {
     tc = i;
     setString(t, &tc, 1);
     dictionaryPut_ns(d, t, i);
  }
  destroyString(t); t = NULL;
  
  /* a trick: we have put values in order, so it is sorted*/
  d->sorted = true;
  
  w = newString();
  wc = newString();
  
  for(i=0; i < len; i++)
  {
     copyString(wc, w);
     appendChar(wc, stream[i]);
     if ( (dictionaryLookUp(d, wc)) != NULL )
     {
        copyString(w, wc);
     } else {
        oc = dictionaryLookUp(d, w);
        if ( oc != NULL )
        { /* guardring for bugs :D */
          wbits += bits_write(oc->code, lh->num_of_bits, lh->iostream);
          Debug("%u ; %d ; %u\n", oc->code, lh->num_of_bits, lh->codemark);
        }
        dictionaryPut(d, wc, lh->codemark);
        lh->codemark++;
        if ( lh->codemark >= (1 << lh->num_of_bits) )
        {
           lh->num_of_bits++;
        }
        setString(w, &stream[i], 1);
     }
  }
  if ( ! stringIsEmpty(w) )
  {
      oc = dictionaryLookUp(d, w);
      /* no guard ring for bugs here :) */
      wbits += bits_write(oc->code, lh->num_of_bits, lh->iostream);
      Debug("%u ; %d ; %u\n", oc->code, lh->num_of_bits, lh->codemark);
  }
  lh->bytes_written = wbits >> 3;
  destroyString(w); destroyString(wc);
  return true;

}

int lzw_end(LZWHandle *lh) {

  int oby=0;
  if ( lh != NULL )
  {
    oby=bits_flush(lh->iostream);
    destroyDictionary(lh->dict);
    free(lh);
  }
  return oby;

}</lang>

Testing:

<lang c>const char *text = "TOBEORNOTTOBEORTOBEORNOT"; int main() {

    LZWHandle *lzw;
    size_t writtenbytecount = 0;
    
    lzw = lzw_begin(stdout);
    if ( lzw != NULL )
    {
      lzw_compress(lzw, text, strlen(text));
      writtenbytecount += lzw->bytes_written;
      Debug("bits on exit: %u\n", lzw->num_of_bits);
      lzw_end(lzw);
      fprintf(stderr, "n. of input bytes: %u\n"
                      "n. of output bytes: %u\n",
                      strlen(text), writtenbytecount
             );
    } else return 1;
    return 0;

}</lang>

This test code really emits the compressed bytes stream. The decimal values of the codes (as other solutions do) are sent as debug informations.

Common Lisp

Library: Babel

This version is based upon the Perl one. It doesn't contain mixed type data at the cost of being more consy. It includes vector operation routines, since using VECTOR-PUSH-APPEND reallocates the whole vector with each call.

The Babel library is required to convert octet vectors to strings. Lisp strings can contain characters out of the ASCII/latin1 character set, including the whole Unicode range in them. The exact encoding used is dependent upon the user's locale (LC_CTYPE on Unix).

<lang lisp>(declaim (ftype (function (vector vector &optional fixnum fixnum) vector)

               vector-append))

(defun vector-append (old new &optional (start2 0) end2)

 (declare (optimize (speed 3) (safety 0) (debug 0)))
 (prog1 old                     
   (let* ((old-fill (fill-pointer old))
          (new-fill (+ old-fill (length new))))
     (when (> new-fill (array-dimension old 0))
       (adjust-array old (* 4 new-fill)))
     (setf (fill-pointer old) new-fill)
     (replace old new :start1 old-fill :start2 start2 :end2 end2))))

(declaim (ftype (function (vector t) vector) vector-append1)) (defun vector-append1 (old new)

 (prog1 old                     
   (let* ((old-fill (fill-pointer old))
          (new-fill (1+ old-fill)))
     (when (> new-fill (array-dimension old 0))
       (adjust-array old (* 4 new-fill)))
     (setf (fill-pointer old) new-fill)
     (setf (aref old old-fill) new))))

(declaim (ftype (function (&optional t) vector) make-empty-vector)) (defun make-empty-vector (&optional (element-type t))

 (make-array 0 :element-type element-type :fill-pointer 0 :adjustable t))

(declaim (ftype (function (t &optional t) vector) make-vector-with-elt)) (defun make-vector-with-elt (elt &optional (element-type t))

 (make-array 1 :element-type element-type
               :fill-pointer 1
               :adjustable t
               :initial-element elt))

(declaim (ftype (function (vector t) vector) vector-append1-new)) (defun vector-append1-new (old new)

 (vector-append1 (vector-append (make-empty-vector 'octet) old)
                 new))

(declaim (ftype (function (vector vector) vector) vector-append-new)) (defun vector-append-new (old new)

 (vector-append (vector-append (make-empty-vector 'octet) old)
                new))

(deftype octet () '(unsigned-byte 8))

(declaim (ftype (function () hash-table) build-dictionary)) (defun build-dictionary ()

 (let ((dictionary (make-hash-table :test #'equalp)))
   (loop for i below 256
         do (let ((vec (make-vector-with-elt i 'octet)))
              (setf (gethash vec dictionary) vec)))
   dictionary))

(declaim (ftype (function ((vector octet)) (vector octet))

               lzw-compress-octets))

(defun lzw-compress-octets (octets)

 (declare (optimize (speed 3) (safety 0) (debug 0)))
 (loop with dictionary-size of-type fixnum = 256
       with w = (make-empty-vector 'octet)
       with result = (make-empty-vector 't)
       with dictionary = (build-dictionary)
       for c across octets
       for wc = (vector-append1-new w c)
       if (gethash wc dictionary) do (setq w wc)
       else do
         (vector-append result (gethash w dictionary))
         (setf (gethash wc dictionary)
               (make-vector-with-elt dictionary-size)) 
         (incf dictionary-size)
         (setq w (make-vector-with-elt c 'octet))
       finally (unless (zerop (length (the (vector octet) w)))
                 (vector-append result (gethash w dictionary)))
               (return result)))

(declaim (ftype (function (vector) (vector octet)) lzw-decompress)) (defun #1=lzw-decompress (octets)

 (declare (optimize (speed 3) (safety 0) (debug 0)))
 (when (zerop (length octets))
   (return-from #1# (make-empty-vector 'octet)))
 (loop with dictionary-size = 256
       with dictionary = (build-dictionary)
       with result = (make-vector-with-elt (aref octets 0) 'octet)
       with w = (copy-seq result)
       for i from 1 below (length octets)
       for k = (make-vector-with-elt (aref octets i) 't)
       for entry = (or (gethash k dictionary)
                       (if (equalp k dictionary-size)
                           (coerce (list w (aref w 0)) '(vector octet))
                           (error "bad compresed entry at pos ~S" i)))
       do (vector-append result entry)
          (setf (gethash (make-vector-with-elt dictionary-size) dictionary)
                (vector-append1-new w (aref entry 0)))
          (incf dictionary-size)
          (setq w entry)
       finally (return result)))

(defgeneric lzw-compress (datum)

 (:method ((string string))
   (lzw-compress (babel:string-to-octets string)))
 (:method ((octets vector))
   (lzw-compress-octets octets)))

(defun lzw-decompress-to-string (octets)

 (babel:octets-to-string (lzw-decompress octets)))

(defun test (string)

 (assert (equal #2=(lzw-decompress-to-string (lzw-compress string)) string) ()
         "Can't compress ~S properly, got ~S instead" string #2#)
 t)

</lang>

And the format used:

<lang lisp>CL-USER> (test "TOBEORNOTTOBEORTOBEORNOT") T CL-USER> (lzw-compress "TOBEORNOTTOBEORTOBEORNOT")

(84 79 66 69 79 82 78 79 84 256 258 260 265 259 261 263)

CL-USER> (lzw-decompress-to-string *) "TOBEORNOTTOBEORTOBEORNOT"</lang>

C++

<lang cpp>#include <string>

include <map>

// Compress a string to a list of output symbols. // The result will be written to the output iterator // starting at "result"; the final iterator is returned. template <typename Iterator> Iterator compress(const std::string &uncompressed, Iterator result) {

 // Build the dictionary.
 int dictSize = 256;
 std::map<std::string,int> dictionary;
 for (int i = 0; i < 256; i++)
   dictionary[std::string(1, i)] = i;
 
 std::string w;
 for (std::string::const_iterator it = uncompressed.begin();
      it != uncompressed.end(); ++it) {
   char c = *it;
   std::string wc = w + c;
   if (dictionary.count(wc))
     w = wc;
   else {
     *result++ = dictionary[w];
     // Add wc to the dictionary.
     dictionary[wc] = dictSize++;
     w = std::string(1, c);
   }
 }
 
 // Output the code for w.
 if (!w.empty())
   *result++ = dictionary[w];
 return result;

}

// Decompress a list of output ks to a string. // "begin" and "end" must form a valid range of ints template <typename Iterator> std::string decompress(Iterator begin, Iterator end) {

 // Build the dictionary.
 int dictSize = 256;
 std::map<int,std::string> dictionary;
 for (int i = 0; i < 256; i++)
   dictionary[i] = std::string(1, i);
 
 std::string w(1, *begin++);
 std::string result = w;
 std::string entry;
 for ( ; begin != end; begin++) {
   int k = *begin;
   if (dictionary.count(k))
     entry = dictionary[k];
   else if (k == dictSize)
     entry = w + w[0];
   else
     throw "Bad compressed k";
   
   result += entry;
   
   // Add w+entry[0] to the dictionary.
   dictionary[dictSize++] = w + entry[0];
   
   w = entry;
 }
 return result;

}

include <iostream>
include <iterator>
include <vector>

int main() {

 std::vector<int> compressed;
 compress("TOBEORNOTTOBEORTOBEORNOT", std::back_inserter(compressed));
 copy(compressed.begin(), compressed.end(), std::ostream_iterator<int>(std::cout, ", "));
 std::cout << std::endl;
 std::string decompressed = decompress(compressed.begin(), compressed.end());
 std::cout << decompressed << std::endl;
 
 return 0;

}</lang>

Clojure

<lang Clojure> (defn make-dict []

 (let [vals (range 0 256)]
   (zipmap (map (comp #'list #'char) vals) vals)))

(defn compress [#^String text]

 (loop [t (seq text)
        r '()
        w '()
        dict (make-dict)
        s 256]
   (let [c (first t)]
     (if c
       (let [wc (cons c w)]
         (if (get dict wc)
           (recur (rest t) r wc dict s)
           (recur (rest t) (cons (get dict w) r) (list c) (assoc dict wc s) (inc s))))
       (reverse (if w (cons (get dict w) r) r))))))

(compress "TOBEORNOTTOBEORTOBEORNOT") </lang> The output: <lang Clojure> (84 79 66 69 79 82 78 79 84 256 258 260 265 259 261 263) </lang>

D

D 1, with Phobos, from the Python version (the final writefln works only on 7-bit ASCII strings): <lang d> import std.stdio: writefln;

/// Compress a string to a list of output symbols. int[] compress(string uncompressed) {

   // build the dictionary
   int dict_size = 256;
   int[string] dictionary;
   for (int i; i < dict_size; i++)
       dictionary["" ~ cast(char)i] = i;

   string w;
   int[] result;
   foreach (c; uncompressed) {
       string wc = w ~ c;
       if (wc in dictionary)
           w = wc;
       else {
           result ~= dictionary[w];
           // add wc to the dictionary
           dictionary[wc] = dict_size;
           dict_size++;
           w = "" ~ c;
       }
   }

   // output the code for w
   if (w.length)
       result ~= dictionary[w];
   return result;

}

/// Decompress a list of output ks to a string. string decompress(int[] compressed) {

   // build the dictionary
   int dict_size = 256;
   string[int] dictionary;
   for (int i; i < dict_size; i++)
       dictionary[i] = "" ~ cast(char)i;

   string w = "" ~ cast(char)compressed[0];
   string result = w;
   foreach (k; compressed[1 .. $]) {
       string entry;
       if (k in dictionary)
           entry = dictionary[k];
       else if (k == dict_size)
           entry = w ~ w[0];
       else
           throw new Exception("Bad compressed k");
       result ~= entry;

       // add w+entry[0] to the dictionary
       dictionary[dict_size] = w ~ entry[0];
       dict_size++;

       w = entry;
   }
   return result;

}

void main() {

   auto compressed = compress("TOBEORNOTTOBEORTOBEORNOT");
   writefln(compressed);

   auto decompressed = decompress(compressed);
   writefln(decompressed);

} </lang> The output: <lang d> [84,79,66,69,79,82,78,79,84,256,258,260,265,259,261,263] TOBEORNOTTOBEORTOBEORNOT </lang>

Forth

Works with: GNU Forth version 0.6.2

256 value next-symbol

\ current string fragment

create w 256 allot     \ counted string

: w=c ( c -- )       w 1+ c!        1 w c! ;
: w+c ( c -- )  w count + c!  w c@ 1+ w c! ;

\ Compression

\ dictionary of strings to symbols
0 value dict

: init-dict  table to dict  256 to next-symbol  dict set-current ;

: free-dict                           forth-wordlist set-current ;

: in-dict? ( key len -- ? )		\ can assume len > 1
  dict search-wordlist dup if nip then ;

: lookup-dict ( key len -- symbol )
  dup 1 = if drop c@ exit then
  dict search-wordlist if >body @ else abort" bad-dict!" then ;

: put-dict ( data key len -- )
  nextname create , ;

\ output buffer of symbols
\  in real life, these symbols would be packed into octets
variable out-size
create out 256 cells allot

: output ( symbol -- )
  dup out out-size @ cells + !  1 out-size +!
  dup 256 < if emit space else . then ;

: compress ( addr len -- )
  init-dict  0 out-size !
  over c@ w=c  1 /string
  bounds do
    i c@ w+c
    w count in-dict? 0= if
      w count 1- lookup-dict output
      next-symbol dup w count put-dict
      1+ to next-symbol
      i c@ w=c
    then
  loop
  w count lookup-dict output
  free-dict ;

\ Decompression

\ array of symbols to strings (in real code this would need to be growable)
\  next-symbol is reused for the size of this table
create symtab 256 cells allot
0 value start

: init-symtab  256 to next-symbol  here to start ;

: free-symtab  start here - allot ;

: get-symbol ( symbol -- addr len )
  dup 256 < if pad c! pad 1 exit then
  256 - cells symtab + @ count ;

: add-symbol ( addr len -- )
  here symtab next-symbol 256 - cells + !
  s,
  next-symbol 1+ to next-symbol ;

create entry 256 allot

: decompress ( addr len -- )
  init-symtab
  over @ dup emit w=c
  cells bounds cell+ do
    i @ next-symbol < if
      i @ get-symbol entry place
    else i @ next-symbol = if
      w 1+ c@ w count + c!  w count 1+ entry place
    else
      abort" bad symbol!"
    then then
    entry count type	\ output
    entry 1+ c@ w+c
    w count add-symbol
    entry count w place
  1 cells +loop
  free-symtab ;

\ Testing

s" TOBEORNOTTOBEORTOBEORNOT" compress cr
\ T O B E O R N O T 256 258 260 265 259 261 263

out out-size @ decompress cr
\ TOBEORNOTTOBEORTOBEORNOT

Java

<lang java>import java.util.*;

public class LZW {

   /** Compress a string to a list of output symbols. */
   public static List<Integer> compress(String uncompressed) {
       // Build the dictionary.
       int dictSize = 256;
       Map<String,Integer> dictionary = new HashMap<String,Integer>();
       for (int i = 0; i < 256; i++)
           dictionary.put("" + (char)i, i);
       
       String w = "";
       List<Integer> result = new ArrayList<Integer>();
       for (char c : uncompressed.toCharArray()) {
           String wc = w + c;
           if (dictionary.containsKey(wc))
               w = wc;
           else {
               result.add(dictionary.get(w));
               // Add wc to the dictionary.
               dictionary.put(wc, dictSize++);
               w = "" + c;
           }
       }

       // Output the code for w.
       if (!w.equals(""))
           result.add(dictionary.get(w));
       return result;
   }
   
   /** Decompress a list of output ks to a string. */
   public static String decompress(List<Integer> compressed) {
       // Build the dictionary.
       int dictSize = 256;
       Map<Integer,String> dictionary = new HashMap<Integer,String>();
       for (int i = 0; i < 256; i++)
           dictionary.put(i, "" + (char)i);
       
       String w = "" + (char)(int)compressed.remove(0);
       String result = w;
       for (int k : compressed) {
           String entry;
           if (dictionary.containsKey(k))
               entry = dictionary.get(k);
           else if (k == dictSize)
               entry = w + w.charAt(0);
           else
               throw new IllegalArgumentException("Bad compressed k: " + k);
           
           result += entry;
           
           // Add w+entry[0] to the dictionary.
           dictionary.put(dictSize++, w + entry.charAt(0));
           
           w = entry;
       }
       return result;
   }

   public static void main(String[] args) {
       List<Integer> compressed = compress("TOBEORNOTTOBEORTOBEORNOT");
       System.out.println(compressed);
       String decompressed = decompress(compressed);
       System.out.println(decompressed);
   }

}</lang>

JavaScript

<lang javascript> //LZW Compression/Decompression for Strings var LZW = {

   "compress" : function(uncompressed) 
   {
       // Build the dictionary.
       var dictSize = 256;
       var dictionary = {};
       for (var i = 0; i < 256; i++)
       {
           dictionary[String.fromCharCode(i)] = i;
       }
		
       var w = "";
       var result = [];
       for (var i = 0; i < uncompressed.length; i++) 
       {
       	var c = uncompressed.charAt(i);
           var wc = w + c;
           if (dictionary[wc])
               w = wc;
           else {
               result.push(dictionary[w]);
               // Add wc to the dictionary.
               dictionary[wc] = dictSize++;
               w = "" + c;
           }
       }

       // Output the code for w.
       if (w != "")
           result.push(dictionary[w]);
       return result;
   },

   "decompress" : function(compressed) {
       // Build the dictionary.
       var dictSize = 256;
       var dictionary = [];
       for (var i = 0; i < 256; i++)
       {
           dictionary[i] = String.fromCharCode(i);
	}

       var w = String.fromCharCode(compressed[0]);
       var result = w;
       for (var i = 1; i < compressed.length; i++) {
           var entry = "";
           var k = compressed[i];
           if (dictionary[k])
               entry = dictionary[k];
           else if (k == dictSize)
               entry = w + w.charAt(0);
           else
               return null;

           result += entry;

           // Add w+entry[0] to the dictionary.
           dictionary[dictSize++] = w + entry.charAt(0);

           w = entry;
       }
       return result;
   }

}

// For Test Purposes var comp = LZW.compress("TOBEORNOTTOBEORTOBEORNOT"); var decomp = LZW.decompress(comp); document.write(comp+'
'+decomp); </lang>

Output: <lang javascript> 84,79,66,69,79,82,78,79,84,256,258,260,265,259,261,263 TOBEORNOTTOBEORTOBEORNOT </lang>

Objective-C

Works with: GNUstep

The class for the LZW compression algorithm:

<lang objc>#import <Foundation/Foundation.h>

import <stdio.h>

@interface LZWCompressor : NSObject {

 @private
   NSMutableArray *iostream;
   NSMutableDictionary *dict;
   NSUInteger codemark;

}

-(LZWCompressor *) init; -(LZWCompressor *) initWithArray: (NSMutableArray *) stream; -(BOOL) compressData: (NSData *) string; -(void) setArray: (NSMutableArray *) stream; -(NSArray *) getArray; @end

@implementation LZWCompressor : NSObject

-(LZWCompressor *) init {

  self = [super init];
  if ( self )
  {
     iostream = nil;
     codemark = 256;
     dict = [[NSMutableDictionary alloc] initWithCapacity: 512];
  }
  return self;

}

-(LZWCompressor *) initWithArray: (NSMutableArray *) stream {

  self = [self init];
  if ( self )
  {
     [self setArray: stream];
  }
  return self;

}

-(void) dealloc {

  [dict release];
  [iostream release];
  [super dealloc];

}

-(void) setArray: (NSMutableArray *) stream {

  iostream = [stream retain];

}

-(BOOL) compressData: (NSData *) string; {

   NSUInteger i;
   unsigned char j;
   
   // prepare dict
   for(i=0; i < 256; i++)
   {
      j = i;
      NSData *s = [NSData dataWithBytes: &j length: 1];
      [dict setObject: [NSNumber numberWithUnsignedInt: i] forKey: s];
   }
   
   NSMutableData *w = [NSMutableData data];
   NSMutableData *wc = [NSMutableData data];
   
   for(i=0; i < [string length]; i++)
   {
      [wc setData: w];
      [wc appendData: [string subdataWithRange: NSMakeRange(i, 1)]];
      if ( [dict objectForKey: wc] != nil )
      {
         [w setData: wc];
      } else {
         [iostream addObject: [dict objectForKey: w]];
         [dict setObject: [NSNumber numberWithUnsignedInt: codemark] forKey: wc];
         codemark++;
         [w setData: [string subdataWithRange: NSMakeRange(i, 1)]];
      }
   }
   if ( [w length] != 0 )
   {
      [iostream addObject: [dict objectForKey: w]];
   }
   return YES;

}

-(NSArray *) getArray {

 return iostream;

}

@end</lang>

Usage example:

<lang objc>const char *text = "TOBEORNOTTOBEORTOBEORNOT";

int main() {

 NSAutoreleasePool *pool = [[NSAutoreleasePool alloc] init];
 
 NSMutableArray *array = [[NSMutableArray alloc] init];
 LZWCompressor *lzw = [[LZWCompressor alloc]
                       initWithArray: array ];
 if ( lzw )
 {
    [lzw compressData: [NSData dataWithBytes: text
                        length: strlen(text)]];
    NSEnumerator *en = [array objectEnumerator];
    id obj;
    while( (obj = [en nextObject]) )
    {
       printf("%u\n", [obj unsignedIntValue]);
    }
    [lzw release];
 }
 [array release];
 
 [pool release];
 return EXIT_SUCCESS;

}</lang>

Output (reformatted by hand):

 84  79  66  69  79  82  78  79
 84 256 258 260 265 259 261 263

OCaml

<lang ocaml>#directory "+site-lib/extlib/"

load "extLib.cma"

open ExtString

(** compress a string to a list of output symbols *) let compress ~uncompressed =

 (* build the dictionary *)
 let dict_size = 256 in
 let dictionary = Hashtbl.create 397 in
 for i=0 to 255 do
   let str = String.make 1 (char_of_int i) in
   Hashtbl.add dictionary str i
 done;

 let f = (fun (w, dict_size, result) c ->
   let c = String.make 1 c in
   let wc = w ^ c in
   if Hashtbl.mem dictionary wc then
     (wc, dict_size, result)
   else
     begin
       (* add wc to the dictionary *)
       Hashtbl.add dictionary wc dict_size;
       let this = Hashtbl.find dictionary w in
       (c, dict_size + 1, this::result)
     end
 ) in
 let w, _, result =
   String.fold_left f ("", dict_size, []) uncompressed
 in

 (* output the code for w *)
 let result =
   if w = ""
   then result
   else (Hashtbl.find dictionary w) :: result
 in

 (List.rev result)

exception ValueError of string

(** decompress a list of output symbols to a string *) let decompress ~compressed =

 (* build the dictionary *)
 let dict_size = 256 in
 let dictionary = Hashtbl.create 397 in
 for i=0 to pred dict_size do
   let str = String.make 1 (char_of_int i) in
   Hashtbl.add dictionary i str
 done;

 let w, compressed =
   match compressed with
   | hd::tl -> (String.make 1 (char_of_int hd)), tl
   | [] -> failwith "empty input"
 in

 let result = w::[] in

 let result, _, _ =
   List.fold_left (fun (result, w, dict_size) k ->
     let entry =
       if Hashtbl.mem dictionary k then
         Hashtbl.find dictionary k
       else if k = Hashtbl.length dictionary then
         w ^ (String.make 1 w.[0])
       else
         raise(ValueError(Printf.sprintf "Bad compressed k: %d" k))
     in
     let result = entry :: result in
  
     (* add (w ^ entry.[0]) to the dictionary *)
     Hashtbl.add dictionary dict_size (w ^ (String.make 1 entry.[0]));
     (result, entry, dict_size + 1)
   ) (result, w, dict_size) compressed
 in
 (List.rev result)

</lang>

here is the interface: <lang ocaml>val compress : uncompressed:string -> int list val decompress : compressed:int list -> string list</lang>

How to use:
The compressed datas are a list of symbols (of type int) that will require more than 8 bits to be saved. So to know how many bits are required, you need to know how many bits are required for the greatest symbol in the list.

<lang ocaml>let greatest = List.fold_left max 0 ;;

(** number of bits needed to encode the integer m *) let n_bits m =

 let m = float m in
 let rec aux n =
   let max = (2. ** n) -. 1. in
   if max >= m then int_of_float n
   else aux (n +. 1.0)
 in
 aux 1.0

let write_compressed ~filename ~compressed =

 let nbits = n_bits(greatest compressed) in
 let oc = open_out filename in
 output_byte oc nbits;
 let ob = IO.output_bits(IO.output_channel oc) in
 List.iter (IO.write_bits ob nbits) compressed;
 IO.flush_bits ob;
 close_out oc;

let read_compressed ~filename =

 let ic = open_in filename in
 let nbits = input_byte ic in
 let ib = IO.input_bits(IO.input_channel ic) in
 let rec loop acc =
   try
     let code = IO.read_bits ib nbits in
     loop (code::acc)
   with _ -> List.rev acc
 in
 let compressed = loop [] in
 let result = decompress ~compressed in
 let buf = Buffer.create 2048 in
 List.iter (Buffer.add_string buf) result;
 (Buffer.contents buf)

</lang>

Perl

In this version the hashes contain mixed typed data: <lang perl># Compress a string to a list of output symbols. sub compress {

   my $uncompressed = shift;

   # Build the dictionary.
   my $dict_size = 256;
   my %dictionary = map {chr $_ => chr $_} 0..$dict_size-1;

   my $w = "";
   my @result;
   foreach my $c (split , $uncompressed) {
       my $wc = $w . $c;
       if (exists $dictionary{$wc}) {
           $w = $wc;
       } else {
           push @result, $dictionary{$w};
           # Add wc to the dictionary.
           $dictionary{$wc} = $dict_size;
           $dict_size++;
           $w = $c;
       }
   }

   # Output the code for w.
   if ($w) {
       push @result, $dictionary{$w};
   }
   return @result;

}

Decompress a list of output ks to a string.

sub decompress {

   my @compressed = @_;

   # Build the dictionary.
   my $dict_size = 256;
   my %dictionary = map {chr $_ => chr $_} 0..$dict_size-1;

   my $w = shift @compressed;
   my $result = $w;
   foreach my $k (@compressed) {
       my $entry;
       if (exists $dictionary{$k}) {
           $entry = $dictionary{$k};
       } elsif ($k == $dict_size) {
           $entry = $w . substr(w,0,1);
       } else {
           die "Bad compressed k: $k";
       }
       $result .= $entry;

       # Add w+entry[0] to the dictionary.
       $dictionary{$dict_size} = $w . substr($entry,0,1);
       $dict_size++;

       $w = $entry;
   }
   return $result;

}

How to use:

my @compressed = compress('TOBEORNOTTOBEORTOBEORNOT'); print "@compressed\n"; my $decompressed = decompress(@compressed); print "$decompressed\n";</lang>

Output:

T O B E O R N O T 256 258 260 265 259 261 263
TOBEORNOTTOBEORTOBEORNOT

Python

In this version the dicts contain mixed typed data: <lang python>def compress(uncompressed):

   """Compress a string to a list of output symbols."""

   # Build the dictionary.
   dict_size = 256
   dictionary = dict((chr(i), chr(i)) for i in xrange(dict_size))
   # in Python 3: dictionary = {chr(i): chr(i) for i in range(dict_size)}

   w = ""
   result = []
   for c in uncompressed:
       wc = w + c
       if wc in dictionary:
           w = wc
       else:
           result.append(dictionary[w])
           # Add wc to the dictionary.
           dictionary[wc] = dict_size
           dict_size += 1
           w = c

   # Output the code for w.
   if w:
       result.append(dictionary[w])
   return result

def decompress(compressed):

   """Decompress a list of output ks to a string."""

   # Build the dictionary.
   dict_size = 256
   dictionary = dict((chr(i), chr(i)) for i in xrange(dict_size))
   # in Python 3: dictionary = {chr(i): chr(i) for i in range(dict_size)}

   w = result = compressed.pop(0)
   for k in compressed:
       if k in dictionary:
           entry = dictionary[k]
       elif k == dict_size:
           entry = w + w[0]
       else:
           raise ValueError('Bad compressed k: %s' % k)
       result += entry

       # Add w+entry[0] to the dictionary.
       dictionary[dict_size] = w + entry[0]
       dict_size += 1

       w = entry
   return result

How to use:

compressed = compress('TOBEORNOTTOBEORTOBEORNOT') print (compressed) decompressed = decompress(compressed) print (decompressed)</lang>

Output:

['T', 'O', 'B', 'E', 'O', 'R', 'N', 'O', 'T', 256, 258, 260, 265, 259, 261, 263]
TOBEORNOTTOBEORTOBEORNOT

Ruby

In this version the hashes contain mixed typed data: <lang ruby># Compress a string to a list of output symbols. def compress(uncompressed)

   # Build the dictionary.
   dict_size = 256
   dictionary = Hash[ Array.new(dict_size) {|i| [i.chr, i.chr]} ]

   w = ""
   result = []
   for c in uncompressed.split()
       wc = w + c
       if dictionary.has_key?(wc)
           w = wc
       else
           result << dictionary[w]
           # Add wc to the dictionary.
           dictionary[wc] = dict_size
           dict_size += 1
           w = c
       end
   end

   # Output the code for w.
   result << dictionary[w] unless w.empty?
   result

end

Decompress a list of output ks to a string.

def decompress(compressed)

   # Build the dictionary.
   dict_size = 256
   dictionary = Hash[ Array.new(dict_size) {|i| [i.chr, i.chr]} ]

   w = result = compressed.shift
   for k in compressed
       if dictionary.has_key?(k)
           entry = dictionary[k]
       elsif k == dict_size
           entry = w + w[0,1]
       else
           raise 'Bad compressed k: %s' % k
       end
       result += entry

       # Add w+entry[0] to the dictionary.
       dictionary[dict_size] = w + entry[0,1]
       dict_size += 1

       w = entry
   end
   result

end

How to use:

compressed = compress('TOBEORNOTTOBEORTOBEORNOT') p compressed decompressed = decompress(compressed) puts decompressed</lang>

Output:

["T", "O", "B", "E", "O", "R", "N", "O", "T", 256, 258, 260, 265, 259, 261, 263]
TOBEORNOTTOBEORTOBEORNOT

Seed7

<lang seed7>$ include "seed7_05.s7i";

const func string: lzwCompress (in string: uncompressed) is func

 result
   var string: result is "";
 local
   var char: ch is ' ';
   var hash [string] char: mydict is (hash [string] char).value;
   var string: buffer is "";
   var string: xstr is "";
 begin
   for ch range chr(0) to chr(255) do
     mydict @:= [str(ch)] ch;
   end for;
   for ch range uncompressed do
     xstr := buffer & str(ch);
     if xstr in mydict then
       buffer &:= str(ch)
     else
       result &:= str(mydict[buffer]);
       mydict @:= [xstr] chr(length(mydict));
       buffer := str(ch);
     end if;
   end for;
   if buffer <> "" then
     result &:= str(mydict[buffer]);
   end if;
 end func;

const func string: lzwDecompress (in string: compressed) is func

 result
   var string: result is "";
 local
   var char: ch is ' ';
   var hash [char] string: mydict is (hash [char] string).value;
   var string: buffer is "";
   var string: current is "";
   var string: chain is "";
 begin
   for ch range chr(0) to chr(255) do
     mydict @:= [ch] str(ch);
   end for;
   for ch range compressed do
     if buffer = "" then
       buffer := mydict[ch];
       result &:= buffer;
     elsif ch <= chr(255) then
       current := mydict[ch];
       result &:= current;
       chain := buffer & current;
       mydict @:= [chr(length(mydict))] chain;
       buffer := current;
     else
       if ch in mydict then
         chain := mydict[ch];
       else
         chain := buffer & str(buffer[1]);
       end if;
       result &:= chain;
       mydict @:= [chr(length(mydict))] buffer & str(chain[1]);
       buffer := chain;
     end if;
   end for;
 end func;

const proc: main is func

 local
   var string: compressed is "";
   var string: uncompressed is "";
 begin
   compressed := lzwCompress("TOBEORNOTTOBEORTOBEORNOT");
   writeln(literal(compressed));
   uncompressed := lzwDecompress(compressed);
   writeln(uncompressed);
 end func;</lang>

Output:

"TOBEORNOT\256\\258\\260\\265\\259\\261\\263\"
TOBEORNOTTOBEORTOBEORNOT

Original source: [1] and [2]

Tcl

<lang tcl>namespace eval LZW {

   variable char2int
   variable chars
   for {set i 0} {$i < 256} {incr i} {
       set char [binary format c $i]
       set char2int($char) $i
       lappend chars $char
   }

}

proc LZW::encode {data} {

   variable char2int
   array set dict [array get char2int]

   set w ""
   set result [list]

   foreach c [split $data ""] {
       set wc $w$c
       if {[info exists dict($wc)]} {
           set w $wc
       } else {
           lappend result $dict($w)
           set dict($wc) [array size dict]
           set w $c
       }
   }
   lappend result $dict($w)

}

proc LZW::decode {cdata} {

   variable chars
   set dict $chars

   set k [lindex $cdata 0]
   set w [lindex $dict $k]
   set result $w

   foreach k [lrange $cdata 1 end] {
       set currSizeDict [llength $dict]
       if {$k < $currSizeDict} {
           set entry [lindex $dict $k]
       } elseif {$k == $currSizeDict} {
           set entry $w[string index $w 0]
       } else {
           error "invalid code ($k) in ($cdata)"
       }
       append result $entry
       lappend dict $w[string index $entry 0]
       set w $entry
   }
   return $result

}

set s TOBEORNOTTOBEORTOBEORNOT# set e [LZW::encode $s] ;# ==> 84 79 66 69 79 82 78 79 84 256 258 260 265 259 261 263 35 set d [LZW::decode $e] ;# ==> TOBEORNOTTOBEORTOBEORNOT#

or

if {$s eq [LZW::decode [LZW::encode $s]]} then {puts success} else {puts fail} ;# ==> success