LZW compression: Difference between revisions
Line 975:
{
printf("%u\n", [obj unsignedIntValue]);
}
[lzw release];
}
[array release];
|
Revision as of 09:58, 13 August 2009
You are encouraged to solve this task according to the task description, using any language you may know.
The Lempel-Ziv-Welch (LZW) algorithm provides lossless data compression. You can read a complete description of it in the Wikipedia article on the subject. It was patented, but it fell in the public domain in 2004.
C
See Bit oriented IO for bitio.h and Basic string manipulation functions for estrings.h.
<lang c>#include <stdio.h>
- include <stdlib.h>
- include <stdbool.h>
- include <string.h>
- include "bitio.h"
- include "estrings.h"
- define MIN(A,B) ( ((A)<(B)) ? (A) : (B) )
- define MAX(A,B) ( ((A)>(B)) ? (A) : (B) )
- define DEBUG
- if defined(DEBUG)
- define Debug(...) fprintf(stderr, __VA_ARGS__)
- else
- define Debug(...)
- endif</lang>
We need a dictionary and so we implement one according to our need.
<lang c>struct DictionaryNode {
String string; unsigned int code; struct DictionaryNode *next;
}; typedef struct DictionaryNode DN;
- define NODES_PER_ALLOC 512
struct DictionaryStruct {
DN *first; DN **nodes; size_t num_of_nodes; size_t num_of_alloc; bool sorted;
}; typedef struct DictionaryStruct *Dictionary;
Dictionary newDictionary() {
Dictionary t; t = malloc(sizeof(struct DictionaryStruct)); if ( t==NULL ) return NULL; t->num_of_nodes = 0; t->num_of_alloc = 1; t->nodes = malloc(t->num_of_alloc * NODES_PER_ALLOC * sizeof(void *)); t->first = NULL; t->sorted = false; return t;
}
bool _expandDictionary(Dictionary d) {
d->nodes = realloc(d->nodes, (d->num_of_alloc + 1) * NODES_PER_ALLOC * sizeof(void *)); if ( d->nodes == NULL ) return false; d->num_of_alloc++; return true;
}
bool dictionaryPut_ns(Dictionary d, String s, unsigned int c) {
DN *newdw; if ( (d->num_of_nodes + 1) > (d->num_of_alloc * NODES_PER_ALLOC) ) { if ( !_expandDictionary(d) ) return false; } newdw = malloc(sizeof(DN)); if ( newdw == NULL ) return false; newdw->string = cloneString(s); newdw->code = c; newdw->next = d->first; d->first = newdw; d->nodes[d->num_of_nodes] = newdw; d->num_of_nodes++; d->sorted = false; return true;
}
int _dict_strcompare(const void *av, const void *bv) {
DN *a = *((DN **)av); DN *b = *((DN **)bv); return compareStrings(a->string, b->string);
}
void dictionarySort(Dictionary d) {
if ( ! (d->sorted) ) { qsort(d->nodes, d->num_of_nodes, sizeof(void *), _dict_strcompare); d->sorted = true; }
}
bool dictionaryPut(Dictionary d, String s, unsigned int c) {
bool added = dictionaryPut_ns(d, s, c); if ( added ) { dictionarySort(d); } return added;
}
DN *dictionaryLookUp(Dictionary d, String s)
{
if ( d->num_of_nodes == 0 ) return NULL; int low, high, med, p; /* perform a "binary" search ... */ for(low=-1, high = d->num_of_nodes-1; (high-low)>1 ; ) { med = (high+low) >> 1; p = compareStrings(s, d->nodes[med]->string); if ( p <= 0 ) { high = med; } else { low = med; } } if ( compareStrings(s, d->nodes[high]->string) == 0 ) return d->nodes[high]; return NULL;
}
void destroyDictionary(Dictionary d) {
int i; if ( d==NULL ) return; if ( d->nodes != NULL ) { for(i=0; i < d->num_of_nodes; i++) { destroyString(d->nodes[i]->string); free(d->nodes[i]); } free(d->nodes); } free(d);
}</lang>
LZW code:
<lang c>#define BITS_PER_WORD 9 unsigned int codemark = 256; unsigned char num_of_bits = BITS_PER_WORD;
struct LZWHandleStruct {
unsigned int codemark; unsigned char num_of_bits; Dictionary dict; size_t bytes_written; FILE *iostream;
}; typedef struct LZWHandleStruct LZWHandle;
LZWHandle *lzw_begin(FILE *o) {
LZWHandle *lh; lh = malloc(sizeof(LZWHandle)); if ( lh != NULL ) { lh->bytes_written = 0; lh->codemark = 256; lh->num_of_bits = BITS_PER_WORD; lh->dict = newDictionary(); lh->iostream = o; if ( lh->dict == NULL ) { free(lh); return NULL; } } return lh;
}
bool lzw_compress(LZWHandle *lh, const char *stream, size_t len) {
unsigned int i; char tc; Dictionary d = lh->dict; DN *oc; String w = NULL; String wc = NULL; String t = NULL; size_t wbits = 0; t = newString(); for(i=0; i < 256; i++) { tc = i; setString(t, &tc, 1); dictionaryPut_ns(d, t, i); } destroyString(t); t = NULL; /* a trick: we have put values in order, so it is sorted*/ d->sorted = true; w = newString(); wc = newString(); for(i=0; i < len; i++) { copyString(wc, w); appendChar(wc, stream[i]); if ( (dictionaryLookUp(d, wc)) != NULL ) { copyString(w, wc); } else { oc = dictionaryLookUp(d, w); if ( oc != NULL ) { /* guardring for bugs :D */ wbits += bits_write(oc->code, lh->num_of_bits, lh->iostream); Debug("%u ; %d ; %u\n", oc->code, lh->num_of_bits, lh->codemark); } dictionaryPut(d, wc, lh->codemark); lh->codemark++; if ( lh->codemark >= (1 << lh->num_of_bits) ) { lh->num_of_bits++; } setString(w, &stream[i], 1); } } if ( ! stringIsEmpty(w) ) { oc = dictionaryLookUp(d, w); /* no guard ring for bugs here :) */ wbits += bits_write(oc->code, lh->num_of_bits, lh->iostream); Debug("%u ; %d ; %u\n", oc->code, lh->num_of_bits, lh->codemark); } lh->bytes_written = wbits >> 3; destroyString(w); destroyString(wc); return true;
}
int lzw_end(LZWHandle *lh) {
int oby=0; if ( lh != NULL ) { oby=bits_flush(lh->iostream); destroyDictionary(lh->dict); free(lh); } return oby;
}</lang>
Testing:
<lang c>const char *text = "TOBEORNOTTOBEORTOBEORNOT"; int main() {
LZWHandle *lzw; size_t writtenbytecount = 0; lzw = lzw_begin(stdout); if ( lzw != NULL ) { lzw_compress(lzw, text, strlen(text)); writtenbytecount += lzw->bytes_written; Debug("bits on exit: %u\n", lzw->num_of_bits); lzw_end(lzw); fprintf(stderr, "n. of input bytes: %u\n" "n. of output bytes: %u\n", strlen(text), writtenbytecount ); } else return 1; return 0;
}</lang>
This test code really emits the compressed bytes stream. The decimal values of the codes (as other solutions do) are sent as debug informations.
Common Lisp
This version is based upon the Perl one. It doesn't contain mixed type data at the cost of being more consy. It includes vector operation routines, since using VECTOR-PUSH-APPEND
reallocates the whole vector with each call.
The Babel library is required to convert octet vectors to strings. Lisp strings can contain characters out of the ASCII/latin1 character set, including the whole Unicode range in them. The exact encoding used is dependent upon the user's locale (LC_CTYPE
on Unix).
<lang lisp>(declaim (ftype (function (vector vector &optional fixnum fixnum) vector)
vector-append))
(defun vector-append (old new &optional (start2 0) end2)
(declare (optimize (speed 3) (safety 0) (debug 0))) (prog1 old (let* ((old-fill (fill-pointer old)) (new-fill (+ old-fill (length new)))) (when (> new-fill (array-dimension old 0)) (adjust-array old (* 4 new-fill))) (setf (fill-pointer old) new-fill) (replace old new :start1 old-fill :start2 start2 :end2 end2))))
(declaim (ftype (function (vector t) vector) vector-append1)) (defun vector-append1 (old new)
(prog1 old (let* ((old-fill (fill-pointer old)) (new-fill (1+ old-fill))) (when (> new-fill (array-dimension old 0)) (adjust-array old (* 4 new-fill))) (setf (fill-pointer old) new-fill) (setf (aref old old-fill) new))))
(declaim (ftype (function (&optional t) vector) make-empty-vector)) (defun make-empty-vector (&optional (element-type t))
(make-array 0 :element-type element-type :fill-pointer 0 :adjustable t))
(declaim (ftype (function (t &optional t) vector) make-vector-with-elt))
(defun make-vector-with-elt (elt &optional (element-type t))
(make-array 1 :element-type element-type :fill-pointer 1 :adjustable t :initial-element elt))
(declaim (ftype (function (vector t) vector) vector-append1-new)) (defun vector-append1-new (old new)
(vector-append1 (vector-append (make-empty-vector 'octet) old) new))
(declaim (ftype (function (vector vector) vector) vector-append-new)) (defun vector-append-new (old new)
(vector-append (vector-append (make-empty-vector 'octet) old) new))
(deftype octet () '(unsigned-byte 8))
(declaim (ftype (function () hash-table) build-dictionary)) (defun build-dictionary ()
(let ((dictionary (make-hash-table :test #'equalp))) (loop for i below 256 do (let ((vec (make-vector-with-elt i 'octet))) (setf (gethash vec dictionary) vec))) dictionary))
(declaim (ftype (function ((vector octet)) (vector octet))
lzw-compress-octets))
(defun lzw-compress-octets (octets)
(declare (optimize (speed 3) (safety 0) (debug 0))) (loop with dictionary-size of-type fixnum = 256 with w = (make-empty-vector 'octet) with result = (make-empty-vector 't) with dictionary = (build-dictionary) for c across octets for wc = (vector-append1-new w c) if (gethash wc dictionary) do (setq w wc) else do (vector-append result (gethash w dictionary)) (setf (gethash wc dictionary) (make-vector-with-elt dictionary-size)) (incf dictionary-size) (setq w (make-vector-with-elt c 'octet)) finally (unless (zerop (length (the (vector octet) w))) (vector-append result (gethash w dictionary))) (return result)))
(declaim (ftype (function (vector) (vector octet)) lzw-decompress)) (defun #1=lzw-decompress (octets)
(declare (optimize (speed 3) (safety 0) (debug 0))) (when (zerop (length octets)) (return-from #1# (make-empty-vector 'octet))) (loop with dictionary-size = 256 with dictionary = (build-dictionary) with result = (make-vector-with-elt (aref octets 0) 'octet) with w = (copy-seq result) for i from 1 below (length octets) for k = (make-vector-with-elt (aref octets i) 't) for entry = (or (gethash k dictionary) (if (equalp k dictionary-size) (coerce (list w (aref w 0)) '(vector octet)) (error "bad compresed entry at pos ~S" i))) do (vector-append result entry) (setf (gethash (make-vector-with-elt dictionary-size) dictionary) (vector-append1-new w (aref entry 0))) (incf dictionary-size) (setq w entry) finally (return result)))
(defgeneric lzw-compress (datum)
(:method ((string string)) (lzw-compress (babel:string-to-octets string))) (:method ((octets vector)) (lzw-compress-octets octets)))
(defun lzw-decompress-to-string (octets)
(babel:octets-to-string (lzw-decompress octets)))
(defun test (string)
(assert (equal #2=(lzw-decompress-to-string (lzw-compress string)) string) () "Can't compress ~S properly, got ~S instead" string #2#) t)
</lang>
And the format used:
<lang lisp>CL-USER> (test "TOBEORNOTTOBEORTOBEORNOT") T CL-USER> (lzw-compress "TOBEORNOTTOBEORTOBEORNOT")
- (84 79 66 69 79 82 78 79 84 256 258 260 265 259 261 263)
CL-USER> (lzw-decompress-to-string *) "TOBEORNOTTOBEORTOBEORNOT"</lang>
C++
<lang cpp>#include <string>
- include <map>
// Compress a string to a list of output symbols. // The result will be written to the output iterator // starting at "result"; the final iterator is returned. template <typename Iterator> Iterator compress(const std::string &uncompressed, Iterator result) {
// Build the dictionary. int dictSize = 256; std::map<std::string,int> dictionary; for (int i = 0; i < 256; i++) dictionary[std::string(1, i)] = i; std::string w; for (std::string::const_iterator it = uncompressed.begin(); it != uncompressed.end(); ++it) { char c = *it; std::string wc = w + c; if (dictionary.count(wc)) w = wc; else { *result++ = dictionary[w]; // Add wc to the dictionary. dictionary[wc] = dictSize++; w = std::string(1, c); } } // Output the code for w. if (!w.empty()) *result++ = dictionary[w]; return result;
}
// Decompress a list of output ks to a string. // "begin" and "end" must form a valid range of ints template <typename Iterator> std::string decompress(Iterator begin, Iterator end) {
// Build the dictionary. int dictSize = 256; std::map<int,std::string> dictionary; for (int i = 0; i < 256; i++) dictionary[i] = std::string(1, i); std::string w(1, *begin++); std::string result = w; std::string entry; for ( ; begin != end; begin++) { int k = *begin; if (dictionary.count(k)) entry = dictionary[k]; else if (k == dictSize) entry = w + w[0]; else throw "Bad compressed k"; result += entry; // Add w+entry[0] to the dictionary. dictionary[dictSize++] = w + entry[0]; w = entry; } return result;
}
- include <iostream>
- include <iterator>
- include <vector>
int main() {
std::vector<int> compressed; compress("TOBEORNOTTOBEORTOBEORNOT", std::back_inserter(compressed)); copy(compressed.begin(), compressed.end(), std::ostream_iterator<int>(std::cout, ", ")); std::cout << std::endl; std::string decompressed = decompress(compressed.begin(), compressed.end()); std::cout << decompressed << std::endl; return 0;
}</lang>
Clojure
<lang Clojure> (defn make-dict []
(let [vals (range 0 256)] (zipmap (map (comp #'list #'char) vals) vals)))
(defn compress [#^String text]
(loop [t (seq text) r '() w '() dict (make-dict) s 256] (let [c (first t)] (if c (let [wc (cons c w)] (if (get dict wc) (recur (rest t) r wc dict s) (recur (rest t) (cons (get dict w) r) (list c) (assoc dict wc s) (inc s)))) (reverse (if w (cons (get dict w) r) r))))))
(compress "TOBEORNOTTOBEORTOBEORNOT") </lang> The output: <lang Clojure> (84 79 66 69 79 82 78 79 84 256 258 260 265 259 261 263) </lang>
D
D 1, with Phobos, from the Python version (the final writefln works only on 7-bit ASCII strings): <lang d> import std.stdio: writefln;
/// Compress a string to a list of output symbols. int[] compress(string uncompressed) {
// build the dictionary int dict_size = 256; int[string] dictionary; for (int i; i < dict_size; i++) dictionary["" ~ cast(char)i] = i;
string w; int[] result; foreach (c; uncompressed) { string wc = w ~ c; if (wc in dictionary) w = wc; else { result ~= dictionary[w]; // add wc to the dictionary dictionary[wc] = dict_size; dict_size++; w = "" ~ c; } }
// output the code for w if (w.length) result ~= dictionary[w]; return result;
}
/// Decompress a list of output ks to a string.
string decompress(int[] compressed) {
// build the dictionary int dict_size = 256; string[int] dictionary; for (int i; i < dict_size; i++) dictionary[i] = "" ~ cast(char)i;
string w = "" ~ cast(char)compressed[0]; string result = w; foreach (k; compressed[1 .. $]) { string entry; if (k in dictionary) entry = dictionary[k]; else if (k == dict_size) entry = w ~ w[0]; else throw new Exception("Bad compressed k"); result ~= entry;
// add w+entry[0] to the dictionary dictionary[dict_size] = w ~ entry[0]; dict_size++;
w = entry; } return result;
}
void main() {
auto compressed = compress("TOBEORNOTTOBEORTOBEORNOT"); writefln(compressed);
auto decompressed = decompress(compressed); writefln(decompressed);
} </lang> The output: <lang d> [84,79,66,69,79,82,78,79,84,256,258,260,265,259,261,263] TOBEORNOTTOBEORTOBEORNOT </lang>
Forth
256 value next-symbol \ current string fragment create w 256 allot \ counted string : w=c ( c -- ) w 1+ c! 1 w c! ; : w+c ( c -- ) w count + c! w c@ 1+ w c! ;
\ Compression \ dictionary of strings to symbols 0 value dict : init-dict table to dict 256 to next-symbol dict set-current ; : free-dict forth-wordlist set-current ; : in-dict? ( key len -- ? ) \ can assume len > 1 dict search-wordlist dup if nip then ; : lookup-dict ( key len -- symbol ) dup 1 = if drop c@ exit then dict search-wordlist if >body @ else abort" bad-dict!" then ; : put-dict ( data key len -- ) nextname create , ; \ output buffer of symbols \ in real life, these symbols would be packed into octets variable out-size create out 256 cells allot : output ( symbol -- ) dup out out-size @ cells + ! 1 out-size +! dup 256 < if emit space else . then ; : compress ( addr len -- ) init-dict 0 out-size ! over c@ w=c 1 /string bounds do i c@ w+c w count in-dict? 0= if w count 1- lookup-dict output next-symbol dup w count put-dict 1+ to next-symbol i c@ w=c then loop w count lookup-dict output free-dict ;
\ Decompression \ array of symbols to strings (in real code this would need to be growable) \ next-symbol is reused for the size of this table create symtab 256 cells allot 0 value start : init-symtab 256 to next-symbol here to start ; : free-symtab start here - allot ; : get-symbol ( symbol -- addr len ) dup 256 < if pad c! pad 1 exit then 256 - cells symtab + @ count ; : add-symbol ( addr len -- ) here symtab next-symbol 256 - cells + ! s, next-symbol 1+ to next-symbol ; create entry 256 allot : decompress ( addr len -- ) init-symtab over @ dup emit w=c cells bounds cell+ do i @ next-symbol < if i @ get-symbol entry place else i @ next-symbol = if w 1+ c@ w count + c! w count 1+ entry place else abort" bad symbol!" then then entry count type \ output entry 1+ c@ w+c w count add-symbol entry count w place 1 cells +loop free-symtab ;
\ Testing s" TOBEORNOTTOBEORTOBEORNOT" compress cr \ T O B E O R N O T 256 258 260 265 259 261 263 out out-size @ decompress cr \ TOBEORNOTTOBEORTOBEORNOT
Java
<lang java>import java.util.*;
public class LZW {
/** Compress a string to a list of output symbols. */ public static List<Integer> compress(String uncompressed) { // Build the dictionary. int dictSize = 256; Map<String,Integer> dictionary = new HashMap<String,Integer>(); for (int i = 0; i < 256; i++) dictionary.put("" + (char)i, i); String w = ""; List<Integer> result = new ArrayList<Integer>(); for (char c : uncompressed.toCharArray()) { String wc = w + c; if (dictionary.containsKey(wc)) w = wc; else { result.add(dictionary.get(w)); // Add wc to the dictionary. dictionary.put(wc, dictSize++); w = "" + c; } } // Output the code for w. if (!w.equals("")) result.add(dictionary.get(w)); return result; } /** Decompress a list of output ks to a string. */ public static String decompress(List<Integer> compressed) { // Build the dictionary. int dictSize = 256; Map<Integer,String> dictionary = new HashMap<Integer,String>(); for (int i = 0; i < 256; i++) dictionary.put(i, "" + (char)i); String w = "" + (char)(int)compressed.remove(0); String result = w; for (int k : compressed) { String entry; if (dictionary.containsKey(k)) entry = dictionary.get(k); else if (k == dictSize) entry = w + w.charAt(0); else throw new IllegalArgumentException("Bad compressed k: " + k); result += entry; // Add w+entry[0] to the dictionary. dictionary.put(dictSize++, w + entry.charAt(0)); w = entry; } return result; }
public static void main(String[] args) { List<Integer> compressed = compress("TOBEORNOTTOBEORTOBEORNOT"); System.out.println(compressed); String decompressed = decompress(compressed); System.out.println(decompressed); }
}</lang>
JavaScript
<lang javascript> //LZW Compression/Decompression for Strings var LZW = {
"compress" : function(uncompressed) { // Build the dictionary. var dictSize = 256; var dictionary = {}; for (var i = 0; i < 256; i++) { dictionary[String.fromCharCode(i)] = i; } var w = ""; var result = []; for (var i = 0; i < uncompressed.length; i++) { var c = uncompressed.charAt(i); var wc = w + c; if (dictionary[wc]) w = wc; else { result.push(dictionary[w]); // Add wc to the dictionary. dictionary[wc] = dictSize++; w = "" + c; } } // Output the code for w. if (w != "") result.push(dictionary[w]); return result; },
"decompress" : function(compressed) { // Build the dictionary. var dictSize = 256; var dictionary = []; for (var i = 0; i < 256; i++) { dictionary[i] = String.fromCharCode(i); } var w = String.fromCharCode(compressed[0]); var result = w; for (var i = 1; i < compressed.length; i++) { var entry = ""; var k = compressed[i]; if (dictionary[k]) entry = dictionary[k]; else if (k == dictSize) entry = w + w.charAt(0); else return null; result += entry; // Add w+entry[0] to the dictionary. dictionary[dictSize++] = w + entry.charAt(0); w = entry; } return result; }
}
// For Test Purposes
var comp = LZW.compress("TOBEORNOTTOBEORTOBEORNOT");
var decomp = LZW.decompress(comp);
document.write(comp+'
'+decomp);
</lang>
Output: <lang javascript> 84,79,66,69,79,82,78,79,84,256,258,260,265,259,261,263 TOBEORNOTTOBEORTOBEORNOT </lang>
Objective-C
The class for the LZW compression algorithm:
<lang objc>#import <Foundation/Foundation.h>
- import <stdio.h>
@interface LZWCompressor : NSObject {
@private NSMutableArray *iostream; NSMutableDictionary *dict; NSUInteger codemark;
}
-(LZWCompressor *) init; -(LZWCompressor *) initWithArray: (NSMutableArray *) stream; -(BOOL) compressData: (NSData *) string; -(void) setArray: (NSMutableArray *) stream; -(NSArray *) getArray; @end
@implementation LZWCompressor : NSObject
-(LZWCompressor *) init {
self = [super init]; if ( self ) { iostream = nil; codemark = 256; dict = [[NSMutableDictionary alloc] initWithCapacity: 512]; } return self;
}
-(LZWCompressor *) initWithArray: (NSMutableArray *) stream {
self = [self init]; if ( self ) { [self setArray: stream]; } return self;
}
-(void) dealloc {
[dict release]; [iostream release]; [super dealloc];
}
-(void) setArray: (NSMutableArray *) stream {
iostream = [stream retain];
}
-(BOOL) compressData: (NSData *) string; {
NSUInteger i; unsigned char j; // prepare dict for(i=0; i < 256; i++) { j = i; NSData *s = [NSData dataWithBytes: &j length: 1]; [dict setObject: [NSNumber numberWithUnsignedInt: i] forKey: s]; } NSMutableData *w = [NSMutableData data]; NSMutableData *wc = [NSMutableData data]; for(i=0; i < [string length]; i++) { [wc setData: w]; [wc appendData: [string subdataWithRange: NSMakeRange(i, 1)]]; if ( [dict objectForKey: wc] != nil ) { [w setData: wc]; } else { [iostream addObject: [dict objectForKey: w]]; [dict setObject: [NSNumber numberWithUnsignedInt: codemark] forKey: wc]; codemark++; [w setData: [string subdataWithRange: NSMakeRange(i, 1)]]; } } if ( [w length] != 0 ) { [iostream addObject: [dict objectForKey: w]]; } return YES;
}
-(NSArray *) getArray {
return iostream;
}
@end</lang>
Usage example:
<lang objc>const char *text = "TOBEORNOTTOBEORTOBEORNOT";
int main() {
NSAutoreleasePool *pool = [[NSAutoreleasePool alloc] init]; NSMutableArray *array = [[NSMutableArray alloc] init]; LZWCompressor *lzw = [[LZWCompressor alloc] initWithArray: array ]; if ( lzw ) { [lzw compressData: [NSData dataWithBytes: text length: strlen(text)]]; NSEnumerator *en = [array objectEnumerator]; id obj; while( (obj = [en nextObject]) ) { printf("%u\n", [obj unsignedIntValue]); } [lzw release]; } [array release]; [pool release]; return EXIT_SUCCESS;
}</lang>
Output (reformatted by hand):
84 79 66 69 79 82 78 79 84 256 258 260 265 259 261 263
OCaml
<lang ocaml>#directory "+site-lib/extlib/"
- load "extLib.cma"
open ExtString
(** compress a string to a list of output symbols *) let compress ~uncompressed =
(* build the dictionary *) let dict_size = 256 in let dictionary = Hashtbl.create 397 in for i=0 to 255 do let str = String.make 1 (char_of_int i) in Hashtbl.add dictionary str i done; let f = (fun (w, dict_size, result) c -> let c = String.make 1 c in let wc = w ^ c in if Hashtbl.mem dictionary wc then (wc, dict_size, result) else begin (* add wc to the dictionary *) Hashtbl.add dictionary wc dict_size; let this = Hashtbl.find dictionary w in (c, dict_size + 1, this::result) end ) in let w, _, result = String.fold_left f ("", dict_size, []) uncompressed in
(* output the code for w *) let result = if w = "" then result else (Hashtbl.find dictionary w) :: result in
(List.rev result)
exception ValueError of string
(** decompress a list of output symbols to a string *) let decompress ~compressed =
(* build the dictionary *) let dict_size = 256 in let dictionary = Hashtbl.create 397 in for i=0 to pred dict_size do let str = String.make 1 (char_of_int i) in Hashtbl.add dictionary i str done;
let w, compressed = match compressed with | hd::tl -> (String.make 1 (char_of_int hd)), tl | [] -> failwith "empty input" in let result = w::[] in
let result, _, _ = List.fold_left (fun (result, w, dict_size) k -> let entry = if Hashtbl.mem dictionary k then Hashtbl.find dictionary k else if k = Hashtbl.length dictionary then w ^ (String.make 1 w.[0]) else raise(ValueError(Printf.sprintf "Bad compressed k: %d" k)) in let result = entry :: result in (* add (w ^ entry.[0]) to the dictionary *) Hashtbl.add dictionary dict_size (w ^ (String.make 1 entry.[0])); (result, entry, dict_size + 1) ) (result, w, dict_size) compressed in (List.rev result)
- </lang>
here is the interface: <lang ocaml>val compress : uncompressed:string -> int list val decompress : compressed:int list -> string list</lang>
How to use:
The compressed datas are a list of symbols (of type int) that will require more than 8 bits to be saved.
So to know how many bits are required, you need to know how many bits are required for the greatest symbol in the list.
<lang ocaml>let greatest = List.fold_left max 0 ;;
(** number of bits needed to encode the integer m *) let n_bits m =
let m = float m in let rec aux n = let max = (2. ** n) -. 1. in if max >= m then int_of_float n else aux (n +. 1.0) in aux 1.0
let write_compressed ~filename ~compressed =
let nbits = n_bits(greatest compressed) in let oc = open_out filename in output_byte oc nbits; let ob = IO.output_bits(IO.output_channel oc) in List.iter (IO.write_bits ob nbits) compressed; IO.flush_bits ob; close_out oc;
let read_compressed ~filename =
let ic = open_in filename in let nbits = input_byte ic in let ib = IO.input_bits(IO.input_channel ic) in let rec loop acc = try let code = IO.read_bits ib nbits in loop (code::acc) with _ -> List.rev acc in let compressed = loop [] in let result = decompress ~compressed in let buf = Buffer.create 2048 in List.iter (Buffer.add_string buf) result; (Buffer.contents buf)
- </lang>
Perl
In this version the hashes contain mixed typed data: <lang perl># Compress a string to a list of output symbols. sub compress {
my $uncompressed = shift;
# Build the dictionary. my $dict_size = 256; my %dictionary = map {chr $_ => chr $_} 0..$dict_size-1;
my $w = ""; my @result; foreach my $c (split , $uncompressed) { my $wc = $w . $c; if (exists $dictionary{$wc}) { $w = $wc; } else { push @result, $dictionary{$w}; # Add wc to the dictionary. $dictionary{$wc} = $dict_size; $dict_size++; $w = $c; } }
# Output the code for w. if ($w) { push @result, $dictionary{$w}; } return @result;
}
- Decompress a list of output ks to a string.
sub decompress {
my @compressed = @_;
# Build the dictionary. my $dict_size = 256; my %dictionary = map {chr $_ => chr $_} 0..$dict_size-1;
my $w = shift @compressed; my $result = $w; foreach my $k (@compressed) { my $entry; if (exists $dictionary{$k}) { $entry = $dictionary{$k}; } elsif ($k == $dict_size) { $entry = $w . substr(w,0,1); } else { die "Bad compressed k: $k"; } $result .= $entry;
# Add w+entry[0] to the dictionary. $dictionary{$dict_size} = $w . substr($entry,0,1); $dict_size++;
$w = $entry; } return $result;
}
- How to use:
my @compressed = compress('TOBEORNOTTOBEORTOBEORNOT'); print "@compressed\n"; my $decompressed = decompress(@compressed); print "$decompressed\n";</lang>
Output:
T O B E O R N O T 256 258 260 265 259 261 263 TOBEORNOTTOBEORTOBEORNOT
Python
In this version the dicts contain mixed typed data: <lang python>def compress(uncompressed):
"""Compress a string to a list of output symbols."""
# Build the dictionary. dict_size = 256 dictionary = dict((chr(i), chr(i)) for i in xrange(dict_size)) # in Python 3: dictionary = {chr(i): chr(i) for i in range(dict_size)}
w = "" result = [] for c in uncompressed: wc = w + c if wc in dictionary: w = wc else: result.append(dictionary[w]) # Add wc to the dictionary. dictionary[wc] = dict_size dict_size += 1 w = c
# Output the code for w. if w: result.append(dictionary[w]) return result
def decompress(compressed):
"""Decompress a list of output ks to a string."""
# Build the dictionary. dict_size = 256 dictionary = dict((chr(i), chr(i)) for i in xrange(dict_size)) # in Python 3: dictionary = {chr(i): chr(i) for i in range(dict_size)}
w = result = compressed.pop(0) for k in compressed: if k in dictionary: entry = dictionary[k] elif k == dict_size: entry = w + w[0] else: raise ValueError('Bad compressed k: %s' % k) result += entry
# Add w+entry[0] to the dictionary. dictionary[dict_size] = w + entry[0] dict_size += 1
w = entry return result
- How to use:
compressed = compress('TOBEORNOTTOBEORTOBEORNOT') print (compressed) decompressed = decompress(compressed) print (decompressed)</lang>
Output:
['T', 'O', 'B', 'E', 'O', 'R', 'N', 'O', 'T', 256, 258, 260, 265, 259, 261, 263] TOBEORNOTTOBEORTOBEORNOT
Ruby
In this version the hashes contain mixed typed data: <lang ruby># Compress a string to a list of output symbols. def compress(uncompressed)
# Build the dictionary. dict_size = 256 dictionary = Hash[ Array.new(dict_size) {|i| [i.chr, i.chr]} ]
w = "" result = [] for c in uncompressed.split() wc = w + c if dictionary.has_key?(wc) w = wc else result << dictionary[w] # Add wc to the dictionary. dictionary[wc] = dict_size dict_size += 1 w = c end end
# Output the code for w. result << dictionary[w] unless w.empty? result
end
- Decompress a list of output ks to a string.
def decompress(compressed)
# Build the dictionary. dict_size = 256 dictionary = Hash[ Array.new(dict_size) {|i| [i.chr, i.chr]} ]
w = result = compressed.shift for k in compressed if dictionary.has_key?(k) entry = dictionary[k] elsif k == dict_size entry = w + w[0,1] else raise 'Bad compressed k: %s' % k end result += entry
# Add w+entry[0] to the dictionary. dictionary[dict_size] = w + entry[0,1] dict_size += 1
w = entry end result
end
- How to use:
compressed = compress('TOBEORNOTTOBEORTOBEORNOT') p compressed decompressed = decompress(compressed) puts decompressed</lang>
Output:
["T", "O", "B", "E", "O", "R", "N", "O", "T", 256, 258, 260, 265, 259, 261, 263] TOBEORNOTTOBEORTOBEORNOT
Seed7
<lang seed7>$ include "seed7_05.s7i";
const func string: lzwCompress (in string: uncompressed) is func
result var string: result is ""; local var char: ch is ' '; var hash [string] char: mydict is (hash [string] char).value; var string: buffer is ""; var string: xstr is ""; begin for ch range chr(0) to chr(255) do mydict @:= [str(ch)] ch; end for; for ch range uncompressed do xstr := buffer & str(ch); if xstr in mydict then buffer &:= str(ch) else result &:= str(mydict[buffer]); mydict @:= [xstr] chr(length(mydict)); buffer := str(ch); end if; end for; if buffer <> "" then result &:= str(mydict[buffer]); end if; end func;
const func string: lzwDecompress (in string: compressed) is func
result var string: result is ""; local var char: ch is ' '; var hash [char] string: mydict is (hash [char] string).value; var string: buffer is ""; var string: current is ""; var string: chain is ""; begin for ch range chr(0) to chr(255) do mydict @:= [ch] str(ch); end for; for ch range compressed do if buffer = "" then buffer := mydict[ch]; result &:= buffer; elsif ch <= chr(255) then current := mydict[ch]; result &:= current; chain := buffer & current; mydict @:= [chr(length(mydict))] chain; buffer := current; else if ch in mydict then chain := mydict[ch]; else chain := buffer & str(buffer[1]); end if; result &:= chain; mydict @:= [chr(length(mydict))] buffer & str(chain[1]); buffer := chain; end if; end for; end func;
const proc: main is func
local var string: compressed is ""; var string: uncompressed is ""; begin compressed := lzwCompress("TOBEORNOTTOBEORTOBEORNOT"); writeln(literal(compressed)); uncompressed := lzwDecompress(compressed); writeln(uncompressed); end func;</lang>
Output:
"TOBEORNOT\256\\258\\260\\265\\259\\261\\263\" TOBEORNOTTOBEORTOBEORNOT
Tcl
<lang tcl>namespace eval LZW {
variable char2int variable chars for {set i 0} {$i < 256} {incr i} { set char [binary format c $i] set char2int($char) $i lappend chars $char }
}
proc LZW::encode {data} {
variable char2int array set dict [array get char2int]
set w "" set result [list]
foreach c [split $data ""] { set wc $w$c if {[info exists dict($wc)]} { set w $wc } else { lappend result $dict($w) set dict($wc) [array size dict] set w $c } } lappend result $dict($w)
}
proc LZW::decode {cdata} {
variable chars set dict $chars
set k [lindex $cdata 0] set w [lindex $dict $k] set result $w
foreach k [lrange $cdata 1 end] { set currSizeDict [llength $dict] if {$k < $currSizeDict} { set entry [lindex $dict $k] } elseif {$k == $currSizeDict} { set entry $w[string index $w 0] } else { error "invalid code ($k) in ($cdata)" } append result $entry lappend dict $w[string index $entry 0] set w $entry } return $result
}
set s TOBEORNOTTOBEORTOBEORNOT# set e [LZW::encode $s] ;# ==> 84 79 66 69 79 82 78 79 84 256 258 260 265 259 261 263 35 set d [LZW::decode $e] ;# ==> TOBEORNOTTOBEORTOBEORNOT#
- or
if {$s eq [LZW::decode [LZW::encode $s]]} then {puts success} else {puts fail} ;# ==> success