Verify distribution uniformity/Naive: Difference between revisions

Content deleted Content added

Inline

Revision as of 05:13, 13 August 2009

This task is an adjunct to Seven-dice from Five-dice.

Create a function to check that the random integers returned from a small-integer generator function have uniform distribution.

The function should take as arguments:

The function producing random integers.
The number of times to call the integer generator.
A 'delta' value of some sort that indicates how close to a flat distribution is close enough.

The function should produce:

Some indication of the distribution achieved.
An 'error' if the distribution is not flat enough.

Show the distribution checker working when the produced distribution is flat enough and when it is not. (Use a generator from Seven-dice from Five-dice).

See also:

Verify Distribution Uniformity with Chi-Squared Test‎

C

Library: Judy

<lang c>#include <stdio.h>

include <stdlib.h>
include <stdbool.h>
include <math.h>

include <Judy.h>

bool distcheck(int (*dist)(), int n, double D) {

 Pvoid_t h = (Pvoid_t) NULL;
 PWord_t value;
 PWord_t element;

 Word_t i;
 int h_length;

 // populate hashes
 for(i=0; i < n; i++) {
   int rn = dist();
   JLI(value, h, rn);
   ++*value;
 }

 JLC(h_length, h, 0, -1);

 double target = 1.0 * n / (double)h_length;

 i = 0;
 JLF(element, h, i);
 while(element != NULL) {
   if ( abs(*element - target) > 0.01*n*D ) {
     fprintf(stderr, "distribution potentially skewed for '%d': expected '%d', got '%d'\n",

i, (int)target, *element);

     JudyLFreeArray(&h, PJE0);
     return false; // bad distr.
   }
   JLN(element, h, i);
 }

 JudyLFreeArray(&h, PJE0);
 return true; // distr. ok

}

int main() {

 distcheck(rand, 1000000, 1);
 return 0;

}</lang>

OCaml

<lang ocaml>let distcheck fn n ?(delta=1.0) () =

 let h = Hashtbl.create 5 in
 for i = 1 to n do
   let v = fn() in
   let n = 
     try Hashtbl.find h v 
     with Not_found -> 0 
   in
   Hashtbl.replace h v (n+1)
 done;
 Hashtbl.iter (fun v n -> Printf.printf "%d => %d\n%!" v n) h;
 let target = (float n) /. float (Hashtbl.length h) in
 Hashtbl.iter (fun key value ->
   if abs_float(float value -. target) > 0.01 *. delta *. (float n)
   then (Printf.eprintf
     "distribution potentially skewed for '%d': expected around %f, got %d\n%!"
      key target value)
 ) h;

</lang>

Python

Works with: Python version 3.1

<lang python>from collections import Counter from pprint import pprint as pp

def distcheck(fn, repeats, delta):

   \
   Bin the answers to fn() and check bin counts are within +/- delta %
   of repeats/bincount
   bin = Counter(fn() for i in range(repeats))
   target = repeats // len(bin)
   deltacount = int(delta / 100. * target)
   assert all( abs(target - count) < deltacount
               for count in bin.values() ), "Bin distribution skewed from %i +/- %i: %s" % (
                   target, deltacount, [ (key, target - count)
                                         for key, count in sorted(bin.items()) ]
                   )
   pp(dict(bin))</lang>

Sample output:

>>> distcheck(dice5, 1000000, 1)
{1: 200244, 2: 199831, 3: 199548, 4: 199853, 5: 200524}
>>> distcheck(dice5, 1000, 1)
Traceback (most recent call last):
  File "<pyshell#30>", line 1, in <module>
    distcheck(dice5, 1000, 1)
  File "C://Paddys/rand7fromrand5.py", line 54, in distcheck
    for key, count in sorted(bin.items()) ]
AssertionError: Bin distribution skewed from 200 +/- 2: [(1, 4), (2, -33), (3, 6), (4, 11), (5, 12)]

Ruby

Translation of: Tcl

<lang ruby>def distcheck(n, delta=1)

 unless block_given?
   raise ArgumentError, "pass a block to this method"
 end
 
 h = Hash.new(0)
 n.times {h[ yield ] += 1}
 
 target = 1.0 * n / h.length
 h.each do |key, value| 
   if (value - target).abs > 0.01 * delta * n
     raise StandardError,
       "distribution potentially skewed for '#{key}': expected around #{target}, got #{value}"
   end
 end
 
 h.keys.sort.each {|k| print "#{k} #{h[k]} "}
 puts

end

if __FILE__ == $0

 begin
   distcheck(100_000) {rand(10)}
   distcheck(100_000) {rand > 0.95} 
 rescue StandardError => e
   p e
 end

end</lang>

output:

0 9986 1 9826 2 9861 3 10034 4 9876 5 10114 6 10329 7 9924 8 10123 9 9927 
#<StandardError: distribution potentially skewed for 'false': expected around 50000.0, got 94841>

Tcl

<lang tcl>proc distcheck {random times {delta 1}} {

   for {set i 0} {$i<$times} {incr i} {incr vals([uplevel 1 $random])}
   set target [expr {$times / [array size vals]}]
   foreach {k v} [array get vals] {
       if {abs($v - $target) > $times  * $delta / 100.0} {
          error "distribution potentially skewed for $k: expected around $target, got $v"
       }
   }
   foreach k [lsort -integer [array names vals]] {lappend result $k $vals($k)}
   return $result

}</lang> Demonstration: <lang tcl># First, a uniformly distributed random variable puts [distcheck {expr {int(10*rand())}} 100000]

Now, one that definitely isn't!

puts [distcheck {expr {rand()>0.95}} 100000]</lang> Which produces this output (error in red):

0 10003 1 9851 2 10058 3 10193 4 10126 5 10002 6 9852 7 9964 8 9957 9 9994
distribution potentially skewed for 0: expected around 50000, got 94873

@@ Line 91: / Line 91: @@
 =={{header|Python}}==
+{{works with|Python|3.1}}
 <lang python>from collections import Counter
 from pprint import pprint as pp