A closer look at QMCPy’s Sobol’ generator

from qmcpy import *
from numpy import *
from matplotlib import pyplot
from time import time
import os

Basic usage

s = DigitalNetB2(5,seed=7)
s
DigitalNetB2 (DiscreteDistribution Object)
    d               5
    dvec            [0 1 2 3 4]
    randomize       LMS_DS
    graycode        0
    entropy         7
    spawn_key       ()
s.gen_samples(4) # generate Sobol' samples
array([[0.56269008, 0.17377997, 0.48903356, 0.72162356, 0.2540441 ],
       [0.346653  , 0.65070632, 0.98032014, 0.47528211, 0.89141566],
       [0.82074548, 0.95490574, 0.62413225, 0.21383165, 0.2009243 ],
       [0.10422261, 0.49458097, 0.09375278, 0.96749778, 0.5872364 ]])
s.gen_samples(n_min=2,n_max=4) # generate from specific range. If range is not powers of 2, use graycode
array([[0.82074548, 0.95490574, 0.62413225, 0.21383165, 0.2009243 ],
       [0.10422261, 0.49458097, 0.09375278, 0.96749778, 0.5872364 ]])
t0 = time()
s.gen_samples(2**25)
print('Time: %.2f'%(time()-t0))
Time: 1.77

Randomize with digital shift / linear matrix scramble

s = DigitalNetB2(2,randomize='LMS_DS') # linear matrix scramble with digital shift (default)
s.gen_samples(2)
array([[0.19108946, 0.54321668],
       [0.79173853, 0.39566573]])
s = DigitalNetB2(2,randomize='LMS') # just linear matrix scrambling
s.gen_samples(2, warn=False) # suppress warning that the first point is still the orgin
array([[0.        , 0.        ],
       [0.81182601, 0.87542258]])
s = DigitalNetB2(2,randomize='DS') # just digital shift
s.gen_samples(2)
array([[0.49126963, 0.3658057 ],
       [0.99126963, 0.8658057 ]])

Support for graycode and natural ordering

s = DigitalNetB2(2,randomize=False,graycode=False)
s.gen_samples(n_min=4,n_max=8,warn=False) # don't warn about non-randomized samples including the orgin
array([[0.125, 0.625],
       [0.625, 0.125],
       [0.375, 0.375],
       [0.875, 0.875]])
s = DigitalNetB2(2,randomize=False,graycode=True)
s.gen_samples(n_min=4,n_max=8,warn=False)
array([[0.375, 0.375],
       [0.875, 0.875],
       [0.625, 0.125],
       [0.125, 0.625]])

Custom Dimensions

s = DigitalNetB2(3,randomize=False)
s.gen_samples(n_min=4,n_max=8)
array([[0.125, 0.625, 0.375],
       [0.625, 0.125, 0.875],
       [0.375, 0.375, 0.625],
       [0.875, 0.875, 0.125]])
s = DigitalNetB2([1,2],randomize=False) # use only the second and third dimensions in the sequence
s.gen_samples(n_min=4,n_max=8)
array([[0.625, 0.375],
       [0.125, 0.875],
       [0.375, 0.625],
       [0.875, 0.125]])

Custom generating matrices

# a previously created generating matrix (not the default)
z = load('../qmcpy/discrete_distribution/digital_net_b2/generating_matrices/sobol_mat.51.30.30.msb.npy')
# max dimension 51, max samples 2^30, most significant bit in top of column
print(z.dtype)
z[:2,:2]
int64
array([[536870912, 805306368],
       [536870912, 268435456]])
z_custom = z[:10,:] # say this is our custom generating matrix. Make sure the datatype is numpy.int64
d_max,m_max = z_custom.shape
t_max = log2(z[0,0])+1 # number of bits in the first integer
f_path = 'my_sobol_mat.%d.%d.%d.msb.npy'%(d_max,t_max,m_max)
print(f_path)
save(f_path, z_custom)# save it to a file with proper naming convention
s = DigitalNetB2(3,generating_matrices=f_path) # plug in the path
print(s.gen_samples(4))
os.remove(f_path)
my_sobol_mat.10.30.30.msb.npy
[[0.51716073 0.27786139 0.31060685]
 [0.27369895 0.83466347 0.73047579]
 [0.24273408 0.09222157 0.77344039]
 [0.98569235 0.51987794 0.19732142]]

Elementay intervals

def plt_ei(x,ax,x_cuts,y_cuts):
    for ix in arange(1,x_cuts,dtype=float): ax.axvline(x=ix/x_cuts,color='r',alpha=.5)
    for iy in arange(1,y_cuts,dtype=float): ax.axhline(y=iy/y_cuts,color='r',alpha=.5)
    ax.scatter(x[:,0],x[:,1],color='b',s=25)
    ax.set_xlim([0,1])
    ax.set_xticks([0,1])
    ax.set_ylim([0,1])
    ax.set_yticks([0,1])
    ax.set_aspect(1)
# unrandomized
fig,ax = pyplot.subplots(ncols=3,nrows=2,figsize=(15,10))
s = DigitalNetB2(2,randomize=False)
plt_ei(s.gen_samples(2**2,warn=False),ax[0,0],2,2)
plt_ei(s.gen_samples(2**3,warn=False),ax[1,0],8,0)
plt_ei(s.gen_samples(2**4,warn=False),ax[0,1],4,4)
plt_ei(s.gen_samples(2**4,warn=False),ax[1,1],2,8)
plt_ei(s.gen_samples(2**6,warn=False),ax[0,2],8,8)
plt_ei(s.gen_samples(2**6,warn=False),ax[1,2],16,4)
fig.suptitle("Unrandomized Sobol' points");
../_images/digital_net_b2_22_0.png
fig,ax = pyplot.subplots(ncols=3,nrows=2,figsize=(15,10))
s = DigitalNetB2(2,randomize='LMS_DS')
plt_ei(s.gen_samples(2**2,warn=False),ax[0,0],2,2)
plt_ei(s.gen_samples(2**3,warn=False),ax[1,0],8,0)
plt_ei(s.gen_samples(2**4,warn=False),ax[0,1],4,4)
plt_ei(s.gen_samples(2**4,warn=False),ax[1,1],2,8)
plt_ei(s.gen_samples(2**6,warn=False),ax[0,2],8,8)
plt_ei(s.gen_samples(2**6,warn=False),ax[1,2],16,4)
fig.suptitle("Sobol' points with linear matrix scramble and digital shift");
../_images/digital_net_b2_23_0.png
fig,ax = pyplot.subplots(ncols=3,nrows=2,figsize=(15,10))
s = DigitalNetB2(2,graycode=True,randomize='LMS')
plt_ei(s.gen_samples(2**2,warn=False),ax[0,0],2,2)
plt_ei(s.gen_samples(2**3,warn=False),ax[1,0],8,0)
plt_ei(s.gen_samples(2**4,warn=False),ax[0,1],4,4)
plt_ei(s.gen_samples(2**4,warn=False),ax[1,1],2,8)
plt_ei(s.gen_samples(2**6,warn=False),ax[0,2],8,8)
plt_ei(s.gen_samples(2**6,warn=False),ax[1,2],16,4)
fig.suptitle("Sobol' points with linear matrix scramling shift");
../_images/digital_net_b2_24_0.png
fig,ax = pyplot.subplots(ncols=3,nrows=2,figsize=(15,10))
s = DigitalNetB2(2,graycode=True,randomize='DS')
plt_ei(s.gen_samples(2**2,warn=False),ax[0,0],2,2)
plt_ei(s.gen_samples(2**3,warn=False),ax[1,0],8,0)
plt_ei(s.gen_samples(2**4,warn=False),ax[0,1],4,4)
plt_ei(s.gen_samples(2**4,warn=False),ax[1,1],2,8)
plt_ei(s.gen_samples(2**6,warn=False),ax[0,2],8,8)
plt_ei(s.gen_samples(2**6,warn=False),ax[1,2],16,4)
fig.suptitle("Sobol' points with digital shift");
../_images/digital_net_b2_25_0.png
fig,ax = pyplot.subplots(figsize=(15,5))
s = DigitalNetB2([50,51],randomize='LMS_DS')
plt_ei(s.gen_samples(2**4),ax,4,4)
fig.suptitle("Sobol' points dimension 50 vs 51");
# nice properties do not necessary hold in higher dimensions
../_images/digital_net_b2_26_0.png
fig,ax = pyplot.subplots(figsize=(15,5))
s = DigitalNetB2(2,randomize='LMS_DS',graycode=True)
plt_ei(s.gen_samples(n_min=1,n_max=16),ax,4,4)
x16 = s.gen_samples(n_min=16,n_max=17)
ax.scatter(x16[:,0],x16[:,1],color='g',s=100)
fig.suptitle("Sobol' points 2-17");
# bbetter to take points 1-16 instead of 2-16
../_images/digital_net_b2_27_0.png

Skipping points vs. randomization

The first point in a Sobol’ sequence is \vec{0}. Therefore, some software packages skip this point as various transformations will map 0 to NAN. For example, the inverse CDF of a Gaussian density at \vec{0} is -\infty. However, we argue that it is better to randomize points than to simply skip the first point for the following reasons:

  1. Skipping the first point does not give the uniformity advantages when taking powers of 2. For example, notice the green point in the above plot of “Sobol’ points 2-17”.

  2. Randomizing Sobol’ points will return 0 with probability 0.

A more thorough explanation can be found in Art Owen’s paper On dropping the first Sobol’ point

So always randomize your Sobol’ unless you specifically need unrandomized points. In QMCPy the Sobol’ generator defaults to randomization with a linear matrix scramble.

The below code runs tests comparing unrandomized vs. randomization with linear matrix scramble with digital shift vs. randomization with just digital shift. Furthermore, we compare taking the first 2^n points vs. dropping the first point and taking 2^n points vs. dropping the first point and taking 2^n-1 points.

The 2D keister function is used for testing purposes as it can be exactly integrated using mathematica:
N[Integrate[E^(-x1^2 - x2^2) Cos[Sqrt[x1^2 + x2^2]], {x1, -Infinity, Infinity}, {x2, -Infinity, Infinity}]]

Plots the median (line) and fills the middle 10% of observations

def plt_k2d_sobol(ax,rtype,colors,plts=[1,2,3]):
    trials = 100
    solution = 1.808186429263620
    ms = arange(10,18)
    ax.set_xscale('log',base=2)
    ax.set_yscale('log',base=10)
    epsilons = {}
    if 1 in plts:
        epsilons['$2^m$ points with skipped $1^{st}$ point'] = zeros((trials,len(ms)),dtype=double)
    if 2 in plts:
        epsilons['$2^m-1$ points with skipped $1^{st}$ point'] = zeros((trials,len(ms)),dtype=double)
    if 3 in plts:
        epsilons['$2^n$ points no skippping'] = zeros((trials,len(ms)),dtype=double)
    for i,m in enumerate(ms):
        for t in range(trials):
            s = DigitalNetB2(2,randomize=rtype,graycode=True)
            k = Keister(s)
            if 1 in plts:
                epsilons['$2^m$ points with skipped $1^{st}$ point'][t,i] = \
                     abs(k.f(s.gen_samples(n_min=1,n_max=1+2**m)).mean()-solution)
            if 2 in plts:
                epsilons['$2^m-1$ points with skipped $1^{st}$ point'][t,i] = \
                     abs(k.f(s.gen_samples(n_min=1,n_max=2**m)).mean()-solution)
            if 3 in plts:
                epsilons['$2^n$ points no skippping'][t,i] = \
                     abs(k.f(s.gen_samples(n=2**m)).mean()-solution)
    for i,(label,eps) in enumerate(epsilons.items()):
        bot = percentile(eps, 40, axis=0)
        med = percentile(eps, 50, axis=0)
        top = percentile(eps, 60, axis=0)
        ax.loglog(2**ms,med,label=label+' with %s randomization'%rtype,color=colors[i])
        ax.fill_between(2**ms, bot, top, color=colors[i], alpha=.3)
# compare randomization fig,ax = pyplot.subplots(figsize=(10,10))
fig,ax = pyplot.subplots(figsize=(10,10))
plt_k2d_sobol(ax,None,['r','g'],[1,2])
plt_k2d_sobol(ax,'LMS_DS',['c','m','b'],[1,2,3])
plt_k2d_sobol(ax,'DS',['y','k','orangered'],[1,2,3])
ax.legend()
ax.set_xlabel('n')
ax.set_ylabel('$\\epsilon$');
../_images/digital_net_b2_30_0.png