-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcentral_limit_theorem.py
59 lines (44 loc) · 2.73 KB
/
central_limit_theorem.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# A simulation to explain the Central Limit Theorem:
# even when the underlying data is not normally distributed, if you draw multiple samples and take the average of each,
# these averages will be approximately normally distributed.
# In an e-commerce shop, most of our customers are non-buying customers.
# So the distribution of order values actually looks like an exponential, and since a Poisson distribution can be
# derived from an exponential, let's make some exponential distributions to reflect our customers' purchases.
# Let us assume our customer base has an average order value of $170,
# so we will create exponential distributions with this average.
# We will attempt to recover this value by looking at some sample averages.
# I draw a sample of 1000 customers, then repeat this 4 times. This gives four distributions.
# (To get graphs similar to these, use the code below with repeat_sample_draws_exponential(4, 1000, 170, True).)
import numpy as np
import matplotlib.pyplot as plt
def repeat_sample_draws_exponential(n, samp_size, mu, show_all=False):
    """Illustrate the Central Limit Theorem with exponential order values.

    Draws ``n`` samples of ``samp_size`` values each from an exponential
    distribution with scale ``mu``, plots a histogram of the ``n`` sample
    means, saves it with ``plt.savefig`` and prints the mean and standard
    deviation of those sample means.

    Args:
        n: Number of samples to draw.
        samp_size: Number of values drawn for each sample.
        mu: Scale passed to ``np.random.exponential`` (the distribution mean).
        show_all: If true, additionally save a square grid of histograms,
            one per individual sample.

    Returns:
        None. Output is the saved figure file(s) and two printed lines.
        File names are given without an extension, so matplotlib's default
        save format applies.
    """
    samples = [np.random.exponential(mu, samp_size) for _ in range(n)]
    means = [np.mean(sample) for sample in samples]

    if show_all:
        # Smallest square grid that can hold n histograms. ``np.math`` was
        # removed in NumPy 2.0, so use NumPy's own ceil/sqrt; clamp to 1 so
        # that n == 0 still yields a valid (empty) grid.
        grid_dim = int(np.ceil(np.sqrt(max(n, 1))))
        # squeeze=False keeps ``axs`` a 2-D array even for a 1x1 grid, so the
        # flatten() below also works when n == 1.
        fig, axs = plt.subplots(grid_dim, grid_dim, figsize=(8, 8),
                                gridspec_kw={'hspace': 0.2},
                                sharex=True, sharey=True, squeeze=False)
        fig.suptitle('Individual Samples\' Order Value Distribution')
        # Shared axis captions for the whole grid, placed in figure coordinates.
        fig.text(0.5, 0.04, 'Order Values ($)', ha='center')
        fig.text(0.04, 0.5, 'Number of Customers', ha='center', rotation='vertical')
        axs = axs.flatten()
        for idx, (ax, sample, sample_mean) in enumerate(zip(axs, samples, means)):
            ax.hist(sample, bins=50, histtype='step',
                    label='$mean = {0:.2f}$'.format(sample_mean), range=[0, 2 * mu])
            # Per-panel titles are only added when there are few panels.
            if n < 10:
                ax.set_title('Sample #{0} : average={1:.2f}'.format(idx, sample_mean))
            for item in ([ax.title, ax.xaxis.label, ax.yaxis.label] +
                         ax.get_xticklabels() + ax.get_yticklabels()):
                item.set_fontsize(8)

        fig.savefig('expdist_{0}_mu_{1}_sample_{2}_sampsize'.format(mu, n, samp_size))

    # Clear the current figure (the grid figure, if one was just drawn) before
    # plotting the distribution of the sample means.
    plt.clf()
    plt.hist(means, bins=10, histtype='step')
    plt.title('Overall Average of {} Samples\' Average Order Value'.format(n))
    plt.xlabel('Average of Individual Sample\'s Order Value ($)')
    plt.savefig('average_of_expdist_{0}_mu_{1}_sample_{2}_sampsize'.format(mu, n, samp_size))
    print('mean of the samples is {0:.2f}'.format(np.mean(means)))
    print('standard deviation of the samples is {0:.2f}'.format(np.std(means)))
# Run the simulation: 100 samples of 1000 values each with scale 170,
# also saving the grid of per-sample histograms.
repeat_sample_draws_exponential(n=100, samp_size=1000, mu=170, show_all=True)