WeighingBabies/load2.py at master · alkutnikar/WeighingBabies · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
# coding: utf-8
from pandas import *
import pandas as pd
import numpy as np
from urllib import urlopen
from bokeh.plotting import *
import scipy.special
from matplotlib import pyplot as plt
# Module-level setup: load the births CSV once and create the shared frames
# that the usr_* functions below read and rebind.
print('Pls wait till the data loads and prints columns')
# NOTE(review): this relies on Python 2's urllib.urlopen accepting a
# scheme-less local path; the path itself is machine-specific.
babyCSV = urlopen("/home/alakshminara/Downloads/2008_births.csv")
try:
    DF_baby = read_csv(babyCSV)
finally:
    # Release the file handle even if parsing fails.
    babyCSV.close()
columns = DF_baby.columns
print(list(DF_baby.columns.values))
# Placeholders are empty DataFrame *instances* (not the DataFrame class), so
# len() and column access behave like a frame before the loaders have run.
DF_sammi = pd.DataFrame()
DF_train = pd.DataFrame()
DF_test = pd.DataFrame()
DF_eval = pd.DataFrame()

def usr_load_sammi():
    """Select the DF_baby rows used as the "Sammi" comparison set.

    Keeps rows where RACEMOM == 1, RACEDAD == 1, 25 < MAGE < 50,
    BPOUND < 20 and SEX == 2, stores a DataFrame built from them in the
    module-level DF_sammi, and prints the mean and median of BPOUND
    (labelled "Birth Weight" in the output).

    NOTE(review): the meaning of the numeric codes (race 1, sex 2) is not
    visible in this file -- confirm against the dataset documentation.
    """
    global DF_sammi

    # Build the row mask one condition at a time, in the same column order
    # as the original single expression.
    selected = DF_baby['RACEMOM'] == 1
    selected = selected & (DF_baby['RACEDAD'] == 1)
    selected = selected & (DF_baby['MAGE'] > 25)
    selected = selected & (DF_baby['BPOUND'] < 20)
    selected = selected & (DF_baby['MAGE'] < 50)
    selected = selected & (DF_baby['SEX'] == 2)

    DF_sammi = pd.DataFrame(DF_baby[selected])

    # '%s %s' reproduces the label, a separating space, then str(value),
    # exactly as the two-item print statement did.
    weights = DF_sammi['BPOUND']
    print('%s %s' % ('Mean Birth Weight for SammiDF ', weights.mean()))
    print('%s %s' % ('Median Birth Weight for SammiDF ', weights.median()))


def usr_scatter_plot(var1, var2):
    """Scatter-plot one DF_sammi column against another.

    var1 -- column name used for the x values and the x-axis label.
    var2 -- column name used for the y values and the y-axis label.

    The plot is written to 'plot1.html' and then shown. DF_sammi must
    already hold the filtered data (see usr_load_sammi).
    """
    plot_title = "Dataset of Babies similar to Sammi's Baby"
    figure(title=plot_title, x_axis_label=var1, y_axis_label=var2)

    # Send output to a static HTML file, then turn on plot hold.
    output_file('plot1.html')
    hold()

    # Columns are looked up only now, after the figure has been set up.
    x_values = DF_sammi[var1]
    y_values = DF_sammi[var2]
    scatter(x_values, y_values, marker="square", color="black")

    show()

def usr_histogram_plot(var1):
    """Draw a density histogram and write it to 'plot.html'.

    var1 -- name of a DF_sammi column; it is only used to evaluate a normal
            pdf over that column (see the note below).

    The bars are built from 1000 samples drawn from a normal distribution
    with mean 6.834 and standard deviation 1. No seed is set here, so each
    call draws different samples.

    NOTE(review): the histogram uses synthetic samples rather than
    DF_sammi[var1], and the computed pdf is never plotted -- confirm
    whether that is intended.
    """
    hold(False)
    figure(title="Dataset of Babies similar to Sammi's Baby",
           x_axis_label='Birth Weight (lbs.)')

    # Parameters of the normal distribution being sampled.
    mu, sigma = 6.834, 1

    # Sample the distribution and bin the samples into 200 density bars.
    samples = np.random.normal(mu, sigma, 1000)
    bar_heights, bin_edges = np.histogram(samples, density=True, bins=200)

    # Normal pdf evaluated at the observed column values (currently unused).
    observed = DF_sammi[var1]
    norm_const = 1 / (sigma * np.sqrt(2 * np.pi))
    exponent = -(observed - mu) ** 2 / (2 * sigma ** 2)
    ideal_pdf = norm_const * np.exp(exponent)

    # Send output to a static HTML file, then turn on plot hold.
    output_file('plot.html')
    hold()

    # One quad per bin: left/right come from consecutive bin edges.
    quad(
        top=bar_heights,
        bottom=0,
        left=bin_edges[:-1],
        right=bin_edges[1:],
        fill_color="#036565",
        line_color="#033649",
        title="Dataset of Babies similar to Sammi's Baby",
        tools="",
    )

    # Place the legend in the top-left corner.
    legend().orientation = "top_left"

    show()


def usr_random_splitDF(df=None):
    """Randomly partition a frame into train, test and eval subsets.

    Each row is assigned to the training set with probability 0.7. Each
    remaining row is assigned to the test set with probability 0.6 and to
    the eval set otherwise. No random seed is set here, so the split
    differs between calls.

    df -- frame to split; defaults to the module-level DF_baby.

    The three subsets are stored in the module-level DF_train, DF_test and
    DF_eval, their sizes are printed, and they are also returned as a
    (train, test, eval) tuple.
    """
    # Without this declaration the assignments below would only create
    # locals and the module-level frames would never be updated.
    global DF_train, DF_test, DF_eval

    if df is None:
        df = DF_baby

    in_train = np.random.rand(len(df)) < 0.7
    DF_train = df[in_train]
    held_out = df[~in_train]

    in_test = np.random.rand(len(held_out)) < 0.6
    DF_test = held_out[in_test]
    DF_eval = held_out[~in_test]

    print('Train(len) : {0}'.format(len(DF_train)))
    print('Test(len) : {0}'.format(len(DF_test)))
    print('Eval(len) : {0}'.format(len(DF_eval)))

    return DF_train, DF_test, DF_eval


# Script driver. The filtering and plotting steps below are currently
# disabled; only the random train/test/eval split runs when the file executes.
# NOTE(review): print_mean is not defined anywhere in the visible source.
#usr_load_sammi()
#usr_scatter_plot('GAINED','BPOUND')
#usr_histogram_plot('BPOUND')
#print_mean()
usr_random_splitDF()