-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathload.py
More file actions
157 lines (111 loc) · 4.34 KB
/
load.py
File metadata and controls
157 lines (111 loc) · 4.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
# coding: utf-8
from pandas import *
import pandas as pd
import numpy as np
from urllib import urlopen
from bokeh.plotting import *
import scipy.special
from matplotlib import pyplot as plt
# Module-level data loading. This runs at import time and reads both CSV
# files from hard-coded local paths.
print('Pls wait till the data loads and prints columns')

# The paths are handed straight to pandas, which opens and closes the files
# itself. Previously each path went through urlopen() and the resulting
# handle was never closed.
babyCSV = "/home/alakshminara/Downloads/2008_births.csv"
babyModCSV = "/home/alakshminara/Desktop/DataScience/WeighingBabies/birthCSVmod.csv"
DF_baby = pd.read_csv(babyCSV)
DF_babymod = pd.read_csv(babyModCSV)

# Column assignment between two frames aligns on their row index.
DF_babymod['WGROUP'] = DF_baby.WEIGHT
columns = DF_baby.columns
print(list(DF_baby.columns.values))

# Placeholders named after the frames built by usr_load_sammi() and
# usr_random_splitDF(). They are empty DataFrame *instances* (the original
# bound the DataFrame class itself), so len() and column access behave like
# an empty frame instead of failing on a class object.
DF_sammi = pd.DataFrame()
DF_train = pd.DataFrame()
DF_test = pd.DataFrame()
DF_eval = pd.DataFrame()
DF_sammi_male = pd.DataFrame()
def usr_load_sammi():
    """Build the two comparison subsets of DF_baby and print weight stats.

    A row qualifies when RACEMOM == 1, RACEDAD == 1, 25 < MAGE < 50 and
    BPOUND < 20. Qualifying rows with SEX == 2 are stored in the
    module-level ``DF_sammi``; those with SEX == 1 are stored in the
    module-level ``DF_sammi_male``. The mean and median BPOUND of
    ``DF_sammi`` are printed. Returns None.
    """
    # Both names must be declared global: without the declaration the male
    # subset was bound to a local name and discarded on return.
    global DF_sammi, DF_sammi_male

    # Criteria shared by both subsets; only the SEX code differs.
    shared = (
        (DF_baby['RACEMOM'] == 1) & (DF_baby['RACEDAD'] == 1)
        & (DF_baby['MAGE'] > 25) & (DF_baby['BPOUND'] < 20)
        & (DF_baby['MAGE'] < 50)
    )
    DF_sammi = pd.DataFrame(DF_baby[shared & (DF_baby['SEX'] == 2)])
    DF_sammi_male = pd.DataFrame(DF_baby[shared & (DF_baby['SEX'] == 1)])

    # Single-argument print() emits the same text under Python 2 and 3; the
    # explicit ' ' reproduces the separator of the old `print a, b` form.
    print('Mean Birth Weight for SammiDF ' + ' '
          + str(DF_sammi['BPOUND'].mean()))
    print('Median Birth Weight for SammiDF ' + ' '
          + str(DF_sammi['BPOUND'].median()))
def usr_scatter_plot(var1, var2):
    """Scatter two DF_sammi columns against each other and show the plot.

    var1 -- DF_sammi column passed first to scatter(); also the x-axis label
    var2 -- DF_sammi column passed second to scatter(); also the y-axis label
    """
    figure_options = {
        'title': "Dataset of Babies similar to Sammi's Baby",
        'x_axis_label': var1,
        'y_axis_label': var2,
    }
    figure(**figure_options)

    # Register the static HTML target, then switch plot hold on.
    output_file('plot1.html')
    hold()

    # The columns are looked up only after the figure has been set up.
    first_column = DF_sammi[var1]
    second_column = DF_sammi[var2]
    scatter(first_column, second_column, marker="square", color="black")

    show()
def usr_line_plot_USA():
    """Draw the hard-coded US baby-weight frequency table as a line plot."""
    # The two sequences are paired by position: one count per weight value.
    weights_lbs = list(range(2, 13))
    frequencies = [
        23292, 31900, 67140, 218296, 788148, 1663512,
        1120642, 280270, 39109, 4443, 4361,
    ]

    figure(
        title="Dataset of Babies in US",
        x_axis_label='Baby Weight (lbs.)',
        y_axis_label='Frequency',
    )
    # NOTE(review): no output_file() call here, unlike the other plot
    # helpers -- presumably relies on one made earlier; confirm.
    line(weights_lbs, frequencies, marker="square", color="black")
    show()
def usr_histogram_plot(var1):
    """Render a 200-bin density histogram to plot.html and show it.

    var1 -- DF_sammi column name; used as the x-axis label and read for the
            normalised series computed below.

    NOTE(review): the bars are built from 1000 samples drawn from
    Normal(mu, sigma), not from DF_sammi[var1] -- confirm this is intended.
    """
    hold(False)
    figure(title="Babies similar to Sammi's Baby", x_axis_label=var1)

    # Parameters of the sampled normal distribution (tunable).
    mu = 6.834
    sigma = 1

    samples = np.random.normal(mu, sigma, 1000)
    densities, bin_edges = np.histogram(samples, density=True, bins=200)

    # Min-max scale the requested column and evaluate the normal density on
    # it. NOTE(review): neither value is handed to a renderer below.
    x = DF_sammi[var1]
    lowest = min(x)
    highest = max(x)
    x = (x - lowest) / (highest - lowest)
    scale = 1 / (sigma * np.sqrt(2 * np.pi))
    pdf = scale * np.exp(-(x - mu) ** 2 / (2 * sigma ** 2))

    # Register the static HTML target, then switch plot hold on.
    output_file('plot.html')
    hold()

    # One quad per bin: consecutive edges give the left/right sides.
    quad(
        top=densities,
        bottom=0,
        left=bin_edges[:-1],
        right=bin_edges[1:],
        fill_color="#036565",
        line_color="#033649",
        tools="",
    )

    legend().orientation = "top_left"
    show()
def usr_random_splitDF():
    """Randomly partition DF_baby into train, test and eval frames.

    Each row is drawn into the training frame with probability 0.7. Each
    remaining row is then drawn into the test frame with probability 0.6,
    and whatever is left becomes the eval frame. The three lengths are
    printed.

    The frames are stored in the module-level ``DF_train``, ``DF_test`` and
    ``DF_eval`` and also returned as a ``(train, test, eval)`` tuple.
    """
    # Declared global so the split outlives the call; previously the three
    # frames were locals and were discarded on return.
    global DF_train, DF_test, DF_eval

    in_train = np.random.rand(len(DF_baby)) < 0.7
    DF_train = DF_baby[in_train]
    held_out = DF_baby[~in_train]

    # Second draw runs over the held-out rows only.
    in_test = np.random.rand(len(held_out)) < 0.6
    DF_test = held_out[in_test]
    DF_eval = held_out[~in_test]

    print('Train(len) : {0}'.format(len(DF_train)))
    print('Test(len) : {0}'.format(len(DF_test)))
    print('Eval(len) : {0}'.format(len(DF_eval)))
    return DF_train, DF_test, DF_eval
def usr_print_mod_csv():
    """Print the column names of DF_babymod, then the result of its head()."""
    column_names = list(DF_babymod.columns.values)
    print(column_names)
    preview = DF_babymod.head()
    print(preview)
def usr_decision_tree():
    """Import the ``trees`` module; nothing else is done here."""
    # Function-scope import: ``trees`` is only loaded when this is called.
    import trees
# Script section: the live call below runs whenever this module is executed
# or imported.
usr_load_sammi()
# Disabled calls, kept for manual experimentation. usr_histogram_plot_male,
# usr_scatter_plot2 and print_mean are not defined in the visible source.
#usr_scatter_plot('GAINED','BPOUND')
#usr_histogram_plot('BPOUND')
#usr_histogram_plot_male('BPOUND')
#usr_scatter_plot2()
#print_mean()
#usr_random_splitDF()
#usr_print_mod_csv()