forked from briandalessandro/DataScienceCourse
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcourse_utils.py
More file actions
244 lines (195 loc) · 7.42 KB
/
course_utils.py
File metadata and controls
244 lines (195 loc) · 7.42 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import math
from sklearn.metrics import roc_curve, auc
import pickle
def evenSplit(dat, fld):
    '''
    Balance a dataframe on a binary field.

    Keeps every positive (fld == 1) row plus an equal-sized random sample of
    negative (fld == 0) rows, then shuffles the combined rows.

    Parameters
    ----------
    dat : pd.DataFrame
    fld : str
        Name of a binary (0/1) column in dat.

    Returns
    -------
    pd.DataFrame
        Shuffled frame with equal counts of both classes.
    '''
    pos = dat[dat[fld] == 1]
    neg = dat[dat[fld] == 0]
    # Shuffle the negatives first so the subset we keep is a random sample
    neg_shuf = neg.reindex(np.random.permutation(neg.index))
    # BUG FIX: DataFrame.append was removed in pandas 2.0 -> use pd.concat
    fin_temp = pd.concat([pos, neg_shuf[:pos.shape[0]]], ignore_index=True)
    fin_temp = fin_temp.reindex(np.random.permutation(fin_temp.index))
    return fin_temp
def trainTest(dat, pct):
    '''
    Randomly partition a dataframe into a [train, test] pair.

    pct is the fraction of rows (after shuffling) assigned to the
    training set; the remaining rows form the test set.
    '''
    shuffled = dat.reindex(np.random.permutation(dat.index))
    n_train = int(np.floor(shuffled.shape[0] * pct))
    return [shuffled[:n_train], shuffled[n_train:]]
def downSample(dat, fld, mult):
    '''
    Down-sample the negative class of a binary field.

    Keeps all positive (fld == 1) rows plus a random sample of negatives of
    size min(mult * #positives, #negatives), then shuffles the result.

    Parameters
    ----------
    dat : pd.DataFrame
    fld : str
        Name of a binary (0/1) column in dat.
    mult : int
        Maximum negatives-to-positives ratio to keep.

    Returns
    -------
    pd.DataFrame with a fresh RangeIndex, shuffled.
    '''
    pos = dat[dat[fld] == 1]
    neg = dat[dat[fld] == 0]
    neg_shuf = neg.reindex(np.random.permutation(neg.index))
    tot = min(pos.shape[0] * mult, neg.shape[0])
    # BUG FIX: DataFrame.append was removed in pandas 2.0 -> use pd.concat
    fin_temp = pd.concat([pos, neg_shuf[:tot]], ignore_index=True)
    # Shuffle rows via a temporary random sort key, then discard it
    fin_temp['r'] = np.random.random(fin_temp.shape[0])
    fin_temp = fin_temp.sort_values(by='r').reset_index(drop=True)
    # BUG FIX: the positional axis argument to drop() was removed in pandas 2.0
    fin_temp = fin_temp.drop('r', axis=1)
    return fin_temp
def scaleData(d):
    '''
    Min-max normalize every column: (x - min) / (max - min).

    Returns a new dataframe with the same index and columns as d, each
    column scaled into [0, 1].
    NOTE: a constant column yields division by zero (NaN/inf), as in the
    original implementation.
    '''
    # BUG FIX: the original seeded a throwaway 'temp' column from d.iloc[:,1]
    # (requires >= 2 columns) and removed it with the positional-axis
    # drop('temp', 1), which pandas 2.0 no longer accepts. Building an empty
    # frame on d's index preserves row alignment without either problem.
    df_scale = pd.DataFrame(index=d.index)
    for c in d.columns.values:
        df_scale[c] = (d[c] - d[c].min()) / (d[c].max() - d[c].min())
    return df_scale
def plot_dec_line(mn, mx, b0, b1, a, col, lab):
    '''
    Plot the decision line [b0, b1] . x + a = 0 in 2-d.

    Samples 100 x-coordinates uniformly at random from [mn, mx] and solves
    the line equation for the corresponding y values, then draws them with
    color/style col under legend label lab.
    '''
    xs = np.random.uniform(mn, mx, 100)
    ys = [-(x_i * b0 / b1 + a / b1) for x_i in xs]
    plt.plot(xs, ys, col, label=lab)
def plotSVM(X, Y, my_svm):
    '''
    Plot a fitted linear SVM: the separating line, the two margin lines,
    the support vectors, and the labeled points.

    X : dataframe whose first two columns are the plotted features
    Y : label vector with values in {-1, +1}
    my_svm : fitted sklearn linear SVM exposing coef_, intercept_,
             support_vectors_ and decision_function

    Code here derived or taken from this example http://scikit-learn.org/stable/auto_examples/svm/plot_separating_hyperplane.html
    '''
    # get the separating hyperplane: w.x + intercept = 0, rewritten in
    # slope/intercept form y = a*x + c
    w = my_svm.coef_[0]
    a = -w[0] / w[1]
    xx = np.linspace(X.iloc[:,0].min(), X.iloc[:,0].max()) # Changed so we are taking min, max on the same coordinate
    yy = a * xx - (my_svm.intercept_[0]) / w[1]
    # plot the parallels to the separating hyperplane that pass through the
    # support vectors.
    # Pick the support vector whose decision value is closest to +1 (upper margin)
    b = my_svm.support_vectors_[np.abs(my_svm.decision_function(my_svm.support_vectors_)-1).argmin()]
    yy_down = a * xx + (b[1] - a * b[0]) #By solving equation b[1] = a * b[0] + c for c.
    # ...and the one closest to -1 (lower margin)
    b = my_svm.support_vectors_[np.abs(my_svm.decision_function(my_svm.support_vectors_)+1).argmin()]
    yy_up = a * xx + (b[1] - a * b[0])
    # plot the line, the points, and the nearest vectors to the plane
    plt.plot(xx, yy, 'k-')
    plt.plot(xx, yy_down, 'k--')
    plt.plot(xx, yy_up, 'k--')
    plt.scatter(my_svm.support_vectors_[:, 0], my_svm.support_vectors_[:, 1], s=80, facecolors='none')
    plt.plot(X[(Y==-1)].iloc[:,0], X[(Y==-1)].iloc[:,1],'r.')
    plt.plot(X[(Y==1)].iloc[:,0], X[(Y==1)].iloc[:,1],'b+')
    #plt.axis('tight')
    #plt.show()
def getP(val):
    '''
    Logistic (sigmoid) function: return 1 / (1 + e^-val).
    '''
    return 1.0 / (1.0 + math.exp(-val))
def getY(val):
    '''
    Draw a binary outcome: 1 with probability sigmoid(val), else 0.

    A single uniform(0, 1) draw is compared against the logistic
    probability of val.
    '''
    p = (1 + math.exp(-1 * val)) ** -1  # logistic probability (inlined getP)
    draw = np.random.uniform(0, 1, 1)[0]
    return int(p > draw)
def gen_logistic_dataframe(n, alpha, betas):
    '''
    Generate a random dataset drawn from a logistic model.

    n     : number of samples
    alpha : intercept of the true model
    betas : coefficient vector; its length sets the number of features

    Returns a dataframe with uniform(0,1) features named f0..f{d-1} and a
    binary outcome column 'Y' drawn with probability sigmoid(X.betas + alpha).
    '''
    X = np.random.random([n, len(betas)])
    labels = [getY(score) for score in X.dot(betas) + alpha]
    frame = pd.DataFrame(X, columns=['f' + str(j) for j in range(X.shape[1])])
    frame['Y'] = labels
    return frame
def plotAUC(truth, pred, lab):
    '''
    Draw the ROC curve for (truth, pred) on the current axes.

    The curve is labeled with lab plus its AUC, drawn in a random color,
    alongside the 45-degree chance diagonal.
    '''
    fpr, tpr, _ = roc_curve(truth, pred)
    roc_auc = auc(fpr, tpr)
    color = (np.random.rand(), np.random.rand(), np.random.rand())
    plt.plot(fpr, tpr, color=color, label=lab + ' (AUC = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('FPR')
    plt.ylabel('TPR')
    plt.title('ROC')
    plt.legend(loc="lower right")
def LogLoss(dat, beta, alpha):
    '''
    Average Bernoulli log-likelihood of a logistic model on a dataframe.

    dat   : dataframe with a binary outcome column 'Y'; every other column
            is treated as a feature
    beta  : coefficient vector aligned with the feature columns
    alpha : intercept

    Returns the mean of Y*log(P) + (1-Y)*log(1-P), where
    P = sigmoid(X.beta + alpha). Despite the name this is the
    log-LIKELIHOOD (higher is better), not its negative.
    '''
    # BUG FIX: the positional axis argument to drop() was removed in pandas 2.0
    X = dat.drop('Y', axis=1)
    Y = dat['Y']
    XB = X.dot(np.array(beta)) + alpha * np.ones(len(Y))
    P = (1 + np.exp(-1 * XB)) ** -1
    return ((Y == 1) * np.log(P) + (Y == 0) * np.log(1 - P)).mean()
def LogLossP(Y, P):
    '''
    Average Bernoulli log-likelihood given binary outcomes Y and
    predicted probabilities P (log-likelihood: higher is better).
    '''
    pointwise = np.log(P) * (Y == 1) + np.log(1 - P) * (Y == 0)
    return pointwise.mean()
def plotSVD(sig):
    '''
    Plot singular values and their cumulative normalized energy.

    sig : 1-d numpy array of singular values.

    Top panel: bar chart of the k-th singular value.
    Bottom panel: cumulative sqrt(sum-of-squares) through k, normalized so
    the curve ends at 1. Both panels prepend a zero at k=0 so the series
    start at the origin.
    '''
    norm = math.sqrt(sum(sig * sig))
    energy_k = [math.sqrt(k) / norm for k in np.cumsum(sig * sig)]
    plt.figure()
    ax1 = plt.subplot(211)
    # BUG FIX: original used range(len(sig+1)) (elementwise add -> length n)
    # with [0]+sig (ndarray broadcast, not prepend); build the intended n+1
    # x-positions and bar heights explicitly.
    ax1.bar(range(len(sig) + 1), [0] + list(sig), 0.35)
    plt.title('Kth Singular Value')
    # BUG FIX: modern matplotlib requires booleans, not 'on'/'off' strings
    plt.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
    ax2 = plt.subplot(212)
    plt.plot(range(len(sig) + 1), [0] + energy_k)
    plt.title('Normalized Sum-of-Squares of Kth Singular Value')
    ax2.set_xlabel('Kth Singular Value')
    ax2.set_ylim([0, 1])
def genY(x, err, betas):
    '''
    Build a dataframe with columns x and y, where y is a polynomial in x
    plus additive noise: y = sum_i betas[i] * x**i + err.

    x     : np array of length n
    err   : noise vector of length n
    betas : coefficients; betas[i] multiplies x**i
    '''
    frame = pd.DataFrame(x, columns=['x'])
    total = err
    for power, coef in enumerate(betas):
        total = total + coef * x ** power
    frame['y'] = total
    return frame
def makePolyFeat(d, deg):
    '''
    Add polynomial features x**2 .. x**deg to a dataframe, in place.

    d   : dataframe with a column named 'x'
    deg : highest polynomial degree to generate

    Returns the same dataframe with new columns 'x2', ..., 'x<deg>'.
    '''
    for power in range(2, deg + 1):
        d['x' + str(power)] = d['x'] ** power
    return d
def save_obj(obj, name):
    '''
    Pickle obj to the file '<name>.pkl' using the highest protocol.
    '''
    with open(name + '.pkl', 'wb') as out_file:
        pickle.dump(obj, out_file, pickle.HIGHEST_PROTOCOL)
def load_obj(name):
    '''
    Load and return the object pickled in the file '<name>.pkl'.
    '''
    # BUG FIX: pickle data is binary; text mode 'r' fails on Python 3
    # (pickle.load requires a bytes stream).
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)
def happyClass(sig, n):
    '''
    Generate a noisy 2-d "happy face" classification dataset.

    sig : noise scale added to each region-membership test
    n   : sample size

    Features X1, X2 are uniform(0,1). Y=1 when a point falls inside either
    "eye" circle or the parabolic "mouth" band (each test jittered with
    gaussian noise), 0 otherwise.
    '''
    eye1 = [(0.7, 0.75), 0.1]
    eye2 = [(0.3, 0.75), 0.1]
    X1 = np.random.random(n)
    X2 = np.random.random(n)
    # circle tests: squared distance to each eye center vs. squared radius
    in_eye1 = 1 * (((X1 - eye1[0][0]) ** 2 + (X2 - eye1[0][1]) ** 2 + np.random.randn(n) * sig) < eye1[1] ** 2)
    in_eye2 = 1 * (((X1 - eye2[0][0]) ** 2 + (X2 - eye2[0][1]) ** 2 + np.random.randn(n) * sig) < eye2[1] ** 2)
    # mouth: a thin band around the parabola X2 = 0.1 + 4*(X1-0.5)^2, lower half only
    in_mouth = 1 * (abs(X2 - 0.1 - 4 * (X1 - 0.5) ** 2) + np.random.randn(n) * 5 * sig < 0.05) * 1 * (X2 < 0.5)
    labels = 1 * ((in_eye1 + in_eye2 + in_mouth) > 0)
    frame = pd.DataFrame({'X1': X1, 'X2': X2})
    frame['Y'] = labels
    return frame
def plotZgen(clf, dat, pc, t, fig):
    '''
    Plot a classifier's 2-d probability surface with the positive points.

    clf : fitted classifier exposing predict_proba
    dat : dataframe with feature columns 'X1', 'X2' and binary label 'Y'
    pc  : (rows, cols, index) subplot position within fig
    t   : subplot title
    fig : matplotlib figure to draw into
    '''
    plot_step = 0.02
    x_min, x_max = dat['X1'].min(), dat['X1'].max()
    y_min, y_max = dat['X2'].min(), dat['X2'].max()
    xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step), np.arange(y_min, y_max, plot_step))
    # P(class 1) evaluated over the grid
    Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
    Z = Z.reshape(xx.shape)
    ax = fig.add_subplot(pc[0], pc[1], pc[2])
    cs = plt.contourf(xx, yy, Z, cmap=plt.cm.cool)
    # BUG FIX: original indexed the undefined global 'noisy_test' for the
    # y-coordinates (NameError at runtime); both masks must come from dat.
    plt.plot(dat['X1'][(dat.Y == 1)], dat['X2'][(dat.Y == 1)], 'r.', markersize=2)
    plt.title(t)
    ax.axes.get_xaxis().set_visible(False)
    ax.axes.get_yaxis().set_visible(False)