import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
df = pd.read_csv("https://www.mth548.org/_static/kde_marathon_results/marathon_results.csv")
df["tot_minutes"] = pd.to_timedelta(df["Finish"]).dt.total_seconds()/60
df
Age | M/F | Country | 5K | 10K | 15K | 20K | Half | 25K | 30K | 35K | 40K | Finish | Pace | Overall | Gender | Division | tot_minutes | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 25 | M | ETH | 00:14:43 | 00:29:43 | 00:44:57 | 01:00:29 | 01:04:02 | 01:16:07 | 01:32:00 | 01:47:59 | 02:02:39 | 02:09:17 | 00:04:56 | 1 | 1 | 1 | 129.283333 |
1 | 30 | M | ETH | 00:14:43 | 00:29:43 | 00:44:58 | 01:00:28 | 01:04:01 | 01:16:07 | 01:31:59 | 01:47:59 | 02:02:42 | 02:09:48 | 00:04:58 | 2 | 2 | 2 | 129.800000 |
2 | 29 | M | KEN | 00:14:43 | 00:29:43 | 00:44:57 | 01:00:29 | 01:04:02 | 01:16:07 | 01:32:00 | 01:47:59 | 02:03:01 | 02:10:22 | 00:04:59 | 3 | 3 | 3 | 130.366667 |
3 | 28 | M | KEN | 00:14:43 | 00:29:44 | 00:45:01 | 01:00:29 | 01:04:02 | 01:16:07 | 01:32:00 | 01:48:03 | 02:03:47 | 02:10:47 | 00:05:00 | 4 | 4 | 4 | 130.783333 |
4 | 32 | M | KEN | 00:14:43 | 00:29:44 | 00:44:58 | 01:00:28 | 01:04:01 | 01:16:07 | 01:32:00 | 01:47:59 | 02:03:27 | 02:10:49 | 00:05:00 | 5 | 5 | 5 | 130.816667 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
26293 | 64 | F | USA | 00:50:15 | 01:43:31 | 02:36:53 | 03:32:26 | 03:43:46 | 04:25:53 | 05:19:44 | 06:17:19 | 07:13:34 | 07:38:56 | 00:17:31 | 26594 | 12015 | 269 | 458.933333 |
26294 | 61 | F | USA | 00:48:36 | 01:39:39 | 02:39:13 | 03:35:58 | 03:47:55 | 04:32:44 | 05:31:58 | 06:28:56 | 07:26:19 | 07:51:30 | 00:17:59 | 26595 | 12016 | 270 | 471.500000 |
26295 | 66 | F | USA | 00:53:03 | 01:47:16 | 02:41:45 | 03:37:07 | 03:48:21 | 04:33:51 | 05:38:56 | 06:38:51 | 07:36:18 | 07:59:33 | 00:18:18 | 26596 | 12017 | 91 | 479.550000 |
26296 | 53 | M | USA | 00:49:04 | 01:40:12 | 02:33:31 | 03:31:41 | 03:43:35 | 04:29:20 | 05:31:11 | 06:33:35 | 07:35:38 | 08:00:37 | 00:18:20 | 26597 | 14580 | 2055 | 480.616667 |
26297 | 62 | M | USA | 00:40:14 | 01:28:18 | 02:26:46 | 03:28:41 | 03:40:36 | 04:36:06 | 05:43:44 | 06:51:31 | 07:41:28 | 08:06:01 | 00:18:33 | 26598 | 14581 | 898 | 486.016667 |
26298 rows × 18 columns
from scipy.stats import gaussian_kde
kde = gaussian_kde(df['tot_minutes'])
kde(200)
array([0.01028678])
kde.integrate_box(120, 180)
0.09945655722252811
plt.style.use('bmh')
plt.figure(figsize=(10, 5))
kde = gaussian_kde(df['tot_minutes'], bw_method=0.01)
x = np.linspace(120, 500, 400)
plt.plot(x, kde(x));
kde2 = gaussian_kde(df['tot_minutes'], bw_method=0.5)
plt.figure(figsize=(10, 5))
x = np.linspace(120, 500, 400)
plt.plot(x, kde2(x));
# Split the runners on the "M/F" column and fit a separate KDE of finish time
# (in minutes) for each group.
dfm = df[df["M/F"] == 'M']
dff = df[df["M/F"] == 'F']
kdem = gaussian_kde(dfm['tot_minutes'])
kdef = gaussian_kde(dff['tot_minutes'])
# Draw both density estimates on the same axes over 120-500 minutes.
plt.figure(figsize=(10, 5))
x = np.linspace(120, 500, 400)
plt.plot(x, kdem(x), label='M')
plt.plot(x, kdef(x), label='F')
plt.legend()
plt.show()
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(df, test_size=0.5, random_state=123)
train_df
Age | M/F | Country | 5K | 10K | 15K | 20K | Half | 25K | 30K | 35K | 40K | Finish | Pace | Overall | Gender | Division | tot_minutes | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1024 | 33 | M | USA | 00:18:33 | 00:37:18 | 00:56:26 | 01:15:51 | 01:20:12 | 01:35:51 | 01:56:38 | 02:18:35 | 02:40:36 | 02:50:47 | 00:06:31 | 1026 | 985 | 825 | 170.783333 |
14892 | 40 | F | USA | 00:26:44 | 00:53:34 | 01:20:44 | 01:47:42 | 01:53:30 | 02:14:47 | 02:42:09 | 03:09:04 | 03:34:16 | 03:45:06 | 00:08:36 | 14970 | 5196 | 914 | 225.100000 |
13216 | 55 | M | USA | 00:25:14 | 00:49:51 | 01:14:52 | 01:41:52 | 01:47:19 | 02:07:11 | 02:33:40 | 03:01:24 | 03:27:57 | 03:39:35 | 00:08:23 | 13277 | 9100 | 653 | 219.583333 |
10302 | 37 | M | USA | 00:26:17 | 00:50:37 | 01:15:09 | 01:40:43 | 01:46:17 | 02:07:12 | 02:33:12 | 02:57:48 | 03:21:09 | 03:30:35 | 00:08:02 | 10337 | 7833 | 3560 | 210.583333 |
13164 | 57 | M | USA | 00:25:18 | 00:49:50 | 01:14:45 | 01:39:49 | 01:45:22 | 02:05:06 | 02:31:41 | 02:59:42 | 03:27:39 | 03:39:26 | 00:08:23 | 13224 | 9080 | 647 | 219.433333 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
15377 | 26 | F | USA | 00:25:18 | 00:51:06 | 01:16:33 | 01:41:52 | 01:47:26 | 02:08:08 | 02:35:55 | 03:05:23 | 03:34:32 | 03:46:50 | 00:08:40 | 15463 | 5505 | 3578 | 226.833333 |
21602 | 60 | F | CAN | 00:31:03 | 01:01:33 | 01:30:58 | 02:00:14 | 02:06:34 | 02:30:29 | 03:02:19 | 03:33:13 | 04:05:01 | 04:19:26 | 00:09:54 | 21787 | 9327 | 114 | 259.433333 |
17730 | 43 | F | USA | 00:27:06 | 00:53:30 | 01:20:51 | 01:48:06 | 01:54:00 | 02:16:06 | 02:44:59 | 03:14:54 | 03:42:59 | 03:55:26 | 00:08:59 | 17852 | 6994 | 1297 | 235.433333 |
15725 | 47 | F | USA | 00:27:22 | 00:53:47 | 01:19:38 | 01:47:14 | 01:53:03 | 02:13:46 | 02:41:02 | 03:08:51 | 03:36:21 | 03:48:11 | 00:08:43 | 15818 | 5737 | 649 | 228.183333 |
19966 | 37 | F | USA | 00:28:36 | 00:55:46 | 01:24:31 | 01:52:27 | 01:58:58 | 02:23:39 | 02:55:00 | 03:25:49 | 03:54:46 | 04:07:24 | 00:09:27 | 20116 | 8325 | 4424 | 247.400000 |
13149 rows × 18 columns
test_df
Age | M/F | Country | 5K | 10K | 15K | 20K | Half | 25K | 30K | 35K | 40K | Finish | Pace | Overall | Gender | Division | tot_minutes | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
15955 | 47 | F | USA | 00:27:31 | 00:53:43 | 01:19:39 | 01:45:57 | 01:51:39 | 02:12:27 | 02:40:08 | 03:08:26 | 03:36:40 | 03:48:56 | 00:08:44 | 16054 | 5886 | 682 | 228.933333 |
1083 | 37 | M | NED | 00:18:30 | 00:37:13 | 00:56:19 | 01:15:54 | 01:20:10 | 01:35:53 | 01:57:06 | 02:20:18 | 02:42:10 | 02:51:28 | 00:06:33 | 1085 | 1039 | 870 | 171.466667 |
358 | 40 | M | CAN | 00:18:31 | 00:37:16 | 00:56:05 | 01:15:04 | 01:19:12 | 01:34:06 | 01:53:31 | 02:13:14 | 02:33:04 | 02:42:01 | 00:06:11 | 359 | 343 | 21 | 162.016667 |
5136 | 34 | F | USA | 00:20:19 | 00:41:00 | 01:01:57 | 01:23:14 | 01:27:55 | 01:44:44 | 02:07:58 | 02:34:10 | 03:00:20 | 03:12:22 | 00:07:21 | 5149 | 515 | 449 | 192.366667 |
16029 | 40 | F | CAN | 00:25:02 | 00:51:08 | 01:19:07 | 01:45:56 | 01:51:43 | 02:13:12 | 02:41:19 | 03:11:08 | 03:37:51 | 03:49:11 | 00:08:45 | 16128 | 5935 | 1083 | 229.183333 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
19657 | 48 | F | USA | 00:27:53 | 00:58:05 | 01:26:21 | 01:54:59 | 02:01:09 | 02:23:39 | 02:52:50 | 03:23:20 | 03:52:51 | 04:05:29 | 00:09:22 | 19804 | 8137 | 1247 | 245.483333 |
10744 | 42 | F | GBR | 00:25:47 | 00:50:38 | 01:15:34 | 01:40:12 | 01:45:37 | 02:05:56 | 02:30:53 | 02:56:18 | 03:21:03 | 03:31:57 | 00:08:06 | 10781 | 2736 | 349 | 211.950000 |
17175 | 41 | M | USA | 00:24:40 | 00:53:59 | 01:21:15 | 01:48:49 | 01:54:28 | 02:16:23 | 02:44:47 | 03:13:03 | 03:40:48 | 03:53:21 | 00:08:55 | 17291 | 10665 | 1671 | 233.350000 |
3806 | 27 | M | USA | 00:21:44 | 00:43:56 | 01:06:09 | 01:28:25 | 01:33:19 | 01:50:49 | 02:13:21 | 02:35:36 | 02:56:51 | 03:06:07 | 00:07:06 | 3814 | 3546 | 2458 | 186.116667 |
10433 | 24 | F | USA | 00:23:43 | 00:46:45 | 01:09:57 | 01:33:42 | 01:38:51 | 01:57:49 | 02:24:02 | 02:51:45 | 03:18:56 | 03:30:59 | 00:08:03 | 10469 | 2570 | 2027 | 210.983333 |
13149 rows × 18 columns
from sklearn.neighbors import KNeighborsClassifier
# Classify "M/F" from finish time alone, voting among the k nearest training rows.
k = 50
neigh = KNeighborsClassifier(n_neighbors=k)
# Double brackets keep the single feature as a 2D (DataFrame) input.
neigh.fit(train_df[['tot_minutes']], train_df['M/F'])
KNeighborsClassifier(n_neighbors=50)
knn_pred = neigh.predict(test_df[['tot_minutes']])
knn_pred
array(['F', 'M', 'M', ..., 'F', 'M', 'F'], dtype=object)
(knn_pred == test_df['M/F']).sum()/len(test_df)
0.6453722716556393
neigh.predict_proba([[200]])
/Users/bb/opt/anaconda3/lib/python3.8/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but KNeighborsClassifier was fitted with feature names warnings.warn(
array([[0.36, 0.64]])
neigh.classes_
array(['F', 'M'], dtype=object)
# Plot the predicted probability of the first class in neigh.classes_ as a
# function of finish time (120-360 minutes).
plt.figure(figsize=(10, 5))
x = np.linspace(120, 360, 400)
# reshape(-1, 1) turns the 1D grid into a single-feature column; [:, 0] keeps
# the probability column for neigh.classes_[0].
plt.plot(x, neigh.predict_proba(x.reshape(-1, 1))[:, 0])
# NOTE(review): the next line is a bare attribute reference, not a call; it
# draws nothing and only echoes the function object as the cell output.
plt.plot
/Users/bb/opt/anaconda3/lib/python3.8/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but KNeighborsClassifier was fitted with feature names warnings.warn(
<function matplotlib.pyplot.plot(*args, scalex=True, scaley=True, data=None, **kwargs)>
neigh.predict_proba(x.reshape(-1, 1))[:, 0]
/Users/bb/opt/anaconda3/lib/python3.8/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but KNeighborsClassifier was fitted with feature names warnings.warn(
array([0.1 , 0.1 , 0.1 , 0.1 , 0.1 , 0.1 , 0.1 , 0.1 , 0.1 , 0.1 , 0.1 , 0.1 , 0.1 , 0.1 , 0.1 , 0.1 , 0.1 , 0.1 , 0.1 , 0.1 , 0.1 , 0.1 , 0.1 , 0.1 , 0.1 , 0.1 , 0.1 , 0.1 , 0.1 , 0.1 , 0.1 , 0.1 , 0.1 , 0.1 , 0.1 , 0.1 , 0.1 , 0.1 , 0.1 , 0.1 , 0.1 , 0.1 , 0.1 , 0.1 , 0.1 , 0.1 , 0.1 , 0.1 , 0.06, 0.04, 0.04, 0.02, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0.02, 0.02, 0.02, 0.02, 0.02, 0. , 0. , 0.04, 0.04, 0.04, 0.02, 0. , 0. , 0. , 0.04, 0.08, 0.06, 0.08, 0.12, 0.08, 0.08, 0.06, 0. , 0.04, 0.08, 0.06, 0.14, 0.08, 0.02, 0.12, 0.06, 0.06, 0.1 , 0.1 , 0.08, 0.14, 0.1 , 0.06, 0.06, 0.06, 0.1 , 0.1 , 0.14, 0.2 , 0.1 , 0.12, 0.1 , 0.18, 0.22, 0.14, 0.18, 0.32, 0.22, 0.18, 0.24, 0.28, 0.24, 0.14, 0.38, 0.36, 0.32, 0.2 , 0.22, 0.28, 0.26, 0.28, 0.3 , 0.36, 0.34, 0.38, 0.34, 0.32, 0.46, 0.26, 0.44, 0.44, 0.48, 0.54, 0.44, 0.56, 0.42, 0.52, 0.44, 0.5 , 0.52, 0.58, 0.54, 0.7 , 0.76, 0.68, 0.68, 0.56, 0.62, 0.56, 0.52, 0.58, 0.54, 0.58, 0.52, 0.52, 0.52, 0.58, 0.54, 0.68, 0.7 , 0.62, 0.54, 0.6 , 0.58, 0.66, 0.74, 0.66, 0.68, 0.5 , 0.76, 0.68, 0.6 , 0.62, 0.54, 0.44, 0.66, 0.48, 0.6 , 0.7 , 0.64, 0.56, 0.62, 0.58, 0.56, 0.52, 0.38, 0.56, 0.58, 0.56, 0.56, 0.68, 0.54, 0.62, 0.6 , 0.62, 0.64, 0.58, 0.58, 0.74, 0.56, 0.72, 0.5 , 0.68, 0.52, 0.7 , 0.6 , 0.62, 0.62, 0.58, 0.64, 0.74, 0.64, 0.52, 0.48, 0.56, 0.5 , 0.6 , 0.56, 0.52, 0.5 , 0.66, 0.58, 0.54, 0.52, 0.7 , 0.56, 0.5 , 0.64, 0.66, 0.58, 0.58, 0.56, 0.48, 0.42, 0.36, 0.52, 0.58, 0.52, 0.5 , 0.46, 0.5 , 0.58, 0.62, 0.6 , 0.58, 0.6 , 0.68, 0.64, 0.6 , 0.54, 0.38, 0.54, 0.66, 0.62, 0.46, 0.46, 0.4 , 0.44, 0.56, 0.54, 0.54, 0.46, 0.58, 0.6 , 0.56, 0.52, 0.54, 0.58, 0.56, 0.6 , 0.56, 0.62, 0.6 , 0.6 , 0.6 , 0.6 , 0.58, 0.58, 0.52, 0.62, 0.62, 0.5 , 0.5 , 0.5 , 0.58, 0.62, 0.64, 0.64, 0.6 , 0.58, 0.62, 0.62, 0.68, 0.62, 0.6 , 0.64, 0.54, 0.5 , 0.52, 0.48, 0.42, 0.48, 0.42, 0.38, 0.42, 0.42, 0.52, 0.58, 0.64, 0.66, 0.6 , 0.64, 0.58, 0.54, 0.62, 0.66, 0.54, 0.58, 0.64, 0.58, 0.52, 0.5 , 0.48, 0.46, 
0.44, 0.46, 0.56, 0.6 , 0.64, 0.66, 0.64, 0.62, 0.62, 0.6 , 0.58, 0.58, 0.54, 0.52, 0.48, 0.5 , 0.5 , 0.5 , 0.5 , 0.5 , 0.58, 0.6 , 0.62, 0.64, 0.64, 0.66, 0.66, 0.7 , 0.64, 0.62, 0.62, 0.62, 0.62, 0.66, 0.68, 0.72, 0.68, 0.66, 0.72, 0.74, 0.74, 0.74, 0.7 , 0.68, 0.68, 0.68, 0.72, 0.72, 0.7 , 0.68, 0.64, 0.58, 0.58, 0.58, 0.56, 0.5 , 0.44, 0.42, 0.42, 0.46, 0.42, 0.44, 0.46, 0.46, 0.46])
train_df
Age | M/F | Country | 5K | 10K | 15K | 20K | Half | 25K | 30K | 35K | 40K | Finish | Pace | Overall | Gender | Division | tot_minutes | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1024 | 33 | M | USA | 00:18:33 | 00:37:18 | 00:56:26 | 01:15:51 | 01:20:12 | 01:35:51 | 01:56:38 | 02:18:35 | 02:40:36 | 02:50:47 | 00:06:31 | 1026 | 985 | 825 | 170.783333 |
14892 | 40 | F | USA | 00:26:44 | 00:53:34 | 01:20:44 | 01:47:42 | 01:53:30 | 02:14:47 | 02:42:09 | 03:09:04 | 03:34:16 | 03:45:06 | 00:08:36 | 14970 | 5196 | 914 | 225.100000 |
13216 | 55 | M | USA | 00:25:14 | 00:49:51 | 01:14:52 | 01:41:52 | 01:47:19 | 02:07:11 | 02:33:40 | 03:01:24 | 03:27:57 | 03:39:35 | 00:08:23 | 13277 | 9100 | 653 | 219.583333 |
10302 | 37 | M | USA | 00:26:17 | 00:50:37 | 01:15:09 | 01:40:43 | 01:46:17 | 02:07:12 | 02:33:12 | 02:57:48 | 03:21:09 | 03:30:35 | 00:08:02 | 10337 | 7833 | 3560 | 210.583333 |
13164 | 57 | M | USA | 00:25:18 | 00:49:50 | 01:14:45 | 01:39:49 | 01:45:22 | 02:05:06 | 02:31:41 | 02:59:42 | 03:27:39 | 03:39:26 | 00:08:23 | 13224 | 9080 | 647 | 219.433333 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
15377 | 26 | F | USA | 00:25:18 | 00:51:06 | 01:16:33 | 01:41:52 | 01:47:26 | 02:08:08 | 02:35:55 | 03:05:23 | 03:34:32 | 03:46:50 | 00:08:40 | 15463 | 5505 | 3578 | 226.833333 |
21602 | 60 | F | CAN | 00:31:03 | 01:01:33 | 01:30:58 | 02:00:14 | 02:06:34 | 02:30:29 | 03:02:19 | 03:33:13 | 04:05:01 | 04:19:26 | 00:09:54 | 21787 | 9327 | 114 | 259.433333 |
17730 | 43 | F | USA | 00:27:06 | 00:53:30 | 01:20:51 | 01:48:06 | 01:54:00 | 02:16:06 | 02:44:59 | 03:14:54 | 03:42:59 | 03:55:26 | 00:08:59 | 17852 | 6994 | 1297 | 235.433333 |
15725 | 47 | F | USA | 00:27:22 | 00:53:47 | 01:19:38 | 01:47:14 | 01:53:03 | 02:13:46 | 02:41:02 | 03:08:51 | 03:36:21 | 03:48:11 | 00:08:43 | 15818 | 5737 | 649 | 228.183333 |
19966 | 37 | F | USA | 00:28:36 | 00:55:46 | 01:24:31 | 01:52:27 | 01:58:58 | 02:23:39 | 02:55:00 | 03:25:49 | 03:54:46 | 04:07:24 | 00:09:27 | 20116 | 8325 | 4424 | 247.400000 |
13149 rows × 18 columns
test_df
Age | M/F | Country | 5K | 10K | 15K | 20K | Half | 25K | 30K | 35K | 40K | Finish | Pace | Overall | Gender | Division | tot_minutes | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
15955 | 47 | F | USA | 00:27:31 | 00:53:43 | 01:19:39 | 01:45:57 | 01:51:39 | 02:12:27 | 02:40:08 | 03:08:26 | 03:36:40 | 03:48:56 | 00:08:44 | 16054 | 5886 | 682 | 228.933333 |
1083 | 37 | M | NED | 00:18:30 | 00:37:13 | 00:56:19 | 01:15:54 | 01:20:10 | 01:35:53 | 01:57:06 | 02:20:18 | 02:42:10 | 02:51:28 | 00:06:33 | 1085 | 1039 | 870 | 171.466667 |
358 | 40 | M | CAN | 00:18:31 | 00:37:16 | 00:56:05 | 01:15:04 | 01:19:12 | 01:34:06 | 01:53:31 | 02:13:14 | 02:33:04 | 02:42:01 | 00:06:11 | 359 | 343 | 21 | 162.016667 |
5136 | 34 | F | USA | 00:20:19 | 00:41:00 | 01:01:57 | 01:23:14 | 01:27:55 | 01:44:44 | 02:07:58 | 02:34:10 | 03:00:20 | 03:12:22 | 00:07:21 | 5149 | 515 | 449 | 192.366667 |
16029 | 40 | F | CAN | 00:25:02 | 00:51:08 | 01:19:07 | 01:45:56 | 01:51:43 | 02:13:12 | 02:41:19 | 03:11:08 | 03:37:51 | 03:49:11 | 00:08:45 | 16128 | 5935 | 1083 | 229.183333 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
19657 | 48 | F | USA | 00:27:53 | 00:58:05 | 01:26:21 | 01:54:59 | 02:01:09 | 02:23:39 | 02:52:50 | 03:23:20 | 03:52:51 | 04:05:29 | 00:09:22 | 19804 | 8137 | 1247 | 245.483333 |
10744 | 42 | F | GBR | 00:25:47 | 00:50:38 | 01:15:34 | 01:40:12 | 01:45:37 | 02:05:56 | 02:30:53 | 02:56:18 | 03:21:03 | 03:31:57 | 00:08:06 | 10781 | 2736 | 349 | 211.950000 |
17175 | 41 | M | USA | 00:24:40 | 00:53:59 | 01:21:15 | 01:48:49 | 01:54:28 | 02:16:23 | 02:44:47 | 03:13:03 | 03:40:48 | 03:53:21 | 00:08:55 | 17291 | 10665 | 1671 | 233.350000 |
3806 | 27 | M | USA | 00:21:44 | 00:43:56 | 01:06:09 | 01:28:25 | 01:33:19 | 01:50:49 | 02:13:21 | 02:35:36 | 02:56:51 | 03:06:07 | 00:07:06 | 3814 | 3546 | 2458 | 186.116667 |
10433 | 24 | F | USA | 00:23:43 | 00:46:45 | 01:09:57 | 01:33:42 | 01:38:51 | 01:57:49 | 02:24:02 | 02:51:45 | 03:18:56 | 03:30:59 | 00:08:03 | 10469 | 2570 | 2027 | 210.983333 |
13149 rows × 18 columns
kde = gaussian_kde(train_df['tot_minutes'], bw_method = 0.3)
plt.figure(figsize=(10, 5))
x = np.linspace(120, 500, 400)
plt.plot(x, kde(x));
kde(180)
array([0.00662687])
likelihood = kde(test_df['tot_minutes']).prod()
likelihood
0.0
0.5**2000
0.0
log_likelihood = np.log10(kde(test_df['tot_minutes'])).sum()
log_likelihood
-28746.170695622182
# Grid search over the KDE bandwidth: fit on the training half and score each
# candidate by the base-10 log-likelihood it assigns to the held-out test half.
choices = []
for bw in np.linspace(0.01, 0.3, 20):
    kde = gaussian_kde(train_df['tot_minutes'], bw_method=bw)
    # Sum the logs of the densities; multiplying the raw densities underflows
    # to 0.0 for a sample of this size.
    log_likelihood = np.log10(kde(test_df['tot_minutes'])).sum()
    print(bw, log_likelihood)
    choices.append((bw, log_likelihood))
0.01 -28817.917595202038 0.02526315789473684 -28709.48265025039 0.04052631578947368 -28696.36462240536 0.05578947368421053 -28692.098901252437 0.07105263157894737 -28690.45313426285 0.0863157894736842 -28690.196250451263 0.10157894736842105 -28690.79905684394 0.11684210526315789 -28691.982108391796 0.13210526315789473 -28693.63706897139 0.1473684210526316 -28695.755302217007 0.16263157894736843 -28698.366808507242 0.17789473684210527 -28701.504011156496 0.1931578947368421 -28705.187108819886 0.20842105263157895 -28709.422653258458 0.2236842105263158 -28714.207903649116 0.23894736842105263 -28719.53606510508 0.25421052631578944 -28725.400168116994 0.2694736842105263 -28731.79516403717 0.2847368421052632 -28738.718637586047 0.3 -28746.170695622182
choices
[(0.01, -28817.917595202038), (0.02526315789473684, -28709.48265025039), (0.04052631578947368, -28696.36462240536), (0.05578947368421053, -28692.098901252437), (0.07105263157894737, -28690.45313426285), (0.0863157894736842, -28690.196250451263), (0.10157894736842105, -28690.79905684394), (0.11684210526315789, -28691.982108391796), (0.13210526315789473, -28693.63706897139), (0.1473684210526316, -28695.755302217007), (0.16263157894736843, -28698.366808507242), (0.17789473684210527, -28701.504011156496), (0.1931578947368421, -28705.187108819886), (0.20842105263157895, -28709.422653258458), (0.2236842105263158, -28714.207903649116), (0.23894736842105263, -28719.53606510508), (0.25421052631578944, -28725.400168116994), (0.2694736842105263, -28731.79516403717), (0.2847368421052632, -28738.718637586047), (0.3, -28746.170695622182)]
# Take the bandwidth with the highest held-out log-likelihood from `choices`
# (a list of (bandwidth, log_likelihood) pairs) instead of copying the value
# out of the printed results by hand.
bw = max(choices, key=lambda choice: choice[1])[0]
kde = gaussian_kde(train_df['tot_minutes'], bw_method=bw)
# Plot the resulting density estimate over 120-500 minutes.
plt.figure(figsize=(10, 5))
x = np.linspace(120, 500, 400)
plt.plot(x, kde(x));
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
df = pd.read_csv("https://www.mth548.org/_static/kde_marathon_results/marathon_results.csv")
df["tot_minutes"] = pd.to_timedelta(df["Finish"]).dt.total_seconds()/60
df
Age | M/F | Country | 5K | 10K | 15K | 20K | Half | 25K | 30K | 35K | 40K | Finish | Pace | Overall | Gender | Division | tot_minutes | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 25 | M | ETH | 00:14:43 | 00:29:43 | 00:44:57 | 01:00:29 | 01:04:02 | 01:16:07 | 01:32:00 | 01:47:59 | 02:02:39 | 02:09:17 | 00:04:56 | 1 | 1 | 1 | 129.283333 |
1 | 30 | M | ETH | 00:14:43 | 00:29:43 | 00:44:58 | 01:00:28 | 01:04:01 | 01:16:07 | 01:31:59 | 01:47:59 | 02:02:42 | 02:09:48 | 00:04:58 | 2 | 2 | 2 | 129.800000 |
2 | 29 | M | KEN | 00:14:43 | 00:29:43 | 00:44:57 | 01:00:29 | 01:04:02 | 01:16:07 | 01:32:00 | 01:47:59 | 02:03:01 | 02:10:22 | 00:04:59 | 3 | 3 | 3 | 130.366667 |
3 | 28 | M | KEN | 00:14:43 | 00:29:44 | 00:45:01 | 01:00:29 | 01:04:02 | 01:16:07 | 01:32:00 | 01:48:03 | 02:03:47 | 02:10:47 | 00:05:00 | 4 | 4 | 4 | 130.783333 |
4 | 32 | M | KEN | 00:14:43 | 00:29:44 | 00:44:58 | 01:00:28 | 01:04:01 | 01:16:07 | 01:32:00 | 01:47:59 | 02:03:27 | 02:10:49 | 00:05:00 | 5 | 5 | 5 | 130.816667 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
26293 | 64 | F | USA | 00:50:15 | 01:43:31 | 02:36:53 | 03:32:26 | 03:43:46 | 04:25:53 | 05:19:44 | 06:17:19 | 07:13:34 | 07:38:56 | 00:17:31 | 26594 | 12015 | 269 | 458.933333 |
26294 | 61 | F | USA | 00:48:36 | 01:39:39 | 02:39:13 | 03:35:58 | 03:47:55 | 04:32:44 | 05:31:58 | 06:28:56 | 07:26:19 | 07:51:30 | 00:17:59 | 26595 | 12016 | 270 | 471.500000 |
26295 | 66 | F | USA | 00:53:03 | 01:47:16 | 02:41:45 | 03:37:07 | 03:48:21 | 04:33:51 | 05:38:56 | 06:38:51 | 07:36:18 | 07:59:33 | 00:18:18 | 26596 | 12017 | 91 | 479.550000 |
26296 | 53 | M | USA | 00:49:04 | 01:40:12 | 02:33:31 | 03:31:41 | 03:43:35 | 04:29:20 | 05:31:11 | 06:33:35 | 07:35:38 | 08:00:37 | 00:18:20 | 26597 | 14580 | 2055 | 480.616667 |
26297 | 62 | M | USA | 00:40:14 | 01:28:18 | 02:26:46 | 03:28:41 | 03:40:36 | 04:36:06 | 05:43:44 | 06:51:31 | 07:41:28 | 08:06:01 | 00:18:33 | 26598 | 14581 | 898 | 486.016667 |
26298 rows × 18 columns
def f(x, y):
    """Evaluate ``(x**2 + y - 11)**2 + (x + y**2 - 7)**2 - 150``.

    This is Himmelblau's function shifted down by 150. Only arithmetic
    operators are used, so ``x`` and ``y`` may be scalars or NumPy arrays
    of matching/broadcastable shape (e.g. the grids from ``np.meshgrid``).

    Parameters
    ----------
    x, y : scalar or array-like supporting arithmetic
        Coordinates at which to evaluate the function.

    Returns
    -------
    Same kind as the inputs: the function value at each ``(x, y)``.
    """
    return (x**2 + y - 11)**2 + (x + y**2 - 7)**2 - 150
f(1, 2)
-82
x = np.linspace(0, 5, 6)
y = np.linspace(0, 5, 6)
x
array([0., 1., 2., 3., 4., 5.])
X, Y = np.meshgrid(x, y)
X, Y
(array([[0., 1., 2., 3., 4., 5.], [0., 1., 2., 3., 4., 5.], [0., 1., 2., 3., 4., 5.], [0., 1., 2., 3., 4., 5.], [0., 1., 2., 3., 4., 5.], [0., 1., 2., 3., 4., 5.]]), array([[0., 0., 0., 0., 0., 0.], [1., 1., 1., 1., 1., 1.], [2., 2., 2., 2., 2., 2.], [3., 3., 3., 3., 3., 3.], [4., 4., 4., 4., 4., 4.], [5., 5., 5., 5., 5., 5.]]))
plt.plot(X, Y, 'r.');
Z = f(X, Y)
Z
array([[ 20., -14., -76., -130., -116., 50.], [ -14., -44., -98., -140., -110., 76.], [ -60., -82., -124., -150., -100., 110.], [ -82., -92., -118., -124., -50., 188.], [ -20., -14., -20., -2., 100., 370.], [ 210., 236., 254., 300., 434., 740.]])
import plotly.graph_objects as go
fig = go.Figure(go.Surface(x=X, y=Y, z=Z))
fig.show()
x = np.linspace(-5, 5, 400)
y = np.linspace(-5, 5, 400)
X, Y = np.meshgrid(x, y)
Z = f(X, Y)
fig = go.Figure(go.Surface(x=X, y=Y, z=Z, colorscale="Picnic"))
fig.show()
# The figure size is set with plt.figure(figsize=...). The previous
# plt.plot(figsize=...) calls plotted nothing and the figsize keyword was
# silently ignored, so the figures kept their default size.

# Black contour lines only.
plt.figure(figsize=(6, 6))
plt.contour(X, Y, Z, levels=20, colors='k');
# Contour lines coloured by level.
plt.figure(figsize=(6, 6))
plt.contour(X, Y, Z, levels=20, cmap='jet');
# Filled contours.
plt.figure(figsize=(6, 6))
plt.contourf(X, Y, Z, levels=20, cmap='jet');
# Filled contours with black contour lines drawn on top.
plt.figure(figsize=(6, 6))
plt.contourf(X, Y, Z, levels=20, cmap='jet')
plt.contour(X, Y, Z, levels=20, colors='k');
plt.figure(figsize=(10, 6))
plt.plot(df['tot_minutes'], df['Age'], 'r.', alpha=0.2);
data = np.array([[0, 0],
[1, 0],
[0, 1],
[1, 1]
])
data
array([[0, 0], [1, 0], [0, 1], [1, 1]])
data.T
array([[0, 1, 0, 1], [0, 0, 1, 1]])
from scipy.stats import gaussian_kde
kde = gaussian_kde(data.T, bw_method=0.7)
kde([0.5, 0.7])
array([0.22263723])
# Evaluate the 2D KDE on a 400 x 400 grid covering [-1, 2] in both coordinates.
x = np.linspace(-1, 2, 400)
y = np.linspace(-1, 2, 400)
X, Y = np.meshgrid(x, y)
# Flatten the grid into one row of x-coordinates and one row of y-coordinates
# (the same rows-are-dimensions layout as data.T), evaluate the KDE at every
# point, then fold the values back into the grid shape for plotting.
Z = kde([X.reshape(-1), Y.reshape(-1)]).reshape(X.shape)
Z
array([[0.00053435, 0.00055943, 0.00058549, ..., 0.00058549, 0.00055943, 0.00053435], [0.00055943, 0.00058569, 0.00061296, ..., 0.00061296, 0.00058569, 0.00055943], [0.00058549, 0.00061296, 0.00064151, ..., 0.00064151, 0.00061296, 0.00058549], ..., [0.00058549, 0.00061296, 0.00064151, ..., 0.00064151, 0.00061296, 0.00058549], [0.00055943, 0.00058569, 0.00061296, ..., 0.00061296, 0.00058569, 0.00055943], [0.00053435, 0.00055943, 0.00058549, ..., 0.00058549, 0.00055943, 0.00053435]])
fig = go.Figure(go.Surface(x=X, y=Y, z=Z))
fig.show()
plt.figure(figsize=(6, 6))
plt.contourf(X, Y, Z, levels=10, cmap="Reds")
plt.contour(X, Y, Z, levels=10, colors='k')
plt.show()
bw_method = 0.2
kde = gaussian_kde(df[["tot_minutes", "Age"]].T, bw_method=bw_method)
df[["tot_minutes", "Age"]].T
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 26288 | 26289 | 26290 | 26291 | 26292 | 26293 | 26294 | 26295 | 26296 | 26297 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
tot_minutes | 129.283333 | 129.8 | 130.366667 | 130.783333 | 130.816667 | 130.866667 | 131.333333 | 132.7 | 133.583333 | 133.866667 | ... | 402.016667 | 404.466667 | 416.783333 | 417.833333 | 420.5 | 458.933333 | 471.5 | 479.55 | 480.616667 | 486.016667 |
Age | 25.000000 | 30.0 | 29.000000 | 28.000000 | 32.000000 | 30.000000 | 32.000000 | 39.0 | 27.000000 | 33.000000 | ... | 47.000000 | 46.000000 | 71.000000 | 57.000000 | 37.0 | 64.000000 | 61.0 | 66.00 | 53.000000 | 62.000000 |
2 rows × 26298 columns
kde([180, 30])
array([0.00024693])
kde.integrate_box([180, 30], [240, 40])
0.15874527277356906
# Evaluate the (tot_minutes, Age) KDE on a 100 x 100 grid: x is the finish
# time axis (120-360), y is the age axis (10-70), matching the row order of
# the data the KDE was fitted on.
x = np.linspace(120, 360, 100)
y = np.linspace(10, 70, 100)
X, Y = np.meshgrid(x, y)
# Flatten the grid to two coordinate rows for the KDE, then restore the grid shape.
Z = kde([X.reshape(-1), Y.reshape(-1)]).reshape(X.shape)
plt.figure(figsize=(10, 6))
# Filled density levels with black contour lines drawn on top.
plt.contourf(X, Y, Z, levels=10, cmap='Reds')
plt.contour(X, Y, Z, levels=10, colors='k')
<matplotlib.contour.QuadContourSet at 0x7fa9f65e02e0>
import seaborn as sns
df = sns.load_dataset('tips')
df
total_bill | tip | sex | smoker | day | time | size | |
---|---|---|---|---|---|---|---|
0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
... | ... | ... | ... | ... | ... | ... | ... |
239 | 29.03 | 5.92 | Male | No | Sat | Dinner | 3 |
240 | 27.18 | 2.00 | Female | Yes | Sat | Dinner | 2 |
241 | 22.67 | 2.00 | Male | Yes | Sat | Dinner | 2 |
242 | 17.82 | 1.75 | Male | No | Sat | Dinner | 2 |
243 | 18.78 | 3.00 | Female | No | Thur | Dinner | 2 |
244 rows × 7 columns
from ipywidgets import interact, fixed
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
sns.set_context("notebook")
df = sns.load_dataset('tips')
def tip_plot(frac):
    """Scatter tip against total bill and overlay a fixed-rate tip line.

    Reads the module-level ``df`` (columns ``total_bill`` and ``tip``) and
    shows the figure; nothing is returned.

    Parameters
    ----------
    frac : int or float
        Tip rate in percent, e.g. ``15`` draws the line
        ``tip = 0.15 * total_bill``.
    """
    frac = frac/100  # percent -> fraction
    plt.figure(figsize=(12, 7))
    sns.scatterplot(data=df, x="total_bill", y="tip", marker='o')
    # Reference line: the tip implied by a constant rate at each bill amount.
    x = np.arange(0, 55)
    plt.plot(x, frac*x, c='b', label=f"{frac:.0%} tip")
    plt.ylim(0, 11)
    plt.title("Total bill vs tip amount")
    plt.legend()
    plt.show()
interact(tip_plot, frac=(10, 20));
interactive(children=(IntSlider(value=15, description='frac', max=20, min=10), Output()), _dom_classes=('widge…
df['tip_fraction'] = df['tip']/df['total_bill']
df
total_bill | tip | sex | smoker | day | time | size | tip_fraction | |
---|---|---|---|---|---|---|---|---|
0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 | 0.059447 |
1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 | 0.160542 |
2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 | 0.166587 |
3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 | 0.139780 |
4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 | 0.146808 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
239 | 29.03 | 5.92 | Male | No | Sat | Dinner | 3 | 0.203927 |
240 | 27.18 | 2.00 | Female | Yes | Sat | Dinner | 2 | 0.073584 |
241 | 22.67 | 2.00 | Male | Yes | Sat | Dinner | 2 | 0.088222 |
242 | 17.82 | 1.75 | Male | No | Sat | Dinner | 2 | 0.098204 |
243 | 18.78 | 3.00 | Female | No | Thur | Dinner | 2 | 0.159744 |
244 rows × 8 columns
mean_tip = df['tip_fraction'].mean()
mean_tip
0.16080258172250478
df['naive_tip_prediction'] = mean_tip*df['total_bill']
df
total_bill | tip | sex | smoker | day | time | size | tip_fraction | naive_tip_prediction | |
---|---|---|---|---|---|---|---|---|---|
0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 | 0.059447 | 2.732036 |
1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 | 0.160542 | 1.662699 |
2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 | 0.166587 | 3.378462 |
3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 | 0.139780 | 3.807805 |
4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 | 0.146808 | 3.954135 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
239 | 29.03 | 5.92 | Male | No | Sat | Dinner | 3 | 0.203927 | 4.668099 |
240 | 27.18 | 2.00 | Female | Yes | Sat | Dinner | 2 | 0.073584 | 4.370614 |
241 | 22.67 | 2.00 | Male | Yes | Sat | Dinner | 2 | 0.088222 | 3.645395 |
242 | 17.82 | 1.75 | Male | No | Sat | Dinner | 2 | 0.098204 | 2.865502 |
243 | 18.78 | 3.00 | Female | No | Thur | Dinner | 2 | 0.159744 | 3.019872 |
244 rows × 9 columns
df['naive_prediction_error'] = df['naive_tip_prediction'] - df['tip']
df
total_bill | tip | sex | smoker | day | time | size | tip_fraction | naive_tip_prediction | naive_prediction_error | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 | 0.059447 | 2.732036 | 1.722036 |
1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 | 0.160542 | 1.662699 | 0.002699 |
2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 | 0.166587 | 3.378462 | -0.121538 |
3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 | 0.139780 | 3.807805 | 0.497805 |
4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 | 0.146808 | 3.954135 | 0.344135 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
239 | 29.03 | 5.92 | Male | No | Sat | Dinner | 3 | 0.203927 | 4.668099 | -1.251901 |
240 | 27.18 | 2.00 | Female | Yes | Sat | Dinner | 2 | 0.073584 | 4.370614 | 2.370614 |
241 | 22.67 | 2.00 | Male | Yes | Sat | Dinner | 2 | 0.088222 | 3.645395 | 1.645395 |
242 | 17.82 | 1.75 | Male | No | Sat | Dinner | 2 | 0.098204 | 2.865502 | 1.115502 |
243 | 18.78 | 3.00 | Female | No | Thur | Dinner | 2 | 0.159744 | 3.019872 | 0.019872 |
244 rows × 10 columns
df['naive_prediction_error'].mean()
0.18335196705924947
np.abs(df['naive_prediction_error']).mean()
0.7968739185842583
np.abs(df['naive_prediction_error']).describe()
count 244.000000 mean 0.796874 std 0.826432 min 0.002699 25% 0.234597 50% 0.509510 75% 1.068028 max 4.623554 Name: naive_prediction_error, dtype: float64
Cost function for a few values of a and b:
def cost(a, b, data=None):
    """Return the sum of squared errors of the line ``tip = a*total_bill + b``.

    Parameters
    ----------
    a : float
        Slope applied to ``total_bill``.
    b : float
        Intercept added to the prediction.
    data : DataFrame-like, optional
        Object with ``'total_bill'`` and ``'tip'`` columns. When omitted, the
        module-level ``df`` is used, which is the original behavior.

    Returns
    -------
    The sum over all rows of ``(a*total_bill + b - tip)**2``.
    """
    if data is None:
        data = df
    prediction = a*data['total_bill'] + b
    return ((prediction - data['tip'])**2).sum()
cost(0.16, 0)
317.83711808
cost(0.2, 0)
650.866652
cost(0.14, 0.5)
294.36668427999996