Commit 979a88c36e8e832d4a38b058c355919d741dc010
1 parent
aa77698374
Exists in
main
picking up the input/output data values
Showing 6 changed files with 270 additions and 68 deletions Side-by-side Diff
python-notebook/__pycache__/constants.cpython-37.pyc
View file @
979a88c
python-notebook/__pycache__/tools.cpython-37.pyc
View file @
979a88c
python-notebook/constants.py
View file @
979a88c
# minimum number of days of data a user must have to be kept
# (presumably used to drop users outside the usable data range — confirm in data_loading)
THRESHOLD_OF_DAYS_PER_USER = 10

# minimum run of consecutive active minutes that counts as a walk
MINIMUM_NUMBER_OF_MINUTES_FOR_A_WALK = 5

# minimum steps per minute for a minute to count as active
MINIMUM_STEPS_PER_MINUTE = 60

# number of weeks of history used when looking back for model inputs
NUMBER_OF_WEEKS_FOR_LOOKING_BACK = 5
python-notebook/data_loading.ipynb
View file @
979a88c
... | ... | @@ -9,7 +9,7 @@ |
9 | 9 | }, |
10 | 10 | { |
11 | 11 | "cell_type": "code", |
12 | - "execution_count": 142, | |
12 | + "execution_count": 161, | |
13 | 13 | "metadata": {}, |
14 | 14 | "outputs": [], |
15 | 15 | "source": [ |
... | ... | @@ -19,9 +19,8 @@ |
19 | 19 | "from pandas import read_csv\n", |
20 | 20 | "import pandas as pd\n", |
21 | 21 | "import os\n", |
22 | - "from datetime import datetime, date, timedelta\n", | |
23 | - "from itertools import product\n", | |
24 | - "# %load_ext line_profiler" | |
22 | + "from tools import *\n", | |
23 | + "from constants import *" | |
25 | 24 | ] |
26 | 25 | }, |
27 | 26 | { |
28 | 27 | |
... | ... | @@ -33,65 +32,12 @@ |
33 | 32 | }, |
34 | 33 | { |
35 | 34 | "cell_type": "code", |
36 | - "execution_count": 143, | |
35 | + "execution_count": 162, | |
37 | 36 | "metadata": {}, |
38 | 37 | "outputs": [], |
39 | 38 | "source": [ |
40 | 39 | "# to use unlimited memory for large dataframes\n", |
41 | - "pd.options.mode.chained_assignment = None\n", | |
42 | - "\n", | |
43 | - "# convert a datetime object to a date object\n", | |
44 | - "def get_date(x):\n", | |
45 | - " return date(x.year, x.month, x.day)\n", | |
46 | - "\n", | |
47 | - "# convert a datetime object to an integer, which denotes the number of minutes since midnight\n", | |
48 | - "def get_minute_index(x):\n", | |
49 | - " return (x.hour * 60) + x.minute\n", | |
50 | - "\n", | |
51 | - "# return a range of dates\n", | |
52 | - "def date_range(start_date, end_date):\n", | |
53 | - " delta = end_date - start_date\n", | |
54 | - "\n", | |
55 | - " for i in range(delta.days + 1):\n", | |
56 | - " yield start_date + timedelta(days=i)\n", | |
57 | - "\n", | |
58 | - "# define an iterative walk calculation (merging consecutive active minutes)\n", | |
59 | - "def calculate_walk(cv):\n", | |
60 | - " nv = cv.copy(deep=True)\n", | |
61 | - " nv[\"prev_minute_index\"] = nv[\"local_minute_index\"] - 1\n", | |
62 | - "\n", | |
63 | - " # move midnight minutes to previous day\n", | |
64 | - " nv[nv[\"prev_minute_index\"] < 0][\"local_date\"] -= timedelta(days=1)\n", | |
65 | - " nv[nv[\"prev_minute_index\"] < 0][\"prev_minute_index\"] = 1439\n", | |
66 | - " \n", | |
67 | - " nv = nv[[\"user\", \"local_date\", \"prev_minute_index\"]]\n", | |
68 | - " jv = cv.merge(nv, left_on=[\"user\", \"local_date\", \"local_minute_index\"], right_on=[\"user\", \"local_date\", \"prev_minute_index\"], how=\"inner\")\n", | |
69 | - " jv[\"add_count\"] += 1\n", | |
70 | - " jv = jv[[\"user\", \"local_date\", \"local_minute_index\", \"add_count\"]]\n", | |
71 | - "\n", | |
72 | - " return jv \n", | |
73 | - "\n", | |
74 | - "# generate complete product of vectors\n", | |
75 | - "def product_df(mat1, mat2):\n", | |
76 | - " mat1 = mat1.drop_duplicates()\n", | |
77 | - " mat2 = mat2.drop_duplicates()\n", | |
78 | - "\n", | |
79 | - " temp = pd.DataFrame(list(product(mat1.values, mat2.values)))\n", | |
80 | - " for i, acol in enumerate(mat1.columns):\n", | |
81 | - " temp[acol] = temp[0].apply(lambda x: x[i])\n", | |
82 | - " for i, acol in enumerate(mat2.columns):\n", | |
83 | - " temp[acol] = temp[1].apply(lambda x: x[i])\n", | |
84 | - " temp = temp.drop(columns=[0, 1])\n", | |
85 | - " return temp\n", | |
86 | - "\n", | |
87 | - "# cut off values that are not in the range of the data\n", | |
88 | - "THRESHOLD_OF_DAYS_PER_USER = 10\n", | |
89 | - "\n", | |
90 | - "# cut off values for the number of consecutive minutes for a walk\n", | |
91 | - "MINIMUM_NUMBER_OF_MINUTES_FOR_A_WALK = 5\n", | |
92 | - "\n", | |
93 | - "# cut off values for the number of steps per minute for an active minute\n", | |
94 | - "MINIMUM_STEPS_PER_MINUTE = 60\n" | |
40 | + "pd.options.mode.chained_assignment = None" | |
95 | 41 | ] |
96 | 42 | }, |
97 | 43 | { |
... | ... | @@ -103,7 +49,7 @@ |
103 | 49 | }, |
104 | 50 | { |
105 | 51 | "cell_type": "code", |
106 | - "execution_count": 144, | |
52 | + "execution_count": 163, | |
107 | 53 | "metadata": {}, |
108 | 54 | "outputs": [], |
109 | 55 | "source": [ |
... | ... | @@ -124,7 +70,7 @@ |
124 | 70 | }, |
125 | 71 | { |
126 | 72 | "cell_type": "code", |
127 | - "execution_count": 145, | |
73 | + "execution_count": 164, | |
128 | 74 | "metadata": {}, |
129 | 75 | "outputs": [], |
130 | 76 | "source": [ |
... | ... | @@ -173,7 +119,7 @@ |
173 | 119 | }, |
174 | 120 | { |
175 | 121 | "cell_type": "code", |
176 | - "execution_count": 146, | |
122 | + "execution_count": 165, | |
177 | 123 | "metadata": {}, |
178 | 124 | "outputs": [], |
179 | 125 | "source": [ |
... | ... | @@ -190,7 +136,7 @@ |
190 | 136 | }, |
191 | 137 | { |
192 | 138 | "cell_type": "code", |
193 | - "execution_count": 147, | |
139 | + "execution_count": 166, | |
194 | 140 | "metadata": {}, |
195 | 141 | "outputs": [ |
196 | 142 | { |
... | ... | @@ -257,7 +203,7 @@ |
257 | 203 | }, |
258 | 204 | { |
259 | 205 | "cell_type": "code", |
260 | - "execution_count": 148, | |
206 | + "execution_count": 167, | |
261 | 207 | "metadata": {}, |
262 | 208 | "outputs": [ |
263 | 209 | { |
... | ... | @@ -300,7 +246,7 @@ |
300 | 246 | }, |
301 | 247 | { |
302 | 248 | "cell_type": "code", |
303 | - "execution_count": 149, | |
249 | + "execution_count": 168, | |
304 | 250 | "metadata": {}, |
305 | 251 | "outputs": [], |
306 | 252 | "source": [ |
... | ... | @@ -346,7 +292,7 @@ |
346 | 292 | }, |
347 | 293 | { |
348 | 294 | "cell_type": "code", |
349 | - "execution_count": 150, | |
295 | + "execution_count": 169, | |
350 | 296 | "metadata": {}, |
351 | 297 | "outputs": [], |
352 | 298 | "source": [ |
353 | 299 | |
354 | 300 | |
... | ... | @@ -380,13 +326,25 @@ |
380 | 326 | }, |
381 | 327 | { |
382 | 328 | "cell_type": "code", |
383 | - "execution_count": 151, | |
329 | + "execution_count": 170, | |
384 | 330 | "metadata": {}, |
385 | 331 | "outputs": [], |
386 | 332 | "source": [ |
333 | + "# converting data type\n", | |
334 | + "padded_hours[\"user\"] = padded_hours[\"user\"].astype(int)\n", | |
335 | + "padded_hours[\"hour\"] = padded_hours[\"hour\"].astype(int)\n", | |
336 | + "padded_hours[\"walked\"] = padded_hours[\"walked\"].astype(int)\n", | |
337 | + "\n", | |
338 | + "padded_threehours[\"user\"] = padded_threehours[\"user\"].astype(int)\n", | |
339 | + "padded_threehours[\"threehour\"] = padded_threehours[\"threehour\"].astype(int)\n", | |
340 | + "padded_threehours[\"walked\"] = padded_threehours[\"walked\"].astype(int)\n", | |
341 | + "\n", | |
387 | 342 | "# save the data\n", |
388 | 343 | "padded_hours.to_csv(os.path.join(data_dir, \"padded_hours.csv\"), index=False)\n", |
389 | - "padded_threehours.to_csv(os.path.join(data_dir, \"padded_threehours.csv\"), index=False)" | |
344 | + "padded_threehours.to_csv(os.path.join(data_dir, \"padded_threehours.csv\"), index=False)\n", | |
345 | + "\n", | |
346 | + "padded_hours.to_pickle(os.path.join(data_dir, \"padded_hours.pkl\"))\n", | |
347 | + "padded_threehours.to_pickle(os.path.join(data_dir, \"padded_threehours.pkl\"))" | |
390 | 348 | ] |
391 | 349 | } |
392 | 350 | ], |
python-notebook/prepare_trteva_data.ipynb
View file @
979a88c
1 | +{ | |
2 | + "cells": [ | |
3 | + { | |
4 | + "cell_type": "code", | |
5 | + "execution_count": 1, | |
6 | + "metadata": {}, | |
7 | + "outputs": [], | |
8 | + "source": [ | |
9 | + "import numpy as np\n", | |
10 | + "import pandas as pd\n", | |
11 | + "import os\n", | |
12 | + "from tools import *\n", | |
13 | + "from constants import *\n", | |
14 | + "from tensorflow.keras.utils import to_categorical" | |
15 | + ] | |
16 | + }, | |
17 | + { | |
18 | + "cell_type": "markdown", | |
19 | + "metadata": {}, | |
20 | + "source": [ | |
21 | + "# Prepare Training, Testing, and Validation Data\n", | |
22 | + "## Loading the preprocessed data" | |
23 | + ] | |
24 | + }, | |
25 | + { | |
26 | + "cell_type": "code", | |
27 | + "execution_count": 2, | |
28 | + "metadata": {}, | |
29 | + "outputs": [], | |
30 | + "source": [ | |
# NOTE(review): the original comment said "to use unlimited memory for large
# dataframes", but this option only silences the SettingWithCopyWarning for
# chained assignments — it has no effect on memory usage.
pd.options.mode.chained_assignment = None

data_dir = '../data'

# load the per-hour / per-three-hour frames pickled by data_loading.ipynb
padded_hours = pd.read_pickle(os.path.join(data_dir, 'padded_hours.pkl'))
padded_threehours = pd.read_pickle(os.path.join(data_dir, 'padded_threehours.pkl'))
38 | + ] | |
39 | + }, | |
40 | + { | |
41 | + "cell_type": "markdown", | |
42 | + "metadata": {}, | |
43 | + "source": [ | |
44 | + "## Enumerating Output Data" | |
45 | + ] | |
46 | + }, | |
47 | + { | |
48 | + "cell_type": "code", | |
49 | + "execution_count": 3, | |
50 | + "metadata": {}, | |
51 | + "outputs": [], | |
52 | + "source": [ | |
# return the output (label) value for one sample row
def get_output(y):
    """Return the 'walked' target value stored on row *y*."""
    walked_label = y["walked"]
    return walked_label
56 | + "\n", | |
# return the input (feature) vector for one sample row
def get_input(y, padded_hours):
    """Build the model input vector for sample row *y*.

    y: row with "user", "local_date" and "threehour" fields.
    padded_hours: per-user, per-hour activity frame with "user",
        "local_date" and "walked" columns.

    Returns a flat pd.Series: one-hot hour of day (24), weekday (7),
    month (12), day of month (31), followed by the one-hot encoded
    hourly gait history of the NUMBER_OF_WEEKS_FOR_LOOKING_BACK weeks
    preceding the sample date.

    Raises Exception when the whole look-back window has no movement data.
    """
    # base information
    user = y["user"]
    local_date = y["local_date"]
    threehour_idx = y["threehour"]

    # derived information
    hour_idx = threehour_idx * 3
    encoded_hour_idx = to_categorical(hour_idx, num_classes=24)
    end_date = local_date - timedelta(days=1)
    start_date = end_date - timedelta(days=7 * NUMBER_OF_WEEKS_FOR_LOOKING_BACK - 1)
    weekday = local_date.weekday()
    encoded_weekday = to_categorical(weekday, num_classes=7)
    # month (1..12) and day (1..31) are shifted to 0-based class indices:
    # to_categorical expects labels in [0, num_classes); the original passed
    # the raw 1-based values, which is out of bounds for December / day 31.
    encoded_month = to_categorical(local_date.month - 1, num_classes=12)
    encoded_day_of_month = to_categorical(local_date.day - 1, num_classes=31)

    gait = pd.Series([], dtype=int)
    # gait movement history: one 24-hour x 3-class one-hot block per day
    zero_move = 0
    for a_date in date_range(start_date, end_date):
        day_df = padded_hours[(padded_hours["user"] == user) & (padded_hours["local_date"] == a_date)]
        if day_df.size == 0:
            # missing day: pad all 24 hours with the class-0 one-hot [1, 0, 0]
            gait = pd.concat([gait, pd.Series([1, 0, 0] * 24, dtype=int)])
            zero_move += 1
        else:
            gait = pd.concat([gait, pd.Series(to_categorical(day_df["walked"].values, 3, dtype=int).reshape(24 * 3), dtype=int)])
    # was hard-coded as 5 * 7; tie it to the look-back constant instead
    if zero_move == NUMBER_OF_WEEKS_FOR_LOOKING_BACK * 7:
        raise Exception("No movement data")

    return_series = pd.Series([], dtype=int)
    return_series = pd.concat([return_series, pd.Series(encoded_hour_idx, dtype=np.int_)])
    return_series = pd.concat([return_series, pd.Series(encoded_weekday, dtype=np.int_)])
    return_series = pd.concat([return_series, pd.Series(encoded_month, dtype=np.int_)])
    return_series = pd.concat([return_series, pd.Series(encoded_day_of_month, dtype=np.int_)])
    return_series = pd.concat([return_series, gait])

    return return_series
95 | + "\n", | |
# assemble one column per sample for rows [start_idx, end_idx) of
# padded_threehours; samples that cannot be built are skipped
def get_database(start_idx, end_idx):
    """Return a DataFrame with one column per usable sample.

    Each column holds [user, days since the user's first record,
    three-hour index, hour index, output label, input features...].
    Samples whose features cannot be built (e.g. get_input raising
    "No movement data") are skipped on purpose — this is a
    best-effort batch builder over the global padded_* frames.
    """
    database = pd.DataFrame({}, dtype=int)

    for i in range(start_idx, end_idx):
        try:
            y = padded_threehours.iloc[i, :]
            user = y["user"]
            local_date = y["local_date"]
            first_day = padded_hours[padded_hours["user"] == user]["local_date"].min()
            date_diff = (local_date - first_day).days

            threehour_idx = y["threehour"]
            hour_idx = threehour_idx * 3

            output_value = get_output(y)
            # renamed from "input" to avoid shadowing the builtin
            input_features = get_input(y, padded_hours)

            temp_series = pd.Series([], dtype=int)
            temp_series = pd.concat([temp_series, pd.Series(user, dtype=int)])
            temp_series = pd.concat([temp_series, pd.Series(date_diff, dtype=int)])
            temp_series = pd.concat([temp_series, pd.Series(threehour_idx, dtype=int)])
            temp_series = pd.concat([temp_series, pd.Series(hour_idx, dtype=int)])
            temp_series = pd.concat([temp_series, pd.Series(output_value, dtype=int)])
            temp_series = pd.concat([temp_series, pd.Series(input_features, dtype=int)])

            database = pd.concat([database, temp_series], axis=1)
        except Exception:
            # deliberately swallow: unusable samples are dropped from the batch
            pass

    return database
128 | + "\n" | |
129 | + ] | |
130 | + }, | |
131 | + { | |
132 | + "cell_type": "code", | |
133 | + "execution_count": 4, | |
134 | + "metadata": {}, | |
135 | + "outputs": [], | |
136 | + "source": [ | |
# NOTE(review): this cell looks like a scratch sanity check of the Keras
# pipeline on MNIST — it is unrelated to the walking data above; confirm
# whether it should remain in this notebook.
from tensorflow.keras.datasets import mnist
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten, Activation


(x_train, y_train), (x_test, y_test) = mnist.load_data(path='mnist.npz')

# flatten the 28x28 images to 784-vectors and scale pixel values to [0, 1]
X_train = x_train.reshape(60000, 784).astype('float32') / 255
X_test = x_test.reshape(10000, 784).astype('float32') / 255

# one-hot encode the 10 digit classes
Y_train = to_categorical(y_train, 10)
Y_test = to_categorical(y_test, 10)
149 | + ] | |
150 | + }, | |
151 | + { | |
152 | + "cell_type": "code", | |
153 | + "execution_count": null, | |
154 | + "metadata": {}, | |
155 | + "outputs": [], | |
156 | + "source": [] | |
157 | + } | |
158 | + ], | |
159 | + "metadata": { | |
160 | + "interpreter": { | |
161 | + "hash": "80dbe1014b4652684caa329d41db00af3ae439be86b11eab7e35b518e5d8ab1a" | |
162 | + }, | |
163 | + "kernelspec": { | |
164 | + "display_name": "Python 3.7.9 64-bit ('venv': venv)", | |
165 | + "language": "python", | |
166 | + "name": "python3" | |
167 | + }, | |
168 | + "language_info": { | |
169 | + "codemirror_mode": { | |
170 | + "name": "ipython", | |
171 | + "version": 3 | |
172 | + }, | |
173 | + "file_extension": ".py", | |
174 | + "mimetype": "text/x-python", | |
175 | + "name": "python", | |
176 | + "nbconvert_exporter": "python", | |
177 | + "pygments_lexer": "ipython3", | |
178 | + "version": "3.7.9" | |
179 | + }, | |
180 | + "orig_nbformat": 4 | |
181 | + }, | |
182 | + "nbformat": 4, | |
183 | + "nbformat_minor": 2 | |
184 | +} |
python-notebook/tools.py
View file @
979a88c
1 | +import pandas as pd | |
2 | + | |
3 | +from itertools import product | |
4 | +from datetime import date, timedelta | |
5 | + | |
6 | + | |
# convert a datetime object to a date object
def get_date(x):
    """Truncate the timestamp *x* to its calendar date."""
    year, month, day = x.year, x.month, x.day
    return date(year, month, day)
10 | + | |
# convert a datetime object to the number of minutes since midnight
def get_minute_index(x):
    """Return x's position within its day, in minutes (0..1439)."""
    minutes_past_midnight = 60 * x.hour + x.minute
    return minutes_past_midnight
14 | + | |
# yield every date from start_date to end_date, both ends included
def date_range(start_date, end_date):
    """Generate the inclusive range of dates [start_date, end_date]."""
    total_days = (end_date - start_date).days
    for offset in range(total_days + 1):
        yield start_date + timedelta(days=offset)
21 | + | |
# one step of the iterative walk calculation (merging consecutive active minutes)
def calculate_walk(cv):
    """Link each active minute to the minute immediately after it.

    cv: frame with columns ["user", "local_date", "local_minute_index",
    "add_count"], one row per active minute. Returns the rows whose
    following minute is also active, with add_count incremented by one,
    restricted to those four columns.
    """
    nv = cv.copy(deep=True)
    nv["prev_minute_index"] = nv["local_minute_index"] - 1

    # move midnight minutes to the previous day so runs can span midnight.
    # BUG FIX: the original used chained indexing (nv[mask]["local_date"] -= ...),
    # which assigns into a temporary copy and silently does nothing; .loc
    # assigns into nv itself.
    midnight = nv["prev_minute_index"] < 0
    nv.loc[midnight, "local_date"] = nv.loc[midnight, "local_date"].apply(
        lambda d: d - timedelta(days=1))
    nv.loc[midnight, "prev_minute_index"] = 1439  # last minute of the previous day

    nv = nv[["user", "local_date", "prev_minute_index"]]
    # a cv row survives iff some other active minute's previous minute is it
    jv = cv.merge(nv,
                  left_on=["user", "local_date", "local_minute_index"],
                  right_on=["user", "local_date", "prev_minute_index"],
                  how="inner")
    jv["add_count"] += 1
    jv = jv[["user", "local_date", "local_minute_index", "add_count"]]

    return jv
37 | + | |
# generate the complete Cartesian product of two frames' rows
def product_df(mat1, mat2):
    """Return the cross product of the deduplicated rows of mat1 and mat2.

    The result has one row per (mat1-row, mat2-row) pair and the columns
    of both inputs.
    """
    left = mat1.drop_duplicates()
    right = mat2.drop_duplicates()

    pairs = pd.DataFrame(list(product(left.values, right.values)))
    # unpack the paired row-arrays back into named columns
    for idx, col in enumerate(left.columns):
        pairs[col] = pairs[0].apply(lambda row, i=idx: row[i])
    for idx, col in enumerate(right.columns):
        pairs[col] = pairs[1].apply(lambda row, i=idx: row[i])
    return pairs.drop(columns=[0, 1])