Commit 163adf28ec78be3d551592fa51ed7a50c0aa6dcd
1 parent
979a88c36e
Exists in
main
make it faster
Showing 1 changed file with 62 additions and 9 deletions Inline Diff
python-notebook/prepare_trteva_data.ipynb
View file @
163adf2
{ | 1 | 1 | { | |
"cells": [ | 2 | 2 | "cells": [ | |
{ | 3 | 3 | { | |
"cell_type": "code", | 4 | 4 | "cell_type": "code", | |
"execution_count": 1, | 5 | 5 | "execution_count": 1, | |
"metadata": {}, | 6 | 6 | "metadata": {}, | |
"outputs": [], | 7 | 7 | "outputs": [], | |
"source": [ | 8 | 8 | "source": [ | |
"import numpy as np\n", | 9 | 9 | "import numpy as np\n", | |
"import pandas as pd\n", | 10 | 10 | "import pandas as pd\n", | |
"import os\n", | 11 | 11 | "import os\n", | |
"from tools import *\n", | 12 | 12 | "from tools import *\n", | |
"from constants import *\n", | 13 | 13 | "from constants import *\n", | |
"from tensorflow.keras.utils import to_categorical" | 14 | 14 | "from tensorflow.keras.utils import to_categorical\n", | |
15 | "\n", | |||
16 | "# %load_ext line_profiler" | |||
] | 15 | 17 | ] | |
}, | 16 | 18 | }, | |
{ | 17 | 19 | { | |
"cell_type": "markdown", | 18 | 20 | "cell_type": "markdown", | |
"metadata": {}, | 19 | 21 | "metadata": {}, | |
"source": [ | 20 | 22 | "source": [ | |
"# Prepare Training, Testing, and Validation Data\n", | 21 | 23 | "# Prepare Training, Testing, and Validation Data\n", | |
"## Loading the preprocessed data" | 22 | 24 | "## Loading the preprocessed data" | |
] | 23 | 25 | ] | |
}, | 24 | 26 | }, | |
{ | 25 | 27 | { | |
"cell_type": "code", | 26 | 28 | "cell_type": "code", | |
"execution_count": 2, | 27 | 29 | "execution_count": 2, | |
"metadata": {}, | 28 | 30 | "metadata": {}, | |
"outputs": [], | 29 | 31 | "outputs": [], | |
"source": [ | 30 | 32 | "source": [ | |
"# to use unlimited memory for large dataframes\n", | 31 | 33 | "# to use unlimited memory for large dataframes\n", | |
"pd.options.mode.chained_assignment = None\n", | 32 | 34 | "pd.options.mode.chained_assignment = None\n", | |
"\n", | 33 | 35 | "\n", | |
"data_dir = '../data'\n", | 34 | 36 | "data_dir = '../data'\n", | |
"\n", | 35 | 37 | "\n", | |
"padded_hours = pd.read_pickle(os.path.join(data_dir, 'padded_hours.pkl'))\n", | 36 | 38 | "padded_hours = pd.read_pickle(os.path.join(data_dir, 'padded_hours.pkl'))\n", | |
"padded_threehours = pd.read_pickle(os.path.join(data_dir, 'padded_threehours.pkl'))" | 37 | 39 | "padded_threehours = pd.read_pickle(os.path.join(data_dir, 'padded_threehours.pkl'))" | |
] | 38 | 40 | ] | |
}, | 39 | 41 | }, | |
{ | 40 | 42 | { | |
"cell_type": "markdown", | 41 | 43 | "cell_type": "markdown", | |
"metadata": {}, | 42 | 44 | "metadata": {}, | |
"source": [ | 43 | 45 | "source": [ | |
46 | "## Expanding one-hot-encoded gaits" | |||
47 | ] | |||
48 | }, | |||
49 | { | |||
50 | "cell_type": "code", | |||
51 | "execution_count": 13, | |||
52 | "metadata": {}, | |||
53 | "outputs": [ | |||
54 | { | |||
55 | "name": "stdout", | |||
56 | "output_type": "stream", | |||
57 | "text": [ | |||
58 | "(42360, 4) -> (127080, 4)\n" | |||
59 | ] | |||
60 | } | |||
61 | ], | |||
62 | "source": [ | |||
63 | "def mass_one_hot_encoding(padded_hours, colname, n_classes):\n", | |||
64 | " def __mass_one_hot_encoding(padded_hours, colname, n_classes, n):\n", | |||
65 | " temp = padded_hours[padded_hours[colname] == n]\n", | |||
66 | "\n", | |||
67 | " return_df = pd.DataFrame(dtype=int)\n", | |||
68 | "\n", | |||
69 | " for i in range(n_classes):\n", | |||
70 | " temp_2 = temp.copy(deep=True)\n", | |||
71 | " temp_2[\"var\"] = i\n", | |||
72 | " temp_2[\"value\"] = (n == i) if 1 else 0\n", | |||
73 | " return_df = pd.concat([return_df, temp_2], ignore_index=True)\n", | |||
74 | "\n", | |||
75 | " return return_df\n", | |||
76 | " \n", | |||
77 | " mass_encoded = pd.DataFrame(dtype=int)\n", | |||
78 | " for n in range(n_classes):\n", | |||
79 | " mass_encoded = pd.concat([mass_encoded, __mass_one_hot_encoding(padded_hours, colname, n_classes, n)], ignore_index=True)\n", | |||
80 | " return mass_encoded\n", | |||
81 | "\n", | |||
82 | "padded_hours_encoded = mass_one_hot_encoding(padded_hours, 'walked', 3)\n", | |||
83 | "padded_hours_encoded[\"local_date\"] = padded_hours_encoded[\"local_date\"].astype(str)\n", | |||
84 | "padded_hours_encoded = padded_hours_encoded.set_index(['user', 'local_date']).sort_index()\n", | |||
85 | "\n", | |||
86 | "print(\"{} -> {}\".format(padded_hours.shape, padded_hours_encoded.shape))" | |||
87 | ] | |||
88 | }, | |||
89 | { | |||
90 | "cell_type": "markdown", | |||
91 | "metadata": {}, | |||
92 | "source": [ | |||
"## Enumerating Output Data" | 44 | 93 | "## Enumerating Output Data" | |
] | 45 | 94 | ] | |
}, | 46 | 95 | }, | |
{ | 47 | 96 | { | |
"cell_type": "code", | 48 | 97 | "cell_type": "code", | |
"execution_count": 3, | 49 | 98 | "execution_count": 15, | |
"metadata": {}, | 50 | 99 | "metadata": {}, | |
"outputs": [], | 51 | 100 | "outputs": [], | |
"source": [ | 52 | 101 | "source": [ | |
"# return output value\n", | 53 | 102 | "# return output value\n", | |
"def get_output(y):\n", | 54 | 103 | "def get_output(y):\n", | |
" return y[\"walked\"]\n", | 55 | 104 | " return y[\"walked\"]\n", | |
"\n", | 56 | 105 | "\n", | |
"# return intput value\n", | 57 | 106 | "# return intput value\n", | |
"def get_input(y, padded_hours):\n", | 58 | 107 | "def get_input(y, padded_hours):\n", | |
" # base information\n", | 59 | 108 | " # base information\n", | |
" user = y[\"user\"]\n", | 60 | 109 | " user = y[\"user\"]\n", | |
" local_date = y[\"local_date\"]\n", | 61 | 110 | " local_date = y[\"local_date\"]\n", | |
" threehour_idx = y[\"threehour\"]\n", | 62 | 111 | " threehour_idx = y[\"threehour\"]\n", | |
" \n", | 63 | 112 | " \n", | |
" # derived information\n", | 64 | 113 | " # derived information\n", | |
" hour_idx = threehour_idx * 3\n", | 65 | 114 | " hour_idx = threehour_idx * 3\n", | |
" encoded_hour_idx = to_categorical(hour_idx, num_classes=24)\n", | 66 | 115 | " encoded_hour_idx = to_categorical(hour_idx, num_classes=24)\n", | |
" end_date = local_date - timedelta(days=1)\n", | 67 | 116 | " end_date = local_date - timedelta(days=1)\n", | |
" start_date = end_date - timedelta(days=7*NUMBER_OF_WEEKS_FOR_LOOKING_BACK-1)\n", | 68 | 117 | " start_date = end_date - timedelta(days=7*NUMBER_OF_WEEKS_FOR_LOOKING_BACK-1)\n", | |
" weekday = local_date.weekday()\n", | 69 | 118 | " weekday = local_date.weekday()\n", | |
" encoded_weekday = to_categorical(weekday, num_classes=7)\n", | 70 | 119 | " encoded_weekday = to_categorical(weekday, num_classes=7)\n", | |
" encoded_month = to_categorical(local_date.month, num_classes=12)\n", | 71 | 120 | " encoded_month = to_categorical(local_date.month, num_classes=12)\n", | |
" encoded_day_of_month = to_categorical(local_date.day, num_classes=31)\n", | 72 | 121 | " encoded_day_of_month = to_categorical(local_date.day, num_classes=31)\n", | |
"\n", | 73 | 122 | "\n", | |
" gait = pd.Series([], dtype=int)\n", | 74 | 123 | " gait = pd.Series([], dtype=int)\n", | |
" # gait movement\n", | 75 | 124 | " # gait movement\n", | |
" zero_move = 0\n", | 76 | 125 | " zero_move = 0\n", | |
" for a_date in date_range(start_date, end_date):\n", | 77 | 126 | " for a_date in date_range(start_date, end_date):\n", | |
" day_df = padded_hours[(padded_hours[\"user\"] == user) & (padded_hours[\"local_date\"] == a_date)]\n", | 78 | 127 | " key = (user, a_date.strftime(\"%Y-%m-%d\"))\n", | |
" if day_df.size == 0:\n", | 79 | 128 | " if key in padded_hours_encoded.index:\n", | |
" gait = pd.concat([gait, pd.Series([1,0,0] * 24, dtype=int)])\n", | 80 | 129 | " day_df = padded_hours_encoded.loc[key, \"value\"]\n", | |
" zero_move += 1\n", | 81 | 130 | " gait = pd.concat([gait, day_df], ignore_index=True)\n", | |
" else:\n", | 82 | 131 | " else:\n", | |
" gait = pd.concat([gait, pd.Series(to_categorical(day_df[\"walked\"].values, 3, dtype=int).reshape(24*3), dtype=int)])\n", | 83 | 132 | " gait = pd.concat([gait, pd.Series([1,0,0] * 24, dtype=int)], ignore_index=True)\n", | |
133 | " zero_move += 1\n", | |||
" if zero_move == 5 * 7:\n", | 84 | 134 | " if zero_move == 5 * 7:\n", | |
" raise Exception(\"No movement data\")\n", | 85 | 135 | " raise Exception(\"No movement data\")\n", | |
"\n", | 86 | 136 | "\n", | |
" return_series = pd.Series([], dtype=int)\n", | 87 | 137 | " return_series = pd.Series([], dtype=int)\n", | |
" return_series = pd.concat([return_series, pd.Series(encoded_hour_idx, dtype=np.int_)])\n", | 88 | 138 | " return_series = pd.concat([return_series, pd.Series(encoded_hour_idx, dtype=np.int_)])\n", | |
" return_series = pd.concat([return_series, pd.Series(encoded_weekday, dtype=np.int_)])\n", | 89 | 139 | " return_series = pd.concat([return_series, pd.Series(encoded_weekday, dtype=np.int_)])\n", | |
" return_series = pd.concat([return_series, pd.Series(encoded_month, dtype=np.int_)])\n", | 90 | 140 | " return_series = pd.concat([return_series, pd.Series(encoded_month, dtype=np.int_)])\n", | |
" return_series = pd.concat([return_series, pd.Series(encoded_day_of_month, dtype=np.int_)])\n", | 91 | 141 | " return_series = pd.concat([return_series, pd.Series(encoded_day_of_month, dtype=np.int_)])\n", | |
" return_series = pd.concat([return_series, gait])\n", | 92 | 142 | " return_series = pd.concat([return_series, gait])\n", | |
" \n", | 93 | 143 | " \n", | |
" return return_series\n", | 94 | 144 | " return return_series\n", | |
"\n", | 95 | 145 | "\n", | |
"def get_database(start_idx, end_idx):\n", | 96 | 146 | "def get_database(start_idx, end_idx):\n", | |
" database = pd.DataFrame({}, dtype=int)\n", | 97 | 147 | " database = pd.DataFrame({}, dtype=int)\n", | |
"\n", | 98 | 148 | "\n", | |
" for i in range(start_idx, end_idx):\n", | 99 | 149 | " for i in range(start_idx, end_idx):\n", | |
" try:\n", | 100 | 150 | " try:\n", | |
" y = padded_threehours.iloc[i, :]\n", | 101 | 151 | " y = padded_threehours.iloc[i, :]\n", | |
" user = y[\"user\"]\n", | 102 | 152 | " user = y[\"user\"]\n", | |
" local_date = y[\"local_date\"]\n", | 103 | 153 | " local_date = y[\"local_date\"]\n", | |
" first_day = padded_hours[padded_hours[\"user\"] == user][\"local_date\"].min()\n", | 104 | 154 | " first_day = padded_hours[padded_hours[\"user\"] == user][\"local_date\"].min()\n", | |
" date_diff = (local_date - first_day).days\n", | 105 | 155 | " date_diff = (local_date - first_day).days\n", | |
"\n", | 106 | 156 | "\n", | |
" threehour_idx = y[\"threehour\"]\n", | 107 | 157 | " threehour_idx = y[\"threehour\"]\n", | |
" hour_idx = threehour_idx * 3\n", | 108 | 158 | " hour_idx = threehour_idx * 3\n", | |
"\n", | 109 | 159 | "\n", | |
" output = get_output(y)\n", | 110 | 160 | " output = get_output(y)\n", | |
" input = get_input(y, padded_hours)\n", | 111 | 161 | " input = get_input(y, padded_hours)\n", | |
"\n", | 112 | 162 | "\n", | |
" temp_series = pd.Series([], dtype=int)\n", | 113 | 163 | " temp_series = pd.Series([], dtype=int)\n", | |
" temp_series = pd.concat([temp_series, pd.Series(user, dtype=int)])\n", | 114 | 164 | " temp_series = pd.concat([temp_series, pd.Series(user, dtype=int)])\n", | |
" temp_series = pd.concat([temp_series, pd.Series(date_diff, dtype=int)])\n", | 115 | 165 | " temp_series = pd.concat([temp_series, pd.Series(date_diff, dtype=int)])\n", | |
" temp_series = pd.concat([temp_series, pd.Series(threehour_idx, dtype=int)])\n", | 116 | 166 | " temp_series = pd.concat([temp_series, pd.Series(threehour_idx, dtype=int)])\n", | |
" temp_series = pd.concat([temp_series, pd.Series(hour_idx, dtype=int)])\n", | 117 | 167 | " temp_series = pd.concat([temp_series, pd.Series(hour_idx, dtype=int)])\n", | |
" temp_series = pd.concat([temp_series, pd.Series(output, dtype=int)])\n", | 118 | 168 | " temp_series = pd.concat([temp_series, pd.Series(output, dtype=int)])\n", | |
" temp_series = pd.concat([temp_series, pd.Series(input, dtype=int)])\n", | 119 | 169 | " temp_series = pd.concat([temp_series, pd.Series(input, dtype=int)]).reset_index(drop=True)\n", | |
"\n", | 120 | 170 | "\n", | |
" database = pd.concat([database, temp_series], axis=1)\n", | 121 | 171 | " database = pd.concat([database, temp_series], axis=1)\n", | |
" # print(input)\n", | 122 | 172 | " # print(input)\n", | |
" except Exception as e:\n", | 123 | 173 | " except Exception as e:\n", | |
" # print(\"Error:\", e)\n", | 124 | 174 | " # print(\"Error:\", e)\n", | |
" pass\n", | 125 | 175 | " pass\n", | |
"\n", | 126 | 176 | "\n", | |
" return database\n", | 127 | 177 | " return database\n", | |
"\n" | 128 | 178 | "\n", | |
179 | "database = get_database(0, 100)\n", | |||
180 | "\n", | |||
181 | "database.to_pickle(os.path.join(data_dir, \"database.pkl\"))" | |||
] | 129 | 182 | ] | |
}, | 130 | 183 | }, | |
{ | 131 | 184 | { | |
"cell_type": "code", | 132 | 185 | "cell_type": "code", | |
"execution_count": 4, | 133 | 186 | "execution_count": 4, | |
"metadata": {}, | 134 | 187 | "metadata": {}, | |
"outputs": [], | 135 | 188 | "outputs": [], | |
"source": [ | 136 | 189 | "source": [ | |
"from tensorflow.keras.datasets import mnist\n", | 137 | 190 | "from tensorflow.keras.datasets import mnist\n", | |
"from tensorflow.keras.models import Sequential\n", | 138 | 191 | "from tensorflow.keras.models import Sequential\n", | |
"from tensorflow.keras.layers import Dense, Dropout, Flatten, Activation\n", | 139 | 192 | "from tensorflow.keras.layers import Dense, Dropout, Flatten, Activation\n", | |
"\n", | 140 | 193 | "\n", | |
"\n", | 141 | 194 | "\n", | |
"(x_train, y_train), (x_test, y_test) = mnist.load_data(path='mnist.npz')\n", | 142 | 195 | "(x_train, y_train), (x_test, y_test) = mnist.load_data(path='mnist.npz')\n", | |
"\n", | 143 | 196 | "\n", | |
"X_train = x_train.reshape(60000, 784).astype('float32') / 255\n", | 144 | 197 | "X_train = x_train.reshape(60000, 784).astype('float32') / 255\n", | |
"X_test = x_test.reshape(10000, 784).astype('float32') / 255\n", | 145 | 198 | "X_test = x_test.reshape(10000, 784).astype('float32') / 255\n", | |
"\n", | 146 | 199 | "\n", | |
"Y_train = to_categorical(y_train, 10)\n", | 147 | 200 | "Y_train = to_categorical(y_train, 10)\n", | |
"Y_test = to_categorical(y_test, 10)" | 148 | 201 | "Y_test = to_categorical(y_test, 10)" | |
] | 149 | 202 | ] | |
}, | 150 | 203 | }, | |
{ | 151 | 204 | { | |
"cell_type": "code", | 152 | 205 | "cell_type": "code", | |
"execution_count": null, | 153 | 206 | "execution_count": null, | |
"metadata": {}, | 154 | 207 | "metadata": {}, | |
"outputs": [], | 155 | 208 | "outputs": [], | |
"source": [] | 156 | 209 | "source": [] | |
} | 157 | 210 | } | |
], | 158 | 211 | ], | |
"metadata": { | 159 | 212 | "metadata": { | |
"interpreter": { | 160 | 213 | "interpreter": { | |
"hash": "80dbe1014b4652684caa329d41db00af3ae439be86b11eab7e35b518e5d8ab1a" | 161 | 214 | "hash": "80dbe1014b4652684caa329d41db00af3ae439be86b11eab7e35b518e5d8ab1a" | |
}, | 162 | 215 | }, | |
"kernelspec": { | 163 | 216 | "kernelspec": { | |
"display_name": "Python 3.7.9 64-bit ('venv': venv)", | 164 | 217 | "display_name": "Python 3.7.9 64-bit ('venv': venv)", | |
"language": "python", | 165 | 218 | "language": "python", | |
"name": "python3" | 166 | 219 | "name": "python3" | |
}, | 167 | 220 | }, | |
"language_info": { | 168 | 221 | "language_info": { | |
"codemirror_mode": { | 169 | 222 | "codemirror_mode": { | |
"name": "ipython", | 170 | 223 | "name": "ipython", | |
"version": 3 | 171 | 224 | "version": 3 | |
}, | 172 | 225 | }, | |
"file_extension": ".py", | 173 | 226 | "file_extension": ".py", | |
"mimetype": "text/x-python", | 174 | 227 | "mimetype": "text/x-python", | |
"name": "python", | 175 | 228 | "name": "python", | |
"nbconvert_exporter": "python", | 176 | 229 | "nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | 177 | 230 | "pygments_lexer": "ipython3", | |
"version": "3.7.9" | 178 | 231 | "version": "3.7.9" | |
}, | 179 | 232 | }, | |
"orig_nbformat": 4 | 180 | 233 | "orig_nbformat": 4 | |
}, | 181 | 234 | }, | |
"nbformat": 4, | 182 | 235 | "nbformat": 4, | |
"nbformat_minor": 2 | 183 | 236 | "nbformat_minor": 2 | |
} | 184 | 237 | } | |
185 | 238 | |||