Commit 163adf28ec78be3d551592fa51ed7a50c0aa6dcd

Authored by Junghwan Park
1 parent 979a88c36e
Exists in main

Make prepare_trteva_data faster: pre-expand one-hot gait encodings once and replace per-row DataFrame scans with an indexed lookup

Showing 1 changed file with 62 additions and 9 deletions Inline Diff

python-notebook/prepare_trteva_data.ipynb View file @ 163adf2
{ 1 1 {
"cells": [ 2 2 "cells": [
{ 3 3 {
"cell_type": "code", 4 4 "cell_type": "code",
"execution_count": 1, 5 5 "execution_count": 1,
"metadata": {}, 6 6 "metadata": {},
"outputs": [], 7 7 "outputs": [],
"source": [ 8 8 "source": [
"import numpy as np\n", 9 9 "import numpy as np\n",
"import pandas as pd\n", 10 10 "import pandas as pd\n",
"import os\n", 11 11 "import os\n",
"from tools import *\n", 12 12 "from tools import *\n",
"from constants import *\n", 13 13 "from constants import *\n",
"from tensorflow.keras.utils import to_categorical" 14 14 "from tensorflow.keras.utils import to_categorical\n",
15 "\n",
16 "# %load_ext line_profiler"
] 15 17 ]
}, 16 18 },
{ 17 19 {
"cell_type": "markdown", 18 20 "cell_type": "markdown",
"metadata": {}, 19 21 "metadata": {},
"source": [ 20 22 "source": [
"# Prepare Training, Testing, and Validation Data\n", 21 23 "# Prepare Training, Testing, and Validation Data\n",
"## Loading the preprocessed data" 22 24 "## Loading the preprocessed data"
] 23 25 ]
}, 24 26 },
{ 25 27 {
"cell_type": "code", 26 28 "cell_type": "code",
"execution_count": 2, 27 29 "execution_count": 2,
"metadata": {}, 28 30 "metadata": {},
"outputs": [], 29 31 "outputs": [],
"source": [ 30 32 "source": [
"# silence pandas SettingWithCopyWarning (chained assignment) for large dataframes\n", 31 33 "# silence pandas SettingWithCopyWarning (chained assignment) for large dataframes\n",
"pd.options.mode.chained_assignment = None\n", 32 34 "pd.options.mode.chained_assignment = None\n",
"\n", 33 35 "\n",
"data_dir = '../data'\n", 34 36 "data_dir = '../data'\n",
"\n", 35 37 "\n",
"padded_hours = pd.read_pickle(os.path.join(data_dir, 'padded_hours.pkl'))\n", 36 38 "padded_hours = pd.read_pickle(os.path.join(data_dir, 'padded_hours.pkl'))\n",
"padded_threehours = pd.read_pickle(os.path.join(data_dir, 'padded_threehours.pkl'))" 37 39 "padded_threehours = pd.read_pickle(os.path.join(data_dir, 'padded_threehours.pkl'))"
] 38 40 ]
}, 39 41 },
{ 40 42 {
"cell_type": "markdown", 41 43 "cell_type": "markdown",
"metadata": {}, 42 44 "metadata": {},
"source": [ 43 45 "source": [
46 "# Expanding one-hot-encoded gaits"
47 ]
48 },
49 {
50 "cell_type": "code",
51 "execution_count": 13,
52 "metadata": {},
53 "outputs": [
54 {
55 "name": "stdout",
56 "output_type": "stream",
57 "text": [
58 "(42360, 4) -> (127080, 4)\n"
59 ]
60 }
61 ],
62 "source": [
63 "def mass_one_hot_encoding(padded_hours, colname, n_classes):\n",
64 " def __mass_one_hot_encoding(padded_hours, colname, n_classes, n):\n",
65 " temp = padded_hours[padded_hours[colname] == n]\n",
66 "\n",
67 " return_df = pd.DataFrame(dtype=int)\n",
68 "\n",
69 " for i in range(n_classes):\n",
70 " temp_2 = temp.copy(deep=True)\n",
71 " temp_2[\"var\"] = i\n",
 72 "                temp_2[\"value\"] = 1 if (n == i) else 0\n",
73 " return_df = pd.concat([return_df, temp_2], ignore_index=True)\n",
74 "\n",
75 " return return_df\n",
76 " \n",
77 " mass_encoded = pd.DataFrame(dtype=int)\n",
78 " for n in range(n_classes):\n",
79 " mass_encoded = pd.concat([mass_encoded, __mass_one_hot_encoding(padded_hours, colname, n_classes, n)], ignore_index=True)\n",
80 " return mass_encoded\n",
81 "\n",
82 "padded_hours_encoded = mass_one_hot_encoding(padded_hours, 'walked', 3)\n",
83 "padded_hours_encoded[\"local_date\"] = padded_hours_encoded[\"local_date\"].astype(str)\n",
84 "padded_hours_encoded = padded_hours_encoded.set_index(['user', 'local_date']).sort_index()\n",
85 "\n",
86 "print(\"{} -> {}\".format(padded_hours.shape, padded_hours_encoded.shape))"
87 ]
88 },
89 {
90 "cell_type": "markdown",
91 "metadata": {},
92 "source": [
"## Enumerating Output Data" 44 93 "## Enumerating Output Data"
] 45 94 ]
}, 46 95 },
{ 47 96 {
"cell_type": "code", 48 97 "cell_type": "code",
"execution_count": 3, 49 98 "execution_count": 15,
"metadata": {}, 50 99 "metadata": {},
"outputs": [], 51 100 "outputs": [],
"source": [ 52 101 "source": [
"# return output value\n", 53 102 "# return output value\n",
"def get_output(y):\n", 54 103 "def get_output(y):\n",
" return y[\"walked\"]\n", 55 104 " return y[\"walked\"]\n",
"\n", 56 105 "\n",
"# return input value\n", 57 106 "# return input value\n",
"def get_input(y, padded_hours):\n", 58 107 "def get_input(y, padded_hours):\n",
" # base information\n", 59 108 " # base information\n",
" user = y[\"user\"]\n", 60 109 " user = y[\"user\"]\n",
" local_date = y[\"local_date\"]\n", 61 110 " local_date = y[\"local_date\"]\n",
" threehour_idx = y[\"threehour\"]\n", 62 111 " threehour_idx = y[\"threehour\"]\n",
" \n", 63 112 " \n",
" # derived information\n", 64 113 " # derived information\n",
" hour_idx = threehour_idx * 3\n", 65 114 " hour_idx = threehour_idx * 3\n",
" encoded_hour_idx = to_categorical(hour_idx, num_classes=24)\n", 66 115 " encoded_hour_idx = to_categorical(hour_idx, num_classes=24)\n",
" end_date = local_date - timedelta(days=1)\n", 67 116 " end_date = local_date - timedelta(days=1)\n",
" start_date = end_date - timedelta(days=7*NUMBER_OF_WEEKS_FOR_LOOKING_BACK-1)\n", 68 117 " start_date = end_date - timedelta(days=7*NUMBER_OF_WEEKS_FOR_LOOKING_BACK-1)\n",
" weekday = local_date.weekday()\n", 69 118 " weekday = local_date.weekday()\n",
" encoded_weekday = to_categorical(weekday, num_classes=7)\n", 70 119 " encoded_weekday = to_categorical(weekday, num_classes=7)\n",
" encoded_month = to_categorical(local_date.month, num_classes=12)\n", 71 120 " encoded_month = to_categorical(local_date.month, num_classes=12)\n",
" encoded_day_of_month = to_categorical(local_date.day, num_classes=31)\n", 72 121 " encoded_day_of_month = to_categorical(local_date.day, num_classes=31)\n",
"\n", 73 122 "\n",
" gait = pd.Series([], dtype=int)\n", 74 123 " gait = pd.Series([], dtype=int)\n",
" # gait movement\n", 75 124 " # gait movement\n",
" zero_move = 0\n", 76 125 " zero_move = 0\n",
" for a_date in date_range(start_date, end_date):\n", 77 126 " for a_date in date_range(start_date, end_date):\n",
" day_df = padded_hours[(padded_hours[\"user\"] == user) & (padded_hours[\"local_date\"] == a_date)]\n", 78 127 " key = (user, a_date.strftime(\"%Y-%m-%d\"))\n",
" if day_df.size == 0:\n", 79 128 " if key in padded_hours_encoded.index:\n",
" gait = pd.concat([gait, pd.Series([1,0,0] * 24, dtype=int)])\n", 80 129 " day_df = padded_hours_encoded.loc[key, \"value\"]\n",
" zero_move += 1\n", 81 130 " gait = pd.concat([gait, day_df], ignore_index=True)\n",
" else:\n", 82 131 " else:\n",
" gait = pd.concat([gait, pd.Series(to_categorical(day_df[\"walked\"].values, 3, dtype=int).reshape(24*3), dtype=int)])\n", 83 132 " gait = pd.concat([gait, pd.Series([1,0,0] * 24, dtype=int)], ignore_index=True)\n",
133 " zero_move += 1\n",
" if zero_move == 5 * 7:\n", 84 134 " if zero_move == 5 * 7:\n",
" raise Exception(\"No movement data\")\n", 85 135 " raise Exception(\"No movement data\")\n",
"\n", 86 136 "\n",
" return_series = pd.Series([], dtype=int)\n", 87 137 " return_series = pd.Series([], dtype=int)\n",
" return_series = pd.concat([return_series, pd.Series(encoded_hour_idx, dtype=np.int_)])\n", 88 138 " return_series = pd.concat([return_series, pd.Series(encoded_hour_idx, dtype=np.int_)])\n",
" return_series = pd.concat([return_series, pd.Series(encoded_weekday, dtype=np.int_)])\n", 89 139 " return_series = pd.concat([return_series, pd.Series(encoded_weekday, dtype=np.int_)])\n",
" return_series = pd.concat([return_series, pd.Series(encoded_month, dtype=np.int_)])\n", 90 140 " return_series = pd.concat([return_series, pd.Series(encoded_month, dtype=np.int_)])\n",
" return_series = pd.concat([return_series, pd.Series(encoded_day_of_month, dtype=np.int_)])\n", 91 141 " return_series = pd.concat([return_series, pd.Series(encoded_day_of_month, dtype=np.int_)])\n",
" return_series = pd.concat([return_series, gait])\n", 92 142 " return_series = pd.concat([return_series, gait])\n",
" \n", 93 143 " \n",
" return return_series\n", 94 144 " return return_series\n",
"\n", 95 145 "\n",
"def get_database(start_idx, end_idx):\n", 96 146 "def get_database(start_idx, end_idx):\n",
" database = pd.DataFrame({}, dtype=int)\n", 97 147 " database = pd.DataFrame({}, dtype=int)\n",
"\n", 98 148 "\n",
" for i in range(start_idx, end_idx):\n", 99 149 " for i in range(start_idx, end_idx):\n",
" try:\n", 100 150 " try:\n",
" y = padded_threehours.iloc[i, :]\n", 101 151 " y = padded_threehours.iloc[i, :]\n",
" user = y[\"user\"]\n", 102 152 " user = y[\"user\"]\n",
" local_date = y[\"local_date\"]\n", 103 153 " local_date = y[\"local_date\"]\n",
" first_day = padded_hours[padded_hours[\"user\"] == user][\"local_date\"].min()\n", 104 154 " first_day = padded_hours[padded_hours[\"user\"] == user][\"local_date\"].min()\n",
" date_diff = (local_date - first_day).days\n", 105 155 " date_diff = (local_date - first_day).days\n",
"\n", 106 156 "\n",
" threehour_idx = y[\"threehour\"]\n", 107 157 " threehour_idx = y[\"threehour\"]\n",
" hour_idx = threehour_idx * 3\n", 108 158 " hour_idx = threehour_idx * 3\n",
"\n", 109 159 "\n",
" output = get_output(y)\n", 110 160 " output = get_output(y)\n",
" input = get_input(y, padded_hours)\n", 111 161 " input = get_input(y, padded_hours)\n",
"\n", 112 162 "\n",
" temp_series = pd.Series([], dtype=int)\n", 113 163 " temp_series = pd.Series([], dtype=int)\n",
" temp_series = pd.concat([temp_series, pd.Series(user, dtype=int)])\n", 114 164 " temp_series = pd.concat([temp_series, pd.Series(user, dtype=int)])\n",
" temp_series = pd.concat([temp_series, pd.Series(date_diff, dtype=int)])\n", 115 165 " temp_series = pd.concat([temp_series, pd.Series(date_diff, dtype=int)])\n",
" temp_series = pd.concat([temp_series, pd.Series(threehour_idx, dtype=int)])\n", 116 166 " temp_series = pd.concat([temp_series, pd.Series(threehour_idx, dtype=int)])\n",
" temp_series = pd.concat([temp_series, pd.Series(hour_idx, dtype=int)])\n", 117 167 " temp_series = pd.concat([temp_series, pd.Series(hour_idx, dtype=int)])\n",
" temp_series = pd.concat([temp_series, pd.Series(output, dtype=int)])\n", 118 168 " temp_series = pd.concat([temp_series, pd.Series(output, dtype=int)])\n",
" temp_series = pd.concat([temp_series, pd.Series(input, dtype=int)])\n", 119 169 " temp_series = pd.concat([temp_series, pd.Series(input, dtype=int)]).reset_index(drop=True)\n",
"\n", 120 170 "\n",
" database = pd.concat([database, temp_series], axis=1)\n", 121 171 " database = pd.concat([database, temp_series], axis=1)\n",
" # print(input)\n", 122 172 " # print(input)\n",
" except Exception as e:\n", 123 173 " except Exception as e:\n",
" # print(\"Error:\", e)\n", 124 174 " # print(\"Error:\", e)\n",
" pass\n", 125 175 " pass\n",
"\n", 126 176 "\n",
" return database\n", 127 177 " return database\n",
"\n" 128 178 "\n",
179 "database = get_database(0, 100)\n",
180 "\n",
181 "database.to_pickle(os.path.join(data_dir, \"database.pkl\"))"
] 129 182 ]
}, 130 183 },
{ 131 184 {
"cell_type": "code", 132 185 "cell_type": "code",
"execution_count": 4, 133 186 "execution_count": 4,
"metadata": {}, 134 187 "metadata": {},
"outputs": [], 135 188 "outputs": [],
"source": [ 136 189 "source": [
"from tensorflow.keras.datasets import mnist\n", 137 190 "from tensorflow.keras.datasets import mnist\n",
"from tensorflow.keras.models import Sequential\n", 138 191 "from tensorflow.keras.models import Sequential\n",
"from tensorflow.keras.layers import Dense, Dropout, Flatten, Activation\n", 139 192 "from tensorflow.keras.layers import Dense, Dropout, Flatten, Activation\n",
"\n", 140 193 "\n",
"\n", 141 194 "\n",
"(x_train, y_train), (x_test, y_test) = mnist.load_data(path='mnist.npz')\n", 142 195 "(x_train, y_train), (x_test, y_test) = mnist.load_data(path='mnist.npz')\n",
"\n", 143 196 "\n",
"X_train = x_train.reshape(60000, 784).astype('float32') / 255\n", 144 197 "X_train = x_train.reshape(60000, 784).astype('float32') / 255\n",
"X_test = x_test.reshape(10000, 784).astype('float32') / 255\n", 145 198 "X_test = x_test.reshape(10000, 784).astype('float32') / 255\n",
"\n", 146 199 "\n",
"Y_train = to_categorical(y_train, 10)\n", 147 200 "Y_train = to_categorical(y_train, 10)\n",
"Y_test = to_categorical(y_test, 10)" 148 201 "Y_test = to_categorical(y_test, 10)"
] 149 202 ]
}, 150 203 },
{ 151 204 {
"cell_type": "code", 152 205 "cell_type": "code",
"execution_count": null, 153 206 "execution_count": null,
"metadata": {}, 154 207 "metadata": {},
"outputs": [], 155 208 "outputs": [],
"source": [] 156 209 "source": []
} 157 210 }
], 158 211 ],
"metadata": { 159 212 "metadata": {
"interpreter": { 160 213 "interpreter": {
"hash": "80dbe1014b4652684caa329d41db00af3ae439be86b11eab7e35b518e5d8ab1a" 161 214 "hash": "80dbe1014b4652684caa329d41db00af3ae439be86b11eab7e35b518e5d8ab1a"
}, 162 215 },
"kernelspec": { 163 216 "kernelspec": {
"display_name": "Python 3.7.9 64-bit ('venv': venv)", 164 217 "display_name": "Python 3.7.9 64-bit ('venv': venv)",
"language": "python", 165 218 "language": "python",
"name": "python3" 166 219 "name": "python3"
}, 167 220 },
"language_info": { 168 221 "language_info": {
"codemirror_mode": { 169 222 "codemirror_mode": {
"name": "ipython", 170 223 "name": "ipython",
"version": 3 171 224 "version": 3
}, 172 225 },
"file_extension": ".py", 173 226 "file_extension": ".py",
"mimetype": "text/x-python", 174 227 "mimetype": "text/x-python",
"name": "python", 175 228 "name": "python",
"nbconvert_exporter": "python", 176 229 "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", 177 230 "pygments_lexer": "ipython3",
"version": "3.7.9" 178 231 "version": "3.7.9"
}, 179 232 },
"orig_nbformat": 4 180 233 "orig_nbformat": 4
}, 181 234 },
"nbformat": 4, 182 235 "nbformat": 4,
"nbformat_minor": 2 183 236 "nbformat_minor": 2
} 184 237 }
185 238