Commit 979a88c36e8e832d4a38b058c355919d741dc010

Authored by Junghwan Park
1 parent aa77698374
Exists in main

picking up the input/output data values

Showing 6 changed files with 270 additions and 68 deletions Side-by-side Diff

python-notebook/__pycache__/constants.cpython-37.pyc View file @ 979a88c

No preview for this file type

python-notebook/__pycache__/tools.cpython-37.pyc View file @ 979a88c

No preview for this file type

python-notebook/constants.py View file @ 979a88c
  1 +# cut off values that are not in the range of the data
  2 +THRESHOLD_OF_DAYS_PER_USER = 10
  3 +
  4 +# cut off values for the number of consecutive minutes for a walk
  5 +MINIMUM_NUMBER_OF_MINUTES_FOR_A_WALK = 5
  6 +
  7 +# cut off values for the number of steps per minute for an active minute
  8 +MINIMUM_STEPS_PER_MINUTE = 60
  9 +
  10 +# cut off value for the number of weeks for looking back
  11 +NUMBER_OF_WEEKS_FOR_LOOKING_BACK = 5
python-notebook/data_loading.ipynb View file @ 979a88c
... ... @@ -9,7 +9,7 @@
9 9 },
10 10 {
11 11 "cell_type": "code",
12   - "execution_count": 142,
  12 + "execution_count": 161,
13 13 "metadata": {},
14 14 "outputs": [],
15 15 "source": [
... ... @@ -19,9 +19,8 @@
19 19 "from pandas import read_csv\n",
20 20 "import pandas as pd\n",
21 21 "import os\n",
22   - "from datetime import datetime, date, timedelta\n",
23   - "from itertools import product\n",
24   - "# %load_ext line_profiler"
  22 + "from tools import *\n",
  23 + "from constants import *"
25 24 ]
26 25 },
27 26 {
28 27  
... ... @@ -33,65 +32,12 @@
33 32 },
34 33 {
35 34 "cell_type": "code",
36   - "execution_count": 143,
  35 + "execution_count": 162,
37 36 "metadata": {},
38 37 "outputs": [],
39 38 "source": [
40 39 "# disable the pandas chained-assignment (SettingWithCopy) warning\n",
41   - "pd.options.mode.chained_assignment = None\n",
42   - "\n",
43   - "# convert a datetime object to a date object\n",
44   - "def get_date(x):\n",
45   - " return date(x.year, x.month, x.day)\n",
46   - "\n",
47   - "# convert a datetime object to an integer, which denotes the number of minutes since midnight\n",
48   - "def get_minute_index(x):\n",
49   - " return (x.hour * 60) + x.minute\n",
50   - "\n",
51   - "# return a range of dates\n",
52   - "def date_range(start_date, end_date):\n",
53   - " delta = end_date - start_date\n",
54   - "\n",
55   - " for i in range(delta.days + 1):\n",
56   - " yield start_date + timedelta(days=i)\n",
57   - "\n",
58   - "# define an iterative walk calculation (merging consecutive active minutes)\n",
59   - "def calculate_walk(cv):\n",
60   - " nv = cv.copy(deep=True)\n",
61   - " nv[\"prev_minute_index\"] = nv[\"local_minute_index\"] - 1\n",
62   - "\n",
63   - " # move midnight minutes to previous day\n",
64   - " nv[nv[\"prev_minute_index\"] < 0][\"local_date\"] -= timedelta(days=1)\n",
65   - " nv[nv[\"prev_minute_index\"] < 0][\"prev_minute_index\"] = 1439\n",
66   - " \n",
67   - " nv = nv[[\"user\", \"local_date\", \"prev_minute_index\"]]\n",
68   - " jv = cv.merge(nv, left_on=[\"user\", \"local_date\", \"local_minute_index\"], right_on=[\"user\", \"local_date\", \"prev_minute_index\"], how=\"inner\")\n",
69   - " jv[\"add_count\"] += 1\n",
70   - " jv = jv[[\"user\", \"local_date\", \"local_minute_index\", \"add_count\"]]\n",
71   - "\n",
72   - " return jv \n",
73   - "\n",
74   - "# generate complete product of vectors\n",
75   - "def product_df(mat1, mat2):\n",
76   - " mat1 = mat1.drop_duplicates()\n",
77   - " mat2 = mat2.drop_duplicates()\n",
78   - "\n",
79   - " temp = pd.DataFrame(list(product(mat1.values, mat2.values)))\n",
80   - " for i, acol in enumerate(mat1.columns):\n",
81   - " temp[acol] = temp[0].apply(lambda x: x[i])\n",
82   - " for i, acol in enumerate(mat2.columns):\n",
83   - " temp[acol] = temp[1].apply(lambda x: x[i])\n",
84   - " temp = temp.drop(columns=[0, 1])\n",
85   - " return temp\n",
86   - "\n",
87   - "# cut off values that are not in the range of the data\n",
88   - "THRESHOLD_OF_DAYS_PER_USER = 10\n",
89   - "\n",
90   - "# cut off values for the number of consecutive minutes for a walk\n",
91   - "MINIMUM_NUMBER_OF_MINUTES_FOR_A_WALK = 5\n",
92   - "\n",
93   - "# cut off values for the number of steps per minute for an active minute\n",
94   - "MINIMUM_STEPS_PER_MINUTE = 60\n"
  40 + "pd.options.mode.chained_assignment = None"
95 41 ]
96 42 },
97 43 {
... ... @@ -103,7 +49,7 @@
103 49 },
104 50 {
105 51 "cell_type": "code",
106   - "execution_count": 144,
  52 + "execution_count": 163,
107 53 "metadata": {},
108 54 "outputs": [],
109 55 "source": [
... ... @@ -124,7 +70,7 @@
124 70 },
125 71 {
126 72 "cell_type": "code",
127   - "execution_count": 145,
  73 + "execution_count": 164,
128 74 "metadata": {},
129 75 "outputs": [],
130 76 "source": [
... ... @@ -173,7 +119,7 @@
173 119 },
174 120 {
175 121 "cell_type": "code",
176   - "execution_count": 146,
  122 + "execution_count": 165,
177 123 "metadata": {},
178 124 "outputs": [],
179 125 "source": [
... ... @@ -190,7 +136,7 @@
190 136 },
191 137 {
192 138 "cell_type": "code",
193   - "execution_count": 147,
  139 + "execution_count": 166,
194 140 "metadata": {},
195 141 "outputs": [
196 142 {
... ... @@ -257,7 +203,7 @@
257 203 },
258 204 {
259 205 "cell_type": "code",
260   - "execution_count": 148,
  206 + "execution_count": 167,
261 207 "metadata": {},
262 208 "outputs": [
263 209 {
... ... @@ -300,7 +246,7 @@
300 246 },
301 247 {
302 248 "cell_type": "code",
303   - "execution_count": 149,
  249 + "execution_count": 168,
304 250 "metadata": {},
305 251 "outputs": [],
306 252 "source": [
... ... @@ -346,7 +292,7 @@
346 292 },
347 293 {
348 294 "cell_type": "code",
349   - "execution_count": 150,
  295 + "execution_count": 169,
350 296 "metadata": {},
351 297 "outputs": [],
352 298 "source": [
353 299  
354 300  
... ... @@ -380,13 +326,25 @@
380 326 },
381 327 {
382 328 "cell_type": "code",
383   - "execution_count": 151,
  329 + "execution_count": 170,
384 330 "metadata": {},
385 331 "outputs": [],
386 332 "source": [
  333 + "# converting data type\n",
  334 + "padded_hours[\"user\"] = padded_hours[\"user\"].astype(int)\n",
  335 + "padded_hours[\"hour\"] = padded_hours[\"hour\"].astype(int)\n",
  336 + "padded_hours[\"walked\"] = padded_hours[\"walked\"].astype(int)\n",
  337 + "\n",
  338 + "padded_threehours[\"user\"] = padded_threehours[\"user\"].astype(int)\n",
  339 + "padded_threehours[\"threehour\"] = padded_threehours[\"threehour\"].astype(int)\n",
  340 + "padded_threehours[\"walked\"] = padded_threehours[\"walked\"].astype(int)\n",
  341 + "\n",
387 342 "# save the data\n",
388 343 "padded_hours.to_csv(os.path.join(data_dir, \"padded_hours.csv\"), index=False)\n",
389   - "padded_threehours.to_csv(os.path.join(data_dir, \"padded_threehours.csv\"), index=False)"
  344 + "padded_threehours.to_csv(os.path.join(data_dir, \"padded_threehours.csv\"), index=False)\n",
  345 + "\n",
  346 + "padded_hours.to_pickle(os.path.join(data_dir, \"padded_hours.pkl\"))\n",
  347 + "padded_threehours.to_pickle(os.path.join(data_dir, \"padded_threehours.pkl\"))"
390 348 ]
391 349 }
392 350 ],
python-notebook/prepare_trteva_data.ipynb View file @ 979a88c
  1 +{
  2 + "cells": [
  3 + {
  4 + "cell_type": "code",
  5 + "execution_count": 1,
  6 + "metadata": {},
  7 + "outputs": [],
  8 + "source": [
  9 + "import numpy as np\n",
  10 + "import pandas as pd\n",
  11 + "import os\n",
  12 + "from tools import *\n",
  13 + "from constants import *\n",
  14 + "from tensorflow.keras.utils import to_categorical"
  15 + ]
  16 + },
  17 + {
  18 + "cell_type": "markdown",
  19 + "metadata": {},
  20 + "source": [
  21 + "# Prepare Training, Testing, and Validation Data\n",
  22 + "## Loading the preprocessed data"
  23 + ]
  24 + },
  25 + {
  26 + "cell_type": "code",
  27 + "execution_count": 2,
  28 + "metadata": {},
  29 + "outputs": [],
  30 + "source": [
  31 + "# disable the pandas chained-assignment (SettingWithCopy) warning\n",
  32 + "pd.options.mode.chained_assignment = None\n",
  33 + "\n",
  34 + "data_dir = '../data'\n",
  35 + "\n",
  36 + "padded_hours = pd.read_pickle(os.path.join(data_dir, 'padded_hours.pkl'))\n",
  37 + "padded_threehours = pd.read_pickle(os.path.join(data_dir, 'padded_threehours.pkl'))"
  38 + ]
  39 + },
  40 + {
  41 + "cell_type": "markdown",
  42 + "metadata": {},
  43 + "source": [
  44 + "## Enumerating Output Data"
  45 + ]
  46 + },
  47 + {
  48 + "cell_type": "code",
  49 + "execution_count": 3,
  50 + "metadata": {},
  51 + "outputs": [],
  52 + "source": [
  53 + "# return output value\n",
  54 + "def get_output(y):\n",
  55 + " return y[\"walked\"]\n",
  56 + "\n",
  57 + "# return input value\n",
  58 + "def get_input(y, padded_hours):\n",
  59 + " # base information\n",
  60 + " user = y[\"user\"]\n",
  61 + " local_date = y[\"local_date\"]\n",
  62 + " threehour_idx = y[\"threehour\"]\n",
  63 + " \n",
  64 + " # derived information\n",
  65 + " hour_idx = threehour_idx * 3\n",
  66 + " encoded_hour_idx = to_categorical(hour_idx, num_classes=24)\n",
  67 + " end_date = local_date - timedelta(days=1)\n",
  68 + " start_date = end_date - timedelta(days=7*NUMBER_OF_WEEKS_FOR_LOOKING_BACK-1)\n",
  69 + " weekday = local_date.weekday()\n",
  70 + " encoded_weekday = to_categorical(weekday, num_classes=7)\n",
  71 + " encoded_month = to_categorical(local_date.month, num_classes=12)\n",
  72 + " encoded_day_of_month = to_categorical(local_date.day, num_classes=31)\n",
  73 + "\n",
  74 + " gait = pd.Series([], dtype=int)\n",
  75 + " # gait movement\n",
  76 + " zero_move = 0\n",
  77 + " for a_date in date_range(start_date, end_date):\n",
  78 + " day_df = padded_hours[(padded_hours[\"user\"] == user) & (padded_hours[\"local_date\"] == a_date)]\n",
  79 + " if day_df.size == 0:\n",
  80 + " gait = pd.concat([gait, pd.Series([1,0,0] * 24, dtype=int)])\n",
  81 + " zero_move += 1\n",
  82 + " else:\n",
  83 + " gait = pd.concat([gait, pd.Series(to_categorical(day_df[\"walked\"].values, 3, dtype=int).reshape(24*3), dtype=int)])\n",
  84 + " if zero_move == 5 * 7:\n",
  85 + " raise Exception(\"No movement data\")\n",
  86 + "\n",
  87 + " return_series = pd.Series([], dtype=int)\n",
  88 + " return_series = pd.concat([return_series, pd.Series(encoded_hour_idx, dtype=np.int_)])\n",
  89 + " return_series = pd.concat([return_series, pd.Series(encoded_weekday, dtype=np.int_)])\n",
  90 + " return_series = pd.concat([return_series, pd.Series(encoded_month, dtype=np.int_)])\n",
  91 + " return_series = pd.concat([return_series, pd.Series(encoded_day_of_month, dtype=np.int_)])\n",
  92 + " return_series = pd.concat([return_series, gait])\n",
  93 + " \n",
  94 + " return return_series\n",
  95 + "\n",
  96 + "def get_database(start_idx, end_idx):\n",
  97 + " database = pd.DataFrame({}, dtype=int)\n",
  98 + "\n",
  99 + " for i in range(start_idx, end_idx):\n",
  100 + " try:\n",
  101 + " y = padded_threehours.iloc[i, :]\n",
  102 + " user = y[\"user\"]\n",
  103 + " local_date = y[\"local_date\"]\n",
  104 + " first_day = padded_hours[padded_hours[\"user\"] == user][\"local_date\"].min()\n",
  105 + " date_diff = (local_date - first_day).days\n",
  106 + "\n",
  107 + " threehour_idx = y[\"threehour\"]\n",
  108 + " hour_idx = threehour_idx * 3\n",
  109 + "\n",
  110 + " output = get_output(y)\n",
  111 + " input = get_input(y, padded_hours)\n",
  112 + "\n",
  113 + " temp_series = pd.Series([], dtype=int)\n",
  114 + " temp_series = pd.concat([temp_series, pd.Series(user, dtype=int)])\n",
  115 + " temp_series = pd.concat([temp_series, pd.Series(date_diff, dtype=int)])\n",
  116 + " temp_series = pd.concat([temp_series, pd.Series(threehour_idx, dtype=int)])\n",
  117 + " temp_series = pd.concat([temp_series, pd.Series(hour_idx, dtype=int)])\n",
  118 + " temp_series = pd.concat([temp_series, pd.Series(output, dtype=int)])\n",
  119 + " temp_series = pd.concat([temp_series, pd.Series(input, dtype=int)])\n",
  120 + "\n",
  121 + " database = pd.concat([database, temp_series], axis=1)\n",
  122 + " # print(input)\n",
  123 + " except Exception as e:\n",
  124 + " # print(\"Error:\", e)\n",
  125 + " pass\n",
  126 + "\n",
  127 + " return database\n",
  128 + "\n"
  129 + ]
  130 + },
  131 + {
  132 + "cell_type": "code",
  133 + "execution_count": 4,
  134 + "metadata": {},
  135 + "outputs": [],
  136 + "source": [
  137 + "from tensorflow.keras.datasets import mnist\n",
  138 + "from tensorflow.keras.models import Sequential\n",
  139 + "from tensorflow.keras.layers import Dense, Dropout, Flatten, Activation\n",
  140 + "\n",
  141 + "\n",
  142 + "(x_train, y_train), (x_test, y_test) = mnist.load_data(path='mnist.npz')\n",
  143 + "\n",
  144 + "X_train = x_train.reshape(60000, 784).astype('float32') / 255\n",
  145 + "X_test = x_test.reshape(10000, 784).astype('float32') / 255\n",
  146 + "\n",
  147 + "Y_train = to_categorical(y_train, 10)\n",
  148 + "Y_test = to_categorical(y_test, 10)"
  149 + ]
  150 + },
  151 + {
  152 + "cell_type": "code",
  153 + "execution_count": null,
  154 + "metadata": {},
  155 + "outputs": [],
  156 + "source": []
  157 + }
  158 + ],
  159 + "metadata": {
  160 + "interpreter": {
  161 + "hash": "80dbe1014b4652684caa329d41db00af3ae439be86b11eab7e35b518e5d8ab1a"
  162 + },
  163 + "kernelspec": {
  164 + "display_name": "Python 3.7.9 64-bit ('venv': venv)",
  165 + "language": "python",
  166 + "name": "python3"
  167 + },
  168 + "language_info": {
  169 + "codemirror_mode": {
  170 + "name": "ipython",
  171 + "version": 3
  172 + },
  173 + "file_extension": ".py",
  174 + "mimetype": "text/x-python",
  175 + "name": "python",
  176 + "nbconvert_exporter": "python",
  177 + "pygments_lexer": "ipython3",
  178 + "version": "3.7.9"
  179 + },
  180 + "orig_nbformat": 4
  181 + },
  182 + "nbformat": 4,
  183 + "nbformat_minor": 2
  184 +}
python-notebook/tools.py View file @ 979a88c
  1 +import pandas as pd
  2 +
  3 +from itertools import product
  4 +from datetime import date, timedelta
  5 +
  6 +
def get_date(x):
    """Return the calendar day of ``x`` as a ``datetime.date``.

    ``x`` is any object exposing ``year``, ``month`` and ``day`` attributes
    (for example a ``datetime``); nothing else is read from it.
    """
    return date(year=x.year, month=x.month, day=x.day)
  10 +
def get_minute_index(x):
    """Return the number of whole minutes between midnight and ``x``.

    Only the ``hour`` and ``minute`` attributes of ``x`` are used, so for a
    ``datetime`` the result lies in ``0..1439``.
    """
    minutes_in_full_hours = x.hour * 60
    return minutes_in_full_hours + x.minute
  14 +
def date_range(start_date, end_date):
    """Yield every day from ``start_date`` through ``end_date``, inclusive.

    The days are produced lazily in ascending order. Nothing is yielded when
    ``end_date`` lies before ``start_date``.
    """
    last_offset = (end_date - start_date).days

    offset = 0
    while offset <= last_offset:
        yield start_date + timedelta(days=offset)
        offset += 1
  21 +
def calculate_walk(cv):
    """Perform one merge step of the iterative walk calculation.

    Every row of ``cv`` whose immediately following minute (same ``user``)
    is also present in ``cv`` is returned with its ``add_count`` increased
    by one. Minute 1439 of one day is followed by minute 0 of the next day,
    so runs that cross midnight are merged as well.

    Parameters
    ----------
    cv : pandas.DataFrame
        Must contain the columns ``user``, ``local_date``,
        ``local_minute_index`` and ``add_count``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``cv`` restricted to the columns ``user``,
        ``local_date``, ``local_minute_index`` and ``add_count``.
    """
    nv = cv.copy(deep=True)
    nv["prev_minute_index"] = nv["local_minute_index"] - 1

    # Move midnight minutes to the previous day: the minute before 00:00 is
    # 23:59 (index 1439) of the day before.  This must go through ``.loc``;
    # ``nv[mask][col] = ...`` only writes to a temporary copy and leaves
    # ``nv`` untouched.
    crosses_midnight = nv["prev_minute_index"] < 0
    if crosses_midnight.any():
        nv.loc[crosses_midnight, "local_date"] = (
            nv.loc[crosses_midnight, "local_date"] - timedelta(days=1)
        )
        nv.loc[crosses_midnight, "prev_minute_index"] = 1439

    nv = nv[["user", "local_date", "prev_minute_index"]]
    jv = cv.merge(
        nv,
        left_on=["user", "local_date", "local_minute_index"],
        right_on=["user", "local_date", "prev_minute_index"],
        how="inner",
    )
    jv["add_count"] += 1
    jv = jv[["user", "local_date", "local_minute_index", "add_count"]]

    return jv
  37 +
def product_df(mat1, mat2):
    """Return the Cartesian product of the distinct rows of two DataFrames.

    Duplicate rows are dropped from each input first. The result then holds
    one row per (row of ``mat1``, row of ``mat2``) pair, with the ``mat1``
    row varying slowest, the columns of ``mat1`` followed by those of
    ``mat2``, and a fresh ``0..n-1`` index. If both inputs share a column
    name, the values from ``mat2`` are kept under that name.

    Rows are selected by position instead of being rebuilt from ``.values``,
    so each column keeps its original dtype and empty inputs yield an empty
    frame with the expected columns rather than raising ``KeyError``.

    Parameters
    ----------
    mat1, mat2 : pandas.DataFrame
        The frames to combine. Neither is modified.

    Returns
    -------
    pandas.DataFrame
        The combined frame described above.
    """
    left = mat1.drop_duplicates()
    right = mat2.drop_duplicates()
    n_left, n_right = len(left), len(right)

    # Positions of the rows to pick: each left row is repeated once per right
    # row, and the block of right rows is repeated once per left row.
    left_positions = [i for i in range(n_left) for _ in range(n_right)]
    right_positions = list(range(n_right)) * n_left

    result = left.iloc[left_positions].reset_index(drop=True)
    repeated_right = right.iloc[right_positions].reset_index(drop=True)
    for col in repeated_right.columns:
        result[col] = repeated_right[col]
    return result