Commit 979a88c36e8e832d4a38b058c355919d741dc010

Authored by Junghwan Park
1 parent aa77698374
Exists in main

picking up the input/output data values

Showing 6 changed files with 270 additions and 68 deletions Inline Diff

python-notebook/__pycache__/constants.cpython-37.pyc View file @ 979a88c

No preview for this file type

python-notebook/__pycache__/tools.cpython-37.pyc View file @ 979a88c

No preview for this file type

python-notebook/constants.py View file @ 979a88c
File was created 1 # minimum number of distinct days of data a user must have to be kept
2 THRESHOLD_OF_DAYS_PER_USER = 10
3
4 # cut off values for the number of consecutive minutes for a walk
5 MINIMUM_NUMBER_OF_MINUTES_FOR_A_WALK = 5
6
7 # step-count cutoff: a minute is active when its steps exceed this value
8 MINIMUM_STEPS_PER_MINUTE = 60
python-notebook/data_loading.ipynb View file @ 979a88c
{ 1 1 {
"cells": [ 2 2 "cells": [
{ 3 3 {
"cell_type": "markdown", 4 4 "cell_type": "markdown",
"metadata": {}, 5 5 "metadata": {},
"source": [ 6 6 "source": [
"# Loading libraries" 7 7 "# Loading libraries"
] 8 8 ]
}, 9 9 },
{ 10 10 {
"cell_type": "code", 11 11 "cell_type": "code",
"execution_count": 142, 12 12 "execution_count": 161,
"metadata": {}, 13 13 "metadata": {},
"outputs": [], 14 14 "outputs": [],
"source": [ 15 15 "source": [
"import numpy as np\n", 16 16 "import numpy as np\n",
"import matplotlib.pyplot as plt\n", 17 17 "import matplotlib.pyplot as plt\n",
"import seaborn as sns\n", 18 18 "import seaborn as sns\n",
"from pandas import read_csv\n", 19 19 "from pandas import read_csv\n",
"import pandas as pd\n", 20 20 "import pandas as pd\n",
"import os\n", 21 21 "import os\n",
"from datetime import datetime, date, timedelta\n", 22 22 "from tools import *\n",
"from itertools import product\n", 23 23 "from constants import *"
"# %load_ext line_profiler" 24
] 25 24 ]
}, 26 25 },
{ 27 26 {
"cell_type": "markdown", 28 27 "cell_type": "markdown",
"metadata": {}, 29 28 "metadata": {},
"source": [ 30 29 "source": [
"# Defining Functions and Adjusting Settings" 31 30 "# Defining Functions and Adjusting Settings"
] 32 31 ]
}, 33 32 },
{ 34 33 {
"cell_type": "code", 35 34 "cell_type": "code",
"execution_count": 143, 36 35 "execution_count": 162,
"metadata": {}, 37 36 "metadata": {},
"outputs": [], 38 37 "outputs": [],
"source": [ 39 38 "source": [
"# disable pandas' chained-assignment (SettingWithCopy) warning\n", 40 39 "# disable pandas' chained-assignment (SettingWithCopy) warning\n",
"pd.options.mode.chained_assignment = None\n", 41 40 "pd.options.mode.chained_assignment = None"
"\n", 42
"# convert a datetime object to a date object\n", 43
"def get_date(x):\n", 44
" return date(x.year, x.month, x.day)\n", 45
"\n", 46
"# convert a datetime object to an integer, which denotes the number of minutes since midnight\n", 47
"def get_minute_index(x):\n", 48
" return (x.hour * 60) + x.minute\n", 49
"\n", 50
"# return a range of dates\n", 51
"def date_range(start_date, end_date):\n", 52
" delta = end_date - start_date\n", 53
"\n", 54
" for i in range(delta.days + 1):\n", 55
" yield start_date + timedelta(days=i)\n", 56
"\n", 57
"# define an iterative walk calculation (merging consecutive active minutes)\n", 58
"def calculate_walk(cv):\n", 59
" nv = cv.copy(deep=True)\n", 60
" nv[\"prev_minute_index\"] = nv[\"local_minute_index\"] - 1\n", 61
"\n", 62
" # move midnight minutes to previous day\n", 63
" nv[nv[\"prev_minute_index\"] < 0][\"local_date\"] -= timedelta(days=1)\n", 64
" nv[nv[\"prev_minute_index\"] < 0][\"prev_minute_index\"] = 1439\n", 65
" \n", 66
" nv = nv[[\"user\", \"local_date\", \"prev_minute_index\"]]\n", 67
" jv = cv.merge(nv, left_on=[\"user\", \"local_date\", \"local_minute_index\"], right_on=[\"user\", \"local_date\", \"prev_minute_index\"], how=\"inner\")\n", 68
" jv[\"add_count\"] += 1\n", 69
" jv = jv[[\"user\", \"local_date\", \"local_minute_index\", \"add_count\"]]\n", 70
"\n", 71
" return jv \n", 72
"\n", 73
"# generate complete product of vectors\n", 74
"def product_df(mat1, mat2):\n", 75
" mat1 = mat1.drop_duplicates()\n", 76
" mat2 = mat2.drop_duplicates()\n", 77
"\n", 78
" temp = pd.DataFrame(list(product(mat1.values, mat2.values)))\n", 79
" for i, acol in enumerate(mat1.columns):\n", 80
" temp[acol] = temp[0].apply(lambda x: x[i])\n", 81
" for i, acol in enumerate(mat2.columns):\n", 82
" temp[acol] = temp[1].apply(lambda x: x[i])\n", 83
" temp = temp.drop(columns=[0, 1])\n", 84
" return temp\n", 85
"\n", 86
"# cut off values that are not in the range of the data\n", 87
"THRESHOLD_OF_DAYS_PER_USER = 10\n", 88
"\n", 89
"# cut off values for the number of consecutive minutes for a walk\n", 90
"MINIMUM_NUMBER_OF_MINUTES_FOR_A_WALK = 5\n", 91
"\n", 92
"# cut off values for the number of steps per minute for an active minute\n", 93
"MINIMUM_STEPS_PER_MINUTE = 60\n" 94
] 95 41 ]
}, 96 42 },
{ 97 43 {
"cell_type": "markdown", 98 44 "cell_type": "markdown",
"metadata": {}, 99 45 "metadata": {},
"source": [ 100 46 "source": [
"# Loading data files" 101 47 "# Loading data files"
] 102 48 ]
}, 103 49 },
{ 104 50 {
"cell_type": "code", 105 51 "cell_type": "code",
"execution_count": 144, 106 52 "execution_count": 163,
"metadata": {}, 107 53 "metadata": {},
"outputs": [], 108 54 "outputs": [],
"source": [ 109 55 "source": [
"data_dir = '../data'\n", 110 56 "data_dir = '../data'\n",
"\n", 111 57 "\n",
"daily = read_csv(os.path.join(data_dir, 'daily.csv'))\n", 112 58 "daily = read_csv(os.path.join(data_dir, 'daily.csv'))\n",
"dose = read_csv(os.path.join(data_dir, 'dose.csv'))\n", 113 59 "dose = read_csv(os.path.join(data_dir, 'dose.csv'))\n",
"jawbone = read_csv(os.path.join(data_dir, 'jawbone.csv'), low_memory=False)\n" 114 60 "jawbone = read_csv(os.path.join(data_dir, 'jawbone.csv'), low_memory=False)\n"
] 115 61 ]
}, 116 62 },
{ 117 63 {
"cell_type": "markdown", 118 64 "cell_type": "markdown",
"metadata": {}, 119 65 "metadata": {},
"source": [ 120 66 "source": [
"# Preprocessing\n", 121 67 "# Preprocessing\n",
"## Picking up the variables" 122 68 "## Picking up the variables"
] 123 69 ]
}, 124 70 },
{ 125 71 {
"cell_type": "code", 126 72 "cell_type": "code",
"execution_count": 145, 127 73 "execution_count": 164,
"metadata": {}, 128 74 "metadata": {},
"outputs": [], 129 75 "outputs": [],
"source": [ 130 76 "source": [
"# Column names of jawbone data\n", 131 77 "# Column names of jawbone data\n",
"# 'Var1', 'user', 'start_datetime', 'end_datetime', 'timezone', 'userid',\n", 132 78 "# 'Var1', 'user', 'start_datetime', 'end_datetime', 'timezone', 'userid',\n",
"# 'steps', 'gmtoff', 'tz', 'start_date', 'end_date', 'start_utime',\n", 133 79 "# 'steps', 'gmtoff', 'tz', 'start_date', 'end_date', 'start_utime',\n",
"# 'end_utime', 'start_udate', 'end_udate', 'intake_date', 'intake_utime',\n", 134 80 "# 'end_utime', 'start_udate', 'end_udate', 'intake_date', 'intake_utime',\n",
"# 'intake_tz', 'intake_gmtoff', 'intake_hour', 'intake_min',\n", 135 81 "# 'intake_tz', 'intake_gmtoff', 'intake_hour', 'intake_min',\n",
"# 'intake_slot', 'travel_start', 'travel_end', 'exit_date',\n", 136 82 "# 'intake_slot', 'travel_start', 'travel_end', 'exit_date',\n",
"# 'dropout_date', 'last_date', 'last_utime', 'last_tz', 'last_gmtoff',\n", 137 83 "# 'dropout_date', 'last_date', 'last_utime', 'last_tz', 'last_gmtoff',\n",
"# 'last_hour', 'last_min', 'start_utime_local', 'end_utime_local'\n", 138 84 "# 'last_hour', 'last_min', 'start_utime_local', 'end_utime_local'\n",
"\n", 139 85 "\n",
"\n", 140 86 "\n",
"# duplicate jawbone data\n", 141 87 "# duplicate jawbone data\n",
"jawbone2 = jawbone.copy(deep=True)\n", 142 88 "jawbone2 = jawbone.copy(deep=True)\n",
"\n", 143 89 "\n",
"# convert string datetimes to actual datetime objects\n", 144 90 "# convert string datetimes to actual datetime objects\n",
"jawbone2[\"start_utime_local\"] = pd.to_datetime(\n", 145 91 "jawbone2[\"start_utime_local\"] = pd.to_datetime(\n",
" jawbone2[\"start_utime_local\"], format=\"%Y-%m-%d %H:%M:%S\")\n", 146 92 " jawbone2[\"start_utime_local\"], format=\"%Y-%m-%d %H:%M:%S\")\n",
"jawbone2[\"start_datetime\"] = pd.to_datetime(\n", 147 93 "jawbone2[\"start_datetime\"] = pd.to_datetime(\n",
" jawbone2[\"start_datetime\"], format=\"%Y-%m-%d %H:%M:%S\")\n", 148 94 " jawbone2[\"start_datetime\"], format=\"%Y-%m-%d %H:%M:%S\")\n",
"\n", 149 95 "\n",
"# calculate the timezone offset\n", 150 96 "# calculate the timezone offset\n",
"jawbone2[\"tz_offset\"] = jawbone2[\"start_datetime\"] - \\\n", 151 97 "jawbone2[\"tz_offset\"] = jawbone2[\"start_datetime\"] - \\\n",
" jawbone2[\"start_utime_local\"]\n", 152 98 " jawbone2[\"start_utime_local\"]\n",
"\n", 153 99 "\n",
"\n", 154 100 "\n",
"# selecting only important columns\n", 155 101 "# selecting only important columns\n",
"jawbone3 = jawbone2[[\"user\", \"start_utime_local\",\n", 156 102 "jawbone3 = jawbone2[[\"user\", \"start_utime_local\",\n",
" \"end_utime_local\", \"tz_offset\", \"steps\"]]\n", 157 103 " \"end_utime_local\", \"tz_offset\", \"steps\"]]\n",
"\n", 158 104 "\n",
"# picking up the local date\n", 159 105 "# picking up the local date\n",
"jawbone3[\"local_date\"] = jawbone3[\"start_utime_local\"].apply(get_date)\n", 160 106 "jawbone3[\"local_date\"] = jawbone3[\"start_utime_local\"].apply(get_date)\n",
"\n", 161 107 "\n",
"# picking up the local minute index\n", 162 108 "# picking up the local minute index\n",
"jawbone3[\"local_minute_index\"] = jawbone3[\"start_utime_local\"].apply(\n", 163 109 "jawbone3[\"local_minute_index\"] = jawbone3[\"start_utime_local\"].apply(\n",
" get_minute_index)\n" 164 110 " get_minute_index)\n"
] 165 111 ]
}, 166 112 },
{ 167 113 {
"cell_type": "markdown", 168 114 "cell_type": "markdown",
"metadata": {}, 169 115 "metadata": {},
"source": [ 170 116 "source": [
"## Making a key info database" 171 117 "## Making a key info database"
] 172 118 ]
}, 173 119 },
{ 174 120 {
"cell_type": "code", 175 121 "cell_type": "code",
"execution_count": 146, 176 122 "execution_count": 165,
"metadata": {}, 177 123 "metadata": {},
"outputs": [], 178 124 "outputs": [],
"source": [ 179 125 "source": [
"# picking up the user - date data\n", 180 126 "# picking up the user - date data\n",
"user_date = jawbone3[[\"user\", \"local_date\"]].drop_duplicates()" 181 127 "user_date = jawbone3[[\"user\", \"local_date\"]].drop_duplicates()"
] 182 128 ]
}, 183 129 },
{ 184 130 {
"cell_type": "markdown", 185 131 "cell_type": "markdown",
"metadata": {}, 186 132 "metadata": {},
"source": [ 187 133 "source": [
"## Removing users with too small amount of data" 188 134 "## Removing users with too small amount of data"
] 189 135 ]
}, 190 136 },
{ 191 137 {
"cell_type": "code", 192 138 "cell_type": "code",
"execution_count": 147, 193 139 "execution_count": 166,
"metadata": {}, 194 140 "metadata": {},
"outputs": [ 195 141 "outputs": [
{ 196 142 {
"name": "stdout", 197 143 "name": "stdout",
"output_type": "stream", 198 144 "output_type": "stream",
"text": [ 199 145 "text": [
"Threshold: 10\n", 200 146 "Threshold: 10\n",
"Users to be removed:[12, 36, 38]\n", 201 147 "Users to be removed:[12, 36, 38]\n",
"Shape Change: 258889 -> 258363 (-526, -0.2%)\n" 202 148 "Shape Change: 258889 -> 258363 (-526, -0.2%)\n"
] 203 149 ]
}, 204 150 },
{ 205 151 {
"data": { 206 152 "data": {
"image/png": "", 207 153 "image/png": "",
"text/plain": [ 208 154 "text/plain": [
"<Figure size 432x288 with 1 Axes>" 209 155 "<Figure size 432x288 with 1 Axes>"
] 210 156 ]
}, 211 157 },
"metadata": {}, 212 158 "metadata": {},
"output_type": "display_data" 213 159 "output_type": "display_data"
} 214 160 }
], 215 161 ],
"source": [ 216 162 "source": [
"# making a stat of the number of days per user\n", 217 163 "# making a stat of the number of days per user\n",
"stat_user = user_date.groupby(['user'])['local_date'].nunique().sort_values()\n", 218 164 "stat_user = user_date.groupby(['user'])['local_date'].nunique().sort_values()\n",
"\n", 219 165 "\n",
"ax = plt.figure()\n", 220 166 "ax = plt.figure()\n",
"ax.patch.set_facecolor('white')\n", 221 167 "ax.patch.set_facecolor('white')\n",
"ax = sns.histplot(stat_user)\n", 222 168 "ax = sns.histplot(stat_user)\n",
"ax.set_title('Distribution of number of days per user')\n", 223 169 "ax.set_title('Distribution of number of days per user')\n",
"ax.set_xlabel('Number of days')\n", 224 170 "ax.set_xlabel('Number of days')\n",
"ax.set_ylabel('Frequency')\n", 225 171 "ax.set_ylabel('Frequency')\n",
"\n", 226 172 "\n",
"\n", 227 173 "\n",
"# filter out users that have less days of data than THRESHOLD_OF_DAYS_PER_USER\n", 228 174 "# filter out users that have less days of data than THRESHOLD_OF_DAYS_PER_USER\n",
"users_to_be_removed = stat_user[stat_user < THRESHOLD_OF_DAYS_PER_USER].index\n", 229 175 "users_to_be_removed = stat_user[stat_user < THRESHOLD_OF_DAYS_PER_USER].index\n",
"\n", 230 176 "\n",
"print(\"Threshold: {}\".format(THRESHOLD_OF_DAYS_PER_USER))\n", 231 177 "print(\"Threshold: {}\".format(THRESHOLD_OF_DAYS_PER_USER))\n",
"print(\"Users to be removed:{}\".format(list(users_to_be_removed)))\n", 232 178 "print(\"Users to be removed:{}\".format(list(users_to_be_removed)))\n",
"\n", 233 179 "\n",
"jawbone4 = jawbone3[~jawbone3[\"user\"].isin(users_to_be_removed)]\n", 234 180 "jawbone4 = jawbone3[~jawbone3[\"user\"].isin(users_to_be_removed)]\n",
"\n", 235 181 "\n",
"user_date2 = user_date[~user_date[\"user\"].isin(users_to_be_removed)]\n", 236 182 "user_date2 = user_date[~user_date[\"user\"].isin(users_to_be_removed)]\n",
"\n", 237 183 "\n",
"# printing the amount of data removed\n", 238 184 "# printing the amount of data removed\n",
"jawbone3_count, _ = jawbone3.shape\n", 239 185 "jawbone3_count, _ = jawbone3.shape\n",
"jawbone4_count, _ = jawbone4.shape\n", 240 186 "jawbone4_count, _ = jawbone4.shape\n",
"\n", 241 187 "\n",
"print(\"Shape Change: {} -> {} (-{}, -{}%)\".format(\n", 242 188 "print(\"Shape Change: {} -> {} (-{}, -{}%)\".format(\n",
" jawbone3_count, \n", 243 189 " jawbone3_count, \n",
" jawbone4_count, \n", 244 190 " jawbone4_count, \n",
" jawbone3_count - jawbone4_count, \n", 245 191 " jawbone3_count - jawbone4_count, \n",
" round((jawbone3_count - jawbone4_count) / jawbone3_count * 100, 2)\n", 246 192 " round((jawbone3_count - jawbone4_count) / jawbone3_count * 100, 2)\n",
" )\n", 247 193 " )\n",
")" 248 194 ")"
] 249 195 ]
}, 250 196 },
{ 251 197 {
"cell_type": "markdown", 252 198 "cell_type": "markdown",
"metadata": {}, 253 199 "metadata": {},
"source": [ 254 200 "source": [
"## Find consecutive minute walks" 255 201 "## Find consecutive minute walks"
] 256 202 ]
}, 257 203 },
{ 258 204 {
"cell_type": "code", 259 205 "cell_type": "code",
"execution_count": 148, 260 206 "execution_count": 167,
"metadata": {}, 261 207 "metadata": {},
"outputs": [ 262 208 "outputs": [
{ 263 209 {
"name": "stdout", 264 210 "name": "stdout",
"output_type": "stream", 265 211 "output_type": "stream",
"text": [ 266 212 "text": [
"Iteration: 0, length: 377396\n", 267 213 "Iteration: 0, length: 377396\n",
"Iteration: 1, length: 229752\n", 268 214 "Iteration: 1, length: 229752\n",
"Iteration: 2, length: 170648\n", 269 215 "Iteration: 2, length: 170648\n",
"Iteration: 3, length: 137484\n", 270 216 "Iteration: 3, length: 137484\n",
"Iteration: 4, length: 178268\n", 271 217 "Iteration: 4, length: 178268\n",
"Final, length: 94884\n" 272 218 "Final, length: 94884\n"
] 273 219 ]
} 274 220 }
], 275 221 ],
"source": [ 276 222 "source": [
"# prepare the data for the walk calculation\n", 277 223 "# prepare the data for the walk calculation\n",
"current_vector = jawbone4[[\"user\", \"local_date\", \"local_minute_index\", \"steps\"]]\n", 278 224 "current_vector = jawbone4[[\"user\", \"local_date\", \"local_minute_index\", \"steps\"]]\n",
"current_vector[\"add_count\"] = 1\n", 279 225 "current_vector[\"add_count\"] = 1\n",
"current_vector = current_vector[current_vector[\"steps\"] > MINIMUM_STEPS_PER_MINUTE]\n", 280 226 "current_vector = current_vector[current_vector[\"steps\"] > MINIMUM_STEPS_PER_MINUTE]\n",
"current_vector = current_vector[[\"user\", \"local_date\", \"local_minute_index\", \"add_count\"]]\n", 281 227 "current_vector = current_vector[[\"user\", \"local_date\", \"local_minute_index\", \"add_count\"]]\n",
"\n", 282 228 "\n",
"# iteratively calculate the walk\n", 283 229 "# iteratively calculate the walk\n",
"for i in range(0, MINIMUM_NUMBER_OF_MINUTES_FOR_A_WALK):\n", 284 230 "for i in range(0, MINIMUM_NUMBER_OF_MINUTES_FOR_A_WALK):\n",
" print(\"Iteration: {}, length: {}\".format(i, current_vector.size))\n", 285 231 " print(\"Iteration: {}, length: {}\".format(i, current_vector.size))\n",
" new_vector = calculate_walk(current_vector)\n", 286 232 " new_vector = calculate_walk(current_vector)\n",
" current_vector = new_vector\n", 287 233 " current_vector = new_vector\n",
"\n", 288 234 "\n",
"print(\"Final, length: {}\".format(current_vector.size))\n", 289 235 "print(\"Final, length: {}\".format(current_vector.size))\n",
"\n", 290 236 "\n",
"consecutive_minutes = current_vector[[\"user\", \"local_date\", \"local_minute_index\"]].drop_duplicates()" 291 237 "consecutive_minutes = current_vector[[\"user\", \"local_date\", \"local_minute_index\"]].drop_duplicates()"
] 292 238 ]
}, 293 239 },
{ 294 240 {
"cell_type": "markdown", 295 241 "cell_type": "markdown",
"metadata": {}, 296 242 "metadata": {},
"source": [ 297 243 "source": [
"## Map consecutive minutes to 1hr and 3hr units" 298 244 "## Map consecutive minutes to 1hr and 3hr units"
] 299 245 ]
}, 300 246 },
{ 301 247 {
"cell_type": "code", 302 248 "cell_type": "code",
"execution_count": 149, 303 249 "execution_count": 168,
"metadata": {}, 304 250 "metadata": {},
"outputs": [], 305 251 "outputs": [],
"source": [ 306 252 "source": [
"# calculate hour index and three hour index\n", 307 253 "# calculate hour index and three hour index\n",
"consecutive_minutes[\"hour\"] = np.int_(np.floor(consecutive_minutes[\"local_minute_index\"] / 60))\n", 308 254 "consecutive_minutes[\"hour\"] = np.int_(np.floor(consecutive_minutes[\"local_minute_index\"] / 60))\n",
"consecutive_minutes[\"threehour\"] = np.int_(np.floor(consecutive_minutes[\"hour\"] / 3))\n", 309 255 "consecutive_minutes[\"threehour\"] = np.int_(np.floor(consecutive_minutes[\"hour\"] / 3))\n",
"\n", 310 256 "\n",
"# calculate the number of walks per user, per hour\n", 311 257 "# calculate the number of walks per user, per hour\n",
"walk_by_hours = consecutive_minutes[[\"user\", \"local_date\", \"hour\"]].drop_duplicates().reset_index()\n", 312 258 "walk_by_hours = consecutive_minutes[[\"user\", \"local_date\", \"hour\"]].drop_duplicates().reset_index()\n",
"walk_by_hours[\"walked\"] = 2\n", 313 259 "walk_by_hours[\"walked\"] = 2\n",
"\n", 314 260 "\n",
"# calculate the number of walks per user, per three hour\n", 315 261 "# calculate the number of walks per user, per three hour\n",
"walk_by_threehours = consecutive_minutes[[\"user\", \"local_date\", \"threehour\"]].drop_duplicates().reset_index()\n", 316 262 "walk_by_threehours = consecutive_minutes[[\"user\", \"local_date\", \"threehour\"]].drop_duplicates().reset_index()\n",
"walk_by_threehours[\"walked\"] = 2\n", 317 263 "walk_by_threehours[\"walked\"] = 2\n",
"\n", 318 264 "\n",
"# generate hour vector and three hour vector\n", 319 265 "# generate hour vector and three hour vector\n",
"hours = pd.DataFrame({\"hour\": range(0,24)})\n", 320 266 "hours = pd.DataFrame({\"hour\": range(0,24)})\n",
"threehours = pd.DataFrame({\"threehour\": range(0, 8)})\n", 321 267 "threehours = pd.DataFrame({\"threehour\": range(0, 8)})\n",
"\n", 322 268 "\n",
"# generate complete product dataframe\n", 323 269 "# generate complete product dataframe\n",
"measured_hour = product_df(walk_by_hours[[\"user\", \"local_date\"]], hours[[\"hour\"]])\n", 324 270 "measured_hour = product_df(walk_by_hours[[\"user\", \"local_date\"]], hours[[\"hour\"]])\n",
"measured_threehour = product_df(walk_by_threehours[[\"user\", \"local_date\"]], threehours[[\"threehour\"]])\n", 325 271 "measured_threehour = product_df(walk_by_threehours[[\"user\", \"local_date\"]], threehours[[\"threehour\"]])\n",
"\n", 326 272 "\n",
"# pad the hourly walk data (fill in missing hours with 1s)\n", 327 273 "# pad the hourly walk data (fill in missing hours with 1s)\n",
"padded_hours = walk_by_hours.merge(measured_hour, on=[\"user\", \"local_date\", \"hour\"], how=\"right\")\n", 328 274 "padded_hours = walk_by_hours.merge(measured_hour, on=[\"user\", \"local_date\", \"hour\"], how=\"right\")\n",
"padded_hours = padded_hours[[\"user\", \"local_date\", \"hour\", \"walked\"]]\n", 329 275 "padded_hours = padded_hours[[\"user\", \"local_date\", \"hour\", \"walked\"]]\n",
"padded_hours = padded_hours.fillna(1)\n", 330 276 "padded_hours = padded_hours.fillna(1)\n",
"\n", 331 277 "\n",
"# pad the walk data with 3 hours unit (fill in missing hours with 1s)\n", 332 278 "# pad the walk data with 3 hours unit (fill in missing hours with 1s)\n",
"padded_threehours = walk_by_threehours.merge(measured_threehour, on=[\"user\", \"local_date\", \"threehour\"], how=\"right\")\n", 333 279 "padded_threehours = walk_by_threehours.merge(measured_threehour, on=[\"user\", \"local_date\", \"threehour\"], how=\"right\")\n",
"padded_threehours = padded_threehours[[\"user\", \"local_date\", \"threehour\", \"walked\"]]\n", 334 280 "padded_threehours = padded_threehours[[\"user\", \"local_date\", \"threehour\", \"walked\"]]\n",
"padded_threehours = padded_threehours.fillna(1)" 335 281 "padded_threehours = padded_threehours.fillna(1)"
] 336 282 ]
}, 337 283 },
{ 338 284 {
"cell_type": "markdown", 339 285 "cell_type": "markdown",
"metadata": {}, 340 286 "metadata": {},
"source": [ 341 287 "source": [
"## Pad unmeasured missing data with 0s\n", 342 288 "## Pad unmeasured missing data with 0s\n",
"\n", 343 289 "\n",
"For three hour window data, since it works as output vector, 0 padding is not done. Instead, missing output data is not used for training." 344 290 "For three hour window data, since it works as output vector, 0 padding is not done. Instead, missing output data is not used for training."
] 345 291 ]
}, 346 292 },
{ 347 293 {
"cell_type": "code", 348 294 "cell_type": "code",
"execution_count": 150, 349 295 "execution_count": 169,
"metadata": {}, 350 296 "metadata": {},
"outputs": [], 351 297 "outputs": [],
"source": [ 352 298 "source": [
"# generate start and end date for each user\n", 353 299 "# generate start and end date for each user\n",
"start_date = padded_hours.groupby([\"user\"])[\"local_date\"].min()\n", 354 300 "start_date = padded_hours.groupby([\"user\"])[\"local_date\"].min()\n",
"end_date = padded_hours.groupby([\"user\"])[\"local_date\"].max()\n", 355 301 "end_date = padded_hours.groupby([\"user\"])[\"local_date\"].max()\n",
"\n", 356 302 "\n",
"# generate the user list\n", 357 303 "# generate the user list\n",
"users = start_date.index\n", 358 304 "users = start_date.index\n",
"all_dates = pd.DataFrame({\"user\": [], \"local_date\": []})\n", 359 305 "all_dates = pd.DataFrame({\"user\": [], \"local_date\": []})\n",
"\n", 360 306 "\n",
"# generate the lists of dates between start and end date\n", 361 307 "# generate the lists of dates between start and end date\n",
"for userid in users:\n", 362 308 "for userid in users:\n",
" current_user_dates = pd.DataFrame({\"user\": userid, \"local_date\": list(date_range(start_date[userid], end_date[userid]))})\n", 363 309 " current_user_dates = pd.DataFrame({\"user\": userid, \"local_date\": list(date_range(start_date[userid], end_date[userid]))})\n",
" \n", 364 310 " \n",
" all_dates = pd.concat([all_dates, current_user_dates])\n", 365 311 " all_dates = pd.concat([all_dates, current_user_dates])\n",
"\n", 366 312 "\n",
"# generate the base vector for the padding\n", 367 313 "# generate the base vector for the padding\n",
"all_dates_hour = product_df(all_dates[[\"user\", \"local_date\"]], hours[[\"hour\"]])\n", 368 314 "all_dates_hour = product_df(all_dates[[\"user\", \"local_date\"]], hours[[\"hour\"]])\n",
"\n", 369 315 "\n",
"# final padded gait data\n", 370 316 "# final padded gait data\n",
"padded_hours = padded_hours.merge(all_dates_hour, on=[\"user\", \"local_date\", \"hour\"], how=\"right\").fillna(0)\n" 371 317 "padded_hours = padded_hours.merge(all_dates_hour, on=[\"user\", \"local_date\", \"hour\"], how=\"right\").fillna(0)\n"
] 372 318 ]
}, 373 319 },
{ 374 320 {
"cell_type": "markdown", 375 321 "cell_type": "markdown",
"metadata": {}, 376 322 "metadata": {},
"source": [ 377 323 "source": [
"# Saving Data" 378 324 "# Saving Data"
] 379 325 ]
}, 380 326 },
{ 381 327 {
"cell_type": "code", 382 328 "cell_type": "code",
"execution_count": 151, 383 329 "execution_count": 170,
"metadata": {}, 384 330 "metadata": {},
"outputs": [], 385 331 "outputs": [],
"source": [ 386 332 "source": [
333 "# converting data type\n",
334 "padded_hours[\"user\"] = padded_hours[\"user\"].astype(int)\n",
335 "padded_hours[\"hour\"] = padded_hours[\"hour\"].astype(int)\n",
336 "padded_hours[\"walked\"] = padded_hours[\"walked\"].astype(int)\n",
337 "\n",
338 "padded_threehours[\"user\"] = padded_threehours[\"user\"].astype(int)\n",
339 "padded_threehours[\"threehour\"] = padded_threehours[\"threehour\"].astype(int)\n",
340 "padded_threehours[\"walked\"] = padded_threehours[\"walked\"].astype(int)\n",
341 "\n",
"# save the data\n", 387 342 "# save the data\n",
"padded_hours.to_csv(os.path.join(data_dir, \"padded_hours.csv\"), index=False)\n", 388 343 "padded_hours.to_csv(os.path.join(data_dir, \"padded_hours.csv\"), index=False)\n",
"padded_threehours.to_csv(os.path.join(data_dir, \"padded_threehours.csv\"), index=False)" 389 344 "padded_threehours.to_csv(os.path.join(data_dir, \"padded_threehours.csv\"), index=False)\n",
345 "\n",
346 "padded_hours.to_pickle(os.path.join(data_dir, \"padded_hours.pkl\"))\n",
347 "padded_threehours.to_pickle(os.path.join(data_dir, \"padded_threehours.pkl\"))"
] 390 348 ]
} 391 349 }
], 392 350 ],
"metadata": { 393 351 "metadata": {
"interpreter": { 394 352 "interpreter": {
"hash": "80dbe1014b4652684caa329d41db00af3ae439be86b11eab7e35b518e5d8ab1a" 395 353 "hash": "80dbe1014b4652684caa329d41db00af3ae439be86b11eab7e35b518e5d8ab1a"
}, 396 354 },
"kernelspec": { 397 355 "kernelspec": {
"display_name": "Python 3.7.9 64-bit ('venv': venv)", 398 356 "display_name": "Python 3.7.9 64-bit ('venv': venv)",
"language": "python", 399 357 "language": "python",
"name": "python3" 400 358 "name": "python3"
}, 401 359 },
"language_info": { 402 360 "language_info": {
"codemirror_mode": { 403 361 "codemirror_mode": {
"name": "ipython", 404 362 "name": "ipython",
"version": 3 405 363 "version": 3
}, 406 364 },
"file_extension": ".py", 407 365 "file_extension": ".py",
"mimetype": "text/x-python", 408 366 "mimetype": "text/x-python",
"name": "python", 409 367 "name": "python",
"nbconvert_exporter": "python", 410 368 "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", 411 369 "pygments_lexer": "ipython3",
"version": "3.7.9" 412 370 "version": "3.7.9"
}, 413 371 },
"orig_nbformat": 4 414 372 "orig_nbformat": 4
}, 415 373 },
"nbformat": 4, 416 374 "nbformat": 4,
"nbformat_minor": 2 417 375 "nbformat_minor": 2
} 418 376 }
419 377
python-notebook/prepare_trteva_data.ipynb View file @ 979a88c
File was created 1 {
2 "cells": [
3 {
4 "cell_type": "code",
5 "execution_count": 1,
6 "metadata": {},
7 "outputs": [],
8 "source": [
9 "import numpy as np\n",
10 "import pandas as pd\n",
11 "import os\n",
12 "from tools import *\n",
13 "from constants import *\n",
14 "from tensorflow.keras.utils import to_categorical"
15 ]
16 },
17 {
18 "cell_type": "markdown",
19 "metadata": {},
20 "source": [
21 "# Prepare Training, Testing, and Validation Data\n",
22 "## Loading the preprocessed data"
23 ]
24 },
25 {
26 "cell_type": "code",
27 "execution_count": 2,
28 "metadata": {},
29 "outputs": [],
30 "source": [
31 "# disable pandas' chained-assignment (SettingWithCopy) warning\n",
32 "pd.options.mode.chained_assignment = None\n",
33 "\n",
34 "data_dir = '../data'\n",
35 "\n",
36 "padded_hours = pd.read_pickle(os.path.join(data_dir, 'padded_hours.pkl'))\n",
37 "padded_threehours = pd.read_pickle(os.path.join(data_dir, 'padded_threehours.pkl'))"
38 ]
39 },
40 {
41 "cell_type": "markdown",
42 "metadata": {},
43 "source": [
44 "## Enumerating Output Data"
45 ]
46 },
47 {
48 "cell_type": "code",
49 "execution_count": 3,
50 "metadata": {},
51 "outputs": [],
52 "source": [
53 "# return output value\n",
54 "def get_output(y):\n",
55 " return y[\"walked\"]\n",
56 "\n",
57 "# return input value\n",
58 "def get_input(y, padded_hours):\n",
59 " # base information\n",
60 " user = y[\"user\"]\n",
61 " local_date = y[\"local_date\"]\n",
62 " threehour_idx = y[\"threehour\"]\n",
63 " \n",
64 " # derived information\n",
65 " hour_idx = threehour_idx * 3\n",
66 " encoded_hour_idx = to_categorical(hour_idx, num_classes=24)\n",
67 " end_date = local_date - timedelta(days=1)\n",
68 " start_date = end_date - timedelta(days=7*NUMBER_OF_WEEKS_FOR_LOOKING_BACK-1)\n",
69 " weekday = local_date.weekday()\n",
70 " encoded_weekday = to_categorical(weekday, num_classes=7)\n",
71 " encoded_month = to_categorical(local_date.month, num_classes=12)\n",
72 " encoded_day_of_month = to_categorical(local_date.day, num_classes=31)\n",
73 "\n",
74 " gait = pd.Series([], dtype=int)\n",
75 " # gait movement\n",
76 " zero_move = 0\n",
77 " for a_date in date_range(start_date, end_date):\n",
78 " day_df = padded_hours[(padded_hours[\"user\"] == user) & (padded_hours[\"local_date\"] == a_date)]\n",
79 " if day_df.size == 0:\n",
80 " gait = pd.concat([gait, pd.Series([1,0,0] * 24, dtype=int)])\n",
81 " zero_move += 1\n",
82 " else:\n",
83 " gait = pd.concat([gait, pd.Series(to_categorical(day_df[\"walked\"].values, 3, dtype=int).reshape(24*3), dtype=int)])\n",
84 " if zero_move == 5 * 7:\n",
85 " raise Exception(\"No movement data\")\n",
86 "\n",
87 " return_series = pd.Series([], dtype=int)\n",
88 " return_series = pd.concat([return_series, pd.Series(encoded_hour_idx, dtype=np.int_)])\n",
89 " return_series = pd.concat([return_series, pd.Series(encoded_weekday, dtype=np.int_)])\n",
90 " return_series = pd.concat([return_series, pd.Series(encoded_month, dtype=np.int_)])\n",
91 " return_series = pd.concat([return_series, pd.Series(encoded_day_of_month, dtype=np.int_)])\n",
92 " return_series = pd.concat([return_series, gait])\n",
93 " \n",
94 " return return_series\n",
95 "\n",
96 "def get_database(start_idx, end_idx):\n",
97 " database = pd.DataFrame({}, dtype=int)\n",
98 "\n",
99 " for i in range(start_idx, end_idx):\n",
100 " try:\n",
101 " y = padded_threehours.iloc[i, :]\n",
102 " user = y[\"user\"]\n",
103 " local_date = y[\"local_date\"]\n",
104 " first_day = padded_hours[padded_hours[\"user\"] == user][\"local_date\"].min()\n",
105 " date_diff = (local_date - first_day).days\n",
106 "\n",
107 " threehour_idx = y[\"threehour\"]\n",
108 " hour_idx = threehour_idx * 3\n",
109 "\n",
110 " output = get_output(y)\n",
111 " input = get_input(y, padded_hours)\n",
112 "\n",
113 " temp_series = pd.Series([], dtype=int)\n",
114 " temp_series = pd.concat([temp_series, pd.Series(user, dtype=int)])\n",
115 " temp_series = pd.concat([temp_series, pd.Series(date_diff, dtype=int)])\n",
116 " temp_series = pd.concat([temp_series, pd.Series(threehour_idx, dtype=int)])\n",
117 " temp_series = pd.concat([temp_series, pd.Series(hour_idx, dtype=int)])\n",
118 " temp_series = pd.concat([temp_series, pd.Series(output, dtype=int)])\n",
119 " temp_series = pd.concat([temp_series, pd.Series(input, dtype=int)])\n",
120 "\n",
121 " database = pd.concat([database, temp_series], axis=1)\n",
122 " # print(input)\n",
123 " except Exception as e:\n",
124 " # print(\"Error:\", e)\n",
125 " pass\n",
126 "\n",
127 " return database\n",
128 "\n"
129 ]
130 },
131 {
132 "cell_type": "code",
133 "execution_count": 4,
134 "metadata": {},
135 "outputs": [],
136 "source": [
137 "from tensorflow.keras.datasets import mnist\n",
138 "from tensorflow.keras.models import Sequential\n",
139 "from tensorflow.keras.layers import Dense, Dropout, Flatten, Activation\n",
140 "\n",
141 "\n",
142 "(x_train, y_train), (x_test, y_test) = mnist.load_data(path='mnist.npz')\n",
143 "\n",
144 "X_train = x_train.reshape(60000, 784).astype('float32') / 255\n",
145 "X_test = x_test.reshape(10000, 784).astype('float32') / 255\n",
146 "\n",
147 "Y_train = to_categorical(y_train, 10)\n",
148 "Y_test = to_categorical(y_test, 10)"
149 ]
150 },
151 {
152 "cell_type": "code",
153 "execution_count": null,
154 "metadata": {},
155 "outputs": [],
156 "source": []
157 }
158 ],
159 "metadata": {
160 "interpreter": {
161 "hash": "80dbe1014b4652684caa329d41db00af3ae439be86b11eab7e35b518e5d8ab1a"
162 },
163 "kernelspec": {
164 "display_name": "Python 3.7.9 64-bit ('venv': venv)",
165 "language": "python",
166 "name": "python3"
167 },
168 "language_info": {
169 "codemirror_mode": {
170 "name": "ipython",
171 "version": 3
172 },
173 "file_extension": ".py",
174 "mimetype": "text/x-python",
175 "name": "python",
176 "nbconvert_exporter": "python",
177 "pygments_lexer": "ipython3",
178 "version": "3.7.9"
179 },
180 "orig_nbformat": 4
181 },
182 "nbformat": 4,
183 "nbformat_minor": 2
184 }
python-notebook/tools.py View file @ 979a88c
File was created 1 import pandas as pd
2
3 from itertools import product
4 from datetime import date, timedelta
5
6
# convert a datetime object to a date object
def get_date(x):
    """Return the calendar day of ``x`` as a ``datetime.date``.

    Only the ``year``, ``month`` and ``day`` attributes of ``x`` are read,
    so any time-of-day information carried by ``x`` is dropped.
    """
    year, month, day = x.year, x.month, x.day
    return date(year, month, day)
10
# convert a datetime object to an integer, which denotes the number of minutes since midnight
def get_minute_index(x):
    """Return how many whole minutes have elapsed since midnight for ``x``.

    Reads ``x.hour`` and ``x.minute`` only; seconds are ignored.
    """
    minutes_from_full_hours = 60 * x.hour
    return minutes_from_full_hours + x.minute
14
# return a range of dates
def date_range(start_date, end_date):
    """Lazily yield each day from ``start_date`` through ``end_date``.

    Both end points are included. Nothing is yielded when ``end_date``
    lies before ``start_date``.
    """
    span_in_days = (end_date - start_date).days

    offset = 0
    while offset <= span_in_days:
        yield start_date + timedelta(days=offset)
        offset += 1
21
# define an iterative walk calculation (merging consecutive active minutes)
def calculate_walk(cv: pd.DataFrame) -> pd.DataFrame:
    """Keep the rows of ``cv`` whose following minute is also present in ``cv``.

    Each call extends every candidate run by one minute: a row survives only
    if the same user also has a row exactly one minute later, and its
    ``add_count`` is then incremented by one.

    Parameters:
        cv: frame with the columns ``user``, ``local_date``,
            ``local_minute_index`` and ``add_count``. It is not modified.

    Returns:
        A new frame with the columns ``user``, ``local_date``,
        ``local_minute_index`` and ``add_count``. It is empty when no row
        has a successor.
    """
    nv = cv.copy(deep=True)
    nv["prev_minute_index"] = nv["local_minute_index"] - 1

    # move midnight minutes to previous day
    # A single .loc assignment is required here: chained indexing such as
    # nv[mask]["col"] = ... writes to a temporary copy and leaves nv unchanged,
    # so runs crossing midnight would never be joined.
    wraps_midnight = nv["prev_minute_index"] < 0
    if wraps_midnight.any():
        one_day = timedelta(days=1)
        nv.loc[wraps_midnight, "local_date"] = [
            day - one_day for day in nv.loc[wraps_midnight, "local_date"]
        ]
        nv.loc[wraps_midnight, "prev_minute_index"] = 1439

    nv = nv[["user", "local_date", "prev_minute_index"]]
    jv = cv.merge(
        nv,
        left_on=["user", "local_date", "local_minute_index"],
        right_on=["user", "local_date", "prev_minute_index"],
        how="inner",
    )
    jv["add_count"] += 1
    jv = jv[["user", "local_date", "local_minute_index", "add_count"]]

    return jv
37