Commit f39b4d8ba4659f5ffd97ef9d87ce3d7108231450

Authored by Junghwan Park
1 parent 57ff8482ac
Exists in main

Finished the preprocessing.

Showing 1 changed file with 34 additions and 54 deletions Inline Diff

python-notebook/data_loading.ipynb View file @ f39b4d8
{ 1 1 {
"cells": [ 2 2 "cells": [
{ 3 3 {
"cell_type": "markdown", 4 4 "cell_type": "markdown",
"metadata": {}, 5 5 "metadata": {},
"source": [ 6 6 "source": [
"# Loading libraries" 7 7 "# Loading libraries"
] 8 8 ]
}, 9 9 },
{ 10 10 {
"cell_type": "code", 11 11 "cell_type": "code",
"execution_count": 17, 12 12 "execution_count": 17,
"metadata": {}, 13 13 "metadata": {},
"outputs": [], 14 14 "outputs": [],
"source": [ 15 15 "source": [
"import numpy as np\n", 16 16 "import numpy as np\n",
"import matplotlib.pyplot as plt\n", 17 17 "import matplotlib.pyplot as plt\n",
"import seaborn as sns\n", 18 18 "import seaborn as sns\n",
"from pandas import read_csv\n", 19 19 "from pandas import read_csv\n",
"import pandas as pd\n", 20 20 "import pandas as pd\n",
"import os\n", 21 21 "import os\n",
"from datetime import datetime, date, timedelta\n", 22 22 "from datetime import datetime, date, timedelta\n",
"from itertools import product\n", 23 23 "from itertools import product\n",
"# %load_ext line_profiler" 24 24 "# %load_ext line_profiler"
] 25 25 ]
}, 26 26 },
{ 27 27 {
"cell_type": "markdown", 28 28 "cell_type": "markdown",
"metadata": {}, 29 29 "metadata": {},
"source": [ 30 30 "source": [
"# Defining Functions and Adjusting Settings" 31 31 "# Defining Functions and Adjusting Settings"
] 32 32 ]
}, 33 33 },
{ 34 34 {
"cell_type": "code", 35 35 "cell_type": "code",
"execution_count": 36, 36 36 "execution_count": 36,
"metadata": {}, 37 37 "metadata": {},
"outputs": [], 38 38 "outputs": [],
"source": [ 39 39 "source": [
"pd.options.mode.chained_assignment = None\n", 40 40 "pd.options.mode.chained_assignment = None\n",
"\n", 41 41 "\n",
"def get_date(x):\n", 42 42 "def get_date(x):\n",
" return date(x.year, x.month, x.day)\n", 43 43 " return date(x.year, x.month, x.day)\n",
"\n", 44 44 "\n",
"def get_minute_index(x):\n", 45 45 "def get_minute_index(x):\n",
" return (x.hour * 60) + x.minute\n", 46 46 " return (x.hour * 60) + x.minute\n",
"\n", 47 47 "\n",
"\n", 48 48 "\n",
"# cut off values that are not in the range of the data\n", 49 49 "# cut off values that are not in the range of the data\n",
"THRESHOLD_OF_DAYS_PER_USER = 10\n", 50 50 "THRESHOLD_OF_DAYS_PER_USER = 10\n",
"\n", 51 51 "\n",
"# cut off values for the number of consecutive minutes for a walk\n", 52 52 "# cut off values for the number of consecutive minutes for a walk\n",
"MINIMUM_NUMBER_OF_MINUTES_FOR_A_WALK = 5\n", 53 53 "MINIMUM_NUMBER_OF_MINUTES_FOR_A_WALK = 5\n",
"\n", 54 54 "\n",
"# cut off values for the number of steps per minute for an active minute\n", 55 55 "# cut off values for the number of steps per minute for an active minute\n",
"MINIMUM_STEPS_PER_MINUTE = 60\n" 56 56 "MINIMUM_STEPS_PER_MINUTE = 60\n"
] 57 57 ]
}, 58 58 },
{ 59 59 {
"cell_type": "markdown", 60 60 "cell_type": "markdown",
"metadata": {}, 61 61 "metadata": {},
"source": [ 62 62 "source": [
"# Loading data files" 63 63 "# Loading data files"
] 64 64 ]
}, 65 65 },
{ 66 66 {
"cell_type": "code", 67 67 "cell_type": "code",
"execution_count": 5, 68 68 "execution_count": 5,
"metadata": {}, 69 69 "metadata": {},
"outputs": [], 70 70 "outputs": [],
"source": [ 71 71 "source": [
"data_dir = '../data'\n", 72 72 "data_dir = '../data'\n",
"\n", 73 73 "\n",
"daily = read_csv(os.path.join(data_dir, 'daily.csv'))\n", 74 74 "daily = read_csv(os.path.join(data_dir, 'daily.csv'))\n",
"dose = read_csv(os.path.join(data_dir, 'dose.csv'))\n", 75 75 "dose = read_csv(os.path.join(data_dir, 'dose.csv'))\n",
"jawbone = read_csv(os.path.join(data_dir, 'jawbone.csv'), low_memory=False)\n" 76 76 "jawbone = read_csv(os.path.join(data_dir, 'jawbone.csv'), low_memory=False)\n"
] 77 77 ]
}, 78 78 },
{ 79 79 {
"cell_type": "markdown", 80 80 "cell_type": "markdown",
"metadata": {}, 81 81 "metadata": {},
"source": [ 82 82 "source": [
"# Preprocessing\n", 83 83 "# Preprocessing\n",
"## Picking up the variables" 84 84 "## Picking up the variables"
] 85 85 ]
}, 86 86 },
{ 87 87 {
"cell_type": "code", 88 88 "cell_type": "code",
"execution_count": 6, 89 89 "execution_count": 6,
"metadata": {}, 90 90 "metadata": {},
"outputs": [], 91 91 "outputs": [],
"source": [ 92 92 "source": [
"# Column names of jawbone data\n", 93 93 "# Column names of jawbone data\n",
"# 'Var1', 'user', 'start_datetime', 'end_datetime', 'timezone', 'userid',\n", 94 94 "# 'Var1', 'user', 'start_datetime', 'end_datetime', 'timezone', 'userid',\n",
"# 'steps', 'gmtoff', 'tz', 'start_date', 'end_date', 'start_utime',\n", 95 95 "# 'steps', 'gmtoff', 'tz', 'start_date', 'end_date', 'start_utime',\n",
"# 'end_utime', 'start_udate', 'end_udate', 'intake_date', 'intake_utime',\n", 96 96 "# 'end_utime', 'start_udate', 'end_udate', 'intake_date', 'intake_utime',\n",
"# 'intake_tz', 'intake_gmtoff', 'intake_hour', 'intake_min',\n", 97 97 "# 'intake_tz', 'intake_gmtoff', 'intake_hour', 'intake_min',\n",
"# 'intake_slot', 'travel_start', 'travel_end', 'exit_date',\n", 98 98 "# 'intake_slot', 'travel_start', 'travel_end', 'exit_date',\n",
"# 'dropout_date', 'last_date', 'last_utime', 'last_tz', 'last_gmtoff',\n", 99 99 "# 'dropout_date', 'last_date', 'last_utime', 'last_tz', 'last_gmtoff',\n",
"# 'last_hour', 'last_min', 'start_utime_local', 'end_utime_local'\n", 100 100 "# 'last_hour', 'last_min', 'start_utime_local', 'end_utime_local'\n",
"\n", 101 101 "\n",
"\n", 102 102 "\n",
"# duplicate jawbone data\n", 103 103 "# duplicate jawbone data\n",
"jawbone2 = jawbone.copy(deep=True)\n", 104 104 "jawbone2 = jawbone.copy(deep=True)\n",
"\n", 105 105 "\n",
"# convert string datetimes to actual datetime objects\n", 106 106 "# convert string datetimes to actual datetime objects\n",
"jawbone2[\"start_utime_local\"] = pd.to_datetime(\n", 107 107 "jawbone2[\"start_utime_local\"] = pd.to_datetime(\n",
" jawbone2[\"start_utime_local\"], format=\"%Y-%m-%d %H:%M:%S\")\n", 108 108 " jawbone2[\"start_utime_local\"], format=\"%Y-%m-%d %H:%M:%S\")\n",
"jawbone2[\"start_datetime\"] = pd.to_datetime(\n", 109 109 "jawbone2[\"start_datetime\"] = pd.to_datetime(\n",
" jawbone2[\"start_datetime\"], format=\"%Y-%m-%d %H:%M:%S\")\n", 110 110 " jawbone2[\"start_datetime\"], format=\"%Y-%m-%d %H:%M:%S\")\n",
"\n", 111 111 "\n",
"# calculate the timezone offset\n", 112 112 "# calculate the timezone offset\n",
"jawbone2[\"tz_offset\"] = jawbone2[\"start_datetime\"] - \\\n", 113 113 "jawbone2[\"tz_offset\"] = jawbone2[\"start_datetime\"] - \\\n",
" jawbone2[\"start_utime_local\"]\n", 114 114 " jawbone2[\"start_utime_local\"]\n",
"\n", 115 115 "\n",
"\n", 116 116 "\n",
"# selecting only important columns\n", 117 117 "# selecting only important columns\n",
"jawbone3 = jawbone2[[\"user\", \"start_utime_local\",\n", 118 118 "jawbone3 = jawbone2[[\"user\", \"start_utime_local\",\n",
" \"end_utime_local\", \"tz_offset\", \"steps\"]]\n", 119 119 " \"end_utime_local\", \"tz_offset\", \"steps\"]]\n",
"\n", 120 120 "\n",
"# picking up the local date\n", 121 121 "# picking up the local date\n",
"jawbone3[\"local_date\"] = jawbone3[\"start_utime_local\"].apply(get_date)\n", 122 122 "jawbone3[\"local_date\"] = jawbone3[\"start_utime_local\"].apply(get_date)\n",
"\n", 123 123 "\n",
"# picking up the local minute index\n", 124 124 "# picking up the local minute index\n",
"jawbone3[\"local_minute_index\"] = jawbone3[\"start_utime_local\"].apply(\n", 125 125 "jawbone3[\"local_minute_index\"] = jawbone3[\"start_utime_local\"].apply(\n",
" get_minute_index)\n" 126 126 " get_minute_index)\n"
] 127 127 ]
}, 128 128 },
{ 129 129 {
"cell_type": "markdown", 130 130 "cell_type": "markdown",
"metadata": {}, 131 131 "metadata": {},
"source": [ 132 132 "source": [
"## Making a key info database" 133 133 "## Making a key info database"
] 134 134 ]
}, 135 135 },
{ 136 136 {
"cell_type": "code", 137 137 "cell_type": "code",
"execution_count": 7, 138 138 "execution_count": 7,
"metadata": {}, 139 139 "metadata": {},
"outputs": [], 140 140 "outputs": [],
"source": [ 141 141 "source": [
"# picking up the user - date data\n", 142 142 "# picking up the user - date data\n",
"user_date = jawbone3[[\"user\", \"local_date\"]].drop_duplicates()" 143 143 "user_date = jawbone3[[\"user\", \"local_date\"]].drop_duplicates()"
] 144 144 ]
}, 145 145 },
{ 146 146 {
"cell_type": "markdown", 147 147 "cell_type": "markdown",
"metadata": {}, 148 148 "metadata": {},
"source": [ 149 149 "source": [
"## Removing users with too small amount of data" 150 150 "## Removing users with too small amount of data"
] 151 151 ]
}, 152 152 },
{ 153 153 {
"cell_type": "code", 154 154 "cell_type": "code",
"execution_count": 13, 155 155 "execution_count": 13,
"metadata": {}, 156 156 "metadata": {},
"outputs": [ 157 157 "outputs": [
{ 158 158 {
"name": "stdout", 159 159 "name": "stdout",
"output_type": "stream", 160 160 "output_type": "stream",
"text": [ 161 161 "text": [
"Threshold: 10\n", 162 162 "Threshold: 10\n",
"Users to be removed:[12, 36, 38]\n", 163 163 "Users to be removed:[12, 36, 38]\n",
"Shape Change: 258889 -> 258363 (-526, -0.2%)\n" 164 164 "Shape Change: 258889 -> 258363 (-526, -0.2%)\n"
] 165 165 ]
}, 166 166 },
{ 167 167 {
"data": { 168 168 "data": {
"image/png": "", 169 169 "image/png": "",
"text/plain": [ 170 170 "text/plain": [
"<Figure size 432x288 with 1 Axes>" 171 171 "<Figure size 432x288 with 1 Axes>"
] 172 172 ]
}, 173 173 },
"metadata": {}, 174 174 "metadata": {},
"output_type": "display_data" 175 175 "output_type": "display_data"
} 176 176 }
], 177 177 ],
"source": [ 178 178 "source": [
"# making a stat of the number of days per user\n", 179 179 "# making a stat of the number of days per user\n",
"stat_user = user_date.groupby(['user'])['local_date'].nunique().sort_values()\n", 180 180 "stat_user = user_date.groupby(['user'])['local_date'].nunique().sort_values()\n",
"\n", 181 181 "\n",
"ax = plt.figure()\n", 182 182 "ax = plt.figure()\n",
"ax.patch.set_facecolor('white')\n", 183 183 "ax.patch.set_facecolor('white')\n",
"ax = sns.histplot(stat_user)\n", 184 184 "ax = sns.histplot(stat_user)\n",
"ax.set_title('Distribution of number of days per user')\n", 185 185 "ax.set_title('Distribution of number of days per user')\n",
"ax.set_xlabel('Number of days')\n", 186 186 "ax.set_xlabel('Number of days')\n",
"ax.set_ylabel('Frequency')\n", 187 187 "ax.set_ylabel('Frequency')\n",
"\n", 188 188 "\n",
"\n", 189 189 "\n",
"# filter out users that have less days of data than THRESHOLD_OF_DAYS_PER_USER\n", 190 190 "# filter out users that have less days of data than THRESHOLD_OF_DAYS_PER_USER\n",
"users_to_be_removed = stat_user[stat_user < THRESHOLD_OF_DAYS_PER_USER].index\n", 191 191 "users_to_be_removed = stat_user[stat_user < THRESHOLD_OF_DAYS_PER_USER].index\n",
"\n", 192 192 "\n",
"print(\"Threshold: {}\".format(THRESHOLD_OF_DAYS_PER_USER))\n", 193 193 "print(\"Threshold: {}\".format(THRESHOLD_OF_DAYS_PER_USER))\n",
"print(\"Users to be removed:{}\".format(list(users_to_be_removed)))\n", 194 194 "print(\"Users to be removed:{}\".format(list(users_to_be_removed)))\n",
"\n", 195 195 "\n",
"jawbone4 = jawbone3[~jawbone3[\"user\"].isin(users_to_be_removed)]\n", 196 196 "jawbone4 = jawbone3[~jawbone3[\"user\"].isin(users_to_be_removed)]\n",
"\n", 197 197 "\n",
"user_date2 = user_date[~user_date[\"user\"].isin(users_to_be_removed)]\n", 198 198 "user_date2 = user_date[~user_date[\"user\"].isin(users_to_be_removed)]\n",
"\n", 199 199 "\n",
"# printing the amount of data removed\n", 200 200 "# printing the amount of data removed\n",
"jawbone3_count, _ = jawbone3.shape\n", 201 201 "jawbone3_count, _ = jawbone3.shape\n",
"jawbone4_count, _ = jawbone4.shape\n", 202 202 "jawbone4_count, _ = jawbone4.shape\n",
"\n", 203 203 "\n",
"print(\"Shape Change: {} -> {} (-{}, -{}%)\".format(\n", 204 204 "print(\"Shape Change: {} -> {} (-{}, -{}%)\".format(\n",
" jawbone3_count, \n", 205 205 " jawbone3_count, \n",
" jawbone4_count, \n", 206 206 " jawbone4_count, \n",
" jawbone3_count - jawbone4_count, \n", 207 207 " jawbone3_count - jawbone4_count, \n",
" round((jawbone3_count - jawbone4_count) / jawbone3_count * 100, 2)\n", 208 208 " round((jawbone3_count - jawbone4_count) / jawbone3_count * 100, 2)\n",
" )\n", 209 209 " )\n",
")" 210 210 ")"
] 211 211 ]
}, 212 212 },
{ 213 213 {
"cell_type": "markdown", 214 214 "cell_type": "markdown",
"metadata": {}, 215 215 "metadata": {},
"source": [ 216 216 "source": [
"## Find consecutive minute walks" 217 217 "## Find consecutive minute walks"
] 218 218 ]
}, 219 219 },
{ 220 220 {
"cell_type": "code", 221 221 "cell_type": "code",
"execution_count": 37, 222 222 "execution_count": 37,
"metadata": {}, 223 223 "metadata": {},
"outputs": [ 224 224 "outputs": [
{ 225 225 {
"name": "stdout", 226 226 "name": "stdout",
"output_type": "stream", 227 227 "output_type": "stream",
"text": [ 228 228 "text": [
"Iteration: 0, length: 377396\n", 229 229 "Iteration: 0, length: 377396\n",
"Iteration: 1, length: 229752\n", 230 230 "Iteration: 1, length: 229752\n",
"Iteration: 2, length: 170648\n", 231 231 "Iteration: 2, length: 170648\n",
"Iteration: 3, length: 137484\n", 232 232 "Iteration: 3, length: 137484\n",
"Iteration: 4, length: 178268\n", 233 233 "Iteration: 4, length: 178268\n",
"Final, length: 94884\n" 234 234 "Final, length: 94884\n"
] 235 235 ]
} 236 236 }
], 237 237 ],
"source": [ 238 238 "source": [
"# prepare the data for the walk calculation\n", 239 239 "# prepare the data for the walk calculation\n",
"current_vector = jawbone4[[\"user\", \"local_date\", \"local_minute_index\", \"steps\"]]\n", 240 240 "current_vector = jawbone4[[\"user\", \"local_date\", \"local_minute_index\", \"steps\"]]\n",
"current_vector[\"add_count\"] = 1\n", 241 241 "current_vector[\"add_count\"] = 1\n",
"current_vector = current_vector[current_vector[\"steps\"] > MINIMUM_STEPS_PER_MINUTE]\n", 242 242 "current_vector = current_vector[current_vector[\"steps\"] > MINIMUM_STEPS_PER_MINUTE]\n",
"current_vector = current_vector[[\"user\", \"local_date\", \"local_minute_index\", \"add_count\"]]\n", 243 243 "current_vector = current_vector[[\"user\", \"local_date\", \"local_minute_index\", \"add_count\"]]\n",
"\n", 244 244 "\n",
"# define an iterative walk calculation (merging consecutive active minutes)\n", 245 245 "# define an iterative walk calculation (merging consecutive active minutes)\n",
"def calculate_walk(cv):\n", 246 246 "def calculate_walk(cv):\n",
" nv = cv.copy(deep=True)\n", 247 247 " nv = cv.copy(deep=True)\n",
" nv[\"prev_minute_index\"] = nv[\"local_minute_index\"] - 1\n", 248 248 " nv[\"prev_minute_index\"] = nv[\"local_minute_index\"] - 1\n",
"\n", 249 249 "\n",
" # move midnight minutes to previous day\n", 250 250 " # move midnight minutes to previous day\n",
" nv[nv[\"prev_minute_index\"] < 0][\"local_date\"] -= timedelta(days=1)\n", 251 251 " nv[nv[\"prev_minute_index\"] < 0][\"local_date\"] -= timedelta(days=1)\n",
" nv[nv[\"prev_minute_index\"] < 0][\"prev_minute_index\"] = 1439\n", 252 252 " nv[nv[\"prev_minute_index\"] < 0][\"prev_minute_index\"] = 1439\n",
" \n", 253 253 " \n",
" nv = nv[[\"user\", \"local_date\", \"prev_minute_index\"]]\n", 254 254 " nv = nv[[\"user\", \"local_date\", \"prev_minute_index\"]]\n",
" jv = cv.merge(nv, left_on=[\"user\", \"local_date\", \"local_minute_index\"], right_on=[\"user\", \"local_date\", \"prev_minute_index\"], how=\"inner\")\n", 255 255 " jv = cv.merge(nv, left_on=[\"user\", \"local_date\", \"local_minute_index\"], right_on=[\"user\", \"local_date\", \"prev_minute_index\"], how=\"inner\")\n",
" jv[\"add_count\"] += 1\n", 256 256 " jv[\"add_count\"] += 1\n",
" jv = jv[[\"user\", \"local_date\", \"local_minute_index\", \"add_count\"]]\n", 257 257 " jv = jv[[\"user\", \"local_date\", \"local_minute_index\", \"add_count\"]]\n",
"\n", 258 258 "\n",
" return jv \n", 259 259 " return jv \n",
"\n", 260 260 "\n",
"\n", 261 261 "\n",
"# iteratively calculate the walk\n", 262 262 "# iteratively calculate the walk\n",
"for i in range(0, MINIMUM_NUMBER_OF_MINUTES_FOR_A_WALK):\n", 263 263 "for i in range(0, MINIMUM_NUMBER_OF_MINUTES_FOR_A_WALK):\n",
" print(\"Iteration: {}, length: {}\".format(i, current_vector.size))\n", 264 264 " print(\"Iteration: {}, length: {}\".format(i, current_vector.size))\n",
" new_vector = calculate_walk(current_vector)\n", 265 265 " new_vector = calculate_walk(current_vector)\n",
" current_vector = new_vector\n", 266 266 " current_vector = new_vector\n",
"\n", 267 267 "\n",
"print(\"Final, length: {}\".format(current_vector.size))\n", 268 268 "print(\"Final, length: {}\".format(current_vector.size))\n",
"\n", 269 269 "\n",
"consecutive_minutes = current_vector[[\"user\", \"local_date\", \"local_minute_index\"]].drop_duplicates()" 270 270 "consecutive_minutes = current_vector[[\"user\", \"local_date\", \"local_minute_index\"]].drop_duplicates()"
] 271 271 ]
}, 272 272 },
{ 273 273 {
"cell_type": "markdown", 274 274 "cell_type": "markdown",
"metadata": {}, 275 275 "metadata": {},
"source": [ 276 276 "source": [
"## Map consecutive minutes to 1hr and 3hr units" 277 277 "## Map consecutive minutes to 1hr and 3hr units"
] 278 278 ]
}, 279 279 },
{ 280 280 {
"cell_type": "code", 281 281 "cell_type": "code",
"execution_count": 110, 282 282 "execution_count": 111,
"metadata": {}, 283 283 "metadata": {},
"outputs": [ 284 284 "outputs": [],
{ 285
"name": "stdout", 286
"output_type": "stream", 287
"text": [ 288
" index user local_date threehour walked\n", 289
"0 0 1 2015-07-22 2 2\n", 290
"1 3 1 2015-07-22 6 2\n", 291
"2 30 1 2015-07-23 2 2\n", 292
"3 50 1 2015-07-23 3 2\n", 293
"4 58 1 2015-07-23 5 2\n", 294
" user local_date threehour walked\n", 295
"0 1 2015-07-22 0 1.0\n", 296
"1 1 2015-07-22 1 1.0\n", 297
"2 1 2015-07-22 2 2.0\n", 298
"3 1 2015-07-22 3 1.0\n", 299
"4 1 2015-07-22 4 1.0\n", 300
"5 1 2015-07-22 5 1.0\n", 301
"6 1 2015-07-22 6 2.0\n", 302
"7 1 2015-07-22 7 1.0\n", 303
"8 1 2015-07-23 0 1.0\n", 304
"9 1 2015-07-23 1 1.0\n", 305
"10 1 2015-07-23 2 2.0\n", 306
"11 1 2015-07-23 3 2.0\n", 307
"12 1 2015-07-23 4 1.0\n", 308
"13 1 2015-07-23 5 2.0\n", 309
"14 1 2015-07-23 6 1.0\n", 310
"15 1 2015-07-23 7 1.0\n" 311
] 312
} 313
], 314
"source": [ 315 285 "source": [
"# calculate hour index and three hour index\n", 316 286 "# calculate hour index and three hour index\n",
"consecutive_minutes[\"hour\"] = np.int_(np.floor(consecutive_minutes[\"local_minute_index\"] / 60))\n", 317 287 "consecutive_minutes[\"hour\"] = np.int_(np.floor(consecutive_minutes[\"local_minute_index\"] / 60))\n",
"consecutive_minutes[\"threehour\"] = np.int_(np.floor(consecutive_minutes[\"hour\"] / 3))\n", 318 288 "consecutive_minutes[\"threehour\"] = np.int_(np.floor(consecutive_minutes[\"hour\"] / 3))\n",
"\n", 319 289 "\n",
"# calculate the number of walks per user, per hour\n", 320 290 "# calculate the number of walks per user, per hour\n",
"walk_by_hours = consecutive_minutes[[\"user\", \"local_date\", \"hour\"]].drop_duplicates().reset_index()\n", 321 291 "walk_by_hours = consecutive_minutes[[\"user\", \"local_date\", \"hour\"]].drop_duplicates().reset_index()\n",
"walk_by_hours[\"walked\"] = 2\n", 322 292 "walk_by_hours[\"walked\"] = 2\n",
"\n", 323 293 "\n",
"# calculate the number of walks per user, per three hour\n", 324 294 "# calculate the number of walks per user, per three hour\n",
"walk_by_threehours = consecutive_minutes[[\"user\", \"local_date\", \"threehour\"]].drop_duplicates().reset_index()\n", 325 295 "walk_by_threehours = consecutive_minutes[[\"user\", \"local_date\", \"threehour\"]].drop_duplicates().reset_index()\n",
"walk_by_threehours[\"walked\"] = 2\n", 326 296 "walk_by_threehours[\"walked\"] = 2\n",
"\n", 327 297 "\n",
"# generate hour vector and three hour vector\n", 328 298 "# generate hour vector and three hour vector\n",
"hours = pd.DataFrame({\"hour\": range(0,24)})\n", 329 299 "hours = pd.DataFrame({\"hour\": range(0,24)})\n",
"threehours = pd.DataFrame({\"threehour\": range(0, 8)})\n", 330 300 "threehours = pd.DataFrame({\"threehour\": range(0, 8)})\n",
"\n", 331 301 "\n",
"\n", 332
"walk_by_hours = walk_by_hours[:5]\n", 333
"walk_by_threehours = walk_by_threehours[:5]\n", 334
"\n", 335
"# generate complete product of vectors\n", 336 302 "# generate complete product of vectors\n",
"def product_df(mat1, mat2):\n", 337 303 "def product_df(mat1, mat2):\n",
" mat1 = mat1.drop_duplicates()\n", 338 304 " mat1 = mat1.drop_duplicates()\n",
" mat2 = mat2.drop_duplicates()\n", 339 305 " mat2 = mat2.drop_duplicates()\n",
"\n", 340 306 "\n",
" temp = pd.DataFrame(list(product(mat1.values, mat2.values)))\n", 341 307 " temp = pd.DataFrame(list(product(mat1.values, mat2.values)))\n",
" for i, acol in enumerate(mat1.columns):\n", 342 308 " for i, acol in enumerate(mat1.columns):\n",
" temp[acol] = temp[0].apply(lambda x: x[i])\n", 343 309 " temp[acol] = temp[0].apply(lambda x: x[i])\n",
" for i, acol in enumerate(mat2.columns):\n", 344 310 " for i, acol in enumerate(mat2.columns):\n",
" temp[acol] = temp[1].apply(lambda x: x[i])\n", 345 311 " temp[acol] = temp[1].apply(lambda x: x[i])\n",
" temp = temp.drop(columns=[0, 1])\n", 346 312 " temp = temp.drop(columns=[0, 1])\n",
" return temp\n", 347 313 " return temp\n",
"\n", 348 314 "\n",
"measured_hour = product_df(walk_by_hours[[\"user\", \"local_date\"]], hours[[\"hour\"]])\n", 349 315 "measured_hour = product_df(walk_by_hours[[\"user\", \"local_date\"]], hours[[\"hour\"]])\n",
"measured_threehour = product_df(walk_by_threehours[[\"user\", \"local_date\"]], threehours[[\"threehour\"]])\n", 350 316 "measured_threehour = product_df(walk_by_threehours[[\"user\", \"local_date\"]], threehours[[\"threehour\"]])\n",
"\n", 351 317 "\n",
"# pad the hourly walk data (fill in missing hours with 1s)\n", 352 318 "# pad the hourly walk data (fill in missing hours with 1s)\n",
"padded_hours = walk_by_hours.merge(measured_hour, on=[\"user\", \"local_date\", \"hour\"], how=\"right\")\n", 353 319 "padded_hours = walk_by_hours.merge(measured_hour, on=[\"user\", \"local_date\", \"hour\"], how=\"right\")\n",
"padded_hours = padded_hours[[\"user\", \"local_date\", \"hour\", \"walked\"]]\n", 354 320 "padded_hours = padded_hours[[\"user\", \"local_date\", \"hour\", \"walked\"]]\n",
"padded_hours = padded_hours.fillna(1)\n", 355 321 "padded_hours = padded_hours.fillna(1)\n",
"\n", 356 322 "\n",
"# pad the walk data with 3 hours unit (fill in missing hours with 1s)\n", 357 323 "# pad the walk data with 3 hours unit (fill in missing hours with 1s)\n",
"padded_threehours = walk_by_threehours.merge(measured_threehour, on=[\"user\", \"local_date\", \"threehour\"], how=\"right\")\n", 358 324 "padded_threehours = walk_by_threehours.merge(measured_threehour, on=[\"user\", \"local_date\", \"threehour\"], how=\"right\")\n",
"padded_threehours = padded_threehours[[\"user\", \"local_date\", \"threehour\", \"walked\"]]\n", 359 325 "padded_threehours = padded_threehours[[\"user\", \"local_date\", \"threehour\", \"walked\"]]\n",
"padded_threehours = padded_threehours.fillna(1)" 360 326 "padded_threehours = padded_threehours.fillna(1)"
] 361 327 ]
}, 362 328 },
{ 363 329 {
330 "cell_type": "markdown",
331 "metadata": {},
332 "source": [
333 "## Pad unmeasured missing data with 0s\n",
334 "\n",
335 "The three-hour window data serves as the output vector, so it is not zero-padded; rows whose output is missing are simply excluded from training."
336 ]
337 },
338 {
"cell_type": "code", 364 339 "cell_type": "code",
"execution_count": null, 365 340 "execution_count": 141,
"metadata": {}, 366 341 "metadata": {},
"outputs": [], 367 342 "outputs": [],
"source": [ 368 343 "source": [
# first and last date present in padded_hours for each user
start_date = padded_hours.groupby(["user"])["local_date"].min()
end_date = padded_hours.groupby(["user"])["local_date"].max()

# generate the user list
users = start_date.index


def date_range(start_date, end_date):
    """Yield every date from ``start_date`` to ``end_date``, both inclusive.

    Yields nothing when ``end_date`` lies before ``start_date``.
    """
    delta = end_date - start_date

    for i in range(delta.days + 1):
        yield start_date + timedelta(days=i)


# Generate every (user, date) pair between each user's first and last date.
# The rows are collected first and the frame is built once: concatenating
# onto an empty float-typed seed frame inside the loop was quadratic and
# turned the `user` column into floats.
all_dates = pd.DataFrame(
    [
        {"user": userid, "local_date": a_date}
        for userid in users
        for a_date in date_range(start_date[userid], end_date[userid])
    ],
    columns=["user", "local_date"],
)

# generate the base vector for the padding: every hour of every (user, date)
all_dates_hour = product_df(all_dates[["user", "local_date"]], hours[["hour"]])

# final padded gait data: hours of days absent from padded_hours get walked == 0
padded_hours = padded_hours.merge(all_dates_hour, on=["user", "local_date", "hour"], how="right").fillna(0)


# save the data
padded_hours.to_csv(os.path.join(data_dir, "padded_hours.csv"), index=False)
padded_threehours.to_csv(os.path.join(data_dir, "padded_threehours.csv"), index=False)
] 394 374 ]
} 395 375 }
], 396 376 ],
"metadata": { 397 377 "metadata": {
"interpreter": { 398 378 "interpreter": {
"hash": "80dbe1014b4652684caa329d41db00af3ae439be86b11eab7e35b518e5d8ab1a" 399 379 "hash": "80dbe1014b4652684caa329d41db00af3ae439be86b11eab7e35b518e5d8ab1a"
}, 400 380 },
"kernelspec": { 401 381 "kernelspec": {
"display_name": "Python 3.7.9 64-bit ('venv': venv)", 402 382 "display_name": "Python 3.7.9 64-bit ('venv': venv)",
"language": "python", 403 383 "language": "python",
"name": "python3" 404 384 "name": "python3"
}, 405 385 },
"language_info": { 406 386 "language_info": {
"codemirror_mode": { 407 387 "codemirror_mode": {
"name": "ipython", 408 388 "name": "ipython",
"version": 3 409 389 "version": 3
}, 410 390 },
"file_extension": ".py", 411 391 "file_extension": ".py",
"mimetype": "text/x-python", 412 392 "mimetype": "text/x-python",
"name": "python", 413 393 "name": "python",
"nbconvert_exporter": "python", 414 394 "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", 415 395 "pygments_lexer": "ipython3",
"version": "3.7.9" 416 396 "version": "3.7.9"
}, 417 397 },
"orig_nbformat": 4 418 398 "orig_nbformat": 4
}, 419 399 },
"nbformat": 4, 420 400 "nbformat": 4,
"nbformat_minor": 2 421 401 "nbformat_minor": 2
} 422 402 }
423 403