Commit cf37877009d2c931937801ab68f427ce3c847775

Authored by Junghwan Park
1 parent f39b4d8ba4
Exists in main

rearranged code order

Showing 1 changed file with 64 additions and 48 deletions Inline Diff

python-notebook/data_loading.ipynb View file @ cf37877
{ 1 1 {
"cells": [ 2 2 "cells": [
{ 3 3 {
"cell_type": "markdown", 4 4 "cell_type": "markdown",
"metadata": {}, 5 5 "metadata": {},
"source": [ 6 6 "source": [
"# Loading libraries" 7 7 "# Loading libraries"
] 8 8 ]
}, 9 9 },
{ 10 10 {
"cell_type": "code", 11 11 "cell_type": "code",
"execution_count": 17, 12 12 "execution_count": 142,
"metadata": {}, 13 13 "metadata": {},
"outputs": [], 14 14 "outputs": [],
"source": [ 15 15 "source": [
"import numpy as np\n", 16 16 "import numpy as np\n",
"import matplotlib.pyplot as plt\n", 17 17 "import matplotlib.pyplot as plt\n",
"import seaborn as sns\n", 18 18 "import seaborn as sns\n",
"from pandas import read_csv\n", 19 19 "from pandas import read_csv\n",
"import pandas as pd\n", 20 20 "import pandas as pd\n",
"import os\n", 21 21 "import os\n",
"from datetime import datetime, date, timedelta\n", 22 22 "from datetime import datetime, date, timedelta\n",
"from itertools import product\n", 23 23 "from itertools import product\n",
"# %load_ext line_profiler" 24 24 "# %load_ext line_profiler"
] 25 25 ]
}, 26 26 },
{ 27 27 {
"cell_type": "markdown", 28 28 "cell_type": "markdown",
"metadata": {}, 29 29 "metadata": {},
"source": [ 30 30 "source": [
"# Defining Functions and Adjusting Settings" 31 31 "# Defining Functions and Adjusting Settings"
] 32 32 ]
}, 33 33 },
{ 34 34 {
"cell_type": "code", 35 35 "cell_type": "code",
"execution_count": 36, 36 36 "execution_count": 143,
"metadata": {}, 37 37 "metadata": {},
"outputs": [], 38 38 "outputs": [],
"source": [ 39 39 "source": [
40 "# disable pandas' chained-assignment (SettingWithCopy) warning\n",
"pd.options.mode.chained_assignment = None\n", 40 41 "pd.options.mode.chained_assignment = None\n",
"\n", 41 42 "\n",
43 "# convert a datetime object to a date object\n",
"def get_date(x):\n", 42 44 "def get_date(x):\n",
" return date(x.year, x.month, x.day)\n", 43 45 " return date(x.year, x.month, x.day)\n",
"\n", 44 46 "\n",
47 "# convert a datetime object to an integer, which denotes the number of minutes since midnight\n",
"def get_minute_index(x):\n", 45 48 "def get_minute_index(x):\n",
" return (x.hour * 60) + x.minute\n", 46 49 " return (x.hour * 60) + x.minute\n",
"\n", 47 50 "\n",
51 "# yield every date from start_date through end_date (inclusive)\n",
52 "def date_range(start_date, end_date):\n",
53 " delta = end_date - start_date\n",
"\n", 48 54 "\n",
55 " for i in range(delta.days + 1):\n",
56 " yield start_date + timedelta(days=i)\n",
57 "\n",
58 "# one step of the iterative walk calculation: keep active minutes whose next minute is also active, and increment add_count\n",
59 "def calculate_walk(cv):\n",
60 " nv = cv.copy(deep=True)\n",
61 " nv[\"prev_minute_index\"] = nv[\"local_minute_index\"] - 1\n",
62 "\n",
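63 " # move midnight minutes to previous day (NOTE: the chained indexing below assigns to a temporary copy, so nv is not actually modified; use nv.loc[mask, column] instead)\n",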
64 " nv[nv[\"prev_minute_index\"] < 0][\"local_date\"] -= timedelta(days=1)\n",
65 " nv[nv[\"prev_minute_index\"] < 0][\"prev_minute_index\"] = 1439\n",
66 " \n",
67 " nv = nv[[\"user\", \"local_date\", \"prev_minute_index\"]]\n",
68 " jv = cv.merge(nv, left_on=[\"user\", \"local_date\", \"local_minute_index\"], right_on=[\"user\", \"local_date\", \"prev_minute_index\"], how=\"inner\")\n",
69 " jv[\"add_count\"] += 1\n",
70 " jv = jv[[\"user\", \"local_date\", \"local_minute_index\", \"add_count\"]]\n",
71 "\n",
72 " return jv \n",
73 "\n",
74 "# build the Cartesian product of the distinct rows of two dataframes\n",
75 "def product_df(mat1, mat2):\n",
76 " mat1 = mat1.drop_duplicates()\n",
77 " mat2 = mat2.drop_duplicates()\n",
78 "\n",
79 " temp = pd.DataFrame(list(product(mat1.values, mat2.values)))\n",
80 " for i, acol in enumerate(mat1.columns):\n",
81 " temp[acol] = temp[0].apply(lambda x: x[i])\n",
82 " for i, acol in enumerate(mat2.columns):\n",
83 " temp[acol] = temp[1].apply(lambda x: x[i])\n",
84 " temp = temp.drop(columns=[0, 1])\n",
85 " return temp\n",
86 "\n",
"# cut off values that are not in the range of the data\n", 49 87 "# cut off values that are not in the range of the data\n",
"THRESHOLD_OF_DAYS_PER_USER = 10\n", 50 88 "THRESHOLD_OF_DAYS_PER_USER = 10\n",
"\n", 51 89 "\n",
"# cut off values for the number of consecutive minutes for a walk\n", 52 90 "# cut off values for the number of consecutive minutes for a walk\n",
"MINIMUM_NUMBER_OF_MINUTES_FOR_A_WALK = 5\n", 53 91 "MINIMUM_NUMBER_OF_MINUTES_FOR_A_WALK = 5\n",
"\n", 54 92 "\n",
"# cut off values for the number of steps per minute for an active minute\n", 55 93 "# cut off values for the number of steps per minute for an active minute\n",
"MINIMUM_STEPS_PER_MINUTE = 60\n" 56 94 "MINIMUM_STEPS_PER_MINUTE = 60\n"
] 57 95 ]
}, 58 96 },
{ 59 97 {
"cell_type": "markdown", 60 98 "cell_type": "markdown",
"metadata": {}, 61 99 "metadata": {},
"source": [ 62 100 "source": [
"# Loading data files" 63 101 "# Loading data files"
] 64 102 ]
}, 65 103 },
{ 66 104 {
"cell_type": "code", 67 105 "cell_type": "code",
"execution_count": 5, 68 106 "execution_count": 144,
"metadata": {}, 69 107 "metadata": {},
"outputs": [], 70 108 "outputs": [],
"source": [ 71 109 "source": [
"data_dir = '../data'\n", 72 110 "data_dir = '../data'\n",
"\n", 73 111 "\n",
"daily = read_csv(os.path.join(data_dir, 'daily.csv'))\n", 74 112 "daily = read_csv(os.path.join(data_dir, 'daily.csv'))\n",
"dose = read_csv(os.path.join(data_dir, 'dose.csv'))\n", 75 113 "dose = read_csv(os.path.join(data_dir, 'dose.csv'))\n",
"jawbone = read_csv(os.path.join(data_dir, 'jawbone.csv'), low_memory=False)\n" 76 114 "jawbone = read_csv(os.path.join(data_dir, 'jawbone.csv'), low_memory=False)\n"
] 77 115 ]
}, 78 116 },
{ 79 117 {
"cell_type": "markdown", 80 118 "cell_type": "markdown",
"metadata": {}, 81 119 "metadata": {},
"source": [ 82 120 "source": [
"# Preprocessing\n", 83 121 "# Preprocessing\n",
"## Picking up the variables" 84 122 "## Picking up the variables"
] 85 123 ]
}, 86 124 },
{ 87 125 {
"cell_type": "code", 88 126 "cell_type": "code",
"execution_count": 6, 89 127 "execution_count": 145,
"metadata": {}, 90 128 "metadata": {},
"outputs": [], 91 129 "outputs": [],
"source": [ 92 130 "source": [
"# Column names of jawbone data\n", 93 131 "# Column names of jawbone data\n",
"# 'Var1', 'user', 'start_datetime', 'end_datetime', 'timezone', 'userid',\n", 94 132 "# 'Var1', 'user', 'start_datetime', 'end_datetime', 'timezone', 'userid',\n",
"# 'steps', 'gmtoff', 'tz', 'start_date', 'end_date', 'start_utime',\n", 95 133 "# 'steps', 'gmtoff', 'tz', 'start_date', 'end_date', 'start_utime',\n",
"# 'end_utime', 'start_udate', 'end_udate', 'intake_date', 'intake_utime',\n", 96 134 "# 'end_utime', 'start_udate', 'end_udate', 'intake_date', 'intake_utime',\n",
"# 'intake_tz', 'intake_gmtoff', 'intake_hour', 'intake_min',\n", 97 135 "# 'intake_tz', 'intake_gmtoff', 'intake_hour', 'intake_min',\n",
"# 'intake_slot', 'travel_start', 'travel_end', 'exit_date',\n", 98 136 "# 'intake_slot', 'travel_start', 'travel_end', 'exit_date',\n",
"# 'dropout_date', 'last_date', 'last_utime', 'last_tz', 'last_gmtoff',\n", 99 137 "# 'dropout_date', 'last_date', 'last_utime', 'last_tz', 'last_gmtoff',\n",
"# 'last_hour', 'last_min', 'start_utime_local', 'end_utime_local'\n", 100 138 "# 'last_hour', 'last_min', 'start_utime_local', 'end_utime_local'\n",
"\n", 101 139 "\n",
"\n", 102 140 "\n",
"# duplicate jawbone data\n", 103 141 "# duplicate jawbone data\n",
"jawbone2 = jawbone.copy(deep=True)\n", 104 142 "jawbone2 = jawbone.copy(deep=True)\n",
"\n", 105 143 "\n",
"# convert string datetimes to actual datetime objects\n", 106 144 "# convert string datetimes to actual datetime objects\n",
"jawbone2[\"start_utime_local\"] = pd.to_datetime(\n", 107 145 "jawbone2[\"start_utime_local\"] = pd.to_datetime(\n",
" jawbone2[\"start_utime_local\"], format=\"%Y-%m-%d %H:%M:%S\")\n", 108 146 " jawbone2[\"start_utime_local\"], format=\"%Y-%m-%d %H:%M:%S\")\n",
"jawbone2[\"start_datetime\"] = pd.to_datetime(\n", 109 147 "jawbone2[\"start_datetime\"] = pd.to_datetime(\n",
" jawbone2[\"start_datetime\"], format=\"%Y-%m-%d %H:%M:%S\")\n", 110 148 " jawbone2[\"start_datetime\"], format=\"%Y-%m-%d %H:%M:%S\")\n",
"\n", 111 149 "\n",
"# calculate the timezone offset\n", 112 150 "# calculate the timezone offset\n",
"jawbone2[\"tz_offset\"] = jawbone2[\"start_datetime\"] - \\\n", 113 151 "jawbone2[\"tz_offset\"] = jawbone2[\"start_datetime\"] - \\\n",
" jawbone2[\"start_utime_local\"]\n", 114 152 " jawbone2[\"start_utime_local\"]\n",
"\n", 115 153 "\n",
"\n", 116 154 "\n",
"# selecting only important columns\n", 117 155 "# selecting only important columns\n",
"jawbone3 = jawbone2[[\"user\", \"start_utime_local\",\n", 118 156 "jawbone3 = jawbone2[[\"user\", \"start_utime_local\",\n",
" \"end_utime_local\", \"tz_offset\", \"steps\"]]\n", 119 157 " \"end_utime_local\", \"tz_offset\", \"steps\"]]\n",
"\n", 120 158 "\n",
"# picking up the local date\n", 121 159 "# picking up the local date\n",
"jawbone3[\"local_date\"] = jawbone3[\"start_utime_local\"].apply(get_date)\n", 122 160 "jawbone3[\"local_date\"] = jawbone3[\"start_utime_local\"].apply(get_date)\n",
"\n", 123 161 "\n",
"# picking up the local minute index\n", 124 162 "# picking up the local minute index\n",
"jawbone3[\"local_minute_index\"] = jawbone3[\"start_utime_local\"].apply(\n", 125 163 "jawbone3[\"local_minute_index\"] = jawbone3[\"start_utime_local\"].apply(\n",
" get_minute_index)\n" 126 164 " get_minute_index)\n"
] 127 165 ]
}, 128 166 },
{ 129 167 {
"cell_type": "markdown", 130 168 "cell_type": "markdown",
"metadata": {}, 131 169 "metadata": {},
"source": [ 132 170 "source": [
"## Making a key info database" 133 171 "## Making a key info database"
] 134 172 ]
}, 135 173 },
{ 136 174 {
"cell_type": "code", 137 175 "cell_type": "code",
"execution_count": 7, 138 176 "execution_count": 146,
"metadata": {}, 139 177 "metadata": {},
"outputs": [], 140 178 "outputs": [],
"source": [ 141 179 "source": [
"# picking up the user - date data\n", 142 180 "# picking up the user - date data\n",
"user_date = jawbone3[[\"user\", \"local_date\"]].drop_duplicates()" 143 181 "user_date = jawbone3[[\"user\", \"local_date\"]].drop_duplicates()"
] 144 182 ]
}, 145 183 },
{ 146 184 {
"cell_type": "markdown", 147 185 "cell_type": "markdown",
"metadata": {}, 148 186 "metadata": {},
"source": [ 149 187 "source": [
"## Removing users with too small amount of data" 150 188 "## Removing users with too small amount of data"
] 151 189 ]
}, 152 190 },
{ 153 191 {
"cell_type": "code", 154 192 "cell_type": "code",
"execution_count": 13, 155 193 "execution_count": 147,
"metadata": {}, 156 194 "metadata": {},
"outputs": [ 157 195 "outputs": [
{ 158 196 {
"name": "stdout", 159 197 "name": "stdout",
"output_type": "stream", 160 198 "output_type": "stream",
"text": [ 161 199 "text": [
"Threshold: 10\n", 162 200 "Threshold: 10\n",
"Users to be removed:[12, 36, 38]\n", 163 201 "Users to be removed:[12, 36, 38]\n",
"Shape Change: 258889 -> 258363 (-526, -0.2%)\n" 164 202 "Shape Change: 258889 -> 258363 (-526, -0.2%)\n"
] 165 203 ]
}, 166 204 },
{ 167 205 {
"data": { 168 206 "data": {
"image/png": "", 169 207 "image/png": "",
"text/plain": [ 170 208 "text/plain": [
"<Figure size 432x288 with 1 Axes>" 171 209 "<Figure size 432x288 with 1 Axes>"
] 172 210 ]
}, 173 211 },
"metadata": {}, 174 212 "metadata": {},
"output_type": "display_data" 175 213 "output_type": "display_data"
} 176 214 }
], 177 215 ],
"source": [ 178 216 "source": [
"# making a stat of the number of days per user\n", 179 217 "# making a stat of the number of days per user\n",
"stat_user = user_date.groupby(['user'])['local_date'].nunique().sort_values()\n", 180 218 "stat_user = user_date.groupby(['user'])['local_date'].nunique().sort_values()\n",
"\n", 181 219 "\n",
"ax = plt.figure()\n", 182 220 "ax = plt.figure()\n",
"ax.patch.set_facecolor('white')\n", 183 221 "ax.patch.set_facecolor('white')\n",
"ax = sns.histplot(stat_user)\n", 184 222 "ax = sns.histplot(stat_user)\n",
"ax.set_title('Distribution of number of days per user')\n", 185 223 "ax.set_title('Distribution of number of days per user')\n",
"ax.set_xlabel('Number of days')\n", 186 224 "ax.set_xlabel('Number of days')\n",
"ax.set_ylabel('Frequency')\n", 187 225 "ax.set_ylabel('Frequency')\n",
"\n", 188 226 "\n",
"\n", 189 227 "\n",
"# filter out users that have less days of data than THRESHOLD_OF_DAYS_PER_USER\n", 190 228 "# filter out users that have less days of data than THRESHOLD_OF_DAYS_PER_USER\n",
"users_to_be_removed = stat_user[stat_user < THRESHOLD_OF_DAYS_PER_USER].index\n", 191 229 "users_to_be_removed = stat_user[stat_user < THRESHOLD_OF_DAYS_PER_USER].index\n",
"\n", 192 230 "\n",
"print(\"Threshold: {}\".format(THRESHOLD_OF_DAYS_PER_USER))\n", 193 231 "print(\"Threshold: {}\".format(THRESHOLD_OF_DAYS_PER_USER))\n",
"print(\"Users to be removed:{}\".format(list(users_to_be_removed)))\n", 194 232 "print(\"Users to be removed:{}\".format(list(users_to_be_removed)))\n",
"\n", 195 233 "\n",
"jawbone4 = jawbone3[~jawbone3[\"user\"].isin(users_to_be_removed)]\n", 196 234 "jawbone4 = jawbone3[~jawbone3[\"user\"].isin(users_to_be_removed)]\n",
"\n", 197 235 "\n",
"user_date2 = user_date[~user_date[\"user\"].isin(users_to_be_removed)]\n", 198 236 "user_date2 = user_date[~user_date[\"user\"].isin(users_to_be_removed)]\n",
"\n", 199 237 "\n",
"# printing the amount of data removed\n", 200 238 "# printing the amount of data removed\n",
"jawbone3_count, _ = jawbone3.shape\n", 201 239 "jawbone3_count, _ = jawbone3.shape\n",
"jawbone4_count, _ = jawbone4.shape\n", 202 240 "jawbone4_count, _ = jawbone4.shape\n",
"\n", 203 241 "\n",
"print(\"Shape Change: {} -> {} (-{}, -{}%)\".format(\n", 204 242 "print(\"Shape Change: {} -> {} (-{}, -{}%)\".format(\n",
" jawbone3_count, \n", 205 243 " jawbone3_count, \n",
" jawbone4_count, \n", 206 244 " jawbone4_count, \n",
" jawbone3_count - jawbone4_count, \n", 207 245 " jawbone3_count - jawbone4_count, \n",
" round((jawbone3_count - jawbone4_count) / jawbone3_count * 100, 2)\n", 208 246 " round((jawbone3_count - jawbone4_count) / jawbone3_count * 100, 2)\n",
" )\n", 209 247 " )\n",
")" 210 248 ")"
] 211 249 ]
}, 212 250 },
{ 213 251 {
"cell_type": "markdown", 214 252 "cell_type": "markdown",
"metadata": {}, 215 253 "metadata": {},
"source": [ 216 254 "source": [
"## Find consecutive minute walks" 217 255 "## Find consecutive minute walks"
] 218 256 ]
}, 219 257 },
{ 220 258 {
"cell_type": "code", 221 259 "cell_type": "code",
"execution_count": 37, 222 260 "execution_count": 148,
"metadata": {}, 223 261 "metadata": {},
"outputs": [ 224 262 "outputs": [
{ 225 263 {
"name": "stdout", 226 264 "name": "stdout",
"output_type": "stream", 227 265 "output_type": "stream",
"text": [ 228 266 "text": [
"Iteration: 0, length: 377396\n", 229 267 "Iteration: 0, length: 377396\n",
"Iteration: 1, length: 229752\n", 230 268 "Iteration: 1, length: 229752\n",
"Iteration: 2, length: 170648\n", 231 269 "Iteration: 2, length: 170648\n",
"Iteration: 3, length: 137484\n", 232 270 "Iteration: 3, length: 137484\n",
"Iteration: 4, length: 178268\n", 233 271 "Iteration: 4, length: 178268\n",
"Final, length: 94884\n" 234 272 "Final, length: 94884\n"
] 235 273 ]
} 236 274 }
], 237 275 ],
"source": [ 238 276 "source": [
"# prepare the data for the walk calculation\n", 239 277 "# prepare the data for the walk calculation\n",
"current_vector = jawbone4[[\"user\", \"local_date\", \"local_minute_index\", \"steps\"]]\n", 240 278 "current_vector = jawbone4[[\"user\", \"local_date\", \"local_minute_index\", \"steps\"]]\n",
"current_vector[\"add_count\"] = 1\n", 241 279 "current_vector[\"add_count\"] = 1\n",
"current_vector = current_vector[current_vector[\"steps\"] > MINIMUM_STEPS_PER_MINUTE]\n", 242 280 "current_vector = current_vector[current_vector[\"steps\"] > MINIMUM_STEPS_PER_MINUTE]\n",
"current_vector = current_vector[[\"user\", \"local_date\", \"local_minute_index\", \"add_count\"]]\n", 243 281 "current_vector = current_vector[[\"user\", \"local_date\", \"local_minute_index\", \"add_count\"]]\n",
"\n", 244 282 "\n",
"# define an iterative walk calculation (merging consecutive active minutes)\n", 245
"def calculate_walk(cv):\n", 246
" nv = cv.copy(deep=True)\n", 247
" nv[\"prev_minute_index\"] = nv[\"local_minute_index\"] - 1\n", 248
"\n", 249
" # move midnight minutes to previous day\n", 250
" nv[nv[\"prev_minute_index\"] < 0][\"local_date\"] -= timedelta(days=1)\n", 251
" nv[nv[\"prev_minute_index\"] < 0][\"prev_minute_index\"] = 1439\n", 252
" \n", 253
" nv = nv[[\"user\", \"local_date\", \"prev_minute_index\"]]\n", 254
" jv = cv.merge(nv, left_on=[\"user\", \"local_date\", \"local_minute_index\"], right_on=[\"user\", \"local_date\", \"prev_minute_index\"], how=\"inner\")\n", 255
" jv[\"add_count\"] += 1\n", 256
" jv = jv[[\"user\", \"local_date\", \"local_minute_index\", \"add_count\"]]\n", 257
"\n", 258
" return jv \n", 259
"\n", 260
"\n", 261
"# iteratively calculate the walk\n", 262 283 "# iteratively calculate the walk\n",
"for i in range(0, MINIMUM_NUMBER_OF_MINUTES_FOR_A_WALK):\n", 263 284 "for i in range(0, MINIMUM_NUMBER_OF_MINUTES_FOR_A_WALK):\n",
" print(\"Iteration: {}, length: {}\".format(i, current_vector.size))\n", 264 285 " print(\"Iteration: {}, length: {}\".format(i, current_vector.size))\n",
" new_vector = calculate_walk(current_vector)\n", 265 286 " new_vector = calculate_walk(current_vector)\n",
" current_vector = new_vector\n", 266 287 " current_vector = new_vector\n",
"\n", 267 288 "\n",
"print(\"Final, length: {}\".format(current_vector.size))\n", 268 289 "print(\"Final, length: {}\".format(current_vector.size))\n",
"\n", 269 290 "\n",
"consecutive_minutes = current_vector[[\"user\", \"local_date\", \"local_minute_index\"]].drop_duplicates()" 270 291 "consecutive_minutes = current_vector[[\"user\", \"local_date\", \"local_minute_index\"]].drop_duplicates()"
] 271 292 ]
}, 272 293 },
{ 273 294 {
"cell_type": "markdown", 274 295 "cell_type": "markdown",
"metadata": {}, 275 296 "metadata": {},
"source": [ 276 297 "source": [
"## Map consecutive minutes to 1hr and 3hr units" 277 298 "## Map consecutive minutes to 1hr and 3hr units"
] 278 299 ]
}, 279 300 },
{ 280 301 {
"cell_type": "code", 281 302 "cell_type": "code",
"execution_count": 111, 282 303 "execution_count": 149,
"metadata": {}, 283 304 "metadata": {},
"outputs": [], 284 305 "outputs": [],
"source": [ 285 306 "source": [
"# calculate hour index and three hour index\n", 286 307 "# calculate hour index and three hour index\n",
"consecutive_minutes[\"hour\"] = np.int_(np.floor(consecutive_minutes[\"local_minute_index\"] / 60))\n", 287 308 "consecutive_minutes[\"hour\"] = np.int_(np.floor(consecutive_minutes[\"local_minute_index\"] / 60))\n",
"consecutive_minutes[\"threehour\"] = np.int_(np.floor(consecutive_minutes[\"hour\"] / 3))\n", 288 309 "consecutive_minutes[\"threehour\"] = np.int_(np.floor(consecutive_minutes[\"hour\"] / 3))\n",
"\n", 289 310 "\n",
"# calculate the number of walks per user, per hour\n", 290 311 "# calculate the number of walks per user, per hour\n",
"walk_by_hours = consecutive_minutes[[\"user\", \"local_date\", \"hour\"]].drop_duplicates().reset_index()\n", 291 312 "walk_by_hours = consecutive_minutes[[\"user\", \"local_date\", \"hour\"]].drop_duplicates().reset_index()\n",
"walk_by_hours[\"walked\"] = 2\n", 292 313 "walk_by_hours[\"walked\"] = 2\n",
"\n", 293 314 "\n",
"# calculate the number of walks per user, per three hour\n", 294 315 "# calculate the number of walks per user, per three hour\n",
"walk_by_threehours = consecutive_minutes[[\"user\", \"local_date\", \"threehour\"]].drop_duplicates().reset_index()\n", 295 316 "walk_by_threehours = consecutive_minutes[[\"user\", \"local_date\", \"threehour\"]].drop_duplicates().reset_index()\n",
"walk_by_threehours[\"walked\"] = 2\n", 296 317 "walk_by_threehours[\"walked\"] = 2\n",
"\n", 297 318 "\n",
"# generate hour vector and three hour vector\n", 298 319 "# generate hour vector and three hour vector\n",
"hours = pd.DataFrame({\"hour\": range(0,24)})\n", 299 320 "hours = pd.DataFrame({\"hour\": range(0,24)})\n",
"threehours = pd.DataFrame({\"threehour\": range(0, 8)})\n", 300 321 "threehours = pd.DataFrame({\"threehour\": range(0, 8)})\n",
"\n", 301 322 "\n",
"# generate complete product of vectors\n", 302 323 "# generate complete product dataframe\n",
"def product_df(mat1, mat2):\n", 303
" mat1 = mat1.drop_duplicates()\n", 304
" mat2 = mat2.drop_duplicates()\n", 305
"\n", 306
" temp = pd.DataFrame(list(product(mat1.values, mat2.values)))\n", 307
" for i, acol in enumerate(mat1.columns):\n", 308
" temp[acol] = temp[0].apply(lambda x: x[i])\n", 309
" for i, acol in enumerate(mat2.columns):\n", 310
" temp[acol] = temp[1].apply(lambda x: x[i])\n", 311
" temp = temp.drop(columns=[0, 1])\n", 312
" return temp\n", 313
"\n", 314
"measured_hour = product_df(walk_by_hours[[\"user\", \"local_date\"]], hours[[\"hour\"]])\n", 315 324 "measured_hour = product_df(walk_by_hours[[\"user\", \"local_date\"]], hours[[\"hour\"]])\n",
"measured_threehour = product_df(walk_by_threehours[[\"user\", \"local_date\"]], threehours[[\"threehour\"]])\n", 316 325 "measured_threehour = product_df(walk_by_threehours[[\"user\", \"local_date\"]], threehours[[\"threehour\"]])\n",
"\n", 317 326 "\n",
"# pad the hourly walk data (fill in missing hours with 1s)\n", 318 327 "# pad the hourly walk data (fill in missing hours with 1s)\n",
"padded_hours = walk_by_hours.merge(measured_hour, on=[\"user\", \"local_date\", \"hour\"], how=\"right\")\n", 319 328 "padded_hours = walk_by_hours.merge(measured_hour, on=[\"user\", \"local_date\", \"hour\"], how=\"right\")\n",
"padded_hours = padded_hours[[\"user\", \"local_date\", \"hour\", \"walked\"]]\n", 320 329 "padded_hours = padded_hours[[\"user\", \"local_date\", \"hour\", \"walked\"]]\n",
"padded_hours = padded_hours.fillna(1)\n", 321 330 "padded_hours = padded_hours.fillna(1)\n",
"\n", 322 331 "\n",
"# pad the walk data with 3 hours unit (fill in missing hours with 1s)\n", 323 332 "# pad the walk data with 3 hours unit (fill in missing hours with 1s)\n",
"padded_threehours = walk_by_threehours.merge(measured_threehour, on=[\"user\", \"local_date\", \"threehour\"], how=\"right\")\n", 324 333 "padded_threehours = walk_by_threehours.merge(measured_threehour, on=[\"user\", \"local_date\", \"threehour\"], how=\"right\")\n",
"padded_threehours = padded_threehours[[\"user\", \"local_date\", \"threehour\", \"walked\"]]\n", 325 334 "padded_threehours = padded_threehours[[\"user\", \"local_date\", \"threehour\", \"walked\"]]\n",
"padded_threehours = padded_threehours.fillna(1)" 326 335 "padded_threehours = padded_threehours.fillna(1)"
] 327 336 ]
}, 328 337 },
{ 329 338 {
"cell_type": "markdown", 330 339 "cell_type": "markdown",
"metadata": {}, 331 340 "metadata": {},
"source": [ 332 341 "source": [
"## Pad unmeasured missing data with 0s\n", 333 342 "## Pad unmeasured missing data with 0s\n",
"\n", 334 343 "\n",
"For three hour window data, since it works as output vector, 0 padding is not done. Instead, missing output data is not used for training." 335 344 "For three hour window data, since it works as output vector, 0 padding is not done. Instead, missing output data is not used for training."
] 336 345 ]
}, 337 346 },
{ 338 347 {
"cell_type": "code", 339 348 "cell_type": "code",
"execution_count": 141, 340 349 "execution_count": 150,
"metadata": {}, 341 350 "metadata": {},
"outputs": [], 342 351 "outputs": [],
"source": [ 343 352 "source": [
"# generate start and end date for each user\n", 344 353 "# generate start and end date for each user\n",
"start_date = padded_hours.groupby([\"user\"])[\"local_date\"].min()\n", 345 354 "start_date = padded_hours.groupby([\"user\"])[\"local_date\"].min()\n",
"end_date = padded_hours.groupby([\"user\"])[\"local_date\"].max()\n", 346 355 "end_date = padded_hours.groupby([\"user\"])[\"local_date\"].max()\n",
"\n", 347 356 "\n",
"# generate the user list\n", 348 357 "# generate the user list\n",
"users = start_date.index\n", 349 358 "users = start_date.index\n",
"all_dates = pd.DataFrame({\"user\": [], \"local_date\": []})\n", 350 359 "all_dates = pd.DataFrame({\"user\": [], \"local_date\": []})\n",
"\n", 351 360 "\n",
"def date_range(start_date, end_date):\n", 352
" delta = end_date - start_date\n", 353
"\n", 354
" for i in range(delta.days + 1):\n", 355
" yield start_date + timedelta(days=i)\n", 356
"\n", 357
"# generate the lists of dates between start and end date\n", 358 361 "# generate the lists of dates between start and end date\n",
"for userid in users:\n", 359 362 "for userid in users:\n",
" current_user_dates = pd.DataFrame({\"user\": userid, \"local_date\": list(date_range(start_date[userid], end_date[userid]))})\n", 360 363 " current_user_dates = pd.DataFrame({\"user\": userid, \"local_date\": list(date_range(start_date[userid], end_date[userid]))})\n",
" \n", 361 364 " \n",
" all_dates = pd.concat([all_dates, current_user_dates])\n", 362 365 " all_dates = pd.concat([all_dates, current_user_dates])\n",
"\n", 363 366 "\n",
"# generate the base vector for the padding\n", 364 367 "# generate the base vector for the padding\n",
"all_dates_hour = product_df(all_dates[[\"user\", \"local_date\"]], hours[[\"hour\"]])\n", 365 368 "all_dates_hour = product_df(all_dates[[\"user\", \"local_date\"]], hours[[\"hour\"]])\n",
"\n", 366 369 "\n",
"# final padded gait data\n", 367 370 "# final padded gait data\n",
"padded_hours = padded_hours.merge(all_dates_hour, on=[\"user\", \"local_date\", \"hour\"], how=\"right\").fillna(0)\n", 368 371 "padded_hours = padded_hours.merge(all_dates_hour, on=[\"user\", \"local_date\", \"hour\"], how=\"right\").fillna(0)\n"
"\n", 369 372 ]
"\n", 370 373 },
374 {
375 "cell_type": "markdown",
376 "metadata": {},
377 "source": [
378 "# Saving Data"
379 ]
380 },
381 {
382 "cell_type": "code",
383 "execution_count": 151,
384 "metadata": {},
385 "outputs": [],
386 "source": [
"# save the data\n", 371 387 "# save the data\n",
"padded_hours.to_csv(os.path.join(data_dir, \"padded_hours.csv\"), index=False)\n", 372 388 "padded_hours.to_csv(os.path.join(data_dir, \"padded_hours.csv\"), index=False)\n",
"padded_threehours.to_csv(os.path.join(data_dir, \"padded_threehours.csv\"), index=False)" 373 389 "padded_threehours.to_csv(os.path.join(data_dir, \"padded_threehours.csv\"), index=False)"
] 374 390 ]
} 375 391 }
], 376 392 ],
"metadata": { 377 393 "metadata": {
"interpreter": { 378 394 "interpreter": {
"hash": "80dbe1014b4652684caa329d41db00af3ae439be86b11eab7e35b518e5d8ab1a" 379 395 "hash": "80dbe1014b4652684caa329d41db00af3ae439be86b11eab7e35b518e5d8ab1a"
}, 380 396 },
"kernelspec": { 381 397 "kernelspec": {
"display_name": "Python 3.7.9 64-bit ('venv': venv)", 382 398 "display_name": "Python 3.7.9 64-bit ('venv': venv)",
"language": "python", 383 399 "language": "python",
"name": "python3" 384 400 "name": "python3"
}, 385 401 },
"language_info": { 386 402 "language_info": {
"codemirror_mode": { 387 403 "codemirror_mode": {
"name": "ipython", 388 404 "name": "ipython",
"version": 3 389 405 "version": 3
}, 390 406 },
"file_extension": ".py", 391 407 "file_extension": ".py",
"mimetype": "text/x-python", 392 408 "mimetype": "text/x-python",
"name": "python", 393 409 "name": "python",
"nbconvert_exporter": "python", 394 410 "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", 395 411 "pygments_lexer": "ipython3",
"version": "3.7.9" 396 412 "version": "3.7.9"
}, 397 413 },
"orig_nbformat": 4 398 414 "orig_nbformat": 4
}, 399 415 },
"nbformat": 4, 400 416 "nbformat": 4,
"nbformat_minor": 2 401 417 "nbformat_minor": 2
} 402 418 }
403 419