Commit 1ba50105eb005cf94e0450393bb5160c369a57a5

Authored by Junghwan Park
1 parent a5d4a032f6
Exists in main

added walk hour vector / hourly and per three hours

Showing 1 changed file with 211 additions and 23 deletions Inline Diff

python-notebook/data_loading.ipynb View file @ 1ba5010
{ 1 1 {
"cells": [ 2 2 "cells": [
{ 3 3 {
"cell_type": "markdown", 4 4 "cell_type": "markdown",
"metadata": {}, 5 5 "metadata": {},
"source": [ 6 6 "source": [
"# Loading libraries" 7 7 "# Loading libraries"
] 8 8 ]
}, 9 9 },
{ 10 10 {
"cell_type": "code", 11 11 "cell_type": "code",
"execution_count": 3, 12 12 "execution_count": 17,
"metadata": {}, 13 13 "metadata": {},
"outputs": [], 14 14 "outputs": [],
"source": [ 15 15 "source": [
"import numpy as np\n", 16 16 "import numpy as np\n",
"import matplotlib.pyplot as plt\n", 17 17 "import matplotlib.pyplot as plt\n",
"import seaborn as sns\n", 18 18 "import seaborn as sns\n",
"from pandas import read_csv\n", 19 19 "from pandas import read_csv\n",
"import pandas as pd\n", 20 20 "import pandas as pd\n",
"import os\n", 21 21 "import os\n",
"from datetime import datetime, date\n", 22 22 "from datetime import datetime, date, timedelta\n",
23 "from itertools import product\n",
"# %load_ext line_profiler" 23 24 "# %load_ext line_profiler"
] 24 25 ]
}, 25 26 },
{ 26 27 {
"cell_type": "markdown", 27 28 "cell_type": "markdown",
"metadata": {}, 28 29 "metadata": {},
"source": [ 29 30 "source": [
"# Defining Functions and Adjusting Settings" 30 31 "# Defining Functions and Adjusting Settings"
] 31 32 ]
}, 32 33 },
{ 33 34 {
"cell_type": "code", 34 35 "cell_type": "code",
"execution_count": 4, 35 36 "execution_count": 36,
"metadata": {}, 36 37 "metadata": {},
"outputs": [], 37 38 "outputs": [],
"source": [ 38 39 "source": [
"pd.options.mode.chained_assignment = None\n", 39 40 "pd.options.mode.chained_assignment = None\n",
"\n", 40 41 "\n",
"def get_date(x):\n", 41 42 "def get_date(x):\n",
" return date(x.year, x.month, x.day)\n", 42 43 " return date(x.year, x.month, x.day)\n",
"\n", 43 44 "\n",
"def get_minute_index(x):\n", 44 45 "def get_minute_index(x):\n",
" return (x.hour * 60) + x.minute" 45 46 " return (x.hour * 60) + x.minute\n",
47 "\n",
48 "\n",
49 "# users with fewer distinct days of data than this are removed from the analysis\n",
50 "THRESHOLD_OF_DAYS_PER_USER = 10\n",
51 "\n",
52 "# walk length cut-off in consecutive active minutes; sets the number of merge passes in the walk search\n",
53 "MINIMUM_NUMBER_OF_MINUTES_FOR_A_WALK = 5\n",
54 "\n",
55 "# a minute counts as active when its step count is strictly greater than this value\n",
56 "MINIMUM_STEPS_PER_MINUTE = 60\n"
] 46 57 ]
}, 47 58 },
{ 48 59 {
"cell_type": "markdown", 49 60 "cell_type": "markdown",
"metadata": {}, 50 61 "metadata": {},
"source": [ 51 62 "source": [
"# Loading data files" 52 63 "# Loading data files"
] 53 64 ]
}, 54 65 },
{ 55 66 {
"cell_type": "code", 56 67 "cell_type": "code",
"execution_count": 5, 57 68 "execution_count": 5,
"metadata": {}, 58 69 "metadata": {},
"outputs": [], 59 70 "outputs": [],
"source": [ 60 71 "source": [
"data_dir = '../data'\n", 61 72 "data_dir = '../data'\n",
"\n", 62 73 "\n",
"daily = read_csv(os.path.join(data_dir, 'daily.csv'))\n", 63 74 "daily = read_csv(os.path.join(data_dir, 'daily.csv'))\n",
"dose = read_csv(os.path.join(data_dir, 'dose.csv'))\n", 64 75 "dose = read_csv(os.path.join(data_dir, 'dose.csv'))\n",
"jawbone = read_csv(os.path.join(data_dir, 'jawbone.csv'), low_memory=False)\n" 65 76 "jawbone = read_csv(os.path.join(data_dir, 'jawbone.csv'), low_memory=False)\n"
] 66 77 ]
}, 67 78 },
{ 68 79 {
"cell_type": "markdown", 69 80 "cell_type": "markdown",
"metadata": {}, 70 81 "metadata": {},
"source": [ 71 82 "source": [
"# Preprocessing\n", 72 83 "# Preprocessing\n",
"## Picking up the variables" 73 84 "## Picking up the variables"
] 74 85 ]
}, 75 86 },
{ 76 87 {
"cell_type": "code", 77 88 "cell_type": "code",
"execution_count": 6, 78 89 "execution_count": 6,
"metadata": {}, 79 90 "metadata": {},
"outputs": [], 80 91 "outputs": [],
"source": [ 81 92 "source": [
"# Column names of jawbone data\n", 82 93 "# Column names of jawbone data\n",
"# 'Var1', 'user', 'start_datetime', 'end_datetime', 'timezone', 'userid',\n", 83 94 "# 'Var1', 'user', 'start_datetime', 'end_datetime', 'timezone', 'userid',\n",
"# 'steps', 'gmtoff', 'tz', 'start_date', 'end_date', 'start_utime',\n", 84 95 "# 'steps', 'gmtoff', 'tz', 'start_date', 'end_date', 'start_utime',\n",
"# 'end_utime', 'start_udate', 'end_udate', 'intake_date', 'intake_utime',\n", 85 96 "# 'end_utime', 'start_udate', 'end_udate', 'intake_date', 'intake_utime',\n",
"# 'intake_tz', 'intake_gmtoff', 'intake_hour', 'intake_min',\n", 86 97 "# 'intake_tz', 'intake_gmtoff', 'intake_hour', 'intake_min',\n",
"# 'intake_slot', 'travel_start', 'travel_end', 'exit_date',\n", 87 98 "# 'intake_slot', 'travel_start', 'travel_end', 'exit_date',\n",
"# 'dropout_date', 'last_date', 'last_utime', 'last_tz', 'last_gmtoff',\n", 88 99 "# 'dropout_date', 'last_date', 'last_utime', 'last_tz', 'last_gmtoff',\n",
"# 'last_hour', 'last_min', 'start_utime_local', 'end_utime_local'\n", 89 100 "# 'last_hour', 'last_min', 'start_utime_local', 'end_utime_local'\n",
"\n", 90 101 "\n",
"\n", 91 102 "\n",
"# duplicate jawbone data\n", 92 103 "# duplicate jawbone data\n",
"jawbone2 = jawbone.copy(deep=True)\n", 93 104 "jawbone2 = jawbone.copy(deep=True)\n",
"\n", 94 105 "\n",
"# convert string datetimes to actual datetime objects\n", 95 106 "# convert string datetimes to actual datetime objects\n",
"jawbone2[\"start_utime_local\"] = pd.to_datetime(\n", 96 107 "jawbone2[\"start_utime_local\"] = pd.to_datetime(\n",
" jawbone2[\"start_utime_local\"], format=\"%Y-%m-%d %H:%M:%S\")\n", 97 108 " jawbone2[\"start_utime_local\"], format=\"%Y-%m-%d %H:%M:%S\")\n",
"jawbone2[\"start_datetime\"] = pd.to_datetime(\n", 98 109 "jawbone2[\"start_datetime\"] = pd.to_datetime(\n",
" jawbone2[\"start_datetime\"], format=\"%Y-%m-%d %H:%M:%S\")\n", 99 110 " jawbone2[\"start_datetime\"], format=\"%Y-%m-%d %H:%M:%S\")\n",
"\n", 100 111 "\n",
"# calculate the timezone offset\n", 101 112 "# calculate the timezone offset\n",
"jawbone2[\"tz_offset\"] = jawbone2[\"start_datetime\"] - \\\n", 102 113 "jawbone2[\"tz_offset\"] = jawbone2[\"start_datetime\"] - \\\n",
" jawbone2[\"start_utime_local\"]\n", 103 114 " jawbone2[\"start_utime_local\"]\n",
"\n", 104 115 "\n",
"\n", 105 116 "\n",
"# selecting only important columns\n", 106 117 "# selecting only important columns\n",
"jawbone3 = jawbone2[[\"user\", \"start_utime_local\",\n", 107 118 "jawbone3 = jawbone2[[\"user\", \"start_utime_local\",\n",
" \"end_utime_local\", \"tz_offset\", \"steps\"]]\n", 108 119 " \"end_utime_local\", \"tz_offset\", \"steps\"]]\n",
"\n", 109 120 "\n",
"# picking up the local date\n", 110 121 "# picking up the local date\n",
"jawbone3[\"local_date\"] = jawbone3[\"start_utime_local\"].apply(get_date)\n", 111 122 "jawbone3[\"local_date\"] = jawbone3[\"start_utime_local\"].apply(get_date)\n",
"\n", 112 123 "\n",
"# picking up the local minute index\n", 113 124 "# picking up the local minute index\n",
"jawbone3[\"local_minute_index\"] = jawbone3[\"start_utime_local\"].apply(\n", 114 125 "jawbone3[\"local_minute_index\"] = jawbone3[\"start_utime_local\"].apply(\n",
" get_minute_index)\n" 115 126 " get_minute_index)\n"
] 116 127 ]
}, 117 128 },
{ 118 129 {
"cell_type": "markdown", 119 130 "cell_type": "markdown",
"metadata": {}, 120 131 "metadata": {},
"source": [ 121 132 "source": [
"## Making a key info database" 122 133 "## Making a key info database"
] 123 134 ]
}, 124 135 },
{ 125 136 {
"cell_type": "code", 126 137 "cell_type": "code",
"execution_count": 7, 127 138 "execution_count": 7,
"metadata": {}, 128 139 "metadata": {},
"outputs": [], 129 140 "outputs": [],
"source": [ 130 141 "source": [
"# picking up the user - date data\n", 131 142 "# picking up the user - date data\n",
"user_date = jawbone3[[\"user\", \"local_date\"]].drop_duplicates()" 132 143 "user_date = jawbone3[[\"user\", \"local_date\"]].drop_duplicates()"
] 133 144 ]
}, 134 145 },
{ 135 146 {
"cell_type": "markdown", 136 147 "cell_type": "markdown",
"metadata": {}, 137 148 "metadata": {},
"source": [ 138 149 "source": [
"## Removing users with too small amount of data" 139 150 "## Removing users with too small amount of data"
] 140 151 ]
}, 141 152 },
{ 142 153 {
"cell_type": "code", 143 154 "cell_type": "code",
"execution_count": 12, 144 155 "execution_count": 13,
"metadata": {}, 145 156 "metadata": {},
"outputs": [ 146 157 "outputs": [
{ 147 158 {
"name": "stdout", 148 159 "name": "stdout",
"output_type": "stream", 149 160 "output_type": "stream",
"text": [ 150 161 "text": [
"Threshold: 10\n", 151 162 "Threshold: 10\n",
"Users to be removed:[12, 36, 38]\n", 152 163 "Users to be removed:[12, 36, 38]\n",
"Shape Change: 258889 -> 258363 (-526, -0.2%)\n" 153 164 "Shape Change: 258889 -> 258363 (-526, -0.2%)\n"
] 154 165 ]
}, 155 166 },
{ 156 167 {
"data": { 157 168 "data": {
"image/png": "", 158 169 "image/png": "",
"text/plain": [ 159 170 "text/plain": [
"<Figure size 432x288 with 1 Axes>" 160 171 "<Figure size 432x288 with 1 Axes>"
] 161 172 ]
}, 162 173 },
"metadata": {}, 163 174 "metadata": {},
"output_type": "display_data" 164 175 "output_type": "display_data"
} 165 176 }
], 166 177 ],
"source": [ 167 178 "source": [
"# making a stat of the number of days per user\n", 168 179 "# making a stat of the number of days per user\n",
"stat_user = user_date.groupby(['user'])['local_date'].nunique().sort_values()\n", 169 180 "stat_user = user_date.groupby(['user'])['local_date'].nunique().sort_values()\n",
"\n", 170 181 "\n",
"ax = plt.figure()\n", 171 182 "ax = plt.figure()\n",
"ax.patch.set_facecolor('white')\n", 172 183 "ax.patch.set_facecolor('white')\n",
"ax = sns.histplot(stat_user)\n", 173 184 "ax = sns.histplot(stat_user)\n",
"ax.set_title('Distribution of number of days per user')\n", 174 185 "ax.set_title('Distribution of number of days per user')\n",
"ax.set_xlabel('Number of days')\n", 175 186 "ax.set_xlabel('Number of days')\n",
"ax.set_ylabel('Frequency')\n", 176 187 "ax.set_ylabel('Frequency')\n",
"\n", 177 188 "\n",
"# cut off values that are not in the range of the data\n", 178
"THRESHOLD_OF_DAYS_PER_USER = 10\n", 179
"\n", 180 189 "\n",
"# filter out users that have less days of data than THRESHOLD_OF_DAYS_PER_USER\n", 181 190 "# filter out users that have less days of data than THRESHOLD_OF_DAYS_PER_USER\n",
"users_to_be_removed = stat_user[stat_user < THRESHOLD_OF_DAYS_PER_USER].index\n", 182 191 "users_to_be_removed = stat_user[stat_user < THRESHOLD_OF_DAYS_PER_USER].index\n",
"\n", 183 192 "\n",
"print(\"Threshold: {}\".format(THRESHOLD_OF_DAYS_PER_USER))\n", 184 193 "print(\"Threshold: {}\".format(THRESHOLD_OF_DAYS_PER_USER))\n",
"print(\"Users to be removed:{}\".format(list(users_to_be_removed)))\n", 185 194 "print(\"Users to be removed:{}\".format(list(users_to_be_removed)))\n",
"\n", 186 195 "\n",
"jawbone4 = jawbone3[~jawbone3[\"user\"].isin(users_to_be_removed)]\n", 187 196 "jawbone4 = jawbone3[~jawbone3[\"user\"].isin(users_to_be_removed)]\n",
"\n", 188 197 "\n",
198 "user_date2 = user_date[~user_date[\"user\"].isin(users_to_be_removed)]\n",
"\n", 189 199 "\n",
"# printing the amount of data removed\n", 190 200 "# printing the amount of data removed\n",
"jawbone3_count, _ = jawbone3.shape\n", 191 201 "jawbone3_count, _ = jawbone3.shape\n",
"jawbone4_count, _ = jawbone4.shape\n", 192 202 "jawbone4_count, _ = jawbone4.shape\n",
"\n", 193 203 "\n",
"print(\"Shape Change: {} -> {} (-{}, -{}%)\".format(\n", 194 204 "print(\"Shape Change: {} -> {} (-{}, -{}%)\".format(\n",
" jawbone3_count, \n", 195 205 " jawbone3_count, \n",
" jawbone4_count, \n", 196 206 " jawbone4_count, \n",
" jawbone3_count - jawbone4_count, \n", 197 207 " jawbone3_count - jawbone4_count, \n",
" round((jawbone3_count - jawbone4_count) / jawbone3_count * 100, 2)\n", 198 208 " round((jawbone3_count - jawbone4_count) / jawbone3_count * 100, 2)\n",
" )\n", 199 209 " )\n",
")" 200 210 ")"
] 201 211 ]
}, 202 212 },
{ 203 213 {
214 "cell_type": "markdown",
215 "metadata": {},
216 "source": [
217 "## Find consecutive minute walks"
218 ]
219 },
220 {
"cell_type": "code", 204 221 "cell_type": "code",
"execution_count": null, 205 222 "execution_count": 37,
"metadata": {}, 206 223 "metadata": {},
"outputs": [ 207 224 "outputs": [
{ 208 225 {
"ename": "NameError", 209 226 "name": "stdout",
"evalue": "name 'users' is not defined", 210 227 "output_type": "stream",
"output_type": "error", 211 228 "text": [
"traceback": [ 212 229 "Iteration: 0, length: 377396\n",
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 213 230 "Iteration: 1, length: 229752\n",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", 214 231 "Iteration: 2, length: 170648\n",
"\u001b[0;32m/var/folders/m6/l3x11zj94l3dp3wnxy1vnscc0000gn/T/ipykernel_50945/4152346818.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mstandard_minute_index\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSeries\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"local_minute_index\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0marange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1440\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0ma_user\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0musers\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0ma_date\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0muser_date2\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlocal_date\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 215 232 "Iteration: 3, length: 137484\n",
"\u001b[0;31mNameError\u001b[0m: name 'users' is not defined" 216 233 "Iteration: 4, length: 178268\n",
234 "Final, length: 94884\n"
] 217 235 ]
} 218 236 }
], 219 237 ],
"source": [ 220 238 "source": [
239 "# seed the walk search: one row per active minute, each starting with add_count = 1\n",
240 "current_vector = (\n",
241 "    jawbone4.loc[jawbone4[\"steps\"] > MINIMUM_STEPS_PER_MINUTE,\n",
242 "                 [\"user\", \"local_date\", \"local_minute_index\"]]\n",
243 "    .assign(add_count=1))\n",
244 "\n",
245 "# define an iterative walk calculation (merging consecutive active minutes)\n",
246 "def calculate_walk(cv):\n",
247 "    \"\"\"Keep the rows of `cv` whose following minute (same user) is also in `cv`.\n",
248 "\n",
249 "    Returns user, local_date, local_minute_index and add_count (raised by 1 per call).\n",
250 "    \"\"\"\n",
251 "    nv = cv[[\"user\", \"local_date\", \"local_minute_index\"]].drop_duplicates()\n",
252 "    nv[\"prev_minute_index\"] = nv[\"local_minute_index\"] - 1\n",
253 "    # minute 0 follows 23:59 of the previous day; .loc writes into nv itself, whereas\n",
254 "    # chained indexing (nv[mask][col] = ...) would only modify a temporary copy\n",
255 "    wrapped = nv[\"prev_minute_index\"] < 0\n",
256 "    nv.loc[wrapped, \"local_date\"] = nv.loc[wrapped, \"local_date\"] - timedelta(days=1)\n",
257 "    nv.loc[wrapped, \"prev_minute_index\"] = 1439\n",
258 "    jv = cv.merge(nv[[\"user\", \"local_date\", \"prev_minute_index\"]], left_on=[\"user\", \"local_date\", \"local_minute_index\"], right_on=[\"user\", \"local_date\", \"prev_minute_index\"], how=\"inner\")\n",
259 "    return jv.assign(add_count=jv[\"add_count\"] + 1)[[\"user\", \"local_date\", \"local_minute_index\", \"add_count\"]]\n",
260 "\n",
261 "\n",
262 "# each pass demands one more consecutive active minute, so N-minute walks need N - 1 passes\n",
263 "for i in range(MINIMUM_NUMBER_OF_MINUTES_FOR_A_WALK - 1):\n",
264 "    # len() counts rows; DataFrame.size would report rows x columns\n",
265 "    print(\"Iteration: {}, length: {}\".format(i, len(current_vector)))\n",
266 "    current_vector = calculate_walk(current_vector)\n",
267 "\n",
268 "print(\"Final, length: {}\".format(len(current_vector)))\n",
269 "\n",
270 "consecutive_minutes = current_vector[[\"user\", \"local_date\", \"local_minute_index\"]].drop_duplicates()"
271 ]
272 },
273 {
274 "cell_type": "markdown",
275 "metadata": {},
276 "source": [
277 "## Map consecutive minutes to 1hr and 3hr units"
278 ]
279 },
280 {
281 "cell_type": "code",
282 "execution_count": 108,
283 "metadata": {},
284 "outputs": [
285 {
286 "name": "stdout",
287 "output_type": "stream",
288 "text": [
289 " index user local_date hour walked\n",
290 "0 0 1 2015-07-22 8 2\n",
291 "1 3 1 2015-07-22 18 2\n",
292 "2 10 1 2015-07-22 19 2\n",
293 "3 30 1 2015-07-23 8 2\n",
294 "4 50 1 2015-07-23 9 2\n",
295 " user local_date hour walked\n",
296 "0 1 2015-07-22 0 1.0\n",
297 "1 1 2015-07-22 1 1.0\n",
298 "2 1 2015-07-22 2 1.0\n",
299 "3 1 2015-07-22 3 1.0\n",
300 "4 1 2015-07-22 4 1.0\n",
301 "5 1 2015-07-22 5 1.0\n",
302 "6 1 2015-07-22 6 1.0\n",
303 "7 1 2015-07-22 7 1.0\n",
304 "8 1 2015-07-22 8 2.0\n",
305 "9 1 2015-07-22 9 1.0\n",
306 "10 1 2015-07-22 10 1.0\n",
307 "11 1 2015-07-22 11 1.0\n",
308 "12 1 2015-07-22 12 1.0\n",
309 "13 1 2015-07-22 13 1.0\n",
310 "14 1 2015-07-22 14 1.0\n",
311 "15 1 2015-07-22 15 1.0\n",
312 "16 1 2015-07-22 16 1.0\n",
313 "17 1 2015-07-22 17 1.0\n",
314 "18 1 2015-07-22 18 2.0\n",
315 "19 1 2015-07-22 19 2.0\n",
316 "20 1 2015-07-22 20 1.0\n",
317 "21 1 2015-07-22 21 1.0\n",
318 "22 1 2015-07-22 22 1.0\n",
319 "23 1 2015-07-22 23 1.0\n",
320 "24 1 2015-07-23 0 1.0\n",
321 "25 1 2015-07-23 1 1.0\n",
322 "26 1 2015-07-23 2 1.0\n",
323 "27 1 2015-07-23 3 1.0\n",
324 "28 1 2015-07-23 4 1.0\n",
325 "29 1 2015-07-23 5 1.0\n",
326 "30 1 2015-07-23 6 1.0\n",
327 "31 1 2015-07-23 7 1.0\n",
328 "32 1 2015-07-23 8 2.0\n",
329 "33 1 2015-07-23 9 2.0\n",
330 "34 1 2015-07-23 10 1.0\n",
331 "35 1 2015-07-23 11 1.0\n",
332 "36 1 2015-07-23 12 1.0\n",
333 "37 1 2015-07-23 13 1.0\n",
334 "38 1 2015-07-23 14 1.0\n",
335 "39 1 2015-07-23 15 1.0\n",
336 "40 1 2015-07-23 16 1.0\n",
337 "41 1 2015-07-23 17 1.0\n",
338 "42 1 2015-07-23 18 1.0\n",
339 "43 1 2015-07-23 19 1.0\n",
340 "44 1 2015-07-23 20 1.0\n",
341 "45 1 2015-07-23 21 1.0\n",
342 "46 1 2015-07-23 22 1.0\n",
343 "47 1 2015-07-23 23 1.0\n"
344 ]
345 }
346 ],
347 "source": [
348 "# derive the hour (0-23) and three-hour block (0-7) of every walk minute\n",
349 "consecutive_minutes[\"hour\"] = consecutive_minutes[\"local_minute_index\"] // 60\n",
350 "consecutive_minutes[\"threehour\"] = consecutive_minutes[\"hour\"] // 3\n",
351 "\n",
352 "# one row per (user, date, hour) that contains a walk; drop=True keeps the old\n",
353 "# row labels from leaking in as an extra \"index\" column\n",
354 "walk_by_hours = consecutive_minutes[[\"user\", \"local_date\", \"hour\"]].drop_duplicates().reset_index(drop=True)\n",
355 "walk_by_hours[\"walked\"] = 2\n",
356 "\n",
357 "# same flag per (user, date, three-hour block)\n",
358 "walk_by_threehours = consecutive_minutes[[\"user\", \"local_date\", \"threehour\"]].drop_duplicates().reset_index(drop=True)\n",
359 "walk_by_threehours[\"walked\"] = 2\n",
360 "\n",
361 "# NOTE(review): walked is 2 here and missing slots are later filled with 1 -\n",
362 "# confirm this 2/1 encoding is intended rather than 1/0\n",
363 "\n",
364 "# every possible hour / three-hour slot, used to pad slots without a walk\n",
365 "hours = pd.DataFrame({\"hour\": range(0, 24)})\n",
366 "threehours = pd.DataFrame({\"threehour\": range(0, 8)})\n",
367 "\n",
368 "# generate complete product of vectors\n",
369 "def product_df(mat1, mat2):\n",
370 "    \"\"\"Return every pairing of the distinct rows of `mat1` with those of `mat2`.\n",
371 "\n",
372 "    The result has the columns of `mat1` followed by the columns of `mat2`, ordered\n",
373 "    `mat1`-row-major. Column dtypes are kept, and an empty input gives an empty frame.\n",
374 "    \"\"\"\n",
375 "    # constant-key merge = cross join; works on pandas versions without how=\"cross\"\n",
376 "    key = \"__product_key__\"\n",
377 "    left = mat1.drop_duplicates().assign(**{key: 1})\n",
378 "    right = mat2.drop_duplicates().assign(**{key: 1})\n",
379 "    return left.merge(right, on=key, how=\"inner\").drop(columns=[key])\n",
380 "\n",
381 "measured_hour = product_df(walk_by_hours[[\"user\", \"local_date\"]], hours[[\"hour\"]])\n",
382 "measured_threehour = product_df(walk_by_threehours[[\"user\", \"local_date\"]], threehours[[\"threehour\"]])\n",
383 "\n",
384 "print(walk_by_threehours)\n",
385 "\n",
386 "# right-merge onto the full grid so each (user, date, slot) gets a row; slots with\n",
387 "# no detected walk have no match and their NaN `walked` is filled with 1\n",
388 "padded_hours = walk_by_hours.merge(measured_hour, on=[\"user\", \"local_date\", \"hour\"], how=\"right\")\n",
389 "padded_hours = padded_hours[[\"user\", \"local_date\", \"hour\", \"walked\"]]\n",
390 "padded_hours = padded_hours.fillna(1)\n",
391 "\n",
392 "padded_threehours = walk_by_threehours.merge(measured_threehour, on=[\"user\", \"local_date\", \"threehour\"], how=\"right\")\n",
393 "padded_threehours = padded_threehours[[\"user\", \"local_date\", \"threehour\", \"walked\"]]\n",
394 "padded_threehours = padded_threehours.fillna(1)\n",
395 "\n",
396 "print(padded_threehours)\n",
397 "\n",
398 "# walk_by_hour = consecutive_minutes.groupby([\"user\", \"local_date\", \"hour\"])[\"add_count\"].sum().reset_index()\n",
399 "# walk_by_threehour = consecutive_minutes.groupby([\"user\", \"local_date\", \"threehour\"])[\"add_count\"].sum().reset_index()\n",
400 "\n",
401 "# walk_by_hour[\"walked\"] = 1\n",
402 "# walk_by_threehour[\"walked\"] = 1\n",
403 "\n",
404 "\n",
405 "# hours2 = walk_by_hour.merge(hours, left_on=\"hour\", right_on=\"hour\", how=\"right\")\n"
406 ]
407 },
408 {
409 "cell_type": "code",
410 "execution_count": null,
411 "metadata": {},
412 "outputs": [],
413 "source": [
414 "\n",
415 "\n",
"standard_minute_index = pd.Series(name=\"local_minute_index\", data=np.arange(0, 1440, 1))\n", 221 416 "standard_minute_index = pd.Series(name=\"local_minute_index\", data=np.arange(0, 1440, 1))\n",
"\n", 222 417 "\n",
"a_user = users[0]\n", 223 418 "a_user = users[0]\n",
"a_date = user_date2.local_date[0]\n", 224 419 "a_date = user_date2.local_date[0]\n",
"\n", 225 420 "\n",
"a_jawbone3 = jawbone3.loc[(jawbone3.user == a_user) & (jawbone3.local_date == a_date), :]\n", 226 421 "a_jawbone3 = jawbone3.loc[(jawbone3.user == a_user) & (jawbone3.local_date == a_date), :]\n",
"\n", 227 422 "\n",
"vec = a_jawbone3[[\"local_minute_index\", \"steps\"]]\n", 228 423 "vec = a_jawbone3[[\"local_minute_index\", \"steps\"]]\n",
"\n", 229 424 "\n",
"steps = [0] * 1440\n", 230 425 "steps = [0] * 1440\n",
"\n", 231 426 "\n",
"for index, row in vec.iterrows():\n", 232 427 "for index, row in vec.iterrows():\n",
" steps[row.local_minute_index] += row.steps\n", 233 428 " steps[row.local_minute_index] += row.steps\n",
"\n", 234 429 "\n",
"print(steps)\n", 235 430 "print(steps)\n",
"steps_series = pd.Series(name=\"steps\", data=steps)\n", 236 431 "steps_series = pd.Series(name=\"steps\", data=steps)\n",
"steps_series[\"over60\"] = (steps_series > 60) * 1\n", 237 432 "steps_series[\"over60\"] = (steps_series > 60) * 1\n",
"\n", 238 433 "\n",
"steps_series[\"roll\"] = steps_series.rolling(window=5, min_periods=1).sum()\n", 239 434 "steps_series[\"roll\"] = steps_series.rolling(window=5, min_periods=1).sum()\n",
"\n", 240 435 "\n",
"steps_series.roll.plot()\n", 241 436 "steps_series.roll.plot()\n",
"\n", 242 437 "\n",
"\n" 243 438 "\n"
] 244 439 ]
}, 245
{ 246
"cell_type": "code", 247
"execution_count": null, 248
"metadata": {}, 249
"outputs": [], 250
"source": [] 251
} 252 440 }
], 253 441 ],
"metadata": { 254 442 "metadata": {
"interpreter": { 255 443 "interpreter": {
"hash": "80dbe1014b4652684caa329d41db00af3ae439be86b11eab7e35b518e5d8ab1a" 256 444 "hash": "80dbe1014b4652684caa329d41db00af3ae439be86b11eab7e35b518e5d8ab1a"
}, 257 445 },
"kernelspec": { 258 446 "kernelspec": {
"display_name": "Python 3.7.9 64-bit ('venv': venv)", 259 447 "display_name": "Python 3.7.9 64-bit ('venv': venv)",
"language": "python", 260 448 "language": "python",
"name": "python3" 261 449 "name": "python3"
}, 262 450 },
"language_info": { 263 451 "language_info": {
"codemirror_mode": { 264 452 "codemirror_mode": {
"name": "ipython", 265 453 "name": "ipython",
"version": 3 266 454 "version": 3
}, 267 455 },
"file_extension": ".py", 268 456 "file_extension": ".py",
"mimetype": "text/x-python", 269 457 "mimetype": "text/x-python",
"name": "python", 270 458 "name": "python",
"nbconvert_exporter": "python", 271 459 "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", 272 460 "pygments_lexer": "ipython3",
"version": "3.7.9" 273 461 "version": "3.7.9"
}, 274 462 },
"orig_nbformat": 4 275 463 "orig_nbformat": 4
}, 276 464 },
"nbformat": 4, 277 465 "nbformat": 4,
"nbformat_minor": 2 278 466 "nbformat_minor": 2
} 279 467 }
280 468