Commit 1ba50105eb005cf94e0450393bb5160c369a57a5

Authored by Junghwan Park
1 parent a5d4a032f6
Exists in main

Added walk vectors at hourly and three-hour resolution

Showing 1 changed file with 211 additions and 23 deletions Side-by-side Diff

python-notebook/data_loading.ipynb View file @ 1ba5010
... ... @@ -9,7 +9,7 @@
9 9 },
10 10 {
11 11 "cell_type": "code",
12   - "execution_count": 3,
  12 + "execution_count": 17,
13 13 "metadata": {},
14 14 "outputs": [],
15 15 "source": [
... ... @@ -19,7 +19,8 @@
19 19 "from pandas import read_csv\n",
20 20 "import pandas as pd\n",
21 21 "import os\n",
22   - "from datetime import datetime, date\n",
  22 + "from datetime import datetime, date, timedelta\n",
  23 + "from itertools import product\n",
23 24 "# %load_ext line_profiler"
24 25 ]
25 26 },
... ... @@ -32,7 +33,7 @@
32 33 },
33 34 {
34 35 "cell_type": "code",
35   - "execution_count": 4,
  36 + "execution_count": 36,
36 37 "metadata": {},
37 38 "outputs": [],
38 39 "source": [
... ... @@ -42,7 +43,17 @@
42 43 " return date(x.year, x.month, x.day)\n",
43 44 "\n",
44 45 "def get_minute_index(x):\n",
45   - " return (x.hour * 60) + x.minute"
  46 + " return (x.hour * 60) + x.minute\n",
  47 + "\n",
  48 + "\n",
  49 + "# minimum number of days of data a user must have to be kept in the analysis\n",
  50 + "THRESHOLD_OF_DAYS_PER_USER = 10\n",
  51 + "\n",
  52 + "# minimum number of consecutive active minutes required to count as a walk\n",
  53 + "MINIMUM_NUMBER_OF_MINUTES_FOR_A_WALK = 5\n",
  54 + "\n",
  55 + "# steps-per-minute threshold: a minute is active only when its step count exceeds this value\n",
  56 + "MINIMUM_STEPS_PER_MINUTE = 60\n"
46 57 ]
47 58 },
48 59 {
... ... @@ -141,7 +152,7 @@
141 152 },
142 153 {
143 154 "cell_type": "code",
144   - "execution_count": 12,
  155 + "execution_count": 13,
145 156 "metadata": {},
146 157 "outputs": [
147 158 {
... ... @@ -175,8 +186,6 @@
175 186 "ax.set_xlabel('Number of days')\n",
176 187 "ax.set_ylabel('Frequency')\n",
177 188 "\n",
178   - "# cut off values that are not in the range of the data\n",
179   - "THRESHOLD_OF_DAYS_PER_USER = 10\n",
180 189 "\n",
181 190 "# filter out users that have fewer days of data than THRESHOLD_OF_DAYS_PER_USER\n",
182 191 "users_to_be_removed = stat_user[stat_user < THRESHOLD_OF_DAYS_PER_USER].index\n",
... ... @@ -186,6 +195,7 @@
186 195 "\n",
187 196 "jawbone4 = jawbone3[~jawbone3[\"user\"].isin(users_to_be_removed)]\n",
188 197 "\n",
  198 + "user_date2 = user_date[~user_date[\"user\"].isin(users_to_be_removed)]\n",
189 199 "\n",
190 200 "# printing the amount of data removed\n",
191 201 "jawbone3_count, _ = jawbone3.shape\n",
192 202  
193 203  
194 204  
... ... @@ -201,23 +211,208 @@
201 211 ]
202 212 },
203 213 {
  214 + "cell_type": "markdown",
  215 + "metadata": {},
  216 + "source": [
  217 + "## Find consecutive minute walks"
  218 + ]
  219 + },
  220 + {
204 221 "cell_type": "code",
205   - "execution_count": null,
  222 + "execution_count": 37,
206 223 "metadata": {},
207 224 "outputs": [
208 225 {
209   - "ename": "NameError",
210   - "evalue": "name 'users' is not defined",
211   - "output_type": "error",
212   - "traceback": [
213   - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
214   - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
215   - "\u001b[0;32m/var/folders/m6/l3x11zj94l3dp3wnxy1vnscc0000gn/T/ipykernel_50945/4152346818.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mstandard_minute_index\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSeries\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"local_minute_index\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0marange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1440\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0ma_user\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0musers\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0ma_date\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0muser_date2\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlocal_date\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
216   - "\u001b[0;31mNameError\u001b[0m: name 'users' is not defined"
  226 + "name": "stdout",
  227 + "output_type": "stream",
  228 + "text": [
  229 + "Iteration: 0, length: 377396\n",
  230 + "Iteration: 1, length: 229752\n",
  231 + "Iteration: 2, length: 170648\n",
  232 + "Iteration: 3, length: 137484\n",
  233 + "Iteration: 4, length: 178268\n",
  234 + "Final, length: 94884\n"
217 235 ]
218 236 }
219 237 ],
220 238 "source": [
  239 + "# prepare the data for the walk calculation\n",
  240 + "current_vector = jawbone4[[\"user\", \"local_date\", \"local_minute_index\", \"steps\"]]\n",
  241 + "current_vector[\"add_count\"] = 1\n",
  242 + "current_vector = current_vector[current_vector[\"steps\"] > MINIMUM_STEPS_PER_MINUTE]\n",
  243 + "current_vector = current_vector[[\"user\", \"local_date\", \"local_minute_index\", \"add_count\"]]\n",
  244 + "\n",
  245 + "# define an iterative walk calculation (merging consecutive active minutes)\n",
  246 + "def calculate_walk(cv):\n",
  247 + " nv = cv.copy(deep=True)\n",
  248 + " nv[\"prev_minute_index\"] = nv[\"local_minute_index\"] - 1\n",
  249 + "\n",
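  250 + " # move midnight minutes to the previous day -- NOTE(review): the two chained-indexing assignments below modify a temporary copy, so nv is left unchanged; nv.loc[mask, column] would be needed\n",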
  251 + " nv[nv[\"prev_minute_index\"] < 0][\"local_date\"] -= timedelta(days=1)\n",
  252 + " nv[nv[\"prev_minute_index\"] < 0][\"prev_minute_index\"] = 1439\n",
  253 + " \n",
  254 + " nv = nv[[\"user\", \"local_date\", \"prev_minute_index\"]]\n",
  255 + " jv = cv.merge(nv, left_on=[\"user\", \"local_date\", \"local_minute_index\"], right_on=[\"user\", \"local_date\", \"prev_minute_index\"], how=\"inner\")\n",
  256 + " jv[\"add_count\"] += 1\n",
  257 + " jv = jv[[\"user\", \"local_date\", \"local_minute_index\", \"add_count\"]]\n",
  258 + "\n",
  259 + " return jv \n",
  260 + "\n",
  261 + "\n",
  262 + "# iteratively calculate the walk\n",
  263 + "for i in range(0, MINIMUM_NUMBER_OF_MINUTES_FOR_A_WALK):\n",
  264 + " print(\"Iteration: {}, length: {}\".format(i, current_vector.size))\n",
  265 + " new_vector = calculate_walk(current_vector)\n",
  266 + " current_vector = new_vector\n",
  267 + "\n",
  268 + "print(\"Final, length: {}\".format(current_vector.size))\n",
  269 + "\n",
  270 + "consecutive_minutes = current_vector[[\"user\", \"local_date\", \"local_minute_index\"]].drop_duplicates()"
  271 + ]
  272 + },
  273 + {
  274 + "cell_type": "markdown",
  275 + "metadata": {},
  276 + "source": [
  277 + "## Map consecutive minutes to 1hr and 3hr units"
  278 + ]
  279 + },
  280 + {
  281 + "cell_type": "code",
  282 + "execution_count": 108,
  283 + "metadata": {},
  284 + "outputs": [
  285 + {
  286 + "name": "stdout",
  287 + "output_type": "stream",
  288 + "text": [
  289 + " index user local_date hour walked\n",
  290 + "0 0 1 2015-07-22 8 2\n",
  291 + "1 3 1 2015-07-22 18 2\n",
  292 + "2 10 1 2015-07-22 19 2\n",
  293 + "3 30 1 2015-07-23 8 2\n",
  294 + "4 50 1 2015-07-23 9 2\n",
  295 + " user local_date hour walked\n",
  296 + "0 1 2015-07-22 0 1.0\n",
  297 + "1 1 2015-07-22 1 1.0\n",
  298 + "2 1 2015-07-22 2 1.0\n",
  299 + "3 1 2015-07-22 3 1.0\n",
  300 + "4 1 2015-07-22 4 1.0\n",
  301 + "5 1 2015-07-22 5 1.0\n",
  302 + "6 1 2015-07-22 6 1.0\n",
  303 + "7 1 2015-07-22 7 1.0\n",
  304 + "8 1 2015-07-22 8 2.0\n",
  305 + "9 1 2015-07-22 9 1.0\n",
  306 + "10 1 2015-07-22 10 1.0\n",
  307 + "11 1 2015-07-22 11 1.0\n",
  308 + "12 1 2015-07-22 12 1.0\n",
  309 + "13 1 2015-07-22 13 1.0\n",
  310 + "14 1 2015-07-22 14 1.0\n",
  311 + "15 1 2015-07-22 15 1.0\n",
  312 + "16 1 2015-07-22 16 1.0\n",
  313 + "17 1 2015-07-22 17 1.0\n",
  314 + "18 1 2015-07-22 18 2.0\n",
  315 + "19 1 2015-07-22 19 2.0\n",
  316 + "20 1 2015-07-22 20 1.0\n",
  317 + "21 1 2015-07-22 21 1.0\n",
  318 + "22 1 2015-07-22 22 1.0\n",
  319 + "23 1 2015-07-22 23 1.0\n",
  320 + "24 1 2015-07-23 0 1.0\n",
  321 + "25 1 2015-07-23 1 1.0\n",
  322 + "26 1 2015-07-23 2 1.0\n",
  323 + "27 1 2015-07-23 3 1.0\n",
  324 + "28 1 2015-07-23 4 1.0\n",
  325 + "29 1 2015-07-23 5 1.0\n",
  326 + "30 1 2015-07-23 6 1.0\n",
  327 + "31 1 2015-07-23 7 1.0\n",
  328 + "32 1 2015-07-23 8 2.0\n",
  329 + "33 1 2015-07-23 9 2.0\n",
  330 + "34 1 2015-07-23 10 1.0\n",
  331 + "35 1 2015-07-23 11 1.0\n",
  332 + "36 1 2015-07-23 12 1.0\n",
  333 + "37 1 2015-07-23 13 1.0\n",
  334 + "38 1 2015-07-23 14 1.0\n",
  335 + "39 1 2015-07-23 15 1.0\n",
  336 + "40 1 2015-07-23 16 1.0\n",
  337 + "41 1 2015-07-23 17 1.0\n",
  338 + "42 1 2015-07-23 18 1.0\n",
  339 + "43 1 2015-07-23 19 1.0\n",
  340 + "44 1 2015-07-23 20 1.0\n",
  341 + "45 1 2015-07-23 21 1.0\n",
  342 + "46 1 2015-07-23 22 1.0\n",
  343 + "47 1 2015-07-23 23 1.0\n"
  344 + ]
  345 + }
  346 + ],
  347 + "source": [
  348 + "# calculate hour index and three hour index\n",
  349 + "consecutive_minutes[\"hour\"] = np.int_(np.floor(consecutive_minutes[\"local_minute_index\"] / 60))\n",
  350 + "consecutive_minutes[\"threehour\"] = np.int_(np.floor(consecutive_minutes[\"hour\"] / 3))\n",
  351 + "\n",
  352 + "# flag each distinct (user, date, hour) that contains a walk minute (walked = 2)\n",
  353 + "walk_by_hours = consecutive_minutes[[\"user\", \"local_date\", \"hour\"]].drop_duplicates().reset_index()\n",
  354 + "walk_by_hours[\"walked\"] = 2\n",
  355 + "\n",
  356 + "# flag each distinct (user, date, three-hour block) that contains a walk minute (walked = 2)\n",
  357 + "walk_by_threehours = consecutive_minutes[[\"user\", \"local_date\", \"threehour\"]].drop_duplicates().reset_index()\n",
  358 + "walk_by_threehours[\"walked\"] = 2\n",
  359 + "\n",
  360 + "# generate hour vector and three hour vector\n",
  361 + "hours = pd.DataFrame({\"hour\": range(0,24)})\n",
  362 + "threehours = pd.DataFrame({\"threehour\": range(0, 8)})\n",
  363 + "\n",
  364 + "\n",
  365 + "walk_by_hours = walk_by_hours[:5]\n",
  366 + "walk_by_threehours = walk_by_threehours[:5]\n",
  367 + "\n",
  368 + "# build the Cartesian product of the distinct rows of two data frames\n",
  369 + "def product_df(mat1, mat2):\n",
  370 + " mat1 = mat1.drop_duplicates()\n",
  371 + " mat2 = mat2.drop_duplicates()\n",
  372 + "\n",
  373 + " temp = pd.DataFrame(list(product(mat1.values, mat2.values)))\n",
  374 + " for i, acol in enumerate(mat1.columns):\n",
  375 + " temp[acol] = temp[0].apply(lambda x: x[i])\n",
  376 + " for i, acol in enumerate(mat2.columns):\n",
  377 + " temp[acol] = temp[1].apply(lambda x: x[i])\n",
  378 + " temp = temp.drop(columns=[0, 1])\n",
  379 + " return temp\n",
  380 + "\n",
  381 + "measured_hour = product_df(walk_by_hours[[\"user\", \"local_date\"]], hours[[\"hour\"]])\n",
  382 + "measured_threehour = product_df(walk_by_threehours[[\"user\", \"local_date\"]], threehours[[\"threehour\"]])\n",
  383 + "\n",
  384 + "\n",
  385 + "\n",
  386 + "print(walk_by_threehours)\n",
  387 + "\n",
  388 + "padded_hours = walk_by_hours.merge(measured_hour, on=[\"user\", \"local_date\", \"hour\"], how=\"right\")\n",
  389 + "padded_hours = padded_hours[[\"user\", \"local_date\", \"hour\", \"walked\"]]\n",
  390 + "padded_hours = padded_hours.fillna(1)\n",
  391 + "\n",
  392 + "padded_threehours = walk_by_threehours.merge(measured_threehour, on=[\"user\", \"local_date\", \"hour\"], how=\"right\")\n",
  393 + "padded_threehours = padded_threehours[[\"user\", \"local_date\", \"hour\", \"walked\"]]\n",
  394 + "padded_threehours = padded_threehours.fillna(1)\n",
  395 + "\n",
  396 + "print(padded_threehours)\n",
  397 + "\n",
  398 + "# walk_by_hour = consecutive_minutes.groupby([\"user\", \"local_date\", \"hour\"])[\"add_count\"].sum().reset_index()\n",
  399 + "# walk_by_threehour = consecutive_minutes.groupby([\"user\", \"local_date\", \"threehour\"])[\"add_count\"].sum().reset_index()\n",
  400 + "\n",
  401 + "# walk_by_hour[\"walked\"] = 1\n",
  402 + "# walk_by_threehour[\"walked\"] = 1\n",
  403 + "\n",
  404 + "\n",
  405 + "# hours2 = walk_by_hour.merge(hours, left_on=\"hour\", right_on=\"hour\", how=\"right\")\n"
  406 + ]
  407 + },
  408 + {
  409 + "cell_type": "code",
  410 + "execution_count": null,
  411 + "metadata": {},
  412 + "outputs": [],
  413 + "source": [
  414 + "\n",
  415 + "\n",
221 416 "standard_minute_index = pd.Series(name=\"local_minute_index\", data=np.arange(0, 1440, 1))\n",
222 417 "\n",
223 418 "a_user = users[0]\n",
... ... @@ -242,13 +437,6 @@
242 437 "\n",
243 438 "\n"
244 439 ]
245   - },
246   - {
247   - "cell_type": "code",
248   - "execution_count": null,
249   - "metadata": {},
250   - "outputs": [],
251   - "source": []
252 440 }
253 441 ],
254 442 "metadata": {