Commit 1ba50105eb005cf94e0450393bb5160c369a57a5
1 parent
a5d4a032f6
Exists in
main
added walk hour vector / hourly and per three hours
Showing 1 changed file with 211 additions and 23 deletions Side-by-side Diff
python-notebook/data_loading.ipynb
View file @
1ba5010
... | ... | @@ -9,7 +9,7 @@ |
9 | 9 | }, |
10 | 10 | { |
11 | 11 | "cell_type": "code", |
12 | - "execution_count": 3, | |
12 | + "execution_count": 17, | |
13 | 13 | "metadata": {}, |
14 | 14 | "outputs": [], |
15 | 15 | "source": [ |
... | ... | @@ -19,7 +19,8 @@ |
19 | 19 | "from pandas import read_csv\n", |
20 | 20 | "import pandas as pd\n", |
21 | 21 | "import os\n", |
22 | - "from datetime import datetime, date\n", | |
22 | + "from datetime import datetime, date, timedelta\n", | |
23 | + "from itertools import product\n", | |
23 | 24 | "# %load_ext line_profiler" |
24 | 25 | ] |
25 | 26 | }, |
... | ... | @@ -32,7 +33,7 @@ |
32 | 33 | }, |
33 | 34 | { |
34 | 35 | "cell_type": "code", |
35 | - "execution_count": 4, | |
36 | + "execution_count": 36, | |
36 | 37 | "metadata": {}, |
37 | 38 | "outputs": [], |
38 | 39 | "source": [ |
... | ... | @@ -42,7 +43,17 @@ |
42 | 43 | " return date(x.year, x.month, x.day)\n", |
43 | 44 | "\n", |
44 | 45 | "def get_minute_index(x):\n", |
45 | - " return (x.hour * 60) + x.minute" | |
46 | + " return (x.hour * 60) + x.minute\n", | |
47 | + "\n", | |
48 | + "\n", | |
49 | + "# minimum number of days of data a user must have to be kept\n", | |
50 | + "THRESHOLD_OF_DAYS_PER_USER = 10\n", | |
51 | + "\n", | |
52 | + "# minimum number of consecutive active minutes required for a walk\n", | |
53 | + "MINIMUM_NUMBER_OF_MINUTES_FOR_A_WALK = 5\n", | |
54 | + "\n", | |
55 | + "# a minute counts as active when its step count exceeds this value\n", | |
56 | + "MINIMUM_STEPS_PER_MINUTE = 60\n" | |
46 | 57 | ] |
47 | 58 | }, |
48 | 59 | { |
... | ... | @@ -141,7 +152,7 @@ |
141 | 152 | }, |
142 | 153 | { |
143 | 154 | "cell_type": "code", |
144 | - "execution_count": 12, | |
155 | + "execution_count": 13, | |
145 | 156 | "metadata": {}, |
146 | 157 | "outputs": [ |
147 | 158 | { |
... | ... | @@ -175,8 +186,6 @@ |
175 | 186 | "ax.set_xlabel('Number of days')\n", |
176 | 187 | "ax.set_ylabel('Frequency')\n", |
177 | 188 | "\n", |
178 | - "# cut off values that are not in the range of the data\n", | |
179 | - "THRESHOLD_OF_DAYS_PER_USER = 10\n", | |
180 | 189 | "\n", |
181 | 190 | "# filter out users that have fewer days of data than THRESHOLD_OF_DAYS_PER_USER\n", |
182 | 191 | "users_to_be_removed = stat_user[stat_user < THRESHOLD_OF_DAYS_PER_USER].index\n", |
... | ... | @@ -186,6 +195,7 @@ |
186 | 195 | "\n", |
187 | 196 | "jawbone4 = jawbone3[~jawbone3[\"user\"].isin(users_to_be_removed)]\n", |
188 | 197 | "\n", |
198 | + "user_date2 = user_date[~user_date[\"user\"].isin(users_to_be_removed)]\n", | |
189 | 199 | "\n", |
190 | 200 | "# printing the amount of data removed\n", |
191 | 201 | "jawbone3_count, _ = jawbone3.shape\n", |
192 | 202 | |
193 | 203 | |
194 | 204 | |
... | ... | @@ -201,23 +211,208 @@ |
201 | 211 | ] |
202 | 212 | }, |
203 | 213 | { |
214 | + "cell_type": "markdown", | |
215 | + "metadata": {}, | |
216 | + "source": [ | |
217 | + "## Find consecutive minute walks" | |
218 | + ] | |
219 | + }, | |
220 | + { | |
204 | 221 | "cell_type": "code", |
205 | - "execution_count": null, | |
222 | + "execution_count": 37, | |
206 | 223 | "metadata": {}, |
207 | 224 | "outputs": [ |
208 | 225 | { |
209 | - "ename": "NameError", | |
210 | - "evalue": "name 'users' is not defined", | |
211 | - "output_type": "error", | |
212 | - "traceback": [ | |
213 | - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", | |
214 | - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", | |
215 | - "\u001b[0;32m/var/folders/m6/l3x11zj94l3dp3wnxy1vnscc0000gn/T/ipykernel_50945/4152346818.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mstandard_minute_index\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSeries\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"local_minute_index\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0marange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1440\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0ma_user\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0musers\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0ma_date\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0muser_date2\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlocal_date\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", | |
216 | - "\u001b[0;31mNameError\u001b[0m: name 'users' is not defined" | |
226 | + "name": "stdout", | |
227 | + "output_type": "stream", | |
228 | + "text": [ | |
229 | + "Iteration: 0, length: 377396\n", | |
230 | + "Iteration: 1, length: 229752\n", | |
231 | + "Iteration: 2, length: 170648\n", | |
232 | + "Iteration: 3, length: 137484\n", | |
233 | + "Iteration: 4, length: 178268\n", | |
234 | + "Final, length: 94884\n" | |
217 | 235 | ] |
218 | 236 | } |
219 | 237 | ], |
220 | 238 | "source": [ |
239 | + "# prepare the data for the walk calculation\n", | |
240 | + "current_vector = jawbone4[[\"user\", \"local_date\", \"local_minute_index\", \"steps\"]]\n", | |
241 | + "current_vector[\"add_count\"] = 1\n", | |
242 | + "current_vector = current_vector[current_vector[\"steps\"] > MINIMUM_STEPS_PER_MINUTE]\n", | |
243 | + "current_vector = current_vector[[\"user\", \"local_date\", \"local_minute_index\", \"add_count\"]]\n", | |
244 | + "\n", | |
245 | + "# one merge step: keep each row of cv whose next minute (same user and date) is also in cv, and increment its add_count\n", | |
246 | + "def calculate_walk(cv):\n", | |
247 | + " nv = cv.copy(deep=True)\n", | |
248 | + " nv[\"prev_minute_index\"] = nv[\"local_minute_index\"] - 1\n", | |
249 | + "\n", | |
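250 | + "    # intended: shift minute 0 to minute 1439 of the previous day; NOTE: the chained indexing below assigns to a temporary copy, so nv is left unchanged (use nv.loc[mask, column])\n", | |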
251 | + " nv[nv[\"prev_minute_index\"] < 0][\"local_date\"] -= timedelta(days=1)\n", | |
252 | + " nv[nv[\"prev_minute_index\"] < 0][\"prev_minute_index\"] = 1439\n", | |
253 | + " \n", | |
254 | + " nv = nv[[\"user\", \"local_date\", \"prev_minute_index\"]]\n", | |
255 | + " jv = cv.merge(nv, left_on=[\"user\", \"local_date\", \"local_minute_index\"], right_on=[\"user\", \"local_date\", \"prev_minute_index\"], how=\"inner\")\n", | |
256 | + " jv[\"add_count\"] += 1\n", | |
257 | + " jv = jv[[\"user\", \"local_date\", \"local_minute_index\", \"add_count\"]]\n", | |
258 | + "\n", | |
259 | + " return jv \n", | |
260 | + "\n", | |
261 | + "\n", | |
262 | + "# iteratively calculate the walk\n", | |
263 | + "for i in range(0, MINIMUM_NUMBER_OF_MINUTES_FOR_A_WALK):\n", | |
264 | + " print(\"Iteration: {}, length: {}\".format(i, current_vector.size))\n", | |
265 | + " new_vector = calculate_walk(current_vector)\n", | |
266 | + " current_vector = new_vector\n", | |
267 | + "\n", | |
268 | + "print(\"Final, length: {}\".format(current_vector.size))\n", | |
269 | + "\n", | |
270 | + "consecutive_minutes = current_vector[[\"user\", \"local_date\", \"local_minute_index\"]].drop_duplicates()" | |
271 | + ] | |
272 | + }, | |
273 | + { | |
274 | + "cell_type": "markdown", | |
275 | + "metadata": {}, | |
276 | + "source": [ | |
277 | + "## Map consecutive minutes to 1hr and 3hr units" | |
278 | + ] | |
279 | + }, | |
280 | + { | |
281 | + "cell_type": "code", | |
282 | + "execution_count": 108, | |
283 | + "metadata": {}, | |
284 | + "outputs": [ | |
285 | + { | |
286 | + "name": "stdout", | |
287 | + "output_type": "stream", | |
288 | + "text": [ | |
289 | + " index user local_date hour walked\n", | |
290 | + "0 0 1 2015-07-22 8 2\n", | |
291 | + "1 3 1 2015-07-22 18 2\n", | |
292 | + "2 10 1 2015-07-22 19 2\n", | |
293 | + "3 30 1 2015-07-23 8 2\n", | |
294 | + "4 50 1 2015-07-23 9 2\n", | |
295 | + " user local_date hour walked\n", | |
296 | + "0 1 2015-07-22 0 1.0\n", | |
297 | + "1 1 2015-07-22 1 1.0\n", | |
298 | + "2 1 2015-07-22 2 1.0\n", | |
299 | + "3 1 2015-07-22 3 1.0\n", | |
300 | + "4 1 2015-07-22 4 1.0\n", | |
301 | + "5 1 2015-07-22 5 1.0\n", | |
302 | + "6 1 2015-07-22 6 1.0\n", | |
303 | + "7 1 2015-07-22 7 1.0\n", | |
304 | + "8 1 2015-07-22 8 2.0\n", | |
305 | + "9 1 2015-07-22 9 1.0\n", | |
306 | + "10 1 2015-07-22 10 1.0\n", | |
307 | + "11 1 2015-07-22 11 1.0\n", | |
308 | + "12 1 2015-07-22 12 1.0\n", | |
309 | + "13 1 2015-07-22 13 1.0\n", | |
310 | + "14 1 2015-07-22 14 1.0\n", | |
311 | + "15 1 2015-07-22 15 1.0\n", | |
312 | + "16 1 2015-07-22 16 1.0\n", | |
313 | + "17 1 2015-07-22 17 1.0\n", | |
314 | + "18 1 2015-07-22 18 2.0\n", | |
315 | + "19 1 2015-07-22 19 2.0\n", | |
316 | + "20 1 2015-07-22 20 1.0\n", | |
317 | + "21 1 2015-07-22 21 1.0\n", | |
318 | + "22 1 2015-07-22 22 1.0\n", | |
319 | + "23 1 2015-07-22 23 1.0\n", | |
320 | + "24 1 2015-07-23 0 1.0\n", | |
321 | + "25 1 2015-07-23 1 1.0\n", | |
322 | + "26 1 2015-07-23 2 1.0\n", | |
323 | + "27 1 2015-07-23 3 1.0\n", | |
324 | + "28 1 2015-07-23 4 1.0\n", | |
325 | + "29 1 2015-07-23 5 1.0\n", | |
326 | + "30 1 2015-07-23 6 1.0\n", | |
327 | + "31 1 2015-07-23 7 1.0\n", | |
328 | + "32 1 2015-07-23 8 2.0\n", | |
329 | + "33 1 2015-07-23 9 2.0\n", | |
330 | + "34 1 2015-07-23 10 1.0\n", | |
331 | + "35 1 2015-07-23 11 1.0\n", | |
332 | + "36 1 2015-07-23 12 1.0\n", | |
333 | + "37 1 2015-07-23 13 1.0\n", | |
334 | + "38 1 2015-07-23 14 1.0\n", | |
335 | + "39 1 2015-07-23 15 1.0\n", | |
336 | + "40 1 2015-07-23 16 1.0\n", | |
337 | + "41 1 2015-07-23 17 1.0\n", | |
338 | + "42 1 2015-07-23 18 1.0\n", | |
339 | + "43 1 2015-07-23 19 1.0\n", | |
340 | + "44 1 2015-07-23 20 1.0\n", | |
341 | + "45 1 2015-07-23 21 1.0\n", | |
342 | + "46 1 2015-07-23 22 1.0\n", | |
343 | + "47 1 2015-07-23 23 1.0\n" | |
344 | + ] | |
345 | + } | |
346 | + ], | |
347 | + "source": [ | |
348 | + "# calculate hour index and three hour index\n", | |
349 | + "consecutive_minutes[\"hour\"] = np.int_(np.floor(consecutive_minutes[\"local_minute_index\"] / 60))\n", | |
350 | + "consecutive_minutes[\"threehour\"] = np.int_(np.floor(consecutive_minutes[\"hour\"] / 3))\n", | |
351 | + "\n", | |
352 | + "# mark each distinct (user, local_date, hour) found in consecutive_minutes as walked (coded as 2)\n", | |
353 | + "walk_by_hours = consecutive_minutes[[\"user\", \"local_date\", \"hour\"]].drop_duplicates().reset_index()\n", | |
354 | + "walk_by_hours[\"walked\"] = 2\n", | |
355 | + "\n", | |
356 | + "# mark each distinct (user, local_date, threehour) found in consecutive_minutes as walked (coded as 2)\n", | |
357 | + "walk_by_threehours = consecutive_minutes[[\"user\", \"local_date\", \"threehour\"]].drop_duplicates().reset_index()\n", | |
358 | + "walk_by_threehours[\"walked\"] = 2\n", | |
359 | + "\n", | |
360 | + "# generate hour vector and three hour vector\n", | |
361 | + "hours = pd.DataFrame({\"hour\": range(0,24)})\n", | |
362 | + "threehours = pd.DataFrame({\"threehour\": range(0, 8)})\n", | |
363 | + "\n", | |
364 | + "\n", | |
365 | + "walk_by_hours = walk_by_hours[:5]\n", | |
366 | + "walk_by_threehours = walk_by_threehours[:5]\n", | |
367 | + "\n", | |
368 | + "# build the Cartesian product of the distinct rows of two data frames\n", | |
369 | + "def product_df(mat1, mat2):\n", | |
370 | + " mat1 = mat1.drop_duplicates()\n", | |
371 | + " mat2 = mat2.drop_duplicates()\n", | |
372 | + "\n", | |
373 | + " temp = pd.DataFrame(list(product(mat1.values, mat2.values)))\n", | |
374 | + " for i, acol in enumerate(mat1.columns):\n", | |
375 | + " temp[acol] = temp[0].apply(lambda x: x[i])\n", | |
376 | + " for i, acol in enumerate(mat2.columns):\n", | |
377 | + " temp[acol] = temp[1].apply(lambda x: x[i])\n", | |
378 | + " temp = temp.drop(columns=[0, 1])\n", | |
379 | + " return temp\n", | |
380 | + "\n", | |
381 | + "measured_hour = product_df(walk_by_hours[[\"user\", \"local_date\"]], hours[[\"hour\"]])\n", | |
382 | + "measured_threehour = product_df(walk_by_threehours[[\"user\", \"local_date\"]], threehours[[\"threehour\"]])\n", | |
383 | + "\n", | |
384 | + "\n", | |
385 | + "\n", | |
386 | + "print(walk_by_threehours)\n", | |
387 | + "\n", | |
388 | + "padded_hours = walk_by_hours.merge(measured_hour, on=[\"user\", \"local_date\", \"hour\"], how=\"right\")\n", | |
389 | + "padded_hours = padded_hours[[\"user\", \"local_date\", \"hour\", \"walked\"]]\n", | |
390 | + "padded_hours = padded_hours.fillna(1)\n", | |
391 | + "\n", | |
392 | + "padded_threehours = walk_by_threehours.merge(measured_threehour, on=[\"user\", \"local_date\", \"hour\"], how=\"right\")\n", | |
393 | + "padded_threehours = padded_threehours[[\"user\", \"local_date\", \"hour\", \"walked\"]]\n", | |
394 | + "padded_threehours = padded_threehours.fillna(1)\n", | |
395 | + "\n", | |
396 | + "print(padded_threehours)\n", | |
397 | + "\n", | |
398 | + "# walk_by_hour = consecutive_minutes.groupby([\"user\", \"local_date\", \"hour\"])[\"add_count\"].sum().reset_index()\n", | |
399 | + "# walk_by_threehour = consecutive_minutes.groupby([\"user\", \"local_date\", \"threehour\"])[\"add_count\"].sum().reset_index()\n", | |
400 | + "\n", | |
401 | + "# walk_by_hour[\"walked\"] = 1\n", | |
402 | + "# walk_by_threehour[\"walked\"] = 1\n", | |
403 | + "\n", | |
404 | + "\n", | |
405 | + "# hours2 = walk_by_hour.merge(hours, left_on=\"hour\", right_on=\"hour\", how=\"right\")\n" | |
406 | + ] | |
407 | + }, | |
408 | + { | |
409 | + "cell_type": "code", | |
410 | + "execution_count": null, | |
411 | + "metadata": {}, | |
412 | + "outputs": [], | |
413 | + "source": [ | |
414 | + "\n", | |
415 | + "\n", | |
221 | 416 | "standard_minute_index = pd.Series(name=\"local_minute_index\", data=np.arange(0, 1440, 1))\n", |
222 | 417 | "\n", |
223 | 418 | "a_user = users[0]\n", |
... | ... | @@ -242,13 +437,6 @@ |
242 | 437 | "\n", |
243 | 438 | "\n" |
244 | 439 | ] |
245 | - }, | |
246 | - { | |
247 | - "cell_type": "code", | |
248 | - "execution_count": null, | |
249 | - "metadata": {}, | |
250 | - "outputs": [], | |
251 | - "source": [] | |
252 | 440 | } |
253 | 441 | ], |
254 | 442 | "metadata": { |