Commit f39b4d8ba4659f5ffd97ef9d87ce3d7108231450
1 parent
57ff8482ac
Exists in
main
Finished the preprocessing.
Showing 1 changed file with 34 additions and 54 deletions Side-by-side Diff
python-notebook/data_loading.ipynb
View file @
f39b4d8
... | ... | @@ -279,39 +279,9 @@ |
279 | 279 | }, |
280 | 280 | { |
281 | 281 | "cell_type": "code", |
282 | - "execution_count": 110, | |
282 | + "execution_count": 111, | |
283 | 283 | "metadata": {}, |
284 | - "outputs": [ | |
285 | - { | |
286 | - "name": "stdout", | |
287 | - "output_type": "stream", | |
288 | - "text": [ | |
289 | - " index user local_date threehour walked\n", | |
290 | - "0 0 1 2015-07-22 2 2\n", | |
291 | - "1 3 1 2015-07-22 6 2\n", | |
292 | - "2 30 1 2015-07-23 2 2\n", | |
293 | - "3 50 1 2015-07-23 3 2\n", | |
294 | - "4 58 1 2015-07-23 5 2\n", | |
295 | - " user local_date threehour walked\n", | |
296 | - "0 1 2015-07-22 0 1.0\n", | |
297 | - "1 1 2015-07-22 1 1.0\n", | |
298 | - "2 1 2015-07-22 2 2.0\n", | |
299 | - "3 1 2015-07-22 3 1.0\n", | |
300 | - "4 1 2015-07-22 4 1.0\n", | |
301 | - "5 1 2015-07-22 5 1.0\n", | |
302 | - "6 1 2015-07-22 6 2.0\n", | |
303 | - "7 1 2015-07-22 7 1.0\n", | |
304 | - "8 1 2015-07-23 0 1.0\n", | |
305 | - "9 1 2015-07-23 1 1.0\n", | |
306 | - "10 1 2015-07-23 2 2.0\n", | |
307 | - "11 1 2015-07-23 3 2.0\n", | |
308 | - "12 1 2015-07-23 4 1.0\n", | |
309 | - "13 1 2015-07-23 5 2.0\n", | |
310 | - "14 1 2015-07-23 6 1.0\n", | |
311 | - "15 1 2015-07-23 7 1.0\n" | |
312 | - ] | |
313 | - } | |
314 | - ], | |
284 | + "outputs": [], | |
315 | 285 | "source": [ |
316 | 286 | "# calculate hour index and three hour index\n", |
317 | 287 | "consecutive_minutes[\"hour\"] = np.int_(np.floor(consecutive_minutes[\"local_minute_index\"] / 60))\n", |
... | ... | @@ -329,10 +299,6 @@ |
329 | 299 | "hours = pd.DataFrame({\"hour\": range(0,24)})\n", |
330 | 300 | "threehours = pd.DataFrame({\"threehour\": range(0, 8)})\n", |
331 | 301 | "\n", |
332 | - "\n", | |
333 | - "walk_by_hours = walk_by_hours[:5]\n", | |
334 | - "walk_by_threehours = walk_by_threehours[:5]\n", | |
335 | - "\n", | |
336 | 302 | "# generate complete product of vectors\n", |
337 | 303 | "def product_df(mat1, mat2):\n", |
338 | 304 | " mat1 = mat1.drop_duplicates()\n", |
339 | 305 | |
340 | 306 | |
341 | 307 | |
342 | 308 | |
343 | 309 | |
344 | 310 | |
345 | 311 | |
346 | 312 | |
347 | 313 | |
348 | 314 | |
... | ... | @@ -361,36 +327,50 @@ |
361 | 327 | ] |
362 | 328 | }, |
363 | 329 | { |
330 | + "cell_type": "markdown", | |
331 | + "metadata": {}, | |
332 | + "source": [ | |
333 | + "## Pad unmeasured (missing) data with 0s\n", | |
334 | + "\n", | |
335 | + "For the three-hour window data, zero padding is not applied because that data serves as the output vector. Instead, windows with missing output data are excluded from training." | |
336 | + ] | |
337 | + }, | |
338 | + { | |
364 | 339 | "cell_type": "code", |
365 | - "execution_count": null, | |
340 | + "execution_count": 141, | |
366 | 341 | "metadata": {}, |
367 | 342 | "outputs": [], |
368 | 343 | "source": [ |
344 | + "# generate start and end date for each user\n", | |
345 | + "start_date = padded_hours.groupby([\"user\"])[\"local_date\"].min()\n", | |
346 | + "end_date = padded_hours.groupby([\"user\"])[\"local_date\"].max()\n", | |
369 | 347 | "\n", |
348 | + "# generate the user list\n", | |
349 | + "users = start_date.index\n", | |
350 | + "all_dates = pd.DataFrame({\"user\": [], \"local_date\": []})\n", | |
370 | 351 | "\n", |
371 | - "standard_minute_index = pd.Series(name=\"local_minute_index\", data=np.arange(0, 1440, 1))\n", | |
352 | + "def date_range(start_date, end_date):\n", | |
353 | + " delta = end_date - start_date\n", | |
372 | 354 | "\n", |
373 | - "a_user = users[0]\n", | |
374 | - "a_date = user_date2.local_date[0]\n", | |
355 | + " for i in range(delta.days + 1):\n", | |
356 | + " yield start_date + timedelta(days=i)\n", | |
375 | 357 | "\n", |
376 | - "a_jawbone3 = jawbone3.loc[(jawbone3.user == a_user) & (jawbone3.local_date == a_date), :]\n", | |
358 | + "# generate the list of dates between each user's start and end date\n", | |
359 | + "for userid in users:\n", | |
360 | + " current_user_dates = pd.DataFrame({\"user\": userid, \"local_date\": list(date_range(start_date[userid], end_date[userid]))})\n", | |
361 | + " \n", | |
362 | + " all_dates = pd.concat([all_dates, current_user_dates])\n", | |
377 | 363 | "\n", |
378 | - "vec = a_jawbone3[[\"local_minute_index\", \"steps\"]]\n", | |
364 | + "# generate the base vector for the padding\n", | |
365 | + "all_dates_hour = product_df(all_dates[[\"user\", \"local_date\"]], hours[[\"hour\"]])\n", | |
379 | 366 | "\n", |
380 | - "steps = [0] * 1440\n", | |
367 | + "# final padded gait data\n", | |
368 | + "padded_hours = padded_hours.merge(all_dates_hour, on=[\"user\", \"local_date\", \"hour\"], how=\"right\").fillna(0)\n", | |
381 | 369 | "\n", |
382 | - "for index, row in vec.iterrows():\n", | |
383 | - " steps[row.local_minute_index] += row.steps\n", | |
384 | 370 | "\n", |
385 | - "print(steps)\n", | |
386 | - "steps_series = pd.Series(name=\"steps\", data=steps)\n", | |
387 | - "steps_series[\"over60\"] = (steps_series > 60) * 1\n", | |
388 | - "\n", | |
389 | - "steps_series[\"roll\"] = steps_series.rolling(window=5, min_periods=1).sum()\n", | |
390 | - "\n", | |
391 | - "steps_series.roll.plot()\n", | |
392 | - "\n", | |
393 | - "\n" | |
371 | + "# save the data\n", | |
372 | + "padded_hours.to_csv(os.path.join(data_dir, \"padded_hours.csv\"), index=False)\n", | |
373 | + "padded_threehours.to_csv(os.path.join(data_dir, \"padded_threehours.csv\"), index=False)" | |
394 | 374 | ] |
395 | 375 | } |
396 | 376 | ], |