Commit f39b4d8ba4659f5ffd97ef9d87ce3d7108231450

Authored by Junghwan Park
1 parent 57ff8482ac
Exists in main

Finished the preprocessing.

Showing 1 changed file with 34 additions and 54 deletions Side-by-side Diff

python-notebook/data_loading.ipynb View file @ f39b4d8
... ... @@ -279,39 +279,9 @@
279 279 },
280 280 {
281 281 "cell_type": "code",
282   - "execution_count": 110,
  282 + "execution_count": 111,
283 283 "metadata": {},
284   - "outputs": [
285   - {
286   - "name": "stdout",
287   - "output_type": "stream",
288   - "text": [
289   - " index user local_date threehour walked\n",
290   - "0 0 1 2015-07-22 2 2\n",
291   - "1 3 1 2015-07-22 6 2\n",
292   - "2 30 1 2015-07-23 2 2\n",
293   - "3 50 1 2015-07-23 3 2\n",
294   - "4 58 1 2015-07-23 5 2\n",
295   - " user local_date threehour walked\n",
296   - "0 1 2015-07-22 0 1.0\n",
297   - "1 1 2015-07-22 1 1.0\n",
298   - "2 1 2015-07-22 2 2.0\n",
299   - "3 1 2015-07-22 3 1.0\n",
300   - "4 1 2015-07-22 4 1.0\n",
301   - "5 1 2015-07-22 5 1.0\n",
302   - "6 1 2015-07-22 6 2.0\n",
303   - "7 1 2015-07-22 7 1.0\n",
304   - "8 1 2015-07-23 0 1.0\n",
305   - "9 1 2015-07-23 1 1.0\n",
306   - "10 1 2015-07-23 2 2.0\n",
307   - "11 1 2015-07-23 3 2.0\n",
308   - "12 1 2015-07-23 4 1.0\n",
309   - "13 1 2015-07-23 5 2.0\n",
310   - "14 1 2015-07-23 6 1.0\n",
311   - "15 1 2015-07-23 7 1.0\n"
312   - ]
313   - }
314   - ],
  284 + "outputs": [],
315 285 "source": [
316 286 "# calculate hour index and three hour index\n",
317 287 "consecutive_minutes[\"hour\"] = np.int_(np.floor(consecutive_minutes[\"local_minute_index\"] / 60))\n",
... ... @@ -329,10 +299,6 @@
329 299 "hours = pd.DataFrame({\"hour\": range(0,24)})\n",
330 300 "threehours = pd.DataFrame({\"threehour\": range(0, 8)})\n",
331 301 "\n",
332   - "\n",
333   - "walk_by_hours = walk_by_hours[:5]\n",
334   - "walk_by_threehours = walk_by_threehours[:5]\n",
335   - "\n",
336 302 "# generate complete product of vectors\n",
337 303 "def product_df(mat1, mat2):\n",
338 304 " mat1 = mat1.drop_duplicates()\n",
339 305  
340 306  
341 307  
342 308  
343 309  
344 310  
345 311  
346 312  
347 313  
348 314  
... ... @@ -361,36 +327,50 @@
361 327 ]
362 328 },
363 329 {
  330 + "cell_type": "markdown",
  331 + "metadata": {},
  332 + "source": [
  333 + "## Pad unmeasured missing data with 0s\n",
  334 + "\n",
  335 + "For the three-hour window data, 0 padding is not done because it serves as the output vector. Instead, rows with missing output data are excluded from training."
  336 + ]
  337 + },
  338 + {
364 339 "cell_type": "code",
365   - "execution_count": null,
  340 + "execution_count": 141,
366 341 "metadata": {},
367 342 "outputs": [],
368 343 "source": [
  344 + "# generate start and end date for each user\n",
  345 + "start_date = padded_hours.groupby([\"user\"])[\"local_date\"].min()\n",
  346 + "end_date = padded_hours.groupby([\"user\"])[\"local_date\"].max()\n",
369 347 "\n",
  348 + "# generate the user list\n",
  349 + "users = start_date.index\n",
  350 + "all_dates = pd.DataFrame({\"user\": [], \"local_date\": []})\n",
370 351 "\n",
371   - "standard_minute_index = pd.Series(name=\"local_minute_index\", data=np.arange(0, 1440, 1))\n",
  352 + "def date_range(start_date, end_date):\n",
  353 + " delta = end_date - start_date\n",
372 354 "\n",
373   - "a_user = users[0]\n",
374   - "a_date = user_date2.local_date[0]\n",
  355 + " for i in range(delta.days + 1):\n",
  356 + " yield start_date + timedelta(days=i)\n",
375 357 "\n",
376   - "a_jawbone3 = jawbone3.loc[(jawbone3.user == a_user) & (jawbone3.local_date == a_date), :]\n",
  358 + "# generate the lists of dates between start and end date\n",
  359 + "for userid in users:\n",
  360 + " current_user_dates = pd.DataFrame({\"user\": userid, \"local_date\": list(date_range(start_date[userid], end_date[userid]))})\n",
  361 + " \n",
  362 + " all_dates = pd.concat([all_dates, current_user_dates])\n",
377 363 "\n",
378   - "vec = a_jawbone3[[\"local_minute_index\", \"steps\"]]\n",
  364 + "# generate the base vector for the padding\n",
  365 + "all_dates_hour = product_df(all_dates[[\"user\", \"local_date\"]], hours[[\"hour\"]])\n",
379 366 "\n",
380   - "steps = [0] * 1440\n",
  367 + "# final padded gait data\n",
  368 + "padded_hours = padded_hours.merge(all_dates_hour, on=[\"user\", \"local_date\", \"hour\"], how=\"right\").fillna(0)\n",
381 369 "\n",
382   - "for index, row in vec.iterrows():\n",
383   - " steps[row.local_minute_index] += row.steps\n",
384 370 "\n",
385   - "print(steps)\n",
386   - "steps_series = pd.Series(name=\"steps\", data=steps)\n",
387   - "steps_series[\"over60\"] = (steps_series > 60) * 1\n",
388   - "\n",
389   - "steps_series[\"roll\"] = steps_series.rolling(window=5, min_periods=1).sum()\n",
390   - "\n",
391   - "steps_series.roll.plot()\n",
392   - "\n",
393   - "\n"
  371 + "# save the data\n",
  372 + "padded_hours.to_csv(os.path.join(data_dir, \"padded_hours.csv\"), index=False)\n",
  373 + "padded_threehours.to_csv(os.path.join(data_dir, \"padded_threehours.csv\"), index=False)"
394 374 ]
395 375 }
396 376 ],