Junghwan Park / fpm280-data-analysis

Commit f39b4d8ba4659f5ffd97ef9d87ce3d7108231450

Authored by Junghwan Park 2022-02-01 15:13:42 -0800

Exists in main

Finished the preprocessing.

Showing 1 changed file with 34 additions and 54 deletions Side-by-side Diff

python-notebook/data_loading.ipynb

...	...	@@ -279,39 +279,9 @@
279	279	},
280	280	{
281	281	"cell_type": "code",
282		- "execution_count": 110,
	282	+ "execution_count": 111,
283	283	"metadata": {},
284		- "outputs": [
285		- {
286		- "name": "stdout",
287		- "output_type": "stream",
288		- "text": [
289		- " index user local_date threehour walked\n",
290		- "0 0 1 2015-07-22 2 2\n",
291		- "1 3 1 2015-07-22 6 2\n",
292		- "2 30 1 2015-07-23 2 2\n",
293		- "3 50 1 2015-07-23 3 2\n",
294		- "4 58 1 2015-07-23 5 2\n",
295		- " user local_date threehour walked\n",
296		- "0 1 2015-07-22 0 1.0\n",
297		- "1 1 2015-07-22 1 1.0\n",
298		- "2 1 2015-07-22 2 2.0\n",
299		- "3 1 2015-07-22 3 1.0\n",
300		- "4 1 2015-07-22 4 1.0\n",
301		- "5 1 2015-07-22 5 1.0\n",
302		- "6 1 2015-07-22 6 2.0\n",
303		- "7 1 2015-07-22 7 1.0\n",
304		- "8 1 2015-07-23 0 1.0\n",
305		- "9 1 2015-07-23 1 1.0\n",
306		- "10 1 2015-07-23 2 2.0\n",
307		- "11 1 2015-07-23 3 2.0\n",
308		- "12 1 2015-07-23 4 1.0\n",
309		- "13 1 2015-07-23 5 2.0\n",
310		- "14 1 2015-07-23 6 1.0\n",
311		- "15 1 2015-07-23 7 1.0\n"
312		- ]
313		- }
314		- ],
	284	+ "outputs": [],
315	285	"source": [
316	286	"# calculate hour index and three hour index\n",
317	287	"consecutive_minutes[\"hour\"] = np.int_(np.floor(consecutive_minutes[\"local_minute_index\"] / 60))\n",
...	...	@@ -329,10 +299,6 @@
329	299	"hours = pd.DataFrame({\"hour\": range(0,24)})\n",
330	300	"threehours = pd.DataFrame({\"threehour\": range(0, 8)})\n",
331	301	"\n",
332		- "\n",
333		- "walk_by_hours = walk_by_hours[:5]\n",
334		- "walk_by_threehours = walk_by_threehours[:5]\n",
335		- "\n",
336	302	"# generate complete product of vectors\n",
337	303	"def product_df(mat1, mat2):\n",
338	304	" mat1 = mat1.drop_duplicates()\n",
339	305
340	306
341	307
342	308
343	309
344	310
345	311
346	312
347	313
348	314
...	...	@@ -361,36 +327,50 @@
361	327	]
362	328	},
363	329	{
	330	+ "cell_type": "markdown",
	331	+ "metadata": {},
	332	+ "source": [
	333	+ "## Pad unmeasured (missing) data with zeros\n",
	334	+ "\n",
	335	+ "The three-hour window data serves as the output vector, so it is not zero-padded. Instead, entries whose output data is missing are not used for training."
	336	+ ]
	337	+ },
	338	+ {
364	339	"cell_type": "code",
365		- "execution_count": null,
	340	+ "execution_count": 141,
366	341	"metadata": {},
367	342	"outputs": [],
368	343	"source": [
	344	+ "# generate start and end date for each user\n",
	345	+ "start_date = padded_hours.groupby([\"user\"])[\"local_date\"].min()\n",
	346	+ "end_date = padded_hours.groupby([\"user\"])[\"local_date\"].max()\n",
369	347	"\n",
	348	+ "# generate the user list\n",
	349	+ "users = start_date.index\n",
	350	+ "all_dates = pd.DataFrame({\"user\": [], \"local_date\": []})\n",
370	351	"\n",
371		- "standard_minute_index = pd.Series(name=\"local_minute_index\", data=np.arange(0, 1440, 1))\n",
	352	+ "def date_range(start_date, end_date):\n",
	353	+ " delta = end_date - start_date\n",
372	354	"\n",
373		- "a_user = users[0]\n",
374		- "a_date = user_date2.local_date[0]\n",
	355	+ " for i in range(delta.days + 1):\n",
	356	+ " yield start_date + timedelta(days=i)\n",
375	357	"\n",
376		- "a_jawbone3 = jawbone3.loc[(jawbone3.user == a_user) & (jawbone3.local_date == a_date), :]\n",
	358	+ "# generate the lists of dates between start and end date\n",
	359	+ "for userid in users:\n",
	360	+ " current_user_dates = pd.DataFrame({\"user\": userid, \"local_date\": list(date_range(start_date[userid], end_date[userid]))})\n",
	361	+ " \n",
	362	+ " all_dates = pd.concat([all_dates, current_user_dates])\n",
377	363	"\n",
378		- "vec = a_jawbone3[[\"local_minute_index\", \"steps\"]]\n",
	364	+ "# generate the base vector for the padding\n",
	365	+ "all_dates_hour = product_df(all_dates[[\"user\", \"local_date\"]], hours[[\"hour\"]])\n",
379	366	"\n",
380		- "steps = [0] * 1440\n",
	367	+ "# final padded gait data\n",
	368	+ "padded_hours = padded_hours.merge(all_dates_hour, on=[\"user\", \"local_date\", \"hour\"], how=\"right\").fillna(0)\n",
381	369	"\n",
382		- "for index, row in vec.iterrows():\n",
383		- " steps[row.local_minute_index] += row.steps\n",
384	370	"\n",
385		- "print(steps)\n",
386		- "steps_series = pd.Series(name=\"steps\", data=steps)\n",
387		- "steps_series[\"over60\"] = (steps_series > 60) * 1\n",
388		- "\n",
389		- "steps_series[\"roll\"] = steps_series.rolling(window=5, min_periods=1).sum()\n",
390		- "\n",
391		- "steps_series.roll.plot()\n",
392		- "\n",
393		- "\n"
	371	+ "# save the data\n",
	372	+ "padded_hours.to_csv(os.path.join(data_dir, \"padded_hours.csv\"), index=False)\n",
	373	+ "padded_threehours.to_csv(os.path.join(data_dir, \"padded_threehours.csv\"), index=False)"
394	374	]
395	375	}
396	376	],