Junghwan Park / fpm280-data-analysis

Commit cf37877009d2c931937801ab68f427ce3c847775

Authored by Junghwan Park 2022-02-01 15:17:09 -0800

Exists in main

rearranged code order

Showing 1 changed file with 64 additions and 48 deletions Side-by-side Diff

python-notebook/data_loading.ipynb

...	...	@@ -9,7 +9,7 @@
9	9	},
10	10	{
11	11	"cell_type": "code",
12		- "execution_count": 17,
	12	+ "execution_count": 142,
13	13	"metadata": {},
14	14	"outputs": [],
15	15	"source": [
16	16
17	17
18	18
19	19
20	20
...	...	@@ -33,19 +33,57 @@
33	33	},
34	34	{
35	35	"cell_type": "code",
36		- "execution_count": 36,
	36	+ "execution_count": 143,
37	37	"metadata": {},
38	38	"outputs": [],
39	39	"source": [
	40	+ "# disable pandas' chained-assignment check (silences SettingWithCopyWarning)\n",
40	41	"pd.options.mode.chained_assignment = None\n",
41	42	"\n",
	43	+ "# convert a datetime object to a date object\n",
42	44	"def get_date(x):\n",
43	45	" return date(x.year, x.month, x.day)\n",
44	46	"\n",
	47	+ "# convert a datetime object to an integer, which denotes the number of minutes since midnight\n",
45	48	"def get_minute_index(x):\n",
46	49	" return (x.hour * 60) + x.minute\n",
47	50	"\n",
	51	+ "# yield every date from start_date through end_date, inclusive\n",
	52	+ "def date_range(start_date, end_date):\n",
	53	+ " delta = end_date - start_date\n",
48	54	"\n",
	55	+ " for i in range(delta.days + 1):\n",
	56	+ " yield start_date + timedelta(days=i)\n",
	57	+ "\n",
	58	+ "# define an iterative walk calculation (merging consecutive active minutes)\n",
	59	+ "def calculate_walk(cv):\n",
	60	+ " nv = cv.copy(deep=True)\n",
	61	+ " nv[\"prev_minute_index\"] = nv[\"local_minute_index\"] - 1\n",
	62	+ "\n",
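	63	+ " # move midnight minutes to previous day (NOTE: nv[mask][col] is chained indexing and assigns to a temporary copy, so nv is not updated; nv.loc[mask, col] would be needed)\n",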
	64	+ " nv[nv[\"prev_minute_index\"] < 0][\"local_date\"] -= timedelta(days=1)\n",
	65	+ " nv[nv[\"prev_minute_index\"] < 0][\"prev_minute_index\"] = 1439\n",
	66	+ " \n",
	67	+ " nv = nv[[\"user\", \"local_date\", \"prev_minute_index\"]]\n",
	68	+ " jv = cv.merge(nv, left_on=[\"user\", \"local_date\", \"local_minute_index\"], right_on=[\"user\", \"local_date\", \"prev_minute_index\"], how=\"inner\")\n",
	69	+ " jv[\"add_count\"] += 1\n",
	70	+ " jv = jv[[\"user\", \"local_date\", \"local_minute_index\", \"add_count\"]]\n",
	71	+ "\n",
	72	+ " return jv \n",
	73	+ "\n",
	74	+ "# build the Cartesian product of the distinct rows of two dataframes\n",
	75	+ "def product_df(mat1, mat2):\n",
	76	+ " mat1 = mat1.drop_duplicates()\n",
	77	+ " mat2 = mat2.drop_duplicates()\n",
	78	+ "\n",
	79	+ " temp = pd.DataFrame(list(product(mat1.values, mat2.values)))\n",
	80	+ " for i, acol in enumerate(mat1.columns):\n",
	81	+ " temp[acol] = temp[0].apply(lambda x: x[i])\n",
	82	+ " for i, acol in enumerate(mat2.columns):\n",
	83	+ " temp[acol] = temp[1].apply(lambda x: x[i])\n",
	84	+ " temp = temp.drop(columns=[0, 1])\n",
	85	+ " return temp\n",
	86	+ "\n",
49	87	"# cut off values that are not in the range of the data\n",
50	88	"THRESHOLD_OF_DAYS_PER_USER = 10\n",
51	89	"\n",
...	...	@@ -65,7 +103,7 @@
65	103	},
66	104	{
67	105	"cell_type": "code",
68		- "execution_count": 5,
	106	+ "execution_count": 144,
69	107	"metadata": {},
70	108	"outputs": [],
71	109	"source": [
...	...	@@ -86,7 +124,7 @@
86	124	},
87	125	{
88	126	"cell_type": "code",
89		- "execution_count": 6,
	127	+ "execution_count": 145,
90	128	"metadata": {},
91	129	"outputs": [],
92	130	"source": [
...	...	@@ -135,7 +173,7 @@
135	173	},
136	174	{
137	175	"cell_type": "code",
138		- "execution_count": 7,
	176	+ "execution_count": 146,
139	177	"metadata": {},
140	178	"outputs": [],
141	179	"source": [
...	...	@@ -152,7 +190,7 @@
152	190	},
153	191	{
154	192	"cell_type": "code",
155		- "execution_count": 13,
	193	+ "execution_count": 147,
156	194	"metadata": {},
157	195	"outputs": [
158	196	{
...	...	@@ -219,7 +257,7 @@
219	257	},
220	258	{
221	259	"cell_type": "code",
222		- "execution_count": 37,
	260	+ "execution_count": 148,
223	261	"metadata": {},
224	262	"outputs": [
225	263	{
...	...	@@ -242,23 +280,6 @@
242	280	"current_vector = current_vector[current_vector[\"steps\"] > MINIMUM_STEPS_PER_MINUTE]\n",
243	281	"current_vector = current_vector[[\"user\", \"local_date\", \"local_minute_index\", \"add_count\"]]\n",
244	282	"\n",
245		- "# define an iterative walk calculation (merging consecutive active minutes)\n",
246		- "def calculate_walk(cv):\n",
247		- " nv = cv.copy(deep=True)\n",
248		- " nv[\"prev_minute_index\"] = nv[\"local_minute_index\"] - 1\n",
249		- "\n",
250		- " # move midnight minutes to previous day\n",
251		- " nv[nv[\"prev_minute_index\"] < 0][\"local_date\"] -= timedelta(days=1)\n",
252		- " nv[nv[\"prev_minute_index\"] < 0][\"prev_minute_index\"] = 1439\n",
253		- " \n",
254		- " nv = nv[[\"user\", \"local_date\", \"prev_minute_index\"]]\n",
255		- " jv = cv.merge(nv, left_on=[\"user\", \"local_date\", \"local_minute_index\"], right_on=[\"user\", \"local_date\", \"prev_minute_index\"], how=\"inner\")\n",
256		- " jv[\"add_count\"] += 1\n",
257		- " jv = jv[[\"user\", \"local_date\", \"local_minute_index\", \"add_count\"]]\n",
258		- "\n",
259		- " return jv \n",
260		- "\n",
261		- "\n",
262	283	"# iteratively calculate the walk\n",
263	284	"for i in range(0, MINIMUM_NUMBER_OF_MINUTES_FOR_A_WALK):\n",
264	285	" print(\"Iteration: {}, length: {}\".format(i, current_vector.size))\n",
...	...	@@ -279,7 +300,7 @@
279	300	},
280	301	{
281	302	"cell_type": "code",
282		- "execution_count": 111,
	303	+ "execution_count": 149,
283	304	"metadata": {},
284	305	"outputs": [],
285	306	"source": [
...	...	@@ -299,19 +320,7 @@
299	320	"hours = pd.DataFrame({\"hour\": range(0,24)})\n",
300	321	"threehours = pd.DataFrame({\"threehour\": range(0, 8)})\n",
301	322	"\n",
302		- "# generate complete product of vectors\n",
303		- "def product_df(mat1, mat2):\n",
304		- " mat1 = mat1.drop_duplicates()\n",
305		- " mat2 = mat2.drop_duplicates()\n",
306		- "\n",
307		- " temp = pd.DataFrame(list(product(mat1.values, mat2.values)))\n",
308		- " for i, acol in enumerate(mat1.columns):\n",
309		- " temp[acol] = temp[0].apply(lambda x: x[i])\n",
310		- " for i, acol in enumerate(mat2.columns):\n",
311		- " temp[acol] = temp[1].apply(lambda x: x[i])\n",
312		- " temp = temp.drop(columns=[0, 1])\n",
313		- " return temp\n",
314		- "\n",
	323	+ "# generate complete product dataframe\n",
315	324	"measured_hour = product_df(walk_by_hours[[\"user\", \"local_date\"]], hours[[\"hour\"]])\n",
316	325	"measured_threehour = product_df(walk_by_threehours[[\"user\", \"local_date\"]], threehours[[\"threehour\"]])\n",
317	326	"\n",
...	...	@@ -337,7 +346,7 @@
337	346	},
338	347	{
339	348	"cell_type": "code",
340		- "execution_count": 141,
	349	+ "execution_count": 150,
341	350	"metadata": {},
342	351	"outputs": [],
343	352	"source": [
...	...	@@ -349,12 +358,6 @@
349	358	"users = start_date.index\n",
350	359	"all_dates = pd.DataFrame({\"user\": [], \"local_date\": []})\n",
351	360	"\n",
352		- "def date_range(start_date, end_date):\n",
353		- " delta = end_date - start_date\n",
354		- "\n",
355		- " for i in range(delta.days + 1):\n",
356		- " yield start_date + timedelta(days=i)\n",
357		- "\n",
358	361	"# generate the lists of dates between start and end date\n",
359	362	"for userid in users:\n",
360	363	" current_user_dates = pd.DataFrame({\"user\": userid, \"local_date\": list(date_range(start_date[userid], end_date[userid]))})\n",
...	...	@@ -365,9 +368,22 @@
365	368	"all_dates_hour = product_df(all_dates[[\"user\", \"local_date\"]], hours[[\"hour\"]])\n",
366	369	"\n",
367	370	"# final padded gait data\n",
368		- "padded_hours = padded_hours.merge(all_dates_hour, on=[\"user\", \"local_date\", \"hour\"], how=\"right\").fillna(0)\n",
369		- "\n",
370		- "\n",
	371	+ "padded_hours = padded_hours.merge(all_dates_hour, on=[\"user\", \"local_date\", \"hour\"], how=\"right\").fillna(0)\n"
	372	+ ]
	373	+ },
	374	+ {
	375	+ "cell_type": "markdown",
	376	+ "metadata": {},
	377	+ "source": [
	378	+ "# Saving Data"
	379	+ ]
	380	+ },
	381	+ {
	382	+ "cell_type": "code",
	383	+ "execution_count": 151,
	384	+ "metadata": {},
	385	+ "outputs": [],
	386	+ "source": [
371	387	"# save the data\n",
372	388	"padded_hours.to_csv(os.path.join(data_dir, \"padded_hours.csv\"), index=False)\n",
373	389	"padded_threehours.to_csv(os.path.join(data_dir, \"padded_threehours.csv\"), index=False)"