Junghwan Park / fpm280-data-analysis

Browse Code »

Commit 979a88c36e8e832d4a38b058c355919d741dc010

Authored by Junghwan Park 2022-02-01 19:18:56 -0800

1 parent aa77698374

Exists in main

picking up the input/output data values

Showing 6 changed files with 270 additions and 68 deletions Side-by-side Diff

python-notebook/__pycache__/constants.cpython-37.pyc
python-notebook/__pycache__/tools.cpython-37.pyc
python-notebook/constants.py
python-notebook/data_loading.ipynb
python-notebook/prepare_trteva_data.ipynb
python-notebook/tools.py

No preview for this file type

	1	+# threshold on the number of days per user (NOTE(review): how the threshold is applied is not visible here; confirm at the call site)
	2	+THRESHOLD_OF_DAYS_PER_USER = 10
	3	+
	4	+# minimum number of consecutive minutes for a walk
	5	+MINIMUM_NUMBER_OF_MINUTES_FOR_A_WALK = 5
	6	+
	7	+# minimum number of steps per minute for an active minute
	8	+MINIMUM_STEPS_PER_MINUTE = 60
	9	+
	10	+# number of weeks for looking back
	11	+NUMBER_OF_WEEKS_FOR_LOOKING_BACK = 5

...	...	@@ -9,7 +9,7 @@
9	9	},
10	10	{
11	11	"cell_type": "code",
12		- "execution_count": 142,
	12	+ "execution_count": 161,
13	13	"metadata": {},
14	14	"outputs": [],
15	15	"source": [
...	...	@@ -19,9 +19,8 @@
19	19	"from pandas import read_csv\n",
20	20	"import pandas as pd\n",
21	21	"import os\n",
22		- "from datetime import datetime, date, timedelta\n",
23		- "from itertools import product\n",
24		- "# %load_ext line_profiler"
	22	+ "from tools import *\n",
	23	+ "from constants import *"
25	24	]
26	25	},
27	26	{
28	27
...	...	@@ -33,65 +32,12 @@
33	32	},
34	33	{
35	34	"cell_type": "code",
36		- "execution_count": 143,
	35	+ "execution_count": 162,
37	36	"metadata": {},
38	37	"outputs": [],
39	38	"source": [
40	39	"# silence pandas' chained-assignment warning (SettingWithCopyWarning)\n",
41		- "pd.options.mode.chained_assignment = None\n",
42		- "\n",
43		- "# convert a datetime object to a date object\n",
44		- "def get_date(x):\n",
45		- " return date(x.year, x.month, x.day)\n",
46		- "\n",
47		- "# convert a datetime object to an integer, which denotes the number of minutes since midnight\n",
48		- "def get_minute_index(x):\n",
49		- " return (x.hour * 60) + x.minute\n",
50		- "\n",
51		- "# return a range of dates\n",
52		- "def date_range(start_date, end_date):\n",
53		- " delta = end_date - start_date\n",
54		- "\n",
55		- " for i in range(delta.days + 1):\n",
56		- " yield start_date + timedelta(days=i)\n",
57		- "\n",
58		- "# define an iterative walk calculation (merging consecutive active minutes)\n",
59		- "def calculate_walk(cv):\n",
60		- " nv = cv.copy(deep=True)\n",
61		- " nv[\"prev_minute_index\"] = nv[\"local_minute_index\"] - 1\n",
62		- "\n",
63		- " # move midnight minutes to previous day\n",
64		- " nv[nv[\"prev_minute_index\"] < 0][\"local_date\"] -= timedelta(days=1)\n",
65		- " nv[nv[\"prev_minute_index\"] < 0][\"prev_minute_index\"] = 1439\n",
66		- " \n",
67		- " nv = nv[[\"user\", \"local_date\", \"prev_minute_index\"]]\n",
68		- " jv = cv.merge(nv, left_on=[\"user\", \"local_date\", \"local_minute_index\"], right_on=[\"user\", \"local_date\", \"prev_minute_index\"], how=\"inner\")\n",
69		- " jv[\"add_count\"] += 1\n",
70		- " jv = jv[[\"user\", \"local_date\", \"local_minute_index\", \"add_count\"]]\n",
71		- "\n",
72		- " return jv \n",
73		- "\n",
74		- "# generate complete product of vectors\n",
75		- "def product_df(mat1, mat2):\n",
76		- " mat1 = mat1.drop_duplicates()\n",
77		- " mat2 = mat2.drop_duplicates()\n",
78		- "\n",
79		- " temp = pd.DataFrame(list(product(mat1.values, mat2.values)))\n",
80		- " for i, acol in enumerate(mat1.columns):\n",
81		- " temp[acol] = temp[0].apply(lambda x: x[i])\n",
82		- " for i, acol in enumerate(mat2.columns):\n",
83		- " temp[acol] = temp[1].apply(lambda x: x[i])\n",
84		- " temp = temp.drop(columns=[0, 1])\n",
85		- " return temp\n",
86		- "\n",
87		- "# cut off values that are not in the range of the data\n",
88		- "THRESHOLD_OF_DAYS_PER_USER = 10\n",
89		- "\n",
90		- "# cut off values for the number of consecutive minutes for a walk\n",
91		- "MINIMUM_NUMBER_OF_MINUTES_FOR_A_WALK = 5\n",
92		- "\n",
93		- "# cut off values for the number of steps per minute for an active minute\n",
94		- "MINIMUM_STEPS_PER_MINUTE = 60\n"
	40	+ "pd.options.mode.chained_assignment = None"
95	41	]
96	42	},
97	43	{
...	...	@@ -103,7 +49,7 @@
103	49	},
104	50	{
105	51	"cell_type": "code",
106		- "execution_count": 144,
	52	+ "execution_count": 163,
107	53	"metadata": {},
108	54	"outputs": [],
109	55	"source": [
...	...	@@ -124,7 +70,7 @@
124	70	},
125	71	{
126	72	"cell_type": "code",
127		- "execution_count": 145,
	73	+ "execution_count": 164,
128	74	"metadata": {},
129	75	"outputs": [],
130	76	"source": [
...	...	@@ -173,7 +119,7 @@
173	119	},
174	120	{
175	121	"cell_type": "code",
176		- "execution_count": 146,
	122	+ "execution_count": 165,
177	123	"metadata": {},
178	124	"outputs": [],
179	125	"source": [
...	...	@@ -190,7 +136,7 @@
190	136	},
191	137	{
192	138	"cell_type": "code",
193		- "execution_count": 147,
	139	+ "execution_count": 166,
194	140	"metadata": {},
195	141	"outputs": [
196	142	{
...	...	@@ -257,7 +203,7 @@
257	203	},
258	204	{
259	205	"cell_type": "code",
260		- "execution_count": 148,
	206	+ "execution_count": 167,
261	207	"metadata": {},
262	208	"outputs": [
263	209	{
...	...	@@ -300,7 +246,7 @@
300	246	},
301	247	{
302	248	"cell_type": "code",
303		- "execution_count": 149,
	249	+ "execution_count": 168,
304	250	"metadata": {},
305	251	"outputs": [],
306	252	"source": [
...	...	@@ -346,7 +292,7 @@
346	292	},
347	293	{
348	294	"cell_type": "code",
349		- "execution_count": 150,
	295	+ "execution_count": 169,
350	296	"metadata": {},
351	297	"outputs": [],
352	298	"source": [
353	299
354	300
...	...	@@ -380,13 +326,25 @@
380	326	},
381	327	{
382	328	"cell_type": "code",
383		- "execution_count": 151,
	329	+ "execution_count": 170,
384	330	"metadata": {},
385	331	"outputs": [],
386	332	"source": [
	333	+ "# converting data type\n",
	334	+ "padded_hours[\"user\"] = padded_hours[\"user\"].astype(int)\n",
	335	+ "padded_hours[\"hour\"] = padded_hours[\"hour\"].astype(int)\n",
	336	+ "padded_hours[\"walked\"] = padded_hours[\"walked\"].astype(int)\n",
	337	+ "\n",
	338	+ "padded_threehours[\"user\"] = padded_threehours[\"user\"].astype(int)\n",
	339	+ "padded_threehours[\"threehour\"] = padded_threehours[\"threehour\"].astype(int)\n",
	340	+ "padded_threehours[\"walked\"] = padded_threehours[\"walked\"].astype(int)\n",
	341	+ "\n",
387	342	"# save the data\n",
388	343	"padded_hours.to_csv(os.path.join(data_dir, \"padded_hours.csv\"), index=False)\n",
389		- "padded_threehours.to_csv(os.path.join(data_dir, \"padded_threehours.csv\"), index=False)"
	344	+ "padded_threehours.to_csv(os.path.join(data_dir, \"padded_threehours.csv\"), index=False)\n",
	345	+ "\n",
	346	+ "padded_hours.to_pickle(os.path.join(data_dir, \"padded_hours.pkl\"))\n",
	347	+ "padded_threehours.to_pickle(os.path.join(data_dir, \"padded_threehours.pkl\"))"
390	348	]
391	349	}
392	350	],

	1	+{
	2	+ "cells": [
	3	+ {
	4	+ "cell_type": "code",
	5	+ "execution_count": 1,
	6	+ "metadata": {},
	7	+ "outputs": [],
	8	+ "source": [
	9	+ "import numpy as np\n",
	10	+ "import pandas as pd\n",
	11	+ "import os\n",
	12	+ "from tools import *\n",
	13	+ "from constants import *\n",
	14	+ "from tensorflow.keras.utils import to_categorical"
	15	+ ]
	16	+ },
	17	+ {
	18	+ "cell_type": "markdown",
	19	+ "metadata": {},
	20	+ "source": [
	21	+ "# Prepare Training, Testing, and Validation Data\n",
	22	+ "## Loading the preprocessed data"
	23	+ ]
	24	+ },
	25	+ {
	26	+ "cell_type": "code",
	27	+ "execution_count": 2,
	28	+ "metadata": {},
	29	+ "outputs": [],
	30	+ "source": [
	31	+ "# silence pandas' chained-assignment warning (SettingWithCopyWarning)\n",
	32	+ "pd.options.mode.chained_assignment = None\n",
	33	+ "\n",
	34	+ "data_dir = '../data'\n",
	35	+ "\n",
	36	+ "padded_hours = pd.read_pickle(os.path.join(data_dir, 'padded_hours.pkl'))\n",
	37	+ "padded_threehours = pd.read_pickle(os.path.join(data_dir, 'padded_threehours.pkl'))"
	38	+ ]
	39	+ },
	40	+ {
	41	+ "cell_type": "markdown",
	42	+ "metadata": {},
	43	+ "source": [
	44	+ "## Enumerating Output Data"
	45	+ ]
	46	+ },
	47	+ {
	48	+ "cell_type": "code",
	49	+ "execution_count": 3,
	50	+ "metadata": {},
	51	+ "outputs": [],
	52	+ "source": [
	53	+ "# return output value\n",
	54	+ "def get_output(y):\n",
	55	+ " return y[\"walked\"]\n",
	56	+ "\n",
	57	+ "# return input value: one-hot hour, weekday, month and day of month of row y, followed by the one-hot hourly walked history of the look-back window\n",
	58	+ "def get_input(y, padded_hours):\n",
	59	+ " # base information\n",
	60	+ " user = y[\"user\"]\n",
	61	+ " local_date = y[\"local_date\"]\n",
	62	+ " threehour_idx = y[\"threehour\"]\n",
	63	+ " \n",
	64	+ " # derived information\n",
	65	+ " hour_idx = threehour_idx * 3\n",
	66	+ " encoded_hour_idx = to_categorical(hour_idx, num_classes=24)\n",
	67	+ " end_date = local_date - timedelta(days=1)\n",
	68	+ " start_date = end_date - timedelta(days=7*NUMBER_OF_WEEKS_FOR_LOOKING_BACK-1)\n",
	69	+ " weekday = local_date.weekday()\n",
	70	+ " encoded_weekday = to_categorical(weekday, num_classes=7)\n",
	71	+ " encoded_month = to_categorical(local_date.month - 1, num_classes=12) # month is 1-based, class indices are 0-based\n",
	72	+ " encoded_day_of_month = to_categorical(local_date.day - 1, num_classes=31) # day is 1-based, class indices are 0-based\n",
	73	+ "\n",
	74	+ " gait = pd.Series([], dtype=int)\n",
	75	+ " # gait movement: 24 hours x 3 classes per day; a day without rows is filled with class 0\n",
	76	+ " zero_move = 0\n",
	77	+ " for a_date in date_range(start_date, end_date):\n",
	78	+ " day_df = padded_hours[(padded_hours[\"user\"] == user) & (padded_hours[\"local_date\"] == a_date)]\n",
	79	+ " if day_df.size == 0:\n",
	80	+ " gait = pd.concat([gait, pd.Series([1,0,0] * 24, dtype=int)])\n",
	81	+ " zero_move += 1\n",
	82	+ " else:\n",
	83	+ " gait = pd.concat([gait, pd.Series(to_categorical(day_df[\"walked\"].values, 3, dtype=int).reshape(24*3), dtype=int)])\n",
	84	+ " if zero_move == 7 * NUMBER_OF_WEEKS_FOR_LOOKING_BACK: # every day of the look-back window had no rows\n",
	85	+ " raise Exception(\"No movement data\")\n",
	86	+ "\n",
	87	+ " return_series = pd.Series([], dtype=int)\n",
	88	+ " return_series = pd.concat([return_series, pd.Series(encoded_hour_idx, dtype=np.int_)])\n",
	89	+ " return_series = pd.concat([return_series, pd.Series(encoded_weekday, dtype=np.int_)])\n",
	90	+ " return_series = pd.concat([return_series, pd.Series(encoded_month, dtype=np.int_)])\n",
	91	+ " return_series = pd.concat([return_series, pd.Series(encoded_day_of_month, dtype=np.int_)])\n",
	92	+ " return_series = pd.concat([return_series, gait])\n",
	93	+ " \n",
	94	+ " return return_series\n",
	95	+ "\n",
	96	+ "def get_database(start_idx, end_idx):\n",
	97	+ " database = pd.DataFrame({}, dtype=int)\n",
	98	+ "\n",
	99	+ " for i in range(start_idx, end_idx):\n",
	100	+ " try:\n",
	101	+ " y = padded_threehours.iloc[i, :]\n",
	102	+ " user = y[\"user\"]\n",
	103	+ " local_date = y[\"local_date\"]\n",
	104	+ " first_day = padded_hours[padded_hours[\"user\"] == user][\"local_date\"].min()\n",
	105	+ " date_diff = (local_date - first_day).days\n",
	106	+ "\n",
	107	+ " threehour_idx = y[\"threehour\"]\n",
	108	+ " hour_idx = threehour_idx * 3\n",
	109	+ "\n",
	110	+ " output = get_output(y)\n",
	111	+ " input = get_input(y, padded_hours)\n",
	112	+ "\n",
	113	+ " temp_series = pd.Series([], dtype=int)\n",
	114	+ " temp_series = pd.concat([temp_series, pd.Series(user, dtype=int)])\n",
	115	+ " temp_series = pd.concat([temp_series, pd.Series(date_diff, dtype=int)])\n",
	116	+ " temp_series = pd.concat([temp_series, pd.Series(threehour_idx, dtype=int)])\n",
	117	+ " temp_series = pd.concat([temp_series, pd.Series(hour_idx, dtype=int)])\n",
	118	+ " temp_series = pd.concat([temp_series, pd.Series(output, dtype=int)])\n",
	119	+ " temp_series = pd.concat([temp_series, pd.Series(input, dtype=int)])\n",
	120	+ "\n",
	121	+ " database = pd.concat([database, temp_series], axis=1)\n",
	122	+ " # print(input)\n",
	123	+ " except Exception as e:\n",
	124	+ " # print(\"Error:\", e)\n",
	125	+ " pass\n",
	126	+ "\n",
	127	+ " return database\n",
	128	+ "\n"
	129	+ ]
	130	+ },
	131	+ {
	132	+ "cell_type": "code",
	133	+ "execution_count": 4,
	134	+ "metadata": {},
	135	+ "outputs": [],
	136	+ "source": [
	137	+ "from tensorflow.keras.datasets import mnist\n",
	138	+ "from tensorflow.keras.models import Sequential\n",
	139	+ "from tensorflow.keras.layers import Dense, Dropout, Flatten, Activation\n",
	140	+ "\n",
	141	+ "\n",
	142	+ "(x_train, y_train), (x_test, y_test) = mnist.load_data(path='mnist.npz')\n",
	143	+ "\n",
	144	+ "X_train = x_train.reshape(60000, 784).astype('float32') / 255\n",
	145	+ "X_test = x_test.reshape(10000, 784).astype('float32') / 255\n",
	146	+ "\n",
	147	+ "Y_train = to_categorical(y_train, 10)\n",
	148	+ "Y_test = to_categorical(y_test, 10)"
	149	+ ]
	150	+ },
	151	+ {
	152	+ "cell_type": "code",
	153	+ "execution_count": null,
	154	+ "metadata": {},
	155	+ "outputs": [],
	156	+ "source": []
	157	+ }
	158	+ ],
	159	+ "metadata": {
	160	+ "interpreter": {
	161	+ "hash": "80dbe1014b4652684caa329d41db00af3ae439be86b11eab7e35b518e5d8ab1a"
	162	+ },
	163	+ "kernelspec": {
	164	+ "display_name": "Python 3.7.9 64-bit ('venv': venv)",
	165	+ "language": "python",
	166	+ "name": "python3"
	167	+ },
	168	+ "language_info": {
	169	+ "codemirror_mode": {
	170	+ "name": "ipython",
	171	+ "version": 3
	172	+ },
	173	+ "file_extension": ".py",
	174	+ "mimetype": "text/x-python",
	175	+ "name": "python",
	176	+ "nbconvert_exporter": "python",
	177	+ "pygments_lexer": "ipython3",
	178	+ "version": "3.7.9"
	179	+ },
	180	+ "orig_nbformat": 4
	181	+ },
	182	+ "nbformat": 4,
	183	+ "nbformat_minor": 2
	184	+}

	1	+import pandas as pd
	2	+
	3	+from itertools import product
	4	+from datetime import date, timedelta
	5	+
	6	+
	7	+# convert a datetime object to a date object
	8	+def get_date(x):
	9	+ return date(x.year, x.month, x.day)
	10	+
	11	+# convert a datetime object to an integer, which denotes the number of minutes since midnight
	12	+def get_minute_index(x):
	13	+ return (x.hour * 60) + x.minute
	14	+
	15	+# return a range of dates
	16	+def date_range(start_date, end_date):
	17	+ delta = end_date - start_date
	18	+
	19	+ for i in range(delta.days + 1):
	20	+ yield start_date + timedelta(days=i)
	21	+
	22	+# define an iterative walk calculation (merging consecutive active minutes)
	23	+def calculate_walk(cv):
	24	+ nv = cv.copy(deep=True)
	25	+ nv["prev_minute_index"] = nv["local_minute_index"] - 1
	26	+
	27	+ # move midnight minutes to previous day
	28	+ nv[nv["prev_minute_index"] < 0]["local_date"] -= timedelta(days=1)
	29	+ nv[nv["prev_minute_index"] < 0]["prev_minute_index"] = 1439
	30	+
	31	+ nv = nv[["user", "local_date", "prev_minute_index"]]
	32	+ jv = cv.merge(nv, left_on=["user", "local_date", "local_minute_index"], right_on=["user", "local_date", "prev_minute_index"], how="inner")
	33	+ jv["add_count"] += 1
	34	+ jv = jv[["user", "local_date", "local_minute_index", "add_count"]]
	35	+
	36	+ return jv
	37	+
	38	+# generate complete product of vectors
	39	+def product_df(mat1, mat2):
	40	+ mat1 = mat1.drop_duplicates()
	41	+ mat2 = mat2.drop_duplicates()
	42	+
	43	+ temp = pd.DataFrame(list(product(mat1.values, mat2.values)))
	44	+ for i, acol in enumerate(mat1.columns):
	45	+ temp[acol] = temp[0].apply(lambda x: x[i])
	46	+ for i, acol in enumerate(mat2.columns):
	47	+ temp[acol] = temp[1].apply(lambda x: x[i])
	48	+ temp = temp.drop(columns=[0, 1])
	49	+ return temp