prepare_trteva_data.ipynb 7.93 KB
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import os\n",
"from tools import *\n",
"from constants import *\n",
"from tensorflow.keras.utils import to_categorical\n",
"\n",
"# %load_ext line_profiler"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Prepare Training, Testing, and Validation Data\n",
"## Loading the preprocessed data"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# to use unlimited memory for large dataframes\n",
"pd.options.mode.chained_assignment = None\n",
"\n",
"data_dir = '../data'\n",
"\n",
"padded_hours = pd.read_pickle(os.path.join(data_dir, 'padded_hours.pkl'))\n",
"padded_threehours = pd.read_pickle(os.path.join(data_dir, 'padded_threehours.pkl'))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Expanding one-hot-encoded gaits"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(42360, 4) -> (127080, 4)\n"
]
}
],
"source": [
"def mass_one_hot_encoding(padded_hours, colname, n_classes):\n",
" def __mass_one_hot_encoding(padded_hours, colname, n_classes, n):\n",
" temp = padded_hours[padded_hours[colname] == n]\n",
"\n",
" return_df = pd.DataFrame(dtype=int)\n",
"\n",
" for i in range(n_classes):\n",
" temp_2 = temp.copy(deep=True)\n",
" temp_2[\"var\"] = i\n",
" temp_2[\"value\"] = (n == i) if 1 else 0\n",
" return_df = pd.concat([return_df, temp_2], ignore_index=True)\n",
"\n",
" return return_df\n",
" \n",
" mass_encoded = pd.DataFrame(dtype=int)\n",
" for n in range(n_classes):\n",
" mass_encoded = pd.concat([mass_encoded, __mass_one_hot_encoding(padded_hours, colname, n_classes, n)], ignore_index=True)\n",
" return mass_encoded\n",
"\n",
"padded_hours_encoded = mass_one_hot_encoding(padded_hours, 'walked', 3)\n",
"padded_hours_encoded[\"local_date\"] = padded_hours_encoded[\"local_date\"].astype(str)\n",
"padded_hours_encoded = padded_hours_encoded.set_index(['user', 'local_date']).sort_index()\n",
"\n",
"print(\"{} -> {}\".format(padded_hours.shape, padded_hours_encoded.shape))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Enumerating Output Data"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"# return output value\n",
"def get_output(y):\n",
" return y[\"walked\"]\n",
"\n",
"# return intput value\n",
"def get_input(y, padded_hours):\n",
" # base information\n",
" user = y[\"user\"]\n",
" local_date = y[\"local_date\"]\n",
" threehour_idx = y[\"threehour\"]\n",
" \n",
" # derived information\n",
" hour_idx = threehour_idx * 3\n",
" encoded_hour_idx = to_categorical(hour_idx, num_classes=24)\n",
" end_date = local_date - timedelta(days=1)\n",
" start_date = end_date - timedelta(days=7*NUMBER_OF_WEEKS_FOR_LOOKING_BACK-1)\n",
" weekday = local_date.weekday()\n",
" encoded_weekday = to_categorical(weekday, num_classes=7)\n",
" encoded_month = to_categorical(local_date.month, num_classes=12)\n",
" encoded_day_of_month = to_categorical(local_date.day, num_classes=31)\n",
"\n",
" gait = pd.Series([], dtype=int)\n",
" # gait movement\n",
" zero_move = 0\n",
" for a_date in date_range(start_date, end_date):\n",
" key = (user, a_date.strftime(\"%Y-%m-%d\"))\n",
" if key in padded_hours_encoded.index:\n",
" day_df = padded_hours_encoded.loc[key, \"value\"]\n",
" gait = pd.concat([gait, day_df], ignore_index=True)\n",
" else:\n",
" gait = pd.concat([gait, pd.Series([1,0,0] * 24, dtype=int)], ignore_index=True)\n",
" zero_move += 1\n",
" if zero_move == 5 * 7:\n",
" raise Exception(\"No movement data\")\n",
"\n",
" return_series = pd.Series([], dtype=int)\n",
" return_series = pd.concat([return_series, pd.Series(encoded_hour_idx, dtype=np.int_)])\n",
" return_series = pd.concat([return_series, pd.Series(encoded_weekday, dtype=np.int_)])\n",
" return_series = pd.concat([return_series, pd.Series(encoded_month, dtype=np.int_)])\n",
" return_series = pd.concat([return_series, pd.Series(encoded_day_of_month, dtype=np.int_)])\n",
" return_series = pd.concat([return_series, gait])\n",
" \n",
" return return_series\n",
"\n",
"def get_database(start_idx, end_idx):\n",
" database = pd.DataFrame({}, dtype=int)\n",
"\n",
" for i in range(start_idx, end_idx):\n",
" try:\n",
" y = padded_threehours.iloc[i, :]\n",
" user = y[\"user\"]\n",
" local_date = y[\"local_date\"]\n",
" first_day = padded_hours[padded_hours[\"user\"] == user][\"local_date\"].min()\n",
" date_diff = (local_date - first_day).days\n",
"\n",
" threehour_idx = y[\"threehour\"]\n",
" hour_idx = threehour_idx * 3\n",
"\n",
" output = get_output(y)\n",
" input = get_input(y, padded_hours)\n",
"\n",
" temp_series = pd.Series([], dtype=int)\n",
" temp_series = pd.concat([temp_series, pd.Series(user, dtype=int)])\n",
" temp_series = pd.concat([temp_series, pd.Series(date_diff, dtype=int)])\n",
" temp_series = pd.concat([temp_series, pd.Series(threehour_idx, dtype=int)])\n",
" temp_series = pd.concat([temp_series, pd.Series(hour_idx, dtype=int)])\n",
" temp_series = pd.concat([temp_series, pd.Series(output, dtype=int)])\n",
" temp_series = pd.concat([temp_series, pd.Series(input, dtype=int)]).reset_index(drop=True)\n",
"\n",
" database = pd.concat([database, temp_series], axis=1)\n",
" # print(input)\n",
" except Exception as e:\n",
" # print(\"Error:\", e)\n",
" pass\n",
"\n",
" return database\n",
"\n",
"database = get_database(0, 100)\n",
"\n",
"database.to_pickle(os.path.join(data_dir, \"database.pkl\"))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"from tensorflow.keras.datasets import mnist\n",
"from tensorflow.keras.models import Sequential\n",
"from tensorflow.keras.layers import Dense, Dropout, Flatten, Activation\n",
"\n",
"\n",
"(x_train, y_train), (x_test, y_test) = mnist.load_data(path='mnist.npz')\n",
"\n",
"X_train = x_train.reshape(60000, 784).astype('float32') / 255\n",
"X_test = x_test.reshape(10000, 784).astype('float32') / 255\n",
"\n",
"Y_train = to_categorical(y_train, 10)\n",
"Y_test = to_categorical(y_test, 10)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"interpreter": {
"hash": "80dbe1014b4652684caa329d41db00af3ae439be86b11eab7e35b518e5d8ab1a"
},
"kernelspec": {
"display_name": "Python 3.7.9 64-bit ('venv': venv)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.9"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}