prepare_trteva_data.ipynb 7.93 KB
edit raw blame history

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import os\n",
    "from tools import *\n",
    "from constants import *\n",
    "from tensorflow.keras.utils import to_categorical\n",
    "\n",
    "# %load_ext line_profiler"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Prepare Training, Testing, and Validation Data\n",
    "## Loading the preprocessed data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# to use unlimited memory for large dataframes\n",
    "pd.options.mode.chained_assignment = None\n",
    "\n",
    "data_dir = '../data'\n",
    "\n",
    "padded_hours = pd.read_pickle(os.path.join(data_dir, 'padded_hours.pkl'))\n",
    "padded_threehours = pd.read_pickle(os.path.join(data_dir, 'padded_threehours.pkl'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Expanding one-hot-encoded gaits"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(42360, 4) -> (127080, 4)\n"
     ]
    }
   ],
   "source": [
    "def mass_one_hot_encoding(padded_hours, colname, n_classes):\n",
    "    def __mass_one_hot_encoding(padded_hours, colname, n_classes, n):\n",
    "        temp = padded_hours[padded_hours[colname] == n]\n",
    "\n",
    "        return_df = pd.DataFrame(dtype=int)\n",
    "\n",
    "        for i in range(n_classes):\n",
    "            temp_2 = temp.copy(deep=True)\n",
    "            temp_2[\"var\"] = i\n",
    "            temp_2[\"value\"] = (n == i) if 1 else 0\n",
    "            return_df = pd.concat([return_df, temp_2], ignore_index=True)\n",
    "\n",
    "        return return_df\n",
    "    \n",
    "    mass_encoded = pd.DataFrame(dtype=int)\n",
    "    for n in range(n_classes):\n",
    "        mass_encoded = pd.concat([mass_encoded, __mass_one_hot_encoding(padded_hours, colname, n_classes, n)], ignore_index=True)\n",
    "    return mass_encoded\n",
    "\n",
    "padded_hours_encoded = mass_one_hot_encoding(padded_hours, 'walked', 3)\n",
    "padded_hours_encoded[\"local_date\"] = padded_hours_encoded[\"local_date\"].astype(str)\n",
    "padded_hours_encoded = padded_hours_encoded.set_index(['user', 'local_date']).sort_index()\n",
    "\n",
    "print(\"{} -> {}\".format(padded_hours.shape, padded_hours_encoded.shape))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Enumerating Output Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# return output value\n",
    "def get_output(y):\n",
    "    return y[\"walked\"]\n",
    "\n",
    "# return intput value\n",
    "def get_input(y, padded_hours):\n",
    "    # base information\n",
    "    user = y[\"user\"]\n",
    "    local_date = y[\"local_date\"]\n",
    "    threehour_idx = y[\"threehour\"]\n",
    "    \n",
    "    # derived information\n",
    "    hour_idx = threehour_idx * 3\n",
    "    encoded_hour_idx = to_categorical(hour_idx, num_classes=24)\n",
    "    end_date = local_date - timedelta(days=1)\n",
    "    start_date = end_date - timedelta(days=7*NUMBER_OF_WEEKS_FOR_LOOKING_BACK-1)\n",
    "    weekday = local_date.weekday()\n",
    "    encoded_weekday = to_categorical(weekday, num_classes=7)\n",
    "    encoded_month = to_categorical(local_date.month, num_classes=12)\n",
    "    encoded_day_of_month = to_categorical(local_date.day, num_classes=31)\n",
    "\n",
    "    gait = pd.Series([], dtype=int)\n",
    "    # gait movement\n",
    "    zero_move = 0\n",
    "    for a_date in date_range(start_date, end_date):\n",
    "        key = (user, a_date.strftime(\"%Y-%m-%d\"))\n",
    "        if key in padded_hours_encoded.index:\n",
    "            day_df = padded_hours_encoded.loc[key, \"value\"]\n",
    "            gait = pd.concat([gait, day_df], ignore_index=True)\n",
    "        else:\n",
    "            gait = pd.concat([gait, pd.Series([1,0,0] * 24, dtype=int)], ignore_index=True)\n",
    "            zero_move += 1\n",
    "    if zero_move == 5 * 7:\n",
    "        raise Exception(\"No movement data\")\n",
    "\n",
    "    return_series = pd.Series([], dtype=int)\n",
    "    return_series = pd.concat([return_series, pd.Series(encoded_hour_idx, dtype=np.int_)])\n",
    "    return_series = pd.concat([return_series, pd.Series(encoded_weekday, dtype=np.int_)])\n",
    "    return_series = pd.concat([return_series, pd.Series(encoded_month, dtype=np.int_)])\n",
    "    return_series = pd.concat([return_series, pd.Series(encoded_day_of_month, dtype=np.int_)])\n",
    "    return_series = pd.concat([return_series, gait])\n",
    "    \n",
    "    return return_series\n",
    "\n",
    "def get_database(start_idx, end_idx):\n",
    "    database = pd.DataFrame({}, dtype=int)\n",
    "\n",
    "    for i in range(start_idx, end_idx):\n",
    "        try:\n",
    "            y = padded_threehours.iloc[i, :]\n",
    "            user = y[\"user\"]\n",
    "            local_date = y[\"local_date\"]\n",
    "            first_day = padded_hours[padded_hours[\"user\"] == user][\"local_date\"].min()\n",
    "            date_diff = (local_date - first_day).days\n",
    "\n",
    "            threehour_idx = y[\"threehour\"]\n",
    "            hour_idx = threehour_idx * 3\n",
    "\n",
    "            output = get_output(y)\n",
    "            input = get_input(y, padded_hours)\n",
    "\n",
    "            temp_series = pd.Series([], dtype=int)\n",
    "            temp_series = pd.concat([temp_series, pd.Series(user, dtype=int)])\n",
    "            temp_series = pd.concat([temp_series, pd.Series(date_diff, dtype=int)])\n",
    "            temp_series = pd.concat([temp_series, pd.Series(threehour_idx, dtype=int)])\n",
    "            temp_series = pd.concat([temp_series, pd.Series(hour_idx, dtype=int)])\n",
    "            temp_series = pd.concat([temp_series, pd.Series(output, dtype=int)])\n",
    "            temp_series = pd.concat([temp_series, pd.Series(input, dtype=int)]).reset_index(drop=True)\n",
    "\n",
    "            database = pd.concat([database, temp_series], axis=1)\n",
    "            # print(input)\n",
    "        except Exception as e:\n",
    "            # print(\"Error:\", e)\n",
    "            pass\n",
    "\n",
    "    return database\n",
    "\n",
    "database = get_database(0, 100)\n",
    "\n",
    "database.to_pickle(os.path.join(data_dir, \"database.pkl\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tensorflow.keras.datasets import mnist\n",
    "from tensorflow.keras.models import Sequential\n",
    "from tensorflow.keras.layers import Dense, Dropout, Flatten, Activation\n",
    "\n",
    "\n",
    "(x_train, y_train), (x_test, y_test) = mnist.load_data(path='mnist.npz')\n",
    "\n",
    "X_train = x_train.reshape(60000, 784).astype('float32') / 255\n",
    "X_test = x_test.reshape(10000, 784).astype('float32') / 255\n",
    "\n",
    "Y_train = to_categorical(y_train, 10)\n",
    "Y_test = to_categorical(y_test, 10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "80dbe1014b4652684caa329d41db00af3ae439be86b11eab7e35b518e5d8ab1a"
  },
  "kernelspec": {
   "display_name": "Python 3.7.9 64-bit ('venv': venv)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.9"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}