Commit 979a88c36e8e832d4a38b058c355919d741dc010
1 parent
aa77698374
Exists in
main
picking up the input/output data values
Showing 6 changed files with 270 additions and 68 deletions Inline Diff
python-notebook/__pycache__/constants.cpython-37.pyc
View file @
979a88c
python-notebook/__pycache__/tools.cpython-37.pyc
View file @
979a88c
python-notebook/constants.py
View file @
979a88c
File was created | 1 | # minimum number of distinct days of data a user must have; users with fewer days are removed | ||
2 | THRESHOLD_OF_DAYS_PER_USER = 10 | |||
3 | ||||
4 | # cut off values for the number of consecutive minutes for a walk | |||
5 | MINIMUM_NUMBER_OF_MINUTES_FOR_A_WALK = 5 | |||
6 | ||||
7 | # cut off values for the number of steps per minute for an active minute | |||
8 | MINIMUM_STEPS_PER_MINUTE = 60 |
python-notebook/data_loading.ipynb
View file @
979a88c
{ | 1 | 1 | { | |
"cells": [ | 2 | 2 | "cells": [ | |
{ | 3 | 3 | { | |
"cell_type": "markdown", | 4 | 4 | "cell_type": "markdown", | |
"metadata": {}, | 5 | 5 | "metadata": {}, | |
"source": [ | 6 | 6 | "source": [ | |
"# Loading libraries" | 7 | 7 | "# Loading libraries" | |
] | 8 | 8 | ] | |
}, | 9 | 9 | }, | |
{ | 10 | 10 | { | |
"cell_type": "code", | 11 | 11 | "cell_type": "code", | |
"execution_count": 142, | 12 | 12 | "execution_count": 161, | |
"metadata": {}, | 13 | 13 | "metadata": {}, | |
"outputs": [], | 14 | 14 | "outputs": [], | |
"source": [ | 15 | 15 | "source": [ | |
"import numpy as np\n", | 16 | 16 | "import numpy as np\n", | |
"import matplotlib.pyplot as plt\n", | 17 | 17 | "import matplotlib.pyplot as plt\n", | |
"import seaborn as sns\n", | 18 | 18 | "import seaborn as sns\n", | |
"from pandas import read_csv\n", | 19 | 19 | "from pandas import read_csv\n", | |
"import pandas as pd\n", | 20 | 20 | "import pandas as pd\n", | |
"import os\n", | 21 | 21 | "import os\n", | |
"from datetime import datetime, date, timedelta\n", | 22 | 22 | "from tools import *\n", | |
"from itertools import product\n", | 23 | 23 | "from constants import *" | |
"# %load_ext line_profiler" | 24 | |||
] | 25 | 24 | ] | |
}, | 26 | 25 | }, | |
{ | 27 | 26 | { | |
"cell_type": "markdown", | 28 | 27 | "cell_type": "markdown", | |
"metadata": {}, | 29 | 28 | "metadata": {}, | |
"source": [ | 30 | 29 | "source": [ | |
"# Defining Functions and Adjusting Settings" | 31 | 30 | "# Defining Functions and Adjusting Settings" | |
] | 32 | 31 | ] | |
}, | 33 | 32 | }, | |
{ | 34 | 33 | { | |
"cell_type": "code", | 35 | 34 | "cell_type": "code", | |
"execution_count": 143, | 36 | 35 | "execution_count": 162, | |
"metadata": {}, | 37 | 36 | "metadata": {}, | |
"outputs": [], | 38 | 37 | "outputs": [], | |
"source": [ | 39 | 38 | "source": [ | |
"# to use unlimited memory for large dataframes\n", | 40 | 39 | "# to use unlimited memory for large dataframes\n", | |
"pd.options.mode.chained_assignment = None\n", | 41 | 40 | "pd.options.mode.chained_assignment = None" | |
"\n", | 42 | |||
"# convert a datetime object to a date object\n", | 43 | |||
"def get_date(x):\n", | 44 | |||
" return date(x.year, x.month, x.day)\n", | 45 | |||
"\n", | 46 | |||
"# convert a datetime object to an integer, which denotes the number of minutes since midnight\n", | 47 | |||
"def get_minute_index(x):\n", | 48 | |||
" return (x.hour * 60) + x.minute\n", | 49 | |||
"\n", | 50 | |||
"# return a range of dates\n", | 51 | |||
"def date_range(start_date, end_date):\n", | 52 | |||
" delta = end_date - start_date\n", | 53 | |||
"\n", | 54 | |||
" for i in range(delta.days + 1):\n", | 55 | |||
" yield start_date + timedelta(days=i)\n", | 56 | |||
"\n", | 57 | |||
"# define an iterative walk calculation (merging consecutive active minutes)\n", | 58 | |||
"def calculate_walk(cv):\n", | 59 | |||
" nv = cv.copy(deep=True)\n", | 60 | |||
" nv[\"prev_minute_index\"] = nv[\"local_minute_index\"] - 1\n", | 61 | |||
"\n", | 62 | |||
" # move midnight minutes to previous day\n", | 63 | |||
" nv[nv[\"prev_minute_index\"] < 0][\"local_date\"] -= timedelta(days=1)\n", | 64 | |||
" nv[nv[\"prev_minute_index\"] < 0][\"prev_minute_index\"] = 1439\n", | 65 | |||
" \n", | 66 | |||
" nv = nv[[\"user\", \"local_date\", \"prev_minute_index\"]]\n", | 67 | |||
" jv = cv.merge(nv, left_on=[\"user\", \"local_date\", \"local_minute_index\"], right_on=[\"user\", \"local_date\", \"prev_minute_index\"], how=\"inner\")\n", | 68 | |||
" jv[\"add_count\"] += 1\n", | 69 | |||
" jv = jv[[\"user\", \"local_date\", \"local_minute_index\", \"add_count\"]]\n", | 70 | |||
"\n", | 71 | |||
" return jv \n", | 72 | |||
"\n", | 73 | |||
"# generate complete product of vectors\n", | 74 | |||
"def product_df(mat1, mat2):\n", | 75 | |||
" mat1 = mat1.drop_duplicates()\n", | 76 | |||
" mat2 = mat2.drop_duplicates()\n", | 77 | |||
"\n", | 78 | |||
" temp = pd.DataFrame(list(product(mat1.values, mat2.values)))\n", | 79 | |||
" for i, acol in enumerate(mat1.columns):\n", | 80 | |||
" temp[acol] = temp[0].apply(lambda x: x[i])\n", | 81 | |||
" for i, acol in enumerate(mat2.columns):\n", | 82 | |||
" temp[acol] = temp[1].apply(lambda x: x[i])\n", | 83 | |||
" temp = temp.drop(columns=[0, 1])\n", | 84 | |||
" return temp\n", | 85 | |||
"\n", | 86 | |||
"# cut off values that are not in the range of the data\n", | 87 | |||
"THRESHOLD_OF_DAYS_PER_USER = 10\n", | 88 | |||
"\n", | 89 | |||
"# cut off values for the number of consecutive minutes for a walk\n", | 90 | |||
"MINIMUM_NUMBER_OF_MINUTES_FOR_A_WALK = 5\n", | 91 | |||
"\n", | 92 | |||
"# cut off values for the number of steps per minute for an active minute\n", | 93 | |||
"MINIMUM_STEPS_PER_MINUTE = 60\n" | 94 | |||
] | 95 | 41 | ] | |
}, | 96 | 42 | }, | |
{ | 97 | 43 | { | |
"cell_type": "markdown", | 98 | 44 | "cell_type": "markdown", | |
"metadata": {}, | 99 | 45 | "metadata": {}, | |
"source": [ | 100 | 46 | "source": [ | |
"# Loading data files" | 101 | 47 | "# Loading data files" | |
] | 102 | 48 | ] | |
}, | 103 | 49 | }, | |
{ | 104 | 50 | { | |
"cell_type": "code", | 105 | 51 | "cell_type": "code", | |
"execution_count": 144, | 106 | 52 | "execution_count": 163, | |
"metadata": {}, | 107 | 53 | "metadata": {}, | |
"outputs": [], | 108 | 54 | "outputs": [], | |
"source": [ | 109 | 55 | "source": [ | |
"data_dir = '../data'\n", | 110 | 56 | "data_dir = '../data'\n", | |
"\n", | 111 | 57 | "\n", | |
"daily = read_csv(os.path.join(data_dir, 'daily.csv'))\n", | 112 | 58 | "daily = read_csv(os.path.join(data_dir, 'daily.csv'))\n", | |
"dose = read_csv(os.path.join(data_dir, 'dose.csv'))\n", | 113 | 59 | "dose = read_csv(os.path.join(data_dir, 'dose.csv'))\n", | |
"jawbone = read_csv(os.path.join(data_dir, 'jawbone.csv'), low_memory=False)\n" | 114 | 60 | "jawbone = read_csv(os.path.join(data_dir, 'jawbone.csv'), low_memory=False)\n" | |
] | 115 | 61 | ] | |
}, | 116 | 62 | }, | |
{ | 117 | 63 | { | |
"cell_type": "markdown", | 118 | 64 | "cell_type": "markdown", | |
"metadata": {}, | 119 | 65 | "metadata": {}, | |
"source": [ | 120 | 66 | "source": [ | |
"# Preprocessing\n", | 121 | 67 | "# Preprocessing\n", | |
"## Picking up the variables" | 122 | 68 | "## Picking up the variables" | |
] | 123 | 69 | ] | |
}, | 124 | 70 | }, | |
{ | 125 | 71 | { | |
"cell_type": "code", | 126 | 72 | "cell_type": "code", | |
"execution_count": 145, | 127 | 73 | "execution_count": 164, | |
"metadata": {}, | 128 | 74 | "metadata": {}, | |
"outputs": [], | 129 | 75 | "outputs": [], | |
"source": [ | 130 | 76 | "source": [ | |
"# Column names of jawbone data\n", | 131 | 77 | "# Column names of jawbone data\n", | |
"# 'Var1', 'user', 'start_datetime', 'end_datetime', 'timezone', 'userid',\n", | 132 | 78 | "# 'Var1', 'user', 'start_datetime', 'end_datetime', 'timezone', 'userid',\n", | |
"# 'steps', 'gmtoff', 'tz', 'start_date', 'end_date', 'start_utime',\n", | 133 | 79 | "# 'steps', 'gmtoff', 'tz', 'start_date', 'end_date', 'start_utime',\n", | |
"# 'end_utime', 'start_udate', 'end_udate', 'intake_date', 'intake_utime',\n", | 134 | 80 | "# 'end_utime', 'start_udate', 'end_udate', 'intake_date', 'intake_utime',\n", | |
"# 'intake_tz', 'intake_gmtoff', 'intake_hour', 'intake_min',\n", | 135 | 81 | "# 'intake_tz', 'intake_gmtoff', 'intake_hour', 'intake_min',\n", | |
"# 'intake_slot', 'travel_start', 'travel_end', 'exit_date',\n", | 136 | 82 | "# 'intake_slot', 'travel_start', 'travel_end', 'exit_date',\n", | |
"# 'dropout_date', 'last_date', 'last_utime', 'last_tz', 'last_gmtoff',\n", | 137 | 83 | "# 'dropout_date', 'last_date', 'last_utime', 'last_tz', 'last_gmtoff',\n", | |
"# 'last_hour', 'last_min', 'start_utime_local', 'end_utime_local'\n", | 138 | 84 | "# 'last_hour', 'last_min', 'start_utime_local', 'end_utime_local'\n", | |
"\n", | 139 | 85 | "\n", | |
"\n", | 140 | 86 | "\n", | |
"# duplicate jawbone data\n", | 141 | 87 | "# duplicate jawbone data\n", | |
"jawbone2 = jawbone.copy(deep=True)\n", | 142 | 88 | "jawbone2 = jawbone.copy(deep=True)\n", | |
"\n", | 143 | 89 | "\n", | |
"# convert string datetimes to actual datetime objects\n", | 144 | 90 | "# convert string datetimes to actual datetime objects\n", | |
"jawbone2[\"start_utime_local\"] = pd.to_datetime(\n", | 145 | 91 | "jawbone2[\"start_utime_local\"] = pd.to_datetime(\n", | |
" jawbone2[\"start_utime_local\"], format=\"%Y-%m-%d %H:%M:%S\")\n", | 146 | 92 | " jawbone2[\"start_utime_local\"], format=\"%Y-%m-%d %H:%M:%S\")\n", | |
"jawbone2[\"start_datetime\"] = pd.to_datetime(\n", | 147 | 93 | "jawbone2[\"start_datetime\"] = pd.to_datetime(\n", | |
" jawbone2[\"start_datetime\"], format=\"%Y-%m-%d %H:%M:%S\")\n", | 148 | 94 | " jawbone2[\"start_datetime\"], format=\"%Y-%m-%d %H:%M:%S\")\n", | |
"\n", | 149 | 95 | "\n", | |
"# calculate the timezone offset\n", | 150 | 96 | "# calculate the timezone offset\n", | |
"jawbone2[\"tz_offset\"] = jawbone2[\"start_datetime\"] - \\\n", | 151 | 97 | "jawbone2[\"tz_offset\"] = jawbone2[\"start_datetime\"] - \\\n", | |
" jawbone2[\"start_utime_local\"]\n", | 152 | 98 | " jawbone2[\"start_utime_local\"]\n", | |
"\n", | 153 | 99 | "\n", | |
"\n", | 154 | 100 | "\n", | |
"# selecting only important columns\n", | 155 | 101 | "# selecting only important columns\n", | |
"jawbone3 = jawbone2[[\"user\", \"start_utime_local\",\n", | 156 | 102 | "jawbone3 = jawbone2[[\"user\", \"start_utime_local\",\n", | |
" \"end_utime_local\", \"tz_offset\", \"steps\"]]\n", | 157 | 103 | " \"end_utime_local\", \"tz_offset\", \"steps\"]]\n", | |
"\n", | 158 | 104 | "\n", | |
"# picking up the local date\n", | 159 | 105 | "# picking up the local date\n", | |
"jawbone3[\"local_date\"] = jawbone3[\"start_utime_local\"].apply(get_date)\n", | 160 | 106 | "jawbone3[\"local_date\"] = jawbone3[\"start_utime_local\"].apply(get_date)\n", | |
"\n", | 161 | 107 | "\n", | |
"# picking up the local minute index\n", | 162 | 108 | "# picking up the local minute index\n", | |
"jawbone3[\"local_minute_index\"] = jawbone3[\"start_utime_local\"].apply(\n", | 163 | 109 | "jawbone3[\"local_minute_index\"] = jawbone3[\"start_utime_local\"].apply(\n", | |
" get_minute_index)\n" | 164 | 110 | " get_minute_index)\n" | |
] | 165 | 111 | ] | |
}, | 166 | 112 | }, | |
{ | 167 | 113 | { | |
"cell_type": "markdown", | 168 | 114 | "cell_type": "markdown", | |
"metadata": {}, | 169 | 115 | "metadata": {}, | |
"source": [ | 170 | 116 | "source": [ | |
"## Making a key info database" | 171 | 117 | "## Making a key info database" | |
] | 172 | 118 | ] | |
}, | 173 | 119 | }, | |
{ | 174 | 120 | { | |
"cell_type": "code", | 175 | 121 | "cell_type": "code", | |
"execution_count": 146, | 176 | 122 | "execution_count": 165, | |
"metadata": {}, | 177 | 123 | "metadata": {}, | |
"outputs": [], | 178 | 124 | "outputs": [], | |
"source": [ | 179 | 125 | "source": [ | |
"# picking up the user - date data\n", | 180 | 126 | "# picking up the user - date data\n", | |
"user_date = jawbone3[[\"user\", \"local_date\"]].drop_duplicates()" | 181 | 127 | "user_date = jawbone3[[\"user\", \"local_date\"]].drop_duplicates()" | |
] | 182 | 128 | ] | |
}, | 183 | 129 | }, | |
{ | 184 | 130 | { | |
"cell_type": "markdown", | 185 | 131 | "cell_type": "markdown", | |
"metadata": {}, | 186 | 132 | "metadata": {}, | |
"source": [ | 187 | 133 | "source": [ | |
"## Removing users with too small amount of data" | 188 | 134 | "## Removing users with too small amount of data" | |
] | 189 | 135 | ] | |
}, | 190 | 136 | }, | |
{ | 191 | 137 | { | |
"cell_type": "code", | 192 | 138 | "cell_type": "code", | |
"execution_count": 147, | 193 | 139 | "execution_count": 166, | |
"metadata": {}, | 194 | 140 | "metadata": {}, | |
"outputs": [ | 195 | 141 | "outputs": [ | |
{ | 196 | 142 | { | |
"name": "stdout", | 197 | 143 | "name": "stdout", | |
"output_type": "stream", | 198 | 144 | "output_type": "stream", | |
"text": [ | 199 | 145 | "text": [ | |
"Threshold: 10\n", | 200 | 146 | "Threshold: 10\n", | |
"Users to be removed:[12, 36, 38]\n", | 201 | 147 | "Users to be removed:[12, 36, 38]\n", | |
"Shape Change: 258889 -> 258363 (-526, -0.2%)\n" | 202 | 148 | "Shape Change: 258889 -> 258363 (-526, -0.2%)\n" | |
] | 203 | 149 | ] | |
}, | 204 | 150 | }, | |
{ | 205 | 151 | { | |
"data": { | 206 | 152 | "data": { | |
"image/png": "", | 207 | 153 | "image/png": "", | |
"text/plain": [ | 208 | 154 | "text/plain": [ | |
"<Figure size 432x288 with 1 Axes>" | 209 | 155 | "<Figure size 432x288 with 1 Axes>" | |
] | 210 | 156 | ] | |
}, | 211 | 157 | }, | |
"metadata": {}, | 212 | 158 | "metadata": {}, | |
"output_type": "display_data" | 213 | 159 | "output_type": "display_data" | |
} | 214 | 160 | } | |
], | 215 | 161 | ], | |
"source": [ | 216 | 162 | "source": [ | |
"# making a stat of the number of days per user\n", | 217 | 163 | "# making a stat of the number of days per user\n", | |
"stat_user = user_date.groupby(['user'])['local_date'].nunique().sort_values()\n", | 218 | 164 | "stat_user = user_date.groupby(['user'])['local_date'].nunique().sort_values()\n", | |
"\n", | 219 | 165 | "\n", | |
"ax = plt.figure()\n", | 220 | 166 | "ax = plt.figure()\n", | |
"ax.patch.set_facecolor('white')\n", | 221 | 167 | "ax.patch.set_facecolor('white')\n", | |
"ax = sns.histplot(stat_user)\n", | 222 | 168 | "ax = sns.histplot(stat_user)\n", | |
"ax.set_title('Distribution of number of days per user')\n", | 223 | 169 | "ax.set_title('Distribution of number of days per user')\n", | |
"ax.set_xlabel('Number of days')\n", | 224 | 170 | "ax.set_xlabel('Number of days')\n", | |
"ax.set_ylabel('Frequency')\n", | 225 | 171 | "ax.set_ylabel('Frequency')\n", | |
"\n", | 226 | 172 | "\n", | |
"\n", | 227 | 173 | "\n", | |
"# filter out users that have less days of data than THRESHOLD_OF_DAYS_PER_USER\n", | 228 | 174 | "# filter out users that have less days of data than THRESHOLD_OF_DAYS_PER_USER\n", | |
"users_to_be_removed = stat_user[stat_user < THRESHOLD_OF_DAYS_PER_USER].index\n", | 229 | 175 | "users_to_be_removed = stat_user[stat_user < THRESHOLD_OF_DAYS_PER_USER].index\n", | |
"\n", | 230 | 176 | "\n", | |
"print(\"Threshold: {}\".format(THRESHOLD_OF_DAYS_PER_USER))\n", | 231 | 177 | "print(\"Threshold: {}\".format(THRESHOLD_OF_DAYS_PER_USER))\n", | |
"print(\"Users to be removed:{}\".format(list(users_to_be_removed)))\n", | 232 | 178 | "print(\"Users to be removed:{}\".format(list(users_to_be_removed)))\n", | |
"\n", | 233 | 179 | "\n", | |
"jawbone4 = jawbone3[~jawbone3[\"user\"].isin(users_to_be_removed)]\n", | 234 | 180 | "jawbone4 = jawbone3[~jawbone3[\"user\"].isin(users_to_be_removed)]\n", | |
"\n", | 235 | 181 | "\n", | |
"user_date2 = user_date[~user_date[\"user\"].isin(users_to_be_removed)]\n", | 236 | 182 | "user_date2 = user_date[~user_date[\"user\"].isin(users_to_be_removed)]\n", | |
"\n", | 237 | 183 | "\n", | |
"# printing the amount of data removed\n", | 238 | 184 | "# printing the amount of data removed\n", | |
"jawbone3_count, _ = jawbone3.shape\n", | 239 | 185 | "jawbone3_count, _ = jawbone3.shape\n", | |
"jawbone4_count, _ = jawbone4.shape\n", | 240 | 186 | "jawbone4_count, _ = jawbone4.shape\n", | |
"\n", | 241 | 187 | "\n", | |
"print(\"Shape Change: {} -> {} (-{}, -{}%)\".format(\n", | 242 | 188 | "print(\"Shape Change: {} -> {} (-{}, -{}%)\".format(\n", | |
" jawbone3_count, \n", | 243 | 189 | " jawbone3_count, \n", | |
" jawbone4_count, \n", | 244 | 190 | " jawbone4_count, \n", | |
" jawbone3_count - jawbone4_count, \n", | 245 | 191 | " jawbone3_count - jawbone4_count, \n", | |
" round((jawbone3_count - jawbone4_count) / jawbone3_count * 100, 2)\n", | 246 | 192 | " round((jawbone3_count - jawbone4_count) / jawbone3_count * 100, 2)\n", | |
" )\n", | 247 | 193 | " )\n", | |
")" | 248 | 194 | ")" | |
] | 249 | 195 | ] | |
}, | 250 | 196 | }, | |
{ | 251 | 197 | { | |
"cell_type": "markdown", | 252 | 198 | "cell_type": "markdown", | |
"metadata": {}, | 253 | 199 | "metadata": {}, | |
"source": [ | 254 | 200 | "source": [ | |
"## Find consecutive minute walks" | 255 | 201 | "## Find consecutive minute walks" | |
] | 256 | 202 | ] | |
}, | 257 | 203 | }, | |
{ | 258 | 204 | { | |
"cell_type": "code", | 259 | 205 | "cell_type": "code", | |
"execution_count": 148, | 260 | 206 | "execution_count": 167, | |
"metadata": {}, | 261 | 207 | "metadata": {}, | |
"outputs": [ | 262 | 208 | "outputs": [ | |
{ | 263 | 209 | { | |
"name": "stdout", | 264 | 210 | "name": "stdout", | |
"output_type": "stream", | 265 | 211 | "output_type": "stream", | |
"text": [ | 266 | 212 | "text": [ | |
"Iteration: 0, length: 377396\n", | 267 | 213 | "Iteration: 0, length: 377396\n", | |
"Iteration: 1, length: 229752\n", | 268 | 214 | "Iteration: 1, length: 229752\n", | |
"Iteration: 2, length: 170648\n", | 269 | 215 | "Iteration: 2, length: 170648\n", | |
"Iteration: 3, length: 137484\n", | 270 | 216 | "Iteration: 3, length: 137484\n", | |
"Iteration: 4, length: 178268\n", | 271 | 217 | "Iteration: 4, length: 178268\n", | |
"Final, length: 94884\n" | 272 | 218 | "Final, length: 94884\n" | |
] | 273 | 219 | ] | |
} | 274 | 220 | } | |
], | 275 | 221 | ], | |
"source": [ | 276 | 222 | "source": [ | |
"# prepare the data for the walk calculation\n", | 277 | 223 | "# prepare the data for the walk calculation\n", | |
"current_vector = jawbone4[[\"user\", \"local_date\", \"local_minute_index\", \"steps\"]]\n", | 278 | 224 | "current_vector = jawbone4[[\"user\", \"local_date\", \"local_minute_index\", \"steps\"]]\n", | |
"current_vector[\"add_count\"] = 1\n", | 279 | 225 | "current_vector[\"add_count\"] = 1\n", | |
"current_vector = current_vector[current_vector[\"steps\"] > MINIMUM_STEPS_PER_MINUTE]\n", | 280 | 226 | "current_vector = current_vector[current_vector[\"steps\"] > MINIMUM_STEPS_PER_MINUTE]\n", | |
"current_vector = current_vector[[\"user\", \"local_date\", \"local_minute_index\", \"add_count\"]]\n", | 281 | 227 | "current_vector = current_vector[[\"user\", \"local_date\", \"local_minute_index\", \"add_count\"]]\n", | |
"\n", | 282 | 228 | "\n", | |
"# iteratively calculate the walk\n", | 283 | 229 | "# iteratively calculate the walk\n", | |
"for i in range(0, MINIMUM_NUMBER_OF_MINUTES_FOR_A_WALK):\n", | 284 | 230 | "for i in range(0, MINIMUM_NUMBER_OF_MINUTES_FOR_A_WALK):\n", | |
" print(\"Iteration: {}, length: {}\".format(i, current_vector.size))\n", | 285 | 231 | " print(\"Iteration: {}, length: {}\".format(i, current_vector.size))\n", | |
" new_vector = calculate_walk(current_vector)\n", | 286 | 232 | " new_vector = calculate_walk(current_vector)\n", | |
" current_vector = new_vector\n", | 287 | 233 | " current_vector = new_vector\n", | |
"\n", | 288 | 234 | "\n", | |
"print(\"Final, length: {}\".format(current_vector.size))\n", | 289 | 235 | "print(\"Final, length: {}\".format(current_vector.size))\n", | |
"\n", | 290 | 236 | "\n", | |
"consecutive_minutes = current_vector[[\"user\", \"local_date\", \"local_minute_index\"]].drop_duplicates()" | 291 | 237 | "consecutive_minutes = current_vector[[\"user\", \"local_date\", \"local_minute_index\"]].drop_duplicates()" | |
] | 292 | 238 | ] | |
}, | 293 | 239 | }, | |
{ | 294 | 240 | { | |
"cell_type": "markdown", | 295 | 241 | "cell_type": "markdown", | |
"metadata": {}, | 296 | 242 | "metadata": {}, | |
"source": [ | 297 | 243 | "source": [ | |
"## Map consecutive minutes to 1hr and 3hr units" | 298 | 244 | "## Map consecutive minutes to 1hr and 3hr units" | |
] | 299 | 245 | ] | |
}, | 300 | 246 | }, | |
{ | 301 | 247 | { | |
"cell_type": "code", | 302 | 248 | "cell_type": "code", | |
"execution_count": 149, | 303 | 249 | "execution_count": 168, | |
"metadata": {}, | 304 | 250 | "metadata": {}, | |
"outputs": [], | 305 | 251 | "outputs": [], | |
"source": [ | 306 | 252 | "source": [ | |
"# calculate hour index and three hour index\n", | 307 | 253 | "# calculate hour index and three hour index\n", | |
"consecutive_minutes[\"hour\"] = np.int_(np.floor(consecutive_minutes[\"local_minute_index\"] / 60))\n", | 308 | 254 | "consecutive_minutes[\"hour\"] = np.int_(np.floor(consecutive_minutes[\"local_minute_index\"] / 60))\n", | |
"consecutive_minutes[\"threehour\"] = np.int_(np.floor(consecutive_minutes[\"hour\"] / 3))\n", | 309 | 255 | "consecutive_minutes[\"threehour\"] = np.int_(np.floor(consecutive_minutes[\"hour\"] / 3))\n", | |
"\n", | 310 | 256 | "\n", | |
"# calculate the number of walks per user, per hour\n", | 311 | 257 | "# calculate the number of walks per user, per hour\n", | |
"walk_by_hours = consecutive_minutes[[\"user\", \"local_date\", \"hour\"]].drop_duplicates().reset_index()\n", | 312 | 258 | "walk_by_hours = consecutive_minutes[[\"user\", \"local_date\", \"hour\"]].drop_duplicates().reset_index()\n", | |
"walk_by_hours[\"walked\"] = 2\n", | 313 | 259 | "walk_by_hours[\"walked\"] = 2\n", | |
"\n", | 314 | 260 | "\n", | |
"# calculate the number of walks per user, per three hour\n", | 315 | 261 | "# calculate the number of walks per user, per three hour\n", | |
"walk_by_threehours = consecutive_minutes[[\"user\", \"local_date\", \"threehour\"]].drop_duplicates().reset_index()\n", | 316 | 262 | "walk_by_threehours = consecutive_minutes[[\"user\", \"local_date\", \"threehour\"]].drop_duplicates().reset_index()\n", | |
"walk_by_threehours[\"walked\"] = 2\n", | 317 | 263 | "walk_by_threehours[\"walked\"] = 2\n", | |
"\n", | 318 | 264 | "\n", | |
"# generate hour vector and three hour vector\n", | 319 | 265 | "# generate hour vector and three hour vector\n", | |
"hours = pd.DataFrame({\"hour\": range(0,24)})\n", | 320 | 266 | "hours = pd.DataFrame({\"hour\": range(0,24)})\n", | |
"threehours = pd.DataFrame({\"threehour\": range(0, 8)})\n", | 321 | 267 | "threehours = pd.DataFrame({\"threehour\": range(0, 8)})\n", | |
"\n", | 322 | 268 | "\n", | |
"# generate complete product dataframe\n", | 323 | 269 | "# generate complete product dataframe\n", | |
"measured_hour = product_df(walk_by_hours[[\"user\", \"local_date\"]], hours[[\"hour\"]])\n", | 324 | 270 | "measured_hour = product_df(walk_by_hours[[\"user\", \"local_date\"]], hours[[\"hour\"]])\n", | |
"measured_threehour = product_df(walk_by_threehours[[\"user\", \"local_date\"]], threehours[[\"threehour\"]])\n", | 325 | 271 | "measured_threehour = product_df(walk_by_threehours[[\"user\", \"local_date\"]], threehours[[\"threehour\"]])\n", | |
"\n", | 326 | 272 | "\n", | |
"# pad the hourly walk data (fill in missing hours with 1s)\n", | 327 | 273 | "# pad the hourly walk data (fill in missing hours with 1s)\n", | |
"padded_hours = walk_by_hours.merge(measured_hour, on=[\"user\", \"local_date\", \"hour\"], how=\"right\")\n", | 328 | 274 | "padded_hours = walk_by_hours.merge(measured_hour, on=[\"user\", \"local_date\", \"hour\"], how=\"right\")\n", | |
"padded_hours = padded_hours[[\"user\", \"local_date\", \"hour\", \"walked\"]]\n", | 329 | 275 | "padded_hours = padded_hours[[\"user\", \"local_date\", \"hour\", \"walked\"]]\n", | |
"padded_hours = padded_hours.fillna(1)\n", | 330 | 276 | "padded_hours = padded_hours.fillna(1)\n", | |
"\n", | 331 | 277 | "\n", | |
"# pad the walk data with 3 hours unit (fill in missing hours with 1s)\n", | 332 | 278 | "# pad the walk data with 3 hours unit (fill in missing hours with 1s)\n", | |
"padded_threehours = walk_by_threehours.merge(measured_threehour, on=[\"user\", \"local_date\", \"threehour\"], how=\"right\")\n", | 333 | 279 | "padded_threehours = walk_by_threehours.merge(measured_threehour, on=[\"user\", \"local_date\", \"threehour\"], how=\"right\")\n", | |
"padded_threehours = padded_threehours[[\"user\", \"local_date\", \"threehour\", \"walked\"]]\n", | 334 | 280 | "padded_threehours = padded_threehours[[\"user\", \"local_date\", \"threehour\", \"walked\"]]\n", | |
"padded_threehours = padded_threehours.fillna(1)" | 335 | 281 | "padded_threehours = padded_threehours.fillna(1)" | |
] | 336 | 282 | ] | |
}, | 337 | 283 | }, | |
{ | 338 | 284 | { | |
"cell_type": "markdown", | 339 | 285 | "cell_type": "markdown", | |
"metadata": {}, | 340 | 286 | "metadata": {}, | |
"source": [ | 341 | 287 | "source": [ | |
"## Pad unmeasured missing data with 0s\n", | 342 | 288 | "## Pad unmeasured missing data with 0s\n", | |
"\n", | 343 | 289 | "\n", | |
"For three hour window data, since it works as output vector, 0 padding is not done. Instead, missing output data is not used for training." | 344 | 290 | "For three hour window data, since it works as output vector, 0 padding is not done. Instead, missing output data is not used for training." | |
] | 345 | 291 | ] | |
}, | 346 | 292 | }, | |
{ | 347 | 293 | { | |
"cell_type": "code", | 348 | 294 | "cell_type": "code", | |
"execution_count": 150, | 349 | 295 | "execution_count": 169, | |
"metadata": {}, | 350 | 296 | "metadata": {}, | |
"outputs": [], | 351 | 297 | "outputs": [], | |
"source": [ | 352 | 298 | "source": [ | |
"# generate start and end date for each user\n", | 353 | 299 | "# generate start and end date for each user\n", | |
"start_date = padded_hours.groupby([\"user\"])[\"local_date\"].min()\n", | 354 | 300 | "start_date = padded_hours.groupby([\"user\"])[\"local_date\"].min()\n", | |
"end_date = padded_hours.groupby([\"user\"])[\"local_date\"].max()\n", | 355 | 301 | "end_date = padded_hours.groupby([\"user\"])[\"local_date\"].max()\n", | |
"\n", | 356 | 302 | "\n", | |
"# generate the user list\n", | 357 | 303 | "# generate the user list\n", | |
"users = start_date.index\n", | 358 | 304 | "users = start_date.index\n", | |
"all_dates = pd.DataFrame({\"user\": [], \"local_date\": []})\n", | 359 | 305 | "all_dates = pd.DataFrame({\"user\": [], \"local_date\": []})\n", | |
"\n", | 360 | 306 | "\n", | |
"# generate the lists of dates between start and end date\n", | 361 | 307 | "# generate the lists of dates between start and end date\n", | |
"for userid in users:\n", | 362 | 308 | "for userid in users:\n", | |
" current_user_dates = pd.DataFrame({\"user\": userid, \"local_date\": list(date_range(start_date[userid], end_date[userid]))})\n", | 363 | 309 | " current_user_dates = pd.DataFrame({\"user\": userid, \"local_date\": list(date_range(start_date[userid], end_date[userid]))})\n", | |
" \n", | 364 | 310 | " \n", | |
" all_dates = pd.concat([all_dates, current_user_dates])\n", | 365 | 311 | " all_dates = pd.concat([all_dates, current_user_dates])\n", | |
"\n", | 366 | 312 | "\n", | |
"# generate the base vector for the padding\n", | 367 | 313 | "# generate the base vector for the padding\n", | |
"all_dates_hour = product_df(all_dates[[\"user\", \"local_date\"]], hours[[\"hour\"]])\n", | 368 | 314 | "all_dates_hour = product_df(all_dates[[\"user\", \"local_date\"]], hours[[\"hour\"]])\n", | |
"\n", | 369 | 315 | "\n", | |
"# final padded gait data\n", | 370 | 316 | "# final padded gait data\n", | |
"padded_hours = padded_hours.merge(all_dates_hour, on=[\"user\", \"local_date\", \"hour\"], how=\"right\").fillna(0)\n" | 371 | 317 | "padded_hours = padded_hours.merge(all_dates_hour, on=[\"user\", \"local_date\", \"hour\"], how=\"right\").fillna(0)\n" | |
] | 372 | 318 | ] | |
}, | 373 | 319 | }, | |
{ | 374 | 320 | { | |
"cell_type": "markdown", | 375 | 321 | "cell_type": "markdown", | |
"metadata": {}, | 376 | 322 | "metadata": {}, | |
"source": [ | 377 | 323 | "source": [ | |
"# Saving Data" | 378 | 324 | "# Saving Data" | |
] | 379 | 325 | ] | |
}, | 380 | 326 | }, | |
{ | 381 | 327 | { | |
"cell_type": "code", | 382 | 328 | "cell_type": "code", | |
"execution_count": 151, | 383 | 329 | "execution_count": 170, | |
"metadata": {}, | 384 | 330 | "metadata": {}, | |
"outputs": [], | 385 | 331 | "outputs": [], | |
"source": [ | 386 | 332 | "source": [ | |
333 | "# converting data type\n", | |||
334 | "padded_hours[\"user\"] = padded_hours[\"user\"].astype(int)\n", | |||
335 | "padded_hours[\"hour\"] = padded_hours[\"hour\"].astype(int)\n", | |||
336 | "padded_hours[\"walked\"] = padded_hours[\"walked\"].astype(int)\n", | |||
337 | "\n", | |||
338 | "padded_threehours[\"user\"] = padded_threehours[\"user\"].astype(int)\n", | |||
339 | "padded_threehours[\"threehour\"] = padded_threehours[\"threehour\"].astype(int)\n", | |||
340 | "padded_threehours[\"walked\"] = padded_threehours[\"walked\"].astype(int)\n", | |||
341 | "\n", | |||
"# save the data\n", | 387 | 342 | "# save the data\n", | |
"padded_hours.to_csv(os.path.join(data_dir, \"padded_hours.csv\"), index=False)\n", | 388 | 343 | "padded_hours.to_csv(os.path.join(data_dir, \"padded_hours.csv\"), index=False)\n", | |
"padded_threehours.to_csv(os.path.join(data_dir, \"padded_threehours.csv\"), index=False)" | 389 | 344 | "padded_threehours.to_csv(os.path.join(data_dir, \"padded_threehours.csv\"), index=False)\n", | |
345 | "\n", | |||
346 | "padded_hours.to_pickle(os.path.join(data_dir, \"padded_hours.pkl\"))\n", | |||
347 | "padded_threehours.to_pickle(os.path.join(data_dir, \"padded_threehours.pkl\"))" | |||
] | 390 | 348 | ] | |
} | 391 | 349 | } | |
], | 392 | 350 | ], | |
"metadata": { | 393 | 351 | "metadata": { | |
"interpreter": { | 394 | 352 | "interpreter": { | |
"hash": "80dbe1014b4652684caa329d41db00af3ae439be86b11eab7e35b518e5d8ab1a" | 395 | 353 | "hash": "80dbe1014b4652684caa329d41db00af3ae439be86b11eab7e35b518e5d8ab1a" | |
}, | 396 | 354 | }, | |
"kernelspec": { | 397 | 355 | "kernelspec": { | |
"display_name": "Python 3.7.9 64-bit ('venv': venv)", | 398 | 356 | "display_name": "Python 3.7.9 64-bit ('venv': venv)", | |
"language": "python", | 399 | 357 | "language": "python", | |
"name": "python3" | 400 | 358 | "name": "python3" | |
}, | 401 | 359 | }, | |
"language_info": { | 402 | 360 | "language_info": { | |
"codemirror_mode": { | 403 | 361 | "codemirror_mode": { | |
"name": "ipython", | 404 | 362 | "name": "ipython", | |
"version": 3 | 405 | 363 | "version": 3 | |
}, | 406 | 364 | }, | |
"file_extension": ".py", | 407 | 365 | "file_extension": ".py", | |
"mimetype": "text/x-python", | 408 | 366 | "mimetype": "text/x-python", | |
"name": "python", | 409 | 367 | "name": "python", | |
"nbconvert_exporter": "python", | 410 | 368 | "nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | 411 | 369 | "pygments_lexer": "ipython3", | |
"version": "3.7.9" | 412 | 370 | "version": "3.7.9" | |
}, | 413 | 371 | }, | |
"orig_nbformat": 4 | 414 | 372 | "orig_nbformat": 4 | |
}, | 415 | 373 | }, | |
"nbformat": 4, | 416 | 374 | "nbformat": 4, | |
"nbformat_minor": 2 | 417 | 375 | "nbformat_minor": 2 | |
} | 418 | 376 | } | |
419 | 377 | |||
python-notebook/prepare_trteva_data.ipynb
View file @
979a88c
File was created | 1 | { | ||
2 | "cells": [ | |||
3 | { | |||
4 | "cell_type": "code", | |||
5 | "execution_count": 1, | |||
6 | "metadata": {}, | |||
7 | "outputs": [], | |||
8 | "source": [ | |||
9 | "import numpy as np\n", | |||
10 | "import pandas as pd\n", | |||
11 | "import os\n", | |||
12 | "from tools import *\n", | |||
13 | "from constants import *\n", | |||
14 | "from tensorflow.keras.utils import to_categorical" | |||
15 | ] | |||
16 | }, | |||
17 | { | |||
18 | "cell_type": "markdown", | |||
19 | "metadata": {}, | |||
20 | "source": [ | |||
21 | "# Prepare Training, Testing, and Validation Data\n", | |||
22 | "## Loading the preprocessed data" | |||
23 | ] | |||
24 | }, | |||
25 | { | |||
26 | "cell_type": "code", | |||
27 | "execution_count": 2, | |||
28 | "metadata": {}, | |||
29 | "outputs": [], | |||
30 | "source": [ | |||
31 | "# silence the pandas SettingWithCopyWarning raised on chained assignment\n", | |||
32 | "pd.options.mode.chained_assignment = None\n", | |||
33 | "\n", | |||
34 | "data_dir = '../data'\n", | |||
35 | "\n", | |||
36 | "padded_hours = pd.read_pickle(os.path.join(data_dir, 'padded_hours.pkl'))\n", | |||
37 | "padded_threehours = pd.read_pickle(os.path.join(data_dir, 'padded_threehours.pkl'))" | |||
38 | ] | |||
39 | }, | |||
40 | { | |||
41 | "cell_type": "markdown", | |||
42 | "metadata": {}, | |||
43 | "source": [ | |||
44 | "## Enumerating Output Data" | |||
45 | ] | |||
46 | }, | |||
47 | { | |||
48 | "cell_type": "code", | |||
49 | "execution_count": 3, | |||
50 | "metadata": {}, | |||
51 | "outputs": [], | |||
52 | "source": [ | |||
53 | "# return output value\n", | |||
54 | "def get_output(y):\n", | |||
55 | " return y[\"walked\"]\n", | |||
56 | "\n", | |||
57 | "# return input value\n", | |||
58 | "def get_input(y, padded_hours):\n", | |||
59 | " # base information\n", | |||
60 | " user = y[\"user\"]\n", | |||
61 | " local_date = y[\"local_date\"]\n", | |||
62 | " threehour_idx = y[\"threehour\"]\n", | |||
63 | " \n", | |||
64 | " # derived information\n", | |||
65 | " hour_idx = threehour_idx * 3\n", | |||
66 | " encoded_hour_idx = to_categorical(hour_idx, num_classes=24)\n", | |||
67 | " end_date = local_date - timedelta(days=1)\n", | |||
68 | " start_date = end_date - timedelta(days=7*NUMBER_OF_WEEKS_FOR_LOOKING_BACK-1)\n", | |||
69 | " weekday = local_date.weekday()\n", | |||
70 | " encoded_weekday = to_categorical(weekday, num_classes=7)\n", | |||
71 | " encoded_month = to_categorical(local_date.month, num_classes=12)\n", | |||
72 | " encoded_day_of_month = to_categorical(local_date.day, num_classes=31)\n", | |||
73 | "\n", | |||
74 | " gait = pd.Series([], dtype=int)\n", | |||
75 | " # gait movement\n", | |||
76 | " zero_move = 0\n", | |||
77 | " for a_date in date_range(start_date, end_date):\n", | |||
78 | " day_df = padded_hours[(padded_hours[\"user\"] == user) & (padded_hours[\"local_date\"] == a_date)]\n", | |||
79 | " if day_df.size == 0:\n", | |||
80 | " gait = pd.concat([gait, pd.Series([1,0,0] * 24, dtype=int)])\n", | |||
81 | " zero_move += 1\n", | |||
82 | " else:\n", | |||
83 | " gait = pd.concat([gait, pd.Series(to_categorical(day_df[\"walked\"].values, 3, dtype=int).reshape(24*3), dtype=int)])\n", | |||
84 | " if zero_move == 5 * 7:\n", | |||
85 | " raise Exception(\"No movement data\")\n", | |||
86 | "\n", | |||
87 | " return_series = pd.Series([], dtype=int)\n", | |||
88 | " return_series = pd.concat([return_series, pd.Series(encoded_hour_idx, dtype=np.int_)])\n", | |||
89 | " return_series = pd.concat([return_series, pd.Series(encoded_weekday, dtype=np.int_)])\n", | |||
90 | " return_series = pd.concat([return_series, pd.Series(encoded_month, dtype=np.int_)])\n", | |||
91 | " return_series = pd.concat([return_series, pd.Series(encoded_day_of_month, dtype=np.int_)])\n", | |||
92 | " return_series = pd.concat([return_series, gait])\n", | |||
93 | " \n", | |||
94 | " return return_series\n", | |||
95 | "\n", | |||
96 | "def get_database(start_idx, end_idx):\n", | |||
97 | " database = pd.DataFrame({}, dtype=int)\n", | |||
98 | "\n", | |||
99 | " for i in range(start_idx, end_idx):\n", | |||
100 | " try:\n", | |||
101 | " y = padded_threehours.iloc[i, :]\n", | |||
102 | " user = y[\"user\"]\n", | |||
103 | " local_date = y[\"local_date\"]\n", | |||
104 | " first_day = padded_hours[padded_hours[\"user\"] == user][\"local_date\"].min()\n", | |||
105 | " date_diff = (local_date - first_day).days\n", | |||
106 | "\n", | |||
107 | " threehour_idx = y[\"threehour\"]\n", | |||
108 | " hour_idx = threehour_idx * 3\n", | |||
109 | "\n", | |||
110 | " output = get_output(y)\n", | |||
111 | " input = get_input(y, padded_hours)\n", | |||
112 | "\n", | |||
113 | " temp_series = pd.Series([], dtype=int)\n", | |||
114 | " temp_series = pd.concat([temp_series, pd.Series(user, dtype=int)])\n", | |||
115 | " temp_series = pd.concat([temp_series, pd.Series(date_diff, dtype=int)])\n", | |||
116 | " temp_series = pd.concat([temp_series, pd.Series(threehour_idx, dtype=int)])\n", | |||
117 | " temp_series = pd.concat([temp_series, pd.Series(hour_idx, dtype=int)])\n", | |||
118 | " temp_series = pd.concat([temp_series, pd.Series(output, dtype=int)])\n", | |||
119 | " temp_series = pd.concat([temp_series, pd.Series(input, dtype=int)])\n", | |||
120 | "\n", | |||
121 | " database = pd.concat([database, temp_series], axis=1)\n", | |||
122 | " # print(input)\n", | |||
123 | " except Exception as e:\n", | |||
124 | " # print(\"Error:\", e)\n", | |||
125 | " pass\n", | |||
126 | "\n", | |||
127 | " return database\n", | |||
128 | "\n" | |||
129 | ] | |||
130 | }, | |||
131 | { | |||
132 | "cell_type": "code", | |||
133 | "execution_count": 4, | |||
134 | "metadata": {}, | |||
135 | "outputs": [], | |||
136 | "source": [ | |||
137 | "from tensorflow.keras.datasets import mnist\n", | |||
138 | "from tensorflow.keras.models import Sequential\n", | |||
139 | "from tensorflow.keras.layers import Dense, Dropout, Flatten, Activation\n", | |||
140 | "\n", | |||
141 | "\n", | |||
142 | "(x_train, y_train), (x_test, y_test) = mnist.load_data(path='mnist.npz')\n", | |||
143 | "\n", | |||
144 | "X_train = x_train.reshape(60000, 784).astype('float32') / 255\n", | |||
145 | "X_test = x_test.reshape(10000, 784).astype('float32') / 255\n", | |||
146 | "\n", | |||
147 | "Y_train = to_categorical(y_train, 10)\n", | |||
148 | "Y_test = to_categorical(y_test, 10)" | |||
149 | ] | |||
150 | }, | |||
151 | { | |||
152 | "cell_type": "code", | |||
153 | "execution_count": null, | |||
154 | "metadata": {}, | |||
155 | "outputs": [], | |||
156 | "source": [] | |||
157 | } | |||
158 | ], | |||
159 | "metadata": { | |||
160 | "interpreter": { | |||
161 | "hash": "80dbe1014b4652684caa329d41db00af3ae439be86b11eab7e35b518e5d8ab1a" | |||
162 | }, | |||
163 | "kernelspec": { | |||
164 | "display_name": "Python 3.7.9 64-bit ('venv': venv)", | |||
165 | "language": "python", | |||
166 | "name": "python3" | |||
167 | }, | |||
168 | "language_info": { | |||
169 | "codemirror_mode": { | |||
170 | "name": "ipython", | |||
171 | "version": 3 | |||
172 | }, | |||
173 | "file_extension": ".py", | |||
174 | "mimetype": "text/x-python", | |||
175 | "name": "python", | |||
176 | "nbconvert_exporter": "python", | |||
177 | "pygments_lexer": "ipython3", | |||
178 | "version": "3.7.9" | |||
179 | }, | |||
180 | "orig_nbformat": 4 | |||
181 | }, | |||
182 | "nbformat": 4, | |||
183 | "nbformat_minor": 2 | |||
184 | } |
python-notebook/tools.py
View file @
979a88c
File was created | 1 | import pandas as pd | ||
2 | ||||
3 | from itertools import product | |||
4 | from datetime import date, timedelta | |||
5 | ||||
6 | ||||
# strip the time-of-day part of a datetime-like value, keeping only the calendar date
def get_date(x):
    """Return a ``datetime.date`` built from the year, month and day of ``x``.

    ``x`` may be any object exposing ``year``, ``month`` and ``day``
    attributes; only those three attributes are read.
    """
    year, month, day = x.year, x.month, x.day
    return date(year, month, day)
10 | ||||
# map a datetime-like value to its minute-of-day index (minutes elapsed since midnight)
def get_minute_index(x):
    """Return ``x.hour * 60 + x.minute`` as an integer minute index.

    Seconds and smaller units of ``x`` are ignored.
    """
    minutes_from_hours = 60 * x.hour
    return minutes_from_hours + x.minute
14 | ||||
# lazily produce every day from start_date through end_date, one day apart
def date_range(start_date, end_date):
    """Yield consecutive days from ``start_date`` to ``end_date``, both inclusive.

    The number of yielded values is ``(end_date - start_date).days + 1``;
    nothing is yielded when that count is zero or negative.
    """
    day_count = (end_date - start_date).days + 1
    yield from (start_date + timedelta(days=offset) for offset in range(day_count))
21 | ||||
# define an iterative walk calculation (merging consecutive active minutes)
def calculate_walk(cv):
    """Keep the rows of ``cv`` whose following minute is also present in ``cv``.

    A row (user, local_date, local_minute_index) is kept when ``cv`` also
    contains a row for the same user one minute later. The minute after 1439
    is minute 0 of the next ``local_date``, so runs that cross midnight are
    linked too. Every kept row has its ``add_count`` increased by one.

    Parameters:
        cv: DataFrame with at least the columns ``user``, ``local_date``,
            ``local_minute_index`` and ``add_count``. It is not modified.

    Returns:
        A new DataFrame with the columns ``user``, ``local_date``,
        ``local_minute_index`` and ``add_count``.
    """
    nv = cv.copy(deep=True)
    nv["prev_minute_index"] = nv["local_minute_index"] - 1

    # move midnight minutes to the last minute of the previous day.
    # This must go through .loc: nv[mask][col] = ... writes into a temporary
    # copy produced by the boolean selection and leaves nv untouched.
    at_midnight = nv["prev_minute_index"] < 0
    if at_midnight.any():
        nv.loc[at_midnight, "local_date"] = (
            nv.loc[at_midnight, "local_date"] - timedelta(days=1)
        )
        nv.loc[at_midnight, "prev_minute_index"] = 1439

    nv = nv[["user", "local_date", "prev_minute_index"]]

    # a cv row matches when some other row names it as its previous minute
    jv = cv.merge(
        nv,
        left_on=["user", "local_date", "local_minute_index"],
        right_on=["user", "local_date", "prev_minute_index"],
        how="inner",
    )
    jv["add_count"] += 1
    jv = jv[["user", "local_date", "local_minute_index", "add_count"]]

    return jv
37 |