Commit cf37877009d2c931937801ab68f427ce3c847775

Authored by Junghwan Park
1 parent f39b4d8ba4
Exists in main

rearranged code order

Showing 1 changed file with 64 additions and 48 deletions Side-by-side Diff

python-notebook/data_loading.ipynb View file @ cf37877
... ... @@ -9,7 +9,7 @@
9 9 },
10 10 {
11 11 "cell_type": "code",
12   - "execution_count": 17,
  12 + "execution_count": 142,
13 13 "metadata": {},
14 14 "outputs": [],
15 15 "source": [
16 16  
17 17  
18 18  
19 19  
20 20  
... ... @@ -33,19 +33,57 @@
33 33 },
34 34 {
35 35 "cell_type": "code",
36   - "execution_count": 36,
  36 + "execution_count": 143,
37 37 "metadata": {},
38 38 "outputs": [],
39 39 "source": [
  40 + "# disable the pandas SettingWithCopyWarning (chained-assignment check)\n",
40 41 "pd.options.mode.chained_assignment = None\n",
41 42 "\n",
  43 + "# convert a datetime object to a date object\n",
42 44 "def get_date(x):\n",
43 45 " return date(x.year, x.month, x.day)\n",
44 46 "\n",
  47 + "# convert a datetime object to an integer, which denotes the number of minutes since midnight\n",
45 48 "def get_minute_index(x):\n",
46 49 " return (x.hour * 60) + x.minute\n",
47 50 "\n",
  51 + "# yield each date from start_date through end_date (inclusive)\n",
  52 + "def date_range(start_date, end_date):\n",
  53 + " delta = end_date - start_date\n",
48 54 "\n",
  55 + " for i in range(delta.days + 1):\n",
  56 + " yield start_date + timedelta(days=i)\n",
  57 + "\n",
  58 + "# define an iterative walk calculation (merging consecutive active minutes)\n",
  59 + "def calculate_walk(cv):\n",
  60 + " nv = cv.copy(deep=True)\n",
  61 + " nv[\"prev_minute_index\"] = nv[\"local_minute_index\"] - 1\n",
  62 + "\n",
  63 + " # move midnight minutes to previous day\n",
  64 + " nv[nv[\"prev_minute_index\"] < 0][\"local_date\"] -= timedelta(days=1)\n",
  65 + " nv[nv[\"prev_minute_index\"] < 0][\"prev_minute_index\"] = 1439\n",
  66 + " \n",
  67 + " nv = nv[[\"user\", \"local_date\", \"prev_minute_index\"]]\n",
  68 + " jv = cv.merge(nv, left_on=[\"user\", \"local_date\", \"local_minute_index\"], right_on=[\"user\", \"local_date\", \"prev_minute_index\"], how=\"inner\")\n",
  69 + " jv[\"add_count\"] += 1\n",
  70 + " jv = jv[[\"user\", \"local_date\", \"local_minute_index\", \"add_count\"]]\n",
  71 + "\n",
  72 + " return jv \n",
  73 + "\n",
  74 + "# generate the Cartesian product of the de-duplicated rows of two dataframes\n",
  75 + "def product_df(mat1, mat2):\n",
  76 + " mat1 = mat1.drop_duplicates()\n",
  77 + " mat2 = mat2.drop_duplicates()\n",
  78 + "\n",
  79 + " temp = pd.DataFrame(list(product(mat1.values, mat2.values)))\n",
  80 + " for i, acol in enumerate(mat1.columns):\n",
  81 + " temp[acol] = temp[0].apply(lambda x: x[i])\n",
  82 + " for i, acol in enumerate(mat2.columns):\n",
  83 + " temp[acol] = temp[1].apply(lambda x: x[i])\n",
  84 + " temp = temp.drop(columns=[0, 1])\n",
  85 + " return temp\n",
  86 + "\n",
49 87 "# cut off values that are not in the range of the data\n",
50 88 "THRESHOLD_OF_DAYS_PER_USER = 10\n",
51 89 "\n",
... ... @@ -65,7 +103,7 @@
65 103 },
66 104 {
67 105 "cell_type": "code",
68   - "execution_count": 5,
  106 + "execution_count": 144,
69 107 "metadata": {},
70 108 "outputs": [],
71 109 "source": [
... ... @@ -86,7 +124,7 @@
86 124 },
87 125 {
88 126 "cell_type": "code",
89   - "execution_count": 6,
  127 + "execution_count": 145,
90 128 "metadata": {},
91 129 "outputs": [],
92 130 "source": [
... ... @@ -135,7 +173,7 @@
135 173 },
136 174 {
137 175 "cell_type": "code",
138   - "execution_count": 7,
  176 + "execution_count": 146,
139 177 "metadata": {},
140 178 "outputs": [],
141 179 "source": [
... ... @@ -152,7 +190,7 @@
152 190 },
153 191 {
154 192 "cell_type": "code",
155   - "execution_count": 13,
  193 + "execution_count": 147,
156 194 "metadata": {},
157 195 "outputs": [
158 196 {
... ... @@ -219,7 +257,7 @@
219 257 },
220 258 {
221 259 "cell_type": "code",
222   - "execution_count": 37,
  260 + "execution_count": 148,
223 261 "metadata": {},
224 262 "outputs": [
225 263 {
... ... @@ -242,23 +280,6 @@
242 280 "current_vector = current_vector[current_vector[\"steps\"] > MINIMUM_STEPS_PER_MINUTE]\n",
243 281 "current_vector = current_vector[[\"user\", \"local_date\", \"local_minute_index\", \"add_count\"]]\n",
244 282 "\n",
245   - "# define an iterative walk calculation (merging consecutive active minutes)\n",
246   - "def calculate_walk(cv):\n",
247   - " nv = cv.copy(deep=True)\n",
248   - " nv[\"prev_minute_index\"] = nv[\"local_minute_index\"] - 1\n",
249   - "\n",
250   - " # move midnight minutes to previous day\n",
251   - " nv[nv[\"prev_minute_index\"] < 0][\"local_date\"] -= timedelta(days=1)\n",
252   - " nv[nv[\"prev_minute_index\"] < 0][\"prev_minute_index\"] = 1439\n",
253   - " \n",
254   - " nv = nv[[\"user\", \"local_date\", \"prev_minute_index\"]]\n",
255   - " jv = cv.merge(nv, left_on=[\"user\", \"local_date\", \"local_minute_index\"], right_on=[\"user\", \"local_date\", \"prev_minute_index\"], how=\"inner\")\n",
256   - " jv[\"add_count\"] += 1\n",
257   - " jv = jv[[\"user\", \"local_date\", \"local_minute_index\", \"add_count\"]]\n",
258   - "\n",
259   - " return jv \n",
260   - "\n",
261   - "\n",
262 283 "# iteratively calculate the walk\n",
263 284 "for i in range(0, MINIMUM_NUMBER_OF_MINUTES_FOR_A_WALK):\n",
264 285 " print(\"Iteration: {}, length: {}\".format(i, current_vector.size))\n",
... ... @@ -279,7 +300,7 @@
279 300 },
280 301 {
281 302 "cell_type": "code",
282   - "execution_count": 111,
  303 + "execution_count": 149,
283 304 "metadata": {},
284 305 "outputs": [],
285 306 "source": [
... ... @@ -299,19 +320,7 @@
299 320 "hours = pd.DataFrame({\"hour\": range(0,24)})\n",
300 321 "threehours = pd.DataFrame({\"threehour\": range(0, 8)})\n",
301 322 "\n",
302   - "# generate complete product of vectors\n",
303   - "def product_df(mat1, mat2):\n",
304   - " mat1 = mat1.drop_duplicates()\n",
305   - " mat2 = mat2.drop_duplicates()\n",
306   - "\n",
307   - " temp = pd.DataFrame(list(product(mat1.values, mat2.values)))\n",
308   - " for i, acol in enumerate(mat1.columns):\n",
309   - " temp[acol] = temp[0].apply(lambda x: x[i])\n",
310   - " for i, acol in enumerate(mat2.columns):\n",
311   - " temp[acol] = temp[1].apply(lambda x: x[i])\n",
312   - " temp = temp.drop(columns=[0, 1])\n",
313   - " return temp\n",
314   - "\n",
  323 + "# generate complete product dataframe\n",
315 324 "measured_hour = product_df(walk_by_hours[[\"user\", \"local_date\"]], hours[[\"hour\"]])\n",
316 325 "measured_threehour = product_df(walk_by_threehours[[\"user\", \"local_date\"]], threehours[[\"threehour\"]])\n",
317 326 "\n",
... ... @@ -337,7 +346,7 @@
337 346 },
338 347 {
339 348 "cell_type": "code",
340   - "execution_count": 141,
  349 + "execution_count": 150,
341 350 "metadata": {},
342 351 "outputs": [],
343 352 "source": [
... ... @@ -349,12 +358,6 @@
349 358 "users = start_date.index\n",
350 359 "all_dates = pd.DataFrame({\"user\": [], \"local_date\": []})\n",
351 360 "\n",
352   - "def date_range(start_date, end_date):\n",
353   - " delta = end_date - start_date\n",
354   - "\n",
355   - " for i in range(delta.days + 1):\n",
356   - " yield start_date + timedelta(days=i)\n",
357   - "\n",
358 361 "# generate the lists of dates between start and end date\n",
359 362 "for userid in users:\n",
360 363 " current_user_dates = pd.DataFrame({\"user\": userid, \"local_date\": list(date_range(start_date[userid], end_date[userid]))})\n",
... ... @@ -365,9 +368,22 @@
365 368 "all_dates_hour = product_df(all_dates[[\"user\", \"local_date\"]], hours[[\"hour\"]])\n",
366 369 "\n",
367 370 "# final padded gait data\n",
368   - "padded_hours = padded_hours.merge(all_dates_hour, on=[\"user\", \"local_date\", \"hour\"], how=\"right\").fillna(0)\n",
369   - "\n",
370   - "\n",
  371 + "padded_hours = padded_hours.merge(all_dates_hour, on=[\"user\", \"local_date\", \"hour\"], how=\"right\").fillna(0)\n"
  372 + ]
  373 + },
  374 + {
  375 + "cell_type": "markdown",
  376 + "metadata": {},
  377 + "source": [
  378 + "# Saving Data"
  379 + ]
  380 + },
  381 + {
  382 + "cell_type": "code",
  383 + "execution_count": 151,
  384 + "metadata": {},
  385 + "outputs": [],
  386 + "source": [
371 387 "# save the data\n",
372 388 "padded_hours.to_csv(os.path.join(data_dir, \"padded_hours.csv\"), index=False)\n",
373 389 "padded_threehours.to_csv(os.path.join(data_dir, \"padded_threehours.csv\"), index=False)"