Commit cf37877009d2c931937801ab68f427ce3c847775
1 parent
f39b4d8ba4
Exists in
main
rearranged code order
Showing 1 changed file with 64 additions and 48 deletions Side-by-side Diff
python-notebook/data_loading.ipynb
View file @
cf37877
... | ... | @@ -9,7 +9,7 @@ |
9 | 9 | }, |
10 | 10 | { |
11 | 11 | "cell_type": "code", |
12 | - "execution_count": 17, | |
12 | + "execution_count": 142, | |
13 | 13 | "metadata": {}, |
14 | 14 | "outputs": [], |
15 | 15 | "source": [ |
16 | 16 | |
17 | 17 | |
18 | 18 | |
19 | 19 | |
20 | 20 | |
... | ... | @@ -33,19 +33,57 @@ |
33 | 33 | }, |
34 | 34 | { |
35 | 35 | "cell_type": "code", |
36 | - "execution_count": 36, | |
36 | + "execution_count": 143, | |
37 | 37 | "metadata": {}, |
38 | 38 | "outputs": [], |
39 | 39 | "source": [ |
40 | + "# disable the pandas chained-assignment (SettingWithCopy) warning\n", | |
40 | 41 | "pd.options.mode.chained_assignment = None\n", |
41 | 42 | "\n", |
43 | + "# convert a datetime object to a date object\n", | |
42 | 44 | "def get_date(x):\n", |
43 | 45 | " return date(x.year, x.month, x.day)\n", |
44 | 46 | "\n", |
47 | + "# convert a datetime object to an integer, which denotes the number of minutes since midnight\n", | |
45 | 48 | "def get_minute_index(x):\n", |
46 | 49 | " return (x.hour * 60) + x.minute\n", |
47 | 50 | "\n", |
51 | + "# yield every date from start_date through end_date (inclusive)\n", | |
52 | + "def date_range(start_date, end_date):\n", | |
53 | + " delta = end_date - start_date\n", | |
48 | 54 | "\n", |
55 | + " for i in range(delta.days + 1):\n", | |
56 | + " yield start_date + timedelta(days=i)\n", | |
57 | + "\n", | |
58 | + "# define an iterative walk calculation (merging consecutive active minutes)\n", | |
59 | + "def calculate_walk(cv):\n", | |
60 | + " nv = cv.copy(deep=True)\n", | |
61 | + " nv[\"prev_minute_index\"] = nv[\"local_minute_index\"] - 1\n", | |
62 | + "\n", | |
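63 | + " # move midnight minutes to previous day (NOTE: the chained boolean indexing below assigns to a temporary copy, so nv is left unchanged; nv.loc[mask, column] is needed for this to take effect)\n", | |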
64 | + " nv[nv[\"prev_minute_index\"] < 0][\"local_date\"] -= timedelta(days=1)\n", | |
65 | + " nv[nv[\"prev_minute_index\"] < 0][\"prev_minute_index\"] = 1439\n", | |
66 | + " \n", | |
67 | + " nv = nv[[\"user\", \"local_date\", \"prev_minute_index\"]]\n", | |
68 | + " jv = cv.merge(nv, left_on=[\"user\", \"local_date\", \"local_minute_index\"], right_on=[\"user\", \"local_date\", \"prev_minute_index\"], how=\"inner\")\n", | |
69 | + " jv[\"add_count\"] += 1\n", | |
70 | + " jv = jv[[\"user\", \"local_date\", \"local_minute_index\", \"add_count\"]]\n", | |
71 | + "\n", | |
72 | + " return jv \n", | |
73 | + "\n", | |
74 | + "# build the Cartesian product of the distinct rows of two dataframes\n", | |
75 | + "def product_df(mat1, mat2):\n", | |
76 | + " mat1 = mat1.drop_duplicates()\n", | |
77 | + " mat2 = mat2.drop_duplicates()\n", | |
78 | + "\n", | |
79 | + " temp = pd.DataFrame(list(product(mat1.values, mat2.values)))\n", | |
80 | + " for i, acol in enumerate(mat1.columns):\n", | |
81 | + " temp[acol] = temp[0].apply(lambda x: x[i])\n", | |
82 | + " for i, acol in enumerate(mat2.columns):\n", | |
83 | + " temp[acol] = temp[1].apply(lambda x: x[i])\n", | |
84 | + " temp = temp.drop(columns=[0, 1])\n", | |
85 | + " return temp\n", | |
86 | + "\n", | |
49 | 87 | "# cut off values that are not in the range of the data\n", |
50 | 88 | "THRESHOLD_OF_DAYS_PER_USER = 10\n", |
51 | 89 | "\n", |
... | ... | @@ -65,7 +103,7 @@ |
65 | 103 | }, |
66 | 104 | { |
67 | 105 | "cell_type": "code", |
68 | - "execution_count": 5, | |
106 | + "execution_count": 144, | |
69 | 107 | "metadata": {}, |
70 | 108 | "outputs": [], |
71 | 109 | "source": [ |
... | ... | @@ -86,7 +124,7 @@ |
86 | 124 | }, |
87 | 125 | { |
88 | 126 | "cell_type": "code", |
89 | - "execution_count": 6, | |
127 | + "execution_count": 145, | |
90 | 128 | "metadata": {}, |
91 | 129 | "outputs": [], |
92 | 130 | "source": [ |
... | ... | @@ -135,7 +173,7 @@ |
135 | 173 | }, |
136 | 174 | { |
137 | 175 | "cell_type": "code", |
138 | - "execution_count": 7, | |
176 | + "execution_count": 146, | |
139 | 177 | "metadata": {}, |
140 | 178 | "outputs": [], |
141 | 179 | "source": [ |
... | ... | @@ -152,7 +190,7 @@ |
152 | 190 | }, |
153 | 191 | { |
154 | 192 | "cell_type": "code", |
155 | - "execution_count": 13, | |
193 | + "execution_count": 147, | |
156 | 194 | "metadata": {}, |
157 | 195 | "outputs": [ |
158 | 196 | { |
... | ... | @@ -219,7 +257,7 @@ |
219 | 257 | }, |
220 | 258 | { |
221 | 259 | "cell_type": "code", |
222 | - "execution_count": 37, | |
260 | + "execution_count": 148, | |
223 | 261 | "metadata": {}, |
224 | 262 | "outputs": [ |
225 | 263 | { |
... | ... | @@ -242,23 +280,6 @@ |
242 | 280 | "current_vector = current_vector[current_vector[\"steps\"] > MINIMUM_STEPS_PER_MINUTE]\n", |
243 | 281 | "current_vector = current_vector[[\"user\", \"local_date\", \"local_minute_index\", \"add_count\"]]\n", |
244 | 282 | "\n", |
245 | - "# define an iterative walk calculation (merging consecutive active minutes)\n", | |
246 | - "def calculate_walk(cv):\n", | |
247 | - " nv = cv.copy(deep=True)\n", | |
248 | - " nv[\"prev_minute_index\"] = nv[\"local_minute_index\"] - 1\n", | |
249 | - "\n", | |
250 | - " # move midnight minutes to previous day\n", | |
251 | - " nv[nv[\"prev_minute_index\"] < 0][\"local_date\"] -= timedelta(days=1)\n", | |
252 | - " nv[nv[\"prev_minute_index\"] < 0][\"prev_minute_index\"] = 1439\n", | |
253 | - " \n", | |
254 | - " nv = nv[[\"user\", \"local_date\", \"prev_minute_index\"]]\n", | |
255 | - " jv = cv.merge(nv, left_on=[\"user\", \"local_date\", \"local_minute_index\"], right_on=[\"user\", \"local_date\", \"prev_minute_index\"], how=\"inner\")\n", | |
256 | - " jv[\"add_count\"] += 1\n", | |
257 | - " jv = jv[[\"user\", \"local_date\", \"local_minute_index\", \"add_count\"]]\n", | |
258 | - "\n", | |
259 | - " return jv \n", | |
260 | - "\n", | |
261 | - "\n", | |
262 | 283 | "# iteratively calculate the walk\n", |
263 | 284 | "for i in range(0, MINIMUM_NUMBER_OF_MINUTES_FOR_A_WALK):\n", |
264 | 285 | " print(\"Iteration: {}, length: {}\".format(i, current_vector.size))\n", |
... | ... | @@ -279,7 +300,7 @@ |
279 | 300 | }, |
280 | 301 | { |
281 | 302 | "cell_type": "code", |
282 | - "execution_count": 111, | |
303 | + "execution_count": 149, | |
283 | 304 | "metadata": {}, |
284 | 305 | "outputs": [], |
285 | 306 | "source": [ |
... | ... | @@ -299,19 +320,7 @@ |
299 | 320 | "hours = pd.DataFrame({\"hour\": range(0,24)})\n", |
300 | 321 | "threehours = pd.DataFrame({\"threehour\": range(0, 8)})\n", |
301 | 322 | "\n", |
302 | - "# generate complete product of vectors\n", | |
303 | - "def product_df(mat1, mat2):\n", | |
304 | - " mat1 = mat1.drop_duplicates()\n", | |
305 | - " mat2 = mat2.drop_duplicates()\n", | |
306 | - "\n", | |
307 | - " temp = pd.DataFrame(list(product(mat1.values, mat2.values)))\n", | |
308 | - " for i, acol in enumerate(mat1.columns):\n", | |
309 | - " temp[acol] = temp[0].apply(lambda x: x[i])\n", | |
310 | - " for i, acol in enumerate(mat2.columns):\n", | |
311 | - " temp[acol] = temp[1].apply(lambda x: x[i])\n", | |
312 | - " temp = temp.drop(columns=[0, 1])\n", | |
313 | - " return temp\n", | |
314 | - "\n", | |
323 | + "# generate complete product dataframe\n", | |
315 | 324 | "measured_hour = product_df(walk_by_hours[[\"user\", \"local_date\"]], hours[[\"hour\"]])\n", |
316 | 325 | "measured_threehour = product_df(walk_by_threehours[[\"user\", \"local_date\"]], threehours[[\"threehour\"]])\n", |
317 | 326 | "\n", |
... | ... | @@ -337,7 +346,7 @@ |
337 | 346 | }, |
338 | 347 | { |
339 | 348 | "cell_type": "code", |
340 | - "execution_count": 141, | |
349 | + "execution_count": 150, | |
341 | 350 | "metadata": {}, |
342 | 351 | "outputs": [], |
343 | 352 | "source": [ |
... | ... | @@ -349,12 +358,6 @@ |
349 | 358 | "users = start_date.index\n", |
350 | 359 | "all_dates = pd.DataFrame({\"user\": [], \"local_date\": []})\n", |
351 | 360 | "\n", |
352 | - "def date_range(start_date, end_date):\n", | |
353 | - " delta = end_date - start_date\n", | |
354 | - "\n", | |
355 | - " for i in range(delta.days + 1):\n", | |
356 | - " yield start_date + timedelta(days=i)\n", | |
357 | - "\n", | |
358 | 361 | "# generate the lists of dates between start and end date\n", |
359 | 362 | "for userid in users:\n", |
360 | 363 | " current_user_dates = pd.DataFrame({\"user\": userid, \"local_date\": list(date_range(start_date[userid], end_date[userid]))})\n", |
... | ... | @@ -365,9 +368,22 @@ |
365 | 368 | "all_dates_hour = product_df(all_dates[[\"user\", \"local_date\"]], hours[[\"hour\"]])\n", |
366 | 369 | "\n", |
367 | 370 | "# final padded gait data\n", |
368 | - "padded_hours = padded_hours.merge(all_dates_hour, on=[\"user\", \"local_date\", \"hour\"], how=\"right\").fillna(0)\n", | |
369 | - "\n", | |
370 | - "\n", | |
371 | + "padded_hours = padded_hours.merge(all_dates_hour, on=[\"user\", \"local_date\", \"hour\"], how=\"right\").fillna(0)\n" | |
372 | + ] | |
373 | + }, | |
374 | + { | |
375 | + "cell_type": "markdown", | |
376 | + "metadata": {}, | |
377 | + "source": [ | |
378 | + "# Saving Data" | |
379 | + ] | |
380 | + }, | |
381 | + { | |
382 | + "cell_type": "code", | |
383 | + "execution_count": 151, | |
384 | + "metadata": {}, | |
385 | + "outputs": [], | |
386 | + "source": [ | |
371 | 387 | "# save the data\n", |
372 | 388 | "padded_hours.to_csv(os.path.join(data_dir, \"padded_hours.csv\"), index=False)\n", |
373 | 389 | "padded_threehours.to_csv(os.path.join(data_dir, \"padded_threehours.csv\"), index=False)" |