Commit 163adf28ec78be3d551592fa51ed7a50c0aa6dcd

Authored by Junghwan Park
1 parent 979a88c36e
Exists in main

Make prepare_trteva_data faster: pre-expand one-hot gait encodings once and replace per-row DataFrame scans with an indexed lookup

Showing 1 changed file with 62 additions and 9 deletions Inline Diff

python-notebook/prepare_trteva_data.ipynb View file @ 163adf2
{ 1 1 {
"cells": [ 2 2 "cells": [
{ 3 3 {
"cell_type": "code", 4 4 "cell_type": "code",
"execution_count": 1, 5 5 "execution_count": 1,
"metadata": {}, 6 6 "metadata": {},
"outputs": [], 7 7 "outputs": [],
"source": [ 8 8 "source": [
"import numpy as np\n", 9 9 "import numpy as np\n",
"import pandas as pd\n", 10 10 "import pandas as pd\n",
"import os\n", 11 11 "import os\n",
"from tools import *\n", 12 12 "from tools import *\n",
"from constants import *\n", 13 13 "from constants import *\n",
"from tensorflow.keras.utils import to_categorical" 14 14 "from tensorflow.keras.utils import to_categorical\n",
15 "\n",
16 "# %load_ext line_profiler"
] 15 17 ]
}, 16 18 },
{ 17 19 {
"cell_type": "markdown", 18 20 "cell_type": "markdown",
"metadata": {}, 19 21 "metadata": {},
"source": [ 20 22 "source": [
"# Prepare Training, Testing, and Validation Data\n", 21 23 "# Prepare Training, Testing, and Validation Data\n",
"## Loading the preprocessed data" 22 24 "## Loading the preprocessed data"
] 23 25 ]
}, 24 26 },
{ 25 27 {
"cell_type": "code", 26 28 "cell_type": "code",
"execution_count": 2, 27 29 "execution_count": 2,
"metadata": {}, 28 30 "metadata": {},
"outputs": [], 29 31 "outputs": [],
"source": [ 30 32 "source": [
"# silence pandas SettingWithCopyWarning (chained assignment) for large dataframes\n", 31 33 "# silence pandas SettingWithCopyWarning (chained assignment) for large dataframes\n",
"pd.options.mode.chained_assignment = None\n", 32 34 "pd.options.mode.chained_assignment = None\n",
"\n", 33 35 "\n",
"data_dir = '../data'\n", 34 36 "data_dir = '../data'\n",
"\n", 35 37 "\n",
"padded_hours = pd.read_pickle(os.path.join(data_dir, 'padded_hours.pkl'))\n", 36 38 "padded_hours = pd.read_pickle(os.path.join(data_dir, 'padded_hours.pkl'))\n",
"padded_threehours = pd.read_pickle(os.path.join(data_dir, 'padded_threehours.pkl'))" 37 39 "padded_threehours = pd.read_pickle(os.path.join(data_dir, 'padded_threehours.pkl'))"
] 38 40 ]
}, 39 41 },
{ 40 42 {
"cell_type": "markdown", 41 43 "cell_type": "markdown",
"metadata": {}, 42 44 "metadata": {},
"source": [ 43 45 "source": [
46 "# Expanding one-hot-encoded gaits"
47 ]
48 },
49 {
50 "cell_type": "code",
51 "execution_count": 13,
52 "metadata": {},
53 "outputs": [
54 {
55 "name": "stdout",
56 "output_type": "stream",
57 "text": [
58 "(42360, 4) -> (127080, 4)\n"
59 ]
60 }
61 ],
62 "source": [
63 "def mass_one_hot_encoding(padded_hours, colname, n_classes):\n",
64 " def __mass_one_hot_encoding(padded_hours, colname, n_classes, n):\n",
65 " temp = padded_hours[padded_hours[colname] == n]\n",
66 "\n",
67 " return_df = pd.DataFrame(dtype=int)\n",
68 "\n",
69 " for i in range(n_classes):\n",
70 " temp_2 = temp.copy(deep=True)\n",
71 " temp_2[\"var\"] = i\n",
 72 "                temp_2[\"value\"] = 1 if (n == i) else 0\n",
73 " return_df = pd.concat([return_df, temp_2], ignore_index=True)\n",
74 "\n",
75 " return return_df\n",
76 " \n",
77 " mass_encoded = pd.DataFrame(dtype=int)\n",
78 " for n in range(n_classes):\n",
79 " mass_encoded = pd.concat([mass_encoded, __mass_one_hot_encoding(padded_hours, colname, n_classes, n)], ignore_index=True)\n",
80 " return mass_encoded\n",
81 "\n",
82 "padded_hours_encoded = mass_one_hot_encoding(padded_hours, 'walked', 3)\n",
83 "padded_hours_encoded[\"local_date\"] = padded_hours_encoded[\"local_date\"].astype(str)\n",
84 "padded_hours_encoded = padded_hours_encoded.set_index(['user', 'local_date']).sort_index()\n",
85 "\n",
86 "print(\"{} -> {}\".format(padded_hours.shape, padded_hours_encoded.shape))"
87 ]
88 },
89 {
90 "cell_type": "markdown",
91 "metadata": {},
92 "source": [
"## Enumerating Output Data" 44 93 "## Enumerating Output Data"
] 45 94 ]
}, 46 95 },
{ 47 96 {
"cell_type": "code", 48 97 "cell_type": "code",
"execution_count": 3, 49 98 "execution_count": 15,
"metadata": {}, 50 99 "metadata": {},
"outputs": [], 51 100 "outputs": [],
"source": [ 52 101 "source": [
"# return output value\n", 53 102 "# return output value\n",
"def get_output(y):\n", 54 103 "def get_output(y):\n",
" return y[\"walked\"]\n", 55 104 " return y[\"walked\"]\n",
"\n", 56 105 "\n",
"# return input value\n", 57 106 "# return input value\n",
"def get_input(y, padded_hours):\n", 58 107 "def get_input(y, padded_hours):\n",
" # base information\n", 59 108 " # base information\n",
" user = y[\"user\"]\n", 60 109 " user = y[\"user\"]\n",
" local_date = y[\"local_date\"]\n", 61 110 " local_date = y[\"local_date\"]\n",
" threehour_idx = y[\"threehour\"]\n", 62 111 " threehour_idx = y[\"threehour\"]\n",
" \n", 63 112 " \n",
" # derived information\n", 64 113 " # derived information\n",
" hour_idx = threehour_idx * 3\n", 65 114 " hour_idx = threehour_idx * 3\n",
" encoded_hour_idx = to_categorical(hour_idx, num_classes=24)\n", 66 115 " encoded_hour_idx = to_categorical(hour_idx, num_classes=24)\n",
" end_date = local_date - timedelta(days=1)\n", 67 116 " end_date = local_date - timedelta(days=1)\n",
" start_date = end_date - timedelta(days=7*NUMBER_OF_WEEKS_FOR_LOOKING_BACK-1)\n", 68 117 " start_date = end_date - timedelta(days=7*NUMBER_OF_WEEKS_FOR_LOOKING_BACK-1)\n",
" weekday = local_date.weekday()\n", 69 118 " weekday = local_date.weekday()\n",
" encoded_weekday = to_categorical(weekday, num_classes=7)\n", 70 119 " encoded_weekday = to_categorical(weekday, num_classes=7)\n",
" encoded_month = to_categorical(local_date.month, num_classes=12)\n", 71 120 " encoded_month = to_categorical(local_date.month, num_classes=12)\n",
" encoded_day_of_month = to_categorical(local_date.day, num_classes=31)\n", 72 121 " encoded_day_of_month = to_categorical(local_date.day, num_classes=31)\n",
"\n", 73 122 "\n",
" gait = pd.Series([], dtype=int)\n", 74 123 " gait = pd.Series([], dtype=int)\n",
" # gait movement\n", 75 124 " # gait movement\n",
" zero_move = 0\n", 76 125 " zero_move = 0\n",
" for a_date in date_range(start_date, end_date):\n", 77 126 " for a_date in date_range(start_date, end_date):\n",
" day_df = padded_hours[(padded_hours[\"user\"] == user) & (padded_hours[\"local_date\"] == a_date)]\n", 78 127 " key = (user, a_date.strftime(\"%Y-%m-%d\"))\n",
" if day_df.size == 0:\n", 79 128 " if key in padded_hours_encoded.index:\n",
" gait = pd.concat([gait, pd.Series([1,0,0] * 24, dtype=int)])\n", 80 129 " day_df = padded_hours_encoded.loc[key, \"value\"]\n",
" zero_move += 1\n", 81 130 " gait = pd.concat([gait, day_df], ignore_index=True)\n",
" else:\n", 82 131 " else:\n",
" gait = pd.concat([gait, pd.Series(to_categorical(day_df[\"walked\"].values, 3, dtype=int).reshape(24*3), dtype=int)])\n", 83 132 " gait = pd.concat([gait, pd.Series([1,0,0] * 24, dtype=int)], ignore_index=True)\n",
133 " zero_move += 1\n",
" if zero_move == 5 * 7:\n", 84 134 " if zero_move == 5 * 7:\n",
" raise Exception(\"No movement data\")\n", 85 135 " raise Exception(\"No movement data\")\n",
"\n", 86 136 "\n",
" return_series = pd.Series([], dtype=int)\n", 87 137 " return_series = pd.Series([], dtype=int)\n",
" return_series = pd.concat([return_series, pd.Series(encoded_hour_idx, dtype=np.int_)])\n", 88 138 " return_series = pd.concat([return_series, pd.Series(encoded_hour_idx, dtype=np.int_)])\n",
" return_series = pd.concat([return_series, pd.Series(encoded_weekday, dtype=np.int_)])\n", 89 139 " return_series = pd.concat([return_series, pd.Series(encoded_weekday, dtype=np.int_)])\n",
" return_series = pd.concat([return_series, pd.Series(encoded_month, dtype=np.int_)])\n", 90 140 " return_series = pd.concat([return_series, pd.Series(encoded_month, dtype=np.int_)])\n",
" return_series = pd.concat([return_series, pd.Series(encoded_day_of_month, dtype=np.int_)])\n", 91 141 " return_series = pd.concat([return_series, pd.Series(encoded_day_of_month, dtype=np.int_)])\n",
" return_series = pd.concat([return_series, gait])\n", 92 142 " return_series = pd.concat([return_series, gait])\n",
" \n", 93 143 " \n",
" return return_series\n", 94 144 " return return_series\n",
"\n", 95 145 "\n",
"def get_database(start_idx, end_idx):\n", 96 146 "def get_database(start_idx, end_idx):\n",
" database = pd.DataFrame({}, dtype=int)\n", 97 147 " database = pd.DataFrame({}, dtype=int)\n",
"\n", 98 148 "\n",
" for i in range(start_idx, end_idx):\n", 99 149 " for i in range(start_idx, end_idx):\n",
" try:\n", 100 150 " try:\n",
" y = padded_threehours.iloc[i, :]\n", 101 151 " y = padded_threehours.iloc[i, :]\n",
" user = y[\"user\"]\n", 102 152 " user = y[\"user\"]\n",
" local_date = y[\"local_date\"]\n", 103 153 " local_date = y[\"local_date\"]\n",
" first_day = padded_hours[padded_hours[\"user\"] == user][\"local_date\"].min()\n", 104 154 " first_day = padded_hours[padded_hours[\"user\"] == user][\"local_date\"].min()\n",
" date_diff = (local_date - first_day).days\n", 105 155 " date_diff = (local_date - first_day).days\n",
"\n", 106 156 "\n",
" threehour_idx = y[\"threehour\"]\n", 107 157 " threehour_idx = y[\"threehour\"]\n",
" hour_idx = threehour_idx * 3\n", 108 158 " hour_idx = threehour_idx * 3\n",
"\n", 109 159 "\n",
" output = get_output(y)\n", 110 160 " output = get_output(y)\n",
" input = get_input(y, padded_hours)\n", 111 161 " input = get_input(y, padded_hours)\n",
"\n", 112 162 "\n",
" temp_series = pd.Series([], dtype=int)\n", 113 163 " temp_series = pd.Series([], dtype=int)\n",
" temp_series = pd.concat([temp_series, pd.Series(user, dtype=int)])\n", 114 164 " temp_series = pd.concat([temp_series, pd.Series(user, dtype=int)])\n",
" temp_series = pd.concat([temp_series, pd.Series(date_diff, dtype=int)])\n", 115 165 " temp_series = pd.concat([temp_series, pd.Series(date_diff, dtype=int)])\n",
" temp_series = pd.concat([temp_series, pd.Series(threehour_idx, dtype=int)])\n", 116 166 " temp_series = pd.concat([temp_series, pd.Series(threehour_idx, dtype=int)])\n",
" temp_series = pd.concat([temp_series, pd.Series(hour_idx, dtype=int)])\n", 117 167 " temp_series = pd.concat([temp_series, pd.Series(hour_idx, dtype=int)])\n",
" temp_series = pd.concat([temp_series, pd.Series(output, dtype=int)])\n", 118 168 " temp_series = pd.concat([temp_series, pd.Series(output, dtype=int)])\n",
" temp_series = pd.concat([temp_series, pd.Series(input, dtype=int)])\n", 119 169 " temp_series = pd.concat([temp_series, pd.Series(input, dtype=int)]).reset_index(drop=True)\n",
"\n", 120 170 "\n",
" database = pd.concat([database, temp_series], axis=1)\n", 121 171 " database = pd.concat([database, temp_series], axis=1)\n",
" # print(input)\n", 122 172 " # print(input)\n",
" except Exception as e:\n", 123 173 " except Exception as e:\n",
" # print(\"Error:\", e)\n", 124 174 " # print(\"Error:\", e)\n",
" pass\n", 125 175 " pass\n",
"\n", 126 176 "\n",
" return database\n", 127 177 " return database\n",
"\n" 128 178 "\n",
179 "database = get_database(0, 100)\n",
180 "\n",
181 "database.to_pickle(os.path.join(data_dir, \"database.pkl\"))"
] 129 182 ]
}, 130 183 },
{ 131 184 {
"cell_type": "code", 132 185 "cell_type": "code",
"execution_count": 4, 133 186 "execution_count": 4,
"metadata": {}, 134 187 "metadata": {},
"outputs": [], 135 188 "outputs": [],
"source": [ 136 189 "source": [
"from tensorflow.keras.datasets import mnist\n", 137 190 "from tensorflow.keras.datasets import mnist\n",
"from tensorflow.keras.models import Sequential\n", 138 191 "from tensorflow.keras.models import Sequential\n",
"from tensorflow.keras.layers import Dense, Dropout, Flatten, Activation\n", 139 192 "from tensorflow.keras.layers import Dense, Dropout, Flatten, Activation\n",
"\n", 140 193 "\n",
"\n", 141 194 "\n",
"(x_train, y_train), (x_test, y_test) = mnist.load_data(path='mnist.npz')\n", 142 195 "(x_train, y_train), (x_test, y_test) = mnist.load_data(path='mnist.npz')\n",
"\n", 143 196 "\n",
"X_train = x_train.reshape(60000, 784).astype('float32') / 255\n", 144 197 "X_train = x_train.reshape(60000, 784).astype('float32') / 255\n",
"X_test = x_test.reshape(10000, 784).astype('float32') / 255\n", 145 198 "X_test = x_test.reshape(10000, 784).astype('float32') / 255\n",
"\n", 146 199 "\n",
"Y_train = to_categorical(y_train, 10)\n", 147 200 "Y_train = to_categorical(y_train, 10)\n",
"Y_test = to_categorical(y_test, 10)" 148 201 "Y_test = to_categorical(y_test, 10)"
] 149 202 ]
}, 150 203 },
{ 151 204 {
"cell_type": "code", 152 205 "cell_type": "code",
"execution_count": null, 153 206 "execution_count": null,
"metadata": {}, 154 207 "metadata": {},
"outputs": [], 155 208 "outputs": [],
"source": [] 156 209 "source": []
} 157 210 }
], 158 211 ],
"metadata": { 159 212 "metadata": {
"interpreter": { 160 213 "interpreter": {
"hash": "80dbe1014b4652684caa329d41db00af3ae439be86b11eab7e35b518e5d8ab1a" 161 214 "hash": "80dbe1014b4652684caa329d41db00af3ae439be86b11eab7e35b518e5d8ab1a"
}, 162 215 },
"kernelspec": { 163 216 "kernelspec": {
"display_name": "Python 3.7.9 64-bit ('venv': venv)", 164 217 "display_name": "Python 3.7.9 64-bit ('venv': venv)",
"language": "python", 165 218 "language": "python",
"name": "python3" 166 219 "name": "python3"
}, 167 220 },
"language_info": { 168 221 "language_info": {
"codemirror_mode": { 169 222 "codemirror_mode": {
"name": "ipython", 170 223 "name": "ipython",
"version": 3 171 224 "version": 3
}, 172 225 },
"file_extension": ".py", 173 226 "file_extension": ".py",
"mimetype": "text/x-python", 174 227 "mimetype": "text/x-python",
"name": "python", 175 228 "name": "python",
"nbconvert_exporter": "python", 176 229 "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", 177 230 "pygments_lexer": "ipython3",
"version": "3.7.9" 178 231 "version": "3.7.9"
}, 179 232 },
"orig_nbformat": 4 180 233 "orig_nbformat": 4
}, 181 234 },
"nbformat": 4, 182 235 "nbformat": 4,
"nbformat_minor": 2 183 236 "nbformat_minor": 2
} 184 237 }
185 238