# -*- coding: utf-8 -*-
"""
Created on Sat Aug 25 18:50:19 2018
@author: Rony Sulca
"""
#### Importing Libraries ####
import pandas as pd
from dateutil import parser
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sn
dataset = pd.read_csv('appdata10.csv')
#### EDA ####
dataset.head(10) # Viewing the Data
dataset.describe() # Distribution of Numerical Variables
# First set of Feature cleaning
dataset["hour"] = dataset.hour.str.slice(1, 3).astype(int)
### Plotting
dataset2 = dataset.copy().drop(columns=['user', 'screen_list', 'enrolled_date',
                                        'first_open', 'enrolled'])
dataset2.head()
## Histograms
plt.suptitle('Histograms of Numerical Columns', fontsize=20)
for i in range(1, dataset2.shape[1] + 1):
    plt.subplot(3, 3, i)
    f = plt.gca()
    # f.axes.get_yaxis().set_visible(False)
    f.set_title(dataset2.columns.values[i - 1])
    vals = np.size(dataset2.iloc[:, i - 1].unique())
    plt.hist(dataset2.iloc[:, i - 1], bins=vals, color='#3F5D7D')
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
#plt.savefig('app_data_hist.jpg')
## Correlation with Response Variable
dataset2.corrwith(dataset.enrolled).plot.bar(figsize=(20, 10),
                                             title='Correlation with Response Variable',
                                             fontsize=15, rot=45,
                                             grid=True)
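# A hypothetical variant (not in the original script): sorting the bars makes the
# strongest positive and negative correlations easier to read -- same data, just ordered.
# dataset2.corrwith(dataset.enrolled).sort_values().plot.bar(figsize=(20, 10))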
## Correlation Matrix
sn.set(style="white", font_scale=2)
# Compute the correlation matrix
corr = dataset2.corr()
# Generate a mask for the upper triangle
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(18, 15))
f.suptitle("Correlation Matrix", fontsize = 40)
# Generate a custom diverging colormap
cmap = sn.diverging_palette(220, 10, as_cmap=True)
# Draw the heatmap with the mask and correct aspect ratio
sn.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
           square=True, linewidths=.5, cbar_kws={"shrink": .5})
#### Feature Engineering ####
# Formatting Date Columns
dataset.dtypes
dataset["first_open"] = [parser.parse(row_date) for row_date in dataset["first_open"]]
dataset["enrolled_date"] = [parser.parse(row_date) if isinstance(row_date, str) else row_date for row_date in dataset["enrolled_date"]]
dataset.dtypes
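# A vectorized alternative to the two comprehensions above (a sketch, assuming the raw
# columns hold parseable date strings; missing enrolment dates become NaT):
# dataset["first_open"] = pd.to_datetime(dataset["first_open"])
# dataset["enrolled_date"] = pd.to_datetime(dataset["enrolled_date"], errors="coerce")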
# Selecting Time For Response
dataset["difference"] = (dataset.enrolled_date-dataset.first_open).astype('timedelta64[h]')
response_hist = plt.hist(dataset["difference"].dropna(), color='#3F5D7D')
plt.title('Distribution of Time-Since-Screen-Reached')
plt.show()
plt.hist(dataset["difference"].dropna(), color='#3F5D7D', range = [0, 100])
plt.title('Distribution of Time-Since-Screen-Reached')
plt.show()
dataset.loc[dataset.difference > 48, 'enrolled'] = 0
dataset = dataset.drop(columns=['enrolled_date', 'difference', 'first_open'])
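# Optional sanity check (a sketch): after zeroing out enrolments that took longer than
# 48 hours, it is worth re-checking the class balance of the response.
# print(dataset["enrolled"].value_counts(normalize=True))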
## Formatting the screen_list Field
# Load Top Screens
top_screens = pd.read_csv('top_screens.csv').top_screens.values
top_screens
# Mapping Screens to Fields
dataset["screen_list"] = dataset.screen_list.astype(str) + ','
for sc in top_screens:
    dataset[sc] = dataset.screen_list.str.contains(sc, regex=False).astype(int)
    dataset['screen_list'] = dataset.screen_list.str.replace(sc + ",", "", regex=False)
dataset['Other'] = dataset.screen_list.str.count(",")
dataset = dataset.drop(columns=['screen_list'])
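# A minimal worked example of the mapping above, on made-up data (illustration only):
# if screen_list == "Credit1,Loan,Splash," and top_screens == ["Credit1", "Loan"],
# the loop sets Credit1 = 1 and Loan = 1 and strips both from the string; the leftover
# "Splash," contributes one comma, so Other == 1.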
# Funnels
savings_screens = ["Saving1",
"Saving2",
"Saving2Amount",
"Saving4",
"Saving5",
"Saving6",
"Saving7",
"Saving8",
"Saving9",
"Saving10"]
dataset["SavingCount"] = dataset[savings_screens].sum(axis=1)
dataset = dataset.drop(columns=savings_screens)
cm_screens = ["Credit1",
"Credit2",
"Credit3",
"Credit3Container",
"Credit3Dashboard"]
dataset["CMCount"] = dataset[cm_screens].sum(axis=1)
dataset = dataset.drop(columns=cm_screens)
cc_screens = ["CC1",
"CC1Category",
"CC3"]
dataset["CCCount"] = dataset[cc_screens].sum(axis=1)
dataset = dataset.drop(columns=cc_screens)
loan_screens = ["Loan",
"Loan2",
"Loan3",
"Loan4"]
dataset["LoansCount"] = dataset[loan_screens].sum(axis=1)
dataset = dataset.drop(columns=loan_screens)
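# The four funnel blocks above repeat the same collapse pattern; a small helper could
# express it once (a sketch -- 'collapse_funnel' is a hypothetical name, not part of
# the original script):
# def collapse_funnel(df, screens, count_col):
#     df[count_col] = df[screens].sum(axis=1)
#     return df.drop(columns=screens)
# e.g. dataset = collapse_funnel(dataset, loan_screens, "LoansCount")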
#### Saving Results ####
dataset.head()
dataset.describe()
dataset.columns
dataset.to_csv('new_appdata10.csv', index = False)