import pandas as pd
import numpy as np
读取文件
df = pd.read_csv(
'https://labfile.oss.aliyuncs.com/courses/1283/telecom_churn.csv')
df.head()
|
State
|
Account length
|
Area code
|
International plan
|
Voice mail plan
|
Number vmail messages
|
Total day minutes
|
Total day calls
|
Total day charge
|
Total eve minutes
|
Total eve calls
|
Total eve charge
|
Total night minutes
|
Total night calls
|
Total night charge
|
Total intl minutes
|
Total intl calls
|
Total intl charge
|
Customer service calls
|
Churn
|
0
|
KS
|
128
|
415
|
No
|
Yes
|
25
|
265.1
|
110
|
45.07
|
197.4
|
99
|
16.78
|
244.7
|
91
|
11.01
|
10.0
|
3
|
2.70
|
1
|
False
|
1
|
OH
|
107
|
415
|
No
|
Yes
|
26
|
161.6
|
123
|
27.47
|
195.5
|
103
|
16.62
|
254.4
|
103
|
11.45
|
13.7
|
3
|
3.70
|
1
|
False
|
2
|
NJ
|
137
|
415
|
No
|
No
|
0
|
243.4
|
114
|
41.38
|
121.2
|
110
|
10.30
|
162.6
|
104
|
7.32
|
12.2
|
5
|
3.29
|
0
|
False
|
3
|
OH
|
84
|
408
|
Yes
|
No
|
0
|
299.4
|
71
|
50.90
|
61.9
|
88
|
5.26
|
196.9
|
89
|
8.86
|
6.6
|
7
|
1.78
|
2
|
False
|
4
|
OK
|
75
|
415
|
Yes
|
No
|
0
|
166.7
|
113
|
28.34
|
148.3
|
122
|
12.61
|
186.9
|
121
|
8.41
|
10.1
|
3
|
2.73
|
3
|
False
|
df.info() #DataFrame 的一些总体信息。
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3333 entries, 0 to 3332
Data columns (total 20 columns):
State 3333 non-null object
Account length 3333 non-null int64
Area code 3333 non-null int64
International plan 3333 non-null object
Voice mail plan 3333 non-null object
Number vmail messages 3333 non-null int64
Total day minutes 3333 non-null float64
Total day calls 3333 non-null int64
Total day charge 3333 non-null float64
Total eve minutes 3333 non-null float64
Total eve calls 3333 non-null int64
Total eve charge 3333 non-null float64
Total night minutes 3333 non-null float64
Total night calls 3333 non-null int64
Total night charge 3333 non-null float64
Total intl minutes 3333 non-null float64
Total intl calls 3333 non-null int64
Total intl charge 3333 non-null float64
Customer service calls 3333 non-null int64
Churn 3333 non-null bool
dtypes: bool(1), float64(8), int64(8), object(3)
memory usage: 498.1+ KB
(3333, 20)
Index(['State', 'Account length', 'Area code', 'International plan',
'Voice mail plan', 'Number vmail messages', 'Total day minutes',
'Total day calls', 'Total day charge', 'Total eve minutes',
'Total eve calls', 'Total eve charge', 'Total night minutes',
'Total night calls', 'Total night charge', 'Total intl minutes',
'Total intl calls', 'Total intl charge', 'Customer service calls',
'Churn'],
dtype='object')
df['Churn'] = df['Churn'].astype('int64') #将Churn列修改数据类型
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3333 entries, 0 to 3332
Data columns (total 20 columns):
State 3333 non-null object
Account length 3333 non-null int64
Area code 3333 non-null int64
International plan 3333 non-null object
Voice mail plan 3333 non-null object
Number vmail messages 3333 non-null int64
Total day minutes 3333 non-null float64
Total day calls 3333 non-null int64
Total day charge 3333 non-null float64
Total eve minutes 3333 non-null float64
Total eve calls 3333 non-null int64
Total eve charge 3333 non-null float64
Total night minutes 3333 non-null float64
Total night calls 3333 non-null int64
Total night charge 3333 non-null float64
Total intl minutes 3333 non-null float64
Total intl calls 3333 non-null int64
Total intl charge 3333 non-null float64
Customer service calls 3333 non-null int64
Churn 3333 non-null int64
dtypes: float64(8), int64(9), object(3)
memory usage: 520.9+ KB
df.describe() #显示数值特征(int64 和 float64)的基本统计学特性
|
Account length
|
Area code
|
Number vmail messages
|
Total day minutes
|
Total day calls
|
Total day charge
|
Total eve minutes
|
Total eve calls
|
Total eve charge
|
Total night minutes
|
Total night calls
|
Total night charge
|
Total intl minutes
|
Total intl calls
|
Total intl charge
|
Customer service calls
|
Churn
|
count
|
3333.000000
|
3333.000000
|
3333.000000
|
3333.000000
|
3333.000000
|
3333.000000
|
3333.000000
|
3333.000000
|
3333.000000
|
3333.000000
|
3333.000000
|
3333.000000
|
3333.000000
|
3333.000000
|
3333.000000
|
3333.000000
|
3333.000000
|
mean
|
101.064806
|
437.182418
|
8.099010
|
179.775098
|
100.435644
|
30.562307
|
200.980348
|
100.114311
|
17.083540
|
200.872037
|
100.107711
|
9.039325
|
10.237294
|
4.479448
|
2.764581
|
1.562856
|
0.144914
|
std
|
39.822106
|
42.371290
|
13.688365
|
54.467389
|
20.069084
|
9.259435
|
50.713844
|
19.922625
|
4.310668
|
50.573847
|
19.568609
|
2.275873
|
2.791840
|
2.461214
|
0.753773
|
1.315491
|
0.352067
|
min
|
1.000000
|
408.000000
|
0.000000
|
0.000000
|
0.000000
|
0.000000
|
0.000000
|
0.000000
|
0.000000
|
23.200000
|
33.000000
|
1.040000
|
0.000000
|
0.000000
|
0.000000
|
0.000000
|
0.000000
|
25%
|
74.000000
|
408.000000
|
0.000000
|
143.700000
|
87.000000
|
24.430000
|
166.600000
|
87.000000
|
14.160000
|
167.000000
|
87.000000
|
7.520000
|
8.500000
|
3.000000
|
2.300000
|
1.000000
|
0.000000
|
50%
|
101.000000
|
415.000000
|
0.000000
|
179.400000
|
101.000000
|
30.500000
|
201.400000
|
100.000000
|
17.120000
|
201.200000
|
100.000000
|
9.050000
|
10.300000
|
4.000000
|
2.780000
|
1.000000
|
0.000000
|
75%
|
127.000000
|
510.000000
|
20.000000
|
216.400000
|
114.000000
|
36.790000
|
235.300000
|
114.000000
|
20.000000
|
235.300000
|
113.000000
|
10.590000
|
12.100000
|
6.000000
|
3.270000
|
2.000000
|
0.000000
|
max
|
243.000000
|
510.000000
|
51.000000
|
350.800000
|
165.000000
|
59.640000
|
363.700000
|
170.000000
|
30.910000
|
395.000000
|
175.000000
|
17.770000
|
20.000000
|
20.000000
|
5.400000
|
9.000000
|
1.000000
|
df.describe(include=['object', 'bool']) #通过 include 参数显式指定包含的数据类型,可以查看非数值特征的统计数据
|
State
|
International plan
|
Voice mail plan
|
count
|
3333
|
3333
|
3333
|
unique
|
51
|
2
|
2
|
top
|
WV
|
No
|
No
|
freq
|
106
|
3010
|
2411
|
df['Churn'].value_counts() #如其名
0 2850
1 483
Name: Churn, dtype: int64
df['Churn'].value_counts(normalize=True) #传入参数显示比例
0 0.855086
1 0.144914
Name: Churn, dtype: float64
df.sort_values(by='Total day charge', ascending=False).head() #根据Total day charge列进行排序 ascending=False为倒序排序
|
State
|
Account length
|
Area code
|
International plan
|
Voice mail plan
|
Number vmail messages
|
Total day minutes
|
Total day calls
|
Total day charge
|
Total eve minutes
|
Total eve calls
|
Total eve charge
|
Total night minutes
|
Total night calls
|
Total night charge
|
Total intl minutes
|
Total intl calls
|
Total intl charge
|
Customer service calls
|
Churn
|
365
|
CO
|
154
|
415
|
No
|
No
|
0
|
350.8
|
75
|
59.64
|
216.5
|
94
|
18.40
|
253.9
|
100
|
11.43
|
10.1
|
9
|
2.73
|
1
|
1
|
985
|
NY
|
64
|
415
|
Yes
|
No
|
0
|
346.8
|
55
|
58.96
|
249.5
|
79
|
21.21
|
275.4
|
102
|
12.39
|
13.3
|
9
|
3.59
|
1
|
1
|
2594
|
OH
|
115
|
510
|
Yes
|
No
|
0
|
345.3
|
81
|
58.70
|
203.4
|
106
|
17.29
|
217.5
|
107
|
9.79
|
11.8
|
8
|
3.19
|
1
|
1
|
156
|
OH
|
83
|
415
|
No
|
No
|
0
|
337.4
|
120
|
57.36
|
227.4
|
116
|
19.33
|
153.9
|
114
|
6.93
|
15.8
|
7
|
4.27
|
0
|
1
|
605
|
MO
|
112
|
415
|
No
|
No
|
0
|
335.5
|
77
|
57.04
|
212.5
|
109
|
18.06
|
265.0
|
132
|
11.93
|
12.7
|
8
|
3.43
|
2
|
1
|
df.sort_values(by=['Churn', 'Total day charge'],
ascending=[True, False]).head() #先按 Churn 离网率 升序排列,再按 Total day charge 每日总话费 降序排列
|
State
|
Account length
|
Area code
|
International plan
|
Voice mail plan
|
Number vmail messages
|
Total day minutes
|
Total day calls
|
Total day charge
|
Total eve minutes
|
Total eve calls
|
Total eve charge
|
Total night minutes
|
Total night calls
|
Total night charge
|
Total intl minutes
|
Total intl calls
|
Total intl charge
|
Customer service calls
|
Churn
|
688
|
MN
|
13
|
510
|
No
|
Yes
|
21
|
315.6
|
105
|
53.65
|
208.9
|
71
|
17.76
|
260.1
|
123
|
11.70
|
12.1
|
3
|
3.27
|
3
|
0
|
2259
|
NC
|
210
|
415
|
No
|
Yes
|
31
|
313.8
|
87
|
53.35
|
147.7
|
103
|
12.55
|
192.7
|
97
|
8.67
|
10.1
|
7
|
2.73
|
3
|
0
|
534
|
LA
|
67
|
510
|
No
|
No
|
0
|
310.4
|
97
|
52.77
|
66.5
|
123
|
5.65
|
246.5
|
99
|
11.09
|
9.2
|
10
|
2.48
|
4
|
0
|
575
|
SD
|
114
|
415
|
No
|
Yes
|
36
|
309.9
|
90
|
52.68
|
200.3
|
89
|
17.03
|
183.5
|
105
|
8.26
|
14.2
|
2
|
3.83
|
1
|
0
|
2858
|
AL
|
141
|
510
|
No
|
Yes
|
28
|
308.0
|
123
|
52.36
|
247.8
|
128
|
21.06
|
152.9
|
103
|
6.88
|
7.4
|
3
|
2.00
|
1
|
0
|
df[df['Churn'] == 1].mean()
Account length 102.664596
Area code 437.817805
Number vmail messages 5.115942
Total day minutes 206.914079
Total day calls 101.335404
Total day charge 35.175921
Total eve minutes 212.410145
Total eve calls 100.561077
Total eve charge 18.054969
Total night minutes 205.231677
Total night calls 100.399586
Total night charge 9.235528
Total intl minutes 10.700000
Total intl calls 4.163561
Total intl charge 2.889545
Customer service calls 2.229814
Churn 1.000000
dtype: float64
df[df['Churn'] == 1]['Total day minutes'].mean()
206.91407867494814
df.loc[0:5, 'State':'Area code'] #通过标签来选取
|
State
|
Account length
|
Area code
|
0
|
KS
|
128
|
415
|
1
|
OH
|
107
|
415
|
2
|
NJ
|
137
|
415
|
3
|
OH
|
84
|
408
|
4
|
OK
|
75
|
415
|
5
|
AL
|
118
|
510
|
df.iloc[0:5, 0:3] #通过索引来选取,类似于python的切片操作
|
State
|
Account length
|
Area code
|
0
|
KS
|
128
|
415
|
1
|
OH
|
107
|
415
|
2
|
NJ
|
137
|
415
|
3
|
OH
|
84
|
408
|
4
|
OK
|
75
|
415
|
State WY
Account length 243
Area code 510
International plan Yes
Voice mail plan Yes
Number vmail messages 51
Total day minutes 350.8
Total day calls 165
Total day charge 59.64
Total eve minutes 363.7
Total eve calls 170
Total eve charge 30.91
Total night minutes 395
Total night calls 175
Total night charge 17.77
Total intl minutes 20
Total intl calls 20
Total intl charge 5.4
Customer service calls 9
Churn 1
dtype: object
df[df['State'].apply(lambda state: state[0] == 'W')].head() #获取首字母为W的州
|
State
|
Account length
|
Area code
|
International plan
|
Voice mail plan
|
Number vmail messages
|
Total day minutes
|
Total day calls
|
Total day charge
|
Total eve minutes
|
Total eve calls
|
Total eve charge
|
Total night minutes
|
Total night calls
|
Total night charge
|
Total intl minutes
|
Total intl calls
|
Total intl charge
|
Customer service calls
|
Churn
|
9
|
WV
|
141
|
415
|
Yes
|
Yes
|
37
|
258.6
|
84
|
43.96
|
222.0
|
111
|
18.87
|
326.4
|
97
|
14.69
|
11.2
|
5
|
3.02
|
0
|
0
|
26
|
WY
|
57
|
408
|
No
|
Yes
|
39
|
213.0
|
115
|
36.21
|
191.1
|
112
|
16.24
|
182.7
|
115
|
8.22
|
9.5
|
3
|
2.57
|
0
|
0
|
44
|
WI
|
64
|
510
|
No
|
No
|
0
|
154.0
|
67
|
26.18
|
225.8
|
118
|
19.19
|
265.3
|
86
|
11.94
|
3.5
|
3
|
0.95
|
1
|
0
|
49
|
WY
|
97
|
415
|
No
|
Yes
|
24
|
133.2
|
135
|
22.64
|
217.2
|
58
|
18.46
|
70.6
|
79
|
3.18
|
11.0
|
3
|
2.97
|
1
|
0
|
54
|
WY
|
87
|
415
|
No
|
No
|
0
|
151.0
|
83
|
25.67
|
219.7
|
116
|
18.67
|
203.9
|
127
|
9.18
|
9.7
|
3
|
2.62
|
5
|
1
|
d = {'No': False, 'Yes': True}
df['International plan'] = df['International plan'].map(d) #将No转换为False Yes转换为True
df.head()
|
State
|
Account length
|
Area code
|
International plan
|
Voice mail plan
|
Number vmail messages
|
Total day minutes
|
Total day calls
|
Total day charge
|
Total eve minutes
|
Total eve calls
|
Total eve charge
|
Total night minutes
|
Total night calls
|
Total night charge
|
Total intl minutes
|
Total intl calls
|
Total intl charge
|
Customer service calls
|
Churn
|
0
|
KS
|
128
|
415
|
False
|
Yes
|
25
|
265.1
|
110
|
45.07
|
197.4
|
99
|
16.78
|
244.7
|
91
|
11.01
|
10.0
|
3
|
2.70
|
1
|
0
|
1
|
OH
|
107
|
415
|
False
|
Yes
|
26
|
161.6
|
123
|
27.47
|
195.5
|
103
|
16.62
|
254.4
|
103
|
11.45
|
13.7
|
3
|
3.70
|
1
|
0
|
2
|
NJ
|
137
|
415
|
False
|
No
|
0
|
243.4
|
114
|
41.38
|
121.2
|
110
|
10.30
|
162.6
|
104
|
7.32
|
12.2
|
5
|
3.29
|
0
|
0
|
3
|
OH
|
84
|
408
|
True
|
No
|
0
|
299.4
|
71
|
50.90
|
61.9
|
88
|
5.26
|
196.9
|
89
|
8.86
|
6.6
|
7
|
1.78
|
2
|
0
|
4
|
OK
|
75
|
415
|
True
|
No
|
0
|
166.7
|
113
|
28.34
|
148.3
|
122
|
12.61
|
186.9
|
121
|
8.41
|
10.1
|
3
|
2.73
|
3
|
0
|
df = df.replace({'Voice mail plan': d}) #用replace也可以达到相同的目的
df.head()
|
State
|
Account length
|
Area code
|
International plan
|
Voice mail plan
|
Number vmail messages
|
Total day minutes
|
Total day calls
|
Total day charge
|
Total eve minutes
|
Total eve calls
|
Total eve charge
|
Total night minutes
|
Total night calls
|
Total night charge
|
Total intl minutes
|
Total intl calls
|
Total intl charge
|
Customer service calls
|
Churn
|
0
|
KS
|
128
|
415
|
False
|
True
|
25
|
265.1
|
110
|
45.07
|
197.4
|
99
|
16.78
|
244.7
|
91
|
11.01
|
10.0
|
3
|
2.70
|
1
|
0
|
1
|
OH
|
107
|
415
|
False
|
True
|
26
|
161.6
|
123
|
27.47
|
195.5
|
103
|
16.62
|
254.4
|
103
|
11.45
|
13.7
|
3
|
3.70
|
1
|
0
|
2
|
NJ
|
137
|
415
|
False
|
False
|
0
|
243.4
|
114
|
41.38
|
121.2
|
110
|
10.30
|
162.6
|
104
|
7.32
|
12.2
|
5
|
3.29
|
0
|
0
|
3
|
OH
|
84
|
408
|
True
|
False
|
0
|
299.4
|
71
|
50.90
|
61.9
|
88
|
5.26
|
196.9
|
89
|
8.86
|
6.6
|
7
|
1.78
|
2
|
0
|
4
|
OK
|
75
|
415
|
True
|
False
|
0
|
166.7
|
113
|
28.34
|
148.3
|
122
|
12.61
|
186.9
|
121
|
8.41
|
10.1
|
3
|
2.73
|
3
|
0
|
汇总表
pd.crosstab(df['Churn'], df['International plan'])
International plan
|
False
|
True
|
Churn
|
|
|
0
|
2664
|
186
|
1
|
346
|
137
|
pd.crosstab(df['Churn'], df['Voice mail plan'], normalize=True)
Voice mail plan
|
False
|
True
|
Churn
|
|
|
0
|
0.602460
|
0.252625
|
1
|
0.120912
|
0.024002
|
total_calls = df['Total day calls'] + df['Total eve calls'] + \
df['Total night calls'] + df['Total intl calls']
# loc 参数是插入 Series 对象后选择的列数
# 设置为 len(df.columns)以便将计算后的 Total calls 粘贴到最后一列
df.insert(loc=len(df.columns), column='Total calls', value=total_calls)
df.head()
|
State
|
Account length
|
Area code
|
International plan
|
Voice mail plan
|
Number vmail messages
|
Total day minutes
|
Total day calls
|
Total day charge
|
Total eve minutes
|
…
|
Total eve charge
|
Total night minutes
|
Total night calls
|
Total night charge
|
Total intl minutes
|
Total intl calls
|
Total intl charge
|
Customer service calls
|
Churn
|
Total calls
|
0
|
KS
|
128
|
415
|
False
|
True
|
25
|
265.1
|
110
|
45.07
|
197.4
|
…
|
16.78
|
244.7
|
91
|
11.01
|
10.0
|
3
|
2.70
|
1
|
0
|
303
|
1
|
OH
|
107
|
415
|
False
|
True
|
26
|
161.6
|
123
|
27.47
|
195.5
|
…
|
16.62
|
254.4
|
103
|
11.45
|
13.7
|
3
|
3.70
|
1
|
0
|
332
|
2
|
NJ
|
137
|
415
|
False
|
False
|
0
|
243.4
|
114
|
41.38
|
121.2
|
…
|
10.30
|
162.6
|
104
|
7.32
|
12.2
|
5
|
3.29
|
0
|
0
|
333
|
3
|
OH
|
84
|
408
|
True
|
False
|
0
|
299.4
|
71
|
50.90
|
61.9
|
…
|
5.26
|
196.9
|
89
|
8.86
|
6.6
|
7
|
1.78
|
2
|
0
|
255
|
4
|
OK
|
75
|
415
|
True
|
False
|
0
|
166.7
|
113
|
28.34
|
148.3
|
…
|
12.61
|
186.9
|
121
|
8.41
|
10.1
|
3
|
2.73
|
3
|
0
|
359
|
5 rows × 21 columns
df['Total charge'] = df['Total day charge'] + df['Total eve charge'] + \
df['Total night charge'] + df['Total intl charge'] #不创造实例的情况下直接插入
df.head()
|
State
|
Account length
|
Area code
|
International plan
|
Voice mail plan
|
Number vmail messages
|
Total day minutes
|
Total day calls
|
Total day charge
|
Total eve minutes
|
…
|
Total night minutes
|
Total night calls
|
Total night charge
|
Total intl minutes
|
Total intl calls
|
Total intl charge
|
Customer service calls
|
Churn
|
Total calls
|
Total charge
|
0
|
KS
|
128
|
415
|
False
|
True
|
25
|
265.1
|
110
|
45.07
|
197.4
|
…
|
244.7
|
91
|
11.01
|
10.0
|
3
|
2.70
|
1
|
0
|
303
|
75.56
|
1
|
OH
|
107
|
415
|
False
|
True
|
26
|
161.6
|
123
|
27.47
|
195.5
|
…
|
254.4
|
103
|
11.45
|
13.7
|
3
|
3.70
|
1
|
0
|
332
|
59.24
|
2
|
NJ
|
137
|
415
|
False
|
False
|
0
|
243.4
|
114
|
41.38
|
121.2
|
…
|
162.6
|
104
|
7.32
|
12.2
|
5
|
3.29
|
0
|
0
|
333
|
62.29
|
3
|
OH
|
84
|
408
|
True
|
False
|
0
|
299.4
|
71
|
50.90
|
61.9
|
…
|
196.9
|
89
|
8.86
|
6.6
|
7
|
1.78
|
2
|
0
|
255
|
66.80
|
4
|
OK
|
75
|
415
|
True
|
False
|
0
|
166.7
|
113
|
28.34
|
148.3
|
…
|
186.9
|
121
|
8.41
|
10.1
|
3
|
2.73
|
3
|
0
|
359
|
52.09
|
5 rows × 22 columns