目录

learn_two

目录
import pandas as pd
import numpy as np

读取文件

df = pd.read_csv(
    'https://labfile.oss.aliyuncs.com/courses/1283/telecom_churn.csv')
df.head()
State Account length Area code International plan Voice mail plan Number vmail messages Total day minutes Total day calls Total day charge Total eve minutes Total eve calls Total eve charge Total night minutes Total night calls Total night charge Total intl minutes Total intl calls Total intl charge Customer service calls Churn
0 KS 128 415 No Yes 25 265.1 110 45.07 197.4 99 16.78 244.7 91 11.01 10.0 3 2.70 1 False
1 OH 107 415 No Yes 26 161.6 123 27.47 195.5 103 16.62 254.4 103 11.45 13.7 3 3.70 1 False
2 NJ 137 415 No No 0 243.4 114 41.38 121.2 110 10.30 162.6 104 7.32 12.2 5 3.29 0 False
3 OH 84 408 Yes No 0 299.4 71 50.90 61.9 88 5.26 196.9 89 8.86 6.6 7 1.78 2 False
4 OK 75 415 Yes No 0 166.7 113 28.34 148.3 122 12.61 186.9 121 8.41 10.1 3 2.73 3 False
df.info() #DataFrame 的一些总体信息。
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3333 entries, 0 to 3332
Data columns (total 20 columns):
State                     3333 non-null object
Account length            3333 non-null int64
Area code                 3333 non-null int64
International plan        3333 non-null object
Voice mail plan           3333 non-null object
Number vmail messages     3333 non-null int64
Total day minutes         3333 non-null float64
Total day calls           3333 non-null int64
Total day charge          3333 non-null float64
Total eve minutes         3333 non-null float64
Total eve calls           3333 non-null int64
Total eve charge          3333 non-null float64
Total night minutes       3333 non-null float64
Total night calls         3333 non-null int64
Total night charge        3333 non-null float64
Total intl minutes        3333 non-null float64
Total intl calls          3333 non-null int64
Total intl charge         3333 non-null float64
Customer service calls    3333 non-null int64
Churn                     3333 non-null bool
dtypes: bool(1), float64(8), int64(8), object(3)
memory usage: 498.1+ KB
df.shape #形状大小
(3333, 20)
df.columns #列名
Index(['State', 'Account length', 'Area code', 'International plan',
       'Voice mail plan', 'Number vmail messages', 'Total day minutes',
       'Total day calls', 'Total day charge', 'Total eve minutes',
       'Total eve calls', 'Total eve charge', 'Total night minutes',
       'Total night calls', 'Total night charge', 'Total intl minutes',
       'Total intl calls', 'Total intl charge', 'Customer service calls',
       'Churn'],
      dtype='object')
df['Churn'] = df['Churn'].astype('int64')  #将Churn列修改数据类型
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3333 entries, 0 to 3332
Data columns (total 20 columns):
State                     3333 non-null object
Account length            3333 non-null int64
Area code                 3333 non-null int64
International plan        3333 non-null object
Voice mail plan           3333 non-null object
Number vmail messages     3333 non-null int64
Total day minutes         3333 non-null float64
Total day calls           3333 non-null int64
Total day charge          3333 non-null float64
Total eve minutes         3333 non-null float64
Total eve calls           3333 non-null int64
Total eve charge          3333 non-null float64
Total night minutes       3333 non-null float64
Total night calls         3333 non-null int64
Total night charge        3333 non-null float64
Total intl minutes        3333 non-null float64
Total intl calls          3333 non-null int64
Total intl charge         3333 non-null float64
Customer service calls    3333 non-null int64
Churn                     3333 non-null int64
dtypes: float64(8), int64(9), object(3)
memory usage: 520.9+ KB
df.describe() #显示数值特征(int64 和 float64)的基本统计学特性
Account length Area code Number vmail messages Total day minutes Total day calls Total day charge Total eve minutes Total eve calls Total eve charge Total night minutes Total night calls Total night charge Total intl minutes Total intl calls Total intl charge Customer service calls Churn
count 3333.000000 3333.000000 3333.000000 3333.000000 3333.000000 3333.000000 3333.000000 3333.000000 3333.000000 3333.000000 3333.000000 3333.000000 3333.000000 3333.000000 3333.000000 3333.000000 3333.000000
mean 101.064806 437.182418 8.099010 179.775098 100.435644 30.562307 200.980348 100.114311 17.083540 200.872037 100.107711 9.039325 10.237294 4.479448 2.764581 1.562856 0.144914
std 39.822106 42.371290 13.688365 54.467389 20.069084 9.259435 50.713844 19.922625 4.310668 50.573847 19.568609 2.275873 2.791840 2.461214 0.753773 1.315491 0.352067
min 1.000000 408.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 23.200000 33.000000 1.040000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 74.000000 408.000000 0.000000 143.700000 87.000000 24.430000 166.600000 87.000000 14.160000 167.000000 87.000000 7.520000 8.500000 3.000000 2.300000 1.000000 0.000000
50% 101.000000 415.000000 0.000000 179.400000 101.000000 30.500000 201.400000 100.000000 17.120000 201.200000 100.000000 9.050000 10.300000 4.000000 2.780000 1.000000 0.000000
75% 127.000000 510.000000 20.000000 216.400000 114.000000 36.790000 235.300000 114.000000 20.000000 235.300000 113.000000 10.590000 12.100000 6.000000 3.270000 2.000000 0.000000
max 243.000000 510.000000 51.000000 350.800000 165.000000 59.640000 363.700000 170.000000 30.910000 395.000000 175.000000 17.770000 20.000000 20.000000 5.400000 9.000000 1.000000
df.describe(include=['object', 'bool']) #通过 include 参数显式指定包含的数据类型,可以查看非数值特征的统计数据
State International plan Voice mail plan
count 3333 3333 3333
unique 51 2 2
top WV No No
freq 106 3010 2411
df['Churn'].value_counts() #如其名
0    2850
1     483
Name: Churn, dtype: int64
df['Churn'].value_counts(normalize=True) #传入参数显示比例
0    0.855086
1    0.144914
Name: Churn, dtype: float64
df.sort_values(by='Total day charge', ascending=False).head() #根据Total day charge列进行排序 ascending=False为倒序排序
State Account length Area code International plan Voice mail plan Number vmail messages Total day minutes Total day calls Total day charge Total eve minutes Total eve calls Total eve charge Total night minutes Total night calls Total night charge Total intl minutes Total intl calls Total intl charge Customer service calls Churn
365 CO 154 415 No No 0 350.8 75 59.64 216.5 94 18.40 253.9 100 11.43 10.1 9 2.73 1 1
985 NY 64 415 Yes No 0 346.8 55 58.96 249.5 79 21.21 275.4 102 12.39 13.3 9 3.59 1 1
2594 OH 115 510 Yes No 0 345.3 81 58.70 203.4 106 17.29 217.5 107 9.79 11.8 8 3.19 1 1
156 OH 83 415 No No 0 337.4 120 57.36 227.4 116 19.33 153.9 114 6.93 15.8 7 4.27 0 1
605 MO 112 415 No No 0 335.5 77 57.04 212.5 109 18.06 265.0 132 11.93 12.7 8 3.43 2 1
df.sort_values(by=['Churn', 'Total day charge'],
               ascending=[True, False]).head() #先按 Churn 离网率 升序排列,再按 Total day charge 每日总话费 降序排列
State Account length Area code International plan Voice mail plan Number vmail messages Total day minutes Total day calls Total day charge Total eve minutes Total eve calls Total eve charge Total night minutes Total night calls Total night charge Total intl minutes Total intl calls Total intl charge Customer service calls Churn
688 MN 13 510 No Yes 21 315.6 105 53.65 208.9 71 17.76 260.1 123 11.70 12.1 3 3.27 3 0
2259 NC 210 415 No Yes 31 313.8 87 53.35 147.7 103 12.55 192.7 97 8.67 10.1 7 2.73 3 0
534 LA 67 510 No No 0 310.4 97 52.77 66.5 123 5.65 246.5 99 11.09 9.2 10 2.48 4 0
575 SD 114 415 No Yes 36 309.9 90 52.68 200.3 89 17.03 183.5 105 8.26 14.2 2 3.83 1 0
2858 AL 141 510 No Yes 28 308.0 123 52.36 247.8 128 21.06 152.9 103 6.88 7.4 3 2.00 1 0
df[df['Churn'] == 1].mean()
Account length            102.664596
Area code                 437.817805
Number vmail messages       5.115942
Total day minutes         206.914079
Total day calls           101.335404
Total day charge           35.175921
Total eve minutes         212.410145
Total eve calls           100.561077
Total eve charge           18.054969
Total night minutes       205.231677
Total night calls         100.399586
Total night charge          9.235528
Total intl minutes         10.700000
Total intl calls            4.163561
Total intl charge           2.889545
Customer service calls      2.229814
Churn                       1.000000
dtype: float64
df[df['Churn'] == 1]['Total day minutes'].mean()
206.91407867494814
df.loc[0:5, 'State':'Area code'] #通过标签来选取
State Account length Area code
0 KS 128 415
1 OH 107 415
2 NJ 137 415
3 OH 84 408
4 OK 75 415
5 AL 118 510
df.iloc[0:5, 0:3] #通过索引来选取,类似于python的切片操作
State Account length Area code
0 KS 128 415
1 OH 107 415
2 NJ 137 415
3 OH 84 408
4 OK 75 415
df.apply(max) #应用到每一列
State                        WY
Account length              243
Area code                   510
International plan          Yes
Voice mail plan             Yes
Number vmail messages        51
Total day minutes         350.8
Total day calls             165
Total day charge          59.64
Total eve minutes         363.7
Total eve calls             170
Total eve charge          30.91
Total night minutes         395
Total night calls           175
Total night charge        17.77
Total intl minutes           20
Total intl calls             20
Total intl charge           5.4
Customer service calls        9
Churn                         1
dtype: object
df[df['State'].apply(lambda state: state[0] == 'W')].head() #获取首字母为W的州 
State Account length Area code International plan Voice mail plan Number vmail messages Total day minutes Total day calls Total day charge Total eve minutes Total eve calls Total eve charge Total night minutes Total night calls Total night charge Total intl minutes Total intl calls Total intl charge Customer service calls Churn
9 WV 141 415 Yes Yes 37 258.6 84 43.96 222.0 111 18.87 326.4 97 14.69 11.2 5 3.02 0 0
26 WY 57 408 No Yes 39 213.0 115 36.21 191.1 112 16.24 182.7 115 8.22 9.5 3 2.57 0 0
44 WI 64 510 No No 0 154.0 67 26.18 225.8 118 19.19 265.3 86 11.94 3.5 3 0.95 1 0
49 WY 97 415 No Yes 24 133.2 135 22.64 217.2 58 18.46 70.6 79 3.18 11.0 3 2.97 1 0
54 WY 87 415 No No 0 151.0 83 25.67 219.7 116 18.67 203.9 127 9.18 9.7 3 2.62 5 1
d = {'No': False, 'Yes': True}
df['International plan'] = df['International plan'].map(d) #将No转换为False Yes转换为True
df.head()
State Account length Area code International plan Voice mail plan Number vmail messages Total day minutes Total day calls Total day charge Total eve minutes Total eve calls Total eve charge Total night minutes Total night calls Total night charge Total intl minutes Total intl calls Total intl charge Customer service calls Churn
0 KS 128 415 False Yes 25 265.1 110 45.07 197.4 99 16.78 244.7 91 11.01 10.0 3 2.70 1 0
1 OH 107 415 False Yes 26 161.6 123 27.47 195.5 103 16.62 254.4 103 11.45 13.7 3 3.70 1 0
2 NJ 137 415 False No 0 243.4 114 41.38 121.2 110 10.30 162.6 104 7.32 12.2 5 3.29 0 0
3 OH 84 408 True No 0 299.4 71 50.90 61.9 88 5.26 196.9 89 8.86 6.6 7 1.78 2 0
4 OK 75 415 True No 0 166.7 113 28.34 148.3 122 12.61 186.9 121 8.41 10.1 3 2.73 3 0
df = df.replace({'Voice mail plan': d}) #用replace也可以达到相同的目的
df.head()
State Account length Area code International plan Voice mail plan Number vmail messages Total day minutes Total day calls Total day charge Total eve minutes Total eve calls Total eve charge Total night minutes Total night calls Total night charge Total intl minutes Total intl calls Total intl charge Customer service calls Churn
0 KS 128 415 False True 25 265.1 110 45.07 197.4 99 16.78 244.7 91 11.01 10.0 3 2.70 1 0
1 OH 107 415 False True 26 161.6 123 27.47 195.5 103 16.62 254.4 103 11.45 13.7 3 3.70 1 0
2 NJ 137 415 False False 0 243.4 114 41.38 121.2 110 10.30 162.6 104 7.32 12.2 5 3.29 0 0
3 OH 84 408 True False 0 299.4 71 50.90 61.9 88 5.26 196.9 89 8.86 6.6 7 1.78 2 0
4 OK 75 415 True False 0 166.7 113 28.34 148.3 122 12.61 186.9 121 8.41 10.1 3 2.73 3 0

汇总表

pd.crosstab(df['Churn'], df['International plan'])
International plan False True
Churn
0 2664 186
1 346 137
pd.crosstab(df['Churn'], df['Voice mail plan'], normalize=True)
Voice mail plan False True
Churn
0 0.602460 0.252625
1 0.120912 0.024002
total_calls = df['Total day calls'] + df['Total eve calls'] + \
    df['Total night calls'] + df['Total intl calls']
# loc 参数是插入 Series 对象后选择的列数
# 设置为 len(df.columns)以便将计算后的 Total calls 粘贴到最后一列
df.insert(loc=len(df.columns), column='Total calls', value=total_calls)

df.head()
State Account length Area code International plan Voice mail plan Number vmail messages Total day minutes Total day calls Total day charge Total eve minutes Total eve charge Total night minutes Total night calls Total night charge Total intl minutes Total intl calls Total intl charge Customer service calls Churn Total calls
0 KS 128 415 False True 25 265.1 110 45.07 197.4 16.78 244.7 91 11.01 10.0 3 2.70 1 0 303
1 OH 107 415 False True 26 161.6 123 27.47 195.5 16.62 254.4 103 11.45 13.7 3 3.70 1 0 332
2 NJ 137 415 False False 0 243.4 114 41.38 121.2 10.30 162.6 104 7.32 12.2 5 3.29 0 0 333
3 OH 84 408 True False 0 299.4 71 50.90 61.9 5.26 196.9 89 8.86 6.6 7 1.78 2 0 255
4 OK 75 415 True False 0 166.7 113 28.34 148.3 12.61 186.9 121 8.41 10.1 3 2.73 3 0 359

5 rows × 21 columns

df['Total charge'] = df['Total day charge'] + df['Total eve charge'] + \
    df['Total night charge'] + df['Total intl charge'] #不创造实例的情况下直接插入
df.head()
State Account length Area code International plan Voice mail plan Number vmail messages Total day minutes Total day calls Total day charge Total eve minutes Total night minutes Total night calls Total night charge Total intl minutes Total intl calls Total intl charge Customer service calls Churn Total calls Total charge
0 KS 128 415 False True 25 265.1 110 45.07 197.4 244.7 91 11.01 10.0 3 2.70 1 0 303 75.56
1 OH 107 415 False True 26 161.6 123 27.47 195.5 254.4 103 11.45 13.7 3 3.70 1 0 332 59.24
2 NJ 137 415 False False 0 243.4 114 41.38 121.2 162.6 104 7.32 12.2 5 3.29 0 0 333 62.29
3 OH 84 408 True False 0 299.4 71 50.90 61.9 196.9 89 8.86 6.6 7 1.78 2 0 255 66.80
4 OK 75 415 True False 0 166.7 113 28.34 148.3 186.9 121 8.41 10.1 3 2.73 3 0 359 52.09

5 rows × 22 columns