导入库
import pandas as pd
import numpy as np
一维数组
arr = [0, 1, 2, 3, 4]
s1 = pd.Series(arr) # 如果不指定索引,则默认从 0 开始
s1
0 0
1 1
2 2
3 3
4 4
dtype: int64
n = np.random.randn(5) # 创建一个随机 Ndarray 数组
index = ['a', 'b', 'c', 'd', 'e']
s2 = pd.Series(n, index=index)
s2
a 0.647546
b 0.197186
c 0.590904
d -0.422565
e -0.122490
dtype: float64
用字典创建一维数组
d = {'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5} # 定义示例字典
s3 = pd.Series(d)
s3
a 1
b 2
c 3
d 4
e 5
dtype: int64
Series 基本操作
print(s1) # 以 s1 为例
s1.index = ['A', 'B', 'C', 'D', 'E'] # 修改后的索引
s1
0 0
1 1
2 2
3 3
4 4
dtype: int64
A 0
B 1
C 2
D 3
E 4
dtype: int64
s4 = s3.append(s1) # 将 s1 拼接到 s3
s4
a 1
b 2
c 3
d 4
e 5
A 0
B 1
C 2
D 3
E 4
dtype: int64
print(s4)
s4 = s4.drop('e') # 删除索引为 e 的值
s4
a 1
b 2
c 3
d 4
e 5
A 0
B 1
C 2
D 3
E 4
dtype: int64
a 1
b 2
c 3
d 4
A 0
B 1
C 2
D 3
E 4
dtype: int64
s4['A'] = 6 # 修改索引为 A 的值 = 6
s4
a 1
b 2
c 3
d 4
A 6
B 1
C 2
D 3
E 4
dtype: int64
s4.add(s3)#如果索引不同则为NAN sub为减,mul为乘,div为除
A NaN
B NaN
C NaN
D NaN
E NaN
a 2.0
b 4.0
c 6.0
d 8.0
e NaN
dtype: float64
print(s4.median(),#中位数
s4.sum(),#求和
s4.max(),#求最大值
s4.min())#求最小值
3.0 26 6 1
DataFrame
dates = pd.date_range('today', periods=6) # 定义时间序列作为 index
num_arr = np.random.randn(6, 4) # 传入 numpy 随机数组
columns = ['A', 'B', 'C', 'D'] # 将列表作为列名
df1 = pd.DataFrame(num_arr, index=dates, columns=columns)
df1
|
A
|
B
|
C
|
D
|
2020-11-13 20:54:24.760775
|
-0.403545
|
0.643881
|
-1.697286
|
-0.616257
|
2020-11-14 20:54:24.760775
|
0.102470
|
1.451324
|
-0.269714
|
-0.316194
|
2020-11-15 20:54:24.760775
|
0.342680
|
-0.137238
|
-0.785406
|
-0.441022
|
2020-11-16 20:54:24.760775
|
0.130079
|
1.929569
|
-0.756832
|
-2.490272
|
2020-11-17 20:54:24.760775
|
1.774664
|
1.037605
|
0.275989
|
-0.982924
|
2020-11-18 20:54:24.760775
|
0.952462
|
1.666130
|
-0.920394
|
-1.358411
|
data = {'animal': ['cat', 'cat', 'snake', 'dog', 'dog', 'cat', 'snake', 'cat', 'dog', 'dog'],
'age': [2.5, 3, 0.5, np.nan, 5, 2, 4.5, np.nan, 7, 3],
'visits': [1, 3, 2, 3, 2, 3, 1, 1, 2, 1],
'priority': ['yes', 'yes', 'no', 'yes', 'no', 'no', 'no', 'yes', 'no', 'no']}
labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']
df2 = pd.DataFrame(data, index=labels)
df2#df2.head()查看前几行 df2.tail()查看后几行
|
age
|
animal
|
priority
|
visits
|
a
|
2.5
|
cat
|
yes
|
1
|
b
|
3.0
|
cat
|
yes
|
3
|
c
|
0.5
|
snake
|
no
|
2
|
d
|
NaN
|
dog
|
yes
|
3
|
e
|
5.0
|
dog
|
no
|
2
|
f
|
2.0
|
cat
|
no
|
3
|
g
|
4.5
|
snake
|
no
|
1
|
h
|
NaN
|
cat
|
yes
|
1
|
i
|
7.0
|
dog
|
no
|
2
|
j
|
3.0
|
dog
|
no
|
1
|
df2.columns,df2.index,df2.values #查看索引.列名和值
(Index(['age', 'animal', 'priority', 'visits'], dtype='object'),
Index(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'], dtype='object'),
array([[2.5, 'cat', 'yes', 1],
[3.0, 'cat', 'yes', 3],
[0.5, 'snake', 'no', 2],
[nan, 'dog', 'yes', 3],
[5.0, 'dog', 'no', 2],
[2.0, 'cat', 'no', 3],
[4.5, 'snake', 'no', 1],
[nan, 'cat', 'yes', 1],
[7.0, 'dog', 'no', 2],
[3.0, 'dog', 'no', 1]], dtype=object))
|
age
|
visits
|
count
|
8.000000
|
10.000000
|
mean
|
3.437500
|
1.900000
|
std
|
2.007797
|
0.875595
|
min
|
0.500000
|
1.000000
|
25%
|
2.375000
|
1.000000
|
50%
|
3.000000
|
2.000000
|
75%
|
4.625000
|
2.750000
|
max
|
7.000000
|
3.000000
|
|
a
|
b
|
c
|
d
|
e
|
f
|
g
|
h
|
i
|
j
|
age
|
2.5
|
3
|
0.5
|
NaN
|
5
|
2
|
4.5
|
NaN
|
7
|
3
|
animal
|
cat
|
cat
|
snake
|
dog
|
dog
|
cat
|
snake
|
cat
|
dog
|
dog
|
priority
|
yes
|
yes
|
no
|
yes
|
no
|
no
|
no
|
yes
|
no
|
no
|
visits
|
1
|
3
|
2
|
3
|
2
|
3
|
1
|
1
|
2
|
1
|
df2.sort_values(by='age')
|
age
|
animal
|
priority
|
visits
|
c
|
0.5
|
snake
|
no
|
2
|
f
|
2.0
|
cat
|
no
|
3
|
a
|
2.5
|
cat
|
yes
|
1
|
b
|
3.0
|
cat
|
yes
|
3
|
j
|
3.0
|
dog
|
no
|
1
|
g
|
4.5
|
snake
|
no
|
1
|
e
|
5.0
|
dog
|
no
|
2
|
i
|
7.0
|
dog
|
no
|
2
|
d
|
NaN
|
dog
|
yes
|
3
|
h
|
NaN
|
cat
|
yes
|
1
|
df2['age']#显示age列 == df2.age
a 2.5
b 3.0
c 0.5
d NaN
e 5.0
f 2.0
g 4.5
h NaN
i 7.0
j 3.0
Name: age, dtype: float64
df2[['age','animal']] #多列查询
|
age
|
animal
|
a
|
2.5
|
cat
|
b
|
3.0
|
cat
|
c
|
0.5
|
snake
|
d
|
NaN
|
dog
|
e
|
5.0
|
dog
|
f
|
2.0
|
cat
|
g
|
4.5
|
snake
|
h
|
NaN
|
cat
|
i
|
7.0
|
dog
|
j
|
3.0
|
dog
|
df2.iloc[1:3]#查询2,3行 ==df2[1:3]
|
age
|
animal
|
priority
|
visits
|
b
|
3.0
|
cat
|
yes
|
3
|
c
|
0.5
|
snake
|
no
|
2
|
df3 = df2.copy()#拷贝一份
df3
|
age
|
animal
|
priority
|
visits
|
a
|
2.5
|
cat
|
yes
|
1
|
b
|
3.0
|
cat
|
yes
|
3
|
c
|
0.5
|
snake
|
no
|
2
|
d
|
NaN
|
dog
|
yes
|
3
|
e
|
5.0
|
dog
|
no
|
2
|
f
|
2.0
|
cat
|
no
|
3
|
g
|
4.5
|
snake
|
no
|
1
|
h
|
NaN
|
cat
|
yes
|
1
|
i
|
7.0
|
dog
|
no
|
2
|
j
|
3.0
|
dog
|
no
|
1
|
df3.isnull()#检测是否为空,为空返回True
|
age
|
animal
|
priority
|
visits
|
a
|
False
|
False
|
False
|
False
|
b
|
False
|
False
|
False
|
False
|
c
|
False
|
False
|
False
|
False
|
d
|
True
|
False
|
False
|
False
|
e
|
False
|
False
|
False
|
False
|
f
|
False
|
False
|
False
|
False
|
g
|
False
|
False
|
False
|
False
|
h
|
True
|
False
|
False
|
False
|
i
|
False
|
False
|
False
|
False
|
j
|
False
|
False
|
False
|
False
|
num = pd.Series([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], index=df3.index)#建立数据,注意索引和df3一致
df3['No.'] = num # 添加以 'No.' 为列名的新数据列
df3
|
age
|
animal
|
priority
|
visits
|
No.
|
a
|
2.5
|
cat
|
yes
|
1
|
0
|
b
|
3.0
|
cat
|
yes
|
3
|
1
|
c
|
0.5
|
snake
|
no
|
2
|
2
|
d
|
NaN
|
dog
|
yes
|
3
|
3
|
e
|
5.0
|
dog
|
no
|
2
|
4
|
f
|
2.0
|
cat
|
no
|
3
|
5
|
g
|
4.5
|
snake
|
no
|
1
|
6
|
h
|
NaN
|
cat
|
yes
|
1
|
7
|
i
|
7.0
|
dog
|
no
|
2
|
8
|
j
|
3.0
|
dog
|
no
|
1
|
9
|
df3.iat[1,1] = 2 #根据索引改变值,这里相当于二维数组了
df3
|
age
|
animal
|
priority
|
visits
|
No.
|
a
|
2.5
|
cat
|
yes
|
1
|
0
|
b
|
3.0
|
2
|
yes
|
3
|
1
|
c
|
0.5
|
snake
|
no
|
2
|
2
|
d
|
NaN
|
dog
|
yes
|
3
|
3
|
e
|
5.0
|
dog
|
no
|
2
|
4
|
f
|
1.5
|
cat
|
no
|
3
|
5
|
g
|
4.5
|
snake
|
no
|
1
|
6
|
h
|
NaN
|
cat
|
yes
|
1
|
7
|
i
|
7.0
|
dog
|
no
|
2
|
8
|
j
|
3.0
|
dog
|
no
|
1
|
9
|
df3.loc['f', 'age'] = 1.5 #根据标签来修改
df3
|
age
|
animal
|
priority
|
visits
|
No.
|
a
|
2.5
|
cat
|
yes
|
1
|
0
|
b
|
3.0
|
2
|
yes
|
3
|
1
|
c
|
0.5
|
snake
|
no
|
2
|
2
|
d
|
NaN
|
dog
|
yes
|
3
|
3
|
e
|
5.0
|
dog
|
no
|
2
|
4
|
f
|
1.5
|
cat
|
no
|
3
|
5
|
g
|
4.5
|
snake
|
no
|
1
|
6
|
h
|
NaN
|
cat
|
yes
|
1
|
7
|
i
|
7.0
|
dog
|
no
|
2
|
8
|
j
|
3.0
|
dog
|
no
|
1
|
9
|
age 3.375
visits 1.900
No. 4.500
dtype: float64
df3['visits'].sum()#对visits列进行求和
19
df4 = df3.copy()
print(df4)
df4.fillna(value=3)#如果是NAN补充为3
age animal priority visits No.
a 2.5 cat yes 1 0
b 3.0 2 yes 3 1
c 0.5 snake no 2 2
d NaN dog yes 3 3
e 5.0 dog no 2 4
f 1.5 cat no 3 5
g 4.5 snake no 1 6
h NaN cat yes 1 7
i 7.0 dog no 2 8
j 3.0 dog no 1 9
|
age
|
animal
|
priority
|
visits
|
No.
|
a
|
2.5
|
cat
|
yes
|
1
|
0
|
b
|
3.0
|
2
|
yes
|
3
|
1
|
c
|
0.5
|
snake
|
no
|
2
|
2
|
d
|
3.0
|
dog
|
yes
|
3
|
3
|
e
|
5.0
|
dog
|
no
|
2
|
4
|
f
|
1.5
|
cat
|
no
|
3
|
5
|
g
|
4.5
|
snake
|
no
|
1
|
6
|
h
|
3.0
|
cat
|
yes
|
1
|
7
|
i
|
7.0
|
dog
|
no
|
2
|
8
|
j
|
3.0
|
dog
|
no
|
1
|
9
|
df5 = df3.copy()
print(df5)
df5.dropna(how='any') # 任何存在 NaN 的行都将被删除
age animal priority visits No.
a 2.5 cat yes 1 0
b 3.0 2 yes 3 1
c 0.5 snake no 2 2
d NaN dog yes 3 3
e 5.0 dog no 2 4
f 1.5 cat no 3 5
g 4.5 snake no 1 6
h NaN cat yes 1 7
i 7.0 dog no 2 8
j 3.0 dog no 1 9
|
age
|
animal
|
priority
|
visits
|
No.
|
a
|
2.5
|
cat
|
yes
|
1
|
0
|
b
|
3.0
|
2
|
yes
|
3
|
1
|
c
|
0.5
|
snake
|
no
|
2
|
2
|
e
|
5.0
|
dog
|
no
|
2
|
4
|
f
|
1.5
|
cat
|
no
|
3
|
5
|
g
|
4.5
|
snake
|
no
|
1
|
6
|
i
|
7.0
|
dog
|
no
|
2
|
8
|
j
|
3.0
|
dog
|
no
|
1
|
9
|
left = pd.DataFrame({'key': ['foo1', 'foo2'], 'one': [1, 2]})
right = pd.DataFrame({'key': ['foo2', 'foo3'], 'two': [4, 5]})
print(left)
print(right)
# 按照 key 列对齐连接,只存在 foo2 相同,所以最后变成一行
pd.merge(left, right, on='key')
key one
0 foo1 1
1 foo2 2
key two
0 foo2 4
1 foo3 5
df3.to_csv('animal.csv')
print("写入成功.")
写入成功.
df_animal = pd.read_csv('animal.csv')
df_animal
|
Unnamed: 0
|
age
|
animal
|
priority
|
visits
|
No.
|
0
|
a
|
2.5
|
cat
|
yes
|
1
|
0
|
1
|
b
|
3.0
|
2
|
yes
|
3
|
1
|
2
|
c
|
0.5
|
snake
|
no
|
2
|
2
|
3
|
d
|
NaN
|
dog
|
yes
|
3
|
3
|
4
|
e
|
5.0
|
dog
|
no
|
2
|
4
|
5
|
f
|
1.5
|
cat
|
no
|
3
|
5
|
6
|
g
|
4.5
|
snake
|
no
|
1
|
6
|
7
|
h
|
NaN
|
cat
|
yes
|
1
|
7
|
8
|
i
|
7.0
|
dog
|
no
|
2
|
8
|
9
|
j
|
3.0
|
dog
|
no
|
1
|
9
|
df3.to_excel('animal.xlsx', sheet_name='Sheet1')
print("写入成功.")
写入成功.
pd.read_excel('animal.xlsx', 'Sheet1', index_col=None, na_values=['NA'])
|
age
|
animal
|
priority
|
visits
|
No.
|
a
|
2.5
|
cat
|
yes
|
1
|
0
|
b
|
3.0
|
2
|
yes
|
3
|
1
|
c
|
0.5
|
snake
|
no
|
2
|
2
|
d
|
NaN
|
dog
|
yes
|
3
|
3
|
e
|
5.0
|
dog
|
no
|
2
|
4
|
f
|
1.5
|
cat
|
no
|
3
|
5
|
g
|
4.5
|
snake
|
no
|
1
|
6
|
h
|
NaN
|
cat
|
yes
|
1
|
7
|
i
|
7.0
|
dog
|
no
|
2
|
8
|
j
|
3.0
|
dog
|
no
|
1
|
9
|
进阶部分
dti = pd.date_range(start='2018-01-01', end='2018-12-31', freq='D')
s = pd.Series(np.random.rand(len(dti)), index=dti)
s
2018-01-01 0.201676
2018-01-02 0.502547
2018-01-03 0.170739
2018-01-04 0.725599
2018-01-05 0.550199
2018-01-06 0.032421
2018-01-07 0.699385
2018-01-08 0.486797
2018-01-09 0.829427
2018-01-10 0.180882
2018-01-11 0.038766
2018-01-12 0.277279
2018-01-13 0.549932
2018-01-14 0.053058
2018-01-15 0.031786
2018-01-16 0.418983
2018-01-17 0.118010
2018-01-18 0.396846
2018-01-19 0.337507
2018-01-20 0.357052
2018-01-21 0.715088
2018-01-22 0.147573
2018-01-23 0.444479
2018-01-24 0.354306
2018-01-25 0.054420
2018-01-26 0.024047
2018-01-27 0.432698
2018-01-28 0.532165
2018-01-29 0.262547
2018-01-30 0.727870
...
2018-12-02 0.855610
2018-12-03 0.682516
2018-12-04 0.991494
2018-12-05 0.935902
2018-12-06 0.893206
2018-12-07 0.884918
2018-12-08 0.285479
2018-12-09 0.317185
2018-12-10 0.449078
2018-12-11 0.266593
2018-12-12 0.432669
2018-12-13 0.065663
2018-12-14 0.524504
2018-12-15 0.164935
2018-12-16 0.347022
2018-12-17 0.127325
2018-12-18 0.406984
2018-12-19 0.698307
2018-12-20 0.135198
2018-12-21 0.153526
2018-12-22 0.312617
2018-12-23 0.965893
2018-12-24 0.957769
2018-12-25 0.449219
2018-12-26 0.037703
2018-12-27 0.640956
2018-12-28 0.434779
2018-12-29 0.750819
2018-12-30 0.471872
2018-12-31 0.566274
Freq: D, Length: 365, dtype: float64
# 周一从 0 开始
s[s.index.weekday == 2].sum()#求所有周三的和
26.097402479556862
s.resample('M').mean()# 每个月值的平均值
2018-01-31 0.351720
2018-02-28 0.555833
2018-03-31 0.510938
2018-04-30 0.524954
2018-05-31 0.477990
2018-06-30 0.584583
2018-07-31 0.531724
2018-08-31 0.494037
2018-09-30 0.579968
2018-10-31 0.532052
2018-11-30 0.471283
2018-12-31 0.518058
Freq: M, dtype: float64