目录

learn_one

vllbc 收录于 Pandas

2021-07-22 约 2318 字预计阅读 5 分钟次阅读

目录

导入库

import pandas as pd
import numpy as np

一维数组

arr = [0, 1, 2, 3, 4]
s1 = pd.Series(arr)  # 如果不指定索引，则默认从 0 开始
s1

0    0
1    1
2    2
3    3
4    4
dtype: int64

n = np.random.randn(5)  # 创建一个随机 Ndarray 数组

index = ['a', 'b', 'c', 'd', 'e']
s2 = pd.Series(n, index=index)
s2

a    0.647546
b    0.197186
c    0.590904
d   -0.422565
e   -0.122490
dtype: float64

用字典创建一维数组

d = {'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5}  # 定义示例字典
s3 = pd.Series(d)
s3

a    1
b    2
c    3
d    4
e    5
dtype: int64

Series 基本操作

print(s1)  # 以 s1 为例

s1.index = ['A', 'B', 'C', 'D', 'E']  # 修改后的索引
s1

0    0
1    1
2    2
3    3
4    4
dtype: int64





A    0
B    1
C    2
D    3
E    4
dtype: int64

s4 = s3.append(s1)  # 将 s1 拼接到 s3
s4

a    1
b    2
c    3
d    4
e    5
A    0
B    1
C    2
D    3
E    4
dtype: int64

print(s4)
s4 = s4.drop('e')  # 删除索引为 e 的值
s4

a    1
b    2
c    3
d    4
e    5
A    0
B    1
C    2
D    3
E    4
dtype: int64





a    1
b    2
c    3
d    4
A    0
B    1
C    2
D    3
E    4
dtype: int64

s4['A'] = 6  # 修改索引为 A 的值 = 6
s4

a    1
b    2
c    3
d    4
A    6
B    1
C    2
D    3
E    4
dtype: int64

s4.add(s3)#如果索引不同则为NAN sub为减，mul为乘，div为除

A    NaN
B    NaN
C    NaN
D    NaN
E    NaN
a    2.0
b    4.0
c    6.0
d    8.0
e    NaN
dtype: float64

print(s4.median(),#中位数
s4.sum(),#求和
s4.max(),#求最大值
s4.min())#求最小值

3.0 26 6 1

DataFrame

dates = pd.date_range('today', periods=6)  # 定义时间序列作为 index
num_arr = np.random.randn(6, 4)  # 传入 numpy 随机数组
columns = ['A', 'B', 'C', 'D']  # 将列表作为列名
df1 = pd.DataFrame(num_arr, index=dates, columns=columns)
df1

	A	B	C	D
2020-11-13 20:54:24.760775	-0.403545	0.643881	-1.697286	-0.616257
2020-11-14 20:54:24.760775	0.102470	1.451324	-0.269714	-0.316194
2020-11-15 20:54:24.760775	0.342680	-0.137238	-0.785406	-0.441022
2020-11-16 20:54:24.760775	0.130079	1.929569	-0.756832	-2.490272
2020-11-17 20:54:24.760775	1.774664	1.037605	0.275989	-0.982924
2020-11-18 20:54:24.760775	0.952462	1.666130	-0.920394	-1.358411

data = {'animal': ['cat', 'cat', 'snake', 'dog', 'dog', 'cat', 'snake', 'cat', 'dog', 'dog'],
        'age': [2.5, 3, 0.5, np.nan, 5, 2, 4.5, np.nan, 7, 3],
        'visits': [1, 3, 2, 3, 2, 3, 1, 1, 2, 1],
        'priority': ['yes', 'yes', 'no', 'yes', 'no', 'no', 'no', 'yes', 'no', 'no']}

labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']
df2 = pd.DataFrame(data, index=labels)
df2#df2.head()查看前几行 df2.tail()查看后几行

	age	animal	priority	visits
a	2.5	cat	yes	1
b	3.0	cat	yes	3
c	0.5	snake	no	2
d	NaN	dog	yes	3
e	5.0	dog	no	2
f	2.0	cat	no	3
g	4.5	snake	no	1
h	NaN	cat	yes	1
i	7.0	dog	no	2
j	3.0	dog	no	1

df2.columns,df2.index,df2.values #查看索引.列名和值

(Index(['age', 'animal', 'priority', 'visits'], dtype='object'),
 Index(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'], dtype='object'),
 array([[2.5, 'cat', 'yes', 1],
        [3.0, 'cat', 'yes', 3],
        [0.5, 'snake', 'no', 2],
        [nan, 'dog', 'yes', 3],
        [5.0, 'dog', 'no', 2],
        [2.0, 'cat', 'no', 3],
        [4.5, 'snake', 'no', 1],
        [nan, 'cat', 'yes', 1],
        [7.0, 'dog', 'no', 2],
        [3.0, 'dog', 'no', 1]], dtype=object))

df2.describe()#数据处理常用

	age	visits
count	8.000000	10.000000
mean	3.437500	1.900000
std	2.007797	0.875595
min	0.500000	1.000000
25%	2.375000	1.000000
50%	3.000000	2.000000
75%	4.625000	2.750000
max	7.000000	3.000000

df2.T #逆

	a	b	c	d	e	f	g	h	i	j
age	2.5	3	0.5	NaN	5	2	4.5	NaN	7	3
animal	cat	cat	snake	dog	dog	cat	snake	cat	dog	dog
priority	yes	yes	no	yes	no	no	no	yes	no	no
visits	1	3	2	3	2	3	1	1	2	1

df2.sort_values(by='age')

	age	animal	priority	visits
c	0.5	snake	no	2
f	2.0	cat	no	3
a	2.5	cat	yes	1
b	3.0	cat	yes	3
j	3.0	dog	no	1
g	4.5	snake	no	1
e	5.0	dog	no	2
i	7.0	dog	no	2
d	NaN	dog	yes	3
h	NaN	cat	yes	1

df2['age']#显示age列 == df2.age

a    2.5
b    3.0
c    0.5
d    NaN
e    5.0
f    2.0
g    4.5
h    NaN
i    7.0
j    3.0
Name: age, dtype: float64

df2[['age','animal']] #多列查询

	age	animal
a	2.5	cat
b	3.0	cat
c	0.5	snake
d	NaN	dog
e	5.0	dog
f	2.0	cat
g	4.5	snake
h	NaN	cat
i	7.0	dog
j	3.0	dog

df2.iloc[1:3]#查询2，3行 ==df2[1:3]

	age	animal	priority	visits
b	3.0	cat	yes	3
c	0.5	snake	no	2

df3 = df2.copy()#拷贝一份
df3

	age	animal	priority	visits
a	2.5	cat	yes	1
b	3.0	cat	yes	3
c	0.5	snake	no	2
d	NaN	dog	yes	3
e	5.0	dog	no	2
f	2.0	cat	no	3
g	4.5	snake	no	1
h	NaN	cat	yes	1
i	7.0	dog	no	2
j	3.0	dog	no	1

df3.isnull()#检测是否为空，为空返回True

	age	animal	priority	visits
a	False	False	False	False
b	False	False	False	False
c	False	False	False	False
d	True	False	False	False
e	False	False	False	False
f	False	False	False	False
g	False	False	False	False
h	True	False	False	False
i	False	False	False	False
j	False	False	False	False

num = pd.Series([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], index=df3.index)#建立数据，注意索引和df3一致

df3['No.'] = num  # 添加以 'No.' 为列名的新数据列
df3

	age	animal	priority	visits	No.
a	2.5	cat	yes	1	0
b	3.0	cat	yes	3	1
c	0.5	snake	no	2	2
d	NaN	dog	yes	3	3
e	5.0	dog	no	2	4
f	2.0	cat	no	3	5
g	4.5	snake	no	1	6
h	NaN	cat	yes	1	7
i	7.0	dog	no	2	8
j	3.0	dog	no	1	9

df3.iat[1,1] = 2 #根据索引改变值，这里相当于二维数组了
df3

	age	animal	priority	visits	No.
a	2.5	cat	yes	1	0
b	3.0	2	yes	3	1
c	0.5	snake	no	2	2
d	NaN	dog	yes	3	3
e	5.0	dog	no	2	4
f	1.5	cat	no	3	5
g	4.5	snake	no	1	6
h	NaN	cat	yes	1	7
i	7.0	dog	no	2	8
j	3.0	dog	no	1	9

df3.loc['f', 'age'] = 1.5 #根据标签来修改
df3

	age	animal	priority	visits	No.
a	2.5	cat	yes	1	0
b	3.0	2	yes	3	1
c	0.5	snake	no	2	2
d	NaN	dog	yes	3	3
e	5.0	dog	no	2	4
f	1.5	cat	no	3	5
g	4.5	snake	no	1	6
h	NaN	cat	yes	1	7
i	7.0	dog	no	2	8
j	3.0	dog	no	1	9

df3.mean()#求平均值

age       3.375
visits    1.900
No.       4.500
dtype: float64

df3['visits'].sum()#对visits列进行求和

df4 = df3.copy()
print(df4)
df4.fillna(value=3)#如果是NAN补充为3

   age animal priority  visits  No.
a  2.5    cat      yes       1    0
b  3.0      2      yes       3    1
c  0.5  snake       no       2    2
d  NaN    dog      yes       3    3
e  5.0    dog       no       2    4
f  1.5    cat       no       3    5
g  4.5  snake       no       1    6
h  NaN    cat      yes       1    7
i  7.0    dog       no       2    8
j  3.0    dog       no       1    9

	age	animal	priority	visits	No.
a	2.5	cat	yes	1	0
b	3.0	2	yes	3	1
c	0.5	snake	no	2	2
d	3.0	dog	yes	3	3
e	5.0	dog	no	2	4
f	1.5	cat	no	3	5
g	4.5	snake	no	1	6
h	3.0	cat	yes	1	7
i	7.0	dog	no	2	8
j	3.0	dog	no	1	9

df5 = df3.copy()
print(df5)
df5.dropna(how='any')  # 任何存在 NaN 的行都将被删除

   age animal priority  visits  No.
a  2.5    cat      yes       1    0
b  3.0      2      yes       3    1
c  0.5  snake       no       2    2
d  NaN    dog      yes       3    3
e  5.0    dog       no       2    4
f  1.5    cat       no       3    5
g  4.5  snake       no       1    6
h  NaN    cat      yes       1    7
i  7.0    dog       no       2    8
j  3.0    dog       no       1    9

	age	animal	priority	visits	No.
a	2.5	cat	yes	1	0
b	3.0	2	yes	3	1
c	0.5	snake	no	2	2
e	5.0	dog	no	2	4
f	1.5	cat	no	3	5
g	4.5	snake	no	1	6
i	7.0	dog	no	2	8
j	3.0	dog	no	1	9

left = pd.DataFrame({'key': ['foo1', 'foo2'], 'one': [1, 2]})
right = pd.DataFrame({'key': ['foo2', 'foo3'], 'two': [4, 5]})

print(left)
print(right)

# 按照 key 列对齐连接，只存在 foo2 相同，所以最后变成一行
pd.merge(left, right, on='key')

    key  one
0  foo1    1
1  foo2    2
    key  two
0  foo2    4
1  foo3    5

	key	one	two
0	foo2	2	4

df3.to_csv('animal.csv')
print("写入成功.")

写入成功.

df_animal = pd.read_csv('animal.csv')
df_animal

	Unnamed: 0	age	animal	priority	visits	No.
0	a	2.5	cat	yes	1	0
1	b	3.0	2	yes	3	1
2	c	0.5	snake	no	2	2
3	d	NaN	dog	yes	3	3
4	e	5.0	dog	no	2	4
5	f	1.5	cat	no	3	5
6	g	4.5	snake	no	1	6
7	h	NaN	cat	yes	1	7
8	i	7.0	dog	no	2	8
9	j	3.0	dog	no	1	9

df3.to_excel('animal.xlsx', sheet_name='Sheet1')
print("写入成功.")

写入成功.

pd.read_excel('animal.xlsx', 'Sheet1', index_col=None, na_values=['NA'])

	age	animal	priority	visits	No.
a	2.5	cat	yes	1	0
b	3.0	2	yes	3	1
c	0.5	snake	no	2	2
d	NaN	dog	yes	3	3
e	5.0	dog	no	2	4
f	1.5	cat	no	3	5
g	4.5	snake	no	1	6
h	NaN	cat	yes	1	7
i	7.0	dog	no	2	8
j	3.0	dog	no	1	9

进阶部分

dti = pd.date_range(start='2018-01-01', end='2018-12-31', freq='D')
s = pd.Series(np.random.rand(len(dti)), index=dti)
s

2018-01-01    0.201676
2018-01-02    0.502547
2018-01-03    0.170739
2018-01-04    0.725599
2018-01-05    0.550199
2018-01-06    0.032421
2018-01-07    0.699385
2018-01-08    0.486797
2018-01-09    0.829427
2018-01-10    0.180882
2018-01-11    0.038766
2018-01-12    0.277279
2018-01-13    0.549932
2018-01-14    0.053058
2018-01-15    0.031786
2018-01-16    0.418983
2018-01-17    0.118010
2018-01-18    0.396846
2018-01-19    0.337507
2018-01-20    0.357052
2018-01-21    0.715088
2018-01-22    0.147573
2018-01-23    0.444479
2018-01-24    0.354306
2018-01-25    0.054420
2018-01-26    0.024047
2018-01-27    0.432698
2018-01-28    0.532165
2018-01-29    0.262547
2018-01-30    0.727870
                ...   
2018-12-02    0.855610
2018-12-03    0.682516
2018-12-04    0.991494
2018-12-05    0.935902
2018-12-06    0.893206
2018-12-07    0.884918
2018-12-08    0.285479
2018-12-09    0.317185
2018-12-10    0.449078
2018-12-11    0.266593
2018-12-12    0.432669
2018-12-13    0.065663
2018-12-14    0.524504
2018-12-15    0.164935
2018-12-16    0.347022
2018-12-17    0.127325
2018-12-18    0.406984
2018-12-19    0.698307
2018-12-20    0.135198
2018-12-21    0.153526
2018-12-22    0.312617
2018-12-23    0.965893
2018-12-24    0.957769
2018-12-25    0.449219
2018-12-26    0.037703
2018-12-27    0.640956
2018-12-28    0.434779
2018-12-29    0.750819
2018-12-30    0.471872
2018-12-31    0.566274
Freq: D, Length: 365, dtype: float64

# 周一从 0 开始
s[s.index.weekday == 2].sum()#求所有周三的和

26.097402479556862

s.resample('M').mean()# 每个月值的平均值

2018-01-31    0.351720
2018-02-28    0.555833
2018-03-31    0.510938
2018-04-30    0.524954
2018-05-31    0.477990
2018-06-30    0.584583
2018-07-31    0.531724
2018-08-31    0.494037
2018-09-30    0.579968
2018-10-31    0.532052
2018-11-30    0.471283
2018-12-31    0.518058
Freq: M, dtype: float64