
对象
Series对象(一维)
Numpy与Series
index行索引 创建Series对象 Dataframe对象(二维)
创建Dataframe对象
columns Index对象
交 并 异或 数据取值与选择
Series数据选择方式
keys、items 索引器
loc、iloc Dataframe数据选择方式 数值运算
Series
保留索引索引对齐 DataframeDataframe与Series运算 处理缺失值
Pandas的缺失值
python对象类型的缺失值数值类型的缺失值NaN:不是一个数字Pandas中NaN与None的差异 处理缺失值
发现缺失值:isnull()和notnull()剔除缺失值:dropna()填充缺失值:fillna() 层级索引MultiIndex
多级索引Series
笨方法好方法:Pandas多级索引高维数据的多级索引 多级索引的创建方法
显示地创建多级索引多级索引的等级名称多级列索引 多级索引的取片与切片
Series多级索引Dataframe多级索引 多级索引行列转换
有序的索引和无序的索引索引stack与unstack索引的设置与重置 多级索引的数据累计方法 合并数据集:Concat与Append操作
Numpy数组合并通过pd.concat实现简易合并
索引重复
对象 Series对象(一维)Series对象是带有索引数据构成的数组,可以作为通用型的Numpy数组,也可以看作特殊的Python字典
import pandas as pd data = pd.Series([0.25,0.5,0.75,1.0]) print(data) '''数据和索引绑定在一起 0 0.25 1 0.50 2 0.75 3 1.00 dtype: float64 ''' print(data.values) # [0.25 0.5 0.75 1. ] print(data.index) # RangeIndex(start=0, stop=4, step=1) print(data[1]) # 0.5 print(data[1:3]) ''' 1 0.50 2 0.75 dtype: float64 '''Numpy与Series index行索引
Numpy是隐式定义索引获取数值,Series是显式定义索引获取数值
import pandas as pd data = pd.Series([0.25,0.5,.75,1.0], index=['a','b','c','d']) # 第二个参数可省 print(data) ''' a 0.25 b 0.50 c 0.75 d 1.00 dtype: float64 ''' print(data['b']) # 0.5 data = pd.Series([0.25,0.5,.75,1.0], index=[2,5,3,7]) print(data) ''' 2 0.25 5 0.50 3 0.75 7 1.00 dtype: float64 ''' print(data[5]) # 0.5创建Series对象
pd.Series(data, index=index)第二个参数可省
data可以是数组
data = pd.Series([0.25,0.5,0.75,1.0])
data可以是标量
data = pd.Series(5,index=[100,200,300]); print(data) ''' 100 5 200 5 300 5 dtype: int64 '''
data可以是字典,与字典不同的是它还支持数组型的操作,比如切片
# Series是特殊的字典
popu = {'aa':123,
'bb':456,
'cc':789}
po = pd.Series(popu)
print(po)
'''
aa 123
bb 456
cc 789
dtype: int64
'''
print(po['bb']) # 456
# 与字典不同的是它还支持数组型的操作,比如切片
print(po['aa':'bb'])
'''
aa 123
bb 456
dtype: int64
'''
data = pd.Series({2:'a',1:'b',3:'c'}, index=[3,2])
print(data)
'''
3 c
2 a
dtype: object
'''
Dataframe对象(二维)
DataFrame可以作为通用型的Numpy数组,也可以看作特殊的Python字典。
按字典方式取值时索引顺序为[列][行],不能调换顺序,否则出错
import numpy as np
import pandas as pd
# Build two Series that share the same student-name index.
ages = {'Li': 15,
        'Liu': 26,
        'Wan': 19}
age = pd.Series(ages)
scores = {'Li': 80,
          'Liu': 88,
          'Wan': 92}
score = pd.Series(scores)
# The class is spelled pd.DataFrame; pd.Dataframe raises AttributeError.
# Each dict key becomes a column and the Series indexes become the rows.
stu = pd.DataFrame({'age': age, 'score': score})
print(stu)
'''
age score
Li 15 80
Liu 26 88
Wan 19 92
'''
# index获取索引标签
print(stu.index) # Index(['Li', 'Liu', 'Wan'], dtype='object')
# 存放标签的index对象
print(stu.columns) # Index(['age', 'score'], dtype='object')
print(stu['age'])
'''
Li 15
Liu 26
Wan 19
Name: age, dtype: int64
'''
# 索引age 和 li不能更换
print(stu['age']['Li']) # 15
创建Dataframe对象
columns
创建单列
ages = {'Li': 15,
        'Liu': 26,
        'Wan': 19}
age = pd.Series(ages)
# A single Series becomes a one-column frame; columns= supplies the label.
# (pd.DataFrame is the correct spelling; pd.Dataframe does not exist.)
print(pd.DataFrame(age, columns=['age']))
'''
age
Li 15
Liu 26
Wan 19
'''
通过字典创建
# A list of dicts: each dict is one row, its keys are the column labels.
data = [{'a': i, 'b': 2 * i} for i in range(3)]
print(pd.DataFrame(data))  # pd.DataFrame, not pd.Dataframe
'''
a b
0 0 0
1 1 2
2 2 4
'''
缺失用NaN表示
# Keys missing from a row are filled with NaN (pd.DataFrame, not pd.Dataframe).
print(pd.DataFrame([{'a': 1, 'b': 2}, {'c': 3, 'd': 4}]))
'''
a b c d
0 1.0 2.0 NaN NaN
1 NaN NaN 3.0 4.0
'''
通过Series字典创建
ages = {'Li': 15,
        'Liu': 26,
        'Wan': 19}
age = pd.Series(ages)
scores = {'Li': 80,
          'Liu': 88,
          'Wan': 92}
score = pd.Series(scores)
# A dict of Series: keys become columns, the shared index becomes the rows.
# (pd.DataFrame is the correct spelling; pd.Dataframe raises AttributeError.)
print(pd.DataFrame({'age': age, 'score': score}))
'''
age score
Li 15 80
Liu 26 88
Wan 19 92
'''
通过Numpy二维数组创建
# A 2-D array supplies the values; row and column labels are given explicitly.
print(pd.DataFrame(np.random.rand(3, 2), columns=['foo', 'bar'], index=['a', 'b', 'c']))
'''
foo bar
a 0.321252 0.393929
b 0.006765 0.450808
c 0.783284 0.667144
'''
通过Numpy结构化数组创建
# Structured array with an int64 field 'a' and a float64 field 'b'.
a = np.zeros(3, dtype=[('a', 'i8'), ('b', 'f8')])
print(a) # [(0, 0.) (0, 0.) (0, 0.)]
# The field names become the column labels (pd.DataFrame, not pd.Dataframe).
print(pd.DataFrame(a))
'''
a b
0 0 0.0
1 0 0.0
2 0 0.0
'''
Index对象
该对象是一个不可变数组或有序集合
ind = pd.Index([2,3,5,7,11]) print(ind) # Int64Index([2, 3, 5, 7, 11], dtype='int64') print(ind[1]) # 3 print(ind[1:3]) # Int64Index([3, 5], dtype='int64') print(ind.size,ind.shape,ind.dtype) # 5 (5,) int64交 并 异或
a = pd.Index([1,3,5,7,9]) b = pd.Index([2,3,5,6]) print(a&b) # Int64Index([3, 5], dtype='int64') print(a|b) # Int64Index([1, 2, 3, 5, 6, 7, 9], dtype='int64') print(a^b) # Int64Index([1, 2, 6, 7, 9], dtype='int64')数据取值与选择 Series数据选择方式 keys、items
将Series看做字典,可用字典表达式和方法来检测/索引和值
data = pd.Series([0.25,0.5,.75,1.0], index=['a','b','c','d'])
print('a' in data) # True
print(data.keys()) # Index(['a', 'b', 'c', 'd'], dtype='object')
print(list(data.items())) # [('a', 0.25), ('b', 0.5), ('c', 0.75), ('d', 1.0)]
data['e']=1.25
print(data)
'''
a 0.25
b 0.50
c 0.75
d 1.00
e 1.25
dtype: float64
'''
Series看做一维数组
print(data['a':'c']) ''' a 0.25 b 0.50 c 0.75 dtype: float64 ''' print(data[0:2]) ''' a 0.25 b 0.50 dtype: float64 ''' print(data[(data>0.3)&(data<0.8)]) ''' b 0.50 c 0.75 dtype: float64 ''' print(data[['a','e']]) ''' a 0.25 e 1.25 dtype: float64 '''索引器 loc、iloc
data = pd.Series(['a','b','c'],index=[1,3,5]) print(data) ''' 1 a 3 b 5 c dtype: object ''' print(data[1])# a print(data[1:3]) ''' 3 b 5 c dtype: object ''' print(data.loc[1]) # a print(data.loc[1:3]) ''' 1 a 3 b dtype: object ''' print(data.iloc[1]) # b print(data.iloc[1:3]) ''' 3 b 5 c dtype: object '''Dataframe数据选择方式
# Treat the DataFrame as a dictionary of columns.
ages = {'Li': 15,
        'Liu': 26,
        'Wan': 19}
age = pd.Series(ages)
scores = {'Li': 80,
          'Liu': 88,
          'Wan': 92}
score = pd.Series(scores)
# pd.DataFrame is the correct class name; pd.Dataframe raises AttributeError.
stu = pd.DataFrame({'age': age, 'score': score})
print(stu)
'''
age score
Li 15 80
Liu 26 88
Wan 19 92
'''
print(stu['age'])
'''
Li 15
Liu 26
Wan 19
Name: age, dtype: int64
'''
print(stu.age)
'''
Li 15
Liu 26
Wan 19
Name: age, dtype: int64
'''
# 如果列名不是纯字符串,或者列名与Dataframe的方法名相同,就不能用属性索引
print(stu.age is stu['age']) # True
# 避免对属性的形式选择的列直接赋值,可以stu['age']=z,不要用stu.age=z
stu['a']=stu['age']/stu['score']
print(stu)
'''
age score a
Li 15 80 0.187500
Liu 26 88 0.295455
Wan 19 92 0.206522
'''
# 将Dataframe看作二维数组
print(stu.values)
'''
[[15. 80. 0.1875 ]
[26. 88. 0.29545455]
[19. 92. 0.20652174]]
'''
print(stu.T)
'''
Li Liu Wan
age 15.0000 26.000000 19.000000
score 80.0000 88.000000 92.000000
a 0.1875 0.295455 0.206522
'''
# Rebuild the two-column frame so the indexing examples start from clean data.
ages = {'Li': 15,
        'Liu': 26,
        'Wan': 19}
age = pd.Series(ages)
scores = {'Li': 80,
          'Liu': 88,
          'Wan': 92}
score = pd.Series(scores)
# pd.DataFrame is the correct class name; pd.Dataframe raises AttributeError.
stu = pd.DataFrame({'age': age, 'score': score})
print(stu)
'''
age score
Li 15 80
Liu 26 88
Wan 19 92
'''
print(stu.values[0]) # [15 80]
print(stu['age'])
'''
Li 15
Liu 26
Wan 19
Name: age, dtype: int64
'''
print(stu.iloc[:2,:1])
'''
age
Li 15
Liu 26
'''
print(stu.loc[:'Liu',:'age'])
'''
age
Li 15
Liu 26
'''
print(stu)
'''
age score
Li 15 80
Liu 26 88
Wan 19 92
'''
stu.iloc[0,1] = 100
print(stu)
'''
age score
Li 15 100
Liu 26 88
Wan 19 92
'''
# 其他取值方式
print(stu['Li':'Wan'])
'''
age score
Li 15 100
Liu 26 88
Wan 19 92
'''
print(stu[1:3])
'''
age score
Liu 26 88
Wan 19 92
'''
print(stu[stu.age>20])
'''
age score
Liu 26 88
'''
数值运算
Series
保留索引
两对象都可以使用Numpy通用函数
rng = np.random.RandomState(42)  # fixed seed so the random draws are reproducible
ser = pd.Series(rng.randint(0, 10, 4))  # four random integers drawn from [0, 10)
print(ser)
'''
0 6
1 3
2 7
3 4
dtype: int32
'''
# pd.DataFrame is the correct class name; pd.Dataframe raises AttributeError.
df = pd.DataFrame(rng.randint(0, 10, (3, 4)), columns=['a', 'b', 'c', 'd'])
print(df)
'''
a b c d
0 6 9 2 6
1 7 4 3 7
2 7 2 5 4
'''
print(np.exp(ser)) # 指数函数,e的ser次方
'''
0 403.428793
1 20.085537
2 1096.633158
3 54.598150
dtype: float64
'''
print(df*np.pi/4)
'''
a b c d
0 4.712389 7.068583 1.570796 4.712389
1 5.497787 3.141593 2.356194 5.497787
2 5.497787 1.570796 3.926991 3.141593
'''
索引对齐
# Series
age = pd.Series({'Li': 15, 'Liu': 26, 'Wan': 19}, name='age')
score = pd.Series({'Chen': 88, 'Li': 80, 'Wan': 92}, name='score')
# Binary operations align on the index; labels present on one side only give NaN.
print(age/score)
'''索引为两数组的并集
Chen NaN
Li 0.187500
Liu NaN
Wan 0.206522
dtype: float64
'''
# Use Index.union() for the set union: the `|` operator on Index objects is
# no longer a set operation in current pandas.
print(age.index.union(score.index)) # Index(['Chen', 'Li', 'Liu', 'Wan'], dtype='object')
NaN表示此处没有数
a = pd.Series([2,4,6],index=[0,1,2]) b = pd.Series([1,3,5],index=[1,2,3]) print(a+b) ''' 0 NaN 1 5.0 2 9.0 3 NaN dtype: float64 '''
如果NaN不是想要的结果,可以用对象方法代替运算符:a.add(b)等价于a+b,还可以通过fill_value参数自定义a或b中缺失数据的填充值
print(a.add(b,fill_value=0)) # 没有共同的加fill_value,即加0 ''' 0 2.0 1 5.0 2 9.0 3 5.0 dtype: float64 '''Dataframe
rng = np.random.RandomState(42)  # fixed seed for reproducible output
# pd.DataFrame is the correct class name; pd.Dataframe raises AttributeError.
A = pd.DataFrame(rng.randint(0, 20, (2, 2)), columns=list('AB'))
print(A)
'''
A B
0 6 19
1 14 10
'''
# B deliberately has a different column order and an extra row and column,
# so that arithmetic between A and B has to align on both axes.
B = pd.DataFrame(rng.randint(0, 10, (3, 3)), columns=list('BAC'))
print(B)
'''
B A C
0 7 4 6
1 9 2 6
2 7 4 3
'''
print(A+B)
'''
A B C
0 10.0 26.0 NaN
1 16.0 19.0 NaN
2 NaN NaN NaN
'''
fill = A.stack().mean() # 计算A的均值
print(fill) # 12.25
print(A.add(B,fill_value=fill))# 没有共同的加fill,即加12.25
'''
A B C
0 10.00 26.00 18.25
1 16.00 19.00 18.25
2 16.25 19.25 15.25
'''
Dataframe与Series的运算规则与Numpy中一维、二维数组运算一样的
rng = np.random.RandomState(42)  # fixed seed for reproducible output
A = rng.randint(0, 10, (3, 4))
print(A)
'''
[[6 3 7 4]
[6 9 2 6]
[7 4 3 7]]
'''
# NumPy broadcasting: the first row is subtracted from every row.
print(A-A[0])
'''
[[ 0 0 0 0]
[ 0 6 -5 2]
[ 1 1 -4 3]]
'''
# pd.DataFrame is the correct class name; pd.Dataframe raises AttributeError.
df = pd.DataFrame(A, columns=list('QRST'))
print(df)
'''
Q R S T
0 6 3 7 4
1 6 9 2 6
2 7 4 3 7
'''
print(df-df.iloc[0])
'''
Q R S T
0 0 0 0 0
1 0 6 -5 2
2 1 1 -4 3
'''
print(df.subtract(df['R'],axis=0))
'''
Q R S T
0 3 0 4 1
1 -3 0 -7 -3
2 3 0 -1 3
'''
halfrow = df.iloc[0,::2]
print(halfrow)
'''
Q 6
S 7
Name: 0, dtype: int32
'''
print(df-halfrow)
'''
Q R S T
0 0.0 NaN 0.0 NaN
1 0.0 NaN -5.0 NaN
2 1.0 NaN -4.0 NaN
'''
处理缺失值
选择缺失值处理方法:覆盖全局掩码表示缺失值(额外存储和计算负担),标签值表示缺失值(额外CPU或GPU计算逻辑)
Pandas的缺失值 python对象类型的缺失值:None不能作为任意类型Numpy/Pandas数组的缺失值,只能用于'object'数组类型。dtype=object表示Numpy认为这个数组是由Python对象构成的;对于常用的快速操作,这种类型的操作会耗费更多资源
vals1 = np.array([1, None, 3, 4]) print(vals1) # [1 None 3 4] #print(vals1.sum()) # 出错,Python中没有定义整数与None之间的运算数值类型的缺失值NaN:不是一个数字
Numpy会为这个数组选择一个原生浮点类型,这会将数组编译成C代码从而实现快速操作
vals2 = np.array([1, np.nan, 3, 4]) print(vals2.dtype) # float64 print(vals2) # [ 1. nan 3. 4.]
任何数字与NaN进行任何操作都会变成NaN
print(1+np.nan) # nan print(vals2.sum(),vals2.min(),vals2.max()) # nan nan nan
Numpy提供了一些特殊的累计函数,忽略NaN缺失值影响
print(np.nansum(vals2), np.nanmin(vals2), np.nanmax(vals2)) # 8.0 1.0 4.0Pandas中NaN与None的差异
在适当的时候,Pandas会将NaN与None等价交换
print(pd.Series([1,np.nan,3,None])) ''' 0 1.0 1 NaN 2 3.0 3 NaN dtype: float64 '''
对于没有可用标签值的数据类型,当出现缺失值时Pandas会自动进行类型转换(例如将整型数组转换为浮点型,并用NaN表示缺失值)
x = pd.Series([1,2],dtype=int) print(x) ''' 0 1 1 2 dtype: int32 ''' x[0]=None print(x) ''' 0 NaN 1 2.0 dtype: float64 '''
Pandas对不同类型缺失值的转换规则
两种方法发现缺失值:isnull()和notnull(),返回布尔类型的掩码数据,该方法两对象都适用
data = pd.Series([1,np.nan,'hello',None]) print(data.isnull()) ''' 0 False 1 True 2 False 3 True dtype: bool ''' print(data.notnull()) ''' 0 True 1 False 2 True 3 False dtype: bool '''剔除缺失值:dropna()
Series使用该方法简单
data = pd.Series([1,np.nan,'hello',None]) print(data.dropna()) ''' 0 1 2 hello dtype: object '''
DataFrame需要一些参数,dropna()剔除整行或整列,默认为剔除整行,axis='columns'或axis=1剔除整列
# 3x3 frame with two missing entries, used by the dropna() examples.
# (pd.DataFrame is the correct spelling; pd.Dataframe raises AttributeError.)
df = pd.DataFrame([[1, np.nan, 2],
                   [2, 3, 5],
                   [np.nan, 4, 6]])
print(df)
'''
0 1 2
0 1.0 NaN 2
1 2.0 3.0 5
2 NaN 4.0 6
'''
# 剔除整行或整列,默认为剔除整行
print(df.dropna())
'''
0 1 2
1 2.0 3.0 5
'''
print(df.dropna(axis=1)) # axis=1 (equivalently axis='columns') drops whole columns
'''
2
0 2
1 5
2 6
'''
how和thresh参数可以设置剔除行或列缺失值的数量阈值
默认设置是how='any',即只要有缺失值就剔除整行或整列(行或列由axis参数决定)
how='all',剔除全部是缺失值的行或列
df[3]=np.nan
print(df)
'''
0 1 2 3
0 1.0 NaN 2 NaN
1 2.0 3.0 5 NaN
2 NaN 4.0 6 NaN
'''
print(df.dropna(axis='columns',how='all'))
'''
0 1 2
0 1.0 NaN 2
1 2.0 3.0 5
2 NaN 4.0 6
'''
# thresh参数设置行或列中非缺失值的最小数量
print(df.dropna(axis='rows',thresh=3))
'''
0 1 2 3
1 2.0 3.0 5 NaN
'''
填充缺失值:fillna()
method参数表示如何填充
method='ffill'用前面的填充后面的
method='bfill'用后面的填充前面的
data = pd.Series([1, np.nan, 2, None, 3], index=list('abcde'))
print(data)
'''
a 1.0
b NaN
c 2.0
d NaN
e 3.0
dtype: float64
'''
# Replace every missing entry with a constant.
print(data.fillna(0))
'''
a 1.0
b 0.0
c 2.0
d 0.0
e 3.0
dtype: float64
'''
# Forward fill: propagate the previous valid value; a leading gap stays NaN.
# ffill() replaces the deprecated fillna(method='ffill') spelling.
print(data.ffill())
'''
a 1.0
b 1.0
c 2.0
d 2.0
e 3.0
dtype: float64
'''
# Backward fill: propagate the next valid value; a trailing gap stays NaN.
# bfill() replaces the deprecated fillna(method='bfill') spelling.
print(data.bfill())
'''
a 1.0
b 2.0
c 2.0
d 3.0
e 3.0
dtype: float64
'''
# pd.DataFrame is the correct class name; pd.Dataframe raises AttributeError.
df = pd.DataFrame([[1, np.nan, 2, np.nan],
                   [2, 3, 5, np.nan],
                   [np.nan, 4, 6, np.nan]])
# Forward fill along each row (axis=1); ffill() replaces the deprecated
# fillna(method='ffill'). A gap with nothing to its left stays NaN.
print(df.ffill(axis=1))
'''
0 1 2 3
0 1.0 1.0 2.0 2.0
1 2.0 3.0 5.0 5.0
2 NaN 4.0 6.0 6.0
'''
层级索引MultiIndex
多级索引Series
用一维Series对象表示二维数据
笨方法index = [('California',2000),('California',2010),
('New York',2000),('New York', 2010),
('Texas',2000),('Texas',2010)]
populations = [11111,22222,33333,44444,55555,66666]
pop = pd.Series(populations, index=index)
print(pop)
'''
(California, 2000) 11111
(California, 2010) 22222
(New York, 2000) 33333
(New York, 2010) 44444
(Texas, 2000) 55555
(Texas, 2010) 66666
dtype: int64
'''
print(pop[('California',2010):('Texas',2000)]) # 切片
'''
(California, 2010) 22222
(New York, 2000) 33333
(New York, 2010) 44444
(Texas, 2000) 55555
dtype: int64
'''
print(pop[[i for i in pop.index if i[1] == 2010]]) # 选择2010数据
'''
(California, 2010) 22222
(New York, 2010) 44444
(Texas, 2010) 66666
dtype: int64
'''
好方法:Pandas多级索引
index = [('California',2000),('California',2010),
('New York',2000),('New York', 2010),
('Texas',2000),('Texas',2010)]
populations = [11111,22222,33333,44444,55555,66666]
pop = pd.Series(populations, index=index)
index = pd.MultiIndex.from_tuples(index)
print(index)
'''
MultiIndex([('California', 2000),
('California', 2010),
( 'New York', 2000),
( 'New York', 2010),
( 'Texas', 2000),
( 'Texas', 2010)],
)
'''
pop = pop.reindex(index)
print(pop)
'''
California 2000 11111
2010 22222
New York 2000 33333
2010 44444
Texas 2000 55555
2010 66666
dtype: int64
'''
print(pop[:,2010]) # 直接使用第二个索引获取2010的全部数据
'''
California 22222
New York 44444
Texas 66666
dtype: int64
'''
高维数据的多级索引
populations = [11111, 22222, 33333, 44444, 55555, 66666]
index = pd.MultiIndex.from_tuples([('California', 2000), ('California', 2010),
                                   ('New York', 2000), ('New York', 2010),
                                   ('Texas', 2000), ('Texas', 2010)])
pop = pd.Series(populations, index=index)
# unstack() pivots the inner index level (the year) into columns.
pop_df = pop.unstack()
print(pop_df)
'''
2000 2010
California 11111 22222
New York 33333 44444
Texas 55555 66666
'''
# stack() reverses the pivot and gives back the multi-indexed Series.
print(pop_df.stack())
'''
California 2000 11111
2010 22222
New York 2000 33333
2010 44444
Texas 2000 55555
2010 66666
dtype: int64
'''
# Each extra column adds another dimension of data on the same MultiIndex.
# (pd.DataFrame is the correct spelling; pd.Dataframe raises AttributeError.)
pop_df = pd.DataFrame({'total': pop,
                       'under18': [11111, 22222, 33333, 44444, 55555, 66666]})
print(pop_df)
'''
total under18
California 2000 11111 11111
2010 22222 22222
New York 2000 33333 33333
2010 44444 44444
Texas 2000 55555 55555
2010 66666 66666
'''
f_u18 = pop_df['under18']/pop_df['total']
print(f_u18.unstack())
'''
2000 2010
California 1.0 1.0
New York 1.0 1.0
Texas 1.0 1.0
'''
多级索引的创建方法
# Passing a list of two label arrays as index builds a two-level MultiIndex.
# (pd.DataFrame is the correct spelling; pd.Dataframe raises AttributeError.)
df = pd.DataFrame(np.random.rand(4, 2),
                  index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                  columns=['data1', 'data2'])
print(df)
'''
data1 data2
a 1 0.045858 0.391234
2 0.631418 0.924928
b 1 0.534416 0.216372
2 0.300895 0.523091
'''
data = {('California',2000):11111,('California',2010):22222,
('New York',2000):33333,('New York', 2010):44444,
('Texas',2000):55555,('Texas',2010):66666}
print(pd.Series(data))
'''
California 2000 11111
2010 22222
New York 2000 33333
2010 44444
Texas 2000 55555
2010 66666
dtype: int64
'''
显式地创建多级索引
print(pd.MultiIndex.from_arrays([['a','a','b','b'],[1,2,1,2]])) # 若干简单数组组成的列表创建
'''
MultiIndex([('a', 1),
('a', 2),
('b', 1),
('b', 2)],
)
'''
print(pd.MultiIndex.from_tuples([('a',1),('a',2),('b',1),('b',2)])) # 包含多个索引元组构成的列表创建
'''
MultiIndex([('a', 1),
('a', 2),
('b', 1),
('b', 2)],
)
'''
print(pd.MultiIndex.from_product([['a','b'],[1,2]])) # 用两个索引的笛卡尔积创建
'''
MultiIndex([('a', 1),
('a', 2),
('b', 1),
('b', 2)],
)
'''
???
print(pd.MultiIndex(levels=[['a', 'b'], [1, 2]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]])) # build directly from levels and codes (the keyword was renamed from `labels` to `codes`)
多级索引的等级名称
populations = [11111,22222,33333,44444,55555,66666]
index = pd.MultiIndex.from_tuples([('California',2000),('California',2010),
('New York',2000),('New York', 2010),
('Texas',2000),('Texas',2010)])
pop = pd.Series(populations, index=index)
pop.index.names = ['state','year']
print(pop)
'''
state year
California 2000 11111
2010 22222
New York 2000 33333
2010 44444
Texas 2000 55555
2010 66666
dtype: int64
'''
多级列索引
# Hierarchical labels on both the rows and the columns.
index = pd.MultiIndex.from_product([[2013, 2014], [1, 2]], names=['year', 'visit'])
columns = pd.MultiIndex.from_product([['Bob', 'Guido', 'Sue'], ['HR', 'Temp']], names=['subject', 'type'])
# Mock data: every other column (the 'HR' ones) is scaled by 10, then 37 is added to all.
data = np.round(np.random.randn(4, 6), 1)
data[:, ::2] *= 10
data += 37
# Build the DataFrame (pd.DataFrame, not pd.Dataframe).
health_data = pd.DataFrame(data, index=index, columns=columns)
print(health_data)
'''
subject Bob Guido Sue
type HR Temp HR Temp HR Temp
year visit
2013 1 49.0 37.7 29.0 36.2 45.0 36.4
2 51.0 36.9 13.0 36.3 50.0 39.2
2014 1 41.0 36.2 21.0 37.4 43.0 37.1
2 42.0 35.3 37.0 35.8 41.0 35.9
type HR Temp
'''
print(health_data['Guido'])
'''
year visit
2013 1 29.0 36.2
2 13.0 36.3
2014 1 21.0 37.4
2 37.0 35.8
'''
多级索引的取片与切片
Series多级索引
populations = [11111,22222,33333,44444,55555,66666]
index = pd.MultiIndex.from_tuples([('California',2000),('California',2010),
('New York',2000),('New York', 2010),
('Texas',2000),('Texas',2010)])
pop = pd.Series(populations, index=index)
pop.index.names = ['state','year']
print(pop)
'''
state year
California 2000 11111
2010 22222
New York 2000 33333
2010 44444
Texas 2000 55555
2010 66666
dtype: int64
'''
print(pop['California'])
'''
year
2000 11111
2010 22222
dtype: int64
'''
print(pop['California',2000]) # 11111
print(pop.loc['California':'New York'])
'''
state year
California 2000 11111
2010 22222
New York 2000 33333
2010 44444
dtype: int64
'''
print(pop[:,2000])
'''
state
California 11111
New York 33333
Texas 55555
dtype: int64
'''
print(pop[pop>22222])
'''
state year
New York 2000 33333
2010 44444
Texas 2000 55555
2010 66666
dtype: int64
'''
print(pop[['California','Texas']])
'''
state year
California 2000 11111
2010 22222
Texas 2000 55555
2010 66666
dtype: int64
'''
Dataframe多级索引
index = pd.MultiIndex.from_product([[2013, 2014], [1, 2]], names=['year', 'visit'])
columns = pd.MultiIndex.from_product([['Bob', 'Guido', 'Sue'], ['HR', 'Temp']], names=['subject', 'type'])
# Mock data: every other column (the 'HR' ones) is scaled by 10, then 37 is added to all.
data = np.round(np.random.randn(4, 6), 1)
data[:, ::2] *= 10
data += 37
# Build the DataFrame (pd.DataFrame, not pd.Dataframe).
health_data = pd.DataFrame(data, index=index, columns=columns)
print(health_data)
'''
subject Bob Guido Sue
type HR Temp HR Temp HR Temp
year visit
2013 1 51.0 38.7 27.0 35.8 18.0 37.2
2 40.0 37.4 41.0 36.0 34.0 36.6
2014 1 17.0 35.6 39.0 36.0 41.0 36.4
2 26.0 35.8 32.0 36.2 47.0 36.9
'''
print(health_data['Guido','HR'])
'''
year visit
2013 1 48.0
2 35.0
2014 1 30.0
2 48.0
Name: (Guido, HR), dtype: float64
'''
print(health_data.iloc[:2,:2])
'''
subject Bob
type HR Temp
year visit
2013 1 24.0 38.6
2 42.0 38.2
'''
print(health_data.loc[:,('Bob','HR')])
'''
year visit
2013 1 36.0
2 49.0
2014 1 55.0
2 35.0
Name: (Bob, HR), dtype: float64
'''
idx = pd.IndexSlice
print(health_data.loc[idx[:,1],idx[:,'HR']])
'''
subject Bob Guido Sue
type HR HR HR
year visit
2013 1 46.0 41.0 37.0
2014 1 37.0 51.0 43.0
'''
多级索引行列转换
有序的索引和无序的索引
如果MultiIndex不是有序索引,那么大多数切片操作都会失败
# The outer level is deliberately out of lexical order: 'a', 'c', 'b'.
index = pd.MultiIndex.from_product([['a', 'c', 'b'], [1, 2]])
data = pd.Series(np.random.rand(6), index=index)
data.index.names = ['char', 'int']
print(data)
'''
char int
a 1 0.252805
2 0.934107
c 1 0.154999
2 0.363860
b 1 0.391106
2 0.316172
dtype: float64
'''
# Partial slicing on an unsorted MultiIndex fails; print the error instead of
# letting it abort the script.
try:
    print(data['a':'b'])
except KeyError as e:
    print(e)
'''
'Key length (1) was greater than MultiIndex lexsort depth (0)'
'''
# Pandas提供了排序操作,如sort_index()和sortlevel()
data = data.sort_index()
print(data)
'''
char int
a 1 0.839251
2 0.853601
b 1 0.698143
2 0.926872
c 1 0.632588
2 0.959209
dtype: float64
'''
print(data['a':'b'])
'''
char int
a 1 0.839251
2 0.853601
b 1 0.698143
2 0.926872
dtype: float64
'''
索引stack与unstack
populations = [11111,22222,33333,44444,55555,66666]
index = pd.MultiIndex.from_tuples([('California',2000),('California',2010),
('New York',2000),('New York', 2010),
('Texas',2000),('Texas',2010)])
pop = pd.Series(populations, index=index)
pop.index.names = ['state','year']
print(pop.unstack(level=0))
'''
state California New York Texas
year
2000 11111 33333 55555
2010 22222 44444 66666
'''
print(pop.unstack(level=1))
'''
year 2000 2010
state
California 11111 22222
New York 33333 44444
Texas 55555 66666
'''
print(pop.unstack().stack()) # stack() and unstack() are inverse operations; applying both leaves the data unchanged
'''
state year
California 2000 11111
2010 22222
New York 2000 33333
2010 44444
Texas 2000 55555
2010 66666
dtype: int64
'''
索引的设置与重置
populations = [11111,22222,33333,44444,55555,66666]
index = pd.MultiIndex.from_tuples([('California',2000),('California',2010),
('New York',2000),('New York', 2010),
('Texas',2000),('Texas',2010)])
pop = pd.Series(populations, index=index)
pop.index.names = ['state','year']
pop_flat = pop.reset_index(name='population')
print(pop_flat)
'''
state year population
0 California 2000 11111
1 California 2010 22222
2 New York 2000 33333
3 New York 2010 44444
4 Texas 2000 55555
5 Texas 2010 66666
'''
print(pop_flat.set_index(['state','year']))
'''
population
state year
California 2000 11111
2010 22222
New York 2000 33333
2010 44444
Texas 2000 55555
2010 66666
'''
多级索引的数据累计方法
# pandas has built-in aggregations such as mean(), sum() and max(); for
# hierarchically indexed data, grouping by an index level aggregates each subset.
index = pd.MultiIndex.from_product([[2013, 2014], [1, 2]], names=['year', 'visit'])
columns = pd.MultiIndex.from_product([['Bob', 'Guido', 'Sue'], ['HR', 'Temp']], names=['subject', 'type'])
# Mock data: every other column (the 'HR' ones) is scaled by 10, then 37 is added to all.
data = np.round(np.random.randn(4, 6), 1)
data[:, ::2] *= 10
data += 37
# Build the DataFrame (pd.DataFrame, not pd.Dataframe).
health_data = pd.DataFrame(data, index=index, columns=columns)
print(health_data)
'''
subject Bob Guido Sue
type HR Temp HR Temp HR Temp
year visit
2013 1 30.0 36.5 42.0 37.6 16.0 37.5
2 34.0 37.8 53.0 37.1 38.0 38.0
2014 1 35.0 35.7 27.0 37.6 45.0 36.8
2 42.0 39.3 48.0 36.8 31.0 36.6
'''
# Per-year mean of every column. The level= argument of mean() was removed
# from pandas, so group on the index level explicitly.
data_mean = health_data.groupby(level='year').mean()
print(data_mean)
'''
subject Bob Guido Sue
type HR Temp HR Temp HR Temp
year
2013 32.0 37.15 47.5 37.35 27.0 37.75
2014 38.5 37.50 37.5 37.20 38.0 36.70
'''
# Same aggregation over a column level: transpose, group on 'type', then
# transpose back (this avoids the deprecated groupby(axis=1)).
print(data_mean.T.groupby(level='type').mean().T)
'''
type HR Temp
year
2013 35.5 37.416667
2014 38.0 37.133333
'''
合并数据集:Concat与Append操作
def make_df(cols, ind):
    """Return a DataFrame whose cell at row ``i``, column ``c`` is ``str(c) + str(i)``.

    cols: iterable of column labels (a string yields one column per character).
    ind: row labels; iterated once per column and also used as the index, so
        it must be re-iterable (a list or a range, not a one-shot iterator).
    """
    data = {c: [str(c) + str(i) for i in ind] for c in cols}
    # pd.DataFrame is the correct class name; pd.Dataframe raises AttributeError.
    return pd.DataFrame(data, ind)
print(make_df('ABC',range(3)))
'''
A B C
0 A0 B0 C0
1 A1 B1 C1
2 A2 B2 C2
'''
Numpy数组合并
x=[1,2,3] y=[4,5,6] z=[7,8,9] print(np.concatenate([x,y,z])) # [1 2 3 4 5 6 7 8 9] x = [[1,2],[3,4]] print(np.concatenate([x,x],axis=1)) ''' [[1 2 1 2] [3 4 3 4]] '''通过pd.concat实现简易合并
def make_df(cols, ind):
    """Build a quick demo DataFrame: the cell at (i, c) holds ``str(c) + str(i)``.

    cols: iterable of column labels (a string yields one column per character).
    ind: row labels; iterated once per column and also used as the index, so
        it must be re-iterable (a list or a range, not a one-shot iterator).
    """
    data = {c: [str(c) + str(i) for i in ind] for c in cols}
    # pd.DataFrame is the correct class name; pd.Dataframe raises AttributeError.
    return pd.DataFrame(data, ind)
## 通过pd.concat实现简易合并
ser1 = pd.Series(['A','B','C'],index=[1,2,3])
ser2 = pd.Series(['D','E','F'],index=[4,5,6])
print(pd.concat([ser1,ser2]))
'''
1 A
2 B
3 C
4 D
5 E
6 F
dtype: object
'''
df1 = make_df('AB',[1,2])
df2 = make_df('AB',[3,4])
print(df1)
'''
A B
1 A1 B1
2 A2 B2
'''
print(df2)
'''
A B
3 A3 B3
4 A4 B4
'''
print(pd.concat([df1,df2]))
'''
A B
1 A1 B1
2 A2 B2
3 A3 B3
4 A4 B4
'''
df3 = make_df('AB',[0,1])
df4 = make_df('CD',[0,1])
print(df3)
'''
A B
0 A0 B0
1 A1 B1
'''
print(df4)
'''
C D
0 C0 D0
1 C1 D1
'''
print(pd.concat([df3,df4],axis=1))
'''
A B C D
0 A0 B0 C0 D0
1 A1 B1 C1 D1
'''
索引重复
欢迎分享,转载请注明来源:内存溢出
微信扫一扫
支付宝扫一扫
评论列表(0条)