
import pandas as pd
# Three ways to build a Series; every assignment below rebinds `ser`.

# 1. From a plain Python list.
ser = pd.Series(data=[1, 2, 3, 4])

# 2. From a NumPy array.
import numpy as np
ser = pd.Series(data=np.arange(1, 5))

# An explicit index can be supplied; it needs exactly one label per value.
ser = pd.Series(data=[1, 2, 3], index=[1, 2, 3])

# 3. From a dict: the keys become the index labels.
dict_v = dict(zip(('name', 'age', 'class'), ('python', 18, 3)))
ser = pd.Series(data=dict_v)

# When an index is passed together with a dict, values are looked up by label.
# None of these labels is a key of dict_v, so every entry comes out as NaN.
ser = pd.Series(data=dict_v, index=['x1', 'x2', 'x3', 'x4'])
Series 基本用法
# Basic Series usage.
ser = pd.Series([1, 2, 3, 4])
# `ser` only has the default integer index, so the label-based examples
# below use a separate Series whose index holds string labels.
labelled = pd.Series({'name': 'python', 'age': 18, 'class': 3})

# Element-wise missing-value masks: isnull() is True where a value is
# missing, notnull() is True where a value is present.
print(ser.isnull())
print(ser.notnull())

# The index (labels) and the underlying values.
print(ser.index)
print(ser.values)

# Look up by index label; on the default index the labels are 0..3.
print(ser[0])
print(labelled['age'])
print(ser[[0, 1, 2]])

# Positional slice: the end position is excluded.
print(ser.iloc[1:3])
# Label slice: the end label is included.
print(labelled['name':'class'])

# Boolean mask: keep only the entries greater than 3.
print(ser[ser > 3])

# Name of the Series object and name of its index.
ser.name = 'temp'
ser.index.name = 'index01'

# First and last rows.
print(ser.head())
print(ser.tail())
DataFrame 数据结构
DataFrame 创建
import numpy as np
import pandas as pd
# Build a DataFrame from a dict: every key is a column name and every value
# is a sequence of the same length (a list, a tuple and an ndarray here).
data = dict(
    column1=[1, 2, 3, 4, 5],
    column2=(3, 9, 4, 2, 1),
    column3=np.arange(9, 14),
)
frame = pd.DataFrame(data=data)
frame.index.name = "index"
print(frame)
# Sample output:
#        column1  column2  column3
# index
# 0            1        3        9
# 1            2        9       10
# 2            3        4       11
# 3            4        2       12
# 4            5        1       13

# Row index, column index and the raw values, one after another.
print(frame.index, frame.columns, frame.values, sep="\n")

# Explicit row labels plus an extra column name: 'col1' is not a key of
# `data`, so that column is filled with NaN.
frame = pd.DataFrame(
    data,
    index=list(range(1, 6)),
    columns=list(data) + ['col1'],
)
print(frame)
# Sample output:
#    column1  column2  column3 col1
# 1        1        3        9  NaN
# 2        2        9       10  NaN
# 3        3        4       11  NaN
# 4        4        2       12  NaN
# 5        5        1       13  NaN
# Build a DataFrame from a dict of Series: column 'A' holds 1..4 and
# column 'B' holds 2..5.
frame = pd.DataFrame({
    label: pd.Series(np.arange(first, first + 4))
    for label, first in (('A', 1), ('B', 2))
})
print(frame)
# Sample output:
#    A  B
# 0  1  2
# 1  2  3
# 2  3  4
# 3  4  5
# Build a DataFrame from a dict of dicts: outer keys are column names, inner
# keys are row labels. 'col3' has no entry for row 2, which becomes NaN.
data = {
    column: dict(enumerate(cells))
    for column, cells in (
        ('col1', [1, 2, 3]),
        ('col2', [1, 2, 3]),
        ('col3', [1, 2]),
    )
}
frame = pd.DataFrame(data=data)
print(frame)
# Sample output:
#    col1  col2  col3
# 0     1     1   1.0
# 1     2     2   2.0
# 2     3     3   NaN
# Build a DataFrame from a 2-D ndarray; rows and columns both get default
# integer labels.
arr1 = np.reshape(np.arange(12), (4, 3))
frame = pd.DataFrame(data=arr1)
print(frame)
# Sample output:
#    0   1   2
# 0  0   1   2
# 1  3   4   5
# 2  6   7   8
# 3  9  10  11
# Build a DataFrame from a list of dicts: every dict is one row, and the
# columns are the union of all keys. Keys a row does not have become NaN.
data = [{key: number} for number, key in enumerate('abc', start=1)]
frame = pd.DataFrame(data=data)
print(frame)
# Sample output:
#      a    b    c
# 0  1.0  NaN  NaN
# 1  NaN  2.0  NaN
# 2  NaN  NaN  3.0
# Build a DataFrame from a list of Series: every Series is one row. The
# shorter row is padded with NaN.
data = [pd.Series(np.random.rand(length)) for length in (3, 2)]
frame = pd.DataFrame(data=data)
print(frame)
# Sample output (values are random):
#           0         1         2
# 0  0.473710  0.229683  0.754207
# 1  0.314483  0.825005       NaN

# Two rows of equal length: the integers 0..4 and five random floats.
data = [
    pd.Series(list(range(5))),
    pd.Series(np.random.rand(5)),
]
frame = pd.DataFrame(data=data)
print(frame)
# Sample output (second row is random):
#           0         1         2         3         4
# 0  0.000000  1.000000  2.000000  3.000000  4.000000
# 1  0.621414  0.244883  0.755113  0.572064  0.671114
DataFrame 基本用法
# Basic DataFrame usage.
# The examples below need a frame with the columns 'a' and 'b' (the one shown
# in the sample output), so it is built here instead of relying on whatever
# `frame` was bound to before.
frame = pd.DataFrame({'a': np.arange(5), 'b': np.arange(1, 6)})
print(frame)
# Sample output:
#    a  b
# 0  0  1
# 1  1  2
# 2  2  3
# 3  3  4
# 4  4  5

# Transpose: rows become columns and vice versa; the result is a DataFrame.
frame_t = frame.T
print(frame_t, type(frame_t))
# Sample output:
#    0  1  2  3  4
# a  0  1  2  3  4
# b  1  2  3  4  5

# Selecting by column label returns that column as a Series.
print(frame['a'])

# Add a column; the scalar is repeated for every row.
frame['c'] = 0
print(frame)
# Sample output:
#    a  b  c
# 0  0  1  0
# 1  1  2  0
# 2  2  3  0
# 3  3  4  0
# 4  4  5  0

# Remove the column again.
del frame['c']
print(frame)
# Sample output:
#    a  b
# 0  0  1
# 1  1  2
# 2  2  3
# 3  3  4
# 4  4  5
Pandas 索引操作
索引对象 Index
# 索引对象不可变, 保证数据的安全性
# 常见的Index 种类
# Index 基本索引
# Int64Index 整数索引(pandas 2.0 起已移除,整数索引统一由 Index 表示)
# MultiIndex 层级索引
# DatetimeIndex 时间戳索引
索引的一些基本操作
import pandas as pd
import numpy as np
ser = pd.Series(['1', '2', '3'], index=[1, 2, 3])
print(ser)

# 1. Re-indexing
# Rebuild the row index: label 4 has no value yet, so it is filled with NaN.
ser = ser.reindex([1, 2, 3, 4])
print(ser)

# Rebuild the column index (here this simply reorders the columns).
data = {
    'a': np.arange(5),
    'b': np.arange(1, 6),
}
frame = pd.DataFrame(data)
print(frame)
frame = frame.reindex(columns=['b', 'a'])
print(frame)

# 2. Adding
# Add a row to a Series: concat returns a new object and leaves `ser` as is.
ser1 = pd.concat([ser, pd.Series({5: 111})])
print(ser)
print(ser1)

# Add columns to a DataFrame: a scalar is repeated for every row, a list
# must supply exactly one value per row (a four-element list would raise
# here because the frame has five rows).
frame['c'] = 10
frame['d'] = [0, 0, 0, 0, 1]

# Insert a column at a given position (0 = first column).
frame.insert(0, 'e', 99)
print(frame)

# Add a row by assigning to a label that does not exist yet.
frame.loc[5] = 0

row = {'e': 1, 'b': 2, 'a': 3, 'c': 4, 'd': 5}
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0. The
# equivalent of frame.append(row, ignore_index=True) is a concat with a
# one-row frame; the result is a new frame with a fresh 0..n-1 index.
appended = pd.concat([frame, pd.DataFrame([row])], ignore_index=True)
print(appended)

# Same idea, but keeping an explicit row label (6) for the new row.
f1 = pd.DataFrame(row, index=[6])
print(f1)
frame = pd.concat([frame, f1])
print(frame)
# 3. Deleting
# Series: add label 5, then remove label 4 in place.
ser.loc[5] = 1
print(ser)
del ser[4]
print(ser)

# drop() returns a new Series and leaves `ser` untouched.
ser1 = ser.drop(labels=5)
print(ser1)
# Several labels can be dropped at once.
ser2 = ser.drop(labels=[1, 5])
print(ser2)

# DataFrame: drop() removes rows (by index label) unless told otherwise.
print(frame)
f1 = frame.drop(index=0)
print(f1)
f2 = frame.drop(index=[0, 6, 5])
print(f2)

# Dropping columns instead of rows.
f3 = frame.drop(columns='e')
print(f3)
f4 = frame.drop(columns=['b', 'd'])
print(f4)

# inplace=True modifies f4 itself instead of returning a new frame.
f4.drop(index=[0, 1, 6], inplace=True)
print(f4)
# 4. Modifying
data = {column: np.arange(3) for column in ('a', 'b')}
frame = pd.DataFrame(data=data)
print(frame)
# Sample output:
#    a  b
# 0  0  0
# 1  1  1
# 2  2  2

# Assigning to a column overwrites it when it exists ('b') and adds it when
# it does not ('c').
frame = frame.assign(c=100, b=1)
print(frame)
# Sample output:
#    a  b    c
# 0  0  1  100
# 1  1  1  100
# 2  2  1  100

# Change the single cell at row label 0, column 'a'.
frame.at[0, 'a'] = 999
print(frame)
# Sample output:
#      a  b    c
# 0  999  1  100
# 1    1  1  100
# 2    2  1  100
# 5. Looking values up
# Series lookup. Positions go through .iloc: plain ser[0] on a string index
# relied on a positional fallback that newer pandas versions no longer do.
ser = pd.Series([1, 2, 3], index=['a', 'b', 'c'])
# By position
print("位置索引", ser.iloc[0])
# By label
print("标签索引", ser['a'])
# Positional slice
print("位置切片", ser.iloc[0:])
# Label slice (the end label is included)
print("标签切片", ser['a':'c'])
# Several non-adjacent positions
print("不连续索引", ser.iloc[[0, 2]])
# Sample output:
# 位置索引 1
# 标签索引 1
# 位置切片 a    1
# b    2
# c    3
# dtype: int64
# 标签切片 a    1
# b    2
# c    3
# dtype: int64
# 不连续索引 a    1
# c    3
# dtype: int64

# Boolean indexing: the comparison yields a mask, the mask selects entries.
print(ser > 1)
print(ser[ser > 1])
# Sample output:
# a    False
# b     True
# c     True
# dtype: bool
# b    2
# c    3
# dtype: int64
# DataFrame lookup
print(frame)
# One column
print(frame.loc[:, 'b'])
# Several columns
print(frame.loc[:, ['a', 'c']])
# One cell: row label 0, column 'a' (two equivalent spellings)
print(frame.at[0, 'a'])
print(frame['a'].loc[0])
# A bare slice selects rows, not columns: here the first two rows.
print(frame.iloc[:2])
高级索引
- loc标签索引
- iloc位置索引
- ix标签与位置混合索引(已在 pandas 1.0 中移除,请改用 loc / iloc)
# loc selects by label, rows first and then columns; both ends of a label
# slice are included.
print(frame.loc[slice(0, 1), slice('a', 'b')])
# iloc selects by position, rows first and then columns; the end position of
# a slice is excluded.
print(frame.iloc[slice(0, 1), slice(1, 2)])
Pandas 对齐运算
填充值
import pandas as pd
import numpy as np
# Arithmetic between two Series aligns on the index labels first.
s1 = pd.Series(np.arange(5), index=['a', 'b', 'c', 'f', 'g'])
s2 = pd.Series(np.arange(3), index=['a', 'd', 'e'])
print(s1)
print(s2)
# Sample output (the integer dtype shown depends on the platform):
# a    0
# b    1
# c    2
# f    3
# g    4
# dtype: int32
# a    0
# d    1
# e    2
# dtype: int32

# Plain `+`: a label that exists in only one operand yields NaN, so only
# 'a' gets a number here.
plain_sum = s1 + s2
print(plain_sum)

# add() with fill_value=0 treats the missing side as 0 instead, so every
# label keeps a value.
s3 = s1.add(s2, fill_value=0)
print(s3)
# Sample output:
# a    0.0
# b    1.0
# c    2.0
# d    1.0
# e    2.0
# f    3.0
# g    4.0
# dtype: float64
Pandas 函数应用
apply 和 applymap
# A 5x4 frame of random floats to work on. The shape matches the sample
# output below; the sample values themselves came from a different run.
f1 = pd.DataFrame(np.random.randn(5, 4))

# apply() calls the function once per column (default) or once per row.
f = lambda x: x.max()
# Maximum of every column
col_max = f1.apply(f)
print(col_max)
# Sample output:
# 0    0.258035
# 1    1.058038
# 2    1.265633
# 3    1.236317
# dtype: float64

# Maximum of every row
row_max = f1.apply(f, axis=1)
print(row_max)
# Sample output:
# 0    1.058038
# 1    1.265633
# 2    0.989948
# 3    0.839860
# 4    0.518687
# dtype: float64

# Apply a function to every single value. DataFrame.applymap is deprecated
# (renamed to DataFrame.map in pandas 2.1), so map each column instead,
# which works regardless of the pandas version.
f = lambda x: "%.2f" % x
formatted = f1.apply(lambda column: column.map(f))
print(formatted)
# Sample output:
#       0     1     2     3
# 0  0.15  1.06  0.76  0.37
# 1  0.15  0.18  1.27  1.24
# 2  0.26  0.92  0.99  0.15
# 3  0.12  0.84  0.53  0.27
# 4  0.01  0.52  0.01  0.27
排序
# Sorting a Series. The labels are deliberately out of alphabetical order.
ser = pd.Series(range(5), index=['a', 'c', 'd', 'e', 'b'])
print(ser)
# Sample output:
# a    0
# c    1
# d    2
# e    3
# b    4
# dtype: int64

# Sort by index label, ascending (the default direction).
ser = ser.sort_index(ascending=True)
print(ser)
# Sample output:
# a    0
# b    4
# c    1
# d    2
# e    3
# dtype: int64

# Sort by index label, descending.
ser = ser.sort_index(ascending=False)
print(ser)
# Sample output:
# e    3
# d    2
# c    1
# b    4
# a    0
# dtype: int64

# Sort by value, descending.
ser = ser.sort_values(ascending=False)
print(ser)
# Sorting a DataFrame. Row and column labels start out unordered.
df = pd.DataFrame(
    data=np.arange(12).reshape((4, 3)),
    index=['d', 'a', 'b', 'c'],
    columns=['B', 'C', 'A'],
)
print(df)
# Sample output:
#    B   C   A
# d  0   1   2
# a  3   4   5
# b  6   7   8
# c  9  10  11

# sort_index() orders the rows by their label unless another axis is given.
df1 = df.sort_index(axis=0)
print(df1)
# Sample output:
#    B   C   A
# a  3   4   5
# b  6   7   8
# c  9  10  11
# d  0   1   2

# Order the columns by their label, descending.
df1 = df.sort_index(axis='columns', ascending=False)
print(df1)
# Sample output:
#     C  B   A
# d   1  0   2
# a   4  3   5
# b   7  6   8
# c  10  9  11

# Order the rows by the values in column 'A', descending.
df1 = df1.sort_values('A', ascending=False)
print(df1)
# Sample output:
#     C  B   A
# c  10  9  11
# b   7  6   8
# a   4  3   5
# d   1  0   2
唯一值和成员属性
# Distinct values, how often each value occurs, and a per-entry membership
# test against the list [3].
distinct_values = ser.unique()
occurrences = ser.value_counts()
is_three = ser.isin([3])
print(ser, distinct_values, occurrences, is_three, sep="\n")
# Sample output (without the leading print of `ser` itself):
# [4 3 2 1 0]
# 4    1
# 3    1
# 2    1
# 1    1
# 0    1
# dtype: int64
# b    False
# e     True
# d    False
# c    False
# a    False
# dtype: bool
处理缺失数据
# Handling missing data: column 'b' has one missing value, in row 0.
df = pd.DataFrame(dict(a=[1, 2, 23, 3], b=[np.nan, 3, 4, 1]))
print(df)
# Sample output:
#     a    b
# 0   1  NaN
# 1   2  3.0
# 2  23  4.0
# 3   3  1.0

# Boolean mask of the missing cells.
print(df.isna())
# Sample output:
#        a      b
# 0  False   True
# 1  False  False
# 2  False  False
# 3  False  False

# dropna() removes rows that contain a missing value by default.
print(df.dropna(axis=0))
# Sample output:
#     a    b
# 1   2  3.0
# 2  23  4.0
# 3   3  1.0

# Remove columns that contain a missing value instead.
print(df.dropna(axis='columns'))
# Sample output:
#     a
# 0   1
# 1   2
# 2  23
# 3   3

# Replace missing values with a constant.
print(df.fillna(value=0))
# Sample output:
#     a    b
# 0   1  0.0
# 1   2  3.0
# 2  23  4.0
# 3   3  1.0
统计计算
import numpy as np
import pandas as pd
# Summary statistics of a 5x4 frame filled with random normal samples.
samples = np.random.randn(5, 4)
frame = pd.DataFrame(data=samples)
print(frame)
# describe() reports count, mean, std, min, quartiles and max per column.
summary = frame.describe()
print(summary)
# Sample output (values are random):
#           0         1         2         3
# 0  0.656203 -1.270162  2.004630 -0.706720
# 1 -1.547710  0.493761  0.150936 -0.688306
# 2  0.199421  0.534954  0.267138  0.788930
# 3 -1.405272  0.402331 -1.520276 -1.268665
# 4  0.412419 -0.316246  0.136897 -0.558747
#               0         1         2         3
# count  5.000000  5.000000  5.000000  5.000000
# mean  -0.336988 -0.031072  0.207865 -0.486702
# std    1.053904  0.774652  1.247660  0.763699
# min   -1.547710 -1.270162 -1.520276 -1.268665
# 25%   -1.405272 -0.316246  0.136897 -0.706720
# 50%    0.199421  0.402331  0.150936 -0.688306
# 75%    0.412419  0.493761  0.267138 -0.558747
# max    0.656203  0.534954  2.004630  0.788930
数据读取与存储
import numpy as np
import pandas as pd
FILE_PATH = "../../tmp_file/only_panads/GS.csv"

# Read the whole file into a DataFrame.
df = pd.read_csv(FILE_PATH, sep=',')
print(df.shape)
print(df.head())

# Read it again with explicit column names, using column 'a' as row index.
df = pd.read_table(FILE_PATH, sep=',', names=['a','b','c','d','e','f','g'], index_col='a')
print(df.shape)
print(df.head())

# Write the first rows back out as CSV.
df1 = df.head()
df1.to_csv('../../tmp_file/only_panads/f1.csv')

# Chunked reading: with chunksize/iterator, read_csv returns a reader object
# rather than a DataFrame. The reader keeps the file open, so it is used as a
# context manager (supported since pandas 1.2) to make sure it gets closed.
# `df` deliberately stays bound to the DataFrame read above.
with pd.read_csv(FILE_PATH, sep=',', chunksize=10) as reader:
    # Without an argument get_chunk() falls back to the chunksize given above.
    print(reader.get_chunk())
    print(reader.get_chunk(5))

with pd.read_csv(FILE_PATH, sep=',', iterator=True) as reader:
    print(reader.get_chunk(5))
    print(reader.get_chunk(10))
数据清洗和准备
# Data-cleaning call reference.
# NOTE(review): every call below assumes `df` is a DataFrame that has the
# columns 'key', 'col1' and 'col2' — confirm what `df` is bound to here.
# Apart from the inplace fillna and the two column assignments, the results
# are neither stored nor printed.

# Drop the rows in which every value is NaN
df.dropna(how='all')
# Keep only the rows that hold at least 2 non-NaN values
df.dropna(thresh=2)
# Replace every missing value with 0
df.fillna(0)
# Per-column fill values: the dict keys are column labels (here 1 and 2)
df.fillna({1: 0.9, 2: 0})
# Fill in place, modifying df itself
df.fillna(0, inplace=True)
# Boolean mask marking rows that repeat an earlier row
df.duplicated()
# Drop duplicated rows
df.drop_duplicates()
# Drop rows that are duplicates with respect to the given columns only
df.drop_duplicates(['col1', 'col2'])
# Same, but keep the last occurrence instead of the first
df.drop_duplicates(['col1', 'col2'], keep='last')
# Transform values through a mapping (the mapping here is a placeholder)
df['col1'] = df['key'].map({"":""})
# Transform values through a function
# NOTE(review): `meat` is not defined in the visible source; presumably a
# dict keyed by lower-case strings — confirm before running.
df['col1'] = df['key'].map(lambda x: meat[x.lower()])
欢迎分享,转载请注明来源:内存溢出
微信扫一扫
支付宝扫一扫
评论列表(0条)