Pandas入门_python_内存溢出

Pandas 学习 Series 数据结构 Series 创建

import pandas as pd
# 1. Create a Series from a list (no index given, so pandas generates 0..n-1)
ser = pd.Series([1, 2, 3, 4])
# 2. Create a Series from a NumPy array
import numpy as np
ser = pd.Series(np.arange(1, 5))

# Supply an explicit index
ser = pd.Series([1, 2, 3], index= [1, 2, 3])
# The index must contain exactly as many labels as there are values

# 3. Create a Series from a dict (the keys become the index labels)
dict_v = {
    'name': 'python',
    'age': 18,
    'class': 3
}
ser = pd.Series(dict_v)
# With a dict AND an index, values are looked up by label: none of
# 'x1'..'x4' is a key of dict_v, so every entry of this Series is NaN.
ser = pd.Series(dict_v, index= ['x1', 'x2', 'x3', 'x4'])

Series 基本用法

# A Series with string labels, so that both the positional (iloc) and the
# label-based lookups below are valid. With the default integer index,
# ser['v'] and ser['name':'class'] would fail.
ser = pd.Series([1, 2, 3, 4], index=['name', 'age', 'class', 'v'])
# Element-wise missing-value test: True where the value is missing
print(ser.isnull())
# Element-wise inverse test: True where the value is present
print(ser.notnull())

print(ser.index)
print(ser.values)
# Access by position (explicit iloc; plain ser[0] on a labelled Series is
# deprecated because it is ambiguous between label and position)
print(ser.iloc[0])
# Access by label
print(ser['v'])
# Several positions at once
print(ser.iloc[[0, 1, 2]])

# Positional slice: the stop position is excluded
print(ser.iloc[1:3])
# Label slice: the end label IS included
print(ser['name':'class'])
# Boolean indexing: keep only the elements where the mask is True
print(ser[ser > 3])

# name attributes
ser.name = 'temp'  # name of the Series object
ser.index.name = 'index01'  # name of the index

# First / last rows (5 by default)
print(ser.head())
print(ser.tail())

DataFrame 数据结构 DataFrame 创建

import numpy as np
import pandas as pd

# Build a dict of equal-length columns (list, tuple and ndarray all work)
data = {
    "column1": [1, 2, 3, 4, 5],
    "column2": (3, 9, 4, 2, 1),
    "column3": np.arange(9, 14)
}
frame = pd.DataFrame(data)
frame.index.name = "index"
print(frame)
#        column1  column2  column3
# index
# 0            1        3        9
# 1            2        9       10
# 2            3        4       11
# 3            4        2       12
# 4            5        1       13

# Inspect the row index
print(frame.index)
# Inspect the column index
print(frame.columns)

# The values attribute exposes the underlying data as an array
print(frame.values)

# Specify both the row index and the column index explicitly;
# 'col1' is not a key of data, so that column is filled with NaN

frame = pd.DataFrame(data, index=[1, 2, 3, 4, 5], columns=[*data.keys(), 'col1'])
print(frame)
#    column1  column2  column3 col1
# 1        1        3        9  NaN
# 2        2        9       10  NaN
# 3        3        4       11  NaN
# 4        4        2       12  NaN
# 5        5        1       13  NaN

# Build a DataFrame from a dict of Series
frame = pd.DataFrame({
    'A': pd.Series(np.arange(1, 5)),
    'B': pd.Series(np.arange(2, 6))
})
print(frame)
#    A  B
# 0  1  2
# 1  2  3
# 2  3  4
# 3  4  5

# Build a DataFrame from a dict of dicts
# (outer keys -> columns, inner keys -> row labels; missing cells become NaN)
data = {
    'col1': {0: 1, 1: 2, 2: 3},
    'col2': {0: 1, 1: 2, 2: 3},
    'col3': {0: 1, 1: 2}
}
frame = pd.DataFrame(data)
print(frame)
#    col1  col2  col3
# 0     1     1   1.0
# 1     2     2   2.0
# 2     3     3   NaN

# Build a DataFrame from a 2-D ndarray
arr1 = np.arange(12).reshape(4, 3)
frame = pd.DataFrame(arr1)
print(frame)
#    0   1   2
# 0  0   1   2
# 1  3   4   5
# 2  6   7   8
# 3  9  10  11

# Build a DataFrame from a list of dicts (each dict is one row)
data = [
    {'a': 1},
    {'b': 2},
    {'c': 3}
]
frame = pd.DataFrame(data)
print(frame)
#      a    b    c
# 0  1.0  NaN  NaN
# 1  NaN  2.0  NaN
# 2  NaN  NaN  3.0

# Build a DataFrame from a list of Series (each Series is one row;
# the shorter row is padded with NaN). Values are random, output is a sample.
data = [
    pd.Series(np.random.rand(3)),
    pd.Series(np.random.rand(2))
]
frame = pd.DataFrame(data)
print(frame)
#           0         1         2
# 0  0.473710  0.229683  0.754207
# 1  0.314483  0.825005       NaN


# Same construction with two rows of equal length (sample output below)
data = [
    pd.Series([x for x in range(5)]),
    pd.Series(np.random.rand(5))
]
frame = pd.DataFrame(data)
print(frame)
#           0         1         2         3         4
# 0  0.000000  1.000000  2.000000  3.000000  4.000000
# 1  0.621414  0.244883  0.755113  0.572064  0.671114

DataFrame 基本用法

# Sample frame with columns 'a' and 'b'; this is the frame that the
# commented outputs below were produced from.
frame = pd.DataFrame({
    'a': np.arange(5),
    'b': np.arange(1, 6),
})
print(frame)

# Transpose: rows become columns and vice versa (still a DataFrame)
frame_t = frame.T
print(frame_t, type(frame_t))

#    a  b
# 0  0  1
# 1  1  2
# 2  2  3
# 3  3  4
# 4  4  5
# --------------------
#    0  1  2  3  4
# a  0  1  2  3  4
# b  1  2  3  4  5

# Select one column by its label (the result is a Series)
print(frame['a'])

# Add a column; the scalar is broadcast to every row
frame['c'] = 0
print(frame)

# Delete a column in place
del frame['c']
print(frame)
#    a  b  c
# 0  0  1  0
# 1  1  2  0
# 2  2  3  0
# 3  3  4  0
# 4  4  5  0
#    a  b
# 0  0  1
# 1  1  2
# 2  2  3
# 3  3  4
# 4  4  5

Pandas 索引操作 索引对象 Index

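# Index objects are immutable, which protects the labels from accidental modification
# Common kinds of Index:
# Index          generic index
# Int64Index     integer index (removed in pandas 2.0; integer labels now use a plain Index)
# MultiIndex     hierarchical (multi-level) index
# DatetimeIndex  timestamp index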

索引的一些基本操作

import pandas as pd
import numpy as np

ser = pd.Series(['1', '2', '3'], index=[1, 2, 3])
print(ser)


# 1. Reindexing
# Rebuild the row index: label 4 has no data yet, so its value is NaN
ser = ser.reindex([1, 2, 3, 4])
print(ser)

# Rebuild the column index (here: reorder the columns)
data = {
    'a': np.arange(5),
    'b': np.arange(1, 6)
}
frame = pd.DataFrame(data)
print(frame)
frame = frame.reindex(columns=['b', 'a'])
print(frame)

# Append a row to a Series: concat returns a NEW Series, `ser` is unchanged
ser1 = pd.concat([ser, pd.Series({5: 111})])
print(ser)
print(ser1)

# Add columns to a DataFrame
frame['c'] = 10  # a scalar is broadcast to every row
frame['d'] = [0, 0, 0, 0, 1]
# A list whose length differs from the number of rows raises an error.
# Insert a column at a given column position
frame.insert(0, 'e', 99)
print(frame)

# Add rows
# Label-based assignment with loc creates the row when the label is new
frame.loc[5] = 0
row = {'e': 1, 'b': 2, 'a': 3, 'c': 4, 'd': 5}
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# wrap the row in a one-row DataFrame and use pd.concat instead.
f1 = pd.DataFrame(row, index=[6])
print(f1)
frame = pd.concat([frame, f1])
print(frame)


# Delete rows
ser[5] = 1  # assigning to a new label adds the element
print(ser)

# del removes the element in place
del ser[4]
print(ser)

# drop returns a new Series and leaves `ser` untouched
ser1 = ser.drop(5)
print(ser1)

# Drop several labels at once
ser2 = ser.drop([1, 5])
print(ser2)

# DataFrame.drop removes rows (index labels) by default
print(frame)
f1 = frame.drop(0)
print(f1)
f2 = frame.drop([0, 6, 5])
print(f2)

# Drop columns instead: axis=1 / 'columns' is the column axis, axis=0 the row axis
f3 = frame.drop('e', axis=1)
print(f3)
f4 = frame.drop(['b', 'd'], axis='columns')
print(f4)

# In-place drop: modifies f4 itself and returns None
f4.drop([0, 1, 6], axis=0, inplace=True)
print(f4)

data = {
    'a': np.arange(3),
    'b': np.arange(3)
}
frame = pd.DataFrame(data)
print(frame)
#    a  b
# 0  0  0
# 1  1  1
# 2  2  2

# Assigning to a column: overwrites it if it exists, otherwise adds a new column
frame['c'] = 100
frame['b'] = 1
print(frame)
#    a  b    c
# 0  0  1  100
# 1  1  1  100
# 2  2  1  100

# Change the value of a single cell (row label, column label)
frame.loc[0, 'a'] = 999
print(frame)
#      a  b    c
# 0  999  1  100
# 1    1  1  100
# 2    2  1  100

# Querying
# Querying a Series
ser = pd.Series([1, 2, 3], index=['a', 'b', 'c'])

# Positional access: explicit iloc, because ser[0] on a string-labelled
# Series relies on a deprecated positional fallback
print("位置索引", ser.iloc[0])
# Label access
print("标签索引", ser['a'])
# Positional slice
print("位置切片", ser.iloc[0:])
# Label slice (the end label is included)
print("标签切片", ser['a':'c'])
# Non-contiguous positions
print("不连续索引", ser.iloc[[0, 2]])
# 位置索引 1
# 标签索引 1
# 位置切片 a    1
# b    2
# c    3
# dtype: int64
# 标签切片 a    1
# b    2
# c    3
# dtype: int64
# 不连续索引 a    1
# c    3
# dtype: int64


# Boolean indexing: build a mask, then select with it
print(ser > 1)
print(ser[ser > 1])
# a    False
# b     True
# c     True
# dtype: bool
# b    2
# c    3
# dtype: int64

# Selecting from a DataFrame
print(frame)
# One column (Series)
print(frame['b'])
# Several columns (DataFrame)
print(frame[['a', 'c']])
# A single value
print(frame.loc[0, 'a'])
# Column first, then row label 0 (works here because the row index is integer)
print(frame['a'][0])

# A plain slice on a DataFrame selects ROWS
print(frame[:2])

高级索引

loc标签索引
iloc位置索引
ix标签与位置混合索引（已在 pandas 1.0 中移除，请改用 loc / iloc）

# loc: label-based selection, rows first and then columns
label_selection = frame.loc[slice(0, 1), slice('a', 'b')]
print(label_selection)

# iloc: position-based selection, rows first and then columns
position_selection = frame.iloc[slice(0, 1), slice(1, 2)]
print(position_selection)

Pandas 对齐运算填充值

import pandas as pd
import numpy as np


# Two Series whose labels only partly overlap ('a' is the only shared label)
s1 = pd.Series(np.arange(5), index=list('abcfg'))
s2 = pd.Series(np.arange(3), index=list('ade'))
print(s1, s2, sep='\n')

# Plain `+` aligns on labels first; a label present on only one side yields NaN
s3 = s1 + s2
# add() with fill_value=0 treats the missing side as 0 instead of producing NaN
s3 = s1.add(s2, fill_value=0)
print(s3)
# Sample output (the integer dtype shown is platform dependent):
# a    0
# b    1
# c    2
# f    3
# g    4
# dtype: int32
# a    0
# d    1
# e    2
# dtype: int32
# a    0.0
# b    1.0
# c    2.0
# d    1.0
# e    2.0
# f    3.0
# g    4.0
# dtype: float64

Pandas 函数应用 apply 和 applymap

# Sample data for this section: 5 rows x 4 columns of random floats.
# The values differ on every run, so the outputs below are only illustrative.
f1 = pd.DataFrame(np.random.randn(5, 4))


def col_max(x):
    """Return the largest value of the Series (one column or one row) passed in."""
    return x.max()


def two_decimals(x):
    """Format a single number as a string with two decimal places."""
    return f"{x:.2f}"


# apply: the function receives a whole column (default, axis=0) ...
print(f1.apply(col_max))
# 0    0.258035
# 1    1.058038
# 2    1.265633
# 3    1.236317
# dtype: float64
# ... or a whole row (axis=1)
print(f1.apply(col_max, axis=1))
# 0    1.058038
# 1    1.265633
# 2    0.989948
# 3    0.839860
# 4    0.518687
# dtype: float64
# Element-wise application: DataFrame.applymap was renamed to DataFrame.map
# in pandas 2.1 (applymap is deprecated), so use whichever one exists.
elementwise = f1.map if hasattr(f1, "map") else f1.applymap
formatted = elementwise(two_decimals)
print(formatted)
#       0     1     2     3
# 0  0.15  1.06  0.76  0.37
# 1  0.15  0.18  1.27  1.24
# 2  0.26  0.92  0.99  0.15
# 3  0.12  0.84  0.53  0.27
# 4  0.01  0.52  0.01  0.27

排序

# Sort by index labels (ascending by default)
ser = pd.Series(range(5), index=list('acdeb'))
print(ser)
# a    0
# c    1
# d    2
# e    3
# b    4
# dtype: int64
ser = ser.sort_index()
print(ser)
# a    0
# b    4
# c    1
# d    2
# e    3
# dtype: int64
# Descending label order
ser = ser.sort_index(ascending=False)
print(ser)
# e    3
# d    2
# c    1
# b    4
# a    0
# dtype: int64
# Sort by the values instead of the labels, largest first
ser = ser.sort_values(ascending=False)
print(ser)

df = pd.DataFrame(np.arange(12).reshape(4, 3), index=list('dabc'), columns=list('BCA'))
print(df)
#    B   C   A
# d  0   1   2
# a  3   4   5
# b  6   7   8
# c  9  10  11

# sort_index sorts the row labels by default
df1 = df.sort_index()
print(df1)
#    B   C   A
# a  3   4   5
# b  6   7   8
# c  9  10  11
# d  0   1   2
# Sort the column labels (axis=1), descending
df1 = df.sort_index(axis=1, ascending=False)
print(df1)
#     C  B   A
# d   1  0   2
# a   4  3   5
# b   7  6   8
# c  10  9  11
# Sort the rows by the values of column 'A', descending
df1 = df1.sort_values(by='A', ascending=False)
print(df1)
#     C  B   A
# c  10  9  11
# b   7  6   8
# a   4  3   5
# d   1  0   2

唯一值和成员属性

# NOTE(review): `ser` is not defined in this snippet; the sample output below
# is consistent with the value-sorted Series from the sorting section — confirm.
print(ser)
# Distinct values, in order of first appearance
print(ser.unique())
# How many times each value occurs
print(ser.value_counts())
# Element-wise membership test against the given list
print(ser.isin([3]))
# Sample output (the output of print(ser) is omitted):
# [4 3 2 1 0]
# 4    1
# 3    1
# 2    1
# 1    1
# 0    1
# dtype: int64
# b    False
# e     True
# d    False
# c    False
# a    False
# dtype: bool

处理缺失数据

# Column 'b' contains one missing value (NaN)
df = pd.DataFrame({'a':[1,2,23,3], 'b':[np.nan, 3, 4, 1]})
print(df)
#     a    b
# 0   1  NaN
# 1   2  3.0
# 2  23  4.0
# 3   3  1.0
# Element-wise test for missing values
print(df.isnull())
#        a      b
# 0  False   True
# 1  False  False
# 2  False  False
# 3  False  False
# dropna drops ROWS containing a missing value by default (returns a new frame)
print(df.dropna())
#     a    b
# 1   2  3.0
# 2  23  4.0
# 3   3  1.0
# axis=1 drops the COLUMNS containing a missing value instead
print(df.dropna(axis=1))
#     a
# 0   1
# 1   2
# 2  23
# 3   3
# Replace missing values with a constant
print(df.fillna(0))
#     a    b
# 0   1  0.0
# 1   2  3.0
# 2  23  4.0
# 3   3  1.0

统计计算

import numpy as np
import pandas as pd

# 5 rows x 4 columns of random floats; the values differ on every run
frame = pd.DataFrame(np.random.randn(5, 4))
print(frame)

# Summary statistics (count, mean, std, min, quartiles, max) per column
print(frame.describe())

# Sample output:
#           0         1         2         3
# 0  0.656203 -1.270162  2.004630 -0.706720
# 1 -1.547710  0.493761  0.150936 -0.688306
# 2  0.199421  0.534954  0.267138  0.788930
# 3 -1.405272  0.402331 -1.520276 -1.268665
# 4  0.412419 -0.316246  0.136897 -0.558747
#               0         1         2         3
# count  5.000000  5.000000  5.000000  5.000000
# mean  -0.336988 -0.031072  0.207865 -0.486702
# std    1.053904  0.774652  1.247660  0.763699
# min   -1.547710 -1.270162 -1.520276 -1.268665
# 25%   -1.405272 -0.316246  0.136897 -0.706720
# 50%    0.199421  0.402331  0.150936 -0.688306
# 75%    0.412419  0.493761  0.267138 -0.558747
# max    0.656203  0.534954  2.004630  0.788930

数据读取与存储

import numpy as np
import pandas as pd

FILE_PATH = "../../tmp_file/only_panads/GS.csv"
# Read the whole CSV into a DataFrame
df = pd.read_csv(FILE_PATH, sep=',')
print(df.shape)
print(df.head())

# read_table with explicit column names, using column 'a' as the row index.
# NOTE(review): when `names` is given and `header` is not, the first line of
# the file is read as data; pass header=0 if GS.csv has a header line — confirm.
df = pd.read_table(FILE_PATH, sep=',', names=['a','b','c','d','e','f','g'], index_col='a')

print(df.shape)
print(df.head())

# Write the first rows back out as CSV
df1 = df.head()
df1.to_csv('../../tmp_file/only_panads/f1.csv')

# Chunked reading: chunksize / iterator return a TextFileReader, not a
# DataFrame. It holds the file open, so use it as a context manager (pandas
# >= 1.2) and keep it under its own name so `df` remains a DataFrame.
with pd.read_csv(FILE_PATH, sep=',', chunksize=10) as reader:
    print(reader.get_chunk())    # next `chunksize` (10) rows
    print(reader.get_chunk(5))   # next 5 rows

with pd.read_csv(FILE_PATH, sep=',', iterator=True) as reader:
    print(reader.get_chunk(5))
    print(reader.get_chunk(10))

数据清洗和准备

# Self-contained sample data so that every call below can actually be run.
df = pd.DataFrame({
    'key': ['Bacon', 'pork', 'bacon', 'Beef', 'Beef'],
    'col1': [1.0, np.nan, 1.0, 2.0, 2.0],
    'col2': [np.nan, np.nan, 3.0, 4.0, 4.0],
})
# Lookup table for the map() examples at the end (keys are lower-case).
meat = {'bacon': 'pig', 'pork': 'pig', 'beef': 'cow'}

# Drop rows in which EVERY value is NaN (this sample has none, nothing is dropped)
print(df.dropna(how='all'))
# thresh=2 keeps only the rows with at least 2 non-NaN values
# (the 'pork' row has a single non-NaN value and is dropped)
print(df.dropna(thresh=2))

# Fill missing values with a constant (returns a new frame)
print(df.fillna(0))
# Per-column fill values: the dict keys are column LABELS
print(df.fillna({'col1': 0.9, 'col2': 0}))
# Fill in place (modifies df, returns None)
df.fillna(0, inplace=True)

# Flag rows that are exact duplicates of an earlier row
print(df.duplicated())
# Drop duplicated rows (the first occurrence is kept)
deduped = df.drop_duplicates()
print(deduped)
# Consider only the given columns when looking for duplicates
deduped_first = df.drop_duplicates(['col1', 'col2'])
print(deduped_first)
# keep='last' keeps the last occurrence instead of the first
deduped_last = df.drop_duplicates(['col1', 'col2'], keep='last')
print(deduped_last)

# Transform data with a mapping: map() looks every value up in the dict
df['col1'] = df['key'].str.lower().map(meat)
# ... or with a function applied to every value
df['col1'] = df['key'].map(lambda x: meat[x.lower()])
print(df)

欢迎分享，转载请注明来源：内存溢出

原文地址:https://54852.com/langs/886072.html

Pandas入门

发表评论

评论列表（0条）