Python数据分析 4.pandas数据科学库

Python数据分析 4.pandas数据科学库 1.数据类型series

实质是带标签的一维数组，有键和值

import pandas as pd

# A Series is a labelled one-dimensional array: every value has an index label.
t1 = pd.Series([1, 2, 3, 4, 5], index=list("abcde"))

# Building a Series from a dict uses the dict keys as the labels.
temp_dict = {"name": "Lucy", "age": 20, "tel": 10086}
t2 = pd.Series(temp_dict)

# Select by position explicitly with .iloc: on a label-indexed Series,
# t1[[1]] only works through a positional fallback that pandas has deprecated.
print(t1.iloc[[1]])
# A list of labels selects several entries at once.
print(t2[["name", "age"]])
# Comparing a Series with a scalar yields a boolean Series.
print(t1 > 2)

list(t1.index)[:2]
type(t1.values)  # numpy.ndarray

b    2
dtype: int64

name    Lucy
age       20
dtype: object

a    False
b    False
c     True
d     True
e     True
dtype: bool

2.pandas读取外部数据

import pandas as pd

# Load the CSV file from local disk into a DataFrame.
df = pd.read_csv(filepath_or_buffer="D:/数据分析资料/day04/code/dogNames2.csv")

3.DataFrame的创建

DataFrame对象既有行索引，又有列索引
行索引，表明不同行，横向索引，叫index，0轴，axis=0
列索引，表明不同列，纵向索引，叫columns，1轴，axis=1

import pandas as pd
import numpy as np

# Build a 3x4 frame holding 0..11, with explicit row labels (index)
# and column labels (columns).
values = np.arange(12).reshape(3, 4)
df = pd.DataFrame(
    data=values,
    index=["a", "b", "c"],
    columns=["W", "X", "Y", "Z"],
)

print(df)

   W  X   Y   Z
a  0  1   2   3
b  4  5   6   7
c  8  9  10  11

（DataFrame可以看作一个Series容器）

import pandas as pd
import numpy as np

# Column-oriented input: one dict whose values are equal-length lists.
d1 = {
    "name": ["Lucy", "Lily", "Cindy"],
    "age": [20, 42, 37],
    "tel": ["110", "120", "119"],
}
df1 = pd.DataFrame(data=d1)

# Row-oriented input: one dict per row; a key missing from a row becomes NaN.
d2 = [
    {"name": "Lucy", "age": 20, "tel": 110},
    {"name": "Lily", "age": 42, "tel": 120},
    {"name": "Cindy", "age": 37},
]
df2 = pd.DataFrame(data=d2)

for frame in (df1, df2):
    print(frame)

    name  age  tel
0   Lucy   20  110
1   Lily   42  120
2  Cindy   37  119

    name  age    tel
0   Lucy   20  110.0
1   Lily   42  120.0
2  Cindy   37    NaN

# Basic attributes of a DataFrame.
df.index #row labels
df.columns #column labels
df.values #the underlying values
df.shape #shape
df.dtypes #data type of each column
df.ndim #number of dimensions

# Quick-look methods.
df.head()  # first rows (5 by default)
df.tail()  # last rows (5 by default)
df.info()  # summary of the index, columns, non-null counts and memory usage
df.describe()  # descriptive statistics of the numeric columns

4.DataFrame的排序和索引

排序：

import pandas as pd

# Load the CSV file and order the rows by count, largest first.
# (df.info() and df.head() are handy for a first look at the loaded data.)
df = pd.read_csv(
    "D:/daily/大二下/量化/拜师/数据分析资料/day04/code/dogNames2.csv"
).sort_values(by="Count_AnimalName", ascending=False)

# Selecting rows or columns with square brackets:
# - a numeric slice inside the brackets selects rows
# - a string inside the brackets selects the column with that label
top_rows = df[:20]
print(top_rows["Row_Labels"])

索引：

df.loc 通过标签索引行数据
df.iloc 通过位置获取行数据

import pandas as pd
import numpy as np

df = pd.DataFrame(np.arange(12).reshape(3, 4), index=list("abc"), columns=list("WXYZ"))

# .loc selects by label.
print(df.loc["a", "Z"])
print(df.loc["a"])  # same as df.loc["a", :]
print(df.loc[:, "Y"])
print(df.loc["a":"c", ["W", "Y"]])  # label slices include both endpoints

# .iloc selects by integer position.
print(df.iloc[1, :])
print(df.iloc[1:, [2, 3]])

# NaN is a float, so convert the target column to float first. Writing NaN
# straight into an integer column relies on silent upcasting, which pandas
# has deprecated.
df = df.astype({"Y": float})
df.iloc[1, 2] = np.nan
print(df)

3

W    0
X    1
Y    2
Z    3
Name: a, dtype: int32

a     2
b     6
c    10
Name: Y, dtype: int32

   W   Y
a  0   2
b  4   6
c  8  10

W    4
X    5
Y    6
Z    7
Name: b, dtype: int32

    Y   Z
b   6   7
c  10  11

   W  X     Y   Z
a  0  1   2.0   3
b  4  5   NaN   7
c  8  9  10.0  11

布尔索引：

import pandas as pd

df = pd.read_csv("D:/daily/大二下/量化/拜师/数据分析资料/day04/code/dogNames2.csv")

# Combine two element-wise conditions with & (each side is a boolean Series),
# then use the combined mask to keep only the matching rows.
counts = df["Count_AnimalName"]
in_range = (counts > 800) & (counts < 1000)
print(df[in_range])

# .str applies string methods element-wise; tolist() turns the Series into a list.
split_labels = df["Row_Labels"].str.split("/")
print(split_labels.tolist())

5.缺失数据的处理

import pandas as pd

df = pd.read_csv("D:/数据分析资料/day04/code/dogNames2.csv")

# Boolean frames marking the missing / present cells.
pd.isnull(df)
pd.notnull(df)
# To drop rows containing NaN instead of filling them:
#   df.dropna(axis=0, how="any", inplace=True)   # how="all": only all-NaN rows

# Fill missing values with each numeric column's mean. mean() has to be
# *called*: passing the bound method df.mean makes fillna raise a TypeError.
# numeric_only=True skips text columns, for which a mean is undefined.
df.fillna(df.mean(numeric_only=True))
df["Count_AnimalName"].mean()

判断数据是否为NaN：pd.isnull(df),pd.notnull(df)

处理方式1：删除NaN所在的行列

dropna (axis=0, how='any', inplace=False)

处理方式2：填充数据

t.fillna(t.mean()), t.fillna(t.median()), t.fillna(0)

处理为0的数据：

t[t==0]=np.nan

当然并不是每次为0的数据都需要处理
计算平均值等情况，nan是不参与计算的，但是0会

6.pandas常用统计方法

import pandas as pd

# Raw string: a Windows path is full of backslashes, and in a normal string
# literal sequences such as "\d" or "\c" are (invalid) escape sequences.
file_path = r"D:\daily\大二下\量化\拜师\数据分析资料\day05\code\IMDB-Movie-Data.csv"
df = pd.read_csv(file_path)

# Mean rating
rating_mean = df["Rating"].mean()
print(rating_mean)

# Number of distinct directors, computed two equivalent ways
director_num = len(set(df["Director"].tolist()))
director_num = len(df["Director"].unique())
print(director_num)

# Number of distinct actors: each cell is a comma-separated list of names.
temp_actors_list = df["Actors"].str.split(",").tolist()
# Strip each name so that surrounding spaces (e.g. from a ", " separator) do
# not make the same actor count twice. NOTE(review): presumably the file uses
# ", " between names; stripping is harmless either way.
actors_list = [name.strip() for names in temp_actors_list for name in names]
actors_num = len(set(actors_list))
print(actors_num)

练习：runtime分布情况

import pandas as pd
from matplotlib import pyplot as plt

# Raw string: a Windows path is full of backslashes, and in a normal string
# literal sequences such as "\d" or "\c" are (invalid) escape sequences.
file_path = r"D:\daily\大二下\量化\拜师\数据分析资料\day05\code\IMDB-Movie-Data.csv"
df = pd.read_csv(file_path)

# Distribution of runtimes: a histogram is the natural chart for this.
runtime_data = df["Runtime (Minutes)"].values

max_runtime = runtime_data.max()
min_runtime = runtime_data.min()

# Number of bins, aiming at a bin width of about 5 minutes
num_bin = (max_runtime - min_runtime) // 5

plt.figure(figsize=(20, 8), dpi=80)
plt.hist(runtime_data, num_bin)

plt.grid()

7.数据的合并和分组聚合

练习：字符串离散化统计

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# Raw string: a Windows path is full of backslashes, and in a normal string
# literal sequences such as "\d" or "\c" are (invalid) escape sequences.
file_path = r"D:\daily\大二下\量化\拜师\数据分析资料\day05\code\IMDB-Movie-Data.csv"
df = pd.read_csv(file_path)

# Count how many movies fall into each genre.
# Each "Genre" cell is a comma-separated list; collect the distinct genres.
temp_list = df["Genre"].str.split(",").tolist()
genre_list = list({genre for genres in temp_list for genre in genres})

# An all-zero frame: one row per movie, one column per genre.
zeros_df = pd.DataFrame(np.zeros((df.shape[0], len(genre_list))), columns=genre_list)

# Put a 1 in every (movie, genre) cell where the movie has that genre.
# The row position doubles as the row label because zeros_df has a default index.
for row, genres in enumerate(temp_list):
    zeros_df.loc[row, genres] = 1

# Column sums are the per-genre movie counts; show the largest first.
genre_count = zeros_df.sum(axis=0)
genre_count = genre_count.sort_values(ascending=False)

_x = genre_count.index
_y = genre_count.values

# Bar chart of the counts
plt.figure(figsize=(20, 8), dpi=80)
plt.bar(range(len(_x)), _y)
plt.xticks(range(len(_x)), _x)

思路是新建一个全为零的DataFrame，有该标签的赋值为1，没有的赋值为0。

数据合并：join，merge

join：按照行索引合并
merge：按照指定列（键）的值合并

import pandas as pd
import numpy as np

# Two all-zero frames whose only common column is "a".
df1 = pd.DataFrame(np.zeros((2, 4)), index=["1", "2"], columns=list("abcd"))
df2 = pd.DataFrame(np.zeros((3, 3)), index=["1", "2", "3"], columns=list("aef"))
# Set the first two columns ("a" and "e") of the second row to 1.
df2.iloc[1, 0:2] = 1

# An outer merge on "a" keeps the key values of both frames;
# columns that have no matching row on the other side are filled with NaN.
merged = df1.merge(right=df2, how="outer", on="a")
print(merged)

     a    b    c    d    e    f
0  0.0  0.0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0  0.0  0.0
3  0.0  0.0  0.0  0.0  0.0  0.0
4  1.0  NaN  NaN  NaN  1.0  0.0

数据分组聚合：groupby方法

统计国家中门店数量：

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# Raw string: a Windows path is full of backslashes, and in a normal string
# literal sequences such as "\d" or "\s" are (invalid) escape sequences.
file_path = r"D:\daily\大二下\量化\拜师\数据分析资料\day05\code\starbucks_store_worldwide.csv"
df = pd.read_csv(file_path)

# Group the rows by country code.
grouped = df.groupby(by="Country")

# count() on one column gives the number of non-null entries per group.
country_count = grouped["Brand"].count()
print(country_count["US"])
print(country_count["CN"])

统计中国各省门店数量：

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# Raw string: a Windows path is full of backslashes, and in a normal string
# literal sequences such as "\d" or "\s" are (invalid) escape sequences.
file_path = r"D:\daily\大二下\量化\拜师\数据分析资料\day05\code\starbucks_store_worldwide.csv"
df = pd.read_csv(file_path)

# Keep only the rows for China, then count the entries per state/province.
china_data = df[df["Country"] == "CN"]
grouped = china_data.groupby(by="State/Province")["Brand"].count()

print(grouped)

多条件分组：

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# Raw string: a Windows path is full of backslashes, and in a normal string
# literal sequences such as "\d" or "\s" are (invalid) escape sequences.
file_path = r"D:\daily\大二下\量化\拜师\数据分析资料\day05\code\starbucks_store_worldwide.csv"
df = pd.read_csv(file_path)

# Grouping a single column (a Series) by several keys returns a Series.
grouped = df["Brand"].groupby(by=[df["Country"], df["State/Province"]]).count()

# Selecting with a list of columns ([["Brand"]]) keeps the result a DataFrame.
# Three equivalent spellings:
grouped = df[["Brand"]].groupby(by=[df["Country"], df["State/Province"]]).count()
grouped = df.groupby(by=[df["Country"], df["State/Province"]])[["Brand"]].count()
grouped = df.groupby(by=[df["Country"], df["State/Province"]]).count()[["Brand"]]

8.数据的索引

简单的索引操作：

获取index：df.index
指定index ：df.index = ['x','y']
重新设置index : df.reindex(list("abcedf"))
指定某一列作为index：df.set_index("Country",drop=False)
返回index的唯一值：df.set_index("Country").index.unique()

index是可迭代对象，可以进行 list(df.index)、len(df.index) 等操作

练习：

1.星巴克数量最多的十个国家

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# Raw string: a Windows path is full of backslashes, and in a normal string
# literal sequences such as "\d" or "\s" are (invalid) escape sequences.
file_path = r"D:\daily\大二下\量化\拜师\数据分析资料\day05\code\starbucks_store_worldwide.csv"
df = pd.read_csv(file_path)

# Entries per country, largest first; keep the top ten.
grouped = df.groupby("Country")["Brand"].count()
grouped = grouped.sort_values(ascending=False)
top = grouped.head(10)

x = list(top.index)
y = list(top.values)

# Bar chart: one bar per country, labelled with the country code.
plt.figure(figsize=(20, 8), dpi=80)
plt.bar(range(len(x)), y)
plt.xticks(range(len(x)), x)

2.星巴克最多的25个中国城市

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from matplotlib import font_manager

# Font file used for the tick labels (city names are passed as labels below).
my_font = font_manager.FontProperties(fname = "C:/Windows/Fonts/simhei.ttf")

# Raw string: a Windows path is full of backslashes, and in a normal string
# literal sequences such as "\d" or "\s" are (invalid) escape sequences.
file_path = r"D:\daily\大二下\量化\拜师\数据分析资料\day05\code\starbucks_store_worldwide.csv"
df = pd.read_csv(file_path)
df = df[df["Country"] == "CN"]

# Entries per city, largest first; keep the top 25.
grouped = df.groupby(by="City")["Brand"].count()
grouped = grouped.sort_values(ascending=False)[:25]

x = grouped.index
y = grouped.values

# Horizontal bars leave room for the city-name labels on the y axis.
plt.figure(figsize=(20, 12), dpi=80)
plt.barh(range(len(x)), y, height=0.4, color="orange")
plt.yticks(range(len(x)), x, font=my_font)

3.不同年份书的数量和平均评分

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from matplotlib import font_manager

my_font = font_manager.FontProperties(fname = "C:/Windows/Fonts/simhei.ttf")

# Raw string is required here: in a normal string literal the "\b" of
# "\books.csv" is a backspace character, which silently corrupts the path.
file_path = r"D:\daily\大二下\量化\拜师\数据分析资料\day05\code\books.csv"
df = pd.read_csv(file_path)

# Rows without a publication year cannot be grouped by year; drop them once.
df = df[pd.notnull(df["original_publication_year"])]

# Number of books per publication year
grouped = df.groupby(by="original_publication_year").count()["title"]

# Mean rating per publication year (this is the series plotted below)
grouped = df["average_rating"].groupby(by=df["original_publication_year"]).mean()

x = grouped.index
y = grouped.values

# Line chart over the years; label every tenth tick to keep the axis readable.
plt.figure(figsize=(20, 8), dpi=80)
plt.xticks(list(range(len(x)))[::10], x[::10].astype(int), rotation=45)
plt.plot(range(len(x)), y)

欢迎分享，转载请注明来源：内存溢出

原文地址:https://54852.com/langs/584604.html

Python数据分析 4.pandas数据科学库

发表评论

评论列表（0条）