Python数据分析 4.pandas数据科学库

Python数据分析 4.pandas数据科学库,第1张

Python数据分析 4.pandas数据科学库 1.数据类型series

实质是带标签的一维数组,有键和值

import pandas as pd

# Build a Series from a list, supplying explicit string labels as the index.
t1 = pd.Series([1, 2, 3, 4, 5], index=list("abcde"))

# Build a Series from a dict: the keys become the index labels.
temp_dict = {"name": "Lucy", "age": 20, "tel": 10086}
t2 = pd.Series(temp_dict)

# Select by position explicitly with .iloc. Plain [] with integer keys on a
# string-labelled Series relies on a deprecated positional fallback.
print(t1.iloc[[1]])
print(t2[["name", "age"]])  # select several entries by label
print(t1 > 2)  # element-wise comparison yields a boolean Series

list(t1.index)[:2]  # the index is iterable, so it can be converted and sliced
type(t1.values)  # numpy.ndarray
b    2
dtype: int64

name    Lucy
age       20
dtype: object

a    False
b    False
c     True
d     True
e     True
dtype: bool
2.pandas读取外部数据
import pandas as pd

# Load the CSV file from disk into a DataFrame.
df = pd.read_csv(filepath_or_buffer="D:/数据分析资料/day04/code/dogNames2.csv")
3.DataFrame的创建

DataFrame对象既有行索引,又有列索引
行索引,表明不同行,横向索引,叫index,0轴,axis=0
列索引,表明不同列,纵向索引,叫columns,1轴,axis=1

import pandas as pd
import numpy as np

# A 3x4 frame holding the integers 0..11, with rows labelled a-c and
# columns labelled W-Z.
df = pd.DataFrame(
    data=np.arange(12).reshape((3, 4)),
    index=["a", "b", "c"],
    columns=["W", "X", "Y", "Z"],
)

print(df)
   W  X   Y   Z
a  0  1   2   3
b  4  5   6   7
c  8  9  10  11

(DataFrame可以看作一个Series容器)

import pandas as pd
import numpy as np

# Column-oriented input: each key is a column name mapped to that column's values.
d1 = {
    "name": ["Lucy", "Lily", "Cindy"],
    "age": [20, 42, 37],
    "tel": ["110", "120", "119"],
}
df1 = pd.DataFrame(data=d1)

# Row-oriented input: one dict per row. The last row has no "tel" key,
# so that cell is missing in the resulting frame.
d2 = [
    {"name": "Lucy", "age": 20, "tel": 110},
    {"name": "Lily", "age": 42, "tel": 120},
    {"name": "Cindy", "age": 37},
]
df2 = pd.DataFrame(data=d2)

for frame in (df1, df2):
    print(frame)
    name  age  tel
0   Lucy   20  110
1   Lily   42  120
2  Cindy   37  119

    name  age    tel
0   Lucy   20  110.0
1   Lily   42  120.0
2  Cindy   37    NaN
df.index #row index
df.columns #column index
df.values #underlying values
df.shape #shape
df.dtypes #data type of each column
df.ndim #number of dimensions

df.head() #first few rows
df.tail() #last few rows
df.info() #summary of the frame
df.describe() #descriptive statistics
4.DataFrame的排序和索引 排序:
import pandas as pd

# Read the CSV file with pandas
df = pd.read_csv("D:/daily/大二下/量化/拜师/数据分析资料/day04/code/dogNames2.csv")
# print(df.info())
# print(df.head())

# Sort the DataFrame by the Count_AnimalName column in descending order
df = df.sort_values(by="Count_AnimalName",ascending=False)

# Selecting rows or columns with pandas
# - a numeric slice inside the square brackets selects rows (operates on rows)
# - a string inside the square brackets selects a column by its label (operates on columns)
print(df[:20]["Row_Labels"])
索引:

df.loc 通过标签索引行数据
df.iloc 通过位置获取行数据

import pandas as pd
import numpy as np

df = pd.DataFrame(np.arange(12).reshape(3, 4), index=list("abc"), columns=list("WXYZ"))

# loc: label-based selection
print(df.loc["a", "Z"])  # single cell
print(df.loc["a"])  # one row, same as df.loc["a", :]
print(df.loc[:, "Y"])  # one column
print(df.loc["a":"c", ["W", "Y"]])  # label slices include both endpoints

# iloc: position-based selection
print(df.iloc[1, :])
print(df.iloc[1:, [2, 3]])

# NaN is a float, so convert the integer column explicitly before assigning it;
# relying on pandas to upcast the column silently during assignment is deprecated.
df["Y"] = df["Y"].astype(float)
df.iloc[1, 2] = np.nan
print(df)
3

W    0
X    1
Y    2
Z    3
Name: a, dtype: int32

a     2
b     6
c    10
Name: Y, dtype: int32

   W   Y
a  0   2
b  4   6
c  8  10

W    4
X    5
Y    6
Z    7
Name: b, dtype: int32

    Y   Z
b   6   7
c  10  11

   W  X     Y   Z
a  0  1   2.0   3
b  4  5   NaN   7
c  8  9  10.0  11
布尔索引:
import pandas as pd

df = pd.read_csv(filepath_or_buffer="D:/daily/大二下/量化/拜师/数据分析资料/day04/code/dogNames2.csv")

# Boolean indexing: keep the rows whose count is strictly between 800 and 1000.
print(df.loc[df["Count_AnimalName"].gt(800) & df["Count_AnimalName"].lt(1000)])
# Split every label on "/" and convert the resulting Series with tolist().
print(df["Row_Labels"].str.split(pat="/").tolist())
5.缺失数据的处理
import pandas as pd

df = pd.read_csv("D:/数据分析资料/day04/code/dogNames2.csv")

pd.isnull(df)  # boolean frame marking the missing values
pd.notnull(df)  # boolean frame marking the present values
# Drop rows containing missing values (how="all" only drops fully-empty rows):
# df.dropna(axis=0,how="any",inplace=True)

# Fill missing values with the column means. mean() has to be called: fillna
# needs the computed values, not the method object. numeric_only=True keeps
# text columns out of the mean computation.
df.fillna(df.mean(numeric_only=True))
df["Count_AnimalName"].mean()

判断数据是否为NaN:pd.isnull(df),pd.notnull(df)

处理方式1:删除NaN所在的行列

dropna (axis=0, how='any', inplace=False)

处理方式2:填充数据

t.fillna(t.mean()), t.fillna(t.median()), t.fillna(0)

处理为0的数据:

t[t==0]=np.nan

当然并不是每次为0的数据都需要处理
计算平均值等情况,nan是不参与计算的,但是0会

6.pandas常用统计方法
import pandas as pd

# Raw string: the backslashes of the Windows path are kept literally instead of
# being parsed as (invalid) escape sequences.
file_path = r"D:\daily\大二下\量化\拜师\数据分析资料\day05\code\IMDB-Movie-Data.csv"
df = pd.read_csv(file_path)

# Mean rating
rating_mean = df["Rating"].mean()
print(rating_mean)

# Number of distinct directors, shown two ways: via a set of the values and
# via Series.unique()
director_num = len(set(df["Director"].tolist()))
director_num = len(df["Director"].unique())
print(director_num)

# Number of distinct actors: split each comma-separated cast string, flatten
# the per-movie lists, and strip each name so that padding around the
# separator cannot make one actor count twice.
temp_actors_list = df["Actors"].str.split(",").tolist()
actors_list = [i.strip() for j in temp_actors_list for i in j]
actors_num = len(set(actors_list))
print(actors_num)

练习:runtime分布情况

import pandas as pd
from matplotlib import pyplot as plt

# Raw string: the backslashes of the Windows path are kept literally instead of
# being parsed as (invalid) escape sequences.
file_path = r"D:\daily\大二下\量化\拜师\数据分析资料\day05\code\IMDB-Movie-Data.csv"
df = pd.read_csv(file_path)

# Distribution of the runtime column, drawn as a histogram
runtime_data = df["Runtime (Minutes)"].values

max_runtime = runtime_data.max()
min_runtime = runtime_data.min()

# Number of bins: the value range divided by 5 (integer division)
num_bin = (max_runtime - min_runtime) // 5

plt.figure(figsize=(20, 8), dpi=80)
plt.hist(runtime_data, num_bin)

plt.grid()

7.数据的合并和分组聚合

练习:字符串离散化统计

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# Raw string: the backslashes of the Windows path are kept literally instead of
# being parsed as (invalid) escape sequences.
file_path = r"D:\daily\大二下\量化\拜师\数据分析资料\day05\code\IMDB-Movie-Data.csv"
df = pd.read_csv(file_path)

# Split each movie's comma-separated Genre string into a list of genres,
# then collect the distinct genres across all movies.
temp_list = df["Genre"].str.split(",").tolist()
genre_list = list({i for j in temp_list for i in j})

# Build an all-zero DataFrame: one row per movie, one column per genre
zeros_df = pd.DataFrame(np.zeros((df.shape[0], len(genre_list))), columns=genre_list)
# print(zeros_df)

# Set the cell to 1 for every genre a movie belongs to
for i, genres in enumerate(temp_list):
    zeros_df.loc[i, genres] = 1

# Column sums give the number of movies per genre; sort from most to least common
genre_count = zeros_df.sum(axis=0)
genre_count = genre_count.sort_values(ascending=False)

_x = genre_count.index
_y = genre_count.values

# Bar chart of the genre counts
plt.figure(figsize=(20, 8), dpi=80)
plt.bar(range(len(_x)), _y)
plt.xticks(range(len(_x)), _x)

思路是新建一个全为零的DataFrame,有该标签的赋值为1,没有的赋值为0。


数据合并:join,merge

join:按照行索引合并
merge:按照列索引合并

import pandas as pd
import numpy as np

# Two all-zero frames that share only the column "a".
df1 = pd.DataFrame(np.zeros((2, 4)), index=["1", "2"], columns=list("abcd"))
df2 = pd.DataFrame(np.zeros((3, 3)), index=["1", "2", "3"], columns=list("aef"))
# Set the first two cells of df2's second row to 1.
df2.iloc[1, [0, 1]] = 1

# Outer merge on column "a": rows from both frames are kept, and cells with no
# match on the other side come out as NaN.
print(pd.merge(df1, df2, on="a", how="outer"))
     a    b    c    d    e    f
0  0.0  0.0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0  0.0  0.0
3  0.0  0.0  0.0  0.0  0.0  0.0
4  1.0  NaN  NaN  NaN  1.0  0.0
数据分组聚合:groupby方法

统计国家中门店数量:

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# Raw string: the backslashes of the Windows path are kept literally instead of
# being parsed as (invalid) escape sequences.
file_path = r"D:\daily\大二下\量化\拜师\数据分析资料\day05\code\starbucks_store_worldwide.csv"
df = pd.read_csv(file_path)

# Group the rows by country
grouped = df.groupby(by="Country")

# Non-null "Brand" entries per country, used here as the store count
country_count = grouped["Brand"].count()
print(country_count["US"])
print(country_count["CN"])

统计中国各省门店数量:

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# Raw string: the backslashes of the Windows path are kept literally instead of
# being parsed as (invalid) escape sequences.
file_path = r"D:\daily\大二下\量化\拜师\数据分析资料\day05\code\starbucks_store_worldwide.csv"
df = pd.read_csv(file_path)

# Keep only the rows whose Country is "CN", then count the non-null "Brand"
# entries for each State/Province value.
china_data = df[df["Country"] == "CN"]
grouped = china_data.groupby(by="State/Province")["Brand"].count()

print(grouped)

多条件分组:

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# Raw string: the backslashes of the Windows path are kept literally instead of
# being parsed as (invalid) escape sequences.
file_path = r"D:\daily\大二下\量化\拜师\数据分析资料\day05\code\starbucks_store_worldwide.csv"
df = pd.read_csv(file_path)

# Grouping by several keys; selecting the column with single brackets
# (a Series) gives a Series result
grouped = df["Brand"].groupby(by=[df["Country"], df["State/Province"]]).count()

# Grouping by several keys; selecting with double brackets (a one-column
# DataFrame) gives a DataFrame result. Three alternative spellings:
grouped = df[["Brand"]].groupby(by=[df["Country"], df["State/Province"]]).count()
grouped = df.groupby(by=[df["Country"], df["State/Province"]])[["Brand"]].count()
grouped = df.groupby(by=[df["Country"], df["State/Province"]]).count()[["Brand"]]
8.数据的索引

简单的索引操作:

  • 获取index:df.index
  • 指定index :df.index = ["x","y"]
  • 重新设置index : df.reindex(list("abcedf"))
  • 指定某一列作为index:df.set_index("Country",drop=False)
  • 返回index的唯一值:df.set_index("Country").index.unique()

index是可迭代对象,可以进行 list(df.index)、len(df.index) 等操作

练习:

1.星巴克数量最多的十个国家

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# Raw string: the backslashes of the Windows path are kept literally instead of
# being parsed as (invalid) escape sequences.
file_path = r"D:\daily\大二下\量化\拜师\数据分析资料\day05\code\starbucks_store_worldwide.csv"
df = pd.read_csv(file_path)

# Non-null "Brand" entries per country, largest first; keep the top ten
grouped = df.groupby("Country")["Brand"].count()
grouped = grouped.sort_values(ascending=False)
top = grouped.head(10)

x = list(top.index)
y = list(top.values)

# Bar chart with the country codes as tick labels
plt.figure(figsize=(20, 8), dpi=80)
plt.bar(range(len(x)), y)
plt.xticks(range(len(x)), x)


2.星巴克最多的25个中国城市

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from matplotlib import font_manager

# Font loaded from a file so the tick labels can be drawn with it
my_font = font_manager.FontProperties(fname="C:/Windows/Fonts/simhei.ttf")

# Raw string: the backslashes of the Windows path are kept literally instead of
# being parsed as (invalid) escape sequences.
file_path = r"D:\daily\大二下\量化\拜师\数据分析资料\day05\code\starbucks_store_worldwide.csv"
df = pd.read_csv(file_path)
df = df[df["Country"] == "CN"]

# Non-null "Brand" entries per city, largest first; keep the first 25
grouped = df.groupby(by="City")["Brand"].count()
grouped = grouped.sort_values(ascending=False).head(25)

x = grouped.index
y = grouped.values

# Horizontal bars, with the city names as y tick labels
plt.figure(figsize=(20, 12), dpi=80)
plt.barh(range(len(x)), y, height=0.4, color="orange")
plt.yticks(range(len(x)), x, fontproperties=my_font)


3.不同年份书的数量和平均评分

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from matplotlib import font_manager

my_font = font_manager.FontProperties(fname="C:/Windows/Fonts/simhei.ttf")

# Raw string is required here: in a normal string literal the "\b" of
# "\books.csv" is a backspace character, which corrupts the path.
file_path = r"D:\daily\大二下\量化\拜师\数据分析资料\day05\code\books.csv"
df = pd.read_csv(file_path)

# Drop the rows without a publication year once; both statistics use the result
df = df[pd.notnull(df["original_publication_year"])]

# Number of books per year (non-null titles)
grouped = df.groupby(by="original_publication_year")["title"].count()

# Mean rating per year; this rebinds `grouped`, so the plot below shows the ratings
grouped = df["average_rating"].groupby(by=df["original_publication_year"]).mean()

x = grouped.index
y = grouped.values

# Line plot over the years, labelling every 10th position with its year as an integer
plt.figure(figsize=(20, 8), dpi=80)
plt.xticks(list(range(len(x)))[::10], x[::10].astype(int), rotation=45)
plt.plot(range(len(x)), y)

欢迎分享,转载请注明来源:内存溢出

原文地址:https://54852.com/langs/584604.html

(0)
打赏 微信扫一扫微信扫一扫 支付宝扫一扫支付宝扫一扫
上一篇 2022-04-12
下一篇2022-04-12

发表评论

登录后才能评论

评论列表(0条)

    保存