
Series实质是带标签的一维数组,有键(索引)和值
import pandas as pd
# Build one Series from a list with explicit string labels and another
# from a dict (the dict keys become the index).
t1 = pd.Series([1, 2, 3, 4, 5], index=list("abcde"))
temp_dict = {"name": "Lucy", "age": 20, "tel": 10086}
t2 = pd.Series(temp_dict)
# Select by position through .iloc: plain [] with integer keys on a
# string-labelled Series is deprecated and is treated as a label lookup
# in newer pandas releases.
print(t1.iloc[[1]])
# Select several entries by label.
print(t2[["name", "age"]])
# Element-wise comparison yields a boolean Series.
print(t1 > 2)
# The index is iterable and the values are a NumPy array.
list(t1.index)[:2]
type(t1.values)  # numpy.ndarray
b 2
dtype: int64
name Lucy
age 20
dtype: object
a False
b False
c True
d True
e True
dtype: bool
2.pandas读取外部数据
import pandas as pd
# Read the CSV file into a DataFrame with pandas.
df = pd.read_csv("D:/数据分析资料/day04/code/dogNames2.csv")
3.DataFrame的创建
DataFrame对象既有行索引,又有列索引
行索引,表明不同行,横向索引,叫index,0轴,axis=0
列索引,表明不同列,纵向索引,叫columns,1轴,axis=1
import pandas as pd
import numpy as np
# A 3x4 grid holding 0..11, with rows labelled a-c and columns W-Z.
grid = np.arange(12).reshape(3, 4)
df = pd.DataFrame(
    grid,
    index=["a", "b", "c"],
    columns=["W", "X", "Y", "Z"],
)
print(df)
W X Y Z
a 0 1 2 3
b 4 5 6 7
c 8 9 10 11
(DataFrame可以看作一个Series容器)
import pandas as pd
import numpy as np
# Column-oriented input: each key maps to a full column of values.
d1 = {
    "name": ["Lucy", "Lily", "Cindy"],
    "age": [20, 42, 37],
    "tel": ["110", "120", "119"],
}
df1 = pd.DataFrame(d1)

# Row-oriented input: one dict per row; the last row has no "tel" key,
# so that cell is filled with NaN.
d2 = [
    {"name": "Lucy", "age": 20, "tel": 110},
    {"name": "Lily", "age": 42, "tel": 120},
    {"name": "Cindy", "age": 37},
]
df2 = pd.DataFrame(d2)

for frame in (df1, df2):
    print(frame)
name age tel
0 Lucy 20 110
1 Lily 42 120
2 Cindy 37 119
name age tel
0 Lucy 20 110.0
1 Lily 42 120.0
2 Cindy 37 NaN
# Commonly used DataFrame attributes and inspection methods.
df.index #row index
df.columns #column index
df.values #object values
df.shape #shape
df.dtypes #data types
df.ndim #number of dimensions
df.head()
df.tail()
df.info()
df.describe()
4.DataFrame的排序和索引
排序:
import pandas as pd
# Read the CSV file into a DataFrame with pandas.
df = pd.read_csv("D:/daily/大二下/量化/拜师/数据分析资料/day04/code/dogNames2.csv")
# print(df.info())
# print(df.head())
# Sort the DataFrame by the Count_AnimalName column in descending order.
df = df.sort_values(by="Count_AnimalName",ascending=False)
# Selecting rows or columns with square brackets:
# - a numeric slice selects rows (operates on rows)
# - a string selects a column by its label (operates on columns)
print(df[:20]["Row_Labels"])
索引:
df.loc 通过标签索引行数据
df.iloc 通过位置获取行数据
import pandas as pd
import numpy as np
# 3x4 frame holding 0..11, rows labelled a-c and columns W-Z.
df = pd.DataFrame(np.arange(12).reshape(3,4),index=list("abc"),columns=list("WXYZ"))
# loc: label-based selection
print(df.loc["a","Z"])
print(df.loc["a"]) #same as df.loc["a",:]
print(df.loc[:,"Y"])
print(df.loc["a":"c",["W","Y"]]) #label slices include both endpoints
# iloc: position-based selection
print(df.iloc[1,:])
print(df.iloc[1:,[2,3]])
# Assign NaN by position (row 1, column 2); column Y is displayed as
# floats afterwards (2.0, NaN, 10.0).
df.iloc[1,2] = np.nan
print(df)
3
W 0
X 1
Y 2
Z 3
Name: a, dtype: int32
a 2
b 6
c 10
Name: Y, dtype: int32
W Y
a 0 2
b 4 6
c 8 10
W 4
X 5
Y 6
Z 7
Name: b, dtype: int32
Y Z
b 6 7
c 10 11
W X Y Z
a 0 1 2.0 3
b 4 5 NaN 7
c 8 9 10.0 11
布尔索引:
import pandas as pd
df = pd.read_csv("D:/daily/大二下/量化/拜师/数据分析资料/day04/code/dogNames2.csv")
# Boolean indexing: keep rows whose count is strictly between 800 and 1000.
# Each condition is parenthesised and the two masks are combined with &.
print(df[(df["Count_AnimalName"]>800) & (df["Count_AnimalName"]<1000)])
# Split each label on "/" and convert the resulting Series to a list.
print(df["Row_Labels"].str.split("/").tolist()) #tolist() method of a Series
5.缺失数据的处理
import pandas as pd
df = pd.read_csv("D:/数据分析资料/day04/code/dogNames2.csv")
# Element-wise NaN masks (True where missing / True where present).
pd.isnull(df)
pd.notnull(df)
# Option 1: drop rows that contain NaN, e.g.
#   df.dropna(axis=0, how="any", inplace=True)   (or how="all")
# Option 2: fill NaN with each column's mean. mean must be *called*:
# passing the bound method itself makes fillna raise a TypeError.
# numeric_only=True skips non-numeric columns when computing the means.
# fillna returns a new DataFrame; df itself is left unchanged.
df.fillna(df.mean(numeric_only=True))
# Mean of a single column; NaN values are ignored.
df["Count_AnimalName"].mean()
判断数据是否为NaN:pd.isnull(df),pd.notnull(df)
处理方式1:删除NaN所在的行列
dropna (axis=0, how='any', inplace=False)
处理方式2:填充数据
t.fillna(t.mean()), t.fillna(t.median()), t.fillna(0)
处理为0的数据:
t[t==0]=np.nan
当然并不是每次为0的数据都需要处理
计算平均值等情况,nan是不参与计算的,但是0会
import pandas as pd
# Raw string so the Windows backslashes are never read as escape sequences.
file_path = r"D:\daily\大二下\量化\拜师\数据分析资料\day05\code\IMDB-Movie-Data.csv"
df = pd.read_csv(file_path)

# Average rating.
rating_mean = df["Rating"].mean()
print(rating_mean)

# Number of distinct directors (equivalent: len(set(df["Director"].tolist()))).
director_num = len(df["Director"].unique())
print(director_num)

# Number of distinct actors. Each row holds a comma-separated list of names;
# strip surrounding whitespace so "A" and " A" count as the same person
# (presumably the file separates names with ", " -- verify against the data).
temp_actors_list = df["Actors"].str.split(",").tolist()
actors_list = [name.strip() for names in temp_actors_list for name in names]
actors_num = len(set(actors_list))
print(actors_num)
练习:runtime分布情况
import pandas as pd
from matplotlib import pyplot as plt
# Raw string so the Windows backslashes are never read as escape sequences.
file_path = r"D:\daily\大二下\量化\拜师\数据分析资料\day05\code\IMDB-Movie-Data.csv"
df = pd.read_csv(file_path)

# Distribution of movie runtimes, drawn as a histogram.
runtime_data = df["Runtime (Minutes)"].values
max_runtime = runtime_data.max()
min_runtime = runtime_data.min()

# Number of bins for a bin width of 5 minutes; coerced to a plain int
# because a bin count must be an integer.
num_bin = int((max_runtime - min_runtime) // 5)

plt.figure(figsize=(20, 8), dpi=80)
plt.hist(runtime_data, num_bin)
plt.grid()
7.数据的合并和分组聚合
练习:字符串离散化统计
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
# Raw string so the Windows backslashes are never read as escape sequences.
file_path = r"D:\daily\大二下\量化\拜师\数据分析资料\day05\code\IMDB-Movie-Data.csv"
df = pd.read_csv(file_path)

# Count how many movies fall into each genre.
# Each row's Genre is a comma-separated string; split it into a list per movie.
temp_list = df["Genre"].str.split(",").tolist()
genre_list = list({genre for genres in temp_list for genre in genres})

# Build an all-zero frame: one row per movie, one column per genre.
zeros_df = pd.DataFrame(np.zeros((df.shape[0], len(genre_list))), columns=genre_list)
# print(zeros_df)

# Mark with 1 every genre that a movie belongs to. The assignment must be
# indented under the loop; without the indent the snippet does not parse.
for row, genres in enumerate(temp_list):
    zeros_df.loc[row, genres] = 1

# Column sums give the number of movies per genre, sorted largest first.
genre_count = zeros_df.sum(axis=0)
genre_count = genre_count.sort_values(ascending=False)
_x = genre_count.index
_y = genre_count.values

# Bar chart of the genre counts.
plt.figure(figsize=(20, 8), dpi=80)
plt.bar(range(len(_x)), _y)
plt.xticks(range(len(_x)), _x)
思路是新建一个全为零的DataFrame,有该标签的赋值为1,没有的赋值为0。
join:按照行索引合并
merge:按照列索引合并
import pandas as pd
import numpy as np
# Two all-zero frames whose only shared column is "a".
df1 = pd.DataFrame(np.zeros((2, 4)), index=["1", "2"], columns=list("abcd"))
df2 = pd.DataFrame(np.zeros((3, 3)), index=["1", "2", "3"], columns=list("aef"))
# Set the first two columns of df2's second row to 1, so that row's "a"
# value has no counterpart in df1.
df2.iloc[1, [0, 1]] = 1
# Outer merge on "a": unmatched keys are kept and padded with NaN.
merged = df1.merge(df2, on="a", how="outer")
print(merged)
a b c d e f
0 0.0 0.0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0 0.0 0.0
4 1.0 NaN NaN NaN 1.0 0.0
数据分组聚合:groupby方法
统计国家中门店数量:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
# Raw string so the Windows backslashes are never read as escape sequences.
file_path = r"D:\daily\大二下\量化\拜师\数据分析资料\day05\code\starbucks_store_worldwide.csv"
df = pd.read_csv(file_path)

# Group rows by country and count the non-null "Brand" entries per group.
grouped = df.groupby(by="Country")
country_count = grouped["Brand"].count()

# Counts for the "US" and "CN" country codes.
print(country_count["US"])
print(country_count["CN"])
统计中国各省门店数量:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
# Raw string so the Windows backslashes are never read as escape sequences.
file_path = r"D:\daily\大二下\量化\拜师\数据分析资料\day05\code\starbucks_store_worldwide.csv"
df = pd.read_csv(file_path)

# Keep only rows whose country code is "CN", then count per province.
china_data = df[df["Country"] == "CN"]
grouped = china_data.groupby(by="State/Province")["Brand"].count()
print(grouped)
多条件分组:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
# Raw string so the Windows backslashes are never read as escape sequences.
file_path = r"D:\daily\大二下\量化\拜师\数据分析资料\day05\code\starbucks_store_worldwide.csv"
df = pd.read_csv(file_path)

# Grouping by several keys. Selecting a single column with ["Brand"]
# yields a Series.
grouped = df["Brand"].groupby(by=[df["Country"], df["State/Province"]]).count()

# Selecting with a list, [["Brand"]], yields a DataFrame instead.
# The three spellings below are alternatives; each rebinds `grouped`.
grouped = df[["Brand"]].groupby(by=[df["Country"], df["State/Province"]]).count()
grouped = df.groupby(by=[df["Country"], df["State/Province"]])[["Brand"]].count()
grouped = df.groupby(by=[df["Country"], df["State/Province"]]).count()[["Brand"]]
8.数据的索引
简单的索引操作:
- 获取index:df.index
- 指定index:df.index = ['x','y']
- 重新设置index:df.reindex(list("abcedf"))
- 指定某一列作为index:df.set_index("Country",drop=False)
- 返回index的唯一值:df.set_index("Country").index.unique()
index是可迭代对象,可以进行list(df.index)、len(df.index)等操作
练习:
1.星巴克数量最多的十个国家
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
# Raw string so the Windows backslashes are never read as escape sequences.
file_path = r"D:\daily\大二下\量化\拜师\数据分析资料\day05\code\starbucks_store_worldwide.csv"
df = pd.read_csv(file_path)

# Store count per country, largest first, keeping the ten biggest.
grouped = df.groupby("Country")["Brand"].count()
grouped = grouped.sort_values(ascending=False)
top = grouped.head(10)
x = list(top.index)
y = list(top.values)

# Bar chart with the country codes as tick labels.
plt.figure(figsize=(20, 8), dpi=80)
plt.bar(range(len(x)), y)
plt.xticks(range(len(x)), x)
2.星巴克最多的25个中国城市
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from matplotlib import font_manager
# Font used for the tick labels (SimHei, loaded from the Windows fonts folder).
my_font = font_manager.FontProperties(fname="C:/Windows/Fonts/simhei.ttf")
# Raw string so the Windows backslashes are never read as escape sequences.
file_path = r"D:\daily\大二下\量化\拜师\数据分析资料\day05\code\starbucks_store_worldwide.csv"
df = pd.read_csv(file_path)

# Keep only rows whose country code is "CN", count stores per city and
# take the 25 largest.
df = df[df["Country"] == "CN"]
grouped = df.groupby(by="City")["Brand"].count()
grouped = grouped.sort_values(ascending=False)[:25]
x = grouped.index
y = grouped.values

# Horizontal bar chart; the city names go on the y axis and use my_font,
# passed through the documented `fontproperties` keyword.
plt.figure(figsize=(20, 12), dpi=80)
plt.barh(range(len(x)), y, height=0.4, color="orange")
plt.yticks(range(len(x)), x, fontproperties=my_font)
3.不同年份书的数量和平均评分
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from matplotlib import font_manager
my_font = font_manager.FontProperties(fname="C:/Windows/Fonts/simhei.ttf")
# Raw string is required here: in a normal string the "\b" in "\books.csv"
# is a backspace escape, which corrupts the path.
file_path = r"D:\daily\大二下\量化\拜师\数据分析资料\day05\code\books.csv"
df = pd.read_csv(file_path)

# Drop rows without a publication year once; both statistics use the result.
df = df[pd.notnull(df["original_publication_year"])]

# Number of books per publication year.
book_count = df.groupby(by="original_publication_year").count()["title"]

# Average rating per publication year (this is what gets plotted).
grouped = df["average_rating"].groupby(by=df["original_publication_year"]).mean()
x = grouped.index
y = grouped.values

# Line plot; label every 10th position with its year shown as an integer.
plt.figure(figsize=(20, 8), dpi=80)
plt.xticks(list(range(len(x)))[::10], x[::10].astype(int), rotation=45)
plt.plot(range(len(x)), y)
欢迎分享,转载请注明来源:内存溢出
微信扫一扫
支付宝扫一扫
评论列表(0条)