
本人tushare ID:513297
本文基于网络上已有的三因子模型代码构造五因子模型,仅简单实现了Fama-French五因子模型的主要流程,没有考虑各因子之间的Pearson相关系数,因子构造也极为简单,还有诸多其他方面没有考虑。需要注意的是,下文代码实际只构造了MKT、SMB、HML三个因子,尚未包含五因子模型中的RMW(盈利)和CMA(投资)因子。
本文使用了tushare的数据。tushare数据库有着相当丰富可靠的数据,但部分数据的调用需要达到tushare设置的积分门槛;高校学生可以联系tushare管理员免费获得tushare pro的支持,对学生相当友好。由于部分数据未达到积分门槛、无法从tushare调用,故使用CSMAR数据代替。CSMAR的数据需要用户自行到CSMAR官网下载。
from audioop import avg
from operator import concat
import pandas as pd
import statsmodels.api as sm
import tushare as ts
import numpy as np
import matplotlib.pyplot as plt
#获取中证800成分股并剔除ST股票的股票
csmar_data = pd.read_csv("C:/Users/86181/Desktop/data_noST_SH_A/TRD_Dalyr.csv")
csmar_data_continue = pd.read_csv("C:/Users/86181/Desktop/data_noST_SH_A/TRD_Dalyr1.csv")
csmar_data = pd.concat([csmar_data, csmar_data_continue], axis = 0, ignore_index = True)
csmar_data2 = pd.read_csv("C:/Users/86181/Desktop/个股日交易衍生指标222402163/STK_MKT_DALYR.csv")##此处需要导入股票市场系列-股票市场衍生指标-交易衍生指标-个股日交易衍生指标文件
csmar_data2_con1 = pd.read_csv("C:/Users/86181/Desktop/个股日交易衍生指标222402163/STK_MKT_DALYR1.csv")
csmar_data2_con2 = pd.read_csv("C:/Users/86181/Desktop/个股日交易衍生指标222402163/STK_MKT_DALYR2.csv")
csmar_data2 = pd.concat([csmar_data2,csmar_data2_con1],axis=0,ignore_index = True)
csmar_data2 = pd.concat([csmar_data2,csmar_data2_con2],axis=0,ignore_index = True)
csmar_data3 = pd.read_csv("C:/Users/86181/Desktop/指数文件221838293/TRD_Index.csv")##导入csmar股票市场交易-指数信息-指数文件-指数回报率作为市场回报率中证800指数代码000906,数据已经按照日期排序
def getZZ800(start,end):
df1 = pro.index_weight(index_code = "000016.SH",start_date = start, end_date = end)
ZZ800_codes = list(set(df1["con_code"].tolist()))
print("成分股: ",len(ZZ800_codes))
df2 = pro.stock_basic(list_status="L")
df2 = df2[df2["list_date"].apply(int).values<20210501]
df2 = df2[-df2["name"].apply(lambda x:x.startswith("*ST"))]
all_codes = df2["ts_code"].tolist()
stock_codes = []
for i in ZZ800_codes:
if i in all_codes:
stock_codes.append(i[:6])
print("\n股票池:",len(stock_codes))
return stock_codes
def group_stocks(stocks,date):
list_mv = []
df_stocks = pd.DataFrame()
count = 0
date = date[:4]+"-"+date[4:6]+"-"+date[6:]
print("\ndate: ",date,"\n")
for i in stocks:
count += 1
a = csmar_data[(csmar_data["Stkcd"] == int(i)) & (csmar_data["Trddt"] == date)]##Trddt格式为2021-01-23,date格式为20220123 date[:4]+"/"+date[4:6]+"/"+date[6:]
a = a["Dsmvosd"].values
list_mv.append(float(a))
print("第%d支股票流通市值数据成功获取\n" % count)
print("\n所有股票流通市值数据获取完成")
#按市值大小分类
df_stocks["code"] = stocks
df_stocks["mv"] = list_mv
plt.plot(range(0,len(df_stocks)),sorted(list_mv))
plt.show()
df_stocks["SB"] = df_stocks["mv"].map(lambda x: "B" if x > df_stocks["mv"].quantile(0.75) else "S")
#按账面市值比的高中低层次分类
list_bm = []
count = 0
for i in stocks:
count += 1
b = csmar_data2[(csmar_data2["Symbol"] == int(i) )& (csmar_data2["TradingDate"] == date)]
b = 1 / b["PB"].values
list_bm.append(float(b))
print("第%d支股票账面市值比计算完成" % count)
df_stocks["bm"] = list_bm
plt.plot(range(0,len(df_stocks)),sorted(list_bm))
plt.show()
df_stocks["HML"] = df_stocks["bm"].apply(lambda x: "H" if x >= df_stocks["bm"].quantile(0.7)
else ("L" if x <= df_stocks["bm"].quantile(0.3) else "M"))
return df_stocks
def group_returns(stocks, start, end):
SL = stocks[stocks["SB"].isin(["S"])&stocks["HML"].isin(["L"])].code.tolist()
sum_SL = stocks[stocks["SB"].isin(["S"])&stocks["HML"].isin(["L"])]["mv"].sum()
SM = stocks[stocks["SB"].isin(["S"])&stocks["HML"].isin(["M"])].code.tolist()
sum_SM = stocks[stocks["SB"].isin(["S"])&stocks["HML"].isin(["M"])]["mv"].sum()
SH = stocks[stocks["SB"].isin(["S"])&stocks["HML"].isin(["H"])].code.tolist()
sum_SH = stocks[stocks["SB"].isin(["S"])&stocks["HML"].isin(["H"])]["mv"].sum()
BL = stocks[stocks["SB"].isin(["B"])&stocks["HML"].isin(["L"])].code.tolist()
sum_BL = stocks[stocks["SB"].isin(["B"])&stocks["HML"].isin(["L"])]["mv"].sum()
BM = stocks[stocks["SB"].isin(["B"])&stocks["HML"].isin(["M"])].code.tolist()
sum_BM = stocks[stocks["SB"].isin(["B"])&stocks["HML"].isin(["M"])]["mv"].sum()
BH = stocks[stocks["SB"].isin(["B"])&stocks["HML"].isin(["H"])].code.tolist()
sum_BH = stocks[stocks["SB"].isin(["B"])&stocks["HML"].isin(["H"])]["mv"].sum()
groups = [SL,SM,SH,BL,BM,BH]
sums = [sum_SL,sum_SM,sum_SH,sum_BL,sum_BM,sum_BH]
groups_names = ["SL","SM","SH","BL","BM","BH"]
df_groups = pd.DataFrame(columns=groups_names)
count=0
tusharedata_time=[]
for group in groups:
df1 = pd.DataFrame()
for i in range(len(group)):
data = pro.daily(ts_code=group[i]+".SH",start_date=start,end_date=end)
print("datatusharedaily:",data.head(), "\ndatagroup[i]", group[i])
data.sort_values(by="trade_date",inplace=True)
data = data["pct_chg"]*stocks["mv"][i]
df1[group[i]] = data
df_groups[groups_names[count]] = df1.apply(lambda x:x.sum()/sums[count],axis=1)/100
print("%s组计算完成"%groups_names[count])
count += 1
print("\ndf_groups: ",df_groups.head(),"\nlength of dfgroups", len(df_groups))
return df_groups,tusharedata_time
def SMB_HML(data):
data["SMB"] = (data["SL"]+data["SM"]+data["SH"])/3-(data["BL"]+data["BM"]+data["BH"])/3
data["HML"] = (data["SH"]+data["BH"])/2-(data["SL"]+data["BL"])/2
return data
def selection(data,start,end,stocks_codes,tusharedata_time):
MKT = (csmar_data3[(csmar_data3["Indexcd"] == 10) & (csmar_data3["Trddt"] <= "2022-02-28") & (csmar_data3["Trddt"] >= "2020-03-02")]["Retindex"]-0.0137).tolist()
print("\nlen(MKT):",len(MKT))
data["MKT"] = MKT
stockreturn = pd.DataFrame()
print("\ndata.head(): \n",data.head())
for i in range(len(stocks_codes)):
a = pd.DataFrame(csmar_data[(csmar_data["Stkcd"] == int(stocks_codes[i])) & (csmar_data["Trddt"] <= "2022-02-28") & (csmar_data["Trddt"] >= "2020-03-02")])
if len(a) != len(MKT):
continue
a = a["Dretwd"].tolist()
print("\nlen(a): ",len(a))
print("stockscodes: ",stocks_codes[i])
data["%s"%stocks_codes[i]] = a
stockreturn["%s"%stocks_codes[i]] = a
return data,stockreturn
def OLS(df_final,stockreturn):
list_model_analysis = []
results = pd.DataFrame()
stockreturn[np.isnan(stockreturn)] = 0
stockreturn[np.isinf(stockreturn)] = 0
stocks_return = stockreturn
df_final[np.isnan(df_final)] = 0
df_final[np.isinf(df_final)] = 0
print("df_final:\n",df_final.head())
print("stocks_return:\n",stocks_return.head())
for i in range(len(stocks_return.columns)):
x = list(df_final.iloc[:,6])
x1 = list(df_final.iloc[:,7])
x2 = list(df_final.iloc[:,8])#注意index问题
x_ = pd.DataFrame()
x_["SMB"] = x
x_["HML"] = x1
x_["MKT"] = x2
y = stocks_return.iloc[:,i]
X = sm.add_constant(x_)
model = sm.OLS(y,X)
result = model.fit()
list_model_analysis.append(result.rsquared)
results[i] = result.params
results.columns = stocks_return.columns
results.rename(index={"const":"Alpha"},inplace=True)
z =results.sort_values(by="Alpha",axis=1,ascending=False)
print("z:\n",z)
print("\n",max(list_model_analysis),min(list_model_analysis),np.mean(list_model_analysis))
stocks_lists = z.columns.values.tolist()
top_stocks = stocks_lists[:10]
return top_stocks
if __name__ == "__main__":
token = '你的token'
ts.set_token(token)
pro = ts.pro_api()
start_date='20200302'
end_date='20220228'
stocks = getZZ800(start_date, end_date)
group=group_stocks(stocks,end_date)
data,tusharedata_time = group_returns(group, start_date, end_date)
data = SMB_HML(data)
df_final, stockreturn = selection(data, start_date, end_date, stocks,tusharedata_time)
list=OLS(df_final,stockreturn)
print(list)
R方最大值:0.6884104765055692
R方最小值:0.160489405793935
R方均值:0.42275816015434237
Alpha最大的前10只股票代码(按Alpha降序排列)
['600809', '601888', '601012', '600519', '600036', '601166', '600436', '600438', '600309', '601601']
欢迎分享,转载请注明来源:内存溢出
微信扫一扫
支付宝扫一扫
评论列表(0条)