数据科学:时间序列(一)—— ETS 预估 DAU

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import itertools
import warnings

import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import cm
import matplotlib as mpl
import matplotlib.pyplot as plt
import datetime
from matplotlib.ticker import FuncFormatter
from dateutil.parser import parse
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.stats.diagnostic import acorr_ljungbox
import statsmodels.tsa.stattools as ts
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.api import ExponentialSmoothing
from mpl_toolkits.mplot3d import Axes3D
from datetime import datetime, timedelta
warnings.filterwarnings("ignore")
# Register pandas' datetime converters with matplotlib
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
# Render figures inline in the notebook
%matplotlib inline
# Seed NumPy's random generator with a fixed value derived from the string
np.random.seed(sum(map(ord,"aesthetics")))
# Turn off scientific notation: display floats with two decimals
pd.set_option('display.float_format',lambda x : '%.2f' % x)
# Use the SimHei font so Chinese text can be displayed
mpl.rcParams['font.sans-serif'] = ['SimHei']
mpl.rcParams['font.serif'] = ['SimHei']
mpl.rcParams['axes.formatter.useoffset'] = False
# Work around the minus sign '-' being rendered as a box in saved figures
plt.rcParams['axes.unicode_minus'] = False
# Apply the same font choice to Seaborn's style so Chinese text displays there too
sns.set_style("darkgrid",{"font.sans-serif":['SimHei', 'Arial']})
# Figure resolution, both on screen and when saved
plt.rcParams['savefig.dpi'] = 150
plt.rcParams['figure.dpi'] = 150

# plt.gcf().autofmt_xdate()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
def plot_2d(data, title, figsize=(18,5)):
    """Open a new figure of size ``figsize``, set its title and plot ``data``.

    ``data`` only needs a ``.plot()`` method (the notebook passes pandas
    Series); the drawing itself is delegated to that method.
    """
    plt.figure(figsize=figsize)
    plt.title(title)
    data.plot()

def plot_3d_surface(X, Y, Z, figsize=(18,10)):
    """Draw a 3-D surface of ``Z * 100`` over the ``X`` / ``Y`` grid and show it.

    Args:
        X: 2-D grid of x coordinates (labelled 'N days').
        Y: 2-D grid of y coordinates (labelled 'date').
        Z: 2-D array of values; multiplied by 100 and labelled 'ratio(%)'.
        figsize: figure size passed to ``plt.figure``.
    """
    fig = plt.figure(figsize=figsize)
    # Create the 3-D axes through the figure: constructing Axes3D(fig)
    # directly no longer attaches the axes to the figure on recent
    # matplotlib releases, which leaves the plot empty.
    ax = fig.add_subplot(111, projection='3d')
    # Colormap reference: https://matplotlib.org/3.2.1/tutorials/colors/colormaps.html
    # Other candidates: rainbow, coolwarm, jet, PiYG
    surf = ax.plot_surface(X, Y, Z * 100,
                           rstride=1, cstride=1,
                           cmap='coolwarm',
                           linewidth=0.1,
                           shade=True,
                           alpha=0.8,
                           # fixed colour scale from -1 to 40 (in the plotted percent units)
                           norm=mpl.colors.Normalize(vmin=-1., vmax=40)
                           )

    ax.set_xlabel('N days')
    ax.set_ylabel('date')
    ax.set_zlabel('ratio(%)')
    fig.colorbar(surf, shrink=0.6, aspect=6)
    plt.show()

def df_show(df, m=5, n=5):
    """Print the shape of ``df`` and return a preview of its two ends.

    Args:
        df: pandas DataFrame or Series.
        m: number of leading rows to keep.
        n: number of trailing rows to keep.

    Returns:
        The first ``m`` rows followed by the last ``n`` rows. When the two
        slices would overlap (``m + n >= len(df)``) every row is returned
        exactly once instead of being duplicated.
    """
    print('shape={}'.format(df.shape))
    if m >= 0 and n >= 0 and m + n >= len(df):
        # head and tail overlap: concatenating them would repeat rows
        return df.copy()
    return pd.concat([df.head(m), df.tail(n)])

原理

  • $k$:k 天前
  • $r_k$:k 天前新用户第 k 日留存
  • $s_k$:k 天前新用户占比
  • $K$:历史新老用户分界线,比如将 2019 年之前的所有用户当做老用户,之后新增的用户看做新用户

数据

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
def get_dau(path='./data/0610_dau.xlsx', usecols=['f_date', 'dau'], index_col=0, parse_dates=['f_date']):
    """Load the daily DAU series from an Excel sheet.

    The ``dau`` column is multiplied by 10000 and returned as integers on a
    daily ``DatetimeIndex``.

    Args:
        path: Excel file to read.
        usecols: columns to load; must include ``dau`` and the date column.
        index_col: position of the date column used as index.
        parse_dates: columns parsed as dates.

    Returns:
        pd.Series of int DAU values indexed by date with ``freq='D'``.
    """
    frame = pd.read_excel(io=path, usecols=usecols, index_col=index_col, parse_dates=parse_dates)
    data = frame.dau * 10000
    # freq='D' makes pandas reject an index that is not a regular daily sequence
    data.index = pd.DatetimeIndex(data.index, freq='D')
    # Round before casting: astype(int) truncates, so a product such as
    # 2959.9999999999995 would otherwise be stored as 2959.
    return data.round().astype(int)

def get_dau_arima(path='./data/0610_dau_arima.xlsx', usecols=['f_date', 'DAU'], index_col=0, parse_dates=['f_date']):
    """Load the ``DAU`` column of an Excel sheet as an integer daily series.

    Returns:
        pd.Series of int values indexed by date with ``freq='D'``.
    """
    sheet = pd.read_excel(io=path, usecols=usecols, index_col=index_col, parse_dates=parse_dates)
    series = sheet.DAU
    # freq='D' makes pandas reject an index that is not a regular daily sequence
    series.index = pd.DatetimeIndex(series.index, freq='D')
    return series.astype(int)

def get_dau_new(path='./data/0610_dau_new.xlsx', usecols=['f_date', 'f_dau_new'], index_col=0, parse_dates=['f_date']):
    """Load the ``f_dau_new`` column (daily new users) of an Excel sheet.

    Returns:
        pd.Series indexed by date with ``freq='D'``; values are left as read.
    """
    sheet = pd.read_excel(io=path, usecols=usecols, index_col=index_col, parse_dates=parse_dates)
    series = sheet.f_dau_new
    # freq='D' makes pandas reject an index that is not a regular daily sequence
    series.index = pd.DatetimeIndex(series.index, freq='D')
    return series


def get_retention_new(path='./data/0610_retention_new.xlsx',
                      usecols=['f_visit_day', 'f_remain_days', 'f_ratio'],
                      parse_dates=['f_visit_day']):
    """Load new-user retention and reshape it into a date x day-N matrix.

    The sheet is read in long form (one row per ``f_visit_day`` /
    ``f_remain_days`` pair). Duplicate pairs keep their last occurrence, the
    table is pivoted so each ``f_remain_days`` value becomes a column of
    ``f_ratio``, and only rows from 2020-01-01 onward are returned.

    Returns:
        pd.DataFrame indexed by visit day (``freq='D'``), one column per
        ``f_remain_days`` value.
    """
    key_cols = ['f_visit_day', 'f_remain_days']
    raw = pd.read_excel(io=path, usecols=usecols, parse_dates=parse_dates)
    deduped = raw.drop_duplicates(key_cols, keep='last')
    wide = deduped.set_index(key_cols).unstack().sort_index()
    ratios = wide.f_ratio
    # freq='D' makes pandas reject an index that is not a regular daily sequence
    ratios.index = pd.DatetimeIndex(ratios.index, freq='D')
    # The start of the kept history is hard-coded
    ratios = ratios.loc['2020-01-01':]
    return ratios


def get_retention_old(path='./data/0610_retention_old.xlsx', usecols=['f_date', 'f_ratio'],
                      index_col=0, parse_dates=['f_date']):
    """Load the ``f_ratio`` column (old-user retention) of an Excel sheet.

    Returns:
        pd.Series indexed by date with ``freq='D'``.
    """
    sheet = pd.read_excel(io=path, usecols=usecols, index_col=index_col, parse_dates=parse_dates)
    series = sheet.f_ratio
    # freq='D' makes pandas reject an index that is not a regular daily sequence
    series.index = pd.DatetimeIndex(series.index, freq='D')
    return series

小程序 DAU

1
2
# Load the DAU series and plot it from 2020-03-01 onward
dau = get_dau()
plot_2d(dau.loc['2020-03-01':], u'DAU')

1
# Preview the first 10 and last 10 rows of the DAU series
df_show(dau, 10, 10)
shape=(524,)

f_date
2019-01-04       2960
2019-01-05       2318
2019-01-06       2274
2019-01-07       2601
2019-01-08       2520
2019-01-09       2514
2019-01-10       2315
2019-01-11       2179
2019-01-12       1947
2019-01-13       1813
2020-06-01    3994272
2020-06-02    3546456
2020-06-03    3248834
2020-06-04    3129509
2020-06-05    3216982
2020-06-06    2895258
2020-06-07    2926685
2020-06-08    3023940
2020-06-09    2911481
2020-06-10    2737987
Name: dau, dtype: int64

新增用户

1
2
# Load the daily new-user series and preview both ends
s_dau_new = get_dau_new()
df_show(s_dau_new)
shape=(161,)





f_date
2020-01-01    183431
2020-01-02    175389
2020-01-03    172991
2020-01-04    157629
2020-01-05    158287
2020-06-05    440420
2020-06-06    419283
2020-06-07    409955
2020-06-08    422092
2020-06-09    412235
Name: f_dau_new, dtype: int64
1
2
# Plot the whole series; slice with .loc['2020-03-01':] instead to start from March
plot_2d(s_dau_new.loc[:], u'新增用户')

新用户留存

1
2
# Load the new-user retention matrix (visit day x retention day) and preview it
df_ret_new = get_retention_new()
df_show(df_ret_new)
shape=(160, 120)
f_remain_days 1 2 3 4 5 6 7 8 9 10 ... 111 112 113 114 115 116 117 118 119 120
f_visit_day
2020-01-01 0.02 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.00 0.01 ... 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01
2020-01-02 0.02 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 ... 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01
2020-01-03 0.02 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 ... 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01
2020-01-04 0.02 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 ... 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01
2020-01-05 0.02 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 ... 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01
2020-06-04 0.04 0.02 0.01 0.01 0.01 nan nan nan nan nan ... nan nan nan nan nan nan nan nan nan nan
2020-06-05 0.04 0.02 0.01 0.01 nan nan nan nan nan nan ... nan nan nan nan nan nan nan nan nan nan
2020-06-06 0.03 0.02 0.01 nan nan nan nan nan nan nan ... nan nan nan nan nan nan nan nan nan nan
2020-06-07 0.03 0.02 nan nan nan nan nan nan nan nan ... nan nan nan nan nan nan nan nan nan nan
2020-06-08 0.04 nan nan nan nan nan nan nan nan nan ... nan nan nan nan nan nan nan nan nan nan

10 rows × 120 columns

1
2
3
4
5
6
7
# Restrict the surface plot to cohorts from 2020-02-10 onward
df_ret_new_plot = df_ret_new.loc['2020-02-10':]
# X axis: the column labels (retention day N)
X = df_ret_new_plot.columns
# Y axis: each cohort date as whole days since the earliest plotted cohort
Y = (df_ret_new_plot.index - df_ret_new_plot.index.min()).map(lambda x: x.days)
X, Y = np.meshgrid(X,Y)
# Z: the retention ratios as a float array (plot_3d_surface scales them by 100)
Z = df_ret_new_plot.values.astype(np.double)

plot_3d_surface(X, Y, Z, figsize=(18, 8))

老用户留存

1
2
# Load the old-user retention series and preview both ends
s_ret_old = get_retention_old()
df_show(s_ret_old)
shape=(101,)





f_date
2020-03-01   0.05
2020-03-02   0.05
2020-03-03   0.04
2020-03-04   0.04
2020-03-05   0.04
2020-06-05   0.01
2020-06-06   0.01
2020-06-07   0.01
2020-06-08   0.01
2020-06-09   0.01
Name: f_ratio, dtype: float64
1
# Plot the old-user retention series
plot_2d(s_ret_old,u'老用户留存')

模型

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240

class MyEts:
    """Forecast total DAU from new-user volume and retention curves.

    ``predict`` builds a retention-ratio matrix (one row per cohort date, one
    column per day offset), multiplies each row by that cohort's user count
    (the old-user base counts as the cohort of the day before ``base_date``)
    and sums the anti-diagonals that land on each forecast date.

    NOTE(review): ``ets`` passes the keyword names ``damped``,
    ``smoothing_slope`` and ``damping_slope`` to statsmodels; newer
    statsmodels releases appear to have renamed these -- confirm against the
    installed version before upgrading.
    """

    def __init__(self, s_dau_new, df_ret_new, s_ret_old, base_date, pred_start_date,
                 n_forcast=7, dau_old=273385370, n_train=90):
        """
        Args:
            s_dau_new: pd.Series, daily new-user counts
            df_ret_new: pd.DataFrame, new-user retention matrix (cohort date x day N)
            s_ret_old: pd.Series, old-user retention ratio per date
            base_date: str or Timestamp, boundary between old and new users
                (earliest new-user cohort date)
            pred_start_date: str or Timestamp, requested first forecast date
            n_forcast: int, requested number of forecast days
            dau_old: total number of old users
            n_train: training-window length in days
        """
        self.base_date = pd.to_datetime(base_date)
        self.pred_start_date_origin = pd.to_datetime(pred_start_date)
        self.pred_end_date_origin = self.pred_start_date_origin + timedelta(n_forcast - 1)

        # Align the tail of all inputs: the last usable history day is the
        # earliest of the three data ends and the day before the requested
        # start. The retention matrix is compared one day later, matching
        # the one-day-earlier slice taken from it below.
        max_dau_new_date = s_dau_new.index.max()
        max_ret_new_date = df_ret_new.index.max() + timedelta(1)
        max_ret_old_date = s_ret_old.index.max()
        end_date = min(max_dau_new_date, max_ret_new_date, max_ret_old_date,
                       self.pred_start_date_origin - timedelta(1))

        self.s_dau_new = s_dau_new.loc[: end_date]
        self.df_ret_new = df_ret_new.loc[: end_date - timedelta(1)]
        self.s_ret_old = s_ret_old.loc[: end_date]
        self.pred_start_date = end_date + timedelta(1)
        self.pred_end_date = self.pred_end_date_origin
        # The horizon is measured from the realigned start so the forecast
        # still reaches the requested end date. (Previously this value was
        # overwritten with the raw ``n_forcast`` argument, which left the
        # forecast short whenever the data ended early.)
        self.n_forcast = (self.pred_end_date - self.pred_start_date).days + 1

        self.n_train = n_train
        self.n_offset = (self.pred_start_date - self.base_date).days

        self.dau_old = dau_old
        self.ratio_matrix = None
        self.dau_matrix = None
        self.df_result = pd.DataFrame(index=pd.date_range(self.pred_start_date, self.pred_end_date))

    def ets(self, s_train, trend='add', seasonal='add', seasonal_periods=7, damped=False,
            smoothing_level=None, smoothing_slope=None, smoothing_seasonal=None, damping_slope=None,
            use_boxcox=False, remove_bias=False):
        """
        Fit exponential smoothing on ``s_train`` and forecast ``self.n_forcast`` steps.

        Args:
            s_train: training Series
            trend: trend component type
            seasonal: seasonal component type
            seasonal_periods: length of one season
            damped, smoothing_level, smoothing_slope, smoothing_seasonal,
            damping_slope, use_boxcox, remove_bias: passed through to
                ``ExponentialSmoothing`` / its ``fit``

        Returns:
            Series of the forecast values.
        """
        model = ExponentialSmoothing(s_train,
                                     trend=trend,
                                     seasonal=seasonal,
                                     seasonal_periods=seasonal_periods,
                                     damped=damped).fit(smoothing_level=smoothing_level,
                                                        smoothing_slope=smoothing_slope,
                                                        smoothing_seasonal=smoothing_seasonal,
                                                        damping_slope=damping_slope,
                                                        use_boxcox=use_boxcox,
                                                        remove_bias=remove_bias)
        pred_result = model.forecast(self.n_forcast)
        return pred_result

    def arima(self, train, pdq, pdqs, trend):
        """
        Fit a SARIMAX model on the log of ``train`` (from ``base_date`` on)
        and return the forecast for the prediction window on the original scale.

        Args:
            train: training Series (values must be positive for the log)
            pdq: non-seasonal order (p, d, q)
            pdqs: seasonal order (P, D, Q, s)
            trend: trend specification passed to SARIMAX
        """
        train_log = np.log(train.loc[self.base_date:])
        model = sm.tsa.statespace.SARIMAX(train_log,
                                          order=pdq,
                                          seasonal_order=pdqs,
                                          trend=trend,
                                          measurement_error=False,
                                          time_varying_regression=False,
                                          mle_regression=True,
                                          simple_differencing=False,
                                          enforce_stationarity=True,
                                          enforce_invertibility=True,
                                          hamilton_representation=False,
                                          concentrate_scale=False
                                          ).fit()

        pred_result = model.predict(self.pred_start_date, self.pred_end_date, dynamic=True, typ='levels')
        # undo the log transform
        return np.exp(pred_result)

    def pred_dau_new(self, trend=None, seasonal='add', seasonal_periods=7, damped=False, use_boxcox=False, model='ets',
                     smoothing_level=None, smoothing_slope=None, smoothing_seasonal=None, damping_slope=None):
        """
        Forecast new-user counts and return history plus forecast as one Series.

        The old-user total ``dau_old`` is inserted at the day before
        ``base_date`` so it acts as the "day zero" cohort.

        Raises:
            ValueError: if ``model`` is neither 'ets' nor 'arima'.
        """
        s_train_data = self.s_dau_new
        if model == 'ets':
            s_pred_data = self.ets(s_train_data,
                                   trend=trend,
                                   seasonal=seasonal,
                                   seasonal_periods=seasonal_periods,
                                   damped=damped,
                                   use_boxcox=use_boxcox,
                                   smoothing_level=smoothing_level,
                                   smoothing_slope=smoothing_slope,
                                   smoothing_seasonal=smoothing_seasonal,
                                   damping_slope=damping_slope,
                                   remove_bias=True)
        elif model == 'arima':
            s_pred_data = self.arima(s_train_data, pdq=[1, 2, 0], pdqs=[3, 0, 2, 7], trend='c')
        else:
            # ValueError is still an Exception, so existing handlers keep working
            raise ValueError('请输入正确的model: ets 或者 arima')
        s_pred_data = pd.concat([s_train_data, s_pred_data]).loc[self.base_date: self.pred_end_date]
        s_pred_data[pd.to_datetime(self.base_date) - timedelta(1)] = self.dau_old
        s_pred_data = s_pred_data.sort_index()
        return s_pred_data

    def pred_ret_new(self, trend='add', seasonal='add', seasonal_periods=7, damped=False, use_boxcox=False,
                     smoothing_level=None, smoothing_slope=None, smoothing_seasonal=None, damping_slope=None):
        """
        Forecast day-N retention of new users for every N.

        Returns:
            DataFrame indexed by cohort date from ``base_date`` to
            ``pred_end_date`` with columns ``0 .. n_offset + n_forcast - 1``;
            column 0 (day-0 retention) is the constant 1.
        """
        pred_matrix = pd.DataFrame(index=pd.date_range(self.base_date, self.pred_end_date))

        # TODO: validate that the source data covers every N used here
        for n in range(self.n_offset + self.n_forcast):
            if n == 0:
                pred_matrix[n] = 1
                continue
            # Day-N retention is only observed for cohorts at least N days
            # old, so the history for column n ends n + 1 days before the
            # forecast start. min() keeps the window starting no later than
            # base_date.
            train_start_date = min(self.pred_start_date - timedelta(n + self.n_train), self.base_date)
            train_end_date = self.pred_start_date - timedelta(n + 1)
            s_train_data = self.df_ret_new.loc[train_start_date:train_end_date, n]
            s_pred_data = self.ets(s_train_data,
                                   trend=trend,
                                   seasonal=seasonal,
                                   seasonal_periods=seasonal_periods,
                                   damped=damped,
                                   use_boxcox=use_boxcox,
                                   smoothing_level=smoothing_level,
                                   smoothing_slope=smoothing_slope,
                                   smoothing_seasonal=smoothing_seasonal,
                                   damping_slope=damping_slope,
                                   remove_bias=True)
            # column assignment aligns the series on the matrix index
            pred_matrix[n] = pd.concat([s_train_data, s_pred_data]).sort_index()
        # sort_index returns a new frame; the result used to be discarded
        pred_matrix = pred_matrix.sort_index(axis=1)
        return pred_matrix

    def pred_ret_old(self, trend='add', seasonal='add', seasonal_periods=7, damped=False, use_boxcox=False,
                     smoothing_level=0.5, smoothing_slope=None, smoothing_seasonal=None, damping_slope=None):
        """
        Forecast old-user retention and return it as a one-row DataFrame.

        The row is indexed by the day before ``base_date`` and has one column
        per day offset from that date (offset 0 is set to 1), i.e. the same
        layout as one row of the new-user retention matrix.
        """
        s_train_data = self.s_ret_old.copy()
        s_pred_data = self.ets(s_train_data,
                               trend=trend,
                               seasonal=seasonal,
                               seasonal_periods=seasonal_periods,
                               damped=damped,
                               use_boxcox=use_boxcox,
                               smoothing_level=smoothing_level,
                               smoothing_slope=smoothing_slope,
                               smoothing_seasonal=smoothing_seasonal,
                               damping_slope=damping_slope,
                               remove_bias=True)
        min_index = pd.to_datetime(self.base_date) - timedelta(1)
        # day-0 retention of the old-user "cohort" (set after fitting, on the copy)
        s_train_data[min_index] = 1
        result = pd.concat([s_train_data, s_pred_data]).sort_index()
        result = result.loc[min_index: self.pred_end_date].to_frame('ratio')
        result['date'] = min_index
        result['offset'] = (result.index - min_index).map(lambda x: x.days)
        result = result.pivot(index='date', columns='offset', values='ratio').rename_axis(index=None, columns=None)
        result = result.sort_index().sort_index(axis=1)
        return result

    def predict(self):
        """
        Forecast total DAU for every day of the prediction window.

        Returns:
            ``self.df_result`` with a ``total_dau`` column.
        """
        df_dau_new = self.pred_dau_new(trend='add',
                                       seasonal='add',
                                       seasonal_periods=7,
                                       damped=True,
                                       use_boxcox=False,
                                       model='ets',
                                       smoothing_level=0.8,
                                       smoothing_slope=0.9,
                                       smoothing_seasonal=None,
                                       damping_slope=0.85)
        df_ratio_new = self.pred_ret_new(trend='add',
                                         seasonal='add',
                                         seasonal_periods=7,
                                         damped=False,
                                         use_boxcox=False,
                                         smoothing_level=0.9,
                                         smoothing_slope=0.7,
                                         smoothing_seasonal=None,
                                         damping_slope=0.75)

        df_ratio_old = self.pred_ret_old()

        self.ratio_matrix = pd.concat([df_ratio_old, df_ratio_new]).sort_index().sort_index(axis=1)
        # scale every cohort row by that cohort's user count
        self.dau_matrix = self.ratio_matrix.mul(df_dau_new, axis='index')

        # Cell (cohort date, offset) contributes to calendar day
        # cohort date + offset, so each anti-diagonal is one calendar day.
        # After the left-right flip, diagonal d is the day d days before
        # pred_end_date; iterating d downward yields the window in order.
        flipped = np.fliplr(np.array(self.dau_matrix))
        self.df_result['total_dau'] = [np.sum(np.diag(flipped, d))
                                       for d in range(self.n_forcast - 1, -1, -1)]
        return self.df_result

    @classmethod
    def rmse(cls, predictions, targets):
        """
        Root of the mean squared *relative* error between two series.

        Args:
            predictions: predicted values
            targets: actual values (used as the denominator)

        Returns:
            sqrt(mean(((predictions - targets) / targets) ** 2))
        """
        return np.sqrt((((predictions - targets) / targets) ** 2).mean())

    @classmethod
    def predict_compare(cls, predictions, targets, title):
        """
        Plot actuals against predictions, print the relative RMSE and return
        a comparison table.

        Args:
            predictions: predicted Series
            targets: actual Series; its ``name`` prefixes the output columns
            title: suffix for the plot title

        Returns:
            DataFrame with ``<name>_origin``, ``<name>_predict``,
            ``<name>_diff`` and ``<name>_rate`` columns.
        """
        compare = pd.concat([targets, predictions], axis=1)
        colname_origin, colname_predict = '{}_origin'.format(targets.name), '{}_predict'.format(targets.name)
        compare.columns = [colname_origin, colname_predict]
        compare['{}_diff'.format(targets.name)] = compare[colname_predict] - compare[colname_origin]
        compare['{}_rate'.format(targets.name)] = compare['{}_diff'.format(targets.name)] / compare[colname_origin]

        plt.figure(figsize=(18, 8))
        plt.plot(compare.index, compare[colname_origin], label=colname_origin)
        plt.plot(compare.index, compare[colname_predict], label=colname_predict)
        plt.legend()
        plt.title(u'预估效果对比:{}'.format(title))
        plt.xticks(rotation=30)
        # reindex tolerates forecast days that have no actual yet (they become
        # NaN and are skipped by mean()); plain label indexing raises KeyError
        # for missing labels on current pandas.
        rmse_rate = cls.rmse(predictions, targets.reindex(predictions.index))
        print('RMSE={}'.format(rmse_rate))
        return compare
1
2
3
4
5
6
# base_date: boundary between old and new users; pred_start_date: first forecast day
base_date, pred_start_date = pd.to_datetime('2020-03-01'), pd.to_datetime('2020-06-01')
n_forcast = 10
# Last forecast day, counting pred_start_date as the first of n_forcast days
pred_end_date = pred_start_date + timedelta(n_forcast-1)
myets = MyEts(s_dau_new, df_ret_new, s_ret_old,
base_date=base_date, pred_start_date=pred_start_date, n_forcast=n_forcast)
# Echo the forecast window as the cell output
pred_start_date, pred_end_date
(Timestamp('2020-06-01 00:00:00'), Timestamp('2020-06-10 00:00:00'))

新增用户

ARIMA

平稳性检验
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
def judge_stationarity(data_sanya_one):
    """
    Run the ADF test on a series and report whether it looks stationary.

    Prints the raw ``adfuller`` result and a labelled summary. The series is
    flagged stationary (1) only when the test statistic is not greater than
    any of the reported critical values; otherwise 0 is returned.
    """
    adf_result = ts.adfuller(data_sanya_one)
    print(adf_result)

    labels = ['Test Statistic','p-value','#Lags Used','Number of Observations Used']
    summary = pd.Series(adf_result[0:4], index=labels)
    statistic, critical_values = adf_result[0], adf_result[4]
    for level, threshold in critical_values.items():
        summary['Critical Value (%s)'%level] = threshold

    exceeds_any = any(statistic > threshold for threshold in critical_values.values())
    is_stationary = 0 if exceeds_any else 1

    print(summary)
    print("数据是否平稳(1/0): %d" %(is_stationary))
    return is_stationary

def season_resolve(data):
    """
    Seasonal decomposition: observed = trend + seasonal + residual.

    Plots the decomposition, prints the ADF p-value of the seasonal
    component, then runs ``judge_stationarity`` on the residual (NaNs
    dropped). Nothing is returned.
    """
    parts = seasonal_decompose(data)

    figure = parts.plot()
    figure.set_size_inches(6, 4)

    seasonal_p_value = ts.adfuller(parts.seasonal)[1]
    print("test: p={}".format(seasonal_p_value))
    # Is the residual stationary?
    judge_stationarity(parts.resid.dropna())

# Log-transform new-user DAU from 2020-03-01, then take the first and second differences
s_dau_new_log = np.log(myets.s_dau_new.loc['2020-03-01':])
s_dau_new_log_diff = s_dau_new_log.diff().dropna()
s_dau_new_log_diff2 = s_dau_new_log_diff.diff().dropna()
# Decompose and test the undifferenced log series
season_resolve(s_dau_new_log)
test: p=0.0
(-3.387682117502709, 0.01138602363842863, 11, 74, {'5%': -2.9014701097664504, '1%': -3.5219803175527606, '10%': -2.58807215485756}, -100.45009341551648)
Test Statistic                -3.39
p-value                        0.01
#Lags Used                    11.00
Number of Observations Used   74.00
Critical Value (5%)           -2.90
Critical Value (1%)           -3.52
Critical Value (10%)          -2.59
dtype: float64
数据是否平稳(1/0): 0

1
# Same check on the first difference of the log series
season_resolve(s_dau_new_log_diff)
test: p=0.0
(-5.650643581278351, 9.878845977099614e-07, 9, 75, {'5%': -2.9009249540740742, '1%': -3.520713130074074, '10%': -2.5877813777777776}, -88.7065598940126)
Test Statistic                -5.65
p-value                        0.00
#Lags Used                     9.00
Number of Observations Used   75.00
Critical Value (5%)           -2.90
Critical Value (1%)           -3.52
Critical Value (10%)          -2.59
dtype: float64
数据是否平稳(1/0): 1

1
# Same check on the second difference of the log series
season_resolve(s_dau_new_log_diff2)
test: p=0.0
(-5.898783625957804, 2.8100827285187457e-07, 11, 72, {'5%': -2.9026070739026064, '1%': -3.524624466842421, '10%': -2.5886785262345677}, -68.75760108315257)
Test Statistic                -5.90
p-value                        0.00
#Lags Used                    11.00
Number of Observations Used   72.00
Critical Value (5%)           -2.90
Critical Value (1%)           -3.52
Critical Value (10%)          -2.59
dtype: float64
数据是否平稳(1/0): 1

自相关图
1
2
3
4
5
6
7
8
9
10
11
12
def plot_acf_pacf(df_list):
    """
    Plot the ACF and PACF (40 lags) of each series, one row per series.

    Args:
        df_list: list of series; row ``i`` shows the ACF on the left and the
            PACF on the right for ``df_list[i]``.
    """
    n = len(df_list)
    if n == 0:
        # nothing to draw; avoid creating a zero-height figure
        return
    plt.figure(figsize=(16, 3 * n))
    plt.subplots_adjust(wspace =0.1, hspace =0.3)
    for row, series in enumerate(df_list):
        # (nrows, ncols, index) form: the packed three-digit code used before
        # (100*n + 20 + index) stops working once the index reaches 10.
        plot_acf(series, lags=40, title=u'ACF', ax=plt.subplot(n, 2, 2 * row + 1))
        plot_pacf(series, lags=40, title=u'PACF', ax=plt.subplot(n, 2, 2 * row + 2))
1
# ACF / PACF of the log series and of its first difference
plot_acf_pacf([s_dau_new_log, s_dau_new_log_diff])

网格搜索
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
def parameter_selection(df, ps=(0, 1, 2), ds=(2,), qs=(0, 1, 2)):
    """
    Grid-search the ARIMA order (p, d, q) and keep the one with the lowest AIC.

    Args:
        df: series to fit.
        ps, ds, qs: candidate values for p, d and q.

    Returns:
        ``(best_order, best_aic)``; ``(None, inf)`` when no candidate could
        be fitted.
    """
    best_aic = float('inf')
    best_order = None
    for order in itertools.product(ps, ds, qs):
        try:
            aic = ARIMA(df, order).fit().aic
        except Exception:
            # Best-effort search: some orders cannot be fitted to the data,
            # so those candidates are skipped instead of aborting the grid.
            continue
        # The score is the AIC (it used to be printed under a "BIC" label)
        print('{} - AIC:{}'.format(order, aic))
        if aic < best_aic:
            best_aic = aic
            best_order = order
    print('最优参数:x{} - AIC:{}'.format(best_order, best_aic))
    return best_order, best_aic
1
# Search p and q over 0..3; d keeps the function's default candidate list
matrix = parameter_selection(s_dau_new_log, ps=range(4), qs=range(4))
(0, 2, 0) - BIC:15.5504937045
(0, 2, 1) - BIC:-3.90439313959
(0, 2, 2) - BIC:-16.7008738132
(1, 2, 0) - BIC:13.7741898435
(1, 2, 1) - BIC:-13.1288961798
(1, 2, 2) - BIC:-14.7089870806
(1, 2, 3) - BIC:-13.2074940281
(2, 2, 0) - BIC:0.12566262446
(2, 2, 1) - BIC:-14.3782928385
(2, 2, 2) - BIC:-13.0656053424
(2, 2, 3) - BIC:-13.1196434498
(3, 2, 0) - BIC:-1.77302826664
(3, 2, 1) - BIC:-13.5275309286
(3, 2, 2) - BIC:-10.9182600295
(3, 2, 3) - BIC:-10.6767970671
最优参数:x(0, 2, 2) - AIC:-16.7008738132
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
def max_node(p, d, q, P, D, Q, s):
    """Return ``d + D*s + max(3q+1, 3Qs+1, p, Ps) + 1``.

    ``get_arima_params`` prints this value as the number of observations a
    candidate order "needs" -- presumably a minimum-sample-size rule of
    thumb; confirm its origin before relying on it.
    """
    differencing_cost = d + D * s
    longest_lag_term = max(3 * q + 1, 3 * Q * s + 1, p, P * s)
    return differencing_cost + longest_lag_term + 1
def get_arima_params(data, pdq, ps=(0, 1, 2), ds=(0, 1, 2), qs=(0, 1, 2), m=7):
    """
    Grid-search the seasonal order of a SARIMAX model by AIC.

    Args:
        data: series to fit.
        pdq: fixed non-seasonal order ``(p, d, q)``.
        ps, ds, qs: candidate values for the seasonal P, D and Q.
        m: seasonal period.

    Returns:
        ``(best_seasonal_order, best_aic)`` where the order is
        ``(P, D, Q, m)``; ``(None, inf)`` when no candidate could be fitted.
    """
    p, d, q = pdq
    best_seasonal = None
    best_aic = float('inf')
    warnings.filterwarnings("ignore")
    for P, D, Q in itertools.product(ps, ds, qs):
        param_seasonal = (P, D, Q, m)
        try:
            results = sm.tsa.statespace.SARIMAX(data,
                                                order=pdq,
                                                seasonal_order=param_seasonal,
                                                enforce_stationarity=False,
                                                enforce_invertibility=False).fit()
        except Exception as e:
            # Best-effort search: report the failing candidate and move on
            print('error:{}'.format(e))
            continue
        print('x{} - AIC:{} need {} observations'.format(
            param_seasonal, results.aic, max_node(p, d, q, P, D, Q, m)))
        if results.aic < best_aic:
            best_aic = results.aic
            best_seasonal = param_seasonal
    # The best candidate is tracked in plain locals: the previous version
    # unpacked it into ``results.aic`` and failed with NameError when no
    # model had been fitted.
    print('最优参数:x{} - AIC:{}'.format(best_seasonal, best_aic))
    return best_seasonal, best_aic
1
2
# Non-seasonal order is fixed; search seasonal P in 0..3, D and Q in 0..2, period 7
pdq = [0, 2, 2]
get_arima_params(s_dau_new_log, pdq, range(4),range(3), range(3), 7)
x(0, 0, 0, 7) - AIC:-15.9235194034 need 10 observations
x(0, 0, 1, 7) - AIC:-14.7113328488 need 25 observations
x(0, 0, 2, 7) - AIC:-29.4064147183 need 46 observations
x(0, 1, 0, 7) - AIC:63.48723362 need 17 observations
x(0, 1, 1, 7) - AIC:-2.14367734565 need 32 observations
x(0, 1, 2, 7) - AIC:-11.8848525187 need 53 observations
x(0, 2, 0, 7) - AIC:141.898368919 need 24 observations
x(0, 2, 1, 7) - AIC:39.8814708492 need 39 observations
x(0, 2, 2, 7) - AIC:2.67172770386 need 60 observations
x(1, 0, 0, 7) - AIC:-13.7461303101 need 10 observations
x(1, 0, 1, 7) - AIC:-13.4284053199 need 25 observations
x(1, 0, 2, 7) - AIC:-30.6083506043 need 46 observations
x(1, 1, 0, 7) - AIC:26.5931599614 need 17 observations
x(1, 1, 1, 7) - AIC:11.54952856 need 32 observations
x(1, 1, 2, 7) - AIC:-11.5907261101 need 53 observations
x(1, 2, 0, 7) - AIC:79.078108805 need 24 observations
x(1, 2, 1, 7) - AIC:47.7740298619 need 39 observations
x(1, 2, 2, 7) - AIC:6.90057268677 need 60 observations
x(2, 0, 0, 7) - AIC:-41.7769269787 need 17 observations
x(2, 0, 1, 7) - AIC:-40.1960951619 need 25 observations
x(2, 0, 2, 7) - AIC:-37.2401608236 need 46 observations
x(2, 1, 0, 7) - AIC:-9.33829251277 need 24 observations
x(2, 1, 1, 7) - AIC:-19.2066580221 need 32 observations
x(2, 1, 2, 7) - AIC:-22.4909020767 need 53 observations
x(2, 2, 0, 7) - AIC:25.240615073 need 31 observations
x(2, 2, 1, 7) - AIC:18.7858056712 need 39 observations
x(2, 2, 2, 7) - AIC:11.9538869508 need 60 observations
x(3, 0, 0, 7) - AIC:-38.0001167775 need 24 observations
x(3, 0, 1, 7) - AIC:-45.2099169457 need 25 observations
x(3, 0, 2, 7) - AIC:-49.2523157208 need 46 observations
x(3, 1, 0, 7) - AIC:-32.9888239169 need 31 observations
x(3, 1, 1, 7) - AIC:-31.4343836764 need 32 observations
x(3, 1, 2, 7) - AIC:-30.8665326957 need 53 observations
x(3, 2, 0, 7) - AIC:8.55482266627 need 38 observations
x(3, 2, 1, 7) - AIC:-4.72304771546 need 39 observations
x(3, 2, 2, 7) - AIC:-3.35845958847 need 60 observations
最优参数:x(3, 0, 2, 7) - AIC:-49.2523157208
1
2
3
4
5
6
7
# Fit the seasonal ARIMA on new-user DAU and forecast the prediction window
pred_dau_arima = myets.arima(train=myets.s_dau_new,
                             pdq=[0, 2, 2],
                             pdqs=[3, 0, 2, 7],
                             trend='t')
# The compared series is new-user DAU (not day-N retention), so the plot
# title is labelled accordingly.
pred_dau_arima_diff = myets.predict_compare(pred_dau_arima.loc[pred_start_date: pred_end_date],
                                            s_dau_new.loc['2020-03-01':], u'新增用户DAU-ARIMA')
RMSE=0.078732475591

1
# Show the comparison rows of the forecast window, dropping rows with missing values
df_show(pred_dau_arima_diff.loc[pred_start_date: pred_end_date], 0, 14).dropna()
shape=(10, 4)
f_dau_new_origin f_dau_new_predict f_dau_new_diff f_dau_new_rate
f_date
2020-06-01 588936.00 566249.56 -22686.44 -0.04
2020-06-02 507322.00 529802.47 22480.47 0.04
2020-06-03 459898.00 516113.69 56215.69 0.12
2020-06-04 434520.00 486794.93 52274.93 0.12
2020-06-05 440420.00 451129.71 10709.71 0.02
2020-06-06 419283.00 455463.38 36180.38 0.09
2020-06-07 409955.00 433574.34 23619.34 0.06
2020-06-08 422092.00 398749.40 -23342.60 -0.06
2020-06-09 412235.00 374202.33 -38032.67 -0.09

ETS

1
2
3
4
5
6
7
8
9
10
11
12
# Forecast new-user DAU with damped additive-trend, additive-seasonal ETS
pred_dau_new = myets.pred_dau_new(trend='add',
                                  seasonal='add',
                                  seasonal_periods=7,
                                  damped=True,
                                  use_boxcox=False,
                                  model='ets',
                                  smoothing_level=0.8,
                                  smoothing_slope=0.9,
                                  smoothing_seasonal=None,
                                  damping_slope=0.85)
# The compared series is new-user DAU (not day-N retention), so the plot
# title is labelled accordingly.
pred_dau_new_diff = myets.predict_compare(pred_dau_new.loc[pred_start_date: pred_end_date],
                                          s_dau_new.loc['2020-03-01':], u'新增用户DAU-ETS')
RMSE=0.0654776482142

1
# Show the comparison rows of the forecast window, dropping rows with missing values
df_show(pred_dau_new_diff.loc[pred_start_date: pred_end_date],0,14).dropna()
shape=(10, 4)
f_dau_new_origin f_dau_new_predict f_dau_new_diff f_dau_new_rate
2020-06-01 588936.00 563557.69 -25378.31 -0.04
2020-06-02 507322.00 518941.51 11619.51 0.02
2020-06-03 459898.00 517735.65 57837.65 0.13
2020-06-04 434520.00 486268.62 51748.62 0.12
2020-06-05 440420.00 463959.35 23539.35 0.05
2020-06-06 419283.00 431672.77 12389.77 0.03
2020-06-07 409955.00 417944.87 7989.87 0.02
2020-06-08 422092.00 416701.86 -5390.14 -0.01
2020-06-09 412235.00 394114.05 -18120.95 -0.04

新用户留存

1
# Preview the first 3 and last 10 cohorts of the new-user retention matrix
df_show(df_ret_new,3,10)
shape=(160, 120)
f_remain_days 1 2 3 4 5 6 7 8 9 10 ... 111 112 113 114 115 116 117 118 119 120
f_visit_day
2020-01-01 0.02 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.00 0.01 ... 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01
2020-01-02 0.02 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 ... 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01
2020-01-03 0.02 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 ... 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01
2020-05-30 0.11 0.05 0.03 0.02 0.01 0.01 0.01 0.01 0.01 0.01 ... nan nan nan nan nan nan nan nan nan nan
2020-05-31 0.07 0.03 0.02 0.01 0.01 0.01 0.01 0.01 0.01 nan ... nan nan nan nan nan nan nan nan nan nan
2020-06-01 0.06 0.03 0.02 0.01 0.01 0.01 0.01 0.01 nan nan ... nan nan nan nan nan nan nan nan nan nan
2020-06-02 0.05 0.02 0.02 0.01 0.01 0.01 0.01 nan nan nan ... nan nan nan nan nan nan nan nan nan nan
2020-06-03 0.05 0.02 0.01 0.01 0.01 0.01 nan nan nan nan ... nan nan nan nan nan nan nan nan nan nan
2020-06-04 0.04 0.02 0.01 0.01 0.01 nan nan nan nan nan ... nan nan nan nan nan nan nan nan nan nan
2020-06-05 0.04 0.02 0.01 0.01 nan nan nan nan nan nan ... nan nan nan nan nan nan nan nan nan nan
2020-06-06 0.03 0.02 0.01 nan nan nan nan nan nan nan ... nan nan nan nan nan nan nan nan nan nan
2020-06-07 0.03 0.02 nan nan nan nan nan nan nan nan ... nan nan nan nan nan nan nan nan nan nan
2020-06-08 0.04 nan nan nan nan nan nan nan nan nan ... nan nan nan nan nan nan nan nan nan nan

13 rows × 120 columns

新用户第 N 日留存预估

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
def predict_ret_new_n(n):
    """Forecast the day-``n`` retention series of new users with ETS.

    The training slice is column ``n`` of ``myets.df_ret_new`` for the
    dates running from ``pred_start_date - (n + n_train)`` days through
    ``pred_start_date - (n + 1)`` days.

    Args:
        n: Retention day offset; also used as the column label selected
            from ``myets.df_ret_new``.

    Returns:
        Whatever ``myets.ets`` returns for the training slice
        (presumably the predicted series -- TODO confirm against ``ets``).
    """
    anchor_date = myets.pred_start_date
    window_first = anchor_date - timedelta(n + myets.n_train)
    # NOTE(review): the window stops n + 1 days before the forecast start,
    # presumably because later cohorts have no observed day-n value yet.
    window_last = anchor_date - timedelta(n + 1)
    history = myets.df_ret_new.loc[window_first:window_last, n]

    # Damped additive trend plus additive seasonality with a period of 7.
    ets_options = dict(
        trend='add',
        seasonal='add',
        seasonal_periods=7,
        damped=True,
        use_boxcox=False,
        smoothing_level=0.8,
        smoothing_slope=0.75,
        smoothing_seasonal=None,
        damping_slope=0.76,
    )
    return myets.ets(history, **ets_options)

# n = 18
# Forecast day-1 and day-2 new-user retention and compare each forecast
# with the observed values from 2020-03-01 onward.
for n in range(1, 3):
    # Observed day-n retention, without the cohorts that have no value yet.
    s_ret_new_n = df_ret_new.loc[:, n].dropna()
    pred_ret_new_n = predict_ret_new_n(n)
    pred_dau_new_diff = myets.predict_compare(
        pred_ret_new_n,
        s_ret_new_n.loc['2020-03-01':],
        n,
    )
RMSE=0.227393637754
RMSE=1.19606960639

新用户第 N 日留存预估矩阵

1
2
3
4
5
6
7
8
9
# Build the forecast new-user retention matrix: no trend component,
# additive seasonality with a period of 7.
pred_ret_new_ratio = myets.pred_ret_new(
    trend=None,
    seasonal='add',
    seasonal_periods=7,
    damped=False,
    use_boxcox=False,
    smoothing_level=0.9,
    smoothing_slope=0.7,
    smoothing_seasonal=None,
    damping_slope=0.75,
)
1
# Preview the forecast new-user retention matrix; judging by the printed
# output, this shows the first 3 and the last 15 rows.
df_show(pred_ret_new_ratio,3,15)
shape=(102, 102)
0 1 2 3 4 5 6 7 8 9 ... 92 93 94 95 96 97 98 99 100 101
2020-03-01 1 0.14 0.06 0.04 0.03 0.02 0.02 0.02 0.02 0.01 ... 0.00 0.00 0.00 0.00 0.01 0.00 0.01 0.01 0.00 0.00
2020-03-02 1 0.10 0.05 0.03 0.03 0.02 0.02 0.02 0.01 0.01 ... 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 nan
2020-03-03 1 0.08 0.04 0.03 0.02 0.02 0.02 0.02 0.02 0.01 ... 0.01 0.01 0.00 0.00 0.00 0.00 0.01 0.01 nan nan
2020-05-27 1 0.18 0.08 0.04 0.02 0.02 0.01 0.01 0.01 0.01 ... nan nan nan nan nan nan nan nan nan nan
2020-05-28 1 0.17 0.06 0.03 0.03 0.02 0.02 0.01 0.01 0.01 ... nan nan nan nan nan nan nan nan nan nan
2020-05-29 1 0.14 0.06 0.04 0.03 0.02 0.02 0.01 0.01 0.01 ... nan nan nan nan nan nan nan nan nan nan
2020-05-30 1 0.11 0.07 0.04 0.03 0.02 0.02 0.01 0.01 0.01 ... nan nan nan nan nan nan nan nan nan nan
2020-05-31 1 0.11 0.06 0.04 0.03 0.02 0.01 0.01 0.01 0.01 ... nan nan nan nan nan nan nan nan nan nan
2020-06-01 1 0.11 0.06 0.04 0.03 0.02 0.01 0.01 0.01 0.01 ... nan nan nan nan nan nan nan nan nan nan
2020-06-02 1 0.11 0.06 0.04 0.02 0.02 0.01 0.01 0.01 nan ... nan nan nan nan nan nan nan nan nan nan
2020-06-03 1 0.11 0.06 0.03 0.02 0.02 0.01 0.01 nan nan ... nan nan nan nan nan nan nan nan nan nan
2020-06-04 1 0.11 0.06 0.03 0.03 0.02 0.02 nan nan nan ... nan nan nan nan nan nan nan nan nan nan
2020-06-05 1 0.12 0.06 0.04 0.03 0.02 nan nan nan nan ... nan nan nan nan nan nan nan nan nan nan
2020-06-06 1 0.11 0.07 0.04 0.03 nan nan nan nan nan ... nan nan nan nan nan nan nan nan nan nan
2020-06-07 1 0.11 0.06 0.04 nan nan nan nan nan nan ... nan nan nan nan nan nan nan nan nan nan
2020-06-08 1 0.11 0.06 nan nan nan nan nan nan nan ... nan nan nan nan nan nan nan nan nan nan
2020-06-09 1 0.11 nan nan nan nan nan nan nan nan ... nan nan nan nan nan nan nan nan nan nan
2020-06-10 1 nan nan nan nan nan nan nan nan nan ... nan nan nan nan nan nan nan nan nan nan

18 rows × 102 columns

老用户留存

老用户第 N 日留存率预估

1
2
3
4
5
6
7
8
9
10
11
12
13
14
# Fit ETS (additive trend, additive seasonality with a period of 7) on the
# old-user retention series, then compare the forecast with the observed
# values from 2020-03-01 onward.
# NOTE(review): the original comment here read "transpose", but nothing in
# this cell transposes data.
s_pred_data = myets.ets(
    myets.s_ret_old,
    trend='add',
    seasonal='add',
    seasonal_periods=7,
    damped=False,
    use_boxcox=False,
    smoothing_level=0.5,
    smoothing_slope=None,
    smoothing_seasonal=None,
    damping_slope=None,
)
pred_ret_old_result = myets.predict_compare(
    s_pred_data,
    s_ret_old.loc['2020-03-01':],
    u'老用户第N日留存-ETS',
)
RMSE=0.127854181665

1
# Preview the old-user comparison table; judging by the printed output,
# this shows no leading rows and the last 15 rows.
df_show(pred_ret_old_result,0,15)
shape=(102, 4)
f_ratio_origin f_ratio_predict f_ratio_diff f_ratio_rate
f_date
2020-05-27 0.01 nan nan nan
2020-05-28 0.01 nan nan nan
2020-05-29 0.01 nan nan nan
2020-05-30 0.01 nan nan nan
2020-05-31 0.01 nan nan nan
2020-06-01 0.01 0.01 0.00 0.00
2020-06-02 0.01 0.01 0.00 0.12
2020-06-03 0.01 0.01 0.00 0.17
2020-06-04 0.01 0.01 0.00 0.23
2020-06-05 0.01 0.01 0.00 0.12
2020-06-06 0.01 0.01 0.00 0.13
2020-06-07 0.01 0.01 0.00 0.08
2020-06-08 0.01 0.01 0.00 0.06
2020-06-09 0.01 0.01 0.00 0.11
2020-06-10 nan 0.01 nan nan

新老用户留存率矩阵

1
2
3
4
5
6
7
8
9
10
11
# Original note: "backup / modification".
# Build the forecast old-user retention row with the same ETS settings as
# the single-series old-user fit (additive trend, additive seasonality
# with a period of 7).
pred_ret_old_ratio = myets.pred_ret_old(
    trend='add',
    seasonal='add',
    seasonal_periods=7,
    damped=False,
    use_boxcox=False,
    smoothing_level=0.5,
    smoothing_slope=None,
    smoothing_seasonal=None,
    damping_slope=None,
)
# Bare expression so the notebook renders the result.
pred_ret_old_ratio
0 1 2 3 4 5 6 7 8 9 ... 93 94 95 96 97 98 99 100 101 102
2020-02-29 1.00 0.05 0.05 0.04 0.04 0.04 0.04 0.03 0.03 0.03 ... 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01

1 rows × 103 columns

1
2
# Stack the old-user row and the new-user rows into one retention-ratio
# matrix, then order rows by index label and columns by column label.
ratio_matrix = (
    pd.concat([pred_ret_old_ratio, pred_ret_new_ratio])
    .sort_index(axis=0)
    .sort_index(axis=1)
)
df_show(ratio_matrix, 10, 10)
shape=(103, 103)
0 1 2 3 4 5 6 7 8 9 ... 93 94 95 96 97 98 99 100 101 102
2020-02-29 1.00 0.05 0.05 0.04 0.04 0.04 0.04 0.03 0.03 0.03 ... 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01
2020-03-01 1.00 0.14 0.06 0.04 0.03 0.02 0.02 0.02 0.02 0.01 ... 0.00 0.00 0.00 0.01 0.00 0.01 0.01 0.00 0.00 nan
2020-03-02 1.00 0.10 0.05 0.03 0.03 0.02 0.02 0.02 0.01 0.01 ... 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 nan nan
2020-03-03 1.00 0.08 0.04 0.03 0.02 0.02 0.02 0.02 0.02 0.01 ... 0.01 0.00 0.00 0.00 0.00 0.01 0.01 nan nan nan
2020-03-04 1.00 0.07 0.04 0.03 0.02 0.02 0.02 0.02 0.01 0.01 ... 0.01 0.00 0.00 0.01 0.01 0.01 nan nan nan nan
2020-03-05 1.00 0.06 0.03 0.02 0.02 0.02 0.02 0.02 0.01 0.01 ... 0.01 0.00 0.01 0.01 0.01 nan nan nan nan nan
2020-03-06 1.00 0.06 0.03 0.02 0.02 0.02 0.02 0.02 0.01 0.01 ... 0.00 0.00 0.01 0.01 nan nan nan nan nan nan
2020-03-07 1.00 0.06 0.03 0.02 0.02 0.02 0.02 0.01 0.01 0.01 ... 0.01 0.00 0.00 nan nan nan nan nan nan nan
2020-03-08 1.00 0.05 0.03 0.02 0.02 0.02 0.02 0.01 0.01 0.01 ... 0.00 0.00 nan nan nan nan nan nan nan nan
2020-03-09 1.00 0.06 0.03 0.02 0.02 0.02 0.02 0.02 0.01 0.01 ... 0.01 nan nan nan nan nan nan nan nan nan
2020-06-01 1.00 0.11 0.06 0.04 0.03 0.02 0.01 0.01 0.01 0.01 ... nan nan nan nan nan nan nan nan nan nan
2020-06-02 1.00 0.11 0.06 0.04 0.02 0.02 0.01 0.01 0.01 nan ... nan nan nan nan nan nan nan nan nan nan
2020-06-03 1.00 0.11 0.06 0.03 0.02 0.02 0.01 0.01 nan nan ... nan nan nan nan nan nan nan nan nan nan
2020-06-04 1.00 0.11 0.06 0.03 0.03 0.02 0.02 nan nan nan ... nan nan nan nan nan nan nan nan nan nan
2020-06-05 1.00 0.12 0.06 0.04 0.03 0.02 nan nan nan nan ... nan nan nan nan nan nan nan nan nan nan
2020-06-06 1.00 0.11 0.07 0.04 0.03 nan nan nan nan nan ... nan nan nan nan nan nan nan nan nan nan
2020-06-07 1.00 0.11 0.06 0.04 nan nan nan nan nan nan ... nan nan nan nan nan nan nan nan nan nan
2020-06-08 1.00 0.11 0.06 nan nan nan nan nan nan nan ... nan nan nan nan nan nan nan nan nan nan
2020-06-09 1.00 0.11 nan nan nan nan nan nan nan nan ... nan nan nan nan nan nan nan nan nan nan
2020-06-10 1.00 nan nan nan nan nan nan nan nan nan ... nan nan nan nan nan nan nan nan nan nan

20 rows × 103 columns

DAU留存矩阵

1
2
# Scale every row of the retention-ratio matrix by the matching entry of
# pred_dau_new (aligned on the row index), turning ratios into user counts.
dau_matrix = ratio_matrix.multiply(pred_dau_new, axis=0)
df_show(dau_matrix, 10, 10)
shape=(103, 103)
0 1 2 3 4 5 6 7 8 9 ... 93 94 95 96 97 98 99 100 101 102
2020-02-29 273385370.00 12958148.00 12452616.00 11797162.00 11073535.00 10472504.00 9798707.00 9117300.00 8284139.00 8039563.00 ... 2699356.59 2715180.35 2626011.84 2671154.75 2519520.16 2264857.58 2199788.72 2234150.08 2249973.83 2160805.32
2020-03-01 1475032.00 204291.93 94549.55 58411.27 43218.44 34663.25 29648.14 25223.05 22420.49 20650.45 ... 6936.95 5220.64 7032.82 7539.64 4889.92 7497.01 7614.17 7092.76 7053.06 nan
2020-03-02 1347642.00 134494.67 65495.40 44202.66 34364.87 27896.19 23448.97 21966.56 19675.57 18866.99 ... 9554.92 9098.68 9092.78 8151.67 6763.07 11073.35 9959.14 9417.70 nan nan
2020-03-03 1112908.00 87029.41 47632.46 33943.69 26932.37 21924.29 20700.09 18362.98 17917.82 16359.75 ... 6765.79 4993.47 4390.72 4932.99 4972.60 7680.72 7071.00 nan nan nan
2020-03-04 993211.00 70021.38 38238.62 27015.34 20956.75 19268.29 16785.27 16189.34 14798.84 14600.20 ... 6376.45 3963.54 4519.98 5838.69 5006.08 7176.47 nan nan nan nan
2020-03-05 985703.00 62197.86 33612.47 23065.45 20206.91 17348.37 16559.81 14982.69 14588.40 12814.14 ... 6147.21 4536.31 6059.73 6628.64 5874.86 nan nan nan nan nan
2020-03-06 936955.00 55655.13 29139.30 22955.40 18739.10 17146.28 15272.37 14991.28 13304.76 11993.02 ... 4524.75 3964.23 4799.75 5515.87 nan nan nan nan nan nan
2020-03-07 889821.00 49029.14 28296.31 20821.81 17618.46 15304.92 14593.06 13169.35 11834.62 11389.71 ... 4509.68 3406.76 4220.58 nan nan nan nan nan nan nan
2020-03-08 780080.00 42358.34 23090.37 17629.81 14821.52 13573.39 12013.23 10999.13 10219.05 9594.98 ... 3668.65 2760.97 nan nan nan nan nan nan nan nan
2020-03-09 727798.00 42503.40 23944.55 17903.83 15429.32 12809.24 11353.65 11062.53 10261.95 9461.37 ... 5160.16 nan nan nan nan nan nan nan nan nan
2020-06-01 563557.69 62103.07 35575.73 20969.61 14305.90 10089.09 7795.20 6727.99 5750.08 5095.59 ... nan nan nan nan nan nan nan nan nan nan
2020-06-02 518941.51 56246.99 32305.73 19031.49 12288.68 9177.43 7599.00 6267.09 5469.77 nan ... nan nan nan nan nan nan nan nan nan nan
2020-06-03 517735.65 58635.20 32593.13 18076.83 12242.16 9718.19 7724.17 6392.72 nan nan ... nan nan nan nan nan nan nan nan nan nan
2020-06-04 486268.62 54515.05 29297.18 17014.05 12271.39 9261.54 7372.38 nan nan nan ... nan nan nan nan nan nan nan nan nan nan
2020-06-05 463959.35 54259.31 28966.35 17484.08 11792.88 8750.09 nan nan nan nan ... nan nan nan nan nan nan nan nan nan nan
2020-06-06 431672.77 48169.79 28347.88 16167.69 10923.33 nan nan nan nan nan ... nan nan nan nan nan nan nan nan nan nan
2020-06-07 417944.87 47617.30 27000.65 15774.89 nan nan nan nan nan nan ... nan nan nan nan nan nan nan nan nan nan
2020-06-08 416701.86 45919.82 26305.16 nan nan nan nan nan nan nan ... nan nan nan nan nan nan nan nan nan nan
2020-06-09 394114.05 42717.21 nan nan nan nan nan nan nan nan ... nan nan nan nan nan nan nan nan nan nan
2020-06-10 411632.31 nan nan nan nan nan nan nan nan nan ... nan nan nan nan nan nan nan nan nan nan

20 rows × 103 columns

DAU 预估

1
2
3
4
5
6
7
8
9
10
11
12
13
14
def _anti_diagonal_sums(matrix, n_days):
    """Sum the trailing ``n_days`` anti-diagonals of ``matrix``.

    The matrix is flipped left-to-right once, then the diagonals at
    offsets ``n_days - 1`` down to ``0`` of the flipped array are summed,
    in that order.

    Args:
        matrix: 2-D array-like (e.g. a DataFrame) of per-cohort counts.
        n_days: Number of anti-diagonals to sum.

    Returns:
        A list with ``n_days`` sums, one per anti-diagonal.
    """
    # Flip once instead of once per diagonal.
    flipped = np.fliplr(np.array(matrix))
    return [np.sum(np.diag(flipped, offset))
            for offset in range(n_days - 1, -1, -1)]


# Disabled alternative kept from the original notebook:
# df_result['ret_old'] = pd.Series(dau_matrix.loc[base_date-timedelta(1)].values, 
# index=(base_date + timedelta(i-1) for i in dau_matrix.columns))

ets_result = myets.df_result.copy()
# Observed DAU over the forecast window.
ets_result['dau_origin'] = dau.loc[pred_start_date:pred_end_date]
# Forecast DAU: each forecast day is one anti-diagonal of the full matrix.
ets_result['dau_sum'] = _anti_diagonal_sums(dau_matrix, myets.n_forcast)
# Column 0 of the matrix, aligned on the date index.
ets_result['dau_new'] = dau_matrix[0]
# First matrix row, last n_forcast columns. Uses myets.n_forcast so the
# whole cell reads the horizon from one place.
ets_result['ret_old'] = dau_matrix.iloc[0, -myets.n_forcast:].values
# Same anti-diagonals, restricted to the interior of the matrix (first and
# last row and column dropped).
ets_result['ret_new'] = _anti_diagonal_sums(dau_matrix.iloc[1:-1, 1:-1],
                                            myets.n_forcast)
# Sanity check: the three components minus the total should be ~0.
ets_result['check'] = (ets_result['dau_new'] + ets_result['ret_old']
                       + ets_result['ret_new'] - ets_result['dau_sum'])
# df_result = df_result.astype(int)
df_show(ets_result, 10, 10)
shape=(10, 6)
dau_origin dau_sum dau_new ret_old ret_new check
2020-06-01 3994272 3922560.87 563557.69 2699356.59 659646.59 0.00
2020-06-02 3546456 3925123.98 518941.51 2715180.35 691002.12 0.00
2020-06-03 3248834 3830069.65 517735.65 2626011.84 686322.16 -0.00
2020-06-04 3129509 3864890.96 486268.62 2671154.75 707467.59 0.00
2020-06-05 3216982 3691592.09 463959.35 2519520.16 708112.58 0.00
2020-06-06 2895258 3331318.71 431672.77 2264857.58 634788.36 0.00
2020-06-07 2926685 3222451.35 417944.87 2199788.72 604717.76 0.00
2020-06-08 3023940 3298679.82 416701.86 2234150.08 647827.88 -0.00
2020-06-09 2911481 3327172.93 394114.05 2249973.83 683085.04 0.00
2020-06-10 2737987 3258783.04 411632.31 2160805.32 686345.40 0.00
2020-06-01 3994272 3922560.87 563557.69 2699356.59 659646.59 0.00
2020-06-02 3546456 3925123.98 518941.51 2715180.35 691002.12 0.00
2020-06-03 3248834 3830069.65 517735.65 2626011.84 686322.16 -0.00
2020-06-04 3129509 3864890.96 486268.62 2671154.75 707467.59 0.00
2020-06-05 3216982 3691592.09 463959.35 2519520.16 708112.58 0.00
2020-06-06 2895258 3331318.71 431672.77 2264857.58 634788.36 0.00
2020-06-07 2926685 3222451.35 417944.87 2199788.72 604717.76 0.00
2020-06-08 3023940 3298679.82 416701.86 2234150.08 647827.88 -0.00
2020-06-09 2911481 3327172.93 394114.05 2249973.83 683085.04 0.00
2020-06-10 2737987 3258783.04 411632.31 2160805.32 686345.40 0.00

验证

1
# Validate the ETS forecast: compare the summed DAU forecast with the
# observed DAU from 2020-03-01 onward (the cell output prints an RMSE).
result = myets.predict_compare(ets_result['dau_sum'], dau.loc['2020-03-01':], u'ETS-DAU')
RMSE=0.147793852021

1
2
# Baseline for comparison: run the ARIMA forecast through the same
# comparison over the 2020-06-01 .. 2020-06-10 window.
arima_result = get_dau_arima()
# The label was copy-pasted as 'ETS-DAU' from the ETS validation cell;
# this comparison is for the ARIMA forecast, so label it accordingly.
# NOTE(review): assumes the third argument is only a display label/title --
# confirm against predict_compare.
result = myets.predict_compare(
    arima_result.loc['2020-06-01':'2020-06-10'],
    dau.loc['2020-03-01':],
    u'ARIMA-DAU',
)
RMSE=0.378640685005

坚持原创技术分享,您的支持将鼓励我继续创作!