Forecast: Keras with Categorical Embeddings¶
In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import os
from tqdm.notebook import tqdm
In [2]:
def reduce_mem_usage(df, verbose=True):
    # Downcast each numeric column to the smallest dtype that can hold its value range
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} MB ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df
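A tiny demonstration of what the helper does (toy frame and values made up for illustration):

toy = pd.DataFrame({"a": np.arange(100, dtype="int64"),
                    "b": np.linspace(0, 1, 100)})
toy = reduce_mem_usage(toy)
print(toy.dtypes)  # "a" fits into int8, "b" into float16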
Load data¶
In [3]:
path = "./input/csv"
In [4]:
calendar = pd.read_csv(os.path.join(path, "calendar.csv.gz"))
selling_prices = pd.read_csv(os.path.join(path, "sell_prices.csv.gz"))
sample_submission = pd.read_csv(os.path.join(path, "sample_submission.csv.gz"))
In [5]:
sales = pd.read_csv(os.path.join(path, "sales_train_validation.csv.gz"))
Describe and prepare data¶
We will now go through all data sets and prepare them for modelling.
Calendar data¶
For each date (covering both training and test data), we have access to useful calendar information.
In [6]:
calendar.head()
Out[6]:
In [7]:
from sklearn.preprocessing import OrdinalEncoder
In [8]:
def prep_calendar(df):
    df = df.drop(["date", "weekday"], axis=1)
    # Turn "d_123" into the integer 123
    df = df.assign(d = df.d.str[2:].astype(int))
    df = df.fillna("missing")
    # Ordinal-encode everything except the merge keys
    cols = list(set(df.columns) - {"wm_yr_wk", "d"})
    df[cols] = OrdinalEncoder(dtype="int").fit_transform(df[cols])
    df = reduce_mem_usage(df)
    return df
In [9]:
calendar = prep_calendar(calendar)
Notes for modeling¶
Features deemed useful:
- "wday", "year", "month" -> integer coding & embedding (see the sketch after this list)
- "event_name_1", "event_type_1" -> integer coding & embedding
- "snap_XX" -> numeric (they are already 0/1 dummies)
Reshape required: no
Merge key(s): "d", "wm_yr_wk"
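To make the coding-and-embedding pattern concrete, here is a minimal self-contained sketch. The column name toy_wday and its cardinality of 7 are made up to mirror "wday": the ordinally encoded column enters the network as integer indices, and an Embedding layer maps each level to a learned vector.

import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense
from tensorflow.keras.models import Model

# Toy categorical with 7 levels, ordinally coded as 0..6
toy_wday = np.array([[0], [3], [6]])                          # shape (n_samples, 1)
inp = Input(shape=(1,), name="toy_wday")                      # one integer index per row
emb = Flatten()(Embedding(input_dim=7, output_dim=1)(inp))    # 7 levels -> 1-dim vectors
out = Dense(1)(emb)
toy_model = Model(inp, out)
print(toy_model.predict(toy_wday))                            # one prediction per row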
Selling prices¶
Contains the selling price for each store_id, item_id, wm_yr_wk combination.
Derive some time-related features:
In [11]:
def prep_selling_prices(df):
    gr = df.groupby(["store_id", "item_id"])["sell_price"]
    # Relative week-over-week price change
    df["sell_price_rel_diff"] = gr.pct_change()
    # Rolling 7-week standard deviation of the price
    df["sell_price_roll_sd7"] = gr.transform(lambda x: x.rolling(7).std())
    # Position of the current price within the price range seen so far
    df["sell_price_cumrel"] = (gr.shift(0) - gr.cummin()) / (1 + gr.cummax() - gr.cummin())
    df = reduce_mem_usage(df)
    return df
In [12]:
selling_prices = prep_selling_prices(selling_prices)
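As a quick sanity check on the formulas (toy store, item, and prices made up for illustration), the same group-wise transformations on a tiny frame give:

toy = pd.DataFrame({"store_id": ["CA_1"] * 4, "item_id": ["FOO"] * 4,
                    "sell_price": [2.0, 2.0, 3.0, 2.5]})
g = toy.groupby(["store_id", "item_id"])["sell_price"]
print(g.pct_change().tolist())
# [nan, 0.0, 0.5, -0.1667]: week-over-week change
print(((g.shift(0) - g.cummin()) / (1 + g.cummax() - g.cummin())).tolist())
# [0.0, 0.0, 0.5, 0.25]: price position within the range seen so far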
In [14]:
def reshape_sales(df, drop_d = None):
    # Optionally drop the first drop_d day columns to save memory
    if drop_d is not None:
        df = df.drop(["d_" + str(i + 1) for i in range(drop_d)], axis=1)
    df = df.assign(id=df.id.str.replace("_validation", ""))
    # Append empty columns for the 2 x 28 days to be forecast (d_1914 .. d_1969)
    df = df.reindex(columns=df.columns.tolist() + ["d_" + str(1913 + i + 1) for i in range(2 * 28)])
    # Wide to long: one row per id and day
    df = df.melt(id_vars=["id", "item_id", "dept_id", "cat_id", "store_id", "state_id"],
                 var_name='d', value_name='demand')
    df = df.assign(d=df.d.str[2:].astype("int16"))
    return df
In [15]:
sales = reshape_sales(sales, 1000)
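Assuming the standard M5 dimensions (30,490 series with training days d_1 .. d_1913), a quick sanity check on the result: after dropping the first 1,000 days, 913 history days plus the 56 appended future days should remain per series.

# 30,490 series x (913 remaining history days + 56 future days)
assert sales.shape[0] == 30490 * (913 + 56)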
In [16]:
sales
Out[16]:
In [17]:
def prep_sales(df):
    df['lag_t7'] = df.groupby(['id'])['demand'].transform(lambda x: x.shift(7))
    df['lag_t28'] = df.groupby(['id'])['demand'].transform(lambda x: x.shift(28))
    df['rolling_mean_7_7'] = df.groupby(['id'])['demand'].transform(lambda x: x.shift(7).rolling(7).mean())
    df['rolling_mean_7_28'] = df.groupby(['id'])['demand'].transform(lambda x: x.shift(7).rolling(28).mean())
    df['rolling_mean_28_7'] = df.groupby(['id'])['demand'].transform(lambda x: x.shift(28).rolling(7).mean())
    df['rolling_mean_28_28'] = df.groupby(['id'])['demand'].transform(lambda x: x.shift(28).rolling(28).mean())
    df['rolling_std_28_7'] = df.groupby(['id'])['demand'].transform(lambda x: x.shift(28).rolling(7).std())
    df['rolling_std_28_28'] = df.groupby(['id'])['demand'].transform(lambda x: x.shift(28).rolling(28).std())
    # Drop rows where rolling_mean_7_28 is NA, except the submission rows (d >= 1914)
    # and the earliest history (d <= 200)
    df = df[(df.d >= 1914) | (pd.notna(df.rolling_mean_7_28)) | (df.d <= 200)]
    df = reduce_mem_usage(df)
    return df
In [18]:
sales = sales.merge(calendar, how="left", on="d")
gc.collect()
sales.head()
Out[18]:
In [19]:
sales = sales.merge(selling_prices, how="left", on=["wm_yr_wk", "store_id", "item_id"])
sales.drop(["wm_yr_wk"], axis=1, inplace=True)
gc.collect()
sales.head()
Out[19]:
In [20]:
sales = prep_sales(sales)
In [22]:
sales
Out[22]:
In [21]:
del selling_prices
Prepare data for Keras interface¶
Ordinal encoding of remaining categoricals¶
In [23]:
cat_id_cols = ["item_id", "dept_id", "store_id", "cat_id", "state_id"]
cat_cols = cat_id_cols + ["wday", "month", "year", "event_name_1",
                          "event_type_1", "event_name_2", "event_type_2"]
In [24]:
# Encode column by column in a loop to minimize memory use
for v in tqdm(cat_id_cols):
    sales[v] = OrdinalEncoder(dtype="int").fit_transform(sales[[v]])
sales = reduce_mem_usage(sales)
sales.head()
gc.collect()
Out[24]:
Impute numeric columns¶
In [25]:
num_cols = ["sell_price", "sell_price_rel_diff", "sell_price_roll_sd7", "sell_price_cumrel",
            "lag_t7",
            "lag_t28",
            "rolling_mean_7_7",
            "rolling_mean_7_28",
            "rolling_mean_28_7",
            "rolling_mean_28_28",
            "rolling_std_28_7",
            "rolling_std_28_28"]
bool_cols = ["snap_CA", "snap_TX", "snap_WI"]
dense_cols = num_cols + bool_cols
In [26]:
# Impute column by column due to memory constraints
for v in tqdm(num_cols):
    sales[v] = sales[v].fillna(sales[v].median())
sales.head()
Out[26]:
Separate submission data and reconstruct id columns¶
In [27]:
test = sales[sales.d >= 1914]
# Days 1914-1941 belong to the "validation" part of the submission, 1942-1969 to
# "evaluation"; F1..F28 index the forecast day within each 28-day block
test = test.assign(id=test.id + "_" + np.where(test.d <= 1941, "validation", "evaluation"),
                   F="F" + (test.d - 1913 - 28 * (test.d > 1941)).astype("str"))
test.head()
gc.collect()
Out[27]:
In [28]:
# Keep 100 days of history so the lag features can be recomputed day by day below
test_long = sales[sales.d >= 1914 - 100]
test_long.head()
gc.collect()
Out[28]:
Make training data¶
In [29]:
# Build the input dict for Keras: one dense array plus one integer array per embedding input
def make_X(df):
    X = {"dense1": df[dense_cols].to_numpy()}
    for v in cat_cols:
        X[v] = df[[v]].to_numpy()
    return X
In [30]:
# Submission data
X_test = make_X(test)
In [31]:
# One month of validation data
flag = (sales.d < 1914) & (sales.d >= 1914 - 28)
valid = (make_X(sales[flag]),
         sales["demand"][flag])
In [32]:
# Rest is used for training
flag = sales.d < 1914 #- 7
X_train = make_X(sales[flag])
y_train = sales["demand"][flag]
In [33]:
del sales, flag
gc.collect()
Out[33]:
In [34]:
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras import regularizers
from tensorflow.keras.layers import Dense, Input, Embedding, Dropout, concatenate, Flatten
from tensorflow.keras.models import Model
Architecture with embeddings¶
In [35]:
def create_model(lr=0.002):
    tf.keras.backend.clear_session()
    gc.collect()
    # Dense input
    dense_input = Input(shape=(len(dense_cols), ), name='dense1')
    # Embedding inputs (one integer index per categorical)
    wday_input = Input(shape=(1,), name='wday')
    month_input = Input(shape=(1,), name='month')
    year_input = Input(shape=(1,), name='year')
    event_name_1_input = Input(shape=(1,), name='event_name_1')
    event_type_1_input = Input(shape=(1,), name='event_type_1')
    event_name_2_input = Input(shape=(1,), name='event_name_2')
    event_type_2_input = Input(shape=(1,), name='event_type_2')
    item_id_input = Input(shape=(1,), name='item_id')
    dept_id_input = Input(shape=(1,), name='dept_id')
    store_id_input = Input(shape=(1,), name='store_id')
    cat_id_input = Input(shape=(1,), name='cat_id')
    state_id_input = Input(shape=(1,), name='state_id')
    # Embeddings: input_dim matches the cardinality of each encoded categorical
    wday_emb = Flatten()(Embedding(7, 1)(wday_input))
    month_emb = Flatten()(Embedding(12, 1)(month_input))
    year_emb = Flatten()(Embedding(6, 1)(year_input))
    event_name_1_emb = Flatten()(Embedding(31, 1)(event_name_1_input))
    event_type_1_emb = Flatten()(Embedding(5, 1)(event_type_1_input))
    event_name_2_emb = Flatten()(Embedding(5, 1)(event_name_2_input))
    event_type_2_emb = Flatten()(Embedding(5, 1)(event_type_2_input))
    item_id_emb = Flatten()(Embedding(3049, 3)(item_id_input))
    dept_id_emb = Flatten()(Embedding(7, 1)(dept_id_input))
    store_id_emb = Flatten()(Embedding(10, 1)(store_id_input))
    cat_id_emb = Flatten()(Embedding(3, 1)(cat_id_input))
    state_id_emb = Flatten()(Embedding(3, 1)(state_id_input))
    # Combine dense and embedding parts and add dense layers. Exit on linear scale.
    x = concatenate([dense_input, wday_emb, month_emb, year_emb,
                     event_name_1_emb, event_type_1_emb,
                     event_name_2_emb, event_type_2_emb,
                     item_id_emb, dept_id_emb, store_id_emb,
                     cat_id_emb, state_id_emb])
    x = Dense(256, activation="tanh")(x)
    x = Dense(128, activation="tanh")(x)
    x = Dense(64, activation="tanh")(x)
    x = Dense(16, activation="tanh")(x)
    x = Dense(4, activation="tanh")(x)
    outputs = Dense(1, activation="linear", name='output')(x)
    inputs = {"dense1": dense_input, "wday": wday_input, "month": month_input, "year": year_input,
              "event_name_1": event_name_1_input, "event_type_1": event_type_1_input,
              "event_name_2": event_name_2_input, "event_type_2": event_type_2_input,
              "item_id": item_id_input, "dept_id": dept_id_input, "store_id": store_id_input,
              "cat_id": cat_id_input, "state_id": state_id_input}
    # Connect input and output
    model = Model(inputs, outputs)
    model.compile(loss=keras.losses.mean_squared_error,
                  metrics=["mse"],
                  optimizer=keras.optimizers.Adam(learning_rate=lr))
    return model
In [38]:
model = create_model(0.0002)
model.summary()
keras.utils.plot_model(model, 'model.png', show_shapes=True)
Fit the model¶
In [42]:
history = model.fit(X_train,
                    y_train,
                    batch_size=2 ** 14,
                    epochs=100,
                    shuffle=True,
                    validation_data=valid)
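The run above trains for a fixed 100 epochs. A common refinement, not used in the original run and shown here only as a sketch with an assumed patience value, is to stop on the validation loss and keep the best weights:

# Hypothetical alternative: early stopping on the validation loss (patience=5 is assumed, not tuned)
early_stop = keras.callbacks.EarlyStopping(monitor="val_loss", patience=5,
                                           restore_best_weights=True)
history = model.fit(X_train, y_train,
                    batch_size=2 ** 14,
                    epochs=100,
                    shuffle=True,
                    validation_data=valid,
                    callbacks=[early_stop])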
Plot the evaluation metrics over epochs¶
In [43]:
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'valid'], loc='upper left')
plt.savefig('plt')
In [44]:
history.history["val_loss"]
Out[44]:
In [45]:
model.save('model.h5')
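The saved HDF5 file can be restored in a later session with Keras' standard loader; a minimal sketch:

# Reload the trained model from the file written above
reloaded = keras.models.load_model('model.h5')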
Submission¶
Forecast the submission days recursively: predict one day at a time, refreshing the 7-day lag features so they can draw on earlier predictions.
In [46]:
for i in range(1914, 1969 + 1):
    print(i)
    if i <= 1941:
        # Recompute the 7-day lag features once predictions start feeding into them
        if i >= 1920:
            test_long['lag_t7'] = test_long.groupby(['id'])['demand'].transform(lambda x: x.shift(7))
            test_long['rolling_mean_7_7'] = test_long.groupby(['id'])['demand'].transform(lambda x: x.shift(7).rolling(7).mean())
            test_long['rolling_mean_7_28'] = test_long.groupby(['id'])['demand'].transform(lambda x: x.shift(7).rolling(28).mean())
        forecast = make_X(test_long[test_long.d == i])
        pred = model.predict(forecast, batch_size=2 ** 14)
        test_long.loc[test_long.d == i, "demand"] = pred.clip(0) * 1.02
    else:
        # Evaluation days (1942-1969) are filled with zeros as placeholders
        test_long.loc[test_long.d == i, "demand"] = 0
In [47]:
test = test_long[test_long.d >= 1914]
# Rebuild the submission ids as before, now with the predicted demand attached
test = test.assign(id=test.id + "_" + np.where(test.d <= 1941, "validation", "evaluation"),
                   F="F" + (test.d - 1913 - 28 * (test.d > 1941)).astype("str"))
# Long to wide: one row per id with columns F1..F28, ordered like the sample submission
submission = test.pivot(index="id", columns="F", values="demand").reset_index()[sample_submission.columns]
submission = sample_submission[["id"]].merge(submission, how="left", on="id")
submission.head()
Out[47]:
In [48]:
submission.to_csv("dnn_fake_valid_day_to_day.csv", index=False)