Kaggle

2021年2月23日火曜日

t f B! P L
å Keras_with_Categorical_Embeddings

Forecast: Keras with Categorical Embeddings

In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import os
from tqdm.notebook import tqdm
In [2]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

Load data

In [3]:
path = "./input/csv"
In [4]:
calendar = pd.read_csv(os.path.join(path, "calendar.csv.gz"))
selling_prices = pd.read_csv(os.path.join(path, "sell_prices.csv.gz"))
sample_submission = pd.read_csv(os.path.join(path, "sample_submission.csv.gz"))
In [5]:
sales = pd.read_csv(os.path.join(path, "sales_train_validation.csv.gz"))

Describe and prepare data

We will now go through all data sets and prepare them for modelling.

Calendar data

For each date (covering both training and test data), we have access to useful calendar information.

In [6]:
calendar.head()
Out[6]:
date wm_yr_wk weekday wday month year d event_name_1 event_type_1 event_name_2 event_type_2 snap_CA snap_TX snap_WI
0 2011-01-29 11101 Saturday 1 1 2011 d_1 NaN NaN NaN NaN 0 0 0
1 2011-01-30 11101 Sunday 2 1 2011 d_2 NaN NaN NaN NaN 0 0 0
2 2011-01-31 11101 Monday 3 1 2011 d_3 NaN NaN NaN NaN 0 0 0
3 2011-02-01 11101 Tuesday 4 2 2011 d_4 NaN NaN NaN NaN 1 1 0
4 2011-02-02 11101 Wednesday 5 2 2011 d_5 NaN NaN NaN NaN 1 0 1
In [7]:
from sklearn.preprocessing import OrdinalEncoder
In [8]:
def prep_calendar(df):
    df = df.drop(["date", "weekday"], axis=1)
    df = df.assign(d = df.d.str[2:].astype(int))
    df = df.fillna("missing")
    cols = list(set(df.columns) - {"wm_yr_wk", "d"})
    df[cols] = OrdinalEncoder(dtype="int").fit_transform(df[cols])
    df = reduce_mem_usage(df)
    return df
In [9]:
calendar = prep_calendar(calendar)
Mem. usage decreased to  0.03 Mb (84.7% reduction)

Notes for modeling

Features deemed to be useful:

  • "wday", "year", "month" -> integer coding & embedding
  • "event_name_1", "event_type_1" -> integer coding & embedding
  • "snap_XX" -> numeric (they are dummies)

Reshape required: No

Merge key(s): "d", "wm_yr_wk"

Selling prices

Contains selling prices for each store_id, item_id_wm_yr_wk combination.

Derive some time related features:

In [11]:
def prep_selling_prices(df):
    gr = df.groupby(["store_id", "item_id"])["sell_price"]
    df["sell_price_rel_diff"] = gr.pct_change()
    df["sell_price_roll_sd7"] = gr.transform(lambda x: x.rolling(7).std())
    df["sell_price_cumrel"] = (gr.shift(0) - gr.cummin()) / (1 + gr.cummax() - gr.cummin())
    df = reduce_mem_usage(df)
    return df
In [12]:
selling_prices = prep_selling_prices(selling_prices)
Mem. usage decreased to 169.63 Mb (53.6% reduction)
In [14]:
def reshape_sales(df, drop_d = None):
    if drop_d is not None:
        df = df.drop(["d_" + str(i + 1) for i in range(drop_d)], axis=1)
    df = df.assign(id=df.id.str.replace("_validation", ""))
    df = df.reindex(columns=df.columns.tolist() + ["d_" + str(1913 + i + 1) for i in range(2 * 28)])
    df = df.melt(id_vars=["id", "item_id", "dept_id", "cat_id", "store_id", "state_id"],
                 var_name='d', value_name='demand')
    df = df.assign(d=df.d.str[2:].astype("int16"))
    return df
In [15]:
sales = reshape_sales(sales, 1000)
In [16]:
sales
Out[16]:
id item_id dept_id cat_id store_id state_id d demand
0 HOBBIES_1_001_CA_1 HOBBIES_1_001 HOBBIES_1 HOBBIES CA_1 CA 1001 2.0
1 HOBBIES_1_002_CA_1 HOBBIES_1_002 HOBBIES_1 HOBBIES CA_1 CA 1001 0.0
2 HOBBIES_1_003_CA_1 HOBBIES_1_003 HOBBIES_1 HOBBIES CA_1 CA 1001 0.0
3 HOBBIES_1_004_CA_1 HOBBIES_1_004 HOBBIES_1 HOBBIES CA_1 CA 1001 0.0
4 HOBBIES_1_005_CA_1 HOBBIES_1_005 HOBBIES_1 HOBBIES CA_1 CA 1001 1.0
... ... ... ... ... ... ... ... ...
29544805 FOODS_3_823_WI_3 FOODS_3_823 FOODS_3 FOODS WI_3 WI 1969 NaN
29544806 FOODS_3_824_WI_3 FOODS_3_824 FOODS_3 FOODS WI_3 WI 1969 NaN
29544807 FOODS_3_825_WI_3 FOODS_3_825 FOODS_3 FOODS WI_3 WI 1969 NaN
29544808 FOODS_3_826_WI_3 FOODS_3_826 FOODS_3 FOODS WI_3 WI 1969 NaN
29544809 FOODS_3_827_WI_3 FOODS_3_827 FOODS_3 FOODS WI_3 WI 1969 NaN

29544810 rows × 8 columns

In [17]:
def prep_sales(df):
    
    df['lag_t7'] = df.groupby(['id'])['demand'].transform(lambda x: x.shift(7))
    df['lag_t28'] = df.groupby(['id'])['demand'].transform(lambda x: x.shift(28))
    
    df['rolling_mean_7_7'] = df.groupby(['id'])['demand'].transform(lambda x: x.shift(7).rolling(7).mean())
    df['rolling_mean_7_28'] = df.groupby(['id'])['demand'].transform(lambda x: x.shift(7).rolling(28).mean())

    df['rolling_mean_28_7'] = df.groupby(['id'])['demand'].transform(lambda x: x.shift(28).rolling(7).mean())
    df['rolling_mean_28_28'] = df.groupby(['id'])['demand'].transform(lambda x: x.shift(28).rolling(28).mean())
    
    df['rolling_std_28_7'] = df.groupby(['id'])['demand'].transform(lambda x: x.shift(28).rolling(7).std())
    df['rolling_std_28_28'] = df.groupby(['id'])['demand'].transform(lambda x: x.shift(28).rolling(28).std())
    

    # Remove rows with NAs except for submission rows. rolling_mean_7_28 
    df = df[(df.d >= 1914) | (pd.notna(df.rolling_mean_7_28)) | (df.d <= 200) ]
    df = reduce_mem_usage(df)

    return df
In [18]:
sales = sales.merge(calendar, how="left", on="d")
gc.collect()
sales.head()
Out[18]:
id item_id dept_id cat_id store_id state_id d demand wm_yr_wk wday month year event_name_1 event_type_1 event_name_2 event_type_2 snap_CA snap_TX snap_WI
0 HOBBIES_1_001_CA_1 HOBBIES_1_001 HOBBIES_1 HOBBIES CA_1 CA 1001 2.0 11339 6 9 2 30 4 4 2 0 0 0
1 HOBBIES_1_002_CA_1 HOBBIES_1_002 HOBBIES_1 HOBBIES CA_1 CA 1001 0.0 11339 6 9 2 30 4 4 2 0 0 0
2 HOBBIES_1_003_CA_1 HOBBIES_1_003 HOBBIES_1 HOBBIES CA_1 CA 1001 0.0 11339 6 9 2 30 4 4 2 0 0 0
3 HOBBIES_1_004_CA_1 HOBBIES_1_004 HOBBIES_1 HOBBIES CA_1 CA 1001 0.0 11339 6 9 2 30 4 4 2 0 0 0
4 HOBBIES_1_005_CA_1 HOBBIES_1_005 HOBBIES_1 HOBBIES CA_1 CA 1001 1.0 11339 6 9 2 30 4 4 2 0 0 0
In [19]:
sales = sales.merge(selling_prices, how="left", on=["wm_yr_wk", "store_id", "item_id"])
sales.drop(["wm_yr_wk"], axis=1, inplace=True)
gc.collect()
sales.head()
Out[19]:
id item_id dept_id cat_id store_id state_id d demand wday month ... event_type_1 event_name_2 event_type_2 snap_CA snap_TX snap_WI sell_price sell_price_rel_diff sell_price_roll_sd7 sell_price_cumrel
0 HOBBIES_1_001_CA_1 HOBBIES_1_001 HOBBIES_1 HOBBIES CA_1 CA 1001 2.0 6 9 ... 4 4 2 0 0 0 8.257812 0.0 0.0 0.000000
1 HOBBIES_1_002_CA_1 HOBBIES_1_002 HOBBIES_1 HOBBIES CA_1 CA 1001 0.0 6 9 ... 4 4 2 0 0 0 3.970703 0.0 0.0 0.000000
2 HOBBIES_1_003_CA_1 HOBBIES_1_003 HOBBIES_1 HOBBIES CA_1 CA 1001 0.0 6 9 ... 4 4 2 0 0 0 NaN NaN NaN NaN
3 HOBBIES_1_004_CA_1 HOBBIES_1_004 HOBBIES_1 HOBBIES CA_1 CA 1001 0.0 6 9 ... 4 4 2 0 0 0 4.640625 0.0 0.0 0.230713
4 HOBBIES_1_005_CA_1 HOBBIES_1_005 HOBBIES_1 HOBBIES CA_1 CA 1001 1.0 6 9 ... 4 4 2 0 0 0 3.080078 0.0 0.0 0.375000

5 rows × 22 columns

In [20]:
sales = prep_sales(sales)
Mem. usage decreased to 2555.62 Mb (36.5% reduction)
In [22]:
sales
Out[22]:
id item_id dept_id cat_id store_id state_id d demand wday month ... sell_price_roll_sd7 sell_price_cumrel lag_t7 lag_t28 rolling_mean_7_7 rolling_mean_7_28 rolling_mean_28_7 rolling_mean_28_28 rolling_std_28_7 rolling_std_28_28
1036660 HOBBIES_1_001_CA_1 HOBBIES_1_001 HOBBIES_1 HOBBIES CA_1 CA 1035 1.0 5 10 ... 0.0 0.000000 0.0 0.0 0.142822 0.714355 0.856934 NaN 0.899902 NaN
1036661 HOBBIES_1_002_CA_1 HOBBIES_1_002 HOBBIES_1 HOBBIES CA_1 CA 1035 0.0 5 10 ... 0.0 0.000000 0.0 0.0 0.285645 0.142822 0.142822 NaN 0.377930 NaN
1036662 HOBBIES_1_003_CA_1 HOBBIES_1_003 HOBBIES_1 HOBBIES CA_1 CA 1035 0.0 5 10 ... NaN NaN 0.0 0.0 0.000000 0.000000 0.000000 NaN 0.000000 NaN
1036663 HOBBIES_1_004_CA_1 HOBBIES_1_004 HOBBIES_1 HOBBIES CA_1 CA 1035 5.0 5 10 ... 0.0 0.230713 1.0 2.0 1.571289 2.072266 3.142578 NaN 2.853516 NaN
1036664 HOBBIES_1_005_CA_1 HOBBIES_1_005 HOBBIES_1 HOBBIES CA_1 CA 1035 0.0 5 10 ... 0.0 0.375000 2.0 2.0 1.000000 0.678711 0.714355 NaN 0.755859 NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
29544805 FOODS_3_823_WI_3 FOODS_3_823 FOODS_3 FOODS WI_3 WI 1969 NaN 1 5 ... 0.0 0.333252 NaN NaN NaN NaN NaN NaN NaN NaN
29544806 FOODS_3_824_WI_3 FOODS_3_824 FOODS_3 FOODS WI_3 WI 1969 NaN 1 5 ... 0.0 0.285645 NaN NaN NaN NaN NaN NaN NaN NaN
29544807 FOODS_3_825_WI_3 FOODS_3_825 FOODS_3 FOODS WI_3 WI 1969 NaN 1 5 ... 0.0 0.000000 NaN NaN NaN NaN NaN NaN NaN NaN
29544808 FOODS_3_826_WI_3 FOODS_3_826 FOODS_3 FOODS WI_3 WI 1969 NaN 1 5 ... 0.0 0.000000 NaN NaN NaN NaN NaN NaN NaN NaN
29544809 FOODS_3_827_WI_3 FOODS_3_827 FOODS_3 FOODS WI_3 WI 1969 NaN 1 5 ... 0.0 0.000000 NaN NaN NaN NaN NaN NaN NaN NaN

28508150 rows × 30 columns

In [21]:
del selling_prices

Prepare data for Keras interface

Ordinal encoding of remaining categoricals

In [23]:
cat_id_cols = ["item_id", "dept_id", "store_id", "cat_id", "state_id"]
cat_cols = cat_id_cols + ["wday", "month", "year", "event_name_1", 
                          "event_type_1", "event_name_2", "event_type_2"]
In [24]:
# In loop to minimize memory use
for i, v in tqdm(enumerate(cat_id_cols)):
    sales[v] = OrdinalEncoder(dtype="int").fit_transform(sales[[v]])

sales = reduce_mem_usage(sales)
sales.head()
gc.collect()
Mem. usage decreased to 1631.25 Mb (18.9% reduction)
Out[24]:
5

Impute numeric columns

In [25]:
num_cols = ["sell_price", "sell_price_rel_diff", "sell_price_roll_sd7", "sell_price_cumrel",
            "lag_t7", 
            "lag_t28", 
            "rolling_mean_7_7", 
            "rolling_mean_7_28", 
            "rolling_mean_28_7", 
            "rolling_mean_7_28", 
            "rolling_std_28_7", 
            "rolling_std_28_28"]
bool_cols = ["snap_CA", "snap_TX", "snap_WI"]
dense_cols = num_cols + bool_cols
In [26]:
# Need to do column by column due to memory constraints
for i, v in tqdm(enumerate(num_cols)):
    sales[v] = sales[v].fillna(sales[v].median())
    
sales.head()

Out[26]:
id item_id dept_id cat_id store_id state_id d demand wday month ... sell_price_roll_sd7 sell_price_cumrel lag_t7 lag_t28 rolling_mean_7_7 rolling_mean_7_28 rolling_mean_28_7 rolling_mean_28_28 rolling_std_28_7 rolling_std_28_28
1036660 HOBBIES_1_001_CA_1 1437 3 1 0 0 1035 1.0 5 10 ... 0.0 0.000000 0.0 0.0 0.142822 0.714355 0.856934 NaN 0.899902 0.744629
1036661 HOBBIES_1_002_CA_1 1438 3 1 0 0 1035 0.0 5 10 ... 0.0 0.000000 0.0 0.0 0.285645 0.142822 0.142822 NaN 0.377930 0.744629
1036662 HOBBIES_1_003_CA_1 1439 3 1 0 0 1035 0.0 5 10 ... 0.0 0.000000 0.0 0.0 0.000000 0.000000 0.000000 NaN 0.000000 0.744629
1036663 HOBBIES_1_004_CA_1 1440 3 1 0 0 1035 5.0 5 10 ... 0.0 0.230713 1.0 2.0 1.571289 2.072266 3.142578 NaN 2.853516 0.744629
1036664 HOBBIES_1_005_CA_1 1441 3 1 0 0 1035 0.0 5 10 ... 0.0 0.375000 2.0 2.0 1.000000 0.678711 0.714355 NaN 0.755859 0.744629

5 rows × 30 columns

Separate submission data and reconstruct id columns

In [27]:
test = sales[sales.d >= 1914]
test = test.assign(id=test.id + "_" + np.where(test.d <= 1941, "validation", "evaluation"),
                   F="F" + (test.d - 1913 - 28 * (test.d > 1941)).astype("str"))
test.head()
gc.collect()
Out[27]:
88
In [28]:
test_long = sales[sales.d >= 1914 - 100]
test_long.head()
gc.collect()
Out[28]:
20

Make training data

In [29]:
# Input dict for training with a dense array and separate inputs for each embedding input
def make_X(df):
    X = {"dense1": df[dense_cols].to_numpy()}
    for i, v in enumerate(cat_cols):
        X[v] = df[[v]].to_numpy()
    return X
In [30]:
# Submission data
X_test = make_X(test)
In [31]:
# One month of validation data
flag = (sales.d < 1914) & (sales.d >= 1914 - 28)
valid = (make_X(sales[flag]),
         sales["demand"][flag])
In [32]:
# Rest is used for training
flag = sales.d < 1914 #- 7
X_train = make_X(sales[flag])
y_train = sales["demand"][flag]
In [33]:
del sales, flag
gc.collect()
Out[33]:
20
In [34]:
import tensorflow as tf
import tensorflow.keras as keras

from tensorflow.keras import regularizers
from tensorflow.keras.layers import Dense, Input, Embedding, Dropout, concatenate, Flatten
from tensorflow.keras.models import Model

Architecture with embeddings

In [35]:
def create_model(lr=0.002):
    tf.keras.backend.clear_session()
    gc.collect()

    # Dense input
    dense_input = Input(shape=(len(dense_cols), ), name='dense1')

    # Embedding input
    wday_input = Input(shape=(1,), name='wday')
    month_input = Input(shape=(1,), name='month')
    year_input = Input(shape=(1,), name='year')
    event_name_1_input = Input(shape=(1,), name='event_name_1')
    event_type_1_input = Input(shape=(1,), name='event_type_1')
    event_name_2_input = Input(shape=(1,), name='event_name_2')
    event_type_2_input = Input(shape=(1,), name='event_type_2')
    item_id_input = Input(shape=(1,), name='item_id')
    dept_id_input = Input(shape=(1,), name='dept_id')
    store_id_input = Input(shape=(1,), name='store_id')
    cat_id_input = Input(shape=(1,), name='cat_id')
    state_id_input = Input(shape=(1,), name='state_id')

    wday_emb = Flatten()(Embedding(7, 1)(wday_input))
    month_emb = Flatten()(Embedding(12, 1)(month_input))
    year_emb = Flatten()(Embedding(6, 1)(year_input))
    event_name_1_emb = Flatten()(Embedding(31, 1)(event_name_1_input))
    event_type_1_emb = Flatten()(Embedding(5, 1)(event_type_1_input))
    event_name_2_emb = Flatten()(Embedding(5, 1)(event_name_2_input))
    event_type_2_emb = Flatten()(Embedding(5, 1)(event_type_2_input))

    item_id_emb = Flatten()(Embedding(3049, 3)(item_id_input))
    dept_id_emb = Flatten()(Embedding(7, 1)(dept_id_input))
    store_id_emb = Flatten()(Embedding(10, 1)(store_id_input))
    cat_id_emb = Flatten()(Embedding(3, 1)(cat_id_input))
    state_id_emb = Flatten()(Embedding(3, 1)(state_id_input))

    # Combine dense and embedding parts and add dense layers. Exit on linear scale.
    x = concatenate([dense_input, wday_emb, month_emb, year_emb, 
                     event_name_1_emb, event_type_1_emb, 
                     event_name_2_emb, event_type_2_emb, 
                     item_id_emb, dept_id_emb, store_id_emb,
                     cat_id_emb, state_id_emb])
    
    x = Dense(256, activation="tanh")(x)
    x = Dense(128, activation="tanh")(x)
    x = Dense(64, activation="tanh")(x)
    x = Dense(16, activation="tanh")(x)
    x = Dense(4, activation="tanh")(x)
    
    outputs = Dense(1, activation="linear", name='output')(x)

    inputs = {"dense1": dense_input, "wday": wday_input, "month": month_input, "year": year_input, 
              "event_name_1": event_name_1_input, "event_type_1": event_type_1_input,
              "event_name_2": event_name_2_input, "event_type_2": event_type_2_input,
              "item_id": item_id_input, "dept_id": dept_id_input, "store_id": store_id_input, 
              "cat_id": cat_id_input, "state_id": state_id_input}

    # Connect input and output
    model = Model(inputs, outputs)

    model.compile(loss=keras.losses.mean_squared_error,
                  metrics=["mse"],
                  optimizer=keras.optimizers.Adam(learning_rate=lr))
    return model
In [38]:
model = create_model(0.0002)
model.summary()
keras.utils.plot_model(model, 'model.png', show_shapes=True)
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
==================================================================================================
wday (InputLayer)               [(None, 1)]          0                                            
__________________________________________________________________________________________________
month (InputLayer)              [(None, 1)]          0                                            
__________________________________________________________________________________________________
year (InputLayer)               [(None, 1)]          0                                            
__________________________________________________________________________________________________
event_name_1 (InputLayer)       [(None, 1)]          0                                            
__________________________________________________________________________________________________
event_type_1 (InputLayer)       [(None, 1)]          0                                            
__________________________________________________________________________________________________
event_name_2 (InputLayer)       [(None, 1)]          0                                            
__________________________________________________________________________________________________
event_type_2 (InputLayer)       [(None, 1)]          0                                            
__________________________________________________________________________________________________
item_id (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
dept_id (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
store_id (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
cat_id (InputLayer)             [(None, 1)]          0                                            
__________________________________________________________________________________________________
state_id (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 1, 1)         7           wday[0][0]                       
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1, 1)         12          month[0][0]                      
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 1, 1)         6           year[0][0]                       
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 1, 1)         31          event_name_1[0][0]               
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 1, 1)         5           event_type_1[0][0]               
__________________________________________________________________________________________________
embedding_5 (Embedding)         (None, 1, 1)         5           event_name_2[0][0]               
__________________________________________________________________________________________________
embedding_6 (Embedding)         (None, 1, 1)         5           event_type_2[0][0]               
__________________________________________________________________________________________________
embedding_7 (Embedding)         (None, 1, 3)         9147        item_id[0][0]                    
__________________________________________________________________________________________________
embedding_8 (Embedding)         (None, 1, 1)         7           dept_id[0][0]                    
__________________________________________________________________________________________________
embedding_9 (Embedding)         (None, 1, 1)         10          store_id[0][0]                   
__________________________________________________________________________________________________
embedding_10 (Embedding)        (None, 1, 1)         3           cat_id[0][0]                     
__________________________________________________________________________________________________
embedding_11 (Embedding)        (None, 1, 1)         3           state_id[0][0]                   
__________________________________________________________________________________________________
dense1 (InputLayer)             [(None, 15)]         0                                            
__________________________________________________________________________________________________
flatten (Flatten)               (None, 1)            0           embedding[0][0]                  
__________________________________________________________________________________________________
flatten_1 (Flatten)             (None, 1)            0           embedding_1[0][0]                
__________________________________________________________________________________________________
flatten_2 (Flatten)             (None, 1)            0           embedding_2[0][0]                
__________________________________________________________________________________________________
flatten_3 (Flatten)             (None, 1)            0           embedding_3[0][0]                
__________________________________________________________________________________________________
flatten_4 (Flatten)             (None, 1)            0           embedding_4[0][0]                
__________________________________________________________________________________________________
flatten_5 (Flatten)             (None, 1)            0           embedding_5[0][0]                
__________________________________________________________________________________________________
flatten_6 (Flatten)             (None, 1)            0           embedding_6[0][0]                
__________________________________________________________________________________________________
flatten_7 (Flatten)             (None, 3)            0           embedding_7[0][0]                
__________________________________________________________________________________________________
flatten_8 (Flatten)             (None, 1)            0           embedding_8[0][0]                
__________________________________________________________________________________________________
flatten_9 (Flatten)             (None, 1)            0           embedding_9[0][0]                
__________________________________________________________________________________________________
flatten_10 (Flatten)            (None, 1)            0           embedding_10[0][0]               
__________________________________________________________________________________________________
flatten_11 (Flatten)            (None, 1)            0           embedding_11[0][0]               
__________________________________________________________________________________________________
concatenate (Concatenate)       (None, 29)           0           dense1[0][0]                     
                                                                 flatten[0][0]                    
                                                                 flatten_1[0][0]                  
                                                                 flatten_2[0][0]                  
                                                                 flatten_3[0][0]                  
                                                                 flatten_4[0][0]                  
                                                                 flatten_5[0][0]                  
                                                                 flatten_6[0][0]                  
                                                                 flatten_7[0][0]                  
                                                                 flatten_8[0][0]                  
                                                                 flatten_9[0][0]                  
                                                                 flatten_10[0][0]                 
                                                                 flatten_11[0][0]                 
__________________________________________________________________________________________________
dense (Dense)                   (None, 256)          7680        concatenate[0][0]                
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 128)          32896       dense[0][0]                      
__________________________________________________________________________________________________
dense_2 (Dense)                 (None, 64)           8256        dense_1[0][0]                    
__________________________________________________________________________________________________
dense_3 (Dense)                 (None, 16)           1040        dense_2[0][0]                    
__________________________________________________________________________________________________
dense_4 (Dense)                 (None, 4)            68          dense_3[0][0]                    
__________________________________________________________________________________________________
output (Dense)                  (None, 1)            5           dense_4[0][0]                    
==================================================================================================
Total params: 59,186
Trainable params: 59,186
Non-trainable params: 0
__________________________________________________________________________________________________
Failed to import pydot. You must install pydot and graphviz for `pydotprint` to work.

Calculate derivatives and fit model

In [42]:
history = model.fit(X_train, 
                    y_train,
                    batch_size=2 ** 14,
                    epochs=100,
                    shuffle=True,
                    validation_data=valid)
Train on 26800710 samples, validate on 853720 samples
Epoch 1/100
 2359296/26800710 [=>............................] - ETA: 49:48 - loss: 14.5268 - mean_squared_error: 14.526 - ETA: 26:09 - loss: 16.4514 - mean_squared_error: 16.451 - ETA: 18:11 - loss: 14.9618 - mean_squared_error: 14.961 - ETA: 14:10 - loss: 14.5856 - mean_squared_error: 14.585 - ETA: 11:46 - loss: 14.1370 - mean_squared_error: 14.137 - ETA: 10:10 - loss: 15.7298 - mean_squared_error: 15.729 - ETA: 9:01 - loss: 15.5423 - mean_squared_error: 15.542 - ETA: 8:10 - loss: 15.3015 - mean_squared_error: 15.30 
...
mean_squared_error: 4.42 - ETA: 0s - loss: 4.4277 - mean_squared_error: 4.42 - ETA: 0s - loss: 4.4279 - mean_squared_error: 4.42 - ETA: 0s - loss: 4.4275 - mean_squared_error: 4.42 - ETA: 0s - loss: 4.4273 - mean_squared_error: 4.42 - 138s 5us/sample - loss: 4.4282 - mean_squared_error: 4.4282 - val_loss: 3.9257 - val_mean_squared_error: 3.9257

Plot the evaluation metrics over epochs

In [43]:
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'valid'], loc='upper left')
plt.savefig('plt')
In [44]:
history.history["val_loss"]
Out[44]:
[10.370679273414549,
 9.435458637399838,
 8.73997919560484,
...
 3.9624899415308983,
 3.9598669769200603,
 3.970755115174445,
 3.925747579416317]
In [45]:
model.save('model.h5')

Submission

In [46]:
for i in range(1914, 1969 +1):
    print(i)
    if i <= 1941:
        if i>= 1920:
            test_long['lag_t7'] = test_long.groupby(['id'])['demand'].transform(lambda x: x.shift(7))    
            test_long['rolling_mean_7_7'] = test_long.groupby(['id'])['demand'].transform(lambda x: x.shift(7).rolling(7).mean())
            test_long['rolling_mean_7_28'] = test_long.groupby(['id'])['demand'].transform(lambda x: x.shift(7).rolling(28).mean())
    
        forecast = make_X(test_long[test_long.d == i])
        pred = model.predict(forecast, batch_size=2 ** 14)

        test_long.loc[test_long.d == i, "demand"] = pred.clip(0) * 1.02
    else:
        test_long.loc[test_long.d == i, "demand"] = 0
1914
1915
...
1965
1969
In [47]:
test = test_long[test_long.d >= 1914]
test = test.assign(id=test.id + "_" + np.where(test.d <= 1941, "validation", "evaluation"),
                   F="F" + (test.d - 1913 - 28 * (test.d > 1941)).astype("str"))
submission = test.pivot(index="id", columns="F", values="demand").reset_index()[sample_submission.columns]
submission = sample_submission[["id"]].merge(submission, how="left", on="id")
submission.head()
Out[47]:
id F1 F2 F3 F4 F5 F6 F7 F8 F9 ... F19 F20 F21 F22 F23 F24 F25 F26 F27 F28
0 HOBBIES_1_001_CA_1_validation 0.922363 0.810547 0.759277 0.895020 0.936035 1.053711 1.212891 0.944336 0.905273 ... 0.994629 1.220703 1.088867 0.913086 0.870117 0.863770 0.924805 0.990234 1.105469 1.166016
1 HOBBIES_1_002_CA_1_validation 0.181274 0.166382 0.217651 0.188232 0.202881 0.240356 0.284668 0.142212 0.149414 ... 0.199219 0.261719 0.305664 0.175293 0.164917 0.175415 0.179810 0.208252 0.280518 0.301270
2 HOBBIES_1_003_CA_1_validation 0.493408 0.490967 0.493896 0.458984 0.529785 0.557129 0.569336 0.416016 0.409180 ... 0.543457 0.680664 0.764648 0.522949 0.512695 0.497314 0.460693 0.499512 0.618164 0.613770
3 HOBBIES_1_004_CA_1_validation 2.119141 1.777344 1.576172 1.601562 2.166016 2.789062 2.841797 2.173828 1.965820 ... 2.394531 2.935547 3.238281 2.242188 2.025391 1.917969 2.013672 2.478516 3.292969 3.171875
4 HOBBIES_1_005_CA_1_validation 0.962402 0.897461 0.874512 0.966797 1.090820 1.468750 1.792969 1.213867 1.208008 ... 1.319336 1.614258 1.688477 1.186523 1.127930 1.093750 1.141602 1.298828 1.681641 1.850586

5 rows × 29 columns

In [48]:
submission.to_csv("dnn_fake_valid_day_to_day.csv", index=False)
In [ ]:
 

このブログを検索

ブログ アーカイブ

AD

自己紹介

自分の写真
機械学習による売上予測モデル構築や商品ポートフォリオ最適化に取り組むAI屋です。E資格・G検定保有。LINEスタンプ販売やYoutubeもやってます。

QooQ