From 8005e8bc0fa755de69a7a267e323467244139b70 Mon Sep 17 00:00:00 2001 From: kdgutier Date: Tue, 19 Sep 2023 11:07:16 -0400 Subject: [PATCH] Favorita promotions data --- datasetsforecast/_modidx.py | 2 + datasetsforecast/favorita.py | 74 ++++++++++++-- nbs/favorita.ipynb | 183 +++++++++++++++++++++++++---------- 3 files changed, 201 insertions(+), 58 deletions(-) diff --git a/datasetsforecast/_modidx.py b/datasetsforecast/_modidx.py index 305dcc6..50b8321 100644 --- a/datasetsforecast/_modidx.py +++ b/datasetsforecast/_modidx.py @@ -47,6 +47,8 @@ 'datasetsforecast/favorita.py'), 'datasetsforecast.favorita.FavoritaData.load_preprocessed': ( 'favorita.html#favoritadata.load_preprocessed', 'datasetsforecast/favorita.py'), + 'datasetsforecast.favorita.FavoritaData.load_promotions': ( 'favorita.html#favoritadata.load_promotions', + 'datasetsforecast/favorita.py'), 'datasetsforecast.favorita.FavoritaRawData': ( 'favorita.html#favoritarawdata', 'datasetsforecast/favorita.py'), 'datasetsforecast.favorita.FavoritaRawData._load_raw_group_data': ( 'favorita.html#favoritarawdata._load_raw_group_data', diff --git a/datasetsforecast/favorita.py b/datasetsforecast/favorita.py index 6320449..5254c3e 100644 --- a/datasetsforecast/favorita.py +++ b/datasetsforecast/favorita.py @@ -3,7 +3,7 @@ # %% auto 0 __all__ = ['FavoritaRawData', 'FavoritaData'] -# %% ../nbs/favorita.ipynb 4 +# %% ../nbs/favorita.ipynb 5 import os import gc import timeit @@ -23,7 +23,7 @@ from .utils import download_file, extract_file, Info #, CodeTimer -# %% ../nbs/favorita.ipynb 7 +# %% ../nbs/favorita.ipynb 8 # TODO: @kdgutier `CodeTimer`/`numpy_balance` are shared with hierarchicalforecast.utils # In case of merging datasetsforecast/hierarchicalforeast we wil need to keep only one. 
class CodeTimer: @@ -108,7 +108,7 @@ def numpy_bfill(arr): out = arr[np.arange(idx.shape[0])[:,None], idx] return out -# %% ../nbs/favorita.ipynb 12 +# %% ../nbs/favorita.ipynb 13 def one_hot_encoding(df, index_col): """ Encodes dataFrame `df`'s categorical variables skipping `index_col`. @@ -175,7 +175,7 @@ def get_levels_from_S_df(S_df): assert sum([len(lv) for lv in levels]) == S_df.shape[0] return levels -# %% ../nbs/favorita.ipynb 16 +# %% ../nbs/favorita.ipynb 17 # TODO: @kdgutier `make_holidays_distance_df` partially shared with neuralforecast.utils # In particular some Transformers use a holiday-based global positional encoding. # Same goes for HINT experiment that uses such general purpose holiday distances. @@ -224,7 +224,7 @@ def make_holidays_distance_df(holidays_df, dates): holidays_distance_df = pd.DataFrame(distance_dict) return holidays_distance_df -# %% ../nbs/favorita.ipynb 18 +# %% ../nbs/favorita.ipynb 19 @dataclass class Favorita200: freq: str = 'D' @@ -265,7 +265,7 @@ class FavoritaComplete: FavoritaInfo = Info((Favorita200, Favorita500, FavoritaComplete)) -# %% ../nbs/favorita.ipynb 20 +# %% ../nbs/favorita.ipynb 21 class FavoritaRawData: """ Favorita Raw Data @@ -454,7 +454,7 @@ def _load_raw_group_data(directory, group, verbose=False): return filter_items, filter_stores, filter_dates, raw_group_data -# %% ../nbs/favorita.ipynb 26 +# %% ../nbs/favorita.ipynb 27 class FavoritaData: """ Favorita Data @@ -463,7 +463,6 @@ class FavoritaData: January 2013 to August 2017, with a geographic hierarchy of states, cities, and stores. This wrangling matches that of the DPMN paper. - - [Kin G. Olivares, O. Nganba Meetei, Ruijun Ma, Rohan Reddy, Mengfei Cao, Lee Dicker (2022)."Probabilistic Hierarchical Forecasting with Deep Poisson Mixtures". 
International Journal Forecasting, special issue.](https://doi.org/10.1016/j.ijforecast.2023.04.007) """ @staticmethod def _get_static_data(filter_items, filter_stores, items, store_info, temporal, verbose=False): @@ -593,7 +592,21 @@ def _get_temporal_bottom(temporal, item_store_df, filter_dates, verbose=False): balanced_df[col] = col_values.flatten() balanced_df[col] = balanced_df[col].fillna(0) #check_nans(balanced_df) - + + #-------------------- with CodeTimer('Promotions'): --------------------# + onpromotion = balanced_df['onpromotion'].values + onpromotion_lag = balanced_df[['unique_id', 'onpromotion']].groupby(['unique_id']) + onpromotion_lag = onpromotion_lag.shift(periods=1, fill_value=0).values.flatten() + + onpromotion_start = 1 * ((onpromotion - onpromotion_lag)>0) + onpromotion_start = onpromotion_start.reshape(n_items * n_stores, n_dates) + onpromotion = onpromotion.reshape(n_items * n_stores, n_dates) + + promotion_id = np.cumsum(onpromotion_start, axis=1) * onpromotion + + balanced_df['onpromotion_start'] = onpromotion_start.flatten() + balanced_df['promotion_id'] = promotion_id.flatten().astype(np.int64) + # Rename variables for StatsForecast/NeuralForecast compatibility balanced_df.rename(columns={"date": "ds", "unit_sales": "y"}, inplace=True) @@ -901,3 +914,46 @@ def load(directory: str, group: str, cache: bool=True, verbose: bool=False): ds = ds, y = Y_hier.flatten())) return Y_df, S_df, tags + + @staticmethod + def load_promotions(directory: str, group: str, cache: bool=True, verbose: bool=False): + """ + Load Favorita promotions benchmark dataset. + + Normally time series forecasting assumes regularly sampled observations from same + generative processes. This dataset aims presents a more intricate problem, forecasting + sales at the promotion event. 
The unique feature of promotion forecasting is that + promotions are irregular events of varying length; making standard forecasting approach + not directly applicable and calling for the neural architecture innovations. + + The dataset contains daily sales trajectories at the item-store level, and its promotions. + The dataset is augmented with calendar variables (month, day of week, day of month), and + is accompanied with static features of the store location as well as the item characteristics. + + **Parameters:**
+ `directory`: str, directory where data will be downloaded and saved.
+ `group`: str, dataset group name in 'Favorita200', 'Favorita500', 'FavoritaComplete'.
+        `cache`: bool=True, if `True` saves and loads.
+        `verbose`: bool=False, whether or not to print partial outputs.
+ + **Returns:**
+ `Y_df`: pd.DataFrame, augmented daily item-store sales and promotions.
+ `static_df`: pd.DataFrame, store and item level static features.
+ """ + static_agg, static_bottom, temporal_agg, temporal_bottom, _ = \ + FavoritaData.load_preprocessed(directory=directory, group=group, + cache=cache, verbose=verbose) + + # Temporal Data + Y_df = temporal_bottom.copy() + Y_df['unique_promotion_id'] = Y_df['unique_id'].astype(str) + \ + '_' + Y_df['promotion_id'].astype(str) + + temporal_agg = temporal_agg[['item_nbr', 'ds', 'month', 'day_of_week', 'day_of_month']] + Y_df = Y_df.merge(temporal_agg, on=['item_nbr', 'ds'], how='left') + + # Static Data + static_agg = static_agg.filter(regex=("family.*|item_nbr|perishable")) + static_df = static_bottom.merge(static_agg, on=['item_nbr'], how='left') + + return Y_df, static_df diff --git a/nbs/favorita.ipynb b/nbs/favorita.ipynb index 1d6af16..f76a9f4 100644 --- a/nbs/favorita.ipynb +++ b/nbs/favorita.ipynb @@ -14,9 +14,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Favorita\n", - "\n", - "## Description\n", + "# Favorita" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ "The 2018 Kaggle competition was organized by Corporación Favorita, a major Ecuatorian grocery retailer. The Favorita dataset is comprised of item sales history and promotions information, with additional information of items and stores,regional and national holidays, among other. \n", "\n", "The competition task consisted on forecasting sixteen days for the log-sales of particular item store combinations, for 210,654 series. \n", @@ -32,7 +36,6 @@ "| Stores | 54 | 217,944 | 217,944 |\n", "| Total | 93 | 371,312 | 371,312 |\n", "\n", - "## References\n", "- [Corporación Favorita (2018). Corporación favorita grocery sales forecasting. Kaggle Competition. URL: https://www.kaggle.com/c/favorita-grocery-sales-forecasting/.](https://www.kaggle.com/c/favorita-grocery-sales-forecasting/)
\n", "- [Kin G. Olivares, O. Nganba Meetei, Ruijun Ma, Rohan Reddy, Mengfei Cao, Lee Dicker (2022).\"Probabilistic Hierarchical Forecasting with Deep Poisson Mixtures\". International Journal Forecasting, special issue.](https://doi.org/10.1016/j.ijforecast.2023.04.007)
" ] @@ -91,7 +94,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Auxiliary Functions\n", + "# 1. Auxiliary Functions\n", "\n", "This auxiliary functions are used to efficiently create and wrangle Favorita's series." ] @@ -101,7 +104,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Numpy Wrangling" + "## Numpy Wrangling" ] }, { @@ -202,7 +205,7 @@ "metadata": {}, "outputs": [], "source": [ - "show_doc(numpy_balance, title_level=4)" + "show_doc(numpy_balance, title_level=3)" ] }, { @@ -211,7 +214,7 @@ "metadata": {}, "outputs": [], "source": [ - "show_doc(numpy_ffill, title_level=4)" + "show_doc(numpy_ffill, title_level=3)" ] }, { @@ -220,7 +223,7 @@ "metadata": {}, "outputs": [], "source": [ - "show_doc(numpy_bfill, title_level=4)" + "show_doc(numpy_bfill, title_level=3)" ] }, { @@ -228,7 +231,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Pandas Wrangling" + "## Pandas Wrangling" ] }, { @@ -311,7 +314,7 @@ "metadata": {}, "outputs": [], "source": [ - "show_doc(one_hot_encoding, title_level=4)" + "show_doc(one_hot_encoding, title_level=3)" ] }, { @@ -320,7 +323,7 @@ "metadata": {}, "outputs": [], "source": [ - "show_doc(nested_one_hot_encoding, title_level=4)" + "show_doc(nested_one_hot_encoding, title_level=3)" ] }, { @@ -329,7 +332,7 @@ "metadata": {}, "outputs": [], "source": [ - "show_doc(get_levels_from_S_df, title_level=4)" + "show_doc(get_levels_from_S_df, title_level=3)" ] }, { @@ -393,7 +396,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Favorita Dataset" + "# 2. 
Favorita Dataset" ] }, { @@ -449,7 +452,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Favorita Raw" + "## FavoritaRawData" ] }, { @@ -654,7 +657,7 @@ "metadata": {}, "outputs": [], "source": [ - "show_doc(FavoritaRawData, title_level=4)" + "show_doc(FavoritaRawData, title_level=3)" ] }, { @@ -663,7 +666,7 @@ "metadata": {}, "outputs": [], "source": [ - "show_doc(FavoritaRawData._load_raw_group_data, title_level=4)" + "show_doc(FavoritaRawData._load_raw_group_data, title_level=3)" ] }, { @@ -671,7 +674,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "#### Favorita Raw Usage example" + "### Favorita Raw Usage example" ] }, { @@ -707,7 +710,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### FavoritaData" + "## FavoritaData" ] }, { @@ -725,7 +728,6 @@ " January 2013 to August 2017, with a geographic hierarchy of states, cities, and stores. \n", " This wrangling matches that of the DPMN paper.\n", "\n", - " - [Kin G. Olivares, O. Nganba Meetei, Ruijun Ma, Rohan Reddy, Mengfei Cao, Lee Dicker (2022).\"Probabilistic Hierarchical Forecasting with Deep Poisson Mixtures\". 
International Journal Forecasting, special issue.](https://doi.org/10.1016/j.ijforecast.2023.04.007)\n", " \"\"\"\n", " @staticmethod\n", " def _get_static_data(filter_items, filter_stores, items, store_info, temporal, verbose=False):\n", @@ -855,7 +857,21 @@ " balanced_df[col] = col_values.flatten()\n", " balanced_df[col] = balanced_df[col].fillna(0)\n", " #check_nans(balanced_df)\n", - " \n", + "\n", + " #-------------------- with CodeTimer('Promotions'): --------------------#\n", + " onpromotion = balanced_df['onpromotion'].values\n", + " onpromotion_lag = balanced_df[['unique_id', 'onpromotion']].groupby(['unique_id'])\n", + " onpromotion_lag = onpromotion_lag.shift(periods=1, fill_value=0).values.flatten()\n", + "\n", + " onpromotion_start = 1 * ((onpromotion - onpromotion_lag)>0)\n", + " onpromotion_start = onpromotion_start.reshape(n_items * n_stores, n_dates)\n", + " onpromotion = onpromotion.reshape(n_items * n_stores, n_dates)\n", + "\n", + " promotion_id = np.cumsum(onpromotion_start, axis=1) * onpromotion\n", + "\n", + " balanced_df['onpromotion_start'] = onpromotion_start.flatten()\n", + " balanced_df['promotion_id'] = promotion_id.flatten().astype(np.int64)\n", + "\n", " # Rename variables for StatsForecast/NeuralForecast compatibility\n", " balanced_df.rename(columns={\"date\": \"ds\", \"unit_sales\": \"y\"}, inplace=True)\n", "\n", @@ -1162,7 +1178,59 @@ " item_id = item_id, hier_id = hier_id,\n", " ds = ds, y = Y_hier.flatten()))\n", " \n", - " return Y_df, S_df, tags" + " return Y_df, S_df, tags\n", + " \n", + " @staticmethod\n", + " def load_promotions(directory: str, group: str, cache: bool=True, verbose: bool=False):\n", + " \"\"\"\n", + " Load Favorita promotions benchmark dataset.\n", + "\n", + " Normally time series forecasting assumes regularly sampled observations from same\n", + " generative processes. This dataset aims presents a more intricate problem, forecasting\n", + " sales at the promotion event. 
The unique feature of promotion forecasting is that \n", + " promotions are irregular events of varying length; making standard forecasting approach \n", + " not directly applicable and calling for the neural architecture innovations.\n", + "\n", + " The dataset contains daily sales trajectories at the item-store level, and its promotions. \n", + " The dataset is augmented with calendar variables (month, day of week, day of month), and\n", + " is accompanied with static features of the store location as well as the item characteristics.\n", + "\n", + " **Parameters:**
\n", + " `directory`: str, directory where data will be downloaded and saved.
\n", + " `group`: str, dataset group name in 'Favorita200', 'Favorita500', 'FavoritaComplete'.
\n", + " `cache`: bool=False, If `True` saves and loads.
\n", + " `verbose`: bool=False, wether or not print partial outputs.
\n", + "\n", + " **Returns:**
\n", + " `Y_df`: pd.DataFrame, augmented daily item-store sales and promotions.
\n", + " `static_df`: pd.DataFrame, store and item level static features.
\n", + " \"\"\"\n", + " static_agg, static_bottom, temporal_agg, temporal_bottom, _ = \\\n", + " FavoritaData.load_preprocessed(directory=directory, group=group,\n", + " cache=cache, verbose=verbose)\n", + "\n", + " # Temporal Data\n", + " Y_df = temporal_bottom.copy()\n", + " Y_df['unique_promotion_id'] = Y_df['unique_id'].astype(str) + \\\n", + " '_' + Y_df['promotion_id'].astype(str)\n", + "\n", + " temporal_agg = temporal_agg[['item_nbr', 'ds', 'month', 'day_of_week', 'day_of_month']]\n", + " Y_df = Y_df.merge(temporal_agg, on=['item_nbr', 'ds'], how='left')\n", + "\n", + " # Static Data\n", + " static_agg = static_agg.filter(regex=(\"family.*|item_nbr|perishable\"))\n", + " static_df = static_bottom.merge(static_agg, on=['item_nbr'], how='left')\n", + "\n", + " return Y_df, static_df " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "show_doc(FavoritaData, title_level=3)" ] }, { @@ -1171,7 +1239,7 @@ "metadata": {}, "outputs": [], "source": [ - "show_doc(FavoritaData, title_level=4)" + "show_doc(FavoritaData.load_preprocessed, title_level=3)" ] }, { @@ -1180,7 +1248,7 @@ "metadata": {}, "outputs": [], "source": [ - "show_doc(FavoritaData.load_preprocessed, title_level=4)" + "show_doc(FavoritaData.load, title_level=3)" ] }, { @@ -1189,7 +1257,7 @@ "metadata": {}, "outputs": [], "source": [ - "show_doc(FavoritaData.load, title_level=4)" + "show_doc(FavoritaData.load_promotions, title_level=3)" ] }, { @@ -1289,24 +1357,24 @@ "metadata": {}, "outputs": [], "source": [ - "# #| hide\n", - "# #| eval: false\n", - "# # Test the equality of created and loaded datasets columns and rows\n", - "# static_agg1, static_bottom1, temporal_agg1, temporal_bottom1, S_df1 = \\\n", - "# FavoritaData.load_preprocessed(directory=directory, group=group, cache=False)\n", - "\n", - "# static_agg2, static_bottom2, temporal_agg2, temporal_bottom2, S_df2 = \\\n", - "# FavoritaData.load_preprocessed(directory=directory, 
group=group)\n", - "\n", - "# test_eq(len(static_agg1)+len(static_agg1.columns), \n", - "# len(static_agg2)+len(static_agg2.columns))\n", - "# test_eq(len(static_bottom1)+len(static_bottom1.columns), \n", - "# len(static_bottom2)+len(static_bottom2.columns))\n", - "\n", - "# test_eq(len(temporal_agg1)+len(temporal_agg1.columns), \n", - "# len(temporal_agg2)+len(temporal_agg2.columns))\n", - "# test_eq(len(temporal_bottom1)+len(temporal_bottom1.columns), \n", - "# len(temporal_bottom2)+len(temporal_bottom2.columns))" + "#| hide\n", + "#| eval: false\n", + "# Test the equality of created and loaded datasets columns and rows\n", + "static_agg1, static_bottom1, temporal_agg1, temporal_bottom1, S_df1 = \\\n", + " FavoritaData.load_preprocessed(directory=directory, group=group, cache=False)\n", + "\n", + "static_agg2, static_bottom2, temporal_agg2, temporal_bottom2, S_df2 = \\\n", + " FavoritaData.load_preprocessed(directory=directory, group=group)\n", + "\n", + "test_eq(len(static_agg1)+len(static_agg1.columns), \n", + " len(static_agg2)+len(static_agg2.columns))\n", + "test_eq(len(static_bottom1)+len(static_bottom1.columns), \n", + " len(static_bottom2)+len(static_bottom2.columns))\n", + "\n", + "test_eq(len(temporal_agg1)+len(temporal_agg1.columns), \n", + " len(temporal_agg2)+len(temporal_agg2.columns))\n", + "test_eq(len(temporal_bottom1)+len(temporal_bottom1.columns), \n", + " len(temporal_bottom2)+len(temporal_bottom2.columns))" ] }, { @@ -1323,10 +1391,10 @@ "metadata": {}, "outputs": [], "source": [ - "#| eval: false\n", + "# | eval: false\n", "# Qualitative evaluation of hierarchical data\n", "from datasetsforecast.favorita import FavoritaData\n", - "from hierarchicalforecast.utils import HierarchicalPlot\n", + "# from hierarchicalforecast.utils import HierarchicalPlot\n", "\n", "group = 'Favorita200' # 'Favorita500', 'FavoritaComplete'\n", "directory = './data/favorita'\n", @@ -1337,10 +1405,27 @@ "Y_item_df = Y_item_df.set_index('unique_id')\n", "del 
Y_item_df['item_id']\n", "\n", - "hplots = HierarchicalPlot(S=S_df, tags=tags)\n", - "hplots.plot_hierarchically_linked_series(\n", - " Y_df=Y_item_df, bottom_series='store_[40]',\n", - ")" + "#hplots = HierarchicalPlot(S=S_df, tags=tags)\n", + "#hplots.plot_hierarchically_linked_series(\n", + "# Y_df=Y_item_df, bottom_series='store_[40]',\n", + "#)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# | eval: false\n", + "# Qualitative evaluation of hierarchical data\n", + "group = 'Favorita200' # 'Favorita500', 'FavoritaComplete'\n", + "directory = './data/favorita'\n", + "\n", + "Y_df, static_df = \\\n", + " FavoritaData.load_promotions(directory=directory, group=group, cache=True, verbose=True)\n", + "\n", + "Y_df" ] }, {