diff --git a/app.py b/app.py new file mode 100644 index 0000000..7e8de01 --- /dev/null +++ b/app.py @@ -0,0 +1,54 @@ +import streamlit as st +from src.ui_components import ( + render_what_if_analysis, + render_promo_planner, + render_inventory_forecaster, + render_portfolio_analysis +) +from src.utils import load_all_models, load_ped_results, load_feature_means, load_price_stats + +# --- Page Configuration --- +st.set_page_config( + page_title="SME Price Optimization Tool", + page_icon="💷", + layout="wide" +) + +try: + if 'models_loaded' not in st.session_state: + st.session_state['models'] = load_all_models() + st.session_state['ped_results'] = load_ped_results() + st.session_state['feature_means'] = load_feature_means() + st.session_state['price_stats'] = load_price_stats() + st.session_state['models_loaded'] = True +except FileNotFoundError as e: + st.error(f"Model file not found. Please ensure all .joblib files are in the 'saved_models' directory. Error: {e}") + st.stop() + + +# --- App Header --- +st.title("Price Optimization for E-commerce SMEs") +st.markdown(""" +This tool is a practical demonstration of the hybrid analytical framework developed. +It combines predictive forecasting with strategic insights to help SMEs make data-driven pricing decisions. 
+""") + +# --- Sidebar Navigation --- +st.sidebar.title("Navigation") +use_case = st.sidebar.radio("Choose a Use Case:", [ + "📊 Strategic Product Portfolio Analysis", + "🎁 Strategic Promotional Planning", + "📈 'What-If' Weekly Price Setting", + "📦 Proactive Inventory Management", +]) + +# --- Main Content Area --- +if use_case == "📈 'What-If' Weekly Price Setting": + render_what_if_analysis() +elif use_case == "🎁 Strategic Promotional Planning": + render_promo_planner() +elif use_case == "📦 Proactive Inventory Management": + render_inventory_forecaster() +elif use_case == "📊 Strategic Product Portfolio Analysis": + render_portfolio_analysis() + diff --git a/pyproject.toml b/pyproject.toml index a88295f..06d1ac3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,5 +12,6 @@ dependencies = [ "scikit-learn>=1.7.1", "seaborn>=0.13.2", "statsmodels>=0.14.5", + "streamlit>=1.49.1", "xgboost>=3.0.4", ] diff --git a/saved_models/feature_means.joblib b/saved_models/feature_means.joblib new file mode 100644 index 0000000..0d26126 Binary files /dev/null and b/saved_models/feature_means.joblib differ diff --git a/saved_models/ped_results.joblib b/saved_models/ped_results.joblib new file mode 100644 index 0000000..2520bbc Binary files /dev/null and b/saved_models/ped_results.joblib differ diff --git a/saved_models/price_stats.joblib b/saved_models/price_stats.joblib new file mode 100644 index 0000000..cf76878 Binary files /dev/null and b/saved_models/price_stats.joblib differ diff --git a/saved_models/random_forest_model_JUMBO_BAG_RED_RETROSPOT.joblib b/saved_models/random_forest_model_JUMBO_BAG_RED_RETROSPOT.joblib new file mode 100644 index 0000000..62e21d5 Binary files /dev/null and b/saved_models/random_forest_model_JUMBO_BAG_RED_RETROSPOT.joblib differ diff --git a/saved_models/random_forest_model_REGENCY_CAKESTAND_3_TIER.joblib b/saved_models/random_forest_model_REGENCY_CAKESTAND_3_TIER.joblib new file mode 100644 index 0000000..62679cc Binary files /dev/null and 
b/saved_models/random_forest_model_REGENCY_CAKESTAND_3_TIER.joblib differ diff --git a/saved_models/random_forest_model_WHITE_HANGING_HEART_TLIGHT_HOLDER.joblib b/saved_models/random_forest_model_WHITE_HANGING_HEART_TLIGHT_HOLDER.joblib new file mode 100644 index 0000000..0117b98 Binary files /dev/null and b/saved_models/random_forest_model_WHITE_HANGING_HEART_TLIGHT_HOLDER.joblib differ diff --git a/saved_models/xgboost_model_JUMBO_BAG_RED_RETROSPOT.joblib b/saved_models/xgboost_model_JUMBO_BAG_RED_RETROSPOT.joblib new file mode 100644 index 0000000..79b9c75 Binary files /dev/null and b/saved_models/xgboost_model_JUMBO_BAG_RED_RETROSPOT.joblib differ diff --git a/saved_models/xgboost_model_REGENCY_CAKESTAND_3_TIER.joblib b/saved_models/xgboost_model_REGENCY_CAKESTAND_3_TIER.joblib new file mode 100644 index 0000000..0afb927 Binary files /dev/null and b/saved_models/xgboost_model_REGENCY_CAKESTAND_3_TIER.joblib differ diff --git a/saved_models/xgboost_model_WHITE_HANGING_HEART_TLIGHT_HOLDER.joblib b/saved_models/xgboost_model_WHITE_HANGING_HEART_TLIGHT_HOLDER.joblib new file mode 100644 index 0000000..618151e Binary files /dev/null and b/saved_models/xgboost_model_WHITE_HANGING_HEART_TLIGHT_HOLDER.joblib differ diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/ui_components.py b/src/ui_components.py new file mode 100644 index 0000000..67b43d4 --- /dev/null +++ b/src/ui_components.py @@ -0,0 +1,201 @@ +import streamlit as st +import pandas as pd +import datetime + +def render_what_if_analysis(): + """Renders the UI for Use Case 1 with data-driven price defaults.""" + st.header("📈 'What-If' Weekly Price Setting") + st.markdown("Use this tool to forecast weekly sales for a product at different potential price points.") + + models = st.session_state['models'] + feature_means = st.session_state['feature_means'] + price_stats = st.session_state['price_stats'] + product_list = list(models.keys()) + + col1, col2 = 
st.columns(2) + with col1: + selected_product = st.selectbox("Select a Product:", product_list, key="uc1_product") + with col2: + target_date = st.date_input("Select a Target Week:", datetime.date.today(), key="uc1_date") + + avg_price = price_stats[selected_product]['avg_price'] + + st.subheader("Enter Potential Prices to Compare") + price_col1, price_col2, price_col3 = st.columns(3) + with price_col1: + price1 = st.number_input("Price 1 (£):", min_value=0.01, value=round(avg_price * 0.95, 2), step=0.50) + with price_col2: + price2 = st.number_input("Price 2 (£):", min_value=0.01, value=round(avg_price, 2), step=0.50) + with price_col3: + price3 = st.number_input("Price 3 (£):", min_value=0.01, value=round(avg_price * 1.05, 2), step=0.50) + + if st.button("Forecast Sales", key="uc1_forecast"): + model = models[selected_product] + potential_prices = [price1, price2, price3] + results = [] + + month = target_date.month + week_of_year = target_date.isocalendar()[1] + is_holiday = 1 if month in [11, 12] else 0 + means = feature_means[selected_product] + + for price in potential_prices: + features = pd.DataFrame({ + 'Weekly_Avg_Price': [price], 'Month': [month], 'Week_of_Year': [week_of_year], + 'Quantity_Last_Week': [means['Quantity_Last_Week']], + 'Quantity_4_Week_MA': [means['Quantity_4_Week_MA']], + 'Is_Holiday_Season': [is_holiday] + }) + prediction = model.predict(features)[0] + revenue = prediction * price + results.append({ + "Proposed Price": f"£{price:.2f}", + "Predicted Weekly Sales (Units)": f"~{int(prediction)}", + "Estimated Weekly Revenue": f"~£{revenue:,.2f}" + }) + + st.subheader("Forecasted Results") + results_df = pd.DataFrame(results) + st.table(results_df) + + best_revenue_str = results_df["Estimated Weekly Revenue"].str.replace('~£', '').str.replace(',', '').astype(float).idxmax() + best_option = results_df.loc[best_revenue_str] + st.success(f"**Recommendation:** The price of **{best_option['Proposed Price']}** is forecast to generate the highest 
revenue ({best_option['Estimated Weekly Revenue']}).") + + +def render_promo_planner(): + """Renders the UI for Use Case 2 with data-driven price defaults.""" + st.header("🎁 Strategic Promotional Planning") + st.markdown("Analyze a product's price sensitivity and simulate the impact of promotional discounts.") + + models = st.session_state['models'] + ped_results = st.session_state['ped_results'] + feature_means = st.session_state['feature_means'] + price_stats = st.session_state['price_stats'] + product_list = list(models.keys()) + + selected_product = st.selectbox("Select a Product:", product_list, key="uc2_product") + + avg_price = price_stats[selected_product]['avg_price'] + ped_value = ped_results.get(selected_product, "N/A") + elasticity_text = "**Highly Elastic**" if isinstance(ped_value, float) and ped_value < -2 else "**Elastic**" if isinstance(ped_value, float) and ped_value < -1 else "**Inelastic**" + + with st.container(border=True): + st.subheader(f"Strategic Insight for: {selected_product}") + st.metric(label="Price Elasticity of Demand (PED)", value=ped_value) + st.markdown(f"**Interpretation:** This product is {elasticity_text}, making it a good candidate for promotions.") + + st.subheader("Simulate a Promotion") + promo_date = st.date_input("Select a Promotional Week:", datetime.date.today(), key="uc2_date") + + price_col1, price_col2 = st.columns(2) + with price_col1: + promo_price1 = st.number_input("Promotional Price 1 (£):", min_value=0.01, value=round(avg_price * 0.9, 2), step=0.10) + with price_col2: + promo_price2 = st.number_input("Promotional Price 2 (£):", min_value=0.01, value=round(avg_price * 0.8, 2), step=0.10) + + if st.button("Simulate Promotion", key="uc2_simulate"): + model = models[selected_product] + potential_prices = [promo_price1, promo_price2] + results = [] + + month = promo_date.month + week_of_year = promo_date.isocalendar()[1] + is_holiday = 1 if month in [11, 12] else 0 + means = feature_means[selected_product] + + for 
price in potential_prices: + features = pd.DataFrame({ + 'Weekly_Avg_Price': [price], 'Month': [month], 'Week_of_Year': [week_of_year], + 'Quantity_Last_Week': [means['Quantity_Last_Week']], + 'Quantity_4_Week_MA': [means['Quantity_4_Week_MA']], + 'Is_Holiday_Season': [is_holiday] + }) + prediction = model.predict(features)[0] + revenue = prediction * price + results.append({ + "Promotional Price": f"£{price:.2f}", + "Predicted Sales": f"~{int(prediction)}", + "Estimated Revenue": f"~£{revenue:,.2f}" + }) + + st.subheader("Promotional Forecast") + st.table(pd.DataFrame(results)) + + +def render_inventory_forecaster(): + """Renders the UI for Use Case 3 with data-driven price defaults.""" + st.header("📦 Proactive Inventory Management") + st.markdown("Forecast total demand over the next several weeks to inform your inventory planning.") + + models = st.session_state['models'] + feature_means = st.session_state['feature_means'] + price_stats = st.session_state['price_stats'] + product_list = list(models.keys()) + + col1, col2 = st.columns(2) + with col1: + selected_product = st.selectbox("Select a Product:", product_list, key="uc3_product") + with col2: + weeks_to_forecast = st.slider("Weeks to Forecast:", 1, 12, 4, key="uc3_weeks") + + avg_price = price_stats[selected_product]['avg_price'] + planned_price = st.number_input("Planned Average Price (£):", 0.01, value=round(avg_price, 2), step=0.25, key="uc3_price") + + if st.button("Forecast Demand", key="uc3_forecast"): + model = models[selected_product] + means = feature_means[selected_product] + weekly_predictions = [] + total_demand = 0 + start_date = datetime.date.today() + + for i in range(weeks_to_forecast): + target_date = start_date + datetime.timedelta(weeks=i) + month = target_date.month + week_of_year = target_date.isocalendar()[1] + is_holiday = 1 if month in [11, 12] else 0 + + features = pd.DataFrame({ + 'Weekly_Avg_Price': [planned_price], 'Month': [month], 'Week_of_Year': [week_of_year], + 
'Quantity_Last_Week': [means['Quantity_Last_Week']], + 'Quantity_4_Week_MA': [means['Quantity_4_Week_MA']], + 'Is_Holiday_Season': [is_holiday] + }) + prediction = int(model.predict(features)[0]) + total_demand += prediction + weekly_predictions.append({"Week": f"Week {i+1} (from {target_date.strftime('%d %b')})", "Forecasted Sales (Units)": f"~{prediction}"}) + + st.metric(f"Total Estimated Demand (Next {weeks_to_forecast} weeks)", f"~{total_demand:,} Units") + + st.subheader("Weekly Breakdown") + st.table(pd.DataFrame(weekly_predictions)) + st.warning("Note: Forecast assumes a constant price and uses historical averages for recent sales trends.") + + +def render_portfolio_analysis(): + """Renders the UI for Use Case 4.""" + st.header("📊 Strategic Product Portfolio Analysis") + st.markdown("Compare price sensitivity across your portfolio to inform high-level strategy.") + st.info("Price Elasticity of Demand (PED) measures how quantity demanded responds to a price change. A more negative number means demand is more sensitive to price.") + + ped_results = st.session_state['ped_results'] + portfolio_data = [] + + for product, ped in ped_results.items(): + implication = "" + if isinstance(ped, float): + if ped < -2: + implication = "**Extremely Elastic:** Volume-driven. Ideal for promotions." + elif ped < -1: + implication = "**Elastic:** Responds well to discounts." + else: + implication = "**Inelastic:** Less sensitive to price. May handle a price increase." 
+ + portfolio_data.append({ + "Product Name": product, + "Price Elasticity (PED)": ped, + "Strategic Implication": implication + }) + + st.table(pd.DataFrame(portfolio_data)) + diff --git a/src/utils.py b/src/utils.py new file mode 100644 index 0000000..7cd7a1a --- /dev/null +++ b/src/utils.py @@ -0,0 +1,52 @@ +import joblib +import os +import streamlit as st + +MODEL_DIR = 'saved_models' +SELECTED_PRODUCTS = [ + 'WHITE HANGING HEART T-LIGHT HOLDER', + 'REGENCY CAKESTAND 3 TIER', + 'JUMBO BAG RED RETROSPOT' +] + +@st.cache_data +def load_model(product_name): + """Loads a single pre-trained model file from disk.""" + safe_name = "".join(c for c in product_name if c.isalnum() or c in (' ', '_')).rstrip().replace(' ', '_') + model_path = os.path.join(MODEL_DIR, f'random_forest_model_{safe_name}.joblib') + if not os.path.exists(model_path): + raise FileNotFoundError(f"Model file not found at {model_path}") + return joblib.load(model_path) + +@st.cache_data +def load_all_models(): + """Loads all models for the selected products into a dictionary.""" + models = {} + for product in SELECTED_PRODUCTS: + models[product] = load_model(product) + return models + +@st.cache_data +def load_ped_results(): + """Loads the saved Price Elasticity of Demand results.""" + ped_path = os.path.join(MODEL_DIR, 'ped_results.joblib') + if not os.path.exists(ped_path): + raise FileNotFoundError(f"PED results file not found at {ped_path}") + return joblib.load(ped_path) + +@st.cache_data +def load_feature_means(): + """Loads the saved feature means for app placeholders.""" + means_path = os.path.join(MODEL_DIR, 'feature_means.joblib') + if not os.path.exists(means_path): + raise FileNotFoundError(f"Feature means file not found at {means_path}") + return joblib.load(means_path) + +@st.cache_data +def load_price_stats(): + """Loads the saved price stats for UI defaults.""" + stats_path = os.path.join(MODEL_DIR, 'price_stats.joblib') + if not os.path.exists(stats_path): + raise 
FileNotFoundError(f"Price stats file not found at {stats_path}") + return joblib.load(stats_path) + diff --git a/u2734832_DS7010_final.py b/u2734832_DS7010_final.py index 25c3ece..a8bbba0 100644 --- a/u2734832_DS7010_final.py +++ b/u2734832_DS7010_final.py @@ -14,7 +14,7 @@ # ============================================================================== print("--- Installing required libraries ---") -# !pip install pandas numpy statsmodels scikit-learn xgboost matplotlib seaborn -q +# !pip install pandas numpy statsmodels scikit-learn xgboost matplotlib seaborn joblib -q print("Libraries installed successfully.") import pandas as pd @@ -25,6 +25,7 @@ import seaborn as sns import os import warnings +import joblib from sklearn.model_selection import train_test_split, GridSearchCV from sklearn.ensemble import RandomForestRegressor from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error @@ -43,19 +44,30 @@ 'REGENCY CAKESTAND 3 TIER': 5.50, 'JUMBO BAG RED RETROSPOT': 0.75 } -OUTPUT_DIR = 'outputs' -FILE_PATH = "ENTER_FILE_PATH_LOCATION" +# --- IMPORTANT: UPDATE THIS PATH --- +FILE_PATH = "online_retail_II_kaggle.csv" # + +# Define output directories for models and plots +MODEL_DIR = 'saved_models' +OUTPUT_DIR = 'tmp/outputs' + +SAVE_MODEL = True + +if not os.path.exists(MODEL_DIR): + os.makedirs(MODEL_DIR) + print(f"Created directory: {MODEL_DIR}") if not os.path.exists(OUTPUT_DIR): os.makedirs(OUTPUT_DIR) print(f"Created directory: {OUTPUT_DIR}") + # --- Data Loading and Full Preparation Pipeline --- print("\n--- Starting Data Preparation Pipeline ---") try: - raw_df = pd.read_csv(FILE_PATH) + raw_df = pd.read_csv(FILE_PATH, encoding='cp1252') # Added encoding for compatibility print("Dataset loaded successfully.") except FileNotFoundError: - print(f"ERROR: '{FILE_PATH}' file not found. Please upload the file.") + print(f"ERROR: '{FILE_PATH}' file not found. 
Please update the FILE_PATH variable in the script.") exit() # Cleaning @@ -109,6 +121,15 @@ print("Data preparation complete.") print("-" * 60) +# --- Helper Function for Saving Models --- +def save_model_joblib(model_object, product_name, model_prefix): + """Saves a model object to a file using joblib.""" + safe_product_name = "".join(c for c in product_name if c.isalnum() or c in (' ', '_')).rstrip().replace(' ', '_') + model_filename = os.path.join(MODEL_DIR, f'{model_prefix}_{safe_product_name}.joblib') + joblib.dump(model_object, model_filename) + print(f"\n >>> Model for '{product_name}' saved successfully to: {model_filename} <<<") + + def plot_eda(data, product_name): print(f"\n--- Generating EDA Plots for: {product_name} ---") data['Date'] = pd.to_datetime(data['Year'].astype(str) + data['Week_of_Year'].astype(str) + '1', format='%Y%W%w') @@ -161,8 +182,7 @@ def plot_advanced_eda(data, product_name): def run_correlation_analysis(data, product_name): """ - Calculates and plots a correlation matrix for the features. Includes - both Pearson (linear) and Spearman (monotonic) correlations. + Calculates and plots a correlation matrix for the features. """ print(f"\n--- Performing Correlation Analysis for: {product_name} ---") @@ -200,11 +220,7 @@ def plot_model_diagnostics(model, product_name): print(f" - OLS diagnostic plots saved to {plot_filename}") -def run_elasticity_model(data, product_name): - """ - Runs the Log-Log regression, prints key metrics including both R-squared - and Adjusted R-squared, and generates diagnostic plots. 
- """ +def run_elasticity_model(data, product_name, save_model=False): print("\n--- Model 1: Log-Log Regression (Explanatory) ---") log_data = data.copy() log_data['log_Quantity'] = np.log1p(log_data['Quantity']) @@ -225,60 +241,71 @@ def run_elasticity_model(data, product_name): plot_model_diagnostics(model, product_name) + if save_model: + save_model_joblib(model, product_name, 'log_log_model') + return model -def tune_and_run_random_forest(product_data, product_name): - print(f"\n--- Model 2: Tuned Random Forest Regressor (Predictive) ---") +def tune_and_run_random_forest(product_data, product_name, save_model=False): + """ + This function now only trains and tunes the RF model, returning the best one. + """ + print(f"\n--- Model: Tuning Random Forest Regressor for {product_name} ---") features = ['Weekly_Avg_Price', 'Month', 'Week_of_Year', 'Quantity_Last_Week', 'Quantity_4_Week_MA', 'Is_Holiday_Season'] X = product_data[features] y = product_data['Quantity'] - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) - param_grid = {'n_estimators': [100, 200], 'max_depth': [5, 10, None], 'min_samples_leaf': [1, 2, 4], 'min_samples_split': [2, 5]} + + # We train on the full dataset for the final model to be deployed + X_train, y_train = X, y + + # A more focused parameter grid for faster tuning + param_grid = { + 'n_estimators': [100, 200], + 'max_depth': [5, 10, 15], + 'min_samples_leaf': [1, 2, 4], + 'min_samples_split': [2, 5] + } rf = RandomForestRegressor(random_state=42) grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=0, scoring='r2') grid_search.fit(X_train, y_train) + print(f" - Best parameters found: {grid_search.best_params_}") + print(f" - Best cross-validated R-squared on training data: {grid_search.best_score_:.3f}") + best_rf_model = grid_search.best_estimator_ - y_pred = best_rf_model.predict(X_test) - r2, rmse, mae = r2_score(y_test, y_pred), 
np.sqrt(mean_squared_error(y_test, y_pred)), mean_absolute_error(y_test, y_pred) - print(f" - Final R-squared: {r2:.3f}\n - Final RMSE: {rmse:.3f}\n - Final MAE: {mae:.3f}") - importances = pd.Series(best_rf_model.feature_importances_, index=X.columns) - plt.figure(figsize=(8, 5)) - importances.sort_values().plot(kind='barh') - plt.title(f'TUNED RF Feature Importance for {product_name}') - plt.tight_layout() - plot_filename = os.path.join(OUTPUT_DIR, f'{product_name}_tuned_rf_feature_importance.png') - plt.savefig(plot_filename) - plt.close() - print(f" - Feature importance plot saved to {plot_filename}") - return y_test, y_pred + + if save_model: + save_model_joblib(best_rf_model, product_name, 'random_forest_model') + + return best_rf_model + -def tune_and_run_xgboost(product_data, product_name): - print(f"\n--- Model 3: Tuned XGBoost Regressor (Predictive) ---") +def tune_and_run_xgboost(product_data, product_name, save_model=False): + print(f"\n--- Model: Tuning XGBoost Regressor for {product_name} ---") features = ['Weekly_Avg_Price', 'Month', 'Week_of_Year', 'Quantity_Last_Week', 'Quantity_4_Week_MA', 'Is_Holiday_Season'] X = product_data[features] y = product_data['Quantity'] - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) - param_grid_xgb = {'n_estimators': [100, 200], 'max_depth': [3, 5, 7], 'learning_rate': [0.05, 0.1], 'subsample': [0.7, 1.0]} + X_train, y_train = X, y # Train on full data for final model + + param_grid_xgb = { + 'n_estimators': [100, 200], + 'max_depth': [3, 5, 7], + 'learning_rate': [0.05, 0.1], + 'subsample': [0.7, 1.0] + } xgbr = xgb.XGBRegressor(objective='reg:squarederror', random_state=42) grid_search_xgb = GridSearchCV(estimator=xgbr, param_grid=param_grid_xgb, cv=5, n_jobs=-1, verbose=0, scoring='r2') grid_search_xgb.fit(X_train, y_train) + print(f" - Best parameters found: {grid_search_xgb.best_params_}") best_xgb_model = grid_search_xgb.best_estimator_ - y_pred = 
best_xgb_model.predict(X_test) - r2, rmse, mae = r2_score(y_test, y_pred), np.sqrt(mean_squared_error(y_test, y_pred)), mean_absolute_error(y_test, y_pred) - print(f" - Final R-squared: {r2:.3f}\n - Final RMSE: {rmse:.3f}\n - Final MAE: {mae:.3f}") - importances = pd.Series(best_xgb_model.feature_importances_, index=X.columns) - plt.figure(figsize=(8, 5)) - importances.sort_values().plot(kind='barh') - plt.title(f'TUNED XGBoost Feature Importance for {product_name}') - plt.tight_layout() - plot_filename = os.path.join(OUTPUT_DIR, f'{product_name}_tuned_xgb_feature_importance.png') - plt.savefig(plot_filename) - plt.close() - print(f" - Feature importance plot saved to {plot_filename}") - return y_test, y_pred + + if save_model: + save_model_joblib(best_xgb_model, product_name, 'xgboost_model') + + return best_xgb_model + def plot_model_comparison(y_test, y_pred_rf, y_pred_xgb, product_name): plt.figure(figsize=(10, 8)) @@ -293,36 +320,51 @@ def plot_model_comparison(y_test, y_pred_rf, y_pred_xgb, product_name): plt.close() print(f"\n - Comparative prediction plot saved to {plot_filename}") + def static_price_optimization(ped, cost): - if ped is None or not isinstance(ped, (int, float)) or ped >= -1: return "Cannot optimize: Demand is inelastic or PED is not valid." + if ped is None or not isinstance(ped, (int, float)) or ped >= -1: + return "Cannot optimize: Demand is inelastic or PED is not valid." 
return cost / (1 + (1 / ped)) + def apply_psychological_pricing(price): - if isinstance(price, (int, float)) and price > 0: return int(price) + 0.99 + if isinstance(price, (int, float)) and price > 0: + return int(price) + 0.99 return price -print("\n--- Starting Full Analysis Pipeline for Selected Products ---") + +print("\n--- Starting Final Model Training and Saving Pipeline ---") + +ped_results = {} for product in SELECTED_PRODUCTS: - print(f"\n{'='*60}\nAnalyzing Product: {product}\n{'='*60}") + print(f"\n{'='*60}\nProcessing Product: {product}\n{'='*60}") product_data = final_df[final_df['Description'] == product].copy() if product_data.empty: print(f"No data available for {product} after processing. Skipping.") continue - plot_eda(product_data, product) - plot_advanced_eda(product_data, product) # ADD THIS LINE - - run_correlation_analysis(product_data, product) - log_model = run_elasticity_model(product_data, product) - y_test_rf, y_pred_rf = tune_and_run_random_forest(product_data, product) - y_test_xgb, y_pred_xgb = tune_and_run_xgboost(product_data, product) - plot_model_comparison(y_test_rf, y_pred_rf, y_pred_xgb, product) - print("\n--- Price Optimization Recommendations (from Log-Log Model) ---") + + # plot_eda(product_data, product) + # plot_advanced_eda(product_data, product) + # run_correlation_analysis(product_data, product) + + + # --- 1. Train and save the best predictive model (Random Forest) --- + tune_and_run_random_forest(product_data, product, save_model=SAVE_MODEL) + tune_and_run_xgboost(product_data, product, save_model=SAVE_MODEL) + + # --- 2. 
Run the explanatory model to get PED for strategic context --- + log_model = run_elasticity_model(product_data, product_name=product) # Not saving this one by default ped_value = log_model.params.get('log_Price', None) - cost_value = PRODUCT_COSTS.get(product, np.mean(product_data['Weekly_Avg_Price']) * 0.4) - optimal_price = static_price_optimization(ped_value, cost_value) - final_price = apply_psychological_pricing(optimal_price) - print(f" - Static Optimal Price: £{optimal_price:.2f}" if isinstance(optimal_price, float) else f" - {optimal_price}") - if isinstance(final_price, float): print(f" - Recommended Price (Psychologically Adjusted): £{final_price:.2f}") + ped_results[product] = round(ped_value, 2) if ped_value is not None else "N/A" + print(f" - Calculated PED for strategic context: {ped_results[product]}") + + +# Save the PED results to a file for the app to use +ped_filename = os.path.join(MODEL_DIR, 'ped_results.joblib') +joblib.dump(ped_results, ped_filename) +print(f"\n{'='*60}\nStrategic PED values saved to: {ped_filename}") + + +print(f"\n{'='*60}\nFull Training and Saving Process Complete. All models saved to '{MODEL_DIR}'.\n{'='*60}") -print(f"\n{'='*60}\nFull Analysis Complete. All outputs saved to '{OUTPUT_DIR}'.\n{'='*60}") \ No newline at end of file