Step #2

Generating Out-of-Fold (OOF) Target Values using AutoGluon

Last update: August 14, 2025

AI Assistance: Claude.AI (Anthropic) is used for documentation, code restructuring, and performance optimization

Copyright (C) 2025 Sukanta Basu

This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with this program. If not, see https://www.gnu.org/licenses/.

Overall Strategy

Step 1: Preprocess and engineer new features.

Step 2: Use AutoGluon to generate OOF predictions for each target separately. These predictions will be used as additional input features in steps 3 and 4.

Step 3: Train the RealMLP model on the processed inputs (step 1) plus the ten AutoGluon OOF predictions (step 2). These additional features help the model capture the correlations among the targets.

Step 4: Same as step 3, but using the TabPFN model instead of RealMLP.

Step 5: Combine predictions from RealMLP (step 3) and TabPFN (step 4).

Imports

[ ]:
import numpy as np
import pandas as pd
import os
import random
import warnings

from autogluon.tabular import TabularPredictor

Set Random Seeds

[ ]:
# Seed NumPy's global RNG and Python's built-in RNG with the same fixed value
# so that code drawing from either generator is repeatable between runs.
# NOTE(review): whether AutoGluon's internal model seeds are governed by these
# two calls is not visible in this notebook - confirm if exact reproducibility
# of the trained models is required.
np.random.seed(seed=7)
random.seed(a=7)

User Input

[ ]:
# --- User-tunable settings ---

# Preset name forwarded to AutoGluon's fit() through its `presets` argument
quality_preset = 'best_quality'

# Time limit (in seconds) given to each AutoGluon fit() call: 6 hours
maxTime = 6 * 60 * 60

# Column layout of the processed training table: the first `nFeatures` columns
# are the model inputs, the next `nTargets` columns are the target variables
nFeatures, nTargets = 65, 10

Input & Output Directories

[ ]:
# Project root; every other location below is derived from it. Each derived
# path ends with a slash, so a file name can be appended to it directly.
ROOT_DIR = '/data/Sukanta/Works_AIML/2025_SHELL_FuelProperty/'
DATA_DIR = f'{ROOT_DIR}DATA/'
ExtractedDATA_DIR = f'{ROOT_DIR}ExtractedDATA/'
Tuning_DIR = f'{ROOT_DIR}Models/AutoGluon-OOF/'

# Make sure the AutoGluon output folder is present. Missing parent folders are
# created as well, and an already existing folder is accepted (exist_ok=True).
os.makedirs(name=Tuning_DIR, exist_ok=True)

Load Processed Training and Testing Data

[ ]:
# Load the processed training table (input features followed by targets) and
# the processed test table. Paths are built with os.path.join, matching how
# the result files are written, so they do not depend on ExtractedDATA_DIR
# ending with a path separator.
df_XyTrnVal_org = pd.read_csv(os.path.join(ExtractedDATA_DIR, 'train_processed.csv'))
nSamples_TrnVal = df_XyTrnVal_org.shape[0]

df_XTst = pd.read_csv(os.path.join(ExtractedDATA_DIR, 'test_processed.csv'))
nSamples_Tst = df_XTst.shape[0]

print(f"Training data shape: {df_XyTrnVal_org.shape}")
print(f"Test data shape: {df_XTst.shape}")

# Fail fast on a column-count mismatch. The positional feature slice below
# truncates silently, and a missing target column would otherwise only raise
# an IndexError inside the training loop - possibly after earlier targets have
# already been trained for hours.
if df_XyTrnVal_org.shape[1] < nFeatures + nTargets:
    raise ValueError(
        f"train_processed.csv has {df_XyTrnVal_org.shape[1]} columns, but "
        f"nFeatures + nTargets = {nFeatures + nTargets} are required "
        f"(nFeatures={nFeatures}, nTargets={nTargets})"
    )

# Input features are the first nFeatures columns of the training table
XTrnVal = df_XyTrnVal_org.iloc[:, 0:nFeatures]

Iterative Single-target Training using AutoGluon

[ ]:
# Pre-allocate the result arrays, one column per target: out-of-fold (OOF)
# predictions for the training rows and regular predictions for the test rows.
yTrnVal_OOF = np.zeros(shape=(nSamples_TrnVal, nTargets))
yTst = np.zeros(shape=(nSamples_Tst, nTargets))

# Every target is modelled on its own with a separate AutoGluon predictor.
for target in range(nTargets):
    target_no = target + 1           # 1-based number used in messages and folder names
    label = f'target_{target}'       # 0-based name of the label column
    print(f"\n--- Target {target_no}/{nTargets} ---")

    # Target columns sit directly after the nFeatures input columns
    yTrnVal = df_XyTrnVal_org.iloc[:, nFeatures + target]

    # Training table = input features + the current target as an extra column
    train_data = XTrnVal.assign(**{label: yTrnVal})

    # Each target stores its models in its own sub-folder of Tuning_DIR
    target_path = os.path.join(Tuning_DIR, f'target_{target_no}')
    os.makedirs(target_path, exist_ok=True)

    # Single-target regression predictor scored with MAPE
    predictor = TabularPredictor(
        problem_type='regression',
        eval_metric='mean_absolute_percentage_error',
        label=label,
        path=target_path,
    )

    # Fit with the user-selected preset and time limit. The bagging/stacking
    # layout is given explicitly (8 folds, 5 bagging sets, 2 stack levels)
    # with auto_stack and dynamic_stacking switched off.
    print("Starting AutoGluon training...")
    predictor.fit(
        train_data,
        presets=quality_preset,
        time_limit=maxTime,
        verbosity=2,
        auto_stack=False,
        dynamic_stacking=False,
        num_bag_folds=8,
        num_bag_sets=5,
        num_stack_levels=2,
        use_bag_holdout=False,
        fit_strategy="sequential",
        ag_args_ensemble={'fold_fitting_strategy': "parallel_local"},
        ds_args={'enable_ray_logging': False},
    )

    # Show the first rows of the leaderboard, sorted by descending validation score
    print("\n Model Leaderboard:")
    leaderboard = predictor.leaderboard(silent=True)
    best_first = leaderboard.sort_values(by="score_val", ascending=False)
    print(best_first.head())

    # Out-of-fold predictions for the training rows go into this target's column
    yTrnVal_OOF[:, target] = predictor.predict_oof()

    # Predictions for the test rows go into the matching column of yTst
    yTst[:, target] = predictor.predict(df_XTst)
    print(f"Test predictions generated for target {target_no}")

    # Drop the reference to the predictor before the next target is trained
    del predictor

Save Results

[ ]:
print("\n=== SAVING RESULTS ===")

# Column names shared by both output tables: AG-BlendProperty1, AG-BlendProperty2, ...
ag_columns = [f'AG-BlendProperty{i + 1}' for i in range(nTargets)]

# One column per target, filled from the matching column of each prediction array
df_AG_yTrnVal_OOF = pd.DataFrame(
    {name: yTrnVal_OOF[:, i] for i, name in enumerate(ag_columns)}
)
df_AG_yTst = pd.DataFrame(
    {name: yTst[:, i] for i, name in enumerate(ag_columns)}
)

# Output files live next to the processed data; the time limit is part of the
# file name so results from different time budgets do not overwrite each other
AG_OOF_file = os.path.join(ExtractedDATA_DIR, f'AutoGluon_{maxTime}_OOF.csv')
AG_Tst_file = os.path.join(ExtractedDATA_DIR, f'AutoGluon_{maxTime}_Tst.csv')

# Write both tables without the index column
df_AG_yTrnVal_OOF.to_csv(AG_OOF_file, index=False)
df_AG_yTst.to_csv(AG_Tst_file, index=False)

print("AutoGluon training completed!")