Step #4

ML Predictions using the TabPFN Model

Last update: August 14, 2025

AI Assistance: Claude.AI (Anthropic) is used for documentation, code restructuring, and performance optimization.

This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with this program. If not, see https://www.gnu.org/licenses/.

Overall Strategy

Step 1: Preprocess and engineer new features.

Step 2: Use AutoGluon to generate out-of-fold (OOF) predictions for each target separately. These predictions are used as additional input features in steps 3 and 4.

Step 3: Train the RealMLP model with processed input (step 1) + ten AutoGluon-OOFs (step 2). These additional features will capture the correlation among targets effectively.

Step 4: Same as step 3, but using the TabPFN (v2) model.

Step 5: Combine predictions from RealMLP (step 3) and TabPFN (step 4).

Imports

[ ]:
import numpy as np
import pandas as pd
import os
import random

from scipy.stats import hmean
from sklearn.metrics import mean_absolute_percentage_error as mape

import tabpfn
from tabpfn import TabPFNRegressor

Set Random Seeds

[ ]:
# Seed Python's and NumPy's global RNGs so the per-trial shuffles drawn with
# np.random.permutation are repeatable from run to run.
#
# np.random.seed() already seeds NumPy's legacy global RandomState, so the
# extra set_state(RandomState(7).get_state()) call that used to follow it
# restored the exact same state and has been dropped.
SEED = 7
random.seed(SEED)
np.random.seed(SEED)

User Input

[ ]:
# --- Experiment configuration -------------------------------------------

# Target variables to predict (one model is fitted per target)
nTargets = 10

# Width of the model input: 65 processed features + 10 OOF columns
nFeatures = 65 + 10

# Repetitions of the shuffled k-fold procedure, and folds per repetition
nTrials, nFolds = 250, 8

Input & Output Directories

[ ]:
# Project root; every other location below is derived from it.
ROOT_DIR = '/data/Sukanta/Works_AIML/2025_SHELL_FuelProperty/'
DATA_DIR = f'{ROOT_DIR}DATA/'
ExtractedDATA_DIR = f'{ROOT_DIR}ExtractedDATA/'
Tuning_DIR = f'{ROOT_DIR}Models/TabPFN/'

# Make sure the TabPFN model directory is present (no error if it already is)
os.makedirs(Tuning_DIR, exist_ok=True)

Load Processed Training and Testing Data

[ ]:
# Processed training table (features followed by targets, judging by how the
# columns are sliced later on) and its row count
df_XyTrnVal_org = pd.read_csv(f'{ExtractedDATA_DIR}train_processed.csv')
nSamples_TrnVal = len(df_XyTrnVal_org)

# Processed test table and its row count
df_XTst = pd.read_csv(f'{ExtractedDATA_DIR}test_processed.csv')
nSamples_Tst = len(df_XTst)

Load AutoGluon-generated OOF Data

[ ]:
# AutoGluon predictions used as extra input columns: the first file is read
# for the training rows, the second for the test rows.
df_XTrnVal_AG_OOF, df_XTst_AG_OOF = (
    pd.read_csv(ExtractedDATA_DIR + csv_name)
    for csv_name in ('AutoGluon_21600_OOF.csv', 'AutoGluon_21600_Tst.csv')
)

Combine Dataframes

[ ]:
# pd.concat(axis=1) aligns rows on the index and pads any mismatch with NaN,
# so verify that each AutoGluon table has exactly as many rows as the table it
# is attached to rather than training on silently padded data.
if len(df_XTrnVal_AG_OOF) != len(df_XyTrnVal_org):
    raise ValueError(
        f"Row mismatch (train): AutoGluon OOF has {len(df_XTrnVal_AG_OOF)} "
        f"rows, processed data has {len(df_XyTrnVal_org)}")
if len(df_XTst_AG_OOF) != len(df_XTst):
    raise ValueError(
        f"Row mismatch (test): AutoGluon predictions have "
        f"{len(df_XTst_AG_OOF)} rows, processed data has {len(df_XTst)}")

# Put the AutoGluon columns first, followed by the processed columns.
# NOTE: df_XTst is rebound here, so re-running this cell without reloading
# the test data would prepend the AutoGluon columns a second time.
df_XyTrnVal = pd.concat([df_XTrnVal_AG_OOF, df_XyTrnVal_org], axis=1)
df_XTst = pd.concat([df_XTst_AG_OOF, df_XTst], axis=1)

Initialize Storage for Results

[ ]:
# Result containers, keyed first by trial and then (once filled in by the
# training loop) by target index. Each trial starts with an empty inner dict.
dict_yTrnVal_OOF = {trial: {} for trial in range(nTrials)}
dict_yTst_pred_allFold = {trial: {} for trial in range(nTrials)}
dict_CV_scores = {trial: {} for trial in range(nTrials)}

Iterative Single-target Training using TabPFN

[ ]:
# Repeated, shuffled k-fold training of one TabPFN regressor per target.
#
# For every trial the training rows are reshuffled, split into nFolds
# contiguous validation blocks, and a fresh TabPFNRegressor is fitted per
# (target, fold). Out-of-fold predictions are mapped back to the original row
# order; test predictions are combined across folds. Results are written into
# dict_yTrnVal_OOF, dict_yTst_pred_allFold and dict_CV_scores.

# Nominal fold size (kept for reference / backward compatibility)
nSamples_per_fold = nSamples_TrnVal // nFolds

# Fold boundaries in shuffled-row space. Integer edges k*N//nFolds cover every
# row exactly once even when N is not a multiple of nFolds; cutting at
# k*nSamples_per_fold would leave the trailing remainder rows out of every
# validation fold, so their OOF predictions would stay at zero.
fold_edges = [(k * nSamples_TrnVal) // nFolds for k in range(nFolds + 1)]

# The test feature matrix does not change across trials/targets/folds, so
# build it once instead of re-slicing the DataFrame for every fitted model.
XTst = df_XTst.iloc[:, 0:nFeatures].values

# n-repetitions of TabPFN models (resampling)
for trial in range(nTrials):

    print(f"\n=== TRIAL {trial + 1}/{nTrials} ===")

    # Shuffle the training rows; argsort of the permutation gives the indices
    # that undo the shuffle later.
    shuffle_indx = np.random.permutation(nSamples_TrnVal)
    restore_indx = np.argsort(shuffle_indx)
    df_XyTrnVal_shuffled = (
        df_XyTrnVal.iloc[shuffle_indx].reset_index(drop=True))

    # Input features are the first nFeatures columns
    XTrnVal_shuffled = df_XyTrnVal_shuffled.iloc[:, 0:nFeatures].values

    # One independent single-output model per target
    for target in range(nTargets):

        print(f"\n--- Target {target + 1}/{nTargets} ---")

        # Target columns start right after the feature columns
        yTrnVal_shuffled = (
            df_XyTrnVal_shuffled.iloc[:, nFeatures + target].values)

        # Prediction buffers. The OOF buffer is explicitly float: zeros_like
        # would inherit the target column's dtype and truncate predictions if
        # that column happened to be integer-typed.
        yTrnVal_shuffled_pred = np.zeros(nSamples_TrnVal, dtype=float)
        yTst_pred = np.zeros((nSamples_Tst, nFolds))

        # K-folds
        for Fold in range(nFolds):
            # Validation block for this fold: rows [val_start, val_end)
            val_start, val_end = fold_edges[Fold], fold_edges[Fold + 1]

            # Training rows are everything outside the validation block
            XTrn_shuffled_fold = np.concatenate(
                (XTrnVal_shuffled[:val_start], XTrnVal_shuffled[val_end:]))
            yTrn_shuffled_fold = np.concatenate(
                (yTrnVal_shuffled[:val_start], yTrnVal_shuffled[val_end:]))

            XVal_shuffled_fold = XTrnVal_shuffled[val_start:val_end]
            yVal_shuffled_fold = yTrnVal_shuffled[val_start:val_end]

            print(
                f"  Fold {Fold + 1}/{nFolds}: "
                f"Train={len(yTrn_shuffled_fold)}, "
                f"Val={len(yVal_shuffled_fold)}")

            # Fresh TabPFN model with default settings (no tuning)
            regressor = TabPFNRegressor()
            regressor.fit(XTrn_shuffled_fold, yTrn_shuffled_fold)

            # Out-of-fold predictions for the held-out block
            yVal_shuffled_fold_pred = regressor.predict(XVal_shuffled_fold)
            yTrnVal_shuffled_pred[val_start:val_end] = yVal_shuffled_fold_pred

            # Test-set predictions from this fold's model
            yTst_pred[:, Fold] = regressor.predict(XTst)
            print(f"Test predictions generated for Fold {Fold + 1}")

        # Undo the shuffle so OOF predictions line up with the original rows
        yTrnVal_OOF = yTrnVal_shuffled_pred[restore_indx]

        # Combine the per-fold test predictions: harmonic mean of the
        # magnitudes, with the sign taken from the arithmetic mean.
        yTst_pred_allFold = (hmean(np.abs(yTst_pred), axis=1) *
                             np.sign(np.mean(yTst_pred, axis=1)))

        # Store predictions (both are freshly created arrays)
        dict_yTrnVal_OOF[trial][target] = yTrnVal_OOF
        dict_yTst_pred_allFold[trial][target] = yTst_pred_allFold

        # CV score for this trial/target (order-independent, so the shuffled
        # arrays can be compared directly)
        dict_CV_scores[trial][target] = mape(yTrnVal_shuffled,
                                             yTrnVal_shuffled_pred)

Average Results Across Trials

[ ]:
print("\n=== AVERAGING ACROSS TRIALS ===")


def _signed_hmean(per_trial_preds):
    """Harmonic mean of |predictions| over trials, signed by their mean."""
    stacked = np.asarray(per_trial_preds)
    return hmean(np.abs(stacked), axis=0) * np.sign(np.mean(stacked, axis=0))


dict_yTrnVal_avg_final = {}
dict_yTst_avg_final = {}
dict_CV_scores_avg = {}

for target in range(nTargets):
    # Collapse the per-trial OOF predictions for the training rows
    dict_yTrnVal_avg_final[target] = _signed_hmean(
        [dict_yTrnVal_OOF[trial][target] for trial in range(nTrials)])

    # Collapse the per-trial test-set predictions the same way
    dict_yTst_avg_final[target] = _signed_hmean(
        [dict_yTst_pred_allFold[trial][target] for trial in range(nTrials)])

    # Score the trial-averaged OOF predictions against the true target column
    y_true = df_XyTrnVal.iloc[:, nFeatures + target].values
    dict_CV_scores_avg[target] = mape(y_true, dict_yTrnVal_avg_final[target])

    print(f"Target {target + 1}: Avg CV MAPE = {dict_CV_scores_avg[target]:.4f}")

Save Results

[ ]:
print("\n=== SAVING RESULTS ===")

# Assemble the submission table in one go: a 1-based ID column followed by
# one BlendProperty<k> column per target holding the trial-averaged test
# predictions.
submission_columns = {'ID': range(1, nSamples_Tst + 1)}
submission_columns.update(
    (f'BlendProperty{target+1}', dict_yTst_avg_final[target])
    for target in range(nTargets))
df_submission = pd.DataFrame(submission_columns)

# Write without the DataFrame index so only ID + property columns are saved
df_submission.to_csv(ExtractedDATA_DIR + 'TabPFN_submission.csv', index=False)

print("TabPFN training completed!")