Polynomial Regression with PHP

Polynomial Regression with PHP-ML

Polynomial regression is an extension of linear regression for cases where the relationship between variables is non-linear. It transforms the input variables to higher powers (e.g., $x^2, x^3$) but remains a linear model with respect to its parameters, which makes it suitable for more complex patterns while still being fitted by ordinary least squares. In polynomial regression, we model a non-linear relationship by expanding the input variable $x$ to include its higher powers. The model equation for a polynomial regression of degree $d$ is: $y = \beta_0 + \beta_1 x + \beta_2 x^2 + \beta_3 x^3 + \dots + \beta_d x^d + \epsilon$

In this example we model PRICE as a function of RM (the average number of rooms per dwelling), using a degree-3 polynomial. The data excerpt is shown below.

CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE
0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,25.0
0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,22.6
0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,33.4
0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,5.33,36.2
0.02985,0.0,2.18,0,0.458,6.430,58.7,6.0622,3,222,18.7,394.12,5.21,28.7
0.08829,12.5,7.87,0,0.524,6.012,66.6,5.5605,5,311,15.2,395.60,12.43,20.6
0.14455,12.5,7.87,0,0.524,6.172,96.1,5.9505,5,311,15.2,396.90,19.15,22.9
0.21124,12.5,7.87,0,0.524,5.631,100.0,6.0821,5,311,15.2,386.63,29.93,16.9
0.17004,12.5,7.87,0,0.524,6.004,85.9,6.5921,5,311,15.2,386.71,17.10,18.9
0.22489,12.5,7.87,0,0.524,6.377,94.3,6.3467,5,311,15.2,392.52,20.45,21.6
0.11747,12.5,7.87,0,0.524,6.009,82.9,6.2267,5,311,15.2,396.90,13.27,18.9
0.09378,12.5,7.87,0,0.524,5.889,39.0,5.4509,5,311,15.2,390.50,15.71,21.7
0.62976,0.0,8.14,0,0.538,5.949,61.8,4.7075,4,307,21.0,396.90,8.26,20.4
0.63796,0.0,8.14,0,0.538,6.096,84.5,4.4619,4,307,21.0,380.02,10.26,21.2
0.62739,0.0,8.14,0,0.538,5.834,56.5,4.4986,4,307,21.0,395.62,8.47,19.9
0.41238,0.0,8.14,0,0.538,5.989,61.8,4.7075,4,307,21.0,396.90,10.62,22.2
0.36894,22.0,5.86,0,0.431,8.259,8.4,8.9067,7,330,19.1,396.90,3.54,37.7
0.37578,22.0,5.86,0,0.431,8.183,7.5,8.9067,7,330,19.1,396.90,3.54,37.3
0.21719,22.0,5.86,0,0.431,7.853,33.2,8.9067,7,330,19.1,396.90,3.54,40.1
0.19133,22.0,5.86,0,0.431,7.255,92.2,8.9067,7,330,19.1,393.63,6.48,37.2
0.33983,22.0,5.86,0,0.431,6.383,67.7,7.8265,7,330,19.1,396.90,9.69,25.7
0.19657,22.0,5.86,0,0.431,6.816,40.5,8.3248,7,330,19.1,392.90,5.37,31.6
0.16439,22.0,5.86,0,0.431,7.420,71.9,8.3248,7,330,19.1,396.90,4.21,38.7
0.19073,22.0,5.86,0,0.431,7.685,17.7,8.3248,7,330,19.1,396.90,3.01,38.1

Example of use:

 
<?php

use Phpml\Dataset\CsvDataset;
use 
Phpml\Regression\LeastSquares;
use 
Phpml\Metric\Regression;
use 
Phpml\Preprocessing\Normalizer;
use 
Phpml\Math\Matrix;

try {
    // Load the raw data from CSV: the first 13 columns are features, the
    // following column (PRICE) is the target, and the first row is a header.
    $dataset = new CsvDataset(__DIR__ . '/data/boston_housing.csv', 13, true);

    // Keep only the 6th column (RM; index 5 since arrays are zero-based)
    $samples = array_map(static function (array $row): array {
        return [(float) $row[5]];
    }, $dataset->getSamples());

    // Convert targets to float values (prices in thousands)
    $targets = array_map(static function ($target): float {
        return (float) $target;
    }, $dataset->getTargets());

    // Validation checks. These run BEFORE the statistics below so that an
    // empty dataset is reported with a clear message instead of failing
    // inside min()/max() or dividing by a zero sample count.
    if (empty($samples) || empty($targets)) {
        throw new InvalidArgumentException('Empty training data provided');
    }

    if (count($samples) !== count($targets)) {
        throw new InvalidArgumentException("Number of samples doesn't match number of targets");
    }

    // Calculate dataset statistics
    $rooms = array_column($samples, 0);
    $stats = [
        'min_rooms' => min($rooms),
        'max_rooms' => max($rooms),
        'avg_rooms' => array_sum($rooms) / count($rooms),
        'sample_count' => count($rooms),
    ];

    // Display dataset statistics
    echo "\nDataset Statistics:";
    echo "\n-----------------\n";
    printf("Number of samples: %d\n", $stats['sample_count']);
    printf("Average rooms: %.2f\n", $stats['avg_rooms']);
    printf("Room range: %.1f - %.1f\n", $stats['min_rooms'], $stats['max_rooms']);

    // Create regression model
    $regression = new LeastSquares();

    // Polynomial expander: maps a single-feature sample [x] to [x, x^2, x^3].
    // The same expansion must be applied to training and prediction inputs,
    // so it is defined once and reused.
    $expandPolynomial = static function (array $sample): array {
        return [
            $sample[0],           // original feature
            pow($sample[0], 2),   // squared feature
            pow($sample[0], 3),   // cubed feature
        ];
    };

    // Train the model on the original, squared and cubed features
    echo "\nTraining model...\n";
    $trainSamplesTransformed = array_map($expandPolynomial, $samples);
    $regression->train($trainSamplesTransformed, $targets);

    // Make predictions
    echo "\nPredicting house prices...\n";

    // Prepare test samples
    $testSamples = [
        [5.5],  // Small house
        [6.0],  // Medium house
        [8.0],  // Large house
        [$stats['min_rooms'] + ($stats['max_rooms'] - $stats['min_rooms']) / 2],  // Middle
        [$stats['min_rooms']], // Smallest in dataset
        [$stats['max_rooms']], // Largest in dataset
    ];

    // Expand the test samples exactly like the training samples
    $testSamplesTransformed = array_map($expandPolynomial, $testSamples);

    $predictions = $regression->predict($testSamplesTransformed);

    // Display results
    echo "\nPrice Predictions:";
    echo "\n-----------------\n";

    // array_map(null, ...) zips each test sample with its prediction
    foreach (array_map(null, $testSamples, $predictions) as [$testSample, $predictedPrice]) {
        printf(
            "A house with %.1f rooms is predicted to cost $%s\n",
            $testSample[0],
            // Targets are expressed in thousands, so scale back to dollars
            number_format($predictedPrice * 1000, 2)
        );
    }
} catch (Exception $e) {
    echo "Error: " . $e->getMessage() . "\n";
    exit(1);
}