Linear Regression with PHP
Lasso Regression with Rubix
In Lasso (Least Absolute Shrinkage and Selection Operator) regression, the penalty added to the least-squares cost is proportional to the absolute value of each coefficient. The cost function to minimize is:
$J(\beta) = \sum_{i=1}^{n} (y_i - \hat{y_i})^2 + \lambda \sum_{j=1}^{p} |\beta_j|$
The $L_1$-norm penalty $\sum_{j=1}^{p} |\beta_j|$ often shrinks some coefficients exactly to zero, effectively performing feature selection.
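The example below minimizes this objective with coordinate descent: each coefficient is updated in turn while the others are held fixed. Under the common convention of halving the squared-error term (which merely rescales $\lambda$, and is the convention the code follows), the update for $\beta_j$ has a closed form built on the soft-thresholding operator:
$S(z, \lambda) = \operatorname{sign}(z)\,\max(|z| - \lambda,\ 0), \qquad \beta_j \leftarrow \frac{S\!\left(\sum_{i=1}^{n} x_{ij}\, r_i^{(j)},\ \lambda\right)}{\sum_{i=1}^{n} x_{ij}^2}$
where $r_i^{(j)} = y_i - \sum_{k \neq j} x_{ik} \beta_k$ is the residual computed without feature $j$. Whenever the correlation term falls inside $[-\lambda, \lambda]$, the update sets $\beta_j$ exactly to zero, which is the mechanism behind Lasso's feature selection.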
Dataset
The example uses the following housing dataset (saved as data/houses3.csv), with ten numeric features per home and the sale price as the label:
bedrooms,bathrooms,sqft,lot_size,year_built,days_on_market,proximity_to_transit,crime_rate,school_rating,neighborhood_rating,price
4,4.2,4197,2301,1996,12,7.7,2.2,8.0,6.2,820956
5,1.3,3311,14468,2004,55,6.3,1.1,6.2,8.7,901715
3,4.9,2594,18990,1989,13,9.7,4.2,5.5,5.2,738654
5,4.1,1459,21554,2001,23,4.4,6.3,5.8,7.8,394300
5,1.8,3611,2699,1965,89,3.6,4.5,6.2,8.8,792960
2,1.0,2169,23472,1962,99,8.8,4.9,5.8,9.4,604787
3,4.3,2786,27715,1979,30,3.0,9.1,5.9,6.7,693999
3,3.8,946,2190,1968,17,9.7,4.1,6.4,9.1,243059
3,3.9,4019,12492,1966,62,1.1,5.6,5.9,5.6,857531
5,4.1,3711,19364,2012,84,9.7,8.1,9.5,9.2,1084233
4,1.3,2534,4975,1968,89,1.4,4.6,5.4,5.6,622258
3,2.4,2643,8102,2007,86,9.0,6.6,7.6,7.0,748687
6,1.5,1288,19568,2004,13,5.7,8.8,7.1,9.0,515523
5,4.5,3776,21711,2011,59,9.9,9.5,9.9,5.7,939902
2,3.5,2759,21778,1972,19,1.7,2.3,5.6,6.1,688263
4,2.3,3185,26703,1958,49,6.0,9.3,7.0,8.6,868783
6,1.3,3719,28641,1961,12,9.7,5.4,9.8,8.6,930698
6,2.2,5536,3816,1950,61,5.7,3.3,9.3,8.2,1247529
2,2.3,2602,24238,2007,19,6.7,5.1,9.1,8.5,772573
4,3.9,4861,25093,1950,76,7.3,9.8,6.3,7.7,1130523
5,3.6,4169,2569,1983,9,5.1,5.4,5.9,6.3,951592
1,4.5,1062,7442,1997,71,6.6,4.0,8.3,6.7,382423
4,2.9,1423,3895,1950,28,6.3,6.7,9.6,5.9,356226
2,1.5,1816,21117,1965,78,9.1,3.2,7.8,9.5,622751
6,3.9,4443,5863,2010,95,1.4,1.7,7.9,7.9,993061
5,4.0,5776,14913,2013,52,3.5,2.2,6.4,7.0,1370561
4,3.2,2849,9455,2012,83,9.6,2.2,8.8,7.3,782130
1,4.1,4523,6014,2018,16,9.0,2.4,5.9,9.7,1050792
1,3.0,5348,13093,1971,69,5.1,2.2,6.6,5.8,1211771
3,3.1,3908,20070,2016,99,6.6,6.8,7.1,7.9,946594
3,2.7,4890,5009,1975,12,3.5,2.6,7.5,7.5,1136246
2,1.1,2856,26190,1965,25,2.7,4.1,6.2,8.1,761704
4,1.4,3970,18538,2000,52,5.2,9.1,5.6,5.1,934353
4,1.1,2482,12729,2006,85,4.2,5.3,8.1,9.4,751695
6,3.5,3055,3409,1978,53,6.3,7.0,6.4,9.7,689427
6,2.3,1954,14249,2018,23,1.7,2.6,7.9,7.8,528021
6,3.0,5299,2784,1996,16,9.8,2.7,5.8,8.5,1094888
3,4.6,3991,29751,2011,57,9.9,1.4,7.4,9.6,1116055
4,2.0,2496,10096,2018,39,7.3,2.5,7.7,8.5,625658
4,2.6,4791,9560,1965,53,5.8,3.5,5.3,5.8,1135139
1,4.0,5523,14533,1997,42,3.8,2.6,6.7,7.9,1177368
3,1.9,5482,9343,1988,58,8.3,1.8,5.7,8.0,1313937
5,1.3,2448,9206,1982,39,7.2,2.1,5.3,7.1,614980
3,2.2,2245,23980,1972,14,2.5,5.1,9.9,8.7,681664
5,1.6,4999,7801,1959,95,9.2,2.9,6.6,9.7,1091159
1,4.7,4723,21190,2018,5,8.4,4.3,9.0,9.6,1232306
2,4.2,1053,18921,1983,35,9.5,5.5,6.3,7.3,384723
4,3.5,3983,7986,2001,87,7.5,7.2,8.4,5.6,998495
1,4.5,3357,20225,1959,93,6.5,1.4,8.8,9.9,881436
4,4.2,898,12647,1968,75,4.8,8.2,8.0,9.2,348950
6,1.7,3000,10716,2007,18,9.4,6.7,7.4,5.6,738199
2,4.6,3761,25355,1950,76,8.8,1.7,7.1,9.6,950927
2,3.2,5773,24009,2018,9,1.4,8.9,6.7,9.3,1426435
1,4.2,5233,21334,1953,74,1.2,9.3,9.6,7.6,1154561
2,4.6,4577,26376,1965,58,4.4,1.5,9.2,8.0,1077870
5,2.3,3669,14323,1973,17,8.3,3.5,9.8,7.0,946578
2,1.4,4034,6780,1951,7,9.9,8.3,5.6,5.3,875643
4,1.9,4591,4368,1981,46,2.4,7.7,8.7,6.7,1029440
4,2.7,1756,14039,1973,13,6.3,2.7,9.7,9.0,541316
4,4.3,4867,8655,1961,40,4.4,2.9,5.9,5.0,1071054
4,4.4,960,10173,1999,42,9.7,4.3,5.3,6.7,298477
5,1.0,5293,6495,1984,9,8.6,5.4,8.7,7.0,1156159
3,3.0,2100,12893,1982,50,8.5,6.6,7.9,7.7,593857
6,2.7,5711,27547,1982,27,5.2,4.3,9.2,9.6,1358816
1,1.9,4787,24386,2010,66,4.7,5.2,5.7,6.7,1197342
4,1.5,2327,24998,2000,5,3.5,7.7,9.0,6.7,625860
2,2.4,2018,27742,1992,29,1.5,1.3,6.0,8.7,692367
4,4.8,5296,15403,1961,37,8.8,3.3,5.8,7.3,1214244
2,2.3,5535,15121,2016,38,8.3,7.4,5.8,6.1,1295206
6,3.1,3904,24303,2014,83,10.0,9.1,9.1,7.3,1091849
6,3.8,3535,12966,1982,8,10.0,5.6,8.3,5.7,942703
6,2.5,5355,28558,1989,65,6.0,5.8,7.6,5.9,1353708
2,4.9,1754,2853,1992,86,7.9,2.0,6.8,7.5,370021
4,4.8,5365,22530,1993,17,9.5,5.0,9.4,7.1,1242471
6,2.0,4246,22153,1978,71,8.6,5.8,7.0,9.6,1030524
5,3.0,1845,18958,1962,89,3.2,3.2,9.1,6.8,549494
2,2.2,2493,29837,1961,45,5.1,3.4,7.2,7.9,746867
2,2.1,4236,19532,1995,4,2.2,4.4,6.9,8.2,1015172
4,1.1,1362,24677,1951,36,9.6,1.2,7.3,5.1,446923
2,3.4,3285,24841,1984,70,6.5,3.9,6.5,8.3,835835
2,3.0,4154,25893,1957,31,3.1,2.9,8.7,5.9,997889
6,1.2,1025,11907,1975,19,7.0,3.9,7.5,9.8,299601
4,2.1,5693,6777,1983,61,6.6,2.1,6.2,5.7,1193923
6,4.6,1759,22238,1956,54,4.2,9.0,9.5,7.1,480205
6,2.0,5348,17338,2017,39,2.0,6.3,6.9,5.4,1213696
4,1.6,3867,4491,2007,91,7.0,7.1,7.7,10.0,989537
1,3.0,1615,29236,1978,74,5.7,8.1,9.5,7.5,531710
6,4.9,5554,17328,1985,90,8.0,5.5,8.1,8.0,1436059
5,2.0,5442,16599,1970,19,5.7,1.8,5.6,5.3,1241460
5,3.7,3920,21508,1985,39,8.7,5.8,9.7,8.7,1029381
2,4.0,816,24075,1959,67,6.0,6.3,8.1,6.0,305657
5,2.0,2043,5051,2022,45,6.0,7.7,6.7,9.5,610870
2,3.9,3005,17979,1973,13,8.9,4.9,5.7,6.0,717113
1,2.5,3324,18388,2013,92,4.6,2.1,9.0,6.0,890840
4,3.5,1869,23606,1998,58,2.2,3.6,8.1,5.2,717434
4,3.5,4244,23699,1985,20,1.3,4.3,7.7,7.4,1060692
4,3.1,3365,23253,1973,92,7.8,6.8,9.5,7.8,954128
5,1.4,1860,2876,1972,72,6.6,6.1,8.9,5.3,431621
1,4.3,3127,27567,2011,61,7.3,4.2,5.8,8.9,878685
5,2.3,4220,25411,1986,39,2.9,9.9,6.6,7.3,1146995
Example of use:
<?php

use Rubix\ML\Datasets\Labeled;
use Rubix\ML\Extractors\CSV;

// Load and prepare the dataset
$extractor = new CSV(__DIR__ . '/data/houses3.csv', true);
$samples = [];
$labels = [];

// Manually parse the CSV to ensure numeric types
foreach ($extractor->getIterator() as $row) {
    // Get price (last column) as the label
    $label = (float) array_pop($row);

    // Convert the remaining values to float and re-index numerically,
    // in case the extractor returns header-keyed rows
    $sample = array_map('floatval', array_values($row));

    $samples[] = $sample;
    $labels[] = $label;
}
// Column names for display (the header row still includes the label column
// 'price' at the end, but only the first ten entries are used below)
$featureNames = $extractor->header();

// Create the labeled dataset
$dataset = new Labeled($samples, $labels);
// Regularization strength (lambda in the cost function)
$alpha = 0.0001;
// Custom Lasso implementation: coordinate descent with soft thresholding
// (see simpleLasso() below)
echo "Custom Lasso Implementation:\n";
echo "----------------\n";
// Extract the feature matrix and labels (raw, unscaled values)
$X = $dataset->samples();
$y = $dataset->labels();

// Run Lasso
$coefficients = simpleLasso($X, $y, $alpha);

// Print the learned coefficients
echo "Lasso Regression (alpha=$alpha):\n";

foreach ($coefficients as $i => $coefficient) {
    echo humanize($featureNames[$i]) . ': ' . number_format($coefficient, 0) . PHP_EOL;
}
// Example homes for prediction
$exampleHomes = [
    // 2 bed, 1 bath, small older home
    [2.0, 1.0, 1000.0, 3500.0, 1965.0, 60.0, 9.0, 3.5, 6.5, 7.0],
    // 3 bed, 2 bath, medium-sized newer home
    [3.0, 2.0, 2000.0, 6000.0, 2010.0, 30.0, 8.0, 3.0, 8.0, 8.0],
    // 5 bed, 3.5 bath, large luxury home
    [5.0, 3.5, 3500.0, 12000.0, 2018.0, 15.0, 6.5, 2.0, 9.0, 9.0],
];

// Descriptions for display
$homeDescriptions = [
    "Small home (2 bed, 1 bath, 1000 sqft, built 1965)",
    "Medium home (3 bed, 2 bath, 2000 sqft, built 2010)",
    "Luxury home (5 bed, 3.5 bath, 3500 sqft, built 2018)",
];
// Predict prices as the dot product of features and coefficients
// (this simple model has no intercept term)
$predictions = [];

foreach ($exampleHomes as $home) {
    $pred = 0.0;

    for ($i = 0; $i < count($coefficients); $i++) {
        $pred += $home[$i] * $coefficients[$i];
    }

    $predictions[] = $pred;
}
echo "\nPredictions for example homes:\n";
echo "----------------\n";
foreach ($predictions as $i => $prediction) {
echo $homeDescriptions[$i] . ": $" . number_format($prediction, 2) . PHP_EOL;
}
/**
 * Lasso via coordinate descent with soft thresholding.
 * Minimizes (1/2) * ||y - X*beta||^2 + alpha * ||beta||_1 (no intercept).
 */
function simpleLasso($X, $y, $alpha = 1.0, $maxIter = 1000, $tolerance = 1e-4)
{
    $n = count($X);
    $p = count($X[0]);

    // Initialize coefficients to zero
    $beta = array_fill(0, $p, 0.0);

    // Precompute the transpose X^T for fast column access
    $Xt = [];

    for ($j = 0; $j < $p; $j++) {
        $Xt[$j] = [];

        for ($i = 0; $i < $n; $i++) {
            $Xt[$j][$i] = $X[$i][$j];
        }
    }

    // Coordinate descent: cycle through the coefficients until convergence
    for ($iter = 0; $iter < $maxIter; $iter++) {
        $maxDiff = 0.0;

        for ($j = 0; $j < $p; $j++) {
            // Partial residual: prediction error ignoring feature j
            $r = [];

            for ($i = 0; $i < $n; $i++) {
                $pred = 0.0;

                for ($k = 0; $k < $p; $k++) {
                    if ($k != $j) {
                        $pred += $X[$i][$k] * $beta[$k];
                    }
                }

                $r[$i] = $y[$i] - $pred;
            }

            // Correlation of the partial residual with feature j
            $corr = 0.0;

            for ($i = 0; $i < $n; $i++) {
                $corr += $Xt[$j][$i] * $r[$i];
            }

            // Squared L2 norm of feature j
            $l2Norm = 0.0;

            for ($i = 0; $i < $n; $i++) {
                $l2Norm += $Xt[$j][$i] * $Xt[$j][$i];
            }

            // Soft thresholding: shrink toward zero, clip to zero inside [-alpha, alpha]
            $oldBeta = $beta[$j];

            if ($corr > $alpha) {
                $beta[$j] = ($corr - $alpha) / $l2Norm;
            } elseif ($corr < -$alpha) {
                $beta[$j] = ($corr + $alpha) / $l2Norm;
            } else {
                $beta[$j] = 0.0;
            }

            // Track the largest coefficient change for the convergence check
            $diff = abs($oldBeta - $beta[$j]);

            if ($diff > $maxDiff) {
                $maxDiff = $diff;
            }
        }

        // Stop once no coefficient moved more than the tolerance
        if ($maxDiff < $tolerance) {
            break;
        }
    }

    return $beta;
}

// Minimal display helper used above: turns a CSV header such as
// 'lot_size' into 'Lot size'
function humanize($name)
{
    return ucfirst(str_replace('_', ' ', $name));
}
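A note on scaling: the features above live on very different scales (square footage in the thousands, ratings near ten), so a single penalty $\lambda$ bears unevenly on the raw coefficients, and in practice Lasso is almost always run on standardized features. Below is a minimal sketch of that variant, assuming the $samples, $labels, and $featureNames arrays and the simpleLasso() and humanize() functions from the listing above; the two extra use statements belong at the top of the file, and the alpha value is illustrative, not tuned:
use Rubix\ML\Datasets\Unlabeled;
use Rubix\ML\Transformers\ZScaleStandardizer;

// Fit the standardizer on the raw feature matrix
$standardizer = new ZScaleStandardizer();
$standardizer->fit(new Unlabeled($samples));

// Transform a copy in place (Rubix ML transformers mutate the array by reference)
$standardized = $samples;
$standardizer->transform($standardized);

// Center the labels so the intercept-free model is not forced to absorb the mean price
$meanPrice = array_sum($labels) / count($labels);
$centered = array_map(function ($price) use ($meanPrice) {
    return $price - $meanPrice;
}, $labels);

// alpha = 100.0 is for illustration only; in practice, tune it by cross-validation
$coefficients = simpleLasso($standardized, $centered, 100.0);

// On standardized features the magnitudes are comparable across features,
// and exact zeros mark the features Lasso has dropped
foreach ($coefficients as $i => $coefficient) {
    echo humanize($featureNames[$i]) . ': ' . number_format($coefficient, 2) . PHP_EOL;
}

// To predict a price, standardize the new sample with the same $standardizer,
// take the dot product with $coefficients, and add $meanPrice back.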