Case 1. Similar document search
Implementation in pure PHP
One of the most natural use cases for TF–IDF is similar-text search. In this case, we build a mini search engine in pure PHP: we tokenize the documents and the query, compute their TF–IDF vectors, compare the vectors with cosine similarity, and return the documents closest to the user's query.
Example code:
<?php
// Sample corpus for the similarity search: short titles keyed by document ID.
$documents = [
    1 => 'How to reset a user password',
    2 => 'Database connection error',
    3 => 'Configuring SMTP for sending email',
    4 => 'Restoring access to a user account',
];
/**
 * Splits text into lowercase tokens.
 *
 * The text is lowercased with mb_strtolower() and then split on runs of
 * whitespace. Empty tokens are dropped, so repeated, leading or trailing
 * whitespace (and an empty input string) never produce '' entries that
 * would otherwise be counted as a term.
 *
 * @param string $text Raw document or query text.
 * @return string[] Tokens in their original order; an empty array when the
 *                  text contains no non-whitespace characters.
 */
function tokenize(string $text): array {
    $text = mb_strtolower($text);

    // The "u" modifier makes the pattern treat the subject as UTF-8.
    $tokens = preg_split('/\s+/u', $text, -1, PREG_SPLIT_NO_EMPTY);
    if ($tokens === false) {
        // preg_split() returns false on failure (e.g. malformed UTF-8 in
        // "u" mode); retry without the modifier.
        $tokens = preg_split('/\s+/', $text, -1, PREG_SPLIT_NO_EMPTY);
    }

    return $tokens === false ? [] : $tokens;
}
/**
 * Computes the relative frequency of each token within one document.
 *
 * @param array $tokens Tokens of a single document or query.
 * @return array Map of token => (occurrences / total token count), keyed in
 *               order of first occurrence. Empty when $tokens is empty.
 */
function termFrequency(array $tokens): array {
    $total = count($tokens);

    // First pass: raw occurrence counts per token.
    $occurrences = [];
    foreach ($tokens as $term) {
        if (isset($occurrences[$term])) {
            $occurrences[$term]++;
        } else {
            $occurrences[$term] = 1;
        }
    }

    // Second pass: scale every count by the document length. array_map()
    // keeps the keys when given a single array, and never invokes the
    // callback for an empty input, so no division by zero can occur.
    return array_map(
        static function ($hits) use ($total) {
            return $hits / $total;
        },
        $occurrences
    );
}
/**
 * Calculates inverse document frequency (IDF) across all documents.
 *
 * Each word is counted at most once per document (document frequency, df).
 * With N being the number of documents, the weight is:
 *   - default:  log(N / df), which is exactly 0 for a word that occurs
 *     in every document;
 *   - smoothed: log((N + 1) / (df + 1)) + 1, which keeps such very
 *     frequent words at a non-zero weight.
 *
 * @param array $documents List of tokenized documents (each an array of tokens).
 * @param bool  $smooth    When true, use the smoothed formula. Defaults to
 *                         false, i.e. the plain log(N / df) variant.
 * @return array Map of word => IDF weight.
 */
function inverseDocumentFrequency(array $documents, bool $smooth = false): array {
    $df = [];
    $N = count($documents);

    foreach ($documents as $doc) {
        // array_unique() ensures a word repeated inside one document
        // contributes only once to its document frequency.
        foreach (array_unique($doc) as $word) {
            $df[$word] = ($df[$word] ?? 0) + 1;
        }
    }

    $idf = [];
    foreach ($df as $word => $freq) {
        $idf[$word] = $smooth
            ? log(($N + 1) / ($freq + 1)) + 1
            : log($N / $freq);
    }

    return $idf;
}
/**
 * Combines term frequencies with IDF weights into a TF-IDF vector.
 *
 * @param array $tf  Map of word => term frequency for one document or query.
 * @param array $idf Map of word => IDF weight.
 * @return array Map of word => tf * idf for every word in $tf; words that
 *               have no entry in $idf are weighted with 0.
 */
function tfidf(array $tf, array $idf): array {
    $weights = [];

    foreach (array_keys($tf) as $term) {
        // Words missing from the IDF table contribute nothing.
        $inverseFrequency = isset($idf[$term]) ? $idf[$term] : 0;
        $weights[$term] = $tf[$term] * $inverseFrequency;
    }

    return $weights;
}
/**
 * Computes the cosine similarity of two sparse vectors.
 *
 * Both vectors are maps of word => weight; a word missing from a vector is
 * treated as weight 0.
 *
 * @param array $a First sparse vector.
 * @param array $b Second sparse vector.
 * @return float dot(a, b) / (|a| * |b|), or 0 when either vector has zero length.
 */
function cosineSimilarity(array $a, array $b): float {
    $dotProduct = 0;
    $squaredLengthA = 0;
    $squaredLengthB = 0;

    // The array union yields every key of $a followed by the keys that
    // exist only in $b, i.e. each distinct word exactly once.
    foreach (array_keys($a + $b) as $term) {
        $weightA = isset($a[$term]) ? $a[$term] : 0;
        $weightB = isset($b[$term]) ? $b[$term] : 0;

        $dotProduct += $weightA * $weightB;
        $squaredLengthA += $weightA * $weightA;
        $squaredLengthB += $weightB * $weightB;
    }

    // A zero-length vector has no direction; avoid dividing by zero.
    if ($squaredLengthA == 0 || $squaredLengthB == 0) {
        return 0.0;
    }

    $lengthProduct = sqrt($squaredLengthA) * sqrt($squaredLengthB);

    return $dotProduct / $lengthProduct;
}
// Build the search index once: tokenize every document, derive the IDF
// table from the tokenized corpus, then compute one TF-IDF vector per
// document. Document IDs are kept as array keys throughout.
$tokenizedDocs = [];
foreach ($documents as $docId => $text) {
    $tokenizedDocs[$docId] = tokenize($text);
}

$idf = inverseDocumentFrequency($tokenizedDocs);

// array_map() preserves the keys of a single input array, so the vectors
// stay keyed by document ID.
$documentVectors = array_map(
    static function (array $docTokens) use ($idf) {
        return tfidf(termFrequency($docTokens), $idf);
    },
    $tokenizedDocs
);