Case 1. Similar document search

Implementation in pure PHP

One of the most natural use cases for TF–IDF is similar-text search. In this case, we build a mini search engine in pure PHP: we tokenize the documents and the query, compute their TF–IDF vectors, compare them using cosine similarity, and return the documents closest to the user's query.

Example code:


<?php

// Source documents for similarity search.
// Each key is a document ID and each value is the raw text of that document.
$documents = [
    1 => 'How to reset a user password',
    2 => 'Database connection error',
    3 => 'Configuring SMTP for sending email',
    4 => 'Restoring access to a user account',
];

// Converts text to lowercase and splits it into word tokens.
//
// Any run of characters that are not letters, combining marks, or digits
// (whitespace, punctuation, symbols) acts as a separator. Empty pieces are
// dropped, so empty input, leading/trailing separators, and repeated spaces
// never produce '' tokens that would distort term counts.
//
// Returns a list of lowercase tokens in their original order; an empty or
// separator-only string yields an empty list.
function tokenize(string $text): array {
    $text = mb_strtolower($text, 'UTF-8');

    // The /u modifier makes \p{...} classes operate on UTF-8 code points
    // rather than on single bytes.
    $tokens = preg_split('/[^\p{L}\p{M}\p{N}]+/u', $text, -1, PREG_SPLIT_NO_EMPTY);

    // preg_split() returns false on failure (e.g. malformed UTF-8 input);
    // treat that as "no tokens" so callers always receive an array.
    return $tokens === false ? [] : $tokens;
}

// Computes the relative frequency of every token within one document.
// Returns a map of token => (number of occurrences / total number of tokens).
// An empty token list produces an empty map.
function termFrequency(array $tokens): array {
    $total = count($tokens);
    $occurrences = [];

    // First pass: raw occurrence count per token.
    foreach ($tokens as $term) {
        if (isset($occurrences[$term])) {
            $occurrences[$term]++;
        } else {
            $occurrences[$term] = 1;
        }
    }

    // Second pass: scale each count by the document length.
    $frequencies = [];

    foreach ($occurrences as $term => $hits) {
        $frequencies[$term] = $hits / $total;
    }

    return $frequencies;
}

// Computes the inverse document frequency of every term in the corpus.
// $documents is a list of token lists; the result maps term => log(N / df),
// where N is the number of documents and df is the number of documents
// that contain the term at least once.
//
// With this plain formula a term present in every document gets weight 0.
// A smoothed alternative that avoids exact zeros is:
//     log((N + 1) / (df + 1)) + 1
function inverseDocumentFrequency(array $documents): array {
    $total = count($documents);
    $docFreq = [];

    foreach ($documents as $tokens) {
        // array_unique() ensures a term is counted once per document.
        foreach (array_unique($tokens) as $term) {
            if (!isset($docFreq[$term])) {
                $docFreq[$term] = 0;
            }

            $docFreq[$term]++;
        }
    }

    // array_map() over a single array keeps the term keys intact.
    return array_map(
        static function ($freq) use ($total) {
            return log($total / $freq);
        },
        $docFreq
    );
}

// Combines term frequencies with IDF weights into a sparse TF-IDF vector.
// The result has one entry per term in $tf; a term with no entry in $idf
// is given weight 0.
function tfidf(array $tf, array $idf): array {
    $weights = [];

    foreach (array_keys($tf) as $term) {
        $idfWeight = isset($idf[$term]) ? $idf[$term] : 0;
        $weights[$term] = $tf[$term] * $idfWeight;
    }

    return $weights;
}

// Computes the cosine similarity of two sparse vectors given as
// term => weight maps. A term missing from one vector counts as 0 there.
// Returns 0 when either vector has zero magnitude.
function cosineSimilarity(array $a, array $b): float {
    $dotProduct = 0;
    $squaredLengthA = 0;
    $squaredLengthB = 0;

    // The array union yields every key of $a followed by the keys that
    // appear only in $b, i.e. each distinct term exactly once.
    foreach (array_keys($a + $b) as $term) {
        $weightA = isset($a[$term]) ? $a[$term] : 0;
        $weightB = isset($b[$term]) ? $b[$term] : 0;

        $dotProduct += $weightA * $weightB;
        $squaredLengthA += $weightA * $weightA;
        $squaredLengthB += $weightB * $weightB;
    }

    if ($squaredLengthA != 0 && $squaredLengthB != 0) {
        return $dotProduct / (sqrt($squaredLengthA) * sqrt($squaredLengthB));
    }

    // At least one vector is all zeros, so the angle is undefined.
    return 0.0;
}

// Precompute tokenized documents, IDF, and document TF-IDF vectors.
// array_map() with a single input array keeps the original keys, so the
// document IDs carry over from $documents to $tokenizedDocs.
$tokenizedDocs = array_map('tokenize', $documents);
// IDF is computed once over the whole corpus and reused for every document.
$idf = inverseDocumentFrequency($tokenizedDocs);

// Maps document ID => TF-IDF vector (term => weight) for that document.
$documentVectors = [];

foreach ($tokenizedDocs as $id => $tokens) {
    $tf = termFrequency($tokens);
    $documentVectors[$id] = tfidf($tf, $idf);
}