Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 49 additions & 0 deletions .github/workflows/formats.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# Static-analysis workflow: runs the coding-style check (`composer test:lint`)
# and the type check (`composer test:types`) on every push and pull request.
name: Formats

on: ['push', 'pull_request']

jobs:
  ci:
    runs-on: ${{ matrix.os }}

    strategy:
      fail-fast: true
      matrix:
        os: [ubuntu-latest]
        # Quoted so the version stays a string; an unquoted 8.10 would load as 8.1.
        php: ['8.4']
        dependency-version: [prefer-lowest, prefer-stable]

    name: Formats P${{ matrix.php }} - ${{ matrix.os }} - ${{ matrix.dependency-version }}

    steps:

      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup PHP
        uses: shivammathur/setup-php@v2
        with:
          php-version: ${{ matrix.php }}
          extensions: dom, mbstring, zip
          # The steps below only lint and analyse; no coverage driver is needed.
          coverage: none

      # Exposes Composer's download cache path as the `dir` step output.
      - name: Get Composer cache directory
        id: composer-cache
        shell: bash
        run: echo "dir=$(composer config cache-files-dir)" >> "$GITHUB_OUTPUT"

      # Cache key varies by PHP version, OS, dependency strategy and composer.json hash.
      - name: Cache dependencies
        uses: actions/cache@v4
        with:
          path: ${{ steps.composer-cache.outputs.dir }}
          key: dependencies-php-${{ matrix.php }}-os-${{ matrix.os }}-version-${{ matrix.dependency-version }}-composer-${{ hashFiles('composer.json') }}
          restore-keys: dependencies-php-${{ matrix.php }}-os-${{ matrix.os }}-version-${{ matrix.dependency-version }}-composer-

      # `composer update` (not `install`) so the prefer-lowest / prefer-stable flag takes effect.
      - name: Install Composer dependencies
        run: composer update --${{ matrix.dependency-version }} --no-interaction --prefer-dist

      - name: Coding Style Checks
        run: composer test:lint

      - name: Type Checks
        run: composer test:types
45 changes: 45 additions & 0 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# Test workflow: runs the PHPUnit `default` test suite on every push and pull
# request across the OS / PHP version / dependency-strategy matrix below.
name: Tests

on: ['push', 'pull_request']

jobs:
  ci:
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: true
      matrix:
        os: [ubuntu-latest, macos-latest, windows-latest]
        # Versions are quoted so YAML keeps them as strings rather than floats.
        php: ['8.3', '8.4']
        dependency-version: [prefer-lowest, prefer-stable]

    name: Tests P${{ matrix.php }} - ${{ matrix.os }} - ${{ matrix.dependency-version }}

    steps:

      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup PHP
        uses: shivammathur/setup-php@v2
        with:
          php-version: ${{ matrix.php }}
          extensions: dom, mbstring, zip
          coverage: xdebug

      # Runs under bash on every OS (including Windows) so the redirect syntax is uniform.
      - name: Get Composer cache directory
        id: composer-cache
        shell: bash
        run: echo "dir=$(composer config cache-files-dir)" >> "$GITHUB_OUTPUT"

      # Cache key varies by PHP version, OS, dependency strategy and composer.json hash.
      - name: Cache dependencies
        uses: actions/cache@v4
        with:
          path: ${{ steps.composer-cache.outputs.dir }}
          key: dependencies-php-${{ matrix.php }}-os-${{ matrix.os }}-version-${{ matrix.dependency-version }}-composer-${{ hashFiles('composer.json') }}
          restore-keys: dependencies-php-${{ matrix.php }}-os-${{ matrix.os }}-version-${{ matrix.dependency-version }}-composer-

      # `composer update` (not `install`) so the prefer-lowest / prefer-stable flag takes effect.
      - name: Install Composer dependencies
        run: composer update --${{ matrix.dependency-version }} --no-interaction --prefer-dist

      - name: Integration Tests
        run: php ./vendor/phpunit/phpunit/phpunit --testsuite default
80 changes: 58 additions & 22 deletions composer.json
Original file line number Diff line number Diff line change
@@ -1,25 +1,61 @@
{
"name": "edgaras/strsim",
"description": "Collection of string similarity and distance algorithms in PHP including Levenshtein, Damerau-Levenshtein, Jaro-Winkler, and more",
"type": "library",
"homepage": "https://github.com/Edgaras0x4E/StrSim",
"license": "MIT",
"keywords": ["string", "similarity", "distance", "levenshtein", "damerau-levenshtein", "jaro", "jaro-winkler", "lcs", "smith-waterman", "needleman-wunsch", "cosine", "jaccard", "monge-elkan", "text", "fuzzy", "comparison", "algorithm", "multibyte"],
"autoload": {
"psr-4": {
"Edgaras\\StrSim\\": "src/"
}
},
"autoload-dev": {
"psr-4": {
"Edgaras\\StrSim\\Tests\\": "tests/"
}
},
"minimum-stability": "stable",
"require": {
"php": ">=8.3.0"
},
"require-dev": {
"phpunit/phpunit": "^11.5"
"name": "edgaras/strsim",
"description": "Collection of string similarity and distance algorithms in PHP including Levenshtein, Damerau-Levenshtein, Jaro-Winkler, and more",
"type": "library",
"homepage": "https://github.com/Edgaras0x4E/StrSim",
"license": "MIT",
"keywords": [
"string",
"similarity",
"distance",
"levenshtein",
"damerau-levenshtein",
"jaro",
"jaro-winkler",
"lcs",
"smith-waterman",
"needleman-wunsch",
"cosine",
"jaccard",
"monge-elkan",
"text",
"fuzzy",
"comparison",
"algorithm",
"multibyte"
],
"autoload": {
"psr-4": {
"Edgaras\\StrSim\\": "src/"
}
},
"autoload-dev": {
"psr-4": {
"Edgaras\\StrSim\\Tests\\": "tests/"
}
},
"minimum-stability": "stable",
"require": {
"php": ">=8.3.0"
},
"require-dev": {
"phpunit/phpunit": "^11.5",
"phpstan/phpstan": "^2.1",
"rector/rector": "^2.0",
"laravel/pint": "^1.19"
},
"scripts": {
"refacto": "rector",
"lint": "pint",
"test:refacto": "rector --dry-run",
"test:lint": "pint --test",
"test:types": "phpstan analyse --ansi",
"test:unit": "phpunit",
"test": [
"@test:refacto",
"@test:lint",
"@test:types",
"@test:unit"
]
}
}
6 changes: 6 additions & 0 deletions phpstan.neon
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# PHPStan configuration.
parameters:
# Analyse at the strictest rule level.
level: max
# Only the src directory is analysed.
paths:
- src

# Report ignore entries that no longer match any error.
reportUnmatchedIgnoredErrors: true
31 changes: 31 additions & 0 deletions phpunit.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
    PHPUnit configuration.
    Bootstraps through Composer's autoloader, keeps its cache in .phpunit.cache,
    and is set to fail on risky tests, warnings and PHPUnit deprecations.
-->
<phpunit xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:noNamespaceSchemaLocation="vendor/phpunit/phpunit/phpunit.xsd"
bootstrap="vendor/autoload.php"
cacheDirectory=".phpunit.cache"
executionOrder="depends,defects"
shortenArraysForExportThreshold="10"
requireCoverageMetadata="true"
beStrictAboutCoverageMetadata="true"
beStrictAboutOutputDuringTests="true"
displayDetailsOnPhpunitDeprecations="true"
displayDetailsOnTestsThatTriggerDeprecations="true"
failOnPhpunitDeprecation="true"
failOnRisky="true"
failOnWarning="true">
<!-- Single suite named "default", collected from the tests directory. -->
<testsuites>
<testsuite name="default">
<directory>tests</directory>
</testsuite>
</testsuites>
<!-- Project source under test: the src directory. -->
<source ignoreIndirectDeprecations="true" restrictNotices="true" restrictWarnings="true">
<include>
<directory>src</directory>
</include>
</source>
<!-- HTML coverage report is written to .phpunit.cache/code-coverage. -->
<coverage>
<report>
<html outputDirectory=".phpunit.cache/code-coverage" />
</report>
</coverage>
</phpunit>
6 changes: 6 additions & 0 deletions pint.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"preset": "psr12",
"rules": {
"declare_strict_types": true
}
}
18 changes: 18 additions & 0 deletions rector.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
<?php

declare(strict_types=1);

use Rector\Config\RectorConfig;

// Directories Rector processes: the library sources and the test suite.
$rectorPaths = [
    __DIR__.'/src',
    __DIR__.'/tests',
];

// Build the configuration with the prepared rule sets listed below enabled.
return RectorConfig::configure()
    ->withPaths($rectorPaths)
    ->withPreparedSets(
        deadCode: true,
        codeQuality: true,
        typeDeclarations: true,
        privatization: true,
        earlyReturn: true,
    );
46 changes: 30 additions & 16 deletions src/Cosine.php
Original file line number Diff line number Diff line change
@@ -1,43 +1,57 @@
<?php

declare(strict_types=1);

namespace Edgaras\StrSim;

class Cosine {
public static function similarity(string $a, string $b): float {
class Cosine
{
public static function similarity(string $a, string $b): float
{
if (!mb_check_encoding($a, 'UTF-8') || !mb_check_encoding($b, 'UTF-8')) {
throw new \InvalidArgumentException("Input strings must be valid UTF-8.");
}

$tokensA = self::countMbChars($a);
$tokensB = self::countMbChars($b);
$dot = 0;
$normA = 0;
$normB = 0;
$dot = 0.0;
$normA = 0.0;
$normB = 0.0;

foreach ($tokensA as $k => $v) {
$dot += $v * ($tokensB[$k] ?? 0);
$normA += $v * $v;
$dot += (float)$v * (float)($tokensB[$k] ?? 0);
$normA += (float)$v * (float)$v;
}

foreach ($tokensB as $v) {
$normB += $v * $v;
$normB += (float)$v * (float)$v;
}

return ($normA && $normB) ? $dot / (sqrt($normA) * sqrt($normB)) : 0;
}

private static function countMbChars(string $str): array {
/**
* @return array<string, int>
*/
private static function countMbChars(string $str): array
{
$chars = [];
$length = mb_strlen($str, 'UTF-8');

for ($i = 0; $i < $length; $i++) {
$char = mb_substr($str, $i, 1, 'UTF-8');
$chars[$char] = ($chars[$char] ?? 0) + 1;
}

return $chars;
}

public static function similarityFromVectors(array $vecA, array $vecB): float {
/**
* @param array<int|string, int|float> $vecA
* @param array<int|string, int|float> $vecB
*/
public static function similarityFromVectors(array $vecA, array $vecB): float
{
if (count($vecA) !== count($vecB)) {
throw new \InvalidArgumentException("Vectors must be the same length.");
}
Expand All @@ -48,9 +62,9 @@ public static function similarityFromVectors(array $vecA, array $vecB): float {

foreach ($vecA as $i => $valA) {
$valB = $vecB[$i];
$dot += $valA * $valB;
$normA += $valA * $valA;
$normB += $valB * $valB;
$dot += (float)$valA * (float)$valB;
$normA += (float)$valA * (float)$valA;
$normB += (float)$valB * (float)$valB;
}

return ($normA && $normB) ? $dot / (sqrt($normA) * sqrt($normB)) : 0.0;
Expand Down
23 changes: 16 additions & 7 deletions src/DamerauLevenshtein.php
Original file line number Diff line number Diff line change
@@ -1,31 +1,40 @@
<?php

declare(strict_types=1);

namespace Edgaras\StrSim;

class DamerauLevenshtein {
public static function distance(string $a, string $b): int {
class DamerauLevenshtein
{
public static function distance(string $a, string $b): int
{
if (!mb_check_encoding($a, 'UTF-8') || !mb_check_encoding($b, 'UTF-8')) {
throw new \InvalidArgumentException("Input strings must be valid UTF-8.");
}

$lenA = mb_strlen($a, 'UTF-8');
$lenB = mb_strlen($b, 'UTF-8');
$dp = [];

for ($i = 0; $i <= $lenA; $i++) $dp[$i][0] = $i;
for ($j = 0; $j <= $lenB; $j++) $dp[0][$j] = $j;
for ($i = 0; $i <= $lenA; $i++) {
$dp[$i][0] = $i;
}
for ($j = 0; $j <= $lenB; $j++) {
$dp[0][$j] = $j;
}

for ($i = 1; $i <= $lenA; $i++) {
for ($j = 1; $j <= $lenB; $j++) {
$charA = mb_substr($a, $i - 1, 1, 'UTF-8');
$charB = mb_substr($b, $j - 1, 1, 'UTF-8');
$cost = ($charA === $charB) ? 0 : 1;

$dp[$i][$j] = min(
$dp[$i - 1][$j] + 1,
$dp[$i][$j - 1] + 1,
$dp[$i - 1][$j - 1] + $cost
);

if ($i > 1 && $j > 1) {
$charA2 = mb_substr($a, $i - 2, 1, 'UTF-8');
$charB2 = mb_substr($b, $j - 2, 1, 'UTF-8');
Expand Down
Loading