Skip to content

Refactor levenshtein() #5816

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion ext/standard/basic_functions.stub.php
Original file line number Diff line number Diff line change
Expand Up @@ -985,7 +985,7 @@ function iptcparse(string $iptcblock): array|false {}

/* levenshtein.c */

function levenshtein(string $str1, string $str2, $cost_ins = UNKNOWN, int $cost_rep = UNKNOWN, int $cost_del = UNKNOWN): int {}
function levenshtein(string $str1, string $str2, int $cost_ins = 1, int $cost_rep = 1, int $cost_del = 1): int {}

/* link.c */

Expand Down
8 changes: 4 additions & 4 deletions ext/standard/basic_functions_arginfo.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/* This is a generated file, edit the .stub.php file instead.
* Stub hash: 9cf2c691c081d9aaefaa9d22337a9e00efb0af77 */
* Stub hash: 05b740207a70a9d272b4c327882fd0b52016a0af */

ZEND_BEGIN_ARG_WITH_RETURN_TYPE_INFO_EX(arginfo_set_time_limit, 0, 1, _IS_BOOL, 0)
ZEND_ARG_TYPE_INFO(0, seconds, IS_LONG, 0)
Expand Down Expand Up @@ -1568,9 +1568,9 @@ ZEND_END_ARG_INFO()
ZEND_BEGIN_ARG_WITH_RETURN_TYPE_INFO_EX(arginfo_levenshtein, 0, 2, IS_LONG, 0)
ZEND_ARG_TYPE_INFO(0, str1, IS_STRING, 0)
ZEND_ARG_TYPE_INFO(0, str2, IS_STRING, 0)
ZEND_ARG_INFO(0, cost_ins)
ZEND_ARG_TYPE_INFO(0, cost_rep, IS_LONG, 0)
ZEND_ARG_TYPE_INFO(0, cost_del, IS_LONG, 0)
ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, cost_ins, IS_LONG, 0, "1")
ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, cost_rep, IS_LONG, 0, "1")
ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, cost_del, IS_LONG, 0, "1")
ZEND_END_ARG_INFO()

#if defined(HAVE_SYMLINK) || defined(PHP_WIN32)
Expand Down
90 changes: 29 additions & 61 deletions ext/standard/levenshtein.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,42 +15,36 @@
*/

#include "php.h"
#include <stdlib.h>
#include <errno.h>
#include <ctype.h>
#include "php_string.h"

#define LEVENSHTEIN_MAX_LENGTH 255

/* {{{ reference_levdist
* reference implementation, only optimized for memory usage, not speed */
static zend_long reference_levdist(const char *s1, size_t l1, const char *s2, size_t l2, zend_long cost_ins, zend_long cost_rep, zend_long cost_del )
static zend_long reference_levdist(const zend_string *string1, const zend_string *string2, zend_long cost_ins, zend_long cost_rep, zend_long cost_del )
{
zend_long *p1, *p2, *tmp;
zend_long c0, c1, c2;
size_t i1, i2;

if (l1 == 0) {
return l2 * cost_ins;
if (ZSTR_LEN(string1) == 0) {
return ZSTR_LEN(string2) * cost_ins;
}
if (l2 == 0) {
return l1 * cost_del;
if (ZSTR_LEN(string2) == 0) {
return ZSTR_LEN(string1) * cost_del;
}

if ((l1 > LEVENSHTEIN_MAX_LENGTH) || (l2 > LEVENSHTEIN_MAX_LENGTH)) {
return -1;
}
p1 = safe_emalloc((l2 + 1), sizeof(zend_long), 0);
p2 = safe_emalloc((l2 + 1), sizeof(zend_long), 0);
p1 = safe_emalloc((ZSTR_LEN(string2) + 1), sizeof(zend_long), 0);
p2 = safe_emalloc((ZSTR_LEN(string2) + 1), sizeof(zend_long), 0);

for (i2 = 0; i2 <= l2; i2++) {
for (i2 = 0; i2 <= ZSTR_LEN(string2); i2++) {
p1[i2] = i2 * cost_ins;
}
for (i1 = 0; i1 < l1 ; i1++) {
for (i1 = 0; i1 < ZSTR_LEN(string1) ; i1++) {
p2[0] = p1[0] + cost_del;

for (i2 = 0; i2 < l2; i2++) {
c0 = p1[i2] + ((s1[i1] == s2[i2]) ? 0 : cost_rep);
for (i2 = 0; i2 < ZSTR_LEN(string2); i2++) {
c0 = p1[i2] + ((ZSTR_VAL(string1)[i1] == ZSTR_VAL(string2)[i2]) ? 0 : cost_rep);
c1 = p1[i2 + 1] + cost_del;
if (c1 < c0) {
c0 = c1;
Expand All @@ -65,7 +59,7 @@ static zend_long reference_levdist(const char *s1, size_t l1, const char *s2, si
p1 = p2;
p2 = tmp;
}
c0 = p1[l2];
c0 = p1[ZSTR_LEN(string2)];

efree(p1);
efree(p2);
Expand All @@ -74,58 +68,32 @@ static zend_long reference_levdist(const char *s1, size_t l1, const char *s2, si
}
/* }}} */

/* {{{ custom_levdist
*/
static int custom_levdist(char *str1, char *str2, char *callback_name)
{
php_error_docref(NULL, E_WARNING, "The general Levenshtein support is not there yet");
/* not there yet */

return -1;
}
/* }}} */

/* {{{ proto int levenshtein(string str1, string str2[, int cost_ins, int cost_rep, int cost_del])
Calculate Levenshtein distance between two strings */
PHP_FUNCTION(levenshtein)
{
int argc = ZEND_NUM_ARGS();
char *str1, *str2;
char *callback_name;
size_t str1_len, str2_len, callback_len;
zend_long cost_ins, cost_rep, cost_del;
zend_long distance = -1;

switch (argc) {
case 2: /* just two strings: use maximum performance version */
if (zend_parse_parameters(2, "ss", &str1, &str1_len, &str2, &str2_len) == FAILURE) {
RETURN_THROWS();
}
distance = reference_levdist(str1, str1_len, str2, str2_len, 1, 1, 1);
break;

case 5: /* more general version: calc cost by ins/rep/del weights */
if (zend_parse_parameters(5, "sslll", &str1, &str1_len, &str2, &str2_len, &cost_ins, &cost_rep, &cost_del) == FAILURE) {
RETURN_THROWS();
}
distance = reference_levdist(str1, str1_len, str2, str2_len, cost_ins, cost_rep, cost_del);
break;

case 3: /* most general version: calc cost by user-supplied function */
if (zend_parse_parameters(3, "sss", &str1, &str1_len, &str2, &str2_len, &callback_name, &callback_len) == FAILURE) {
RETURN_THROWS();
}
distance = custom_levdist(str1, str2, callback_name);
break;
zend_string *string1, *string2;
zend_long cost_ins = 1;
zend_long cost_rep = 1;
zend_long cost_del = 1;
zend_long distance = 0;

if (zend_parse_parameters(ZEND_NUM_ARGS(), "SS|lll", &string1, &string2, &cost_ins, &cost_rep, &cost_del) == FAILURE) {
RETURN_THROWS();
}

default:
WRONG_PARAM_COUNT;
if (ZSTR_LEN(string1) > LEVENSHTEIN_MAX_LENGTH) {
zend_argument_value_error(1, "must be less than %d characters", LEVENSHTEIN_MAX_LENGTH + 1);
RETURN_THROWS();
}

if (distance < 0 && /* TODO */ ZEND_NUM_ARGS() != 3) {
php_error_docref(NULL, E_WARNING, "Argument string(s) too long");
if (ZSTR_LEN(string2) > LEVENSHTEIN_MAX_LENGTH) {
zend_argument_value_error(2, "must be less than %d characters", LEVENSHTEIN_MAX_LENGTH + 1);
RETURN_THROWS();
}

distance = reference_levdist(string1, string2, cost_ins, cost_rep, cost_del);

RETURN_LONG(distance);
}
/* }}} */
123 changes: 54 additions & 69 deletions ext/standard/tests/strings/levenshtein.phpt
Original file line number Diff line number Diff line change
Expand Up @@ -3,76 +3,61 @@ levenshtein() function test
--FILE--
<?php

function test_me($title,$expect,$text1,$text2,$cost1="",$cost2="",$cost3="") {

if ($cost1=="") {
$result=levenshtein($text1,$text2);
}
elseif ($cost2=="") {
$result=levenshtein($text1,$text2,$cost1);
}
else {
$result=levenshtein($text1,$text2,$cost1,$cost2,$cost3);
}
if($result==$expect) return 0;

echo "$title: result is $result instead of $expect ";
echo "for '$text1'/'$text2' ";
if($cost1) echo "($cost1:$cost2:$cost3)";
echo "\n";

return 1;
}

$n=0;

$n += test_me("equal" , 0, "12345", "12345");
$n += test_me("1st empty" , 3, "", "xzy");
$n += test_me("2nd empty" , 3, "xzy", "");
$n += test_me("both empty" , 0, "", "");
$n += test_me("1 char" , 1, "1", "2");
$n += test_me("2 char swap", 2, "12", "21");

$n += test_me("inexpensive delete", 2, "2121", "11", 2, 1, 1);
$n += test_me("expensive delete" , 10, "2121", "11", 2, 1, 5);
$n += test_me("inexpensive insert", 2, "11", "2121", 1, 1, 1);
$n += test_me("expensive insert" , 10, "11", "2121", 5, 1, 1);

$n += test_me("expensive replace" , 3, "111", "121", 2, 3, 2);
$n += test_me("very expensive replace", 4, "111", "121", 2, 9, 2);

$n += test_me("bug #7368", 2, "13458", "12345");
$n += test_me("bug #7368", 2, "1345", "1234");

$n += test_me("bug #6562", 1, "debugg", "debug");
$n += test_me("bug #6562", 1, "ddebug", "debug");
$n += test_me("bug #6562", 2, "debbbug", "debug");
$n += test_me("bug #6562", 1, "debugging", "debuging");

$n += test_me("bug #16473", 2, "a", "bc");
$n += test_me("bug #16473", 2, "xa", "xbc");
$n += test_me("bug #16473", 2, "xax", "xbcx");
$n += test_me("bug #16473", 2, "ax", "bcx");

$n += test_me("custom", -1, "111", "121", "my_levcode");
$n += test_me("lt maxlength1", 254, "AbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsu", "A");
$n += test_me("gt maxlength1", -1, "AbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuv", "A");

$n += test_me("lt maxlength2", 254, "A", "AbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsu");
$n += test_me("gt maxlength2", -1, "A", "AbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuv");

echo ($n==0)?"all passed\n":"$n failed\n";

var_dump(levenshtein("", "", -1, -1, -1));
var_dump(levenshtein("", "", 10, 10, 10));
echo '--- Equal ---' . \PHP_EOL;
var_dump(levenshtein('12345', '12345'));

echo '--- First string empty ---' . \PHP_EOL;
var_dump(levenshtein('', 'xyz'));
echo '--- Second string empty ---' . \PHP_EOL;
var_dump(levenshtein('xyz', ''));
echo '--- Both empty ---' . \PHP_EOL;
var_dump(levenshtein('', ''));
var_dump(levenshtein('', '', 10, 10, 10));

echo '--- 1 character ---' . \PHP_EOL;
var_dump(levenshtein('1', '2'));
echo '--- 2 character swapped ---' . \PHP_EOL;
var_dump(levenshtein('12', '21'));

echo '--- Inexpensive deletion ---' . \PHP_EOL;
var_dump(levenshtein('2121', '11', 2));
echo '--- Expensive deletion ---' . \PHP_EOL;
var_dump(levenshtein('2121', '11', 2, 1, 5));

echo '--- Inexpensive insertion ---' . \PHP_EOL;
var_dump(levenshtein('11', '2121'));
echo '--- Expensive insertion ---' . \PHP_EOL;
var_dump(levenshtein('11', '2121', 5));

echo '--- Expensive replacement ---' . \PHP_EOL;
var_dump(levenshtein('111', '121', 2, 3, 2));
echo '--- Very expensive replacement ---' . \PHP_EOL;
var_dump(levenshtein('111', '121', 2, 9, 2));

?>
--EXPECTF--
Warning: levenshtein(): The general Levenshtein support is not there yet in %s on line %d

Warning: levenshtein(): Argument string(s) too long in %s on line %d

Warning: levenshtein(): Argument string(s) too long in %s on line %d
all passed
--EXPECT--
--- Equal ---
int(0)
--- First string empty ---
int(3)
--- Second string empty ---
int(3)
--- Both empty ---
int(0)
int(0)
--- 1 character ---
int(1)
--- 2 character swapped ---
int(2)
--- Inexpensive deletion ---
int(2)
--- Expensive deletion ---
int(10)
--- Inexpensive insertion ---
int(2)
--- Expensive insertion ---
int(10)
--- Expensive replacement ---
int(3)
--- Very expensive replacement ---
int(4)
16 changes: 16 additions & 0 deletions ext/standard/tests/strings/levenshtein_bug_16473.phpt
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
--TEST--
levenshtein() bug 16473
--FILE--
<?php

var_dump(levenshtein('a', 'bc'));
var_dump(levenshtein('xa', 'xbc'));
var_dump(levenshtein('xax', 'xbcx'));
var_dump(levenshtein('ax', 'bcx'));

?>
--EXPECT--
int(2)
int(2)
int(2)
int(2)
16 changes: 16 additions & 0 deletions ext/standard/tests/strings/levenshtein_bug_6562.phpt
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
--TEST--
levenshtein() bug 6562
--FILE--
<?php

var_dump(levenshtein('debugg', 'debug'));
var_dump(levenshtein('ddebug', 'debug'));
var_dump(levenshtein('debbbug', 'debug'));
var_dump(levenshtein('debugging', 'debuging'));

?>
--EXPECT--
int(1)
int(1)
int(2)
int(1)
12 changes: 12 additions & 0 deletions ext/standard/tests/strings/levenshtein_bug_7368.phpt
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
--TEST--
levenshtein() bug 7368
--FILE--
<?php

var_dump(levenshtein('13458', '12345'));
var_dump(levenshtein('1345', '1234'));

?>
--EXPECT--
int(2)
int(2)
31 changes: 31 additions & 0 deletions ext/standard/tests/strings/levenshtein_error_conditions.phpt
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
--TEST--
levenshtein() error conditions
--FILE--
<?php

echo '--- String 1 ---' . \PHP_EOL;
var_dump(levenshtein('AbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsu', 'A'));
try {
var_dump(levenshtein('AbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuv', 'A'));
} catch (\ValueError $e) {
echo $e->getMessage() . \PHP_EOL;
}
echo '--- String 2 ---' . \PHP_EOL;
var_dump(levenshtein('A', 'AbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsu'));
try {
var_dump(levenshtein('A', 'AbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuv'));
} catch (\ValueError $e) {
echo $e->getMessage() . \PHP_EOL;
}

// TODO ValueError for negative costs?
// var_dump(levenshtein("", "", -1, -1, -1));

?>
--EXPECT--
--- String 1 ---
int(254)
levenshtein(): Argument #1 ($str1) must be less than 256 characters
--- String 2 ---
int(254)
levenshtein(): Argument #2 ($str2) must be less than 256 characters