# 计算2个字符串的相似距离（采用Levenshtein distance）

```# --------------------------------------------------------------------
# function: Return the Levenshtein distance (also called Edit distance)
#       between two strings
#
# The Levenshtein distance (LD) is a measure of similarity between two
# strings, denoted here by s1 and s2. The distance is the number of
# deletions, insertions or substitutions required to transform s1 into
# s2. The greater the distance, the more different the strings are.
#
# The algorithm employs a proximity matrix, which denotes the distances
# between substrings of the two given strings. Read the embedded comments
# for more info. If you want a deep understanding of the algorithm, print
# the matrix for some test strings and study it
#
# The beauty of this system is that nothing is magical -
#   the distance is intuitively understandable by humans
#
# The distance is named after the Russian scientist Vladimir Levenshtein,
#   who devised the algorithm in 1965
# --------------------------------------------------------------------
sub levenshtein
{

my (\$s1, \$s2) = @_;
my (\$len1, \$len2) = (length \$s1, length \$s2);

# If one of the strings is empty,
#   the distance is the length of the other string
return \$len2 if (\$len1 == 0);
return \$len1 if (\$len2 == 0);

my %mat;

# Init the distance matrix
#
# The first row to 0..\$len1
# The first column to 0..\$len2
# The rest to 0
#
# The first row and column are initialized so to denote distance from the empty string
for (my \$i = 0; \$i <= \$len1; ++\$i)
{
for (my \$j = 0; \$j <= \$len2; ++\$j)
{
\$mat{\$i}{\$j} = 0;
\$mat{0}{\$j} = \$j;
}

\$mat{\$i}{0} = \$i;
}

# Some char-by-char processing is ahead, so prepare array of chars from the strings
my @ar1 = split(//, \$s1);
my @ar2 = split(//, \$s2);

for (my \$i = 1; \$i <= \$len1; ++\$i)
{
for (my \$j = 1; \$j <= \$len2; ++\$j)
{
# Set the cost to 1 iff the ith char of \$s1
#   equals the jth of \$s2
#
# Denotes a substitution cost. When the char are equal
#   there is no need to substitute, so the cost is 0
my \$cost = (\$ar1[\$i-1] eq \$ar2[\$j-1]) ? 0 : 1;

# Cell \$mat{\$i}{\$j} equals the minimum of:
#
# - The cell immediately above plus 1
# - The cell immediately to the left plus 1
# - The cell diagonally above and to the left plus the cost
#
# We can either insert a new char, delete a char or
#   substitute an existing char (with an associated cost)
\$mat{\$i}{\$j} = min([\$mat{\$i-1}{\$j} + 1,
\$mat{\$i}{\$j-1} + 1,
\$mat{\$i-1}{\$j-1} + \$cost]);
}
}

# Finally, the Levenshtein distance equals the rightmost bottom cell of the matrix
# Note that \$mat{\$x}{\$y} denotes the distance between the substrings : 1..\$x and 1..\$y
return \$mat{\$len1}{\$len2};
}
```

0人收藏

0

0