PHP字符串比较和相似性索引

时间:2013-06-04 18:39:59

标签: php

在不包含PHP空间的情况下,在两个字符串中查找常用字母的优雅代码是什么?

同样返回相似性索引,即计算常用字符数,并将其作为百分比返回字符总数。

假设我有一个字符串“LEGENDARY”,而其他字符串为“BARNEY STINSON”,所以我需要在不包含空格的情况下找到公共字母b / w。

此外,我的代码应该返回一个相似性索引,即计算常用字符的数量,并将其作为百分比返回到总字符数。

对于这两个字符串,常用字符为“ARNEY”,因此得分为5 / 22~ = 22%。对此有何帮助?

2 个答案:

答案 0 :(得分:4)

我发现要计算字符串之间的相似性百分比,LevenshteinJaro Winkler算法适用于拼写错误和字符串之间的小变化,而Smith Waterman Gotoh算法适用于字符串,其中文本的重要块将是相同的,但有"噪声"在边缘。 This answer to a similar question显示了更多详细信息。

以下是使用这三个示例中的每一个返回两个字符串之间的相似性百分比的php示例:

Levenshtein

echo levenshtein("LEGENDARY","BARNEY STINSON");

Jaro Winkler

class StringCompareJaroWinkler 
{
    public function compare($str1, $str2)
    {
        return $this->JaroWinkler($str1, $str2, $PREFIXSCALE = 0.1 );
    }

    private function getCommonCharacters( $string1, $string2, $allowedDistance ){

      $str1_len = mb_strlen($string1);
      $str2_len = mb_strlen($string2);
      $temp_string2 = $string2;

      $commonCharacters='';
      for( $i=0; $i < $str1_len; $i++){

        $noMatch = True;
        // compare if char does match inside given allowedDistance
        // and if it does add it to commonCharacters
        for( $j= max( 0, $i-$allowedDistance ); $noMatch && $j < min( $i + $allowedDistance + 1, $str2_len ); $j++){
          if( $temp_string2[$j] == $string1[$i] ){
            $noMatch = False;
        $commonCharacters .= $string1[$i];
        $temp_string2[$j] = '';
          }
        }
      }
      return $commonCharacters;
    }

    private function Jaro( $string1, $string2 ){

      $str1_len = mb_strlen( $string1 );
      $str2_len = mb_strlen( $string2 );

      // theoretical distance
      $distance = (int) floor(min( $str1_len, $str2_len ) / 2.0); 

      // get common characters
      $commons1 = $this->getCommonCharacters( $string1, $string2, $distance );
      $commons2 = $this->getCommonCharacters( $string2, $string1, $distance );

      if( ($commons1_len = mb_strlen( $commons1 )) == 0) return 0;
      if( ($commons2_len = mb_strlen( $commons2 )) == 0) return 0;
      // calculate transpositions
      $transpositions = 0;
      $upperBound = min( $commons1_len, $commons2_len );
      for( $i = 0; $i < $upperBound; $i++){
        if( $commons1[$i] != $commons2[$i] ) $transpositions++;
      }
      $transpositions /= 2.0;
      // return the Jaro distance
      return ($commons1_len/($str1_len) + $commons2_len/($str2_len) + ($commons1_len - $transpositions)/($commons1_len)) / 3.0;

    }

    private function getPrefixLength( $string1, $string2, $MINPREFIXLENGTH = 4 ){

      $n = min( array( $MINPREFIXLENGTH, mb_strlen($string1), mb_strlen($string2) ) );

      for($i = 0; $i < $n; $i++){
        if( $string1[$i] != $string2[$i] ){
          // return index of first occurrence of different characters 
          return $i;
        }
      }
      // first n characters are the same   
      return $n;
    }

    private function JaroWinkler($string1, $string2, $PREFIXSCALE = 0.1 ){

      $JaroDistance = $this->Jaro( $string1, $string2 );
      $prefixLength = $this->getPrefixLength( $string1, $string2 );
      return $JaroDistance + $prefixLength * $PREFIXSCALE * (1.0 - $JaroDistance);
    }
}

$jw = new StringCompareJaroWinkler();
echo $jw->compare("LEGENDARY","BARNEY STINSON");

Smith Waterman Gotoh

class SmithWatermanGotoh 
{
    private $gapValue;
    private $substitution;

    /**
     * Constructs a new Smith Waterman metric.
     * 
     * @param gapValue
     *            a non-positive gap penalty
     * @param substitution
     *            a substitution function
     */
    public function __construct($gapValue=-0.5, 
                $substitution=null) 
    {
        if($gapValue > 0.0) throw new Exception("gapValue must be <= 0");
        //if(empty($substitution)) throw new Exception("substitution is required");
        if (empty($substitution)) $this->substitution = new SmithWatermanMatchMismatch(1.0, -2.0);
        else $this->substitution = $substitution;
        $this->gapValue = $gapValue;
    }

    public function compare($a, $b) 
    {
        if (empty($a) && empty($b)) {
            return 1.0;
        }

        if (empty($a) || empty($b)) {
            return 0.0;
        }

        $maxDistance = min(mb_strlen($a), mb_strlen($b))
                * max($this->substitution->max(), $this->gapValue);
        return $this->smithWatermanGotoh($a, $b) / $maxDistance;
    }

    private function smithWatermanGotoh($s, $t) 
    {   
        $v0 = [];
        $v1 = [];
        $t_len = mb_strlen($t);
        $max = $v0[0] = max(0, $this->gapValue, $this->substitution->compare($s, 0, $t, 0));

        for ($j = 1; $j < $t_len; $j++) {
            $v0[$j] = max(0, $v0[$j - 1] + $this->gapValue,
                    $this->substitution->compare($s, 0, $t, $j));

            $max = max($max, $v0[$j]);
        }

        // Find max
        for ($i = 1; $i < mb_strlen($s); $i++) {
            $v1[0] = max(0, $v0[0] + $this->gapValue, $this->substitution->compare($s, $i, $t, 0));

            $max = max($max, $v1[0]);

            for ($j = 1; $j < $t_len; $j++) {
                $v1[$j] = max(0, $v0[$j] + $this->gapValue, $v1[$j - 1] + $this->gapValue,
                        $v0[$j - 1] + $this->substitution->compare($s, $i, $t, $j));

                $max = max($max, $v1[$j]);
            }

            for ($j = 0; $j < $t_len; $j++) {
                $v0[$j] = $v1[$j];
            }
        }

        return $max;
    }
}

class SmithWatermanMatchMismatch
{
    private $matchValue;
    private $mismatchValue;

    /**
     * Constructs a new match-mismatch substitution function. When two
     * characters are equal a score of <code>matchValue</code> is assigned. In
     * case of a mismatch a score of <code>mismatchValue</code>. The
     * <code>matchValue</code> must be strictly greater then
     * <code>mismatchValue</code>
     * 
     * @param matchValue
     *            value when characters are equal
     * @param mismatchValue
     *            value when characters are not equal
     */
    public function __construct($matchValue, $mismatchValue) {
        if($matchValue <= $mismatchValue) throw new Exception("matchValue must be > matchValue");

        $this->matchValue = $matchValue;
        $this->mismatchValue = $mismatchValue;
    }

    public function compare($a, $aIndex, $b, $bIndex) {
        return ($a[$aIndex] === $b[$bIndex] ? $this->matchValue
                : $this->mismatchValue);
    }

    public function max() {
        return $this->matchValue;
    }

    public function min() {
        return $this->mismatchValue;
    }
}

$o = new SmithWatermanGotoh();
echo $o->compare("LEGENDARY","BARNEY STINSON");

答案 1 :(得分:1)

similar_text()。如果您想先排除str_replace(' ', '', $string);之前的空格。

echo similar_text ( 'LEGENDARY' , 'BARNEYSTINSON', $percent); // outputs 3
echo $percent; // outputs 27.272727272727

这是另一种仅使用唯一字符的方式

<?php
function unique_chars($string) {
   return count_chars(strtolower(str_replace(' ', '', $string)), 3);
}
function compare_strings($a, $b) {
    $index = similar_text(unique_chars($a), unique_chars($b), $percent);
    return array('index' => $index, 'percent' => $percent);
}
print_r( compare_strings('LEGENDARY', 'BARNEY STINSON') );

// outputs:
?>

Array
(
    [index] => 5
    [percent] => 55.555555555556
)