java中如何计算两个字符串的相似度？

时间：2020-12-03 12:02:01 阅读：7 评论：0 收藏：0 [点我收藏+]

标签：imu tran exe osi eth pre javadoc follow mil

发现apache提供了现成的解决方案
技术图片

1.Cosine similarity

package org.apache.commons.text.similarity;

import java.util.HashSet;
import java.util.Map;
import java.util.Set;

/**
 * Measures the Cosine similarity of two vectors of an inner product space and
 * compares the angle between them.
 *
 * <p>
 * For further explanation about the Cosine Similarity, refer to
 * http://en.wikipedia.org/wiki/Cosine_similarity.
 * </p>
 *
 * @since 1.0
 */
public class CosineSimilarity {

 /**
 * Calculates the cosine similarity for two given vectors.
 *
 * @param leftVector left vector
 * @param rightVector right vector
 * @return cosine similarity between the two vectors
 */
 public Double cosineSimilarity(final Map<CharSequence, Integer> leftVector, final Map<CharSequence, Integer> rightVector) {
 if (leftVector == null || rightVector == null) {
 throw new IllegalArgumentException("Vectors must not be null");
 }

 final Set<CharSequence> intersection = getIntersection(leftVector, rightVector);

 final double dotProduct = dot(leftVector, rightVector, intersection);
 double d1 = 0.0d;
 for (final Integer value : leftVector.values()) {
 d1 += Math.pow(value, 2);
 }
 double d2 = 0.0d;
 for (final Integer value : rightVector.values()) {
 d2 += Math.pow(value, 2);
 }
 double cosineSimilarity;
 if (d1 <= 0.0 || d2 <= 0.0) {
 cosineSimilarity = 0.0;
 } else {
 cosineSimilarity = (double) (dotProduct / (double) (Math.sqrt(d1) * Math.sqrt(d2)));
 }
 return cosineSimilarity;
 }

 /**
 * Returns a set with strings common to the two given maps.
 *
 * @param leftVector left vector map
 * @param rightVector right vector map
 * @return common strings
 */
 private Set<CharSequence> getIntersection(final Map<CharSequence, Integer> leftVector,
 final Map<CharSequence, Integer> rightVector) {
 final Set<CharSequence> intersection = new HashSet<>(leftVector.keySet());
 intersection.retainAll(rightVector.keySet());
 return intersection;
 }

 /**
 * Computes the dot product of two vectors. It ignores remaining elements. It means
 * that if a vector is longer than other, then a smaller part of it will be used to compute
 * the dot product.
 *
 * @param leftVector left vector
 * @param rightVector right vector
 * @param intersection common elements
 * @return the dot product
 */
 private double dot(final Map<CharSequence, Integer> leftVector, final Map<CharSequence, Integer> rightVector,
 final Set<CharSequence> intersection) {
 long dotProduct = 0;
 for (final CharSequence key : intersection) {
 dotProduct += leftVector.get(key) * rightVector.get(key);
 }
 return dotProduct;
 }

}

2.JaccardSimilarity

package org.apache.commons.text.similarity;

import java.util.HashSet;
import java.util.Set;

/**
 * Measures the Jaccard similarity (aka Jaccard index) of two sets of character
 * sequence. Jaccard similarity is the size of the intersection divided by the
 * size of the union of the two sets.
 *
 * <p>
 * For further explanation about Jaccard Similarity, refer
 * https://en.wikipedia.org/wiki/Jaccard_index
 * </p>
 *
 * @since 1.0
 */
public class JaccardSimilarity implements SimilarityScore<Double> {

 /**
 * Calculates Jaccard Similarity of two set character sequence passed as
 * input.
 *
 * @param left first character sequence
 * @param right second character sequence
 * @return index
 * @throws IllegalArgumentException
 * if either String input {@code null}
 */
 @Override
 public Double apply(CharSequence left, CharSequence right) {
 if (left == null || right == null) {
 throw new IllegalArgumentException("Input cannot be null");
 }
 return Math.round(calculateJaccardSimilarity(left, right) * 100d) / 100d;
 }

 /**
 * Calculates Jaccard Similarity of two character sequences passed as
 * input. Does the calculation by identifying the union (characters in at
 * least one of the two sets) of the two sets and intersection (characters
 * which are present in set one which are present in set two)
 *
 * @param left first character sequence
 * @param right second character sequence
 * @return index
 */
 private Double calculateJaccardSimilarity(CharSequence left, CharSequence right) {
 Set<String> intersectionSet = new HashSet<String>();
 Set<String> unionSet = new HashSet<String>();
 boolean unionFilled = false;
 int leftLength = left.length();
 int rightLength = right.length();
 if (leftLength == 0 || rightLength == 0) {
 return 0d;
 }

 for (int leftIndex = 0; leftIndex < leftLength; leftIndex++) {
 unionSet.add(String.valueOf(left.charAt(leftIndex)));
 for (int rightIndex = 0; rightIndex < rightLength; rightIndex++) {
 if (!unionFilled) {
 unionSet.add(String.valueOf(right.charAt(rightIndex)));
 }
 if (left.charAt(leftIndex) == right.charAt(rightIndex)) {
 intersectionSet.add(String.valueOf(left.charAt(leftIndex)));
 }
 }
 unionFilled = true;
 }
 return Double.valueOf(intersectionSet.size()) / Double.valueOf(unionSet.size());
 }
}

3.LevenshteinDistance

/**
     * LevenshteinDistance
 * copied from https://commons.apache.org/proper/commons-lang/javadocs/api-2.5/src-html/org/apache/commons/lang/StringUtils.html#line.6162
 */
 public static int getLevenshteinDistance(String s, String t) {
 if (s == null || t == null) {
 throw new IllegalArgumentException("Strings must not be null");
 }

 int n = s.length(); // length of s
 int m = t.length(); // length of t

 if (n == 0) {
 return m;
 } else if (m == 0) {
 return n;
 }

 if (n > m) {
 // swap the input strings to consume less memory
 String tmp = s;
 s = t;
 t = tmp;
 n = m;
 m = t.length();
 }

 int p[] = new int[n + 1]; //‘previous‘ cost array, horizontally
 int d[] = new int[n + 1]; // cost array, horizontally
 int _d[]; //placeholder to assist in swapping p and d

 // indexes into strings s and t
 int i; // iterates through s
 int j; // iterates through t

 char t_j; // jth character of t

 int cost; // cost

 for (i = 0; i <= n; i++) {
 p[i] = i;
 }

 for (j = 1; j <= m; j++) {
 t_j = t.charAt(j - 1);
 d[0] = j;

 for (i = 1; i <= n; i++) {
 cost = s.charAt(i - 1) == t_j ? 0 : 1;
 // minimum of cell to the left+1, to the top+1, diagonally left and up +cost
 d[i] = Math.min(Math.min(d[i - 1] + 1, p[i] + 1), p[i - 1] + cost);
 }

 // copy current distance counts to ‘previous row‘ distance counts
 _d = p;
 p = d;
 d = _d;
 }

 // our last action in the above loop was to switch d and p, so p now
 // actually has the most recent cost counts
 return p[n];
 }

4.JaroWinklerDistance

/**
 * JaroWinklerDistance
 * Copied from https://commons.apache.org/sandbox/commons-text/jacoco/org.apache.commons.text.similarity/JaroWinklerDistance.java.html
 * apply method changed to similarity
 */
 public static Double similarity(final CharSequence left, final CharSequence right) {
 final double defaultScalingFactor = 0.1;
 final double percentageRoundValue = 100.0;

 if (left == null || right == null) {
 throw new IllegalArgumentException("Strings must not be null");
 }

 int[] mtp = matches(left, right);
 double m = mtp[0];
 if (m == 0) {
 return 0D;
 }
 double j = ((m / left.length() + m / right.length() + (m - mtp[1]) / m)) / 3;
 double jw = j < 0.7D ? j : j + Math.min(defaultScalingFactor, 1D / mtp[3]) * mtp[2] * (1D - j);
 return Math.round(jw * percentageRoundValue) / percentageRoundValue;
 }

 protected static int[] matches(final CharSequence first, final CharSequence second) {
 CharSequence max, min;
 if (first.length() > second.length()) {
 max = first;
 min = second;
 } else {
 max = second;
 min = first;
 }
 int range = Math.max(max.length() / 2 - 1, 0);
 int[] matchIndexes = new int[min.length()];
 Arrays.fill(matchIndexes, -1);
 boolean[] matchFlags = new boolean[max.length()];
 int matches = 0;
 for (int mi = 0; mi < min.length(); mi++) {
 char c1 = min.charAt(mi);
 for (int xi = Math.max(mi - range, 0), xn = Math.min(mi + range + 1, max.length()); xi < xn; xi++) {
 if (!matchFlags[xi] && c1 == max.charAt(xi)) {
 matchIndexes[mi] = xi;
 matchFlags[xi] = true;
 matches++;
 break;
 }
 }
 }
 char[] ms1 = new char[matches];
 char[] ms2 = new char[matches];
 for (int i = 0, si = 0; i < min.length(); i++) {
 if (matchIndexes[i] != -1) {
 ms1[si] = min.charAt(i);
 si++;
 }
 }
 for (int i = 0, si = 0; i < max.length(); i++) {
 if (matchFlags[i]) {
 ms2[si] = max.charAt(i);
 si++;
 }
 }
 int transpositions = 0;
 for (int mi = 0; mi < ms1.length; mi++) {
 if (ms1[mi] != ms2[mi]) {
 transpositions++;
 }
 }
 int prefix = 0;
 for (int mi = 0; mi < min.length(); mi++) {
 if (first.charAt(mi) == second.charAt(mi)) {
 prefix++;
 } else {
 break;
 }
 }
 return new int[] { matches, transpositions / 2, prefix, max.length() };
 }