halo321 2015-04-27 07:49 采纳率: 0%
浏览 2326

用simhash比较几句话的相似性

现在能做到的是把每个字的hash值求出并存放在string[]中,接下来就不太会了,求大神指导

  • 写回答

1条回答 默认 最新

  • Allen@@ 2015-04-27 08:21
    关注

    //words [0]为属性 [1]为权重

    //hashbits hashCode位数(即生成的SimHash串的长度,例如64)

    //return SimHash串

    getSimHash(String[][] words,int hashbits)

    ------------------------------------------------------------------

    //计算汉明距离

    //str1 simHash生成的code

    //str2 另一个simHash生成的code(需与str1长度相同)

    //return 整型距离,越小越相似;两个串长度不同时返回-1

    getDistance(str1,str2)

    ----------------------------------------------------------------------

    下面是代码

    
    package com.yeahmobi.ymconv.util;
    
    /**
     * SimHash fingerprinting for weighted feature lists, plus a Hamming-distance
     * comparison between two fingerprints.
     *
     * <p>Fingerprints are represented as strings of {@code '0'} and {@code '1'}
     * characters. Feature hashing is delegated to {@code MurmurHash.hash64}, a
     * class that is not part of this file (presumably in the same package —
     * confirm before reuse).
     */
    public class MySimHash {

        /**
         * Builds a SimHash fingerprint from weighted features.
         *
         * <p>Each feature is hashed to a 64-bit value. For every output position the
         * feature's weight is added when the corresponding hash bit is 1 and
         * subtracted when it is 0; a position whose total is strictly positive
         * becomes {@code '1'}, otherwise {@code '0'}.
         *
         * <p>Position {@code j} of the result corresponds to bit
         * {@code hashbits - 1 - j} of the hash. When {@code hashbits} exceeds 64 the
         * extra leading positions are treated as 0 bits; when it is below 64 only the
         * low {@code hashbits} bits of the hash are used.
         *
         * @param words    feature rows; {@code words[i][0]} is the feature text and
         *                 {@code words[i][1]} its integer weight as a decimal string
         *                 (weights {@code <= 0} are treated as 1)
         * @param hashbits length of the fingerprint to produce; must not be negative
         * @return a string of exactly {@code hashbits} characters, each '0' or '1'
         * @throws IllegalArgumentException if {@code hashbits} is negative
         * @throws NumberFormatException    if a weight is not a valid integer
         */
        public static String getSimHash(String[][] words, int hashbits) {
            if (hashbits < 0) {
                throw new IllegalArgumentException("hashbits must not be negative: " + hashbits);
            }

            // Weights are integers, so integral accumulators keep the vote exact.
            long[] tally = new long[hashbits];

            for (String[] word : words) {
                long featureHash = MurmurHash.hash64(word[0]);

                // Parse once per feature rather than once per bit.
                int parsed = Integer.parseInt(word[1]);
                int weight = parsed <= 0 ? 1 : parsed;

                for (int j = 0; j < hashbits; j++) {
                    int bit = hashbits - 1 - j;
                    // Guard bit >= 64: Java masks long shift distances to 6 bits, so
                    // an unguarded shift would wrap around instead of yielding 0.
                    boolean set = bit < Long.SIZE && ((featureHash >>> bit) & 1L) == 1L;
                    tally[j] += set ? weight : -weight;
                }
            }

            StringBuilder fingerprint = new StringBuilder(hashbits);
            for (long vote : tally) {
                fingerprint.append(vote > 0 ? '1' : '0');
            }
            return fingerprint.toString();
        }

        /**
         * Left-pads {@code str} with '0' characters up to a width of {@code hashbits}.
         *
         * <p>The padding is produced with spaces by {@link String#format} and the
         * spaces are then replaced, so any space already inside {@code str} is turned
         * into '0' as well. A string longer than {@code hashbits} is returned with its
         * length unchanged.
         *
         * @param str      text to pad
         * @param hashbits minimum width of the result; used verbatim as the format
         *                 width, so it is expected to be positive
         * @return the padded string
         */
        public static String getZero(String str, int hashbits) {
            String padded = String.format("%" + hashbits + "s", str);
            return padded.replace(" ", "0");
        }

        /**
         * Counts the positions at which two equal-length strings differ (Hamming
         * distance).
         *
         * @param str1 first fingerprint
         * @param str2 second fingerprint
         * @return the number of differing positions, or {@code -1} when the two
         *         strings have different lengths
         */
        public static int getDistance(String str1, String str2) {
            int length = str1.length();
            if (length != str2.length()) {
                return -1;
            }

            int differing = 0;
            for (int i = 0; i < length; i++) {
                if (str1.charAt(i) != str2.charAt(i)) {
                    differing++;
                }
            }
            return differing;
        }

        /**
         * Entry point; intentionally does nothing. Example usage (kept disabled):
         *
         * <pre>
         * String s1 = MySimHash.getSimHash(new String[][] { { "mx", "3" }, { "android", "3" }, { "4.1.1", "3" } }, 64);
         * String s2 = MySimHash.getSimHash(new String[][] { { "mx", "1" }, { "android", "1" }, { "4.2.2", "1" } }, 64);
         * System.out.println(MySimHash.getDistance(s1, s2));
         * </pre>
         *
         * @param args unused
         */
        public static void main(String[] args) {
            // No-op: see the usage example in the Javadoc above.
        }
    }
    
    评论

报告相同问题?

悬赏问题

  • ¥15 执行 virtuoso 命令后,界面没有,cadence 启动不起来
  • ¥50 comfyui下连接animatediff节点生成视频质量非常差的原因
  • ¥20 有关区间dp的问题求解
  • ¥15 多电路系统共用电源的串扰问题
  • ¥15 slam rangenet++配置
  • ¥15 有没有研究水声通信方面的帮我改俩matlab代码
  • ¥15 ubuntu子系统密码忘记
  • ¥15 保护模式-系统加载-段寄存器
  • ¥15 电脑桌面设定一个区域禁止鼠标操作
  • ¥15 求NPF226060磁芯的详细资料