halo321 2015-04-27 07:49 采纳率: 0%
浏览 2326

用simhash比较几句话的相似性

现在能做到的是把每个字的hash值求出并存放在string[]中,接下来就不太会了,求大神指导

  • 写回答

1条回答 默认 最新

  • Allen@@ 2015-04-27 08:21
    关注

    //words [0]为属性 [1]为权重

    //hashbits 生成的SimHash位数(例如64)

    //return SimHash串

    getSimHash(String[][] words,int hashbits)

    ------------------------------------------------------------------

    //计算汉明距离

    //str1 simHash生成的code

    //str2

    //return 整型距离,越小越相似;两个串长度不同时返回-1

    getDistance(str1,str2)

    ----------------------------------------------------------------------

    下面是代码

    
    package com.yeahmobi.ymconv.util;
    
    /**
     * SimHash fingerprinting for weighted feature lists, plus Hamming-distance comparison
     * of the resulting fingerprints.
     *
     * <p>Fingerprints are strings made of the characters {@code '0'} and {@code '1'}, written
     * most significant bit first. Example:
     *
     * <pre>{@code
     * String a = MySimHash.getSimHash(new String[][] {{"mx", "3"}, {"android", "1"}}, 64);
     * String b = MySimHash.getSimHash(new String[][] {{"mx", "1"}, {"generic", "1"}}, 64);
     * int distance = MySimHash.getDistance(a, b); // smaller means more similar
     * }</pre>
     */
    public class MySimHash {

        /** Width, in bits, of the {@code long} hash computed for each feature. */
        private static final int SOURCE_HASH_BITS = 64;

        /**
         * Builds the SimHash fingerprint of a list of weighted features.
         *
         * <p>Each feature is hashed with {@code MurmurHash.hash64}. For every fingerprint
         * position, a set bit in the feature hash adds the feature's weight to that position's
         * tally and a clear bit subtracts it. A position ends up as {@code '1'} when its tally
         * is strictly positive and as {@code '0'} otherwise.
         *
         * <p>When {@code hashbits} is smaller than 64 only the low-order {@code hashbits} bits
         * of each feature hash are used; when it is larger than 64 the extra leading positions
         * are treated as clear bits.
         *
         * @param words    features; {@code words[i][0]} is the feature text and
         *                 {@code words[i][1]} its weight as a decimal integer string
         *                 (weights of zero or less are counted as 1)
         * @param hashbits number of bits in the returned fingerprint; must not be negative
         * @return a string of exactly {@code hashbits} {@code '0'}/{@code '1'} characters
         * @throws IllegalArgumentException if {@code hashbits} is negative
         * @throws NumberFormatException    if a weight is not a parsable integer
         */
        public static String getSimHash(String[][] words, int hashbits) {
            if (hashbits < 0) {
                throw new IllegalArgumentException("hashbits must not be negative: " + hashbits);
            }

            long[] tallies = new long[hashbits];

            for (String[] word : words) {
                long featureHash = MurmurHash.hash64(word[0]);
                // long featureHash = hash(word[0], 64).longValue();

                // Parse once per feature rather than once per bit.
                int parsedWeight = Integer.parseInt(word[1]);
                int weight = parsedWeight <= 0 ? 1 : parsedWeight;

                for (int position = 0; position < hashbits; position++) {
                    // Position 0 is the most significant bit of the fingerprint.
                    int bitIndex = hashbits - 1 - position;
                    // Guard the shift: Java masks long shift counts to 0..63, so indexes
                    // beyond the hash width must be handled explicitly as clear bits.
                    boolean bitSet = bitIndex < SOURCE_HASH_BITS
                            && ((featureHash >>> bitIndex) & 1L) == 1L;
                    tallies[position] += bitSet ? weight : -weight;
                }
            }

            StringBuilder fingerprint = new StringBuilder(hashbits);
            for (long tally : tallies) {
                fingerprint.append(tally > 0 ? '1' : '0');
            }
            return fingerprint.toString();
        }

        /**
         * Left-pads {@code str} with {@code '0'} characters up to {@code hashbits} characters.
         *
         * @param str      the text to pad; its own characters are never modified
         * @param hashbits the minimum length of the result
         * @return {@code str} prefixed with zeros so that it is at least {@code hashbits}
         *         characters long, or {@code str} itself when it is already that long
         * @throws NullPointerException if {@code str} is {@code null}
         */
        public static String getZero(String str, int hashbits) {
            int missing = hashbits - str.length();
            if (missing <= 0) {
                return str;
            }
            StringBuilder padded = new StringBuilder(hashbits);
            for (int i = 0; i < missing; i++) {
                padded.append('0');
            }
            return padded.append(str).toString();
        }

        /**
         * Computes the Hamming distance between two fingerprints.
         *
         * @param str1 first fingerprint
         * @param str2 second fingerprint
         * @return the number of positions at which the two strings differ (smaller means more
         *         similar), or {@code -1} when the strings have different lengths
         */
        public static int getDistance(String str1, String str2) {
            if (str1.length() != str2.length()) {
                return -1;
            }
            int distance = 0;
            for (int i = 0; i < str1.length(); i++) {
                if (str1.charAt(i) != str2.charAt(i)) {
                    distance++;
                }
            }
            return distance;
        }

        /**
         * Entry point kept for compatibility; it intentionally does nothing.
         * See the class documentation for a usage example.
         *
         * @param args ignored
         */
        public static void main(String[] args) {
            // Intentionally empty.
        }
    }
    
    评论

报告相同问题?

悬赏问题

  • ¥15 用stata实现聚类的代码
  • ¥15 请问paddlehub能支持移动端开发吗?在Android studio上该如何部署?
  • ¥170 如图所示配置eNSP
  • ¥20 docker里部署springboot项目,访问不到扬声器
  • ¥15 netty整合springboot之后自动重连失效
  • ¥15 悬赏!微信开发者工具报错,求帮改
  • ¥20 wireshark抓不到vlan
  • ¥20 关于#stm32#的问题:需要指导自动酸碱滴定仪的原理图程序代码及仿真
  • ¥20 设计一款异域新娘的视频相亲软件需要哪些技术支持
  • ¥15 stata安慰剂检验作图但是真实值不出现在图上