fatesses
fatesses
采纳率98%
2019-08-30 09:55 阅读 550

python 两句话匹配相同的词

10
SA='成都|上海|北京|南京|深圳'
SV='北京的中国的首都'

我需要一个判定,判断 SV 中是否包含 SA 里的任意一个词语,该如何写呢?
我最开始写的是

    # NOTE: SA is a single str, so this loop visits it one character at a
    # time ('成', '都', '|', ...) rather than one '|'-separated word at a time.
    for k in SA:
        # When k is '|' the pattern becomes the empty string, and
        # re.search('', SV) matches any SV; every other k is a single
        # character, so 'yes' is printed for partial, per-character hits.
        if re.search(k.replace('|', ''), SV):
            print('yes')
  • 点赞
  • 写回答
  • 关注问题
  • 收藏
  • 复制链接分享

4条回答 默认 最新

  • 已采纳
    qq_39412061 吃鸡王者 2019-08-30 11:06

    import re

    # SA is used directly as a regular expression: '|' is regex alternation,
    # so a single search tests all candidate words at once.
    SA = r'成都|上海|北京|南京|深圳'
    SV = '北京的中国的首都'

    # re.search returns a match object (truthy) if any alternative occurs
    # anywhere in SV, otherwise None.
    if re.search(SA, SV):
        print('YES')
    else:
        print('No')

    点赞 评论 复制链接分享
  • dushanglang 独殇狼 2019-08-30 10:16

    SA = '成都|上海|北京|南京|深圳'
    SV = '北京的中国的首都'

    # Split the '|'-separated candidates and print every one that occurs
    # as a substring of SV.
    for s in SA.split("|"):
        if s in SV:
            print(s)

    点赞 评论 复制链接分享
  • weixin_38722900 LingTianshi 2019-08-30 15:19
    # Candidate words, separated by '|', and the sentence to scan.
    SA='成都|上海|北京|南京|深圳'
    SV='京北的中国的首都'

    for word in SA.split("|"):
        # One output line per candidate: "ok" on a substring hit, else "no".
        print("ok" if word in SV else "no")
    
    点赞 评论 复制链接分享
  • dabocaiqq dabocaiqq 2019-08-30 17:08

    https://blog.csdn.net/weixin_33912246/article/details/94517601

    #coding=utf-8

    import xlrd
    import distance
    from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
    import numpy as np
    from scipy.linalg import norm

    # Build the corpus `ls` from the first column of every sheet in the
    # workbook. Row 0 of each sheet is skipped (presumably a header row --
    # confirm against the spreadsheet).
    workbook = xlrd.open_workbook(u'工程师问答.xls')
    sheet_names = workbook.sheet_names()

    ls = []
    for sheet_name in sheet_names:
        sheet1 = workbook.sheet_by_name(sheet_name)
        # Use the sheet's actual row count instead of a hard-coded 3858 so
        # that shorter sheets do not raise and longer ones are read fully.
        for i in range(1, sheet1.nrows):
            row = sheet1.row_values(i)
            ls.append(row[0])

    # Parenthesised single-argument print works on both Python 2 and 3.
    print(len(ls))

    # The sentence every corpus entry is compared against below.
    target = u'D90的发动机热效率是多少?'
    print(u'目标语句:' + target)

    # 编辑距离计算

    def edit_distance(s1, s2):
        """Return the Levenshtein distance between s1 and s2.

        Thin wrapper around ``distance.levenshtein``.
        """
        return distance.levenshtein(s1, s2)

    # Keep the corpus entries whose edit distance to the target is at most 5.
    results = [x for x in ls if edit_distance(x, target) <= 5]
    print(u'1)编辑距离计算,阈值为5')
    for i in results:
        print(i)

    # 杰卡德系数计算

    def jaccard_similarity(s1, s2):
        """Return a character-level Jaccard coefficient of s1 and s2.

        Both strings are split into single characters and turned into count
        vectors over a shared vocabulary. The result is the sum of the
        element-wise minima (intersection) divided by the sum of the
        element-wise maxima (union) of the two vectors.
        """
        def add_space(s):
            # Separate every character with a space so that the
            # whitespace tokenizer below yields one token per character.
            return ' '.join(s)

        s1, s2 = add_space(s1), add_space(s2)
        # Build the term-count matrix: one row per input string.
        cv = CountVectorizer(tokenizer=lambda s: s.split())
        corpus = [s1, s2]
        vectors = cv.fit_transform(corpus).toarray()
        # Intersection: per-token minimum count across the two rows.
        numerator = np.sum(np.min(vectors, axis=0))
        # Union: per-token maximum count across the two rows.
        denominator = np.sum(np.max(vectors, axis=0))
        # 1.0 * forces float division on Python 2 as well.
        return 1.0 * numerator / denominator
    

    # Keep the corpus entries whose Jaccard coefficient with the target
    # exceeds 0.6.
    results = [x for x in ls if jaccard_similarity(x, target) > 0.6]
    print(u'2)杰卡德系数计算,阈值为0.6')
    for i in results:
        print(i)

    # TF 计算

    def tf_similarity(s1, s2):
        """Return the cosine similarity of the character-count vectors of
        s1 and s2.

        Both strings are split into single characters, counted over a shared
        vocabulary, and compared as dot product over the product of norms.
        """
        def add_space(s):
            # Separate every character with a space so that the
            # whitespace tokenizer below yields one token per character.
            return ' '.join(s)

        s1, s2 = add_space(s1), add_space(s2)
        # Build the term-frequency matrix: one row per input string.
        cv = CountVectorizer(tokenizer=lambda s: s.split())
        corpus = [s1, s2]
        vectors = cv.fit_transform(corpus).toarray()
        # Cosine similarity of the two rows.
        return np.dot(vectors[0], vectors[1]) / (norm(vectors[0]) * norm(vectors[1]))
    

    # Keep the corpus entries whose TF cosine similarity with the target
    # exceeds 0.7.
    results = [x for x in ls if tf_similarity(x, target) > 0.7]
    print(u'3)TF 计算,阈值为0.7')
    for i in results:
        print(i)

    # TFIDF 系数

    def tfidf_similarity(s1, s2):
        """Return the cosine similarity of the character-level TF-IDF
        vectors of s1 and s2.

        Both strings are split into single characters, vectorized with
        ``TfidfVectorizer`` fitted on just these two strings, and compared
        as dot product over the product of norms.
        """
        def add_space(s):
            # Separate every character with a space so that the
            # whitespace tokenizer below yields one token per character.
            return ' '.join(s)

        s1, s2 = add_space(s1), add_space(s2)
        # Build the TF-IDF matrix: one row per input string.
        cv = TfidfVectorizer(tokenizer=lambda s: s.split())
        corpus = [s1, s2]
        vectors = cv.fit_transform(corpus).toarray()
        # Cosine similarity of the two TF-IDF rows.
        return np.dot(vectors[0], vectors[1]) / (norm(vectors[0]) * norm(vectors[1]))
    

    # Keep the corpus entries whose TF-IDF cosine similarity with the
    # target exceeds 0.6.
    results = [x for x in ls if tfidf_similarity(x, target) > 0.6]
    print(u'4)TFIDF 系数,阈值为0.6')
    for i in results:
        print(i)

    点赞 评论 复制链接分享

相关推荐