问题遇到的现象和发生背景
下载 UEA 的官方代码,并使用 DWY100k 数据集对其进行测试。
遇到的现象和发生背景,请写出第一个错误信息
出现如下报错:IndexError: index 125000 is out of bounds for axis 0 with size 95500
用代码块功能插入代码,请勿粘贴截图。 不用代码块回答率下降 50%
utils代码
import tensorflow as tf
from include.Model import build_SE, training
import time
from include.Load import *
import json
import scipy
from scipy import spatial
import copy
from collections import defaultdict
def get_hits_gen(simM, test1, test2, id2confi, confi, correct, gap,
                 num_test=10500, extra_id_start=25500, extra_id_shift=15000):
    """Collect mutually-nearest (left, right) pairs whose distance is below ``gap``.

    For every left candidate the closest right candidate is looked up in
    ``simM``. The pair is accepted when the left candidate is also the closest
    one for that right candidate and the distance is smaller than ``gap``.
    Accepted pairs are appended to ``confi`` and stored in ``id2confi``; pairs
    satisfying ``left_id + num_test == right_id`` are also appended to
    ``correct``. A short summary is printed.

    Args:
        simM: 2-D array of distances (smaller means more similar), indexed by
            the row / column positions derived from the ids below.
        test1: ids of the left candidates still to be matched.
        test2: ids of the right candidates still to be matched.
        id2confi: dict updated in place with ``left_id -> right_id``.
        confi: list extended in place with accepted ``[left_id, right_id]``.
        correct: list extended in place with the accepted pairs counted as
            correct.
        gap: upper bound (exclusive) on the distance of an accepted pair.
        num_test: offset subtracted from a right id to obtain its column, and
            the id offset between a left id and its counterpart.
        extra_id_start: left ids greater than or equal to this value are
            shifted before being used as row positions.
        extra_id_shift: amount subtracted from such left ids.

    Returns:
        The tuple ``(confi, correct, id2confi)``.

    Raises:
        IndexError: if a derived row or column position lies outside ``simM``.
            This happens when the three offsets do not match the id layout of
            the dataset in use.
    """
    # Translate entity ids into positions inside the distance matrix.
    rowindex = [eid - extra_id_shift if eid >= extra_id_start else eid
                for eid in test1]
    columnindex = [eid - num_test for eid in test2]

    # Select whole rows first, then columns. Indexing with ``simM[0, rows]``
    # would collapse the matrix to one dimension and break the column step.
    sim = simM[rowindex][:, columnindex]

    evaluated = 0
    if sim.shape[1] > 0:  # without right candidates nothing can be matched
        for i in range(sim.shape[0]):
            row = sim[i, :]
            minrank = row.argsort()[0]
            minscore = row[minrank]
            # Row that is closest to the chosen column (mutual-nearest check).
            minrank_col = sim[:, minrank].argsort()[0]
            if minrank_col == i and minscore < gap:
                left_id, right_id = test1[i], test2[minrank]
                confi.append([left_id, right_id])
                id2confi[left_id] = right_id
                if left_id + num_test == right_id:
                    correct.append([left_id, right_id])
            evaluated += 1

    matchable = sum(1 for item in confi if item[0] < num_test)

    print("Evaluated: " + str(evaluated))
    print("Confi " + str(len(confi)))
    print("Among which matchable " + str(matchable))
    print("Correct " + str(len(correct)))
    return confi, correct, id2confi
def get_hits_gen_nochange(simM, test1, test2, id2confi, correct, gap,
                          num_test=10500, extra_id_start=25500,
                          extra_id_shift=15000):
    """Find mutually-nearest pairs below ``gap`` using a fresh result list.

    Works on the sub-matrix of ``simM`` selected by ``test1`` / ``test2``. A
    pair is accepted when the right candidate is the closest one for the left
    candidate, the left candidate is the closest one for that right candidate,
    and their distance is smaller than ``gap``. Accepted pairs go into a new
    ``confi`` list and into ``id2confi``; pairs satisfying
    ``left_id + num_test == right_id`` are appended to ``correct``.

    Args:
        simM: 2-D array of distances (smaller means more similar).
        test1: ids of the left candidates.
        test2: ids of the right candidates.
        id2confi: dict updated in place with ``left_id -> right_id``.
        correct: list extended in place with the pairs counted as correct.
        gap: upper bound (exclusive) on the distance of an accepted pair.
        num_test: offset subtracted from a right id to obtain its column, and
            the id offset between a left id and its counterpart.
        extra_id_start: left ids greater than or equal to this value are
            shifted before being used as row positions.
        extra_id_shift: amount subtracted from such left ids.

    Returns:
        The tuple ``(confi, correct, id2confi)`` where ``confi`` holds only the
        pairs accepted by this call.

    Raises:
        IndexError: if a derived row or column position lies outside ``simM``.
    """
    # Translate entity ids into positions inside the distance matrix.
    rowindex = [eid - extra_id_shift if eid >= extra_id_start else eid
                for eid in test1]
    columnindex = [eid - num_test for eid in test2]
    sim = simM[rowindex][:, columnindex]

    confi = []
    evaluated = 0
    if sim.shape[1] > 0:  # without right candidates nothing can be matched
        for i in range(sim.shape[0]):
            row = sim[i, :]
            minrank = row.argsort()[0]
            minscore = row[minrank]
            # Column-wise result: the row closest to the chosen column.
            minrank_col = sim[:, minrank].argsort()[0]
            if minrank_col == i and minscore < gap:
                left_id, right_id = test1[i], test2[minrank]
                confi.append([left_id, right_id])
                id2confi[left_id] = right_id
                if left_id + num_test == right_id:
                    correct.append([left_id, right_id])
            evaluated += 1

    print("Evaluated: " + str(evaluated))
    print("Confi " + str(len(confi)))
    print("Correct " + str(len(correct)))
    return confi, correct, id2confi
def getsim_matrix_cosine(vec, test_left, test_right):
    """Return the cosine-distance matrix between two groups of embeddings.

    Args:
        vec: 2-D embedding table; ``vec[e]`` is the vector of entity ``e`` and
            ``vec.shape[1]`` is the embedding width.
        test_left: entity indices whose vectors form the rows of the result.
        test_right: entity indices whose vectors form the columns.

    Returns:
        Array of shape ``(len(test_left), len(test_right))`` holding
        ``1 - cosine_similarity`` for every left/right combination.
    """
    Lvec = tf.placeholder(tf.float32, [None, vec.shape[1]])
    Rvec = tf.placeholder(tf.float32, [None, vec.shape[1]])
    # With both sides L2-normalised the matrix product is the cosine similarity.
    he = tf.nn.l2_normalize(Lvec, dim=-1)
    norm_e_em = tf.nn.l2_normalize(Rvec, dim=-1)
    cosine = tf.matmul(he, tf.transpose(norm_e_em))

    Lv = np.array([vec[e1] for e1 in test_left])
    Rv = np.array([vec[e2] for e2 in test_right])

    # Close the session once the result is fetched; this function is called
    # repeatedly and an unclosed session keeps its resources allocated.
    with tf.Session() as sess:
        similarity = sess.run(cosine, feed_dict={Lvec: Lv, Rvec: Rv})
    return 1 - similarity
def get_hits_ma(sim, test_pair, top_k=(1, 10), num_test=10500, num_total=14888):
    """Print Hits@1, Hits@10 and MRR for a distance matrix.

    Row ``i`` is counted as a hit at ``k`` when column ``i`` is among the ``k``
    smallest entries of that row. Only the first ``num_test`` rows take part.

    Args:
        sim: 2-D array of distances (smaller means more similar).
        test_pair: collection whose length is the denominator of the first
            report line.
        top_k: cut-offs to count; the report prints the first two entries as
            Hits@1 and Hits@10.
        num_test: number of leading rows that are evaluated.
        num_total: denominator of the second Hits@1 line.
            NOTE(review): the meaning of the default 14888 is not visible in
            this file - confirm against the dataset statistics.

    Returns:
        None. The metrics are written to standard output.
    """
    top_lr = [0] * len(top_k)
    mrr_sum_l = 0
    # Rows at or beyond ``num_test`` never contribute, so they are not sorted.
    for i in range(min(sim.shape[0], num_test)):
        rank = sim[i, :].argsort()
        rank_index = np.where(rank == i)[0][0]
        mrr_sum_l = mrr_sum_l + 1.0 / (rank_index + 1)
        for j, cutoff in enumerate(top_k):
            if rank_index < cutoff:
                top_lr[j] += 1
    msg = 'Hits@1:%.3f, Hits@10:%.3f, MRR:%.3f\n' % (
        top_lr[0] / len(test_pair),
        top_lr[1] / len(test_pair),
        mrr_sum_l / len(test_pair),
    )
    print(msg)
    msg = 'Hits@1:%.3f\n' % (top_lr[0] / num_total)
    print(msg)
main代码
import tensorflow as tf
from include.Model import build_SE, training
from include.utils import get_hits_gen, getsim_matrix_cosine, get_hits_ma
import time
from include.Load import *
import json
import scipy
from scipy import spatial
import copy
from collections import defaultdict
import os
# Expose only the GPUs with indices 0, 1 and 3 to CUDA in this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,3"
# Disabled helper kept for reference: when enabled it would replace sys.stdout
# with a Logger that writes every message both to the terminal and to
# "<fileName>.log" under `path`.
# def make_print_to_file(fileName, path='./'):
# import sys
# import os
# import sys
# import datetime
#
# class Logger(object):
# def __init__(self, filename="Default.log", path="./"):
# self.terminal = sys.stdout
# self.log = open(os.path.join(path, filename), "a", encoding='utf8',)
#
# def write(self, message):
# self.terminal.write(message)
# self.log.write(message)
#
# def flush(self):
# pass
# sys.stdout = Logger(fileName + '.log', path=path)
# print(fileName.center(60,'*'))
# Seed the NumPy and TensorFlow random number generators with the same value.
# NOTE(review): `np` is not imported explicitly in this file; presumably it
# comes from the star import of include.Load - confirm.
seed = 12306
np.random.seed(seed)
tf.set_random_seed(seed)
import argparse
if __name__ == '__main__':
    def _str2bool(value):
        """Convert a command-line token into a bool for argparse."""
        if isinstance(value, bool):
            return value
        token = value.strip().lower()
        if token in ('true', 't', 'yes', 'y', '1'):
            return True
        # The empty string stays falsy, as it was under ``type=bool``.
        if token in ('false', 'f', 'no', 'n', '0', ''):
            return False
        raise argparse.ArgumentTypeError('expected a boolean value, got %r' % value)

    parser = argparse.ArgumentParser(description='UEA')
    parser.add_argument('--lan', type=str, default='zh_en')
    parser.add_argument('--alpha', type=float, default=0.5)
    parser.add_argument('--beta', type=float, default=0.5)
    parser.add_argument('--thres', type=float, default=0.05)  # initial threshold
    parser.add_argument('--inc', type=float, default=0.1)  # the increment of threshold
    parser.add_argument('--stopThres', type=float, default=0.45)  # the maximum of threshold
    # ``type=bool`` would turn every non-empty string (even "False") into True,
    # so the flag is parsed explicitly.
    parser.add_argument('--adj', type=_str2bool, default=True)  # whether dynamically adjust the threshold
    parser.add_argument('--fixedThres', type=float, default=0.45)  # if adj is false, one should set a fixed weight
    args = parser.parse_args()
    print(args)

    language = args.lan
    e1 = 'data/' + language + '/ent_ids_1'
    e2 = 'data/' + language + '/ent_ids_2'
    r1 = 'data/' + language + '/rel_ids_1'
    r2 = 'data/' + language + '/rel_ids_2'
    ill = 'data/' + language + '/ref_ent_ids'
    kg1 = 'data/' + language + '/triples_1'
    kg2 = 'data/' + language + '/triples_2'
    # e1_trans = 'data/' + language + '/ent_ids_1_trans_goo'
    sup = 'data/' + language + '/sup_ent_ids'

    # Split sizes used for candidate selection and for the final report.
    # NOTE(review): presumably the number of test pairs and of all linked
    # pairs of the default dataset; include/utils.py hard-codes matching
    # offsets, so both places must be changed together for another dataset.
    n_test = 10500
    n_linked = 15000

    epochs_se = 300
    epochs_ae = 600  # not referenced in this script
    se_dim = 300
    ae_dim = 100  # not referenced in this script
    act_func = tf.nn.relu
    gamma = 3.0  # margin based loss
    k = 25  # number of negative samples for each positive one
    seed = 3  # 30% of seeds (not referenced after this point)
    beta = 0.9  # weight of SE (the fusion below reads args.beta, not this)

    t = time.time()
    e = len(set(loadfile(e1, 1)) | set(loadfile(e2, 1)))  # print(e)
    ILL = loadfile(ill, 2)
    illL = len(ILL)
    test = ILL

    # Left candidates: every line of ent_ids_1 except those with index in
    # [n_test, n_linked).
    test_left = []
    with open(e1, "rb") as inf:
        for i, line in enumerate(inf):
            strs = line.decode().strip().split('\t')
            if i < n_test or i >= n_linked:
                test_left.append(int(strs[0]))

    # Right candidates: the first n_test lines of ent_ids_2.
    test_right = []
    with open(e2, "rb") as inf:
        for i, line in enumerate(inf):
            strs = line.decode().strip().split('\t')
            if i < n_test:  # or i >= 15000:
                test_right.append(int(strs[0]))

    seedss = loadfile(sup, 2)
    KG1 = loadfile(kg1, 3)
    KG2 = loadfile(kg2, 3)

    path = 'data'  # 'entity-alignment-full-data'
    lang = language.split('_')[0]
    with open(file='data/' + lang + '_en/' + lang + '_vectorList.json', mode='r', encoding='utf-8') as f:
        embedding_list = json.load(f)
    ne_vec = np.array(embedding_list)

    # Turn the stored string similarity into a distance, then blend it with
    # the name-embedding distance.
    str_sim = np.load('./' + path + '/' + language + '/string_mat.npy')
    str_sim = 1 - str_sim
    aep_n = getsim_matrix_cosine(ne_vec, test_left, test_right)
    text_combine = aep_n * args.alpha + str_sim * (1 - args.alpha)

    clenold = 0
    id2confi = dict()
    confi = []
    correct = []
    if args.adj:
        thres = args.thres
    else:
        thres = args.fixedThres
    confi, correct, id2confi = get_hits_gen(text_combine, test_left, test_right, id2confi, confi, correct, thres)

    countt = 0
    if len(confi) < n_test - 1:
        # Keep iterating while a round adds more than 30 new confident pairs.
        while len(confi) - clenold > 30:
            print('ROUND ' + str(countt))
            train = np.array(copy.deepcopy(confi))
            clenold = len(confi)

            # Candidates that have not been paired yet; the matched right ids
            # are collected into a set once so each lookup is constant time.
            matched_right = set(id2confi.values())
            test1 = [ee for ee in test_left if ee not in id2confi]
            test2 = [ee for ee in test_right if ee not in matched_right]

            print("Generating structural embeddings.... ")
            output_layer, loss, = build_SE(se_dim, act_func, gamma, k, e, train, KG1 + KG2)
            se_vec, J = training(output_layer, loss, 25, epochs_se, train, e, k)
            countt += 1

            aep = getsim_matrix_cosine(se_vec, test_left, test_right)
            combine = aep * (1 - args.beta) + text_combine * (args.beta)

            if args.adj:
                if thres >= args.stopThres:
                    thres = args.stopThres
                else:
                    thres = thres + args.inc
            else:
                thres = args.fixedThres
            confi, correct, id2confi = get_hits_gen(combine, test1, test2, id2confi, confi, correct, thres)

    print()
    conf = sum(1 for item in confi if item[0] < n_test)
    corr = sum(1 for item in correct if item[0] < n_test)
    # Report a precision of 0.0 instead of dividing by zero when no pair was accepted.
    precision = corr * 1.0 / len(confi) if confi else 0.0
    print("Confi: " + str(len(confi)))
    print("Matchable: " + str(conf))
    print("Correct: " + str(corr))
    print("Precision: " + str(precision))
    print("Recall: " + str(corr * 1.0 / n_test))
    print("total time elapsed: {:.4f} s".format(time.time() - t))
运行结果及详细报错内容
Traceback (most recent call last):
File "main.py", line 119, in <module>
confi, correct, id2confi = get_hits_gen(text_combine, test_left, test_right, id2confi, confi, correct, thres)
File "/home/cclsol/zzy/UEA-main/include/utils.py", line 21, in get_hits_gen
partialsim = simM[rowindex]
IndexError: index 125000 is out of bounds for axis 0 with size 95500
我的解答思路和尝试过的方法,不写自己思路的,回答率下降 60%
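该错误是由于行索引超出了相似度矩阵第 0 维的长度(95500)引起的:utils 中的 10500、15000、25500 等偏移量是硬编码的,与 DWY100k 的实体编号不匹配(例如实体 id 140000 减去 15000 得到行号 125000,而矩阵只有 95500 行),但我不知道应该如何修改这些数值。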