之前提过这个问题,现在更新一下,寻求大家的帮助。我用基于 TensorFlow 的 Python 程序跑模型预测,程序运行在 CPU 服务器上,偶尔会出现卡死的情况,而且一般是程序运行了一段时间之后才会卡死。这两天具体定位到了卡住的语句,主要是 outputs = self.generator(inputs, reuse=False)
这句话卡住之后一直不往下执行。这句话是在定义计算图:generator 是我定义的神经网络,inputs 是输入,outputs 是输出。请各位大神帮忙看看,为什么执行一段时间以后会卡在这句话上,谢谢大家。
这个是generator函数:
def generator(self, inputs, reuse=False, scope='g_net'):
    """Build the multi-scale encoder-decoder graph and return one output per scale.

    The same network is unrolled ``self.n_levels`` times. At level ``i`` the
    input is resized by ``self.scale ** (self.n_levels - i - 1)`` and
    concatenated (along axis 3) with the resized prediction of the previous
    level; for the first level the input itself takes the place of the
    previous prediction. Variables created at level 0 are reused by all
    later levels.

    Args:
        inputs: 4-D tensor whose static shape is unpacked as (n, h, w, c).
            The height and width must be statically known because they are
            used in Python arithmetic.
        reuse: Forwarded to ``tf.variable_scope`` for ``scope``.
        scope: Name of the variable scope that holds the network weights.

    Returns:
        A list with ``self.n_levels`` tensors, one prediction per level in
        the order they were built. The last entry is produced at scale 1
        (full input resolution).
    """
    _, h, w, _ = inputs.get_shape().as_list()

    use_lstm = self.args.model == 'lstm'
    if use_lstm:
        with tf.variable_scope('LSTM'):
            # Floor division keeps the cell shape integral on Python 3 too
            # (plain "/" would produce floats there).
            cell = BasicConvLSTMCell([h // 4, w // 4], [3, 3], 128)
            rnn_state = cell.zero_state(batch_size=self.batch_size, dtype=tf.float32)

    x_unwrap = []
    with tf.variable_scope(scope, reuse=reuse):
        with slim.arg_scope([slim.conv2d, slim.conv2d_transpose],
                            activation_fn=tf.nn.relu, padding='SAME', normalizer_fn=None,
                            weights_initializer=tf.contrib.layers.xavier_initializer(uniform=True),
                            biases_initializer=tf.constant_initializer(0.0)):
            inp_pred = inputs
            for i in range(self.n_levels):
                scale = self.scale ** (self.n_levels - i - 1)
                hi = int(round(h * scale))
                wi = int(round(w * scale))

                inp_blur = tf.image.resize_images(inputs, [hi, wi], method=0)
                # The previous prediction is treated as a constant input:
                # no gradient flows back through it.
                inp_pred = tf.stop_gradient(
                    tf.image.resize_images(inp_pred, [hi, wi], method=0))
                inp_all = tf.concat([inp_blur, inp_pred], axis=3, name='inp')
                if use_lstm:
                    # Two stride-2 convolutions below shrink the features by 4,
                    # so the recurrent state is resized to match.
                    rnn_state = tf.image.resize_images(rnn_state, [hi // 4, wi // 4], method=0)

                # encoder
                conv1_1 = slim.conv2d(inp_all, 32, [5, 5], scope='enc1_1')
                conv1_2 = ResnetBlock(conv1_1, 32, 5, scope='enc1_2')
                conv1_3 = ResnetBlock(conv1_2, 32, 5, scope='enc1_3')
                conv1_4 = ResnetBlock(conv1_3, 32, 5, scope='enc1_4')
                conv2_1 = slim.conv2d(conv1_4, 64, [5, 5], stride=2, scope='enc2_1')
                conv2_2 = ResnetBlock(conv2_1, 64, 5, scope='enc2_2')
                conv2_3 = ResnetBlock(conv2_2, 64, 5, scope='enc2_3')
                conv2_4 = ResnetBlock(conv2_3, 64, 5, scope='enc2_4')
                conv3_1 = slim.conv2d(conv2_4, 128, [5, 5], stride=2, scope='enc3_1')
                conv3_2 = ResnetBlock(conv3_1, 128, 5, scope='enc3_2')
                conv3_3 = ResnetBlock(conv3_2, 128, 5, scope='enc3_3')
                conv3_4 = ResnetBlock(conv3_3, 128, 5, scope='enc3_4')

                if use_lstm:
                    deconv3_4, rnn_state = cell(conv3_4, rnn_state)
                else:
                    deconv3_4 = conv3_4

                # decoder
                deconv3_3 = ResnetBlock(deconv3_4, 128, 5, scope='dec3_3')
                deconv3_2 = ResnetBlock(deconv3_3, 128, 5, scope='dec3_2')
                deconv3_1 = ResnetBlock(deconv3_2, 128, 5, scope='dec3_1')
                deconv2_4 = slim.conv2d_transpose(deconv3_1, 64, [4, 4], stride=2, scope='dec2_4')
                cat2 = deconv2_4 + conv2_4  # skip connection (element-wise sum)
                deconv2_3 = ResnetBlock(cat2, 64, 5, scope='dec2_3')
                deconv2_2 = ResnetBlock(deconv2_3, 64, 5, scope='dec2_2')
                deconv2_1 = ResnetBlock(deconv2_2, 64, 5, scope='dec2_1')
                deconv1_4 = slim.conv2d_transpose(deconv2_1, 32, [4, 4], stride=2, scope='dec1_4')
                cat1 = deconv1_4 + conv1_4  # skip connection (element-wise sum)
                deconv1_3 = ResnetBlock(cat1, 32, 5, scope='dec1_3')
                deconv1_2 = ResnetBlock(deconv1_3, 32, 5, scope='dec1_2')
                deconv1_1 = ResnetBlock(deconv1_2, 32, 5, scope='dec1_1')
                inp_pred = slim.conv2d(deconv1_1, self.chns, [5, 5], activation_fn=None, scope='dec1_0')

                x_unwrap.append(inp_pred)
                if i == 0:
                    # Every later level shares the weights created at level 0.
                    tf.get_variable_scope().reuse_variables()

    return x_unwrap
补充一下:主函数是下面这段代码。现在的执行逻辑是:有一个批处理文件会一共调用这个主函数 12 次,但是每次调到第 10 次左右的时候就会卡死。
这个是主函数:
def parse_args(argv=None):
    """Parse the command-line options of the deblur script.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default)
            argparse reads ``sys.argv[1:]``, which is the original behaviour;
            passing a list allows programmatic use.

    Returns:
        The ``argparse.Namespace`` with the parsed options. Note that
        ``--lr`` is stored as ``learning_rate`` and ``--gpu`` as ``gpu_id``.
    """
    parser = argparse.ArgumentParser(description='deblur arguments')
    parser.add_argument('--phase', type=str, default='test', help='determine whether train or test')
    #parser.add_argument('--datalist', type=str, default='./datalist_gopro21.txt', help='training datalist')
    parser.add_argument('--model', type=str, default='gray', help='model type: [lstm | gray | color]')
    parser.add_argument('--batch_size', help='training batch size', type=int, default=32)
    parser.add_argument('--epoch', help='training epoch number', type=int, default=15)
    parser.add_argument('--lr', type=float, default=1*1e-4, dest='learning_rate', help='initial learning rate')
    parser.add_argument('--gpu', dest='gpu_id', type=str, default='1', help='use gpu or cpu')
    parser.add_argument('--height', type=int, default=512,
                        help='height for the tensorflow placeholder, should be multiples of 16')
    parser.add_argument('--width', type=int, default=512,
                        help='width for the tensorflow placeholder, should be multiple of 16 for 3 scales')
    parser.add_argument('--input_path', type=str, default='./testing_set1',
                        help='input path for testing images')
    parser.add_argument('--output_path', type=str, default='./testing_res1',
                        help='output path for testing images')
    parser.add_argument('--checkpoint_path', type=str, default='./checkpoints/gray',
                        help='output path for checkpoint')
    return parser.parse_args(argv)
def main(_):
    """Parse the options, select the device and run the requested phase.

    The positional argument is ignored.
    """
    args = parse_args()

    # A non-negative id exposes GPU 0; a negative id hides every GPU so that
    # TensorFlow falls back to the CPU.
    wants_gpu = int(args.gpu_id) >= 0
    os.environ['CUDA_VISIBLE_DEVICES'] = '0' if wants_gpu else '-1'

    # Build the deblur model wrapper and dispatch on the requested phase.
    deblur = model.DEBLUR(args)
    phase = args.phase
    if phase == 'train':
        deblur.train()
    elif phase == 'test':
        # Inference path: this is the call executed in test mode.
        deblur.test(args.height, args.width, args.input_path,
                    args.output_path, args.checkpoint_path)
    else:
        print('phase should be set to either test or train')
# Script entry point: hand control to TensorFlow's app runner, which calls main().
if __name__ == "__main__":
    tf.app.run()
def test(self, height, width, input_path, output_path, checkpoint_path):
    """Restore the trained model and deblur every image found in ``input_path``.

    Each image is edge-padded to a whole number of ``height`` x ``width``
    tiles, every tile is run through the network separately, and the results
    are stitched together, cropped back to the original size and written to
    ``output_path`` under the same file name.

    Args:
        height: Tile height; also the height of the input placeholder.
        width: Tile width; also the width of the input placeholder.
        input_path: Directory whose entries are read as images.
        output_path: Directory for the results (created if missing).
        checkpoint_path: Directory that contains the
            ``B5678-1-60-noise7/deblur.model-15000`` checkpoint.

    Notes:
        Images are expected to be 2-D (single channel), since the shape is
        unpacked as ``(h, w)``. Pixel values are divided by 4095 before
        inference and multiplied by 4095 afterwards — presumably 12-bit data;
        confirm against the training pipeline.
    """
    img_names = sorted(os.listdir(input_path))  # every entry of the input folder
    H, W = height, width
    inp_chns = 3 if self.args.model == 'color' else 1
    self.batch_size = 1
    max_val = 4095.0
    ckpt_name = 'deblur.model' + '-' + '15000'

    if not os.path.isdir(output_path):
        os.makedirs(output_path)

    # Build the inference graph once; it is reused for every tile.
    tf.reset_default_graph()
    graph = tf.get_default_graph()
    inputs = tf.placeholder(shape=[self.batch_size, H, W, inp_chns], dtype=tf.float32)
    outputs = self.generator(inputs, reuse=False)
    saver = tf.train.Saver(tf.global_variables(), max_to_keep=2)

    config = tf.ConfigProto(device_count={"CPU": 1},
                            allow_soft_placement=True,
                            inter_op_parallelism_threads=1,
                            intra_op_parallelism_threads=1,
                            use_per_session_threads=True)
    sess = tf.Session(graph=graph, config=config)
    try:
        # Load the trained weights.
        saver.restore(sess, os.path.join(checkpoint_path, 'B5678-1-60-noise7', ckpt_name))

        for img_name in img_names:
            blur = cv2.imread(os.path.join(input_path, img_name), -1)
            if blur is None:
                # cv2.imread returns None for anything it cannot decode.
                print('skip %s: not a readable image' % img_name)
                continue

            h, w = blur.shape
            # Number of tiles needed to cover the image (ceil division), so an
            # image that is already a multiple of the tile size gets no
            # extra all-padding tiles.
            rows = (h + H - 1) // H
            cols = (w + W - 1) // W
            blur = np.pad(blur, ((0, rows * H - h), (0, cols * W - w)), 'edge')
            after_deblur = np.zeros((rows * H, cols * W), dtype=np.uint16)

            starttotal = time.time()
            for ii in range(rows):
                for jj in range(cols):
                    row_sl = slice(ii * H, (ii + 1) * H)
                    col_sl = slice(jj * W, (jj + 1) * W)

                    # (H, W) -> (1, H, W, 1) to match the placeholder.
                    tile = blur[row_sl, col_sl]
                    tile = np.expand_dims(tile, -1)
                    tile = np.expand_dims(tile, 0)
                    if self.args.model != 'color':
                        tile = np.transpose(tile, (3, 1, 2, 0))

                    preds = sess.run(outputs, feed_dict={inputs: tile / max_val})

                    # The last list entry is the full-resolution prediction.
                    res = np.clip(preds[-1], a_min=0, a_max=1)
                    if self.args.model != 'color':
                        res = np.transpose(res, (3, 1, 2, 0))
                    res = (res[0, :, :, :] * max_val).astype(np.uint16)
                    after_deblur[row_sl, col_sl] = np.squeeze(res)
            print('total time use %4.3fs' % (time.time() - starttotal))

            # Drop the padding and write the result.
            after_deblur = after_deblur[:h, :w]
            after_deblur = np.clip(after_deblur, a_min=0, a_max=4095)
            imtiff = Image.fromarray(after_deblur)
            imtiff.save(os.path.join(output_path, img_name))
    finally:
        # Always release the session, even when an image fails.
        sess.close()