I am getting a strange error in TensorFlow when restoring weights from a previously saved model. While reading the variables from the file, TensorFlow prints:
2017-11-23 13:50:06.433715: F C:\tf_jenkins\home\workspace\rel-win\M\windows\PY\36\tensorflow\core\util\tensor_bundle\tensor_bundle.cc:200] Check failed: size > 0 (0 vs. 0)
Immediately after that, the process terminates with:
Process finished with exit code -1073740791 (0xC0000409)
The general setup is this: I import a subset of ResNet-101 (only block1) into a new graph and then add a transfer layer. After that, I save the newly created graph (block1 connected to the transfer layer) with initialized values to a new model file. Then I want to close the previous session, reset the default graph, and import the graph with its values from the file I created a moment ago. When I do that, TensorFlow crashes with the error code shown above.
I suspect that variable
<tf.Variable 'trainable_block/fc/kernel:0' shape=(64, 32, 256, 2048) dtype=float32_ref>
is causing the problem: when I remove it from the tf.train.Saver, everything is imported properly (without this variable, of course).
Another strange thing: when I replace the ResNet import code with an import of my own graph (done the same way as for ResNet), it runs smoothly without any problems, despite the presence of
<tf.Variable 'trainable_block/fc/kernel:0' shape=(64, 32, 256, 2048) dtype=float32_ref>
Why am I suspicious of this particular variable? The function
print_tensors_in_checkpoint_file(file_name=saved_model_path, tensor_name='', all_tensors=True)
crashes in the same way when reading the tensor that belongs to this variable.
From the error I conclude that the variable definition is present in the .meta file but the weights for it are not in the .data files.
I made a minimal code example to reproduce the error.
import tensorflow as tf
import tensorflow.contrib.slim as slim
from tensorflow.contrib.slim.nets import resnet_v2
from os import listdir
from os.path import isfile, join
import os
from tensorflow.python.tools.inspect_checkpoint import print_tensors_in_checkpoint_file
# Directories holding the input images (X_DIR) and the matching targets (Y_DIR).
# A raw string cannot end in a single backslash, so both end in two; input_fn
# prefixes these strings directly onto each file name with tf.string_join.
X_DIR = r"C:\workspace\projects\datasets\seg\64_32_test\\"
Y_DIR = r"C:\workspace\projects\datasets\seg\64_32_mask_test\\"
# Directory that contains every checkpoint read or written by this script.
MODEL_DIR = r"C:\workspace\projects\models\test"
# Checkpoint the ResNet variables are restored from when USE_RESNET is True.
RESNET_MODEL_PATH = os.path.join(MODEL_DIR, 'resnet_v2_101.ckpt')
# Checkpoint written by imitate_resnet and restored from when USE_RESNET is False.
FAKE_RESNET_MODEL_PATH = os.path.join(MODEL_DIR, 'resnet_fake')
# Checkpoints of the combined graph (pretrained part + trainable block) saved by
# the main script: one for the imitation network, one for the ResNet variant.
SAVED_FRES_MODEL_PATH = os.path.join(MODEL_DIR, 'model_fake_train')
SAVED_RES_MODEL_PATH = os.path.join(MODEL_DIR, 'model_train')
# Shape every decoded image is reshaped to in input_fn.
IMG_HEIGHT = 64
IMG_WIDTH = 32
# Channel counts requested from tf.image.decode_image for x and y_true.
CHANNELS_X = 3
CHANNELS_Y = 1
# True: build on ResNet-101 block1; False: build on the small imitation network.
USE_RESNET = True
def input_fn(files_names):
    """
    Builds the tf.data pipeline that yields batches of x and y_true.

    :param files_names: names of the files to load; each name is read once from X_DIR (as x)
                        and once from Y_DIR (as y_true)
    :return: tuple (x, y_true) of tensors produced by a one-shot iterator over the dataset
    """

    def get_img(img_path, channels, shape):
        """ Reads the file at img_path, decodes it and returns it as a float32 tensor of the given shape """
        raw_bytes = tf.read_file(img_path)
        decoded = tf.image.decode_image(raw_bytes, channels=channels)
        return tf.cast(tf.reshape(decoded, shape), tf.float32)

    def map_fn(img_name):
        """ Turns one file name into its (x, y_true) pair; y_true is flattened to a vector """
        x = get_img(tf.string_join([X_DIR, img_name]),
                    CHANNELS_X,
                    [IMG_HEIGHT, IMG_WIDTH, CHANNELS_X])
        y = get_img(tf.string_join([Y_DIR, img_name]),
                    CHANNELS_Y,
                    [IMG_HEIGHT * IMG_WIDTH * CHANNELS_Y])
        return x, y

    # Same transformations, in the same order, expressed as one chain.
    dataset = (
        tf.data.Dataset.from_tensor_slices(files_names)
        .map(map_fn, num_parallel_calls=8)
        .prefetch(buffer_size=256)
        .repeat(1)
        .shuffle(buffer_size=256)
        .batch(32)
    )
    batch_x, batch_y = dataset.make_one_shot_iterator().get_next()
    return batch_x, batch_y
def model_fn(features, labels):
    """
    Builds the network in the default graph: a pretrained sub-graph, a trainable block on top
    of it, a loss and a train op.

    The loss tensor and the train op are only added to the graph; they are not returned.
    Building the pretrained part resets the default graph, so everything created before this
    call survives only if it is an ancestor of the extracted ops.

    :param features: input to network as x
    :param labels: ground truth output of the network as y_true
    :return: values that should be restored into session from the pretrained model
    """

    def reimport_sub_graph(ops_to_extract, vars_scope):
        """
        Keeps only the sub-graph that ops_to_extract depend on: extracts it from the default
        graph, resets the default graph and imports the sub-graph into the new default graph.
        :param ops_to_extract: names of the operations that define the sub-graph
        :param vars_scope: scope filter for the trainable variables to hand back for restoring
        :return: two lists: the imported ops named in ops_to_extract, and the imported elements
                 for the trainable variables found under vars_scope
        """
        var_names = [v.name
                     for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=vars_scope)]
        sub_graph = tf.graph_util.extract_sub_graph(tf.get_default_graph().as_graph_def(),
                                                    ops_to_extract)
        tf.reset_default_graph()
        imported = tf.import_graph_def(sub_graph, name='',
                                       return_elements=ops_to_extract + var_names)
        split_at = len(ops_to_extract)
        return imported[:split_at], imported[split_at:]

    def create_resnet(features, ops_to_extract):
        """
        Creates ResNet_v2_101 graph, then keeps only the sub-graph defined by ops_to_extract
        :param features: input to the network
        :param ops_to_extract: operations which will define subgraph to be created
        :return: ops extracted (as defined in ops_to_extract) and values that should be restored
                 from resnet saved model for newly created graph
        """
        with slim.arg_scope(resnet_v2.resnet_arg_scope()):
            resnet_v2.resnet_v2_101(features)
        return reimport_sub_graph(ops_to_extract, "resnet_v2_101/(block1|conv1)")

    def imitate_resnet(features, ops_to_extract):
        """
        Imitates above function create_resnet with a tiny three-block network, created for
        debugging purposes. Saves that network to FAKE_RESNET_MODEL_PATH if no .meta file
        exists there yet.
        """
        with tf.variable_scope('resnet'):
            net = features
            # Three identical conv + max-pool blocks under scopes b1, b2, b3.
            for block_scope in ('b1', 'b2', 'b3'):
                with tf.variable_scope(block_scope):
                    net = tf.layers.conv2d(
                        inputs=net,
                        filters=32,
                        kernel_size=[3, 3],
                        padding="same",
                        activation=tf.nn.relu)
                    net = tf.layers.max_pooling2d(net, [2, 2], 2)
        if not isfile(FAKE_RESNET_MODEL_PATH + '.meta'):
            fake_saver = tf.train.Saver()
            fake_session = tf.Session()
            fake_session.run(tf.global_variables_initializer())
            fake_session.run(tf.local_variables_initializer())
            fake_saver.save(fake_session, FAKE_RESNET_MODEL_PATH)
            fake_session.close()
        return reimport_sub_graph(ops_to_extract, "resnet/b1")

    # Choose which pretrained network to build and the op at which it is cut off.
    if USE_RESNET:
        build_pretrained = create_resnet
        end_point = 'resnet_v2_101/block1/unit_3/bottleneck_v2/add'
    else:
        build_pretrained = imitate_resnet
        end_point = 'resnet/b1/max_pooling2d/MaxPool'
    extracted_ops, values_to_restore = build_pretrained(features, [labels.op.name, end_point])

    # The first extracted op is the one that produced `labels`; its output 1 is used as y_true.
    labels_op, end_point_op = extracted_ops
    graph_labels = labels_op.outputs[1]
    net = end_point_op.outputs[0]

    with tf.variable_scope('trainable_block'):
        net = tf.layers.batch_normalization(net)
        net = tf.image.resize_images(net, [IMG_HEIGHT, IMG_WIDTH])
        # NOTE(review): the 'fc' kernel has shape
        # (IMG_HEIGHT, IMG_WIDTH, in_channels, IMG_HEIGHT * IMG_WIDTH). For the reported shape
        # (64, 32, 256, 2048) in float32 that is exactly 2**32 bytes (4 GiB); a byte count kept
        # in 32 bits would wrap to 0, which would match "Check failed: size > 0 (0 vs. 0)" --
        # TODO confirm against the TensorFlow build in use.
        net = tf.layers.conv2d(net, IMG_HEIGHT * IMG_WIDTH,
                               kernel_size=[IMG_HEIGHT, IMG_WIDTH], name='fc')

    with tf.name_scope('loss'):
        labels_4_loss = graph_labels / 255.0
        logits = tf.squeeze(net, [1, 2])
        loss_tensor = tf.losses.sigmoid_cross_entropy(multi_class_labels=labels_4_loss,
                                                      logits=logits)

    # Make the train op depend on the ops collected under UPDATE_OPS.
    with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
        with tf.name_scope('train'):
            # Only the variables of the new block are optimized.
            train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                           scope="trainable_block")
            optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
            optimizer.minimize(loss_tensor, var_list=train_vars)

    return values_to_restore
if '__main__' == __name__:
    # Pick the checkpoint pair that belongs to the selected network variant.
    saved_model_path = SAVED_RES_MODEL_PATH if USE_RESNET else SAVED_FRES_MODEL_PATH
    pretrained_model_path = RESNET_MODEL_PATH if USE_RESNET else FAKE_RESNET_MODEL_PATH

    # Step 1: build and save the combined model, but only if it was not saved before.
    if not isfile(saved_model_path + '.meta'):
        files = [f for f in listdir(X_DIR) if isfile(join(X_DIR, f))]

        x, y_true = input_fn(files)
        vals_to_restore = model_fn(x, y_true)
        restorer = tf.train.Saver(var_list=vals_to_restore)

        # The context manager closes the session even if restoring or saving raises.
        with tf.Session() as session:
            session.run(tf.global_variables_initializer())
            session.run(tf.local_variables_initializer())
            # Overwrite the initialized values of the pretrained part with the stored ones.
            restorer.restore(session, pretrained_model_path)
            saver = tf.train.Saver()
            saver.save(session, saved_model_path)

    # Step 2: reset graph & session, then load the graph and its values from the saved model.
    tf.reset_default_graph()
    restorer = tf.train.import_meta_graph(saved_model_path + '.meta')
    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.local_variables_initializer())

        # Workaround that restores everything except the suspicious variable:
        #
        # vars = []
        # for v in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES):
        #     if 'trainable_block/fc/kernel' not in v.name:
        #         vars.append(v)
        # xyz = tf.train.Saver(var_list=vars)
        # xyz.restore(session, saved_model_path)

        print_tensors_in_checkpoint_file(file_name=saved_model_path, tensor_name='', all_tensors=True)
        restorer.restore(session, saved_model_path)
What do you need to do to run this code?
How does the script work? Start with the main.
INFO: I don't have a GPU on my machine, nor is tensorflow-gpu installed.
What puzzles me is why, when using ResNet, the variable cannot be read from the saved model. Why does TensorFlow force the application to exit? Does anyone have any ideas?
User contributions licensed under CC BY-SA 3.0