Gentle folks of the forum -- howdy!
I am training a model (VILLAIN) in Colab Pro, and whenever I resume (continue) training I keep getting the following error:
(I tried restoring from a backup and from a saved snapshot, but I get the same error.)
Code: Select all
08/17/2022 20:53:20 INFO Loading Trainer from Original plugin...
08/17/2022 20:53:44 CRITICAL Error caught! Exiting...
08/17/2022 20:53:44 ERROR Caught exception in thread: '_training_0'
08/17/2022 20:53:45 ERROR Got Exception on main handler:
Traceback (most recent call last):
File "/content/faceswap/lib/cli/launcher.py", line 192, in execute_script
process.process()
File "/content/faceswap/scripts/train.py", line 216, in process
self._end_thread(thread, err)
File "/content/faceswap/scripts/train.py", line 256, in _end_thread
thread.join()
File "/content/faceswap/lib/multithreading.py", line 121, in join
raise thread.err[1].with_traceback(thread.err[2])
File "/content/faceswap/lib/multithreading.py", line 37, in run
self._target(*self._args, **self._kwargs)
File "/content/faceswap/scripts/train.py", line 278, in _training
raise err
File "/content/faceswap/scripts/train.py", line 268, in _training
self._run_training_cycle(model, trainer)
File "/content/faceswap/scripts/train.py", line 353, in _run_training_cycle
trainer.train_one_step(viewer, timelapse)
File "/content/faceswap/plugins/train/trainer/_base.py", line 191, in train_one_step
loss = self._model.model.train_on_batch(model_inputs, y=model_targets)
File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 2144, in train_on_batch
logs = self.train_function(iterator)
File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/util/traceback_utils.py", line 153, in error_handler
raise e.with_traceback(filtered_tb) from None
File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/eager/execute.py", line 55, in quick_execute
inputs, attrs, num_outputs)
tensorflow.python.framework.errors_impl.UnimplementedError: Graph execution error:
Detected at node 'villain/encoder/conv_128_4_conv2d/Conv2D' defined at (most recent call last):
File "/usr/lib/python3.7/threading.py", line 890, in _bootstrap
self._bootstrap_inner()
File "/usr/lib/python3.7/threading.py", line 926, in _bootstrap_inner
self.run()
File "/content/faceswap/lib/multithreading.py", line 37, in run
self._target(*self._args, **self._kwargs)
File "/content/faceswap/scripts/train.py", line 268, in _training
self._run_training_cycle(model, trainer)
File "/content/faceswap/scripts/train.py", line 353, in _run_training_cycle
trainer.train_one_step(viewer, timelapse)
File "/content/faceswap/plugins/train/trainer/_base.py", line 191, in train_one_step
loss = self._model.model.train_on_batch(model_inputs, y=model_targets)
File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 2144, in train_on_batch
logs = self.train_function(iterator)
File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 1051, in train_function
return step_function(self, iterator)
File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 1040, in step_function
outputs = model.distribute_strategy.run(run_step, args=(data,))
File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 1030, in run_step
outputs = model.train_step(data)
File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 889, in train_step
y_pred = self(x, training=True)
File "/usr/local/lib/python3.7/dist-packages/keras/utils/traceback_utils.py", line 64, in error_handler
return fn(*args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 490, in __call__
return super().__call__(*args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/keras/utils/traceback_utils.py", line 64, in error_handler
return fn(*args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/keras/engine/base_layer.py", line 1014, in __call__
outputs = call_fn(inputs, *args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/keras/utils/traceback_utils.py", line 92, in error_handler
return fn(*args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/keras/engine/functional.py", line 459, in call
inputs, training=training, mask=mask)
File "/usr/local/lib/python3.7/dist-packages/keras/engine/functional.py", line 596, in _run_internal_graph
outputs = node.layer(*args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/keras/utils/traceback_utils.py", line 64, in error_handler
return fn(*args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 490, in __call__
return super().__call__(*args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/keras/utils/traceback_utils.py", line 64, in error_handler
return fn(*args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/keras/engine/base_layer.py", line 1014, in __call__
outputs = call_fn(inputs, *args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/keras/utils/traceback_utils.py", line 92, in error_handler
return fn(*args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/keras/engine/functional.py", line 459, in call
inputs, training=training, mask=mask)
File "/usr/local/lib/python3.7/dist-packages/keras/engine/functional.py", line 596, in _run_internal_graph
outputs = node.layer(*args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/keras/utils/traceback_utils.py", line 64, in error_handler
return fn(*args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/keras/engine/base_layer.py", line 1014, in __call__
outputs = call_fn(inputs, *args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/keras/utils/traceback_utils.py", line 92, in error_handler
return fn(*args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/keras/layers/convolutional/base_conv.py", line 250, in call
outputs = self.convolution_op(inputs, self.kernel)
File "/usr/local/lib/python3.7/dist-packages/keras/layers/convolutional/base_conv.py", line 232, in convolution_op
name=self.__class__.__name__)
Node: 'villain/encoder/conv_128_4_conv2d/Conv2D'
DNN library is not found.
[[{{node villain/encoder/conv_128_4_conv2d/Conv2D}}]] [Op:__inference_train_function_13460]
Any help is appreciated.
Thank you for your time!
EDIT: I tried other, older, already-trained models, and the same error happens with all of them.