Commit a79576e

add Knowledge distillation TF example (#533)
1 parent 26b53b1 commit a79576e

6 files changed, 70 insertions(+), 28 deletions(-)

examples/tensorflow/distillation/conf.yaml

Lines changed: 11 additions & 16 deletions
@@ -21,7 +21,7 @@ model:
 distillation:
   train:
     start_epoch: 0
-    end_epoch: 10
+    end_epoch: 90
     iteration: 1000
     frequency: 1
     dataloader:
@@ -30,23 +30,21 @@ distillation:
         ImageFolder:
           root: /path/to/dataset
       transform:
-        AlignImageChannel:
-          dim: 3
-        ResizeCropImagenet:
-          height: 224
-          width: 224
-        Normalize:
+        Resize:
+          size: 224
+          interpolation: nearest
+        KerasRescale:
           rescale: [127.5, 1]
     optimizer:
       SGD:
-        learning_rate: 0.1
+        learning_rate: 0.001
         momentum: 0.1
         nesterov: True
         weight_decay: 0.001
     criterion:
       KnowledgeDistillationLoss:
         temperature: 1.0
-        loss_types: ['CE', 'KL']
+        loss_types: ['CE', 'CE']
         loss_weights: [0.5, 0.5]
 
 evaluation: # optional. required if user doesn't provide eval_func in neural_compressor.Quantization.
@@ -59,14 +57,11 @@ evaluation: # optional. required if use
         ImageFolder:
           root: /path/to/dataset
       transform:
-        AlignImageChannel:
-          dim: 3
-        ResizeCropImagenet:
-          height: 224
-          width: 224
-        Normalize:
+        Resize:
+          size: 224
+          interpolation: nearest
+        KerasRescale:
           rescale: [127.5, 1]
-
 tuning:
   accuracy_criterion:
     relative: 0.01 # the tuning target of accuracy loss percentage: 1%
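This config is consumed by the example's main.py (diffed below) through the experimental Distillation API. A minimal sketch of that wiring, assuming the era's `Distillation(conf_path)` constructor and `.model`/`.teacher_model` attributes; the diff below stops at the import line, so the attribute names here are assumptions, not confirmed by this commit:

```python
import tensorflow as tf
from neural_compressor.experimental import Distillation, common

# Hypothetical wiring; only the import is visible in this commit's diff.
distiller = Distillation('conf.yaml')   # train/criterion/optimizer settings come from the YAML
distiller.model = common.Model(tf.keras.applications.MobileNet(weights='imagenet'))
distiller.teacher_model = common.Model(tf.keras.applications.DenseNet201(weights='imagenet'))
distilled_model = distiller()           # runs the distillation loop defined in conf.yaml
```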

examples/tensorflow/distillation/main.py

Lines changed: 22 additions & 6 deletions
@@ -1,3 +1,20 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2021 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import argparse
 import os
 import logging
@@ -6,9 +23,9 @@
 import warnings
 import tensorflow as tf
 from neural_compressor.utils import logger
-model_names = ['mobilenet','mobilenetv2']
+model_names = ['mobilenet','densenet201']
 
-parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')
+parser = argparse.ArgumentParser(description='Tensorflow ImageNet Training')
 parser.add_argument('-t', '--topology', metavar='ARCH', default='resnet18',
                     choices=model_names,
                     help='model architecture: ' +
@@ -51,11 +68,10 @@ def main_worker(args):
     global best_acc1
 
     print("=> using pre-trained model '{}'".format(args.topology))
-    model = tf.keras.applications.mobilenet.MobileNet(weights='imagenet')
-
+    model = tf.keras.applications.MobileNet(weights='imagenet')
+
     print("=> using pre-trained teacher model '{}'".format(args.teacher))
-    teacher_model = tf.keras.applications.mobilenet_v2.MobileNetV2(weights='imagenet')
-    # optionally resume from a checkpoint
+    teacher_model = tf.keras.applications.DenseNet201(weights='imagenet')
 
     if args.distillation:
         from neural_compressor.experimental import Distillation, common
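The teacher swap replaces MobileNetV2 (roughly the same capacity as the student) with the much larger DenseNet201. Both are stock ImageNet classifiers in `tf.keras.applications`, so the only structural requirement is a matching 1000-class output; a quick sanity check:

```python
import tensorflow as tf

student = tf.keras.applications.MobileNet(weights='imagenet')      # ~4.3M parameters
teacher = tf.keras.applications.DenseNet201(weights='imagenet')    # ~20M parameters

# Distillation only needs the two output distributions to line up.
assert student.output_shape == teacher.output_shape == (None, 1000)
```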

neural_compressor/adaptor/tensorflow.py

Lines changed: 6 additions & 3 deletions
@@ -120,10 +120,9 @@ def train(self, model, dataloader, optimizer_tuple,
         iters = kwargs['kwargs'].get('iteration', None)
         callbacks = kwargs['kwargs'].get('callbacks', None)
         distributed = getattr(dataloader, 'distributed', False)
-
         from neural_compressor.experimental.common.criterion import TensorflowKnowledgeDistillationLoss
         if isinstance(criterion, TensorflowKnowledgeDistillationLoss):
-           input_model = model._model
+            input_model = model._model
         else:
             input_model = tf.keras.models.load_model(model._model)
             hooks = callbacks['tf_pruning'](model, input_model, hooks)
@@ -148,7 +147,7 @@ def train(self, model, dataloader, optimizer_tuple,
         def training_step(first_batch):
             with tf.GradientTape() as tape:
                 tape.watch(input_model.trainable_variables)
-                y_ = input_model(x, training=True)
+                y_ = input_model(x)
                 loss_value = criterion(y, y_)
                 tape = self.hvd.DistributedGradientTape(tape) if distributed else tape
                 # Get gradient
@@ -178,17 +177,21 @@ def training_step(first_batch):
                 hooks['on_batch_end']() # on_batch_end hook
                 if iters is not None and cnt >= iters:
                     break
+            model._sess = None
             hooks['on_epoch_end']() # on_epoch_end hook
             # End epoch
             train_loss_results.append(epoch_loss_avg.result())
             if not distributed or self.hvd.local_rank() == 0:
                 logger.info("Epoch {:03d}: Loss: {:.3f}".format(epoch+1, epoch_loss_avg.result()))
+
         hooks['post_epoch_end']() # post_epoch_end hook
         model._sess = None
         if not isinstance(criterion, TensorflowKnowledgeDistillationLoss):
             if not distributed or self.hvd.rank() == 0:
                 # Update the input model with pruned weights manually due to keras API limitation.
                 input_model.save(model._model)
+        else:
+            input_model.save('distillation_model')
 
     @dump_elapsed_time(customized_msg="Model inference")
     def evaluate(self, model, dataloader, postprocess=None,
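The `training_step` change is easiest to read in isolation. Below is a self-contained sketch of the tape-based step from this hunk, assuming a Keras model and a criterion with the `(labels, predictions)` signature used above. Note that dropping `training=True` means layers such as BatchNorm and Dropout run in inference mode during the distillation fine-tuning:

```python
import tensorflow as tf

def make_training_step(input_model, criterion, optimizer):
    """Sketch of the adaptor's per-batch update; names mirror the diff."""
    def training_step(x, y):
        with tf.GradientTape() as tape:
            tape.watch(input_model.trainable_variables)
            y_ = input_model(x)            # no training=True after this commit
            loss_value = criterion(y, y_)
        # Compute and apply gradients for the student model's variables.
        grads = tape.gradient(loss_value, input_model.trainable_variables)
        optimizer.apply_gradients(zip(grads, input_model.trainable_variables))
        return loss_value
    return training_step
```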

neural_compressor/experimental/common/criterion.py

Lines changed: 11 additions & 3 deletions
@@ -20,6 +20,8 @@
 from neural_compressor.utils.utility import LazyImport, singleton
 from neural_compressor.utils import logger
 
+import numpy as np
+
 torch = LazyImport('torch')
 tf = LazyImport('tensorflow')
 
@@ -193,7 +195,9 @@ def loss_cal(self, student_outputs, targets):
 
     def __call__(self, student_outputs, targets):
         if isinstance(self, TensorflowKnowledgeDistillationLoss):
-            student_outputs, targets = targets, student_outputs
+            tmp = student_outputs
+            student_outputs = targets
+            targets = tmp
         return self.loss_cal(student_outputs, targets)
 
 class PyTorchKnowledgeDistillationLoss(KnowledgeDistillationLoss):
@@ -293,22 +297,26 @@ def __init__(self, temperature=1.0, loss_types=['CE', 'CE'],
                          loss_weights=loss_weights)
         if self.student_targets_loss is None:
             if self.loss_types[0] == 'CE':
-                self.student_targets_loss = tf.nn.sparse_softmax_cross_entropy_with_logits
+                self.student_targets_loss = tf.keras.losses.SparseCategoricalCrossentropy()
             else:
                 raise NotImplementedError('Now we only support CrossEntropyLoss '
                                           'for loss of student model output with respect to targets.')
         logger.info('student_targets_loss: {}, {}'.format(self.loss_types[0], \
                     self.loss_weights[0]))
         if self.teacher_student_loss is None:
             if self.loss_types[1] == 'CE':
-                self.teacher_student_loss = tf.keras.losses.CategoricalCrossentropy()
+                self.teacher_student_loss = self.SoftCrossEntropy
             elif self.loss_types[1] == 'KL':
                 self.teacher_student_loss = tf.keras.losses.KLDivergence()
             else:
                 raise NotImplementedError('Now we only support CrossEntropyLoss'
                                           ' for loss of student model output with respect to teacher model ouput.')
         logger.info('teacher_student_loss: {}, {}'.format(self.loss_types[1], \
                     self.loss_weights[1]))
+    def SoftCrossEntropy(self, targets, logits):
+        log_prob = tf.math.log(logits)
+        targets_prob = targets
+        return tf.math.reduce_mean(tf.math.reduce_sum(- targets_prob * log_prob, axis=-1), axis=-1)
 
     def teacher_model_forward(self, input, teacher_model=None):
         if self.loss_weights[1] > 0 and input is not None:
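One subtlety in the added `SoftCrossEntropy`: despite the parameter name `logits`, it applies `tf.math.log` to its second argument directly, so it must be fed probabilities (softmax outputs), not raw logits. A standalone numeric check of the formula:

```python
import tensorflow as tf

def soft_cross_entropy(targets, probs):
    # Same math as TensorflowKnowledgeDistillationLoss.SoftCrossEntropy:
    # batch mean of -sum(p_teacher * log p_student).
    log_prob = tf.math.log(probs)
    return tf.math.reduce_mean(tf.math.reduce_sum(-targets * log_prob, axis=-1), axis=-1)

teacher_probs = tf.constant([[0.7, 0.2, 0.1]])
student_probs = tf.constant([[0.6, 0.3, 0.1]])
print(soft_cross_entropy(teacher_probs, student_probs).numpy())
# 0.7*(-ln 0.6) + 0.2*(-ln 0.3) + 0.1*(-ln 0.1) ≈ 0.829
```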

neural_compressor/experimental/data/datasets/dataset.py

Lines changed: 2 additions & 0 deletions
@@ -701,6 +701,8 @@ def __getitem__(self, index):
         sample = self.image_list[index]
         label = sample[1]
         with Image.open(sample[0]) as image:
+            if image.mode != 'RGB':
+                image = image.convert('RGB')
             image = np.array(image)
         if self.transform is not None:
             image, label = self.transform((image, label))
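The two added lines guard against non-RGB inputs: a grayscale or palette image decodes to a 2-D array, which would break channel-dependent transforms further down the pipeline. A quick illustration:

```python
import numpy as np
from PIL import Image

gray = Image.new('L', (224, 224))            # grayscale image, as found in some ImageNet folders
print(np.array(gray).shape)                  # (224, 224)  -- no channel axis
print(np.array(gray.convert('RGB')).shape)   # (224, 224, 3)
```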

neural_compressor/experimental/data/transforms/transform.py

Lines changed: 18 additions & 0 deletions
@@ -1227,6 +1227,24 @@ def __call__(self, sample):
         image -= self.rescale[1]
         return (image, label)
 
+@transform_registry(transform_type='KerasRescale', process="preprocess", \
+                    framework='tensorflow')
+class RescaleKerasPretrainTransform(BaseTransform):
+    """Scale the values of image to [0,1].
+
+    Returns:
+        tuple of processed image and label
+    """
+    def __init__(self, rescale=None):
+        self.rescale = rescale
+
+    def __call__(self, sample):
+        image, label = sample
+        if self.rescale:
+            image /= self.rescale[0]
+            image -= self.rescale[1]
+        return (image, label)
+
 @transform_registry(transform_type='Rescale', process="preprocess", \
                     framework='tensorflow')
 class RescaleTFTransform(BaseTransform):
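With the `rescale: [127.5, 1]` values from conf.yaml, the new transform computes `image / 127.5 - 1`, mapping pixels from [0, 255] to [-1, 1] (the same convention as Keras's 'tf'-mode `preprocess_input`), so the class docstring's "[0,1]" understates the actual output range. A quick check on float input:

```python
import numpy as np

image = np.array([0.0, 127.5, 255.0])   # float input; the transform divides in place
image /= 127.5
image -= 1
print(image)                             # [-1.  0.  1.]
```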
