CTCGreedyDecoder in TensorFlow

@tf_export("nn.ctc_greedy_decoder")
@dispatch.add_dispatch_support
def ctc_greedy_decoder(inputs,
                       sequence_length,
                       merge_repeated=True,
                       blank_index=None):
  """Performs greedy decoding on the logits given in input (best path).

  Given a tensor as inputs, the blank_index parameter defines the class
  index of the blank symbol.

  For example:
  If blank_index is equal to 1:
  >>> inf = float("inf")
  >>> logits = tf.constant([[[   0., -inf, -inf],
  ...                        [ -2.3, -inf, -0.1]],
  ...                       [[ -inf, -0.5, -inf],
  ...                        [ -inf, -inf, -0.1]],
  ...                       [[ -inf, -inf, -inf],
  ...                        [ -0.1, -inf, -2.3]]])
  >>> seq_lens = tf.constant([2, 3])
  >>> outputs = tf.nn.ctc_greedy_decoder(
  ...     logits,
  ...     seq_lens,
  ...     blank_index=1)
  Notes:
  - Unlike ctc_beam_search_decoder, ctc_greedy_decoder considers blanks
    as regular elements when computing the probability of a sequence.

  - Default blank_index is (num_classes - 1), unless overridden.

  If merge_repeated is True, merge repeated classes in output.

  This means that if consecutive logits' maximum indices are the same,
  only the first of these is emitted.  The sequence A B B * B * B (where '*'
  is the blank label) becomes
    * A B B B if merge_repeated=True.

    * A B B B B if merge_repeated=False.

  Args:
    inputs: 3-D float Tensor sized [max_time, batch_size, num_classes].
      The logits.
    sequence_length: 1-D int32 vector containing sequence lengths, having size
      [batch_size].
    merge_repeated: Boolean.  Default: True.
    blank_index: (Optional). Default: num_classes - 1. Define the class index
      to use for the blank label. Negative values will start from num_classes,
      ie, -1 will reproduce the ctc_greedy_decoder behavior of using
      num_classes - 1 for the blank symbol, which corresponds to the default.

  Returns:
    A tuple (decoded, neg_sum_logits) where
    decoded: A single-element list. decoded[0]
      is a SparseTensor containing the decoded outputs s.t.:
      decoded.indices: Indices matrix (total_decoded_outputs, 2).
        The rows store: [batch, time].
      decoded.values: Values vector, size (total_decoded_outputs).
        The vector stores the decoded classes.
      decoded.dense_shape: Shape vector, size (2).
        The shape values are: [batch_size, max_decoded_length]
    neg_sum_logits: A float matrix (batch_size x 1) containing, for the
      sequence found, the negative of the sum of the greatest logit at each
      timeframe.
  """
  outputs = gen_ctc_ops.ctc_greedy_decoder(
      inputs,
      sequence_length,
      merge_repeated=merge_repeated,
      blank_index=blank_index)
  (decoded_ix, decoded_val, decoded_shape, log_probabilities) = outputs
  return ([sparse_tensor.SparseTensor(decoded_ix, decoded_val,
                                      decoded_shape)], log_probabilities)
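
A minimal usage sketch of the public API (assuming TensorFlow 2.x with eager execution; the shapes and values below are made up for illustration):

import tensorflow as tf

# Logits shaped [max_time=5, batch_size=2, num_classes=4]; by default the
# blank label is num_classes - 1 = 3.
logits = tf.random.normal([5, 2, 4])
seq_lens = tf.constant([5, 3], dtype=tf.int32)

decoded_list, neg_sum_logits = tf.nn.ctc_greedy_decoder(
    logits, seq_lens, merge_repeated=True)

decoded = decoded_list[0]                      # a tf.sparse.SparseTensor
dense = tf.sparse.to_dense(decoded, default_value=-1)
print(dense.numpy())                           # [batch_size, max_decoded_length]
print(neg_sum_logits.numpy())                  # [batch_size, 1]

The C++ op behind this wrapper is registered as CTCGreedyDecoder: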
REGISTER_OP("CTCGreedyDecoder")
    .Input("inputs: T")
    .Input("sequence_length: int32")
    .Attr("merge_repeated: bool = false")
    .Attr("blank_index: int = -1")
    .Output("decoded_indices: int64")
    .Output("decoded_values: int64")
    .Output("decoded_shape: int64")
    .Output("log_probability: T")
    .Attr("T: {float, double} = DT_FLOAT")
    .SetShapeFn([](InferenceContext* c) {
      ShapeHandle inputs;
      ShapeHandle sequence_length;

      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 3, &inputs));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &sequence_length));

      DimensionHandle batch_size;
      TF_RETURN_IF_ERROR(
          c->Merge(c->Dim(inputs, 1), c->Dim(sequence_length, 0), &batch_size));

      DimensionHandle total_decoded_outputs = c->UnknownDim();
      c->set_output(0, c->Matrix(total_decoded_outputs, 2));
      c->set_output(1, c->Vector(total_decoded_outputs));
      c->set_output(2, c->Vector(2));
      c->set_output(3, c->Matrix(batch_size, 1));
      return Status::OK();
    });
#define REGISTER_OP_IMPL(ctr, name, is_system_op)                         \
  static ::tensorflow::InitOnStartupMarker const register_op##ctr         \
      TF_ATTRIBUTE_UNUSED =                                               \
          TF_INIT_ON_STARTUP_IF(is_system_op || SHOULD_REGISTER_OP(name)) \
          << ::tensorflow::register_op::OpDefBuilderWrapper(name)

#define REGISTER_OP(name)        \
  TF_ATTRIBUTE_ANNOTATE("tf:op") \
  TF_NEW_ID_FOR_INIT(REGISTER_OP_IMPL, name, false)
#define REGISTER_CPU(T)                                                   \
  REGISTER_KERNEL_BUILDER(                                                \
      Name("CTCGreedyDecoder").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
      CTCGreedyDecoderOp<T>);

REGISTER_CPU(float);
REGISTER_CPU(double);
template <typename T>
class CTCGreedyDecoderOp : public OpKernel {
 public:
  explicit CTCGreedyDecoderOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
    OP_REQUIRES_OK(ctx, ctx->GetAttr("merge_repeated", &merge_repeated_));
    OP_REQUIRES_OK(ctx, ctx->GetAttr("blank_index", &blank_index_));
  }
  void Compute(OpKernelContext* ctx) override {
    const Tensor* inputs;
    const Tensor* seq_len;
    Tensor* log_prob = nullptr;
    OpOutputList decoded_indices;
    OpOutputList decoded_values;
    OpOutputList decoded_shape;
    OP_REQUIRES_OK(ctx, decode_helper_.ValidateInputsGenerateOutputs(
                            ctx, &inputs, &seq_len, &log_prob, &decoded_indices,
                            &decoded_values, &decoded_shape));
    const TensorShape& inputs_shape = inputs->shape();
    std::vector<typename TTypes<T>::UnalignedConstMatrix> input_list_t;
    const int64_t max_time = inputs_shape.dim_size(0);
    const int64_t batch_size = inputs_shape.dim_size(1);
    const int64_t num_classes_raw = inputs_shape.dim_size(2);
    OP_REQUIRES(
        ctx, FastBoundsCheck(num_classes_raw, std::numeric_limits<int>::max()),
        errors::InvalidArgument("num_classes cannot exceed max int"));
    const int num_classes = static_cast<const int>(num_classes_raw);
    auto inputs_t = inputs->tensor<T, 3>();

    input_list_t.reserve(max_time);
    for (std::size_t t = 0; t < max_time; ++t) {
      input_list_t.emplace_back(inputs_t.data() + t * batch_size * num_classes,
                                batch_size, num_classes);
    }
    auto seq_len_t = seq_len->vec<int32>();
    auto log_prob_t = log_prob->matrix<T>();

    log_prob_t.setZero();

    int blank_index =
        (blank_index_ < 0) ? num_classes + blank_index_ : blank_index_;
    OP_REQUIRES(ctx, FastBoundsCheck(blank_index, num_classes),
                errors::InvalidArgument("blank_index expected to be between ",
                                        -num_classes, " and ", num_classes - 1,
                                        " but was ", blank_index_));

The decode lambda processes one batch element at a time inside a loop. sequences holds the decoded labels of every path of every batch element, hence the three levels of nesting; the greedy decoder produces only a single path. The length of each sequence is read from seq_len_t. input_list_t[t] has shape [batch_size, num_classes]; RowMax returns the largest logit of the current row together with its class index, and log_prob_t accumulates the negative sum of these maxima. If the winning index is not the blank index and the merge-repeated filter passes, the class is appended to the path. (A NumPy sketch of this loop follows the kernel class below.)

    std::vector<std::vector<std::vector<int> > > sequences(batch_size);
    auto decode = [&](const int64_t begin, const int64_t end) {
      for (int b = begin; b < end; ++b) {
        sequences[b].resize(1);
        auto &sequence = sequences[b][0];
        int prev_indices = -1;
        for (int t = 0; t < seq_len_t(b); ++t) {
          int max_class_indices;
          OP_REQUIRES(ctx, input_list_t[t].dimension(1) > 0,
                      errors::InvalidArgument("Invalid input dimensions."));
          log_prob_t(b, 0) +=
              -RowMax<T>(input_list_t[t], b, &max_class_indices);
          if (max_class_indices != blank_index &&
              !(merge_repeated_ && max_class_indices == prev_indices)) {
            sequence.push_back(max_class_indices);
          }
          prev_indices = max_class_indices;
        }
      }
    };
    const int64_t kCostPerUnit = 50 * max_time * num_classes;
    const int64_t total = batch_size;
    const DeviceBase::CpuWorkerThreads& worker_threads =
        *ctx->device()->tensorflow_cpu_worker_threads();
    Shard(worker_threads.num_threads, worker_threads.workers, total,
          kCostPerUnit, decode);

    OP_REQUIRES_OK(
        ctx, decode_helper_.StoreAllDecodedSequences(
                 sequences, &decoded_indices, &decoded_values, &decoded_shape));
  }
 private:
  CTCDecodeHelper decode_helper_;
  bool merge_repeated_;
  int blank_index_;

  TF_DISALLOW_COPY_AND_ASSIGN(CTCGreedyDecoderOp);
};
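
For intuition, here is a minimal NumPy sketch of the per-batch decode loop above (illustration only: this is not TensorFlow code, and greedy_decode is a made-up helper name):

import numpy as np

def greedy_decode(logits, seq_lens, blank_index, merge_repeated=True):
    """Greedy (best-path) decode of logits shaped [max_time, batch, num_classes].
    Returns (sequences, neg_sum_logits), mirroring the decode lambda above."""
    max_time, batch_size, _ = logits.shape
    sequences = []
    neg_sum_logits = np.zeros((batch_size, 1), dtype=logits.dtype)
    for b in range(batch_size):
        seq, prev = [], -1
        for t in range(seq_lens[b]):
            row = logits[t, b]                     # [num_classes]
            c = int(np.argmax(row))                # RowMax: best class index
            neg_sum_logits[b, 0] -= row[c]         # accumulate -max logit
            if c != blank_index and not (merge_repeated and c == prev):
                seq.append(c)                      # keep non-blank, non-repeated
            prev = c
        sequences.append([seq])                    # one path per batch element
    return sequences, neg_sum_logits

# e.g. greedy_decode(np.random.randn(3, 2, 4), [3, 2], blank_index=3)

The decode helper used by the kernel is CTCDecodeHelper: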
class CTCDecodeHelper {
 public:
  CTCDecodeHelper() : top_paths_(1) {}

  inline int GetTopPaths() const { return top_paths_; }
  void SetTopPaths(int tp) { top_paths_ = tp; }
  Status ValidateInputsGenerateOutputs(
      OpKernelContext* ctx, const Tensor** inputs, const Tensor** seq_len,
      Tensor** log_prob, OpOutputList* decoded_indices,
      OpOutputList* decoded_values, OpOutputList* decoded_shape) const {
    Status status = ctx->input("inputs", inputs);
    if (!status.ok()) return status;
    status = ctx->input("sequence_length", seq_len);
    if (!status.ok()) return status;
    const TensorShape& inputs_shape = (*inputs)->shape();

    if (inputs_shape.dims() != 3) {
      return errors::InvalidArgument("inputs is not a 3-Tensor");
    }
    if (inputs_shape.num_elements() == 0) {
      return errors::InvalidArgument("inputs must not be empty");
    }

    const int64_t max_time = inputs_shape.dim_size(0);
    const int64_t batch_size = inputs_shape.dim_size(1);

    if (max_time == 0) {
      return errors::InvalidArgument("max_time is 0");
    }
    if (!TensorShapeUtils::IsVector((*seq_len)->shape())) {
      return errors::InvalidArgument("sequence_length is not a vector");
    }

    if (!(batch_size == (*seq_len)->dim_size(0))) {
      return errors::FailedPrecondition(
          "len(sequence_length) != batch_size.  ",
          "len(sequence_length):  ", (*seq_len)->dim_size(0),
          " batch_size: ", batch_size);
    }

    auto seq_len_t = (*seq_len)->vec<int32>();

    for (int b = 0; b < batch_size; ++b) {
      if (!(seq_len_t(b) <= max_time)) {
        return errors::FailedPrecondition("sequence_length(", b,
                                          ") <= ", max_time);
      }
    }
    Status s = ctx->allocate_output(
        "log_probability", TensorShape({batch_size, top_paths_}), log_prob);
    if (!s.ok()) return s;

    s = ctx->output_list("decoded_indices", decoded_indices);
    if (!s.ok()) return s;
    s = ctx->output_list("decoded_values", decoded_values);
    if (!s.ok()) return s;
    s = ctx->output_list("decoded_shape", decoded_shape);
    if (!s.ok()) return s;

    return Status::OK();
  }

sequences is emitted through the three output lists decoded_indices, decoded_values, and decoded_shape. top_paths_ is the number of best paths (1 for the greedy decoder). For every batch element, the number of entries in each best path is counted first. (A Python sketch of the resulting sparse layout follows this helper class below.)


  Status StoreAllDecodedSequences(
      const std::vector<std::vector<std::vector<int> > >& sequences,
      OpOutputList* decoded_indices, OpOutputList* decoded_values,
      OpOutputList* decoded_shape) const {

    const int64_t batch_size = sequences.size();
    std::vector<int64_t> num_entries(top_paths_, 0);

    for (const auto& batch_s : sequences) {
      CHECK_EQ(batch_s.size(), top_paths_);
      for (int p = 0; p < top_paths_; ++p) {
        num_entries[p] += batch_s[p].size();
      }
    }
    for (int p = 0; p < top_paths_; ++p) {
      Tensor* p_indices = nullptr;
      Tensor* p_values = nullptr;
      Tensor* p_shape = nullptr;

      const int64_t p_num = num_entries[p];

      Status s =
          decoded_indices->allocate(p, TensorShape({p_num, 2}), &p_indices);
      if (!s.ok()) return s;
      s = decoded_values->allocate(p, TensorShape({p_num}), &p_values);
      if (!s.ok()) return s;
      s = decoded_shape->allocate(p, TensorShape({2}), &p_shape);
      if (!s.ok()) return s;

      auto indices_t = p_indices->matrix<int64_t>();
      auto values_t = p_values->vec<int64_t>();
      auto shape_t = p_shape->vec<int64_t>();

      int64_t max_decoded = 0;
      int64_t offset = 0;

For each batch element, p_batch is that sequence's best path and num_decoded is its length; the labels are copied into values_t. indices_t is filled with the corresponding [b, t] pairs, and offset is the running write position across batch elements.

      for (int64_t b = 0; b < batch_size; ++b) {
        auto& p_batch = sequences[b][p];
        int64_t num_decoded = p_batch.size();
        max_decoded = std::max(max_decoded, num_decoded);
        if (num_decoded > 0) {
          DCHECK_NE(values_t.data(), nullptr)
              << "values_t should not be nullptr: p_num=" << p_num
              << " num_decoded=" << num_decoded;
          DCHECK_LT(offset, values_t.size())
              << "offset should be smaller than values_t.size()";
          std::copy_n(p_batch.begin(), num_decoded, &values_t(offset));
        }
        for (int64_t t = 0; t < num_decoded; ++t, ++offset) {
          indices_t(offset, 0) = b;
          indices_t(offset, 1) = t;
        }
      }

      shape_t(0) = batch_size;
      shape_t(1) = max_decoded;
    }
    return Status::OK();
  }
 private:
  int top_paths_;
  TF_DISALLOW_COPY_AND_ASSIGN(CTCDecodeHelper);
};
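
For intuition, a small Python sketch (illustration only; to_sparse_components is a made-up helper, not a TF API) of the sparse layout that StoreAllDecodedSequences produces for a single path:

def to_sparse_components(sequences):
    """Pack per-batch decoded sequences into SparseTensor components,
    mirroring StoreAllDecodedSequences for a single path (top_paths_ == 1)."""
    indices, values = [], []
    max_decoded = 0
    for b, seq in enumerate(sequences):
        max_decoded = max(max_decoded, len(seq))
        for t, cls in enumerate(seq):
            indices.append([b, t])   # each row stores [batch, time]
            values.append(cls)       # decoded class id
    dense_shape = [len(sequences), max_decoded]
    return indices, values, dense_shape

# e.g. to_sparse_components([[0], [2, 0]])
# -> ([[0, 0], [1, 0], [1, 1]], [0, 2, 0], [2, 2])

The remaining fragments below show the supporting pieces the kernel relies on: OpKernelContext::input, RowMax, Shard, and the CPU thread pool.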
Status OpKernelContext::input(StringPiece name, const Tensor** tensor) {
  int index;
  TF_RETURN_IF_ERROR(get_input_index(name, &index));
  if (input_is_ref(index)) {
    return errors::InvalidArgument("OpKernel used ref input name '", name,
                                   "' when non-ref input was expected");
  }
  *tensor = (*params_->inputs)[index].tensor;
  return Status::OK();
}
template <typename T>
inline T RowMax(const typename TTypes<T>::UnalignedConstMatrix& m, int r,
                int* c) {
  *c = 0;
  CHECK_LT(0, m.dimension(1));
  auto p = m(r, 0);
  for (int i = 1; i < m.dimension(1); ++i) {
    if (m(r, i) > p) {
      p = m(r, i);
      *c = i;
    }
  }
  return p;
}

void Shard(int max_parallelism, thread::ThreadPool* workers, int64_t total,
           int64_t cost_per_unit, std::function<void(int64_t, int64_t)> work) {
  CHECK_GE(total, 0);
  if (total == 0) {
    return;
  }
  max_parallelism = std::min(max_parallelism, GetPerThreadMaxParallelism());
  if (max_parallelism <= 1) {
    work(0, total);
    return;
  }
  if (max_parallelism >= workers->NumThreads()) {
    workers->ParallelFor(total, cost_per_unit, work);
    return;
  }
  Sharder::Do(
      total, cost_per_unit, work,
      [&workers](Sharder::Closure c) { workers->Schedule(c); },
      max_parallelism);
}

When the capped parallelism covers all worker threads, Shard hands the whole range to ThreadPool::ParallelFor, which forwards the work to Eigen's parallelFor:

void ThreadPool::ParallelFor(int64_t total, int64_t cost_per_unit,
                             const std::function<void(int64_t, int64_t)>& fn) {
  CHECK_GE(total, 0);
  CHECK_EQ(total, (int64_t)(Eigen::Index)total);
  threadpool_device_->parallelFor(
      total, Eigen::TensorOpCost(0, 0, cost_per_unit),
      [&fn](Eigen::Index first, Eigen::Index last) { fn(first, last); });
}
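
For intuition, a rough Python analogue of this sharding scheme (illustration only; the shard helper below is made up and uses Python's concurrent.futures rather than TF's thread pool):

from concurrent.futures import ThreadPoolExecutor

def shard(num_threads, total, work):
    """Split [0, total) into contiguous [begin, end) blocks and run
    work(begin, end) on a thread pool, loosely mimicking tensorflow::Shard."""
    if total <= 0:
        return
    block = max(1, (total + num_threads - 1) // num_threads)
    with ThreadPoolExecutor(max_workers=num_threads) as pool:
        futures = [pool.submit(work, begin, min(begin + block, total))
                   for begin in range(0, total, block)]
        for f in futures:
            f.result()  # wait for completion and re-raise worker exceptions

# Example: "decode" 8 batch elements on 4 threads, 2 per shard.
shard(4, 8, lambda begin, end: print(f"decode batch [{begin}, {end})"))

The CPU device that owns these worker threads is declared as ThreadPoolDevice: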

class ThreadPoolDevice : public LocalDevice {
 public:
  ThreadPoolDevice(const SessionOptions& options, const string& name,
                   Bytes memory_limit, const DeviceLocality& locality,
                   Allocator* allocator);
  ~ThreadPoolDevice() override;

  Allocator* GetAllocator(AllocatorAttributes attr) override;
  Allocator* GetScopedAllocator(AllocatorAttributes attr,
                                int64_t step_id) override;
  ScopedAllocatorMgr* GetScopedAllocatorMgr() const override {
    return scoped_allocator_mgr_.get();
  }
  Status MakeTensorFromProto(const TensorProto& tensor_proto,
                             const AllocatorAttributes alloc_attrs,
                             Tensor* tensor) override;
  void CopyTensorInSameDevice(const Tensor* input_tensor, Tensor* output_tensor,
                              const DeviceContext* device_context,
                              StatusCallback done) override;

  Status Sync() override { return Status::OK(); }

  void Compute(OpKernel* op_kernel, OpKernelContext* context) override;
  void ComputeAsync(AsyncOpKernel* op_kernel, OpKernelContext* context,
                    AsyncOpKernel::DoneCallback done) override;

 private:
  void LogInputs(OpKernel* op_kernel, OpKernelContext* context);
  void LogOutputs(OpKernel* op_kernel, OpKernelContext* context);

  Allocator* allocator_;
  std::unique_ptr<ScopedAllocatorMgr> scoped_allocator_mgr_;
  NodeFileWriter* node_file_writer_ = nullptr;
};

References:

Original: https://blog.csdn.net/yiran103/article/details/124057817
Author: 图波列夫
Title: TensorFlow 中的 CTCGreedyDecoder
