libdl  0.0.1
Simple yet powerful deep learning
Loading...
Searching...
No Matches
wordpiece.hpp
1#pragma once
2
3#include "../../logging.hpp"
4#include "../../utils/generic_iterator.hpp"
5
6#include <experimental/propagate_const>
7#include <iostream>
8#include <memory>
9#include <string>
10#include <vector>
11
12// Forward declarations for pImpl: tsl::ah::str_hash and tsl::htrie_map are only declared (not defined) here
13namespace tsl {
14 namespace ah {
15 template <class CharT>
16 struct str_hash;
17 }
18 template <class CharT, class T, class Hash, class KeySizeT>
19 class htrie_map;
20}; // namespace tsl
21
22namespace dl {
28 class WordPieceTokenizer final {
29 public:
31
32 struct Conf {
33 std::string contSubwordPrefix;
34 };
35
36 private:
38 std::string contSubwordPrefix;
39 std::experimental::propagate_const<
41 trie;
42
43 WordPieceTokenizer(Conf conf, PieceIter begin, PieceIter end) noexcept;
44
45 public:
47
48 [[nodiscard]] std::vector<size_t> tokenize(const std::string& text) const noexcept;
49
50 [[nodiscard]] static WordPieceTokenizer fromConf(std::istream& stream) noexcept;
51 };
52}; // namespace dl
Implements WordPiece tokenization as proposed in [citation missing], using the optimized algorithm by [citation missing].
Definition wordpiece.hpp:28
Represents a generic iterator that can be initialized to hold any other class instance that satisfies...