libdl
0.0.1
Simple yet powerful deep learning
Loading...
Searching...
No Matches
wordpiece.hpp
1
#pragma once
2
3
#include "../../logging.hpp"
4
#include "../../utils/generic_iterator.hpp"
5
6
#include <experimental/propagate_const>
7
#include <iostream>
8
#include <memory>
9
#include <string>
10
#include <vector>
11
12
// Forward declarations for pImpl: dl::WordPieceTokenizer only holds its trie
// through a pointer, so naming these templates is sufficient in this header.
namespace tsl {
namespace ah {
// String hash template; used below as the Hash argument of htrie_map.
template <class CharT>
struct str_hash;
}  // namespace ah

// Trie map template. All four template arguments must be spelled out by users
// of this forward declaration, since no default arguments are declared here.
template <class CharT, class T, class Hash, class KeySizeT>
class htrie_map;
}  // namespace tsl
21
22
namespace
dl {
28
class
WordPieceTokenizer
final {
29
public
:
30
using
PieceIter
=
utils::GenericIterator<std::tuple<std::string, std::size_t>
>;
31
32
struct
Conf
{
33
std::string
contSubwordPrefix;
34
};
35
36
private
:
37
dl::logging::LoggerPtr
logger;
38
std::string
contSubwordPrefix;
39
std::experimental::propagate_const<
40
std::unique_ptr<tsl::htrie_map<char, size_t, tsl::ah::str_hash<char>
,
std::uint16_t
>>>
41
trie;
42
43
WordPieceTokenizer
(
Conf
conf,
PieceIter
begin,
PieceIter
end)
noexcept
;
44
45
public
:
46
~WordPieceTokenizer
();
47
48
[[nodiscard]]
std::vector<size_t>
tokenize(
const
std::string
& text)
const
noexcept
;
49
50
[[nodiscard]]
static
WordPieceTokenizer
fromConf(
std::istream
& stream)
noexcept
;
51
};
52
};
// namespace dl
std::istream
std::string
dl::WordPieceTokenizer
Implements WordPiece tokenization as proposed in [citation missing], using the optimized algorithm by [citation missing].
Definition
wordpiece.hpp:28
dl::utils::GenericIterator
Represents a generic iterator that can be initialized to hold any other class instance that satisfies...
Definition
generic_iterator.hpp:24
tsl::htrie_map
Definition
wordpiece.hpp:19
std::uint16_t
std::shared_ptr
dl::WordPieceTokenizer::Conf
Definition
wordpiece.hpp:32
tsl::ah::str_hash
Definition
wordpiece.hpp:16
std::unique_ptr
std::vector
dl
model
transformer
wordpiece.hpp
Generated by
1.9.8