10 #ifndef _HFST_TOKENIZER_H_
11 #define _HFST_TOKENIZER_H_
26 using hfst::StringSet;
35 class MultiCharSymbolTrie;
36 typedef std::vector<MultiCharSymbolTrie*> MultiCharSymbolTrieVector;
37 typedef std::vector<bool> SymbolEndVector;
// MultiCharSymbolTrie: member declarations as they appear in this listing.
// NOTE(review): the embedded source numbers jump 39 -> 42 and 49 -> 52, so
// some original lines (presumably the braces and access specifiers) are not
// shown here; consult the real header before relying on member visibility.
39 class MultiCharSymbolTrie
// Vector of MultiCharSymbolTrie pointers (MultiCharSymbolTrieVector typedef).
// Ownership of the pointed-to tries is not visible in this listing.
42 MultiCharSymbolTrieVector symbol_rests;
// std::vector<bool> (SymbolEndVector typedef).
// Presumably parallel to symbol_rests -- TODO confirm in the implementation.
43 SymbolEndVector is_leaf;
// Helper declarations (embedded lines 44-49). Each takes a C-string pointer p.
// The const-qualified ones cannot modify the trie object; the exact meaning of
// each is defined in the implementation file, which is not shown here.
44 bool is_end_of_string(
const char * p)
const ;
45 void set_symbol_end(
const char * p);
46 void init_symbol_rests(
const char * p);
47 void add_symbol_rest(
const char * p);
48 bool is_symbol_end(
const char * p)
const;
// Returns a MultiCharSymbolTrie pointer for p; whether it can be null is not
// visible from this declaration.
49 MultiCharSymbolTrie * get_symbol_rest_trie(
const char * p)
const;
// Declarations below carry the HFSTDLL macro (presumably an export/visibility
// macro defined elsewhere -- TODO confirm).
// Default constructor.
52 HFSTDLL MultiCharSymbolTrie(
void);
// Destructor.
53 HFSTDLL ~MultiCharSymbolTrie(
void);
// Non-const member taking a C-string p; judging by the name it inserts p into
// the trie -- confirm against the implementation.
54 HFSTDLL
void add(
const char * p);
// Const member taking a C-string p and returning a const char *. What the
// returned pointer refers to is determined by the implementation, not by this
// declaration.
55 HFSTDLL
const char * find(
const char * p)
const;
87 MultiCharSymbolTrie multi_char_symbols;
88 StringSet skip_symbol_set;
89 int get_next_symbol_size(
const char * symbol)
const;
90 bool is_skip_symbol(
String &s)
const;
119 HFSTDLL
static StringPairVector tokenize_space_separated(
const std::string & str);
128 const std::string &output_string)
const;
131 const std::string &output_string,
132 void (*warn_about_pair)(
const std::pair<std::string, std::string> &symbol_pair))
const;
135 (
const std::string &input_string,
136 const std::string &output_string,
137 void (*warn_about_pair)(
const std::pair<std::string, std::string> &symbol_pair))
const;
std::pair< String, String > StringPair
A symbol pair in a transition.
Definition: HfstSymbolDefs.h:70
HFSTDLL StringPairVector tokenize(const std::string &input_string) const
Tokenize the string input_string.
Definition: HfstTokenizer.cc:125
std::string String
A UTF-8 symbol in a transition.
Definition: HfstSymbolDefs.h:59
std::vector< std::pair< std::string, std::string > > StringPairVector
A vector of string pairs.
Definition: HfstDataTypes.h:105
HFSTDLL HfstTokenizer()
Create a tokenizer that recognizes UTF-8 symbols.
Definition: HfstTokenizer.cc:82
HFSTDLL void add_multichar_symbol(const std::string &symbol)
Add the multicharacter symbol `symbol` to this tokenizer.
Definition: HfstTokenizer.cc:112
HFSTDLL void add_skip_symbol(const std::string &symbol)
Add a symbol to be skipped to this tokenizer.
Definition: HfstTokenizer.cc:118
HFSTDLL StringVector tokenize_one_level(const std::string &input_string) const
Tokenize the string input_string.
Definition: HfstTokenizer.cc:143
std::pair< float, StringVector > HfstOneLevelPath
A path of one level of arcs with collected weight.
Definition: HfstDataTypes.h:96
static HFSTDLL void check_utf8_correctness(const std::string &input_string)
If input_string is not valid UTF-8, throw an IncorrectUtf8CodingException.
Definition: HfstTokenizer.cc:362
A tokenizer for creating transducers from UTF-8 strings.
Definition: HfstTokenizer.h:84
Typedefs and functions for symbols, symbol pairs and sets of symbols.
std::vector< std::string > StringVector
A vector of strings.
Definition: HfstDataTypes.h:87