HFST - Helsinki Finite-State Transducer Technology - C++ API  version 3.9.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
HfstTokenizer.h
Go to the documentation of this file.
1 // Copyright (c) 2016 University of Helsinki
2 //
3 // This library is free software; you can redistribute it and/or
4 // modify it under the terms of the GNU Lesser General Public
5 // License as published by the Free Software Foundation; either
6 // version 3 of the License, or (at your option) any later version.
7 // See the file COPYING included with this distribution for more
8 // information.
9 
10 #ifndef _HFST_TOKENIZER_H_
11 #define _HFST_TOKENIZER_H_
12 #include "HfstSymbolDefs.h"
13 #include "HfstExceptionDefs.h"
14 #include <iostream>
15 #include <climits>
16 #include <string>
17 
18 #include "hfstdll.h"
19 
23 namespace hfst
24 {
25  using hfst::String;
26  using hfst::StringSet;
27  using hfst::StringPair;
29 
30  // Copied from HfstDataTypes.h because including the file
31  // causes problems with header file #ifndef _HEADER_FILE_H_ guards
    // NOTE(review): std::vector and std::pair are used below, but this header
    // does not include <vector> or <utility> itself; presumably they arrive
    // through HfstSymbolDefs.h -- confirm.
    // A vector of strings.
32  typedef std::vector<std::string> StringVector;
    // A path of one level of arcs with collected weight: a pair of a float
    // (presumably the weight) and the StringVector of symbols on the path.
33  typedef std::pair<float,StringVector> HfstOneLevelPath;
34 
    // Forward declaration so the pointer-vector typedef below can name the
    // class before its full declaration.
35  class MultiCharSymbolTrie;
    // A vector of raw pointers to MultiCharSymbolTrie objects.
36  typedef std::vector<MultiCharSymbolTrie*> MultiCharSymbolTrieVector;
    // A vector of boolean flags; going by the name, each flag marks whether
    // a symbol ends at the corresponding position -- confirm in the .cc file.
37  typedef std::vector<bool> SymbolEndVector;
38 
// Storage for multi-character symbols, organised (going by the class and
// member names) as a trie whose nodes are linked through symbol_rests.
// Only declarations are given here; the member definitions live in the
// implementation file.
// NOTE(review): the class holds raw MultiCharSymbolTrie* pointers and
// declares a destructor, but declares no copy constructor or copy
// assignment. If the destructor deletes the pointed-to nodes, copying an
// instance would be unsafe -- confirm against the destructor definition.
39  class MultiCharSymbolTrie
40  {
41  private:
    // Pointers to further MultiCharSymbolTrie objects
    // (std::vector<MultiCharSymbolTrie*>); presumably one slot per possible
    // next character -- confirm.
42  MultiCharSymbolTrieVector symbol_rests;
    // Boolean flags (std::vector<bool>); presumably marks the positions at
    // which a stored symbol ends -- confirm.
43  SymbolEndVector is_leaf;
    // Private helpers. Each takes a pointer into a C string; what exactly
    // they test or modify is defined in the implementation file.
44  bool is_end_of_string(const char * p) const ;
45  void set_symbol_end(const char * p);
46  void init_symbol_rests(const char * p);
47  void add_symbol_rest(const char * p);
48  bool is_symbol_end(const char * p) const;
49  MultiCharSymbolTrie * get_symbol_rest_trie(const char * p) const;
50 
51  public:
    // HFSTDLL is a macro that is not defined in this listing; presumably the
    // DLL export/import decoration supplied by hfstdll.h.
52  HFSTDLL MultiCharSymbolTrie(void);
53  HFSTDLL ~MultiCharSymbolTrie(void);
    // Add the symbol starting at p to this trie (presumably p is a
    // NUL-terminated string -- confirm).
54  HFSTDLL void add(const char * p);
    // Look up a stored symbol at the start of p. Returns a const char *;
    // the meaning of the returned pointer (and of a "not found" result) is
    // not visible here -- see the definition before relying on it.
55  HFSTDLL const char * find(const char * p) const;
56  };
57 
// Body of HfstTokenizer: a tokenizer for creating transducers from UTF-8
// strings. Only declarations are given here; definitions are in
// HfstTokenizer.cc.
85  {
86  private:
    // Multi-character symbols known to this tokenizer (presumably filled by
    // add_multichar_symbol -- confirm).
87  MultiCharSymbolTrie multi_char_symbols;
    // Symbols to be skipped (presumably filled by add_skip_symbol -- confirm).
88  StringSet skip_symbol_set;
    // Presumably the length of the next symbol starting at `symbol`;
    // units and error value are not visible here -- confirm.
89  int get_next_symbol_size(const char * symbol) const;
    // Whether s is a skip symbol.
    // NOTE(review): s is taken by non-const reference although the method is
    // const; check whether the definition modifies s.
90  bool is_skip_symbol(String &s) const;
91 
92  public:
93 
    // Create a tokenizer that recognizes UTF-8 symbols.
95  HFSTDLL HfstTokenizer();
96 
    // Add a symbol to be skipped to this tokenizer.
103  HFSTDLL void add_skip_symbol(const std::string &symbol);
104 
    // Add the multicharacter symbol `symbol` to this tokenizer.
111  HFSTDLL void add_multichar_symbol(const std::string& symbol);
112 
    // Tokenize the string input_string; the result is a vector of string
    // pairs.
114  HFSTDLL StringPairVector tokenize(const std::string &input_string) const;
115 
    // Tokenize the string input_string; the result is a vector of strings.
117  HFSTDLL StringVector tokenize_one_level(const std::string &input_string) const;
118 
    // Static, so it uses no tokenizer state. Presumably splits str at spaces
    // into symbols -- confirm against the definition.
119  HFSTDLL static StringPairVector tokenize_space_separated(const std::string & str);
120 
    // Two-string overload; presumably tokenizes input_string and
    // output_string and pairs the resulting symbols -- confirm how unequal
    // lengths are handled.
127  HFSTDLL StringPairVector tokenize(const std::string &input_string,
128  const std::string &output_string) const;
129 
    // As the two-string overload, with a callback that receives a symbol
    // pair; when the callback is invoked is not visible here -- confirm.
130  HFSTDLL StringPairVector tokenize(const std::string &input_string,
131  const std::string &output_string,
132  void (*warn_about_pair)(const std::pair<std::string, std::string> &symbol_pair)) const;
133 
    // Same parameters as the callback overload of tokenize; going by the
    // name, additionally aligns flag diacritics between the two sides --
    // confirm against the definition.
134  HFSTDLL StringPairVector tokenize_and_align_flag_diacritics
135  (const std::string &input_string,
136  const std::string &output_string,
137  void (*warn_about_pair)(const std::pair<std::string, std::string> &symbol_pair)) const;
138 
    // If input_string is not valid UTF-8, throw an
    // IncorrectUtf8CodingException.
156  HFSTDLL static void check_utf8_correctness(const std::string &input_string);
157  };
158 }
159 #endif
std::pair< String, String > StringPair
A symbol pair in a transition.
Definition: HfstSymbolDefs.h:70
HFSTDLL StringPairVector tokenize(const std::string &input_string) const
Tokenize the string input_string.
Definition: HfstTokenizer.cc:125
std::string String
A UTF-8 symbol in a transition.
Definition: HfstSymbolDefs.h:59
std::vector< std::pair< std::string, std::string > > StringPairVector
A vector of string pairs.
Definition: HfstDataTypes.h:105
A file for exceptions.
HFSTDLL HfstTokenizer()
Create a tokenizer that recognizes utf-8 symbols.
Definition: HfstTokenizer.cc:82
HFSTDLL void add_multichar_symbol(const std::string &symbol)
Add the multicharacter symbol "symbol" to this tokenizer.
Definition: HfstTokenizer.cc:112
HFSTDLL void add_skip_symbol(const std::string &symbol)
Add a symbol to be skipped to this tokenizer.
Definition: HfstTokenizer.cc:118
HFSTDLL StringVector tokenize_one_level(const std::string &input_string) const
Tokenize the string input_string.
Definition: HfstTokenizer.cc:143
std::pair< float, StringVector > HfstOneLevelPath
A path of one level of arcs with collected weight.
Definition: HfstDataTypes.h:96
static HFSTDLL void check_utf8_correctness(const std::string &input_string)
If input_string is not valid UTF-8, throw an IncorrectUtf8CodingException.
Definition: HfstTokenizer.cc:362
A tokenizer for creating transducers from UTF-8 strings.
Definition: HfstTokenizer.h:84
Typedefs and functions for symbols, symbol pairs and sets of symbols.
std::vector< std::string > StringVector
A vector of strings.
Definition: HfstDataTypes.h:87