HFST - Helsinki Finite-State Transducer Technology - C++ API  version 3.9.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
HfstStrings2FstTokenizer.h
1 #ifndef HEADER_STRINGS_2_FST_TOKENIZER_H
2 #define HEADER_STRINGS_2_FST_TOKENIZER_H
3 
4 // Copyright (c) 2016 University of Helsinki
5 //
6 // This library is free software; you can redistribute it and/or
7 // modify it under the terms of the GNU Lesser General Public
8 // License as published by the Free Software Foundation; either
9 // version 3 of the License, or (at your option) any later version.
10 // See the file COPYING included with this distribution for more
11 // information.
12 
13 #include <vector>
14 #include <string>
15 #include <utility>
16 
17 #ifdef HAVE_CONFIG_H
18 # include <config.h>
19 #endif
20 
21 #include "HfstDataTypes.h"
22 #include "HfstTokenizer.h"
23 
// Convenience aliases for the standard containers used in the tokenizer
// interface declared below.
// A vector of strings.
24 typedef std::vector<std::string> StringVector;
// A pair of strings.
25 typedef std::pair<std::string,std::string> StringPair;
// A vector of string pairs.
26 typedef std::vector<StringPair> StringPairVector;
27 
// String-literal constants: colon, a single backslash, a single space, two
// "@_..._@"-style symbol names and the empty string.
// NOTE(review): these are unprefixed preprocessor macros defined in a header,
// so they are visible in every file that includes it.
// NOTE(review): presumably BACKSLASH_ESC stands in for a quoted backslash and
// EPSILON_SYMBOL is the epsilon symbol name - confirm against the
// implementation file.
28 #define COL ":"
29 #define BACKSLASH "\\"
30 #define SPACE " "
31 #define BACKSLASH_ESC "@_BACKSLASH_@"
32 #define EPSILON_SYMBOL "@_EPSILON_SYMBOL_@"
33 #define EMPTY ""
34 
// Character-literal constants for the colon and the backslash.
35 #define COL_CHAR ':'
36 #define BACKSLASH_CHAR '\\'
37 
// "@_..._@"-style symbol names for colon, tab and space.
// NOTE(review): presumably these replace the literal characters inside
// symbols - confirm where they are substituted.
38 #define COL_ESCAPE "@_COLON_@"
39 #define TAB_ESCAPE "@_TAB_@"
40 #define SPACE_ESCAPE "@_SPACE_@"
41 
42 namespace hfst {
43 
// Empty tag class with no members.
// NOTE(review): presumably thrown as an exception when an empty multichar
// symbol is encountered - no throw site is visible in this header; confirm.
44 class EmptyMulticharSymbol
45 {};
46 
// Empty tag class with no members.
// NOTE(review): presumably thrown as an exception when a string contains
// unescaped ':' characters - no throw site is visible in this header; confirm.
47 class UnescapedColsFound
48 {};
49 
// Tokenizer that turns strings into StringPairVector values through the two
// public tokenize_* member functions.
// NOTE(review): only declarations are visible in this header. Behaviour notes
// below come from the existing comments or are marked as assumptions.
50 class HfstStrings2FstTokenizer
51 {
52  public:
    // Construct from a list of multichar symbols and an epsilon symbol string.
    // NOTE(review): multichar_symbols is a non-const reference - confirm
    // whether the constructor modifies it.
53  HfstStrings2FstTokenizer
54  (StringVector &multichar_symbols,const std::string &eps);
55 
    // Tokenize str and return the result as a vector of symbol pairs.
    // NOTE(review): presumably str is one pair string in which ':' separates
    // the input and output symbol of a pair, and spaces selects whether
    // symbols are separated by spaces - confirm in the implementation.
59  StringPairVector tokenize_pair_string(const std::string &str,bool spaces);
60 
    // Tokenize str and return the result as a vector of symbol pairs.
    // NOTE(review): presumably str holds an input string and an output string
    // separated by an unquoted ':', and spaces selects whether symbols are
    // separated by spaces - confirm in the implementation.
64  StringPairVector tokenize_string_pair(const std::string &str,bool spaces);
65 
66  protected:
67 
    // The HFST tokenizer that the multichar symbols are added to.
68  hfst::HfstTokenizer tokenizer;
    // NOTE(review): presumably the epsilon symbol passed to the constructor -
    // confirm.
69  std::string eps;
70 
71  // Add the multichar symbol XYZ to the tokenizer.
72  void add_multichar_symbol(const std::string &multichar_symbol);
73 
74  // For multichar symbol XYZ, add the multichar symbol \X to tokenizer.
75  void add_multichar_symbol_head(const std::string &multichar_symbol);
76 
77  // Transform v into a StringPairVector by treating symbols separated by a
78  // ":" symbol as the input and output symbols of a pair and treating other
79  // symbols as identity pairs. Treat initial and final colons as eps.
80  StringPairVector make_pair_vector(const StringVector &v);
81 
82  // Make a pair string out of input and output by pairing symbols at the
83  // same indices. Pad with zeroes at the end when necessary.
    // NOTE(review): "zeroes" presumably means the epsilon symbol - confirm.
84  StringPairVector make_pair_vector(const StringVector &input,
85  const StringVector &output);
86 
87  // Return true if *it is followed by ':' and another symbol.
88  bool is_pair_input_symbol(StringVector::const_iterator it,
89  StringVector::const_iterator end);
90 
91  // Remove backslashes, except quoted backslashes '\\'.
92  std::string unescape(std::string symbol);
93 
94  // Return the position of the first unquoted ':'. Return -1 if not found.
95  int get_col_pos(const std::string &str);
96 
    // NOTE(review): presumably splits str into symbols at space characters -
    // confirm how escaped spaces are handled.
98  StringVector split_at_spaces(const std::string &str);
99 
    // NOTE(review): presumably checks the ':' characters in symbol and signals
    // UnescapedColsFound when an unescaped one is present - confirm.
101  void check_cols(const std::string &symbol);
102 };
103 
104 } // namespace hfst
105 
106 #endif // HEADER_STRINGS_2_FST_TOKENIZER_H
Declaration of class hfst::HfstTokenizer.
std::pair< String, String > StringPair
A symbol pair in a transition.
Definition: HfstSymbolDefs.h:70
std::vector< std::pair< std::string, std::string > > StringPairVector
A vector of string pairs.
Definition: HfstDataTypes.h:105
Datatypes that are needed when using the HFST API.
A tokenizer for creating transducers from UTF-8 strings.
Definition: HfstTokenizer.h:84
std::vector< std::string > StringVector
A vector of strings.
Definition: HfstDataTypes.h:87