9 #ifndef _HFST_OL_TRANSDUCER_PMATCH_H_
10 #define _HFST_OL_TRANSDUCER_PMATCH_H_
17 #include "transducer.h"
21 class PmatchTransducer;
22 class PmatchContainer;
24 class WeightedDoubleTape;
26 const unsigned int PMATCH_MAX_RECURSION_DEPTH = 5000;
28 typedef std::vector<PmatchTransducer *> RtnVector;
29 typedef std::map<std::string, SymbolNumber> RtnNameMap;
30 typedef std::vector<Location> LocationVector;
31 typedef std::vector<LocationVector> LocationVectorVector;
32 typedef std::vector<WeightedDoubleTape> WeightedDoubleTapeVector;
35 enum SpecialSymbol{entry,
52 SymbolPair(
void): input(0), output(0) {}
53 SymbolPair(SymbolNumber i, SymbolNumber o): input(i), output(o) {}
56 class DoubleTape:
public std::vector<SymbolPair>
60 void write(
unsigned int pos, SymbolNumber in, SymbolNumber out)
62 while (pos >= this->size()) {
63 this->push_back(SymbolPair());
65 this->operator[](pos) = SymbolPair(in, out);
68 DoubleTape extract_slice(
unsigned int start,
unsigned int stop)
72 retval.push_back(this->at(start));
80 class PositionStack:
public std::vector<unsigned int>
84 void push(
unsigned int val) { push_back(val); }
85 void pop(
void) { tmp = back(); pop_back(); }
86 void unpop(
void) { push_back(tmp); }
87 unsigned int top(
void) {
return back(); }
90 class WeightedDoubleTape:
public DoubleTape
94 WeightedDoubleTape(DoubleTape dt, Weight w): DoubleTape(dt), weight(w) {}
97 class PmatchAlphabet:
public TransducerAlphabet {
100 SymbolNumberVector special_symbols;
101 std::map<SymbolNumber, std::string> end_tag_map;
102 RtnNameMap rtn_names;
104 SymbolNumberVector symbol2lists;
106 SymbolNumberVector list2symbols;
110 SymbolNumberVector exclusionary_lists;
111 std::vector<SymbolNumberVector> symbol_lists;
112 std::vector<SymbolNumberVector> symbol_list_members;
113 std::vector<unsigned long int> counters;
114 SymbolNumberVector guards;
115 std::vector<bool> printable_vector;
116 bool is_end_tag(
const SymbolNumber symbol)
const;
117 bool is_guard(
const SymbolNumber symbol)
const;
118 bool is_counter(
const SymbolNumber symbol)
const;
119 std::string end_tag(
const SymbolNumber symbol);
120 std::string start_tag(
const SymbolNumber symbol);
124 PmatchAlphabet(std::istream& is, SymbolNumber symbol_count);
125 PmatchAlphabet(
void);
126 ~PmatchAlphabet(
void);
127 virtual void add_symbol(
const std::string & symbol);
128 static bool is_end_tag(
const std::string & symbol);
129 static bool is_insertion(
const std::string & symbol);
130 static bool is_guard(
const std::string & symbol);
131 static bool is_list(
const std::string & symbol);
132 static bool is_counter(
const std::string & symbol);
133 static bool is_special(
const std::string & symbol);
134 static bool is_printable(
const std::string & symbol);
135 static std::string name_from_insertion(
136 const std::string & symbol);
137 bool is_printable(SymbolNumber symbol);
138 void add_special_symbol(
const std::string & str, SymbolNumber symbol_number);
139 void process_symbol_list(std::string str, SymbolNumber sym);
140 void process_counter(std::string str, SymbolNumber sym);
141 void count(SymbolNumber sym);
142 void add_rtn(PmatchTransducer * rtn, std::string
const & name);
143 bool has_rtn(std::string
const & name)
const;
144 bool has_rtn(SymbolNumber symbol)
const;
145 PmatchTransducer * get_rtn(SymbolNumber symbol);
146 std::string get_counter_name(SymbolNumber symbol);
147 SymbolNumber get_special(SpecialSymbol special)
const;
148 SymbolNumberVector get_specials(
void)
const;
149 std::string stringify(
const DoubleTape & str);
150 Location locatefy(
unsigned int input_offset,
151 const WeightedDoubleTape & str);
153 friend class PmatchTransducer;
154 friend class PmatchContainer;
157 class PmatchContainer
160 PmatchAlphabet alphabet;
162 SymbolNumber orig_symbol_count;
163 SymbolNumber symbol_count;
164 PmatchTransducer * toplevel;
166 SymbolNumberVector input;
167 PositionStack entry_stack;
170 LocationVectorVector locations;
171 std::vector<char> possible_first_symbols;
175 bool single_codepoint_tokenization;
176 unsigned int recursion_depth_left;
182 unsigned long call_counter;
188 PmatchContainer(std::istream & is);
189 PmatchContainer(
void);
190 ~PmatchContainer(
void);
192 unsigned long line_number;
194 void initialize_input(
const char * input);
195 bool has_unsatisfied_rtns(
void)
const;
196 std::string get_unsatisfied_rtn_name(
void)
const;
197 void process(std::string & input);
198 std::string match(std::string & input,
199 double time_cutoff = 0.0);
200 LocationVectorVector locate(std::string & input,
201 double time_cutoff = 0.0);
202 std::string get_profiling_info(
void);
203 bool has_queued_input(
unsigned int input_pos);
204 bool not_possible_first_symbol(SymbolNumber sym)
206 if (possible_first_symbols.size() == 0) {
209 return sym >= possible_first_symbols.size() ||
210 possible_first_symbols[sym] == 0;
212 void copy_to_output(
const DoubleTape & best_result);
213 void copy_to_output(SymbolNumber input, SymbolNumber output);
214 std::string stringify_output(
void);
216 static std::string parse_name_from_hfst3_header(std::istream & f);
217 void set_verbose(
bool b) { verbose = b; }
218 void set_extract_tags_mode(
bool b)
219 { alphabet.extract_tags = b; }
220 void set_single_codepoint_tokenization(
bool b)
221 { single_codepoint_tokenization = b; }
222 void set_profile(
bool b) { profile_mode = b; }
223 bool try_recurse(
void)
225 if (recursion_depth_left > 0) {
226 --recursion_depth_left;
232 void unrecurse(
void) { ++recursion_depth_left; }
233 void reset_recursion(
void) { recursion_depth_left = PMATCH_MAX_RECURSION_DEPTH; }
235 friend class PmatchTransducer;
247 bool operator<(Location rhs)
const
248 {
return this->weight < rhs.weight; }
251 struct ContextMatchedTrap
254 ContextMatchedTrap(
bool p): polarity(p) {}
257 class PmatchTransducer
260 enum ContextChecking{none, LC, NLC, RC, NRC};
266 struct LocalVariables
268 hfst::FdState<SymbolNumber> flag_state;
270 unsigned int context_placeholder;
271 ContextChecking context;
272 bool default_symbol_trap;
273 bool negative_context_success;
274 bool pending_passthrough;
275 Weight running_weight;
280 unsigned int candidate_input_pos;
281 unsigned int candidate_tape_pos;
282 unsigned int input_tape_entry;
283 unsigned int tape_entry;
284 DoubleTape best_result;
286 bool candidate_found;
289 std::stack<LocalVariables> local_stack;
290 std::stack<RtnVariables> rtn_stack;
292 std::vector<TransitionW> transition_table;
293 std::vector<TransitionWIndex> index_table;
295 PmatchAlphabet & alphabet;
296 SymbolNumber orig_symbol_count;
297 PmatchContainer * container;
298 WeightedDoubleTapeVector * locations;
300 bool is_final(TransitionTableIndex i)
302 if (indexes_transition_table(i)) {
303 return transition_table[i - TRANSITION_TARGET_TABLE_START].final();
305 return index_table[i].final();
309 bool get_weight(TransitionTableIndex i)
311 if (indexes_transition_table(i)) {
312 return transition_table[i - TRANSITION_TARGET_TABLE_START].get_weight();
314 return index_table[i].final_weight();
318 TransitionTableIndex make_transition_table_index(
319 TransitionTableIndex i, SymbolNumber input) {
320 if (indexes_transition_table(i)) {
321 return i - TRANSITION_TARGET_TABLE_START;
323 if (index_table[i + input].get_input_symbol() == input) {
324 return index_table[i + input].get_target() - TRANSITION_TARGET_TABLE_START;
326 return TRANSITION_TARGET_TABLE_START;
333 void take_epsilons(
unsigned int input_pos,
334 unsigned int tape_pos,
335 TransitionTableIndex i);
337 void check_context(
unsigned int input_pos,
338 unsigned int tape_pos,
339 TransitionTableIndex i);
341 void take_rtn(SymbolNumber input,
342 unsigned int input_pos,
343 unsigned int tape_pos,
344 TransitionTableIndex i);
346 void take_flag(SymbolNumber input,
347 unsigned int input_pos,
348 unsigned int tape_pos,
349 TransitionTableIndex i);
351 void take_transitions(SymbolNumber input,
352 unsigned int input_pos,
353 unsigned int tape_pos,
354 TransitionTableIndex i);
356 void get_analyses(
unsigned int input_pos,
357 unsigned int tape_pos,
358 TransitionTableIndex index);
360 bool checking_context(
void)
const;
361 bool try_entering_context(SymbolNumber symbol);
362 bool try_exiting_context(SymbolNumber symbol);
363 void exit_context(
void);
365 void collect_first_epsilon(TransitionTableIndex i,
366 SymbolNumberVector
const& input_symbols,
367 std::set<TransitionTableIndex> & seen_indices);
368 void collect_first_epsilon_index(TransitionTableIndex i,
369 SymbolNumberVector
const& input_symbols,
370 std::set<TransitionTableIndex> & seen_indices);
371 void collect_first_transition(TransitionTableIndex i,
372 SymbolNumberVector
const& input_symbols,
373 std::set<TransitionTableIndex> & seen_indices);
374 void collect_first_index(TransitionTableIndex i,
375 SymbolNumberVector
const& input_symbols,
376 std::set<TransitionTableIndex> & seen_indices);
377 void collect_first(TransitionTableIndex i,
378 SymbolNumberVector
const& input_symbols,
379 std::set<TransitionTableIndex> & seen_indices);
383 PmatchTransducer(std::istream& is,
384 TransitionTableIndex index_table_size,
385 TransitionTableIndex transition_table_size,
386 PmatchAlphabet & alphabet,
387 PmatchContainer * container);
389 std::set<SymbolNumber> possible_first_symbols;
391 bool final_index(TransitionTableIndex i)
const
393 if (indexes_transition_table(i)) {
394 return transition_table[i].final();
396 return index_table[i].final();
400 static bool indexes_transition_table(TransitionTableIndex i)
401 {
return i >= TRANSITION_TARGET_TABLE_START; }
403 static bool is_good(TransitionTableIndex i)
404 {
return i < TRANSITION_TARGET_TABLE_START; }
406 const DoubleTape & get_best_result(
void)
const
407 {
return rtn_stack.top().best_result; }
408 unsigned int get_candidate_input_pos(
void)
const
409 {
return rtn_stack.top().candidate_input_pos; }
410 Weight get_best_weight(
void)
const
411 {
return rtn_stack.top().best_weight; }
413 void match(
unsigned int & input_pos,
unsigned int & tape_pos);
414 void rtn_call(
unsigned int & input_pos,
unsigned int & tape_pos);
416 void note_analysis(
unsigned int input_pos,
unsigned int tape_pos);
417 void grab_location(
unsigned int input_pos,
unsigned int tape_pos);
418 void collect_possible_first_symbols(
void);
420 friend class PmatchContainer;
425 #endif //_HFST_OL_TRANSDUCER_PMATCH_H_