HFST - Helsinki Finite-State Transducer Technology - C++ API  version 3.9.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
pmatch.h
1 // Copyright (c) 2016 University of Helsinki
2 //
3 // This library is free software; you can redistribute it and/or
4 // modify it under the terms of the GNU Lesser General Public
5 // License as published by the Free Software Foundation; either
6 // version 3 of the License, or (at your option) any later version.
7 // See the file COPYING included with this distribution for more
8 // information.
9 #ifndef _HFST_OL_TRANSDUCER_PMATCH_H_
10 #define _HFST_OL_TRANSDUCER_PMATCH_H_
11 
12 #include <map>
13 #include <stack>
14 #include <sstream>
15 #include <algorithm>
16 #include <ctime>
17 #include "transducer.h"
18 
19 namespace hfst_ol {
20 
21  class PmatchTransducer;
22  class PmatchContainer;
23  struct Location;
24  class WeightedDoubleTape;
25 
26  const unsigned int PMATCH_MAX_RECURSION_DEPTH = 5000;
27 
28  typedef std::vector<PmatchTransducer *> RtnVector;
29  typedef std::map<std::string, SymbolNumber> RtnNameMap;
30  typedef std::vector<Location> LocationVector;
31  typedef std::vector<LocationVector> LocationVectorVector;
32  typedef std::vector<WeightedDoubleTape> WeightedDoubleTapeVector;
33 
34 
35  enum SpecialSymbol{entry,
36  exit,
37  LC_entry,
38  LC_exit,
39  RC_entry,
40  RC_exit,
41  NLC_entry,
42  NLC_exit,
43  NRC_entry,
44  NRC_exit,
45  Pmatch_passthrough,
46  boundary};
47 
48  struct SymbolPair
49  {
50  SymbolNumber input;
51  SymbolNumber output;
52  SymbolPair(void): input(0), output(0) {}
53  SymbolPair(SymbolNumber i, SymbolNumber o): input(i), output(o) {}
54  };
55 
56  class DoubleTape: public std::vector<SymbolPair>
57  {
58  public:
59 
60  void write(unsigned int pos, SymbolNumber in, SymbolNumber out)
61  {
62  while (pos >= this->size()) {
63  this->push_back(SymbolPair());
64  }
65  this->operator[](pos) = SymbolPair(in, out);
66  }
67 
68  DoubleTape extract_slice(unsigned int start, unsigned int stop)
69  {
70  DoubleTape retval;
71  while(start < stop) {
72  retval.push_back(this->at(start));
73  ++start;
74  }
75  return retval;
76  }
77 
78  };
79 
80  class PositionStack: public std::vector<unsigned int>
81  {
82  unsigned int tmp;
83  public:
84  void push(unsigned int val) { push_back(val); }
85  void pop(void) { tmp = back(); pop_back(); }
86  void unpop(void) { push_back(tmp); }
87  unsigned int top(void) { return back(); }
88  };
89 
90  class WeightedDoubleTape: public DoubleTape
91  {
92  public:
93  Weight weight;
94  WeightedDoubleTape(DoubleTape dt, Weight w): DoubleTape(dt), weight(w) {}
95  };
96 
97  class PmatchAlphabet: public TransducerAlphabet {
98  protected:
99  RtnVector rtns;
100  SymbolNumberVector special_symbols;
101  std::map<SymbolNumber, std::string> end_tag_map;
102  RtnNameMap rtn_names;
103 // For each symbol, either NO_SYMBOL for "no corresponding list" or an index into symbol_lists
104  SymbolNumberVector symbol2lists;
105 // For each a symbol, either NO_SYMBOL for "this is not a list" or an index into symbol_list_members
106  SymbolNumberVector list2symbols;
107  // For each entry referring to entries in the symbol table, indicate
108  // "this symbol is an exclusionary list", ie. symbols not in it
109  // will match
110  SymbolNumberVector exclusionary_lists;
111  std::vector<SymbolNumberVector> symbol_lists;
112  std::vector<SymbolNumberVector> symbol_list_members;
113  std::vector<unsigned long int> counters;
114  SymbolNumberVector guards;
115  std::vector<bool> printable_vector;
116  bool is_end_tag(const SymbolNumber symbol) const;
117  bool is_guard(const SymbolNumber symbol) const;
118  bool is_counter(const SymbolNumber symbol) const;
119  std::string end_tag(const SymbolNumber symbol);
120  std::string start_tag(const SymbolNumber symbol);
121  bool extract_tags;
122 
123  public:
124  PmatchAlphabet(std::istream& is, SymbolNumber symbol_count);
125  PmatchAlphabet(void);
126  ~PmatchAlphabet(void);
127  virtual void add_symbol(const std::string & symbol);
128  static bool is_end_tag(const std::string & symbol);
129  static bool is_insertion(const std::string & symbol);
130  static bool is_guard(const std::string & symbol);
131  static bool is_list(const std::string & symbol);
132  static bool is_counter(const std::string & symbol);
133  static bool is_special(const std::string & symbol);
134  static bool is_printable(const std::string & symbol);
135  static std::string name_from_insertion(
136  const std::string & symbol);
137  bool is_printable(SymbolNumber symbol);
138  void add_special_symbol(const std::string & str, SymbolNumber symbol_number);
139  void process_symbol_list(std::string str, SymbolNumber sym);
140  void process_counter(std::string str, SymbolNumber sym);
141  void count(SymbolNumber sym);
142  void add_rtn(PmatchTransducer * rtn, std::string const & name);
143  bool has_rtn(std::string const & name) const;
144  bool has_rtn(SymbolNumber symbol) const;
145  PmatchTransducer * get_rtn(SymbolNumber symbol);
146  std::string get_counter_name(SymbolNumber symbol);
147  SymbolNumber get_special(SpecialSymbol special) const;
148  SymbolNumberVector get_specials(void) const;
149  std::string stringify(const DoubleTape & str);
150  Location locatefy(unsigned int input_offset,
151  const WeightedDoubleTape & str);
152 
153  friend class PmatchTransducer;
154  friend class PmatchContainer;
155  };
156 
157  class PmatchContainer
158  {
159  protected:
160  PmatchAlphabet alphabet;
161  Encoder * encoder;
162  SymbolNumber orig_symbol_count;
163  SymbolNumber symbol_count;
164  PmatchTransducer * toplevel;
165  size_t io_size;
166  SymbolNumberVector input;
167  PositionStack entry_stack;
168  DoubleTape tape;
169  DoubleTape output;
170  LocationVectorVector locations;
171  std::vector<char> possible_first_symbols;
172  bool verbose;
173  bool locate_mode;
174  bool profile_mode;
175  bool single_codepoint_tokenization;
176  unsigned int recursion_depth_left;
177  // An optional time limit for operations
178  double max_time;
179  // When we started work
180  clock_t start_clock;
181  // A counter to avoid checking the clock too often
182  unsigned long call_counter;
183  // A flag to set for when time has been overstepped
184  bool limit_reached;
185 
186  public:
187 
188  PmatchContainer(std::istream & is);
189  PmatchContainer(void);
190  ~PmatchContainer(void);
191 
192  unsigned long line_number;
193 
194  void initialize_input(const char * input);
195  bool has_unsatisfied_rtns(void) const;
196  std::string get_unsatisfied_rtn_name(void) const;
197  void process(std::string & input);
198  std::string match(std::string & input,
199  double time_cutoff = 0.0);
200  LocationVectorVector locate(std::string & input,
201  double time_cutoff = 0.0);
202  std::string get_profiling_info(void);
203  bool has_queued_input(unsigned int input_pos);
204  bool not_possible_first_symbol(SymbolNumber sym)
205  {
206  if (possible_first_symbols.size() == 0) {
207  return false;
208  }
209  return sym >= possible_first_symbols.size() ||
210  possible_first_symbols[sym] == 0;
211  }
212  void copy_to_output(const DoubleTape & best_result);
213  void copy_to_output(SymbolNumber input, SymbolNumber output);
214  std::string stringify_output(void);
215 // LocationVector locatefy_output(void);
216  static std::string parse_name_from_hfst3_header(std::istream & f);
217  void set_verbose(bool b) { verbose = b; }
218  void set_extract_tags_mode(bool b)
219  { alphabet.extract_tags = b; }
220  void set_single_codepoint_tokenization(bool b)
221  { single_codepoint_tokenization = b; }
222  void set_profile(bool b) { profile_mode = b; }
223  bool try_recurse(void)
224  {
225  if (recursion_depth_left > 0) {
226  --recursion_depth_left;
227  return true;
228  } else {
229  return false;
230  }
231  }
232  void unrecurse(void) { ++recursion_depth_left; }
233  void reset_recursion(void) { recursion_depth_left = PMATCH_MAX_RECURSION_DEPTH; }
234 
235  friend class PmatchTransducer;
236  };
237 
238  struct Location
239  {
240  unsigned int start;
241  unsigned int length;
242  std::string input;
243  std::string output;
244  std::string tag;
245  Weight weight;
246 
247  bool operator<(Location rhs) const
248  { return this->weight < rhs.weight; }
249  };
250 
251  struct ContextMatchedTrap
252  {
253  bool polarity;
254  ContextMatchedTrap(bool p): polarity(p) {}
255  };
256 
257  class PmatchTransducer
258  {
259  protected:
260  enum ContextChecking{none, LC, NLC, RC, NRC};
261 
262 // Transducers have static data, ie. tables for describing the states and
263 // transitions, and dynamic data, which is altered during lookup.
264 // In pmatch several instances of the same transducer may be operating
265 // in a stack, so this dynamic data is put in a class of its own.
266  struct LocalVariables
267  {
268  hfst::FdState<SymbolNumber> flag_state;
269  char tape_step;
270  unsigned int context_placeholder;
271  ContextChecking context;
272  bool default_symbol_trap;
273  bool negative_context_success;
274  bool pending_passthrough;
275  Weight running_weight;
276  };
277 
278  struct RtnVariables
279  {
280  unsigned int candidate_input_pos;
281  unsigned int candidate_tape_pos;
282  unsigned int input_tape_entry;
283  unsigned int tape_entry;
284  DoubleTape best_result;
285  Weight best_weight;
286  bool candidate_found;
287  };
288 
289  std::stack<LocalVariables> local_stack;
290  std::stack<RtnVariables> rtn_stack;
291 
292  std::vector<TransitionW> transition_table;
293  std::vector<TransitionWIndex> index_table;
294 
295  PmatchAlphabet & alphabet;
296  SymbolNumber orig_symbol_count;
297  PmatchContainer * container;
298  WeightedDoubleTapeVector * locations;
299 
300  bool is_final(TransitionTableIndex i)
301  {
302  if (indexes_transition_table(i)) {
303  return transition_table[i - TRANSITION_TARGET_TABLE_START].final();
304  } else {
305  return index_table[i].final();
306  }
307  }
308 
309  bool get_weight(TransitionTableIndex i)
310  {
311  if (indexes_transition_table(i)) {
312  return transition_table[i - TRANSITION_TARGET_TABLE_START].get_weight();
313  } else {
314  return index_table[i].final_weight();
315  }
316  }
317 
318  TransitionTableIndex make_transition_table_index(
319  TransitionTableIndex i, SymbolNumber input) {
320  if (indexes_transition_table(i)) {
321  return i - TRANSITION_TARGET_TABLE_START;
322  } else {
323  if (index_table[i + input].get_input_symbol() == input) {
324  return index_table[i + input].get_target() - TRANSITION_TARGET_TABLE_START;
325  } else {
326  return TRANSITION_TARGET_TABLE_START;
327  }
328  }
329  }
330 
331  // The mutually recursive lookup-handling functions
332 
333  void take_epsilons(unsigned int input_pos,
334  unsigned int tape_pos,
335  TransitionTableIndex i);
336 
337  void check_context(unsigned int input_pos,
338  unsigned int tape_pos,
339  TransitionTableIndex i);
340 
341  void take_rtn(SymbolNumber input,
342  unsigned int input_pos,
343  unsigned int tape_pos,
344  TransitionTableIndex i);
345 
346  void take_flag(SymbolNumber input,
347  unsigned int input_pos,
348  unsigned int tape_pos,
349  TransitionTableIndex i);
350 
351  void take_transitions(SymbolNumber input,
352  unsigned int input_pos,
353  unsigned int tape_pos,
354  TransitionTableIndex i);
355 
356  void get_analyses(unsigned int input_pos,
357  unsigned int tape_pos,
358  TransitionTableIndex index);
359 
360  bool checking_context(void) const;
361  bool try_entering_context(SymbolNumber symbol);
362  bool try_exiting_context(SymbolNumber symbol);
363  void exit_context(void);
364 
365  void collect_first_epsilon(TransitionTableIndex i,
366  SymbolNumberVector const& input_symbols,
367  std::set<TransitionTableIndex> & seen_indices);
368  void collect_first_epsilon_index(TransitionTableIndex i,
369  SymbolNumberVector const& input_symbols,
370  std::set<TransitionTableIndex> & seen_indices);
371  void collect_first_transition(TransitionTableIndex i,
372  SymbolNumberVector const& input_symbols,
373  std::set<TransitionTableIndex> & seen_indices);
374  void collect_first_index(TransitionTableIndex i,
375  SymbolNumberVector const& input_symbols,
376  std::set<TransitionTableIndex> & seen_indices);
377  void collect_first(TransitionTableIndex i,
378  SymbolNumberVector const& input_symbols,
379  std::set<TransitionTableIndex> & seen_indices);
380 
381 
382  public:
383  PmatchTransducer(std::istream& is,
384  TransitionTableIndex index_table_size,
385  TransitionTableIndex transition_table_size,
386  PmatchAlphabet & alphabet,
387  PmatchContainer * container);
388 
389  std::set<SymbolNumber> possible_first_symbols;
390 
391  bool final_index(TransitionTableIndex i) const
392  {
393  if (indexes_transition_table(i)) {
394  return transition_table[i].final();
395  } else {
396  return index_table[i].final();
397  }
398  }
399 
400  static bool indexes_transition_table(TransitionTableIndex i)
401  { return i >= TRANSITION_TARGET_TABLE_START; }
402 
403  static bool is_good(TransitionTableIndex i)
404  { return i < TRANSITION_TARGET_TABLE_START; }
405 
406  const DoubleTape & get_best_result(void) const
407  { return rtn_stack.top().best_result; }
408  unsigned int get_candidate_input_pos(void) const
409  { return rtn_stack.top().candidate_input_pos; }
410  Weight get_best_weight(void) const
411  { return rtn_stack.top().best_weight; }
412 
413  void match(unsigned int & input_pos, unsigned int & tape_pos);
414  void rtn_call(unsigned int & input_pos, unsigned int & tape_pos);
415  void rtn_exit(void);
416  void note_analysis(unsigned int input_pos, unsigned int tape_pos);
417  void grab_location(unsigned int input_pos, unsigned int tape_pos);
418  void collect_possible_first_symbols(void);
419 
420  friend class PmatchContainer;
421  };
422 
423 }
424 
425 #endif //_HFST_OL_TRANSDUCER_PMATCH_H_