HFST - Helsinki Finite-State Transducer Technology - C++ API  version 3.9.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
convert.h
1 // Copyright (c) 2016 University of Helsinki
2 //
3 // This library is free software; you can redistribute it and/or
4 // modify it under the terms of the GNU Lesser General Public
5 // License as published by the Free Software Foundation; either
6 // version 3 of the License, or (at your option) any later version.
7 // See the file COPYING included with this distribution for more
8 // information.
9 
10 #ifndef _HFST_OL_CONVERT_H_
11 #define _HFST_OL_CONVERT_H_
12 
13 #if HAVE_OPENFST
14 #include "fst/fstlib.h"
15 #endif // HAVE_OPENFST
16 
17 #include "transducer.h"
18 #include "pmatch.h"
19 
20 namespace hfst_ol {
21 
22  typedef std::map<hfst_ol::TransitionTableIndex,unsigned int>
23  HfstOlToBasicStateMap;
24 
25 struct TransitionPlaceholder {
26  unsigned int target;
27  SymbolNumber input;
28  SymbolNumber output;
29  float weight;
30 
31  TransitionPlaceholder(unsigned int t, SymbolNumber i, SymbolNumber o, float w):
32  target(t),
33  input(i),
34  output(o),
35  weight(w)
36  {}
37 };
38 
39 //typedef std::map<SymbolNumber, std::vector<TransitionPlaceholder> >
40 // SymbolTransitionsMap;
41 
42 
43 struct StatePlaceholder {
44  enum indexing_type {empty, simple_zero_index, simple_nonzero_index, nonsimple};
45 
46  unsigned int state_number;
47  unsigned int start_index;
48  unsigned int first_transition;
49  std::vector<unsigned int> symbol_to_transition_placeholder_v;
50  std::vector<std::vector<TransitionPlaceholder> > transition_placeholders;
51  indexing_type type;
52  SymbolNumber inputs;
53  bool final;
54  float final_weight;
55  StatePlaceholder (unsigned int state, bool finality, unsigned int first,
56  Weight final_weight):
57  state_number(state),
58  start_index(UINT_MAX),
59  first_transition(first),
60  final(finality),
61  final_weight(final_weight),
62  type(state == 0 ? nonsimple: empty),
63  inputs(0)
64  { }
65  StatePlaceholder ():
66  state_number(UINT_MAX),
67  start_index(UINT_MAX),
68  first_transition(UINT_MAX),
69  final(false),
70  final_weight(0.0),
71  type(empty),
72  inputs(0)
73  { }
74 
75  bool is_simple(void) const
76  {
77  return type != nonsimple;
78  }
79 
80  unsigned int number_of_transitions(void) const {
81  unsigned int count = 0;
82  for(std::vector<std::vector<TransitionPlaceholder> >::const_iterator it
83  = transition_placeholders.begin();
84  it != transition_placeholders.end(); ++it) {
85  count += it->size();
86  }
87  return count;
88  }
89 
90  bool input_present(SymbolNumber input) const {
91  return input < symbol_to_transition_placeholder_v.size() &&
92  symbol_to_transition_placeholder_v[input] != UINT_MAX;
93  }
94 
95  void add_input(SymbolNumber input, std::set<SymbolNumber> const & flag_symbols)
96  {
97  if (input_present(input)) {
98  return;
99  }
100  while (symbol_to_transition_placeholder_v.size() <= input) {
101  symbol_to_transition_placeholder_v.push_back(UINT_MAX);
102  }
103  symbol_to_transition_placeholder_v[input] = transition_placeholders.size();
104  transition_placeholders.push_back(std::vector<TransitionPlaceholder>());
105  ++inputs;
106  if (type != nonsimple) {
107  // Depending on what type of inputs we now have, adjust the index type.
108  // Epsilons and flags both index to 0. If we have only one input symbol,
109  // we're simple.
110  if (type == empty) {
111  if (input == 0 || flag_symbols.count(input) == 1) {
112  type = simple_zero_index;
113  } else {
114  type = simple_nonzero_index;
115  }
116  } else if (type == simple_zero_index) {
117  if (input != 0 && flag_symbols.count(input) == 0) {
118  type = nonsimple;
119  }
120  } else { // simple_nonzero_index
121  if (inputs > 1 || input == 0 || flag_symbols.count(input) == 1) {
122  type = nonsimple;
123  }
124  }
125  }
126  }
127 
128  SymbolNumber get_largest_index(void)
129  {
130  return transition_placeholders[symbol_to_transition_placeholder_v.back()][0].input;
131  }
132 
133  void add_transition(TransitionPlaceholder & trans)
134  {
135  transition_placeholders[symbol_to_transition_placeholder_v[trans.input]].push_back(trans);
136  }
137 
138  std::vector<TransitionPlaceholder> & get_transition_placeholders(SymbolNumber input)
139  {
140  return transition_placeholders[symbol_to_transition_placeholder_v[input]];
141  }
142 
143  unsigned int symbol_offset(
144  SymbolNumber const symbol,
145  std::set<SymbolNumber> const & flag_symbols) {
146  if (symbol == 0) {
147  return 0;
148  }
149  unsigned int offset = 0;
150  // if (flag_symbols.size() == 0) {
151  // for(int i = 0; i < symbol_to_transition_placeholder_v.size(); ++i) {
152  // if (symbol_to_transition_placeholder_v[i] != UINT_MAX) {
153  // if (symbol == i) {
154  // return offset;
155  // }
156  // offset += get_transition_placeholders(i).size();
157  // }
158  // }
159  // } else {
160  if (input_present(0)) { // if there are epsilons
161  offset = get_transition_placeholders(0).size();
162  }
163  for(std::set<SymbolNumber>::iterator flag_it = flag_symbols.begin();
164  flag_it != flag_symbols.end(); ++flag_it) {
165  if (input_present(*flag_it)) {
166  if (symbol == *flag_it) {
167  // Flags go to 0 (even if there's no epsilon)
168  return 0;
169  }
170  offset += get_transition_placeholders(*flag_it).size();
171  }
172  }
173  for(unsigned int i = 1; i < symbol_to_transition_placeholder_v.size(); ++i) {
174  if (input_present(i)) {
175  if (flag_symbols.count(i) != 0) {
176  // already counted
177  continue;
178  }
179  if (symbol == i) {
180  return offset;
181  }
182  offset += get_transition_placeholders(i).size();
183  }
184  }
185  std::string message("error in conversion between optimized lookup "
186  "format and HfstTransducer;\ntried to calculate "
187  "symbol_offset for symbol not present in state");
190  message);
191  }
192 };
193 
194 bool compare_states_by_input_size(
195  const StatePlaceholder & lhs, const StatePlaceholder & rhs);
196 bool compare_states_by_state_number(
197  const StatePlaceholder & lhs, const StatePlaceholder & rhs);
198 
199 struct IndexPlaceholders
200 {
201  std::vector<unsigned int> indices;
202  std::vector<std::pair<unsigned int, SymbolNumber> > targets;
203 
204  bool used(unsigned int const position) const
205  {
206  return position < indices.size() && indices[position] != NO_TABLE_INDEX;
207  }
208 
209  void assign(unsigned int const position, unsigned int target, SymbolNumber sym)
210  {
211  while (position >= indices.size()) {
212  indices.push_back(NO_TABLE_INDEX);
213  }
214  indices[position] = targets.size();
215  targets.push_back(std::pair<unsigned int, SymbolNumber>(target, sym));
216  }
217 
218  std::pair<unsigned int, SymbolNumber> get_target(unsigned int index)
219  {
220  return targets[indices[index]];
221  }
222 
223  bool fits(StatePlaceholder const & state,
224  std::set<SymbolNumber> const & flag_symbols,
225  unsigned int const position) const
226  {
227  if (used(position)) {
228  return false;
229  }
230  for (std::vector<std::vector<TransitionPlaceholder> >::const_iterator it = state.transition_placeholders.begin();
231  it != state.transition_placeholders.end(); ++it) {
232  SymbolNumber index_offset = it->at(0).input;
233  if (flag_symbols.count(index_offset) != 0) {
234  index_offset = 0;
235  }
236  if (used(index_offset + position + 1)) {
237  return false;
238  }
239  }
240  return true;
241  }
242 
243  bool unsuitable(unsigned int const index,
244  SymbolNumber const symbols,
245  float const packing_aggression) const
246  {
247  if (used(index)) {
248  return true;
249  }
250 
251  // "Perfect packing" (under this strategy)
252 /* for (unsigned int i = 0; i < symbols; ++i) {
253 
254  if (count(index + i) == 0) {
255  return true;
256  }
257  return false;
258  }*/
259 
260  unsigned int filled = 0;
261  for (unsigned int i = 0; i < symbols; ++i) {
262  filled += used(index + i + 1);
263  if (filled >= (packing_aggression*symbols)) {
264  return true; // too full
265  }
266  }
267  return false;
268  }
269 };
270 
271 
272 void write_transitions_from_state_placeholders(
273  TransducerTable<TransitionW> & transition_table,
274  std::vector<hfst_ol::StatePlaceholder>
275  & state_placeholders,
276  std::set<SymbolNumber> & flag_symbols);
277 
278 void add_transitions_with(SymbolNumber symbol,
279  std::vector<TransitionPlaceholder> & transitions,
280  TransducerTable<TransitionW> & transition_table,
281  std::vector<hfst_ol::StatePlaceholder>
282  & state_placeholders,
283  std::set<SymbolNumber> & flag_symbols);
284 
285 #if HAVE_OPENFST // Covers remainder of file
286 typedef fst::StdArc::StateId StateId;
287 typedef fst::StdArc StdArc;
288 typedef fst::StdVectorFst TransduceR;
289 typedef fst::ArcIterator<TransduceR> ArcIterator;
290 typedef std::set<StateId> StateIdSet;
291 typedef std::set<int64> OfstSymbolSet;
292 
293 const StateIdNumber NO_ID_NUMBER = UINT_MAX;
294 const StateId NO_STATE_ID = UINT_MAX;
295 const SymbolNumber BIG_STATE_LIMIT = 1;
296 
297 
298 struct transition_label
299 {
300  int64 input_symbol;
301  int64 output_symbol;
302 };
303 
304 struct compare_transition_labels
305 {
306  bool operator() ( const transition_label &l1,
307  const transition_label &l2) const
308  {
309  if (l1.input_symbol == l2.input_symbol)
310  return l1.output_symbol < l2.output_symbol;
311  return l1.input_symbol < l2.input_symbol;
312  }
313 };
314 
315 typedef std::set<transition_label,compare_transition_labels> LabelSet;
316 
317 enum place_holder {EMPTY, EMPTY_START, OCCUPIED_START, OCCUPIED };
318 typedef std::vector<place_holder> PlaceHolderVector;
319 
320 /*
321  A class which can translate between StateId and StateIdNumbers
322 */
323 class ConvertIdNumberMap
324 {
325 private:
326  typedef std::map<StateIdNumber,StateId> IdNumbersToStateIds;
327  typedef std::map<StateId,StateIdNumber> StateIdsToIdNumbers;
328 
329  IdNumbersToStateIds id_to_node;
330  StateIdsToIdNumbers node_to_id;
331 
332  StateIdNumber node_counter;
333 
334  void add_node(StateId n, TransduceR *tr);
335  /*
336  Assign every node n in t a unique id number i. Assign the start node
337  t->root_node() id number 0. Set node_to_id[n] = i and
338  id_to_node[i] = n.
339  */
340  void set_node_maps(TransduceR * t);
341 
342 public:
343  ConvertIdNumberMap(TransduceR * t):
344  node_counter(0)
345  { set_node_maps(t); }
346 
347  StateIdNumber get_number_of_nodes(void) const
348  { return node_counter; }
349 
350  StateIdNumber get_node_id(StateId n) const;
351  StateId get_id_node(StateIdNumber i) const;
352 };
353 
354 typedef std::map<int64,unsigned int> OfstSymbolCountMap;
355 typedef std::set<std::string> SymbolSet;
356 
357 class ConvertTransducerAlphabet
358 {
359  private:
360  SymbolTable symbol_table;
361 
362  TransduceR* transducer;
363  fst::SymbolTable * ofst_symbol_table;
364  // input and output symbol tables together
365 
366  std::map<int64, SymbolNumber> input_symbols_map;
367  std::map<int64, SymbolNumber> output_symbols_map;
368 
369  void inspect_node(StateId n, StateIdSet& visited_nodes,
370  OfstSymbolCountMap& symbol_count_map, SymbolSet& all_symbol_set);
371  void get_symbol_info(
372  OfstSymbolCountMap &symbol_count_map, SymbolSet& all_symbol_set);
373  void populate_symbol_table(
374  OfstSymbolCountMap &input_symbol_counts, SymbolSet& all_symbol_set);
375  void set_maps();
376  public:
377  ConvertTransducerAlphabet(TransduceR* t);
378 
379  void display() const;
380 
381  const SymbolTable& get_symbol_table() const {return symbol_table;}
382  SymbolNumber lookup_ofst_input_symbol(int64 s) const;
383  SymbolNumber lookup_ofst_output_symbol(int64 s) const;
384  bool is_flag_diacritic(SymbolNumber symbol) const;
385 
386  TransducerAlphabet to_alphabet() const;
387 };
388 
389 class ConvertTransition
390 {
391 private:
392  SymbolNumber input_symbol;
393  SymbolNumber output_symbol;
394  // initially we only know the StateIdNumber of the target, but once the
395  // tables have been laid out we can just store the state's table index
396  union
397  {
398  StateIdNumber target_state_id;
399  TransitionTableIndex target_state_index;
400  };
401  Weight weight;
402 
403  TransitionTableIndex table_index; // location in the transition table
404 public:
405  /*
406  Set the symbol numbers and set the indices of the states according
407  to ConvertIdNumberMap nodes_to_id_numbers.
408  */
409  ConvertTransition(const StdArc &a);
410 
411  void display() const;
412 
413  SymbolNumber get_input_symbol(void) const
414  { return input_symbol; }
415 
416  void set_target_state_index();
417 
418  void set_table_index(TransitionTableIndex i)
419  { table_index = i; }
420 
421  TransitionTableIndex get_table_index(void) const
422  { return table_index; }
423 
424  template<class T> T to_transition() const;
425 
426  bool numerical_cmp( const ConvertTransition &another_transition) const;
427  bool operator<(const ConvertTransition &another_index) const;
428 };
429 
430 class ConvertTransitionIndex
431 {
432 private:
433  SymbolNumber input_symbol;
434  // initially we only have the corresponding transition object, but once the
435  // tables have been laid out we can just store the transition's table index
436  union
437  {
438  ConvertTransition* first_transition;
439  TransitionTableIndex first_transition_index;
440  };
441 public:
442  ConvertTransitionIndex(SymbolNumber input, ConvertTransition* transition):
443  input_symbol(input), first_transition(transition) {}
444 
445  void display() const;
446 
447  SymbolNumber get_input_symbol(void) const
448  { return input_symbol; }
449 
450  ConvertTransition* get_first_transition() const
451  { return first_transition; }
452 
453  void set_first_transition_index(TransitionTableIndex i)
454  { first_transition_index = i; }
455 
456  template<class T> T to_transition_index() const;
457 
458  bool operator<(const ConvertTransitionIndex &another_index) const;
459 };
460 
461 struct ConvertTransitionCompare
462 {
463  bool operator() (const ConvertTransition * t1,
464  const ConvertTransition * t2) const
465  {
466  return t1->operator<(*t2);
467  }
468 };
469 
470 struct ConvertTransitionIndexCompare
471 {
472  bool operator() (const ConvertTransitionIndex * i1,
473  const ConvertTransitionIndex * i2) const
474  {
475  return i1->operator<(*i2);
476  }
477 };
478 
479 typedef std::set<ConvertTransition*,ConvertTransitionCompare>
480  ConvertTransitionSet;
481 typedef std::set<ConvertTransitionIndex*,ConvertTransitionIndexCompare>
482  ConvertTransitionIndexSet;
483 
484 class ConvertFstState
485 {
486 private:
487  ConvertTransitionSet transitions;
488  ConvertTransitionIndexSet transition_indices;
489 
490  // the location in the transition table of the state's first transition. For
491  // states without transitions, this is where the first transition *would* be
492  TransitionTableIndex first_transition_index;
493  // the location in the table tables where the state's transition indices
494  // begin the state's location in the index tables. This can be either in the
495  // transition index table, or, if the state only has transitions with
496  // one input symbol, in the transition table, immediately preceding
497  // the first transition
498  TransitionTableIndex table_index;
499 
500  bool final;
501  Weight weight;
502 
503  StateIdNumber id;
504 
505  void set_transitions(StateId n, TransduceR * tr);
506  void set_transition_indices(void);
507 
508  friend class fst_state_compare;
509 public:
510  ConvertFstState(StateId n, TransduceR * tr);
511  ~ConvertFstState();
512 
513  void display() const;
514 
515  TransitionTableIndex set_transition_table_indices(
516  TransitionTableIndex place);
517 
518  TransitionTableIndex get_first_transition_index() const
519  { return first_transition_index; }
520 
521  void set_table_index(TransitionTableIndex i)
522  { table_index = i; }
523  TransitionTableIndex get_table_index(void) const
524  { return table_index; }
525 
526  SymbolNumberSet * get_input_symbols(void) const;
527 
528  SymbolNumber number_of_input_symbols(void) const
529  { return transition_indices.size(); }
530  SymbolNumber number_of_transitions(void) const
531  { return transitions.size(); }
532  bool is_final(void) const {return final;}
533  bool is_big_state(void) const
534  {
535  return (transition_indices.size() > BIG_STATE_LIMIT);
536  }
537  bool is_start_state(void) const {return id == 0;}
538  StateIdNumber get_id(void) const {return id;}
539 
540  // update transitions with the state's location in the tables
541  void set_transition_target_indices();
542 
543  // add this state's transition indices into the given transition index table
544  template<class T>
545  void insert_transition_indices(TransducerTable<T>& index_table) const;
546 
547  template<class T>
548  TransitionTableIndex append_transitions(TransducerTable<T>& transition_table,
549  TransitionTableIndex place) const;
550 };
551 
552  typedef std::vector<ConvertFstState*> ConvertFstStateVector;
553 
554 class ConvertTransitionTableIndices
555 {
556 private:
557  PlaceHolderVector indices;
558  PlaceHolderVector::size_type lower_bound;
559  unsigned int lower_bound_test_count;
560  SymbolNumber number_of_input_symbols;
561 
562  bool state_fits(SymbolNumberSet * input_symbols,
563  bool final_state,
564  PlaceHolderVector::size_type index);
565 
566  void insert_state(SymbolNumberSet * input_symbols,
567  bool final_state,
568  PlaceHolderVector::size_type index);
569  void get_more_space(void);
570 public:
571  ConvertTransitionTableIndices(SymbolNumber input_symbol_count):
572  lower_bound(0), lower_bound_test_count(0),
573  number_of_input_symbols(input_symbol_count)
574  {
575  get_more_space();
576  };
577 
578  PlaceHolderVector::size_type add_state(ConvertFstState * state);
579  PlaceHolderVector::size_type size(void) const
580  { return indices.size(); }
581 
582  PlaceHolderVector::size_type last_full_index(void) const;
583 };
584 
585 class ConvertTransducerHeader
586 {
587  private:
588  static void full_traversal(TransducerHeader& h, TransduceR* tr, StateId n,
589  StateIdSet& visited_nodes, StateIdSet& nodes_in_path,
590  OfstSymbolSet& all_input_symbols);
591  static void find_input_epsilon_cycles(StateId n, StateId t,
592  StateIdSet &epsilon_targets,
593  bool unweighted_only, TransduceR * tr,
594  TransducerHeader& h);
595  public:
596  static void compute_header(TransducerHeader& header,
597  TransduceR * t, SymbolNumber symbol_count,
598  TransitionTableIndex number_of_index_table_entries,
599  TransitionTableIndex number_of_target_table_entries,
600  bool weighted);
601 };
602 
603 class ConvertTransducer
604 {
605 private:
606  TransduceR * fst;
607  ConvertIdNumberMap * id_number_map;
608  ConvertTransitionTableIndices * fst_indices;
609  PlaceHolderVector::size_type index_table_size;
610 
611  TransducerHeader header;
612  ConvertTransducerAlphabet alphabet;
613  ConvertFstStateVector states;
614 
615  void read_nodes(void);
616  void set_transition_table_indices(void);
617  void set_index_table_indices(void);
618 
619  void add_input_symbols(StateId n,
620  SymbolNumberSet &input_symbols,
621  StateIdSet &visited_nodes);
622  SymbolNumber number_of_input_symbols(void);
623 
624  TransitionTableIndex count_transitions(void) const;
625 
626  void display_states() const;
627  void display_tables() const;
628 
629  template<class T> TransducerTable<T> make_index_table(
630  TransitionTableIndex index_table_size) const;
631  template<class T> TransducerTable<T> make_transition_table() const;
632 public:
633  ConvertTransducer(TransduceR * tr, bool weighted);
634  ~ConvertTransducer();
635 
636  ConvertIdNumberMap& get_id_number_map() {return *id_number_map;}
637  ConvertTransducerAlphabet& get_alphabet() {return alphabet;}
638  ConvertFstState& get_state(StateIdNumber s) {return *states[s];}
639  bool is_weighted() const {return header.probe_flag(Weighted);}
640 
641  Transducer* to_transducer() const;
642 
643  // exposed to this module during the constructor
644  static ConvertTransducer* constructing_transducer;
645 };
646 
647 #endif // HAVE_OPENFST
648 
649 }
650 
651 #endif // include guard
An error happened probably due to a bug in the HFST code.
Definition: HfstExceptionDefs.h:390
#define HFST_THROW_MESSAGE(E, M)
Macro to throw an exception of type E with message M. Use THROW instead of regular throw with subclas...
Definition: HfstExceptionDefs.h:47