HFST - Helsinki Finite-State Transducer Technology - C++ API  version 3.9.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
transducer.h
1 // -*- mode: c++; -*-
2 // Copyright (c) 2016 University of Helsinki
3 //
4 // This library is free software; you can redistribute it and/or
5 // modify it under the terms of the GNU Lesser General Public
6 // License as published by the Free Software Foundation; either
7 // version 3 of the License, or (at your option) any later version.
8 // See the file COPYING included with this distribution for more
9 // information.
10 
11 #ifndef _HFST_OL_TRANSDUCER_TRANSDUCER_H_
12 #define _HFST_OL_TRANSDUCER_TRANSDUCER_H_
13 
14 #ifndef _MSC_VER
15 # include <unistd.h>
16 #else
17 # include <io.h>
18 #endif
19 
20 #include <vector>
21 #include <set>
22 #include <iostream>
23 #include <limits>
24 #include <string>
25 #include <cstdlib>
26 #include <cstring>
27 #include <climits>
28 #include <utility>
29 #include <deque>
30 #include <queue>
31 #include <stdexcept>
32 #include <time.h>
33 
34 #include "../../HfstExceptionDefs.h"
35 #include "../../HfstFlagDiacritics.h"
36 #include "../../HfstSymbolDefs.h"
37 
38 #ifdef _MSC_VER
39  #include <BaseTsd.h>
40  typedef SSIZE_T ssize_t;
41 #endif
42 
43 
45 namespace hfst_ol {
46 using hfst::FdOperation;
47 using hfst::FdState;
48 using hfst::FdTable;
49 
50 // using namespace hfst;
51 
52 
53 typedef unsigned short SymbolNumber;
54 typedef unsigned int TransitionTableIndex;
55 typedef unsigned int TransitionNumber;
56 typedef unsigned int StateIdNumber;
57 typedef short ValueNumber;
58 typedef float Weight;
59 typedef std::set<SymbolNumber> SymbolNumberSet;
60 typedef std::vector<SymbolNumber> SymbolNumberVector;
61 typedef std::set<TransitionTableIndex> TransitionTableIndexSet;
62 typedef std::vector<std::string> SymbolTable;
63 
64 // for lookup
65 typedef std::pair<Weight, std::vector<std::string> > HfstOneLevelPath;
66 typedef std::set<HfstOneLevelPath> HfstOneLevelPaths;
67 typedef std::vector<std::string> StringVector;
68 
69 // for ospell
70 typedef std::vector<short> FlagDiacriticState;
71 typedef std::map<SymbolNumber, hfst::FdOperation> OperationMap;
72 typedef std::map<std::string, SymbolNumber> StringSymbolMap;
73 class STransition;
74 
75 // for epsilon loop checking
76 struct TraversalState
77 {
78  TransitionTableIndex index;
79  FlagDiacriticState flags;
80  TraversalState(TransitionTableIndex i, FlagDiacriticState f):
81  index(i), flags(f) {}
82  bool operator==(const TraversalState & rhs) const;
83  bool operator<(const TraversalState & rhs) const;
84 
85 };
86 typedef std::set<TraversalState> TraversalStates;
87 
88  // parentheses avoid collision with windows macro 'max'
89  const SymbolNumber NO_SYMBOL_NUMBER = (std::numeric_limits<SymbolNumber>::max)();
90 const TransitionTableIndex NO_TABLE_INDEX =
91  (std::numeric_limits<TransitionTableIndex>::max)();
92  const unsigned long NO_COUNTER = (std::numeric_limits<unsigned long>::max)();
93 const Weight INFINITE_WEIGHT = static_cast<float>(NO_TABLE_INDEX);
94 
95 enum HeaderFlag {Weighted, Deterministic, Input_deterministic, Minimized,
96  Cyclic, Has_epsilon_epsilon_transitions,
97  Has_input_epsilon_transitions, Has_input_epsilon_cycles,
98  Has_unweighted_input_epsilon_cycles};
99 
100 // This is 2^31, hopefully equal to UINT_MAX/2 rounded up.
101 // For some profound reason it can't be replaced with (UINT_MAX+1)/2.
102 const TransitionTableIndex TRANSITION_TARGET_TABLE_START = 2147483648u;
103 const unsigned int MAX_IO_LEN = 10000;
104 const unsigned int MAX_RECURSION_DEPTH = 5000;
105 
106 // This function is queried to check whether we should do the
107 // single-character ascii lookup tokenization or the regular
108 // trie-tokenization
109 bool should_ascii_tokenize(unsigned char c);
110 
111 inline bool indexes_transition_table(const TransitionTableIndex i)
112 {
113  return i >= TRANSITION_TARGET_TABLE_START;
114 }
115 inline bool indexes_transition_index_table(const TransitionTableIndex i)
116 {
117  return i < TRANSITION_TARGET_TABLE_START;
118 }
119 
120 class TransducerHeader
121 {
122 private:
123  SymbolNumber number_of_input_symbols;
124  SymbolNumber number_of_symbols;
125  TransitionTableIndex size_of_transition_index_table;
126  TransitionTableIndex size_of_transition_target_table;
127 
128  StateIdNumber number_of_states;
129  TransitionNumber number_of_transitions;
130 
131  bool weighted;
132  bool deterministic;
133  bool input_deterministic;
134  bool minimized;
135  bool cyclic;
136  bool has_epsilon_epsilon_transitions;
137  bool has_input_epsilon_transitions;
138  bool has_input_epsilon_cycles;
139  bool has_unweighted_input_epsilon_cycles;
140 
141  static void header_error()
142  {
144  }
145 
146  template<class T>
147  static T read_property(std::istream& is)
148  {
149  T p;
150  is.read(reinterpret_cast<char*>(&p), sizeof(T));
151  return p;
152  }
153  template<class T>
154  static void write_property(T prop, std::ostream& os)
155  { os.write(reinterpret_cast<const char*>(&prop), sizeof(prop)); }
156  static bool read_bool_property(std::istream& is)
157  {
158  unsigned int prop;
159  is.read(reinterpret_cast<char*>(&prop), sizeof(unsigned int));
160  if(prop == 0)
161  return false;
162  if(prop == 1)
163  return true;
164  header_error();
165  return false;
166  }
167  static void write_bool_property(bool value, std::ostream& os)
168  {
169  unsigned int prop = (value?1:0);
170  os.write(reinterpret_cast<char*>(&prop), sizeof(prop));
171  }
172 public:
173  TransducerHeader(bool weights):
174  number_of_input_symbols(0),
175  number_of_symbols(1), // epsilon
176  size_of_transition_index_table(1),
177  size_of_transition_target_table(0),
178  number_of_states(1),
179  number_of_transitions(0),
180  weighted(weights),
181  deterministic(true),
182  input_deterministic(true),
183  minimized(true),
184  cyclic(false),
185  has_epsilon_epsilon_transitions(false),
186  has_input_epsilon_transitions(false),
187  has_input_epsilon_cycles(false),
188  has_unweighted_input_epsilon_cycles(false)
189  {}
190 
191  // a basic constructor that's only told information we
192  // actually use at the moment
193  TransducerHeader(
194  SymbolNumber input_symbols,
195  SymbolNumber symbols,
196  TransitionTableIndex transition_index_table,
197  TransitionTableIndex transition_table,
198  bool weights):
199  number_of_input_symbols(input_symbols),
200  number_of_symbols(symbols), // epsilon
201  size_of_transition_index_table(transition_index_table),
202  size_of_transition_target_table(transition_table),
203  number_of_states(0),
204  number_of_transitions(0),
205  weighted(weights),
206  deterministic(true),
207  input_deterministic(true),
208  minimized(true),
209  cyclic(false),
210  has_epsilon_epsilon_transitions(false),
211  has_input_epsilon_transitions(false),
212  has_input_epsilon_cycles(false),
213  has_unweighted_input_epsilon_cycles(false)
214  { }
215 
216 
217  TransducerHeader(std::istream& is):
218  number_of_input_symbols(read_property<SymbolNumber>(is)),
219  number_of_symbols(read_property<SymbolNumber>(is)),
220  size_of_transition_index_table(
221  read_property<TransitionTableIndex>(is)),
222  size_of_transition_target_table(
223  read_property<TransitionTableIndex>(is)),
224  number_of_states(read_property<StateIdNumber>(is)),
225  number_of_transitions(read_property<TransitionNumber>(is)),
226  weighted(read_bool_property(is)),
227  deterministic(read_bool_property(is)),
228  input_deterministic(read_bool_property(is)),
229  minimized(read_bool_property(is)),
230  cyclic(read_bool_property(is)),
231  has_epsilon_epsilon_transitions(read_bool_property(is)),
232  has_input_epsilon_transitions(read_bool_property(is)),
233  has_input_epsilon_cycles(read_bool_property(is)),
234  has_unweighted_input_epsilon_cycles(read_bool_property(is))
235  {
236  if(!is) {
238  }
239  }
240 
241  SymbolNumber symbol_count(void) const { return number_of_symbols; }
242  SymbolNumber input_symbol_count(void) const {
243  return number_of_input_symbols;
244  }
245  void increment_symbol_count(void)
246  {++number_of_symbols; ++number_of_input_symbols;}
247 
248  TransitionTableIndex index_table_size(void) const
249  { return size_of_transition_index_table; }
250  TransitionTableIndex target_table_size(void) const
251  { return size_of_transition_target_table; }
252 
253  bool probe_flag(HeaderFlag flag) const
254  {
255  switch (flag) {
256  case Weighted:
257  return weighted;
258  case Deterministic:
259  return deterministic;
260  case Input_deterministic:
261  return input_deterministic;
262  case Minimized:
263  return minimized;
264  case Cyclic:
265  return cyclic;
266  case Has_epsilon_epsilon_transitions:
267  return has_epsilon_epsilon_transitions;
268  case Has_input_epsilon_transitions:
269  return has_input_epsilon_transitions;
270  case Has_input_epsilon_cycles:
271  return has_input_epsilon_cycles;
272  case Has_unweighted_input_epsilon_cycles:
273  return has_unweighted_input_epsilon_cycles;
274  }
275  return false;
276  }
277 
278  void set_flag(HeaderFlag flag, bool value)
279  {
280  switch (flag) {
281  case Weighted:
282  weighted = true;
283  break;
284  case Deterministic:
285  deterministic = true;
286  break;
287  case Input_deterministic:
288  input_deterministic = true;
289  break;
290  case Minimized:
291  minimized = true;
292  break;
293  case Cyclic:
294  cyclic = true;
295  break;
296  case Has_epsilon_epsilon_transitions:
297  has_epsilon_epsilon_transitions = true;
298  break;
299  case Has_input_epsilon_transitions:
300  has_input_epsilon_transitions = true;
301  break;
302  case Has_input_epsilon_cycles:
303  has_input_epsilon_cycles = true;
304  break;
305  case Has_unweighted_input_epsilon_cycles:
306  has_unweighted_input_epsilon_cycles = true;
307  default:
308  return;
309  }
310  }
311 
312  void display() const
313  {
314  std::cout << "Transducer properties:" << std::endl
315  << " number_of_symbols: "
316  << number_of_symbols << std::endl
317  << " number_of_input_symbols: "
318  << number_of_input_symbols << std::endl
319  << " size_of_transition_index_table: "
320  << size_of_transition_index_table << std::endl
321  << " size_of_transition_target_table: "
322  << size_of_transition_target_table << std::endl
323  << " number_of_states: "
324  << number_of_states << std::endl
325  << " number_of_transitions: "
326  << number_of_transitions << std::endl
327  << " weighted: "
328  << weighted << std::endl
329  << " deterministic: "
330  << deterministic << std::endl
331  << " input_deterministic: "
332  << input_deterministic << std::endl
333  << " minimized: "
334  << minimized << std::endl
335  << " cyclic: "
336  << cyclic << std::endl
337  << " has_epsilon_epsilon_transitions: "
338  << has_epsilon_epsilon_transitions << std::endl
339  << " has_input_epsilon_transitions: "
340  << has_input_epsilon_transitions << std::endl
341  << " has_input_epsilon_cycles: "
342  << has_input_epsilon_cycles << std::endl
343  << " has_unweighted_input_epsilon_cycles: "
344  << has_unweighted_input_epsilon_cycles << std::endl;
345  }
346 
347  void write(std::ostream& os) const
348  {
349  write_property(number_of_input_symbols, os);
350  write_property(number_of_symbols, os);
351  write_property(size_of_transition_index_table, os);
352  write_property(size_of_transition_target_table, os);
353  write_property(number_of_states, os);
354  write_property(number_of_transitions, os);
355  write_bool_property(weighted, os);
356  write_bool_property(deterministic, os);
357  write_bool_property(input_deterministic, os);
358  write_bool_property(minimized, os);
359  write_bool_property(cyclic, os);
360  write_bool_property(has_epsilon_epsilon_transitions, os);
361  write_bool_property(has_input_epsilon_transitions, os);
362  write_bool_property(has_input_epsilon_cycles, os);
363  write_bool_property(has_unweighted_input_epsilon_cycles, os);
364  }
365 
366  friend class ConvertTransducerHeader;
367 };
368 
369 class TransducerAlphabet
370 {
371 protected:
372  SymbolTable symbol_table;
373  hfst::FdTable<SymbolNumber> fd_table;
374  SymbolNumber unknown_symbol;
375  SymbolNumber default_symbol;
376  SymbolNumber identity_symbol;
377  SymbolNumber orig_symbol_count;
378 
379 public:
380  TransducerAlphabet()
381  {
382  symbol_table.push_back("@_EPSILON_SYMBOL_@");
383  unknown_symbol = NO_SYMBOL_NUMBER;
384  default_symbol = NO_SYMBOL_NUMBER;
385  identity_symbol = NO_SYMBOL_NUMBER;
386  orig_symbol_count = 1;
387  }
388  TransducerAlphabet(std::istream& is,
389  SymbolNumber symbol_count,
390  bool preserve_diacritic_strings = true);
391  TransducerAlphabet(const SymbolTable& st);
392 
393  void display() const;
394 
395  void write(std::ostream& os) const
396  {
397  for(SymbolTable::const_iterator i = symbol_table.begin();
398  i != symbol_table.end(); i++)
399  {
400  os << *i;
401  os.put('\0');
402  }
403  }
404 
405  bool has_flag_diacritics() const
406  { return fd_table.num_features() > 0; }
407  bool is_flag_diacritic(SymbolNumber symbol) const
408  { return fd_table.is_diacritic(symbol); }
409  bool is_like_epsilon(SymbolNumber symbol) const;
410 
411  const SymbolTable& get_symbol_table() const
412  { return symbol_table; }
413 
414  const std::string string_from_symbol(const SymbolNumber symbol) const
415  // represent epsilon as blank string
416  { return (symbol == 0) ? "" : symbol_table[symbol]; }
417 
418  SymbolNumber symbol_from_string(const std::string symbol_string) const;
419  StringSymbolMap build_string_symbol_map(void) const;
420  const hfst::FdTable<SymbolNumber>& get_fd_table() const
421  { return fd_table; }
422  const hfst::FdOperation * get_operation(SymbolNumber symbol) const
423  {
424  return fd_table.get_operation(symbol);
425  }
426  SymbolNumber get_unknown_symbol(void) const
427  { return unknown_symbol; }
428  SymbolNumber get_default_symbol(void) const
429  { return default_symbol; }
430  SymbolNumber get_identity_symbol(void) const
431  { return identity_symbol; }
432  SymbolNumber get_orig_symbol_count(void) const
433  { return orig_symbol_count; }
434  virtual void add_symbol(char * symbol);
435  virtual void add_symbol(const std::string & symbol);
436 
437 
438 };
439 
440 class TransitionIndex
441 {
442 protected:
443  SymbolNumber input_symbol;
444  TransitionTableIndex first_transition_index;
445 public:
446  static const size_t size =
447  sizeof(SymbolNumber) + sizeof(TransitionTableIndex);
448  TransitionIndex(): input_symbol(NO_SYMBOL_NUMBER),
449  first_transition_index(NO_TABLE_INDEX) {}
450  TransitionIndex(SymbolNumber input,
451  TransitionTableIndex first_transition):
452  input_symbol(input), first_transition_index(first_transition) {}
453 
454  TransitionIndex(std::istream& is):
455  input_symbol(NO_SYMBOL_NUMBER), first_transition_index(0)
456  {
457  is.read(reinterpret_cast<char*>(&input_symbol),
458  sizeof(SymbolNumber));
459  is.read(reinterpret_cast<char*>(&first_transition_index),
460  sizeof(TransitionTableIndex));
461  }
462  // A constructor for reading from a char array at p
463  TransitionIndex(char * p):
464  input_symbol(*((SymbolNumber*) p)),
465  first_transition_index((*(TransitionTableIndex*)
466  (p + sizeof(SymbolNumber)))) {}
467  virtual ~TransitionIndex() {}
468 
469  void write(std::ostream& os, bool weighted) const
470  {
471  os.write(reinterpret_cast<const char*>(&input_symbol),
472  sizeof(SymbolNumber));
473  if(!weighted && input_symbol == NO_SYMBOL_NUMBER &&
474  first_transition_index != NO_TABLE_INDEX) {
475  // Make sure that we write the correct type of final index
476  unsigned int unweighted_final_index = 1;
477  os.write(reinterpret_cast<const char*>(&unweighted_final_index),
478  sizeof(first_transition_index));
479  } else {
480  os.write(reinterpret_cast<const char*>(
481  &first_transition_index),
482  sizeof(first_transition_index));
483  }
484  }
485 
486  void display() const;
487 
488  TransitionTableIndex get_target(void) const
489  { return first_transition_index; }
490  SymbolNumber get_input_symbol(void) const
491  { return input_symbol; }
492 
493  bool matches(const SymbolNumber s) const;
494  virtual bool final(void) const;
495  virtual Weight final_weight(void) const { return 0.0; }
496 
497  static TransitionIndex create_final()
498  { return TransitionIndex(NO_SYMBOL_NUMBER, 1); }
499 };
500 
501 class TransitionWIndex : public TransitionIndex
502 {
503 public:
504  TransitionWIndex(): TransitionIndex() {}
505  TransitionWIndex(SymbolNumber input,
506  TransitionTableIndex first_transition):
507  TransitionIndex(input, first_transition) {}
508  TransitionWIndex(std::istream& is):
509  TransitionIndex(is) {}
510  TransitionWIndex(char * p):
511  TransitionIndex(p) {}
512 
513  Weight final_weight(void) const;
514 
515  static TransitionWIndex create_final()
516  { return TransitionWIndex(NO_SYMBOL_NUMBER, 0); }
517 
518  static TransitionWIndex create_final(Weight w)
519  {
520  union to_weight
521  {
522  TransitionTableIndex i;
523  Weight w;
524  } weight;
525  weight.w = w;
526  return TransitionWIndex(NO_SYMBOL_NUMBER, weight.i);
527  }
528 };
529 
530 class Transition
531 {
532 protected:
533  SymbolNumber input_symbol;
534  SymbolNumber output_symbol;
535  TransitionTableIndex target_index;
536 public:
537  static const size_t size = 2 * sizeof(SymbolNumber) +
538  sizeof(TransitionTableIndex);
539  Transition(SymbolNumber input, SymbolNumber output,
540  TransitionTableIndex target, Weight bogus=0.0f):
541  input_symbol(input), output_symbol(output), target_index(target)
542  {(void)bogus; bogus=0.0f;}
543  Transition(bool final, Weight bogus=0.0f):
544  input_symbol(NO_SYMBOL_NUMBER), output_symbol(NO_SYMBOL_NUMBER),
545  target_index(final?1:NO_TABLE_INDEX) {(void)bogus; bogus=0.0f;}
546  Transition(std::istream& is):
547  input_symbol(NO_SYMBOL_NUMBER), output_symbol(NO_SYMBOL_NUMBER),
548  target_index(0)
549  {
550  is.read(reinterpret_cast<char*>(&input_symbol),
551  sizeof(SymbolNumber));
552  is.read(reinterpret_cast<char*>(&output_symbol),
553  sizeof(SymbolNumber));
554  is.read(reinterpret_cast<char*>(&target_index),
555  sizeof(target_index));
556  }
557  // A constructor for reading from char array
558  Transition(char * p):
559  input_symbol(*(SymbolNumber*) p),
560  output_symbol(*(SymbolNumber*) (p + sizeof(SymbolNumber))),
561  target_index(*(TransitionTableIndex*) (p + 2 * sizeof(SymbolNumber)))
562  {}
563 
564  virtual ~Transition() {}
565 
566  virtual void write(std::ostream& os, bool weighted) const
567  {
568  os.write(reinterpret_cast<const char*>(&input_symbol),
569  sizeof(input_symbol));
570  os.write(reinterpret_cast<const char*>(&output_symbol),
571  sizeof(output_symbol));
572  os.write(reinterpret_cast<const char*>(&target_index),
573  sizeof(target_index));
574  if (weighted) {
575  os << 0.0f;
576  }
577  }
578 
579  virtual void display() const;
580 
581  TransitionTableIndex get_target(void) const {return target_index;}
582  SymbolNumber get_output_symbol(void) const {return output_symbol;}
583  SymbolNumber get_input_symbol(void) const {return input_symbol;}
584 
585  bool matches(const SymbolNumber s) const;
586  virtual bool final(void) const;
587  virtual Weight get_weight(void) const { return 0.0; }
588 };
589 
590 class TransitionW : public Transition
591 {
592 protected:
593  Weight transition_weight;
594 public:
595  static const size_t size = 2 * sizeof(SymbolNumber) +
596  sizeof(TransitionTableIndex) + sizeof(Weight);
597 
598  TransitionW(SymbolNumber input, SymbolNumber output,
599  TransitionTableIndex target, Weight w):
600  Transition(input, output, target), transition_weight(w) {}
601  TransitionW(bool final, Weight w):
602  Transition(final), transition_weight(w) {}
603  TransitionW(std::istream& is): Transition(is), transition_weight(0.0f)
604  {is.read(reinterpret_cast<char*>(&transition_weight), sizeof(Weight));}
605  TransitionW(char * p):
606  Transition(p),
607  transition_weight(*((Weight*)
608  (p + 2 * sizeof(SymbolNumber)
609  + sizeof(TransitionTableIndex))))
610  {}
611 
612  void write(std::ostream& os, bool weighted) const
613  {
614  Transition::write(os, false);
615  if (weighted) {
616  os.write(reinterpret_cast<const char*>(&transition_weight),
617  sizeof(transition_weight));
618  }
619  }
620 
621  void display() const;
622 
623  Weight get_weight(void) const { return transition_weight; }
624 };
625 
626 
627 template <class T>
628 class TransducerTable
629 {
630 protected:
631  std::vector<T> table;
632 public:
633  TransducerTable(): table() {}
634  TransducerTable(size_t size, const T& entry): table(size, entry) {}
635  TransducerTable(
636  std::istream& is, TransitionTableIndex index_count): table()
637  {
638  char * p = (char*) malloc(T::size * index_count);
639  is.read(p, T::size * index_count);
640  char * p_orig = p;
641  while(index_count) {
642  table.push_back(T(p));
643  --index_count;
644  p += T::size;
645  }
646  free(p_orig);
647  }
648  TransducerTable(const TransducerTable& t): table(t.table) {}
649 
650  void append(const T& v) {table.push_back(v);}
651  void set(size_t index, const T& v) {table[index] = v;}
652 
653  const T& operator[](TransitionTableIndex i) const
654  {
655  return (i < TRANSITION_TARGET_TABLE_START) ?
656  table[i] : table[i-TRANSITION_TARGET_TABLE_START];
657  }
658 
659  void display(bool transition_table) const
660  {
661  for(size_t i=0;i<table.size();i++)
662  {
663  std::cout << i;
664  if(transition_table)
665  std::cout << "/" << i+TRANSITION_TARGET_TABLE_START;
666  std::cout << ": ";
667  table[i].display();
668  }
669  }
670 
671  unsigned int size() const {return table.size();}
672 };
673 
674 class TransducerTablesInterface
675 {
676 public:
677  virtual ~TransducerTablesInterface() {}
678 
679  virtual const TransitionIndex& get_index(
680  TransitionTableIndex i) const = 0;
681  virtual const Transition& get_transition(
682  TransitionTableIndex i) const = 0;
683  virtual Weight get_weight(
684  TransitionTableIndex i) const = 0;
685  virtual SymbolNumber get_transition_input(
686  TransitionTableIndex i) const = 0;
687  virtual SymbolNumber get_transition_output(
688  TransitionTableIndex i) const = 0;
689  virtual TransitionTableIndex get_transition_target(
690  TransitionTableIndex i) const = 0;
691  virtual bool get_transition_finality(
692  TransitionTableIndex i) const = 0;
693  virtual SymbolNumber get_index_input(
694  TransitionTableIndex i) const = 0;
695  virtual TransitionTableIndex get_index_target(
696  TransitionTableIndex i) const = 0;
697  virtual bool get_index_finality(
698  TransitionTableIndex i) const = 0;
699  virtual Weight get_final_weight(
700  TransitionTableIndex i) const = 0;
701 
702  virtual void display() const {}
703 };
704 
705 template <class T1, class T2>
706 class TransducerTables : public TransducerTablesInterface
707 {
708 protected:
709  TransducerTable<T1> index_table;
710  TransducerTable<T2> transition_table;
711 public:
712  TransducerTables(std::istream& is, TransitionTableIndex index_table_size,
713  TransitionTableIndex transition_table_size):
714  index_table(
715  is, index_table_size),
716  transition_table(is, transition_table_size) { }
717 
718  TransducerTables(): index_table(1, T1::create_final()),
719  transition_table() {}
720  TransducerTables(const TransducerTable<T1>& index_table,
721  const TransducerTable<T2>& transition_table):
722  index_table(index_table), transition_table(transition_table) {}
723 
724  const TransitionIndex& get_index(TransitionTableIndex i) const
725  {return index_table[i];}
726  const Transition& get_transition(TransitionTableIndex i) const
727  {return transition_table[i];}
728  Weight get_weight(TransitionTableIndex i) const
729  { return transition_table[i].get_weight(); }
730  SymbolNumber get_transition_input(TransitionTableIndex i) const
731  { return transition_table[i].get_input_symbol(); }
732  SymbolNumber get_transition_output(TransitionTableIndex i) const
733  { return transition_table[i].get_output_symbol(); }
734  TransitionTableIndex get_transition_target(TransitionTableIndex i) const
735  { return transition_table[i].get_target(); }
736  bool get_transition_finality(TransitionTableIndex i) const
737  { return transition_table[i].final(); }
738  SymbolNumber get_index_input(TransitionTableIndex i) const
739  { return index_table[i].get_input_symbol(); }
740  TransitionTableIndex get_index_target(TransitionTableIndex i) const
741  { return index_table[i].get_target(); }
742  bool get_index_finality(TransitionTableIndex i) const
743  { return index_table[i].final(); }
744  Weight get_final_weight(TransitionTableIndex i) const
745  { return index_table[i].final_weight(); }
746 
747 
748  void display() const
749  {
750  std::cout << "Transition index table:" << std::endl;
751  index_table.display(false);
752  std::cout << "Transition table:" << std::endl;
753  transition_table.display(true);
754  }
755 };
756 
757 
758 // There follow some classes for implementing lookup
759 
760 class OlLetterTrie;
761 typedef std::vector<OlLetterTrie*> OlLetterTrieVector;
762 
763 class OlLetterTrie
764 {
765 private:
766  OlLetterTrieVector letters;
767  SymbolNumberVector symbols;
768 
769 public:
770  OlLetterTrie():
771  letters(UCHAR_MAX, static_cast<OlLetterTrie*>(NULL)),
772  symbols(UCHAR_MAX,NO_SYMBOL_NUMBER)
773  {}
774 
775  ~OlLetterTrie() {
776  for (size_t i=0 ; i<letters.size() ; ++i) {
777  delete letters[i];
778  letters[i] = 0;
779  }
780  }
781 
782  void add_string(const char * p,SymbolNumber symbol_key);
783  bool has_key_starting_with(const char c) const;
784 
785  SymbolNumber find_key(char ** p);
786 
787 };
788 
789 class Encoder {
790 
791 protected:
792  SymbolNumber number_of_input_symbols;
793  OlLetterTrie letters;
794  SymbolNumberVector ascii_symbols;
795 
796  void read_input_symbols(const SymbolTable & kt);
797  void read_input_symbol(const char * symbol, const int symbol_number);
798 
799 public:
800  Encoder(const SymbolTable & st, SymbolNumber input_symbol_count):
801  number_of_input_symbols(input_symbol_count),
802  ascii_symbols(128, NO_SYMBOL_NUMBER)
803  {
804  read_input_symbols(st);
805  }
806 
807  SymbolNumber find_key(char ** p);
808 
809  friend class Transducer;
810  friend class PmatchContainer;
811 };
812 
813 // A vector that can be written to at any position, so that it
814 // adds new elements if the desired element isn't already present.
815 class Tape: public SymbolNumberVector
816 {
817 public:
818  void write(unsigned int i, SymbolNumber s)
819  {
820  if (this->size() > i) {
821  this->operator[](i) = s;
822  } else {
823  while (this->size() <= i) {
824  this->push_back(NO_SYMBOL_NUMBER);
825  }
826  this->operator[](i) = s;
827  }
828  }
829 };
830 
834 {
835 protected:
836  TransducerHeader* header;
837  TransducerAlphabet* alphabet;
838  TransducerTablesInterface* tables;
839  void load_tables(std::istream& is);
840 
841  // for lookup
842  Weight current_weight;
843  HfstOneLevelPaths * lookup_paths;
844  Encoder * encoder;
845  Tape input_tape;
846  Tape output_tape;
847  hfst::FdState<SymbolNumber> flag_state;
848  // This is to keep track of whether we're going to take a default transition
849  bool found_transition;
850  // For keeping a tally of previously epsilon-visited states to control
851  // going into loops
852  TraversalStates traversal_states;
853 
854  ssize_t max_lookups;
855  unsigned int recursion_depth_left;
856  double max_time;
857  clock_t start_clock;
858 
859  void try_epsilon_transitions(unsigned int input_tape_pos,
860  unsigned int output_tape_pos,
861  TransitionTableIndex i);
862 
863  void try_epsilon_indices(unsigned int input_tape_pos,
864  unsigned int output_tape_pos,
865  TransitionTableIndex i);
866 
867  void find_transitions(SymbolNumber input,
868  unsigned int input_tape_pos,
869  unsigned int output_tape_pos,
870  TransitionTableIndex i);
871 
872  void find_index(SymbolNumber input,
873  unsigned int input_tape_pos,
874  unsigned int output_tape_pos,
875  TransitionTableIndex i);
876 
877  void get_analyses(unsigned int input_tape_pos,
878  unsigned int output_tape_pos,
879  TransitionTableIndex i);
880 
881  void find_loop_epsilon_transitions(unsigned int input_pos,
882  TransitionTableIndex i);
883  void find_loop_epsilon_indices(unsigned int input_pos,
884  TransitionTableIndex i);
885  void find_loop_transitions(SymbolNumber input,
886  unsigned int input_pos,
887  TransitionTableIndex i);
888  void find_loop_index(SymbolNumber input,
889  unsigned int input_pos,
890  TransitionTableIndex i);
891  void find_loop(unsigned int input_pos,
892  TransitionTableIndex i);
893 
894 
895 public:
896  Transducer(std::istream& is);
897  Transducer(bool weighted);
898  Transducer(Transducer * t);
899  Transducer();
900  Transducer(const TransducerHeader& header,
901  const TransducerAlphabet& alphabet,
902  const TransducerTable<TransitionIndex>& index_table,
903  const TransducerTable<Transition>& transition_table);
904  Transducer(const TransducerHeader& header,
905  const TransducerAlphabet& alphabet,
906  const TransducerTable<TransitionWIndex>& index_table,
907  const TransducerTable<TransitionW>& transition_table);
908  virtual ~Transducer();
909 
910  void write(std::ostream& os) const;
911  Transducer * copy(Transducer * t, bool weighted = false);
912  void display() const;
913 
914  const TransducerHeader& get_header() const
915  { return *header; }
916  const TransducerAlphabet& get_alphabet() const
917  { return *alphabet; }
918  const Encoder& get_encoder(void) const
919  { return *encoder; }
920  const hfst::FdTable<SymbolNumber>& get_fd_table() const
921  { return alphabet->get_fd_table(); }
922  const SymbolTable& get_symbol_table() const
923  { return alphabet->get_symbol_table(); }
924 
925 
926  const TransitionIndex& get_index(TransitionTableIndex i) const
927  { return tables->get_index(i); }
928  const Transition& get_transition(TransitionTableIndex i) const
929  { return tables->get_transition(i); }
930  bool final_index(TransitionTableIndex i) const
931  {
932  if (indexes_transition_table(i)) {
933  return tables->get_transition_finality(i);
934  } else {
935  return tables->get_index_finality(i);
936  }
937  }
938  bool is_infinitely_ambiguous(void) const
939  {
940  return header->probe_flag(Has_input_epsilon_cycles);
941  }
942 
943  bool is_lookup_infinitely_ambiguous(const StringVector & s);
944  bool is_lookup_infinitely_ambiguous(const std::string & input);
945 
946  TransducerTable<TransitionWIndex> copy_windex_table();
947  TransducerTable<TransitionW> copy_transitionw_table();
948  TransducerTable<TransitionIndex> copy_index_table();
949  TransducerTable<Transition> copy_transition_table();
950 
951  // state_index must be an index to a state which is defined as either:
952  // (1) the start of a set of entries in the transition index table, or
953  // (2) the boundary before a set of entries in the transition table, in
954  // which case the following entries will all have the same input symbol
955  // This function will return a vector of indices to the transition table,
956  // i.e. the arcs from the given state
957  TransitionTableIndexSet get_transitions_from_state(
958  TransitionTableIndex state_index) const;
959 
960 
961  bool initialize_input(const char * input_str);
962  HfstOneLevelPaths * lookup_fd(const StringVector & s, ssize_t limit = -1,
963  double time_cutoff = 0.0);
964  /* Tokenize and lookup, accounting for flag diacritics, the surface string
965  \a s. The return value, a pointer to HfstOneLevelPaths
966  (which is a set) of analyses, is newly allocated.
967  */
968  HfstOneLevelPaths * lookup_fd(const std::string & s, ssize_t limit = -1,
969  double time_cutoff = 0.0);
970  HfstOneLevelPaths * lookup_fd(const char * s, ssize_t limit = -1,
971  double time_cutoff = 0.0);
972  void note_analysis(void);
973 
974  // Methods for supporting ospell
975  SymbolNumber get_unknown_symbol(void) const
976  { return alphabet->get_unknown_symbol(); }
977  StringSymbolMap get_string_symbol_map(void) const
978  { return alphabet->build_string_symbol_map(); }
979  STransition take_epsilons(const TransitionTableIndex i) const;
980  STransition take_epsilons_and_flags(const TransitionTableIndex i);
981  STransition take_non_epsilons(const TransitionTableIndex i,
982  const SymbolNumber symbol) const;
983  TransitionTableIndex next(const TransitionTableIndex i,
984  const SymbolNumber symbol) const;
985  TransitionTableIndex next_e(const TransitionTableIndex i) const;
986  bool has_transitions(const TransitionTableIndex i,
987  const SymbolNumber symbol) const;
988  bool has_epsilons_or_flags(const TransitionTableIndex i);
989  Weight final_weight(const TransitionTableIndex i) const;
990  bool is_flag(const SymbolNumber symbol)
991  { return alphabet->is_flag_diacritic(symbol); }
992  bool is_weighted(void)
993  { return header->probe_flag(Weighted);}
994 
995 
996  friend class ConvertTransducer;
997 };
998 
999 class STransition{
1000 public:
1001  TransitionTableIndex index;
1002  SymbolNumber symbol;
1003  Weight weight;
1004 
1005  STransition(TransitionTableIndex i,
1006  SymbolNumber s):
1007  index(i),
1008  symbol(s),
1009  weight(0.0)
1010  {}
1011 
1012  STransition(TransitionTableIndex i,
1013  SymbolNumber s,
1014  Weight w):
1015  index(i),
1016  symbol(s),
1017  weight(w)
1018  {}
1019 
1020 };
1021 
1022 typedef std::pair<std::string, Weight> StringWeightPair;
1023 
1024 class StringWeightComparison
1025 /* results are reversed by default because greater weights represent
1026  worse results - to reverse the reversal, give a true argument*/
1027 
1028 {
1029  bool reverse;
1030 public:
1031  StringWeightComparison(bool reverse_result=false):
1032  reverse(reverse_result)
1033  {}
1034 
1035  bool operator() (StringWeightPair lhs, StringWeightPair rhs)
1036  { // return true when we want rhs to appear before lhs
1037  if (reverse) {
1038  return (lhs.second < rhs.second);
1039  } else {
1040  return (lhs.second > rhs.second);
1041  }
1042  }
1043 };
1044 
1045 typedef std::priority_queue<StringWeightPair,
1046  std::vector<StringWeightPair>,
1047  StringWeightComparison> CorrectionQueue;
1048 typedef std::priority_queue<StringWeightPair,
1049  std::vector<StringWeightPair>,
1050  StringWeightComparison> AnalysisQueue;
1051 typedef std::priority_queue<StringWeightPair,
1052  std::vector<StringWeightPair>,
1053  StringWeightComparison> HyphenationQueue;
1054 
1055 /*class STransducer: public Transducer
1056  {
1057  protected:
1058  bool final_transition(TransitionTableIndex i)
1059  {
1060  return transitions[i]->final();
1061  }
1062 
1063  bool final_index(TransitionTableIndex i)
1064  {
1065  return indices[i]->final();
1066  }
1067 
1068 
1069  public:
1070  Transducer(FILE * f):
1071  header(TransducerHeader(f)),
1072  alphabet(TransducerAlphabet(f, header.symbol_count())),
1073  keys(alphabet.get_key_table()),
1074  index_reader(f,header.index_table_size()),
1075  transition_reader(f,header.target_table_size()),
1076  encoder(keys,header.input_symbol_count()),
1077  indices(index_reader()),
1078  transitions(transition_reader())
1079  {}
1080 
1081  TransitionIndexVector &indices;
1082 
1083  TransitionVector &transitions;
1084 
1085  SymbolNumber find_next_key(char ** p)
1086  {
1087  return encoder.find_key(p);
1088  }
1089 
1090 
1091  unsigned int get_state_size(void)
1092  {
1093  return alphabet.get_state_size();
1094  }
1095 
1096  std::vector<const char*> * get_symbol_table(void)
1097  {
1098  return &symbol_table;
1099  }
1100 
1101  SymbolNumber get_other(void)
1102  {
1103  return alphabet.get_other();
1104  }
1105 
1106  TransducerAlphabet * get_alphabet(void)
1107  {
1108  return &alphabet;
1109  }
1110 
1111  OperationMap * get_operations(void)
1112  {
1113  return alphabet.get_operation_map();
1114  }
1115 
1116  STransition take_epsilons(const TransitionTableIndex i) const;
1117  STransition take_epsilons_and_flags(const TransitionTableIndex i);
1118  STransition take_non_epsilons(const TransitionTableIndex i,
1119  const SymbolNumber symbol) const;
1120  TransitionTableIndex next(const TransitionTableIndex i,
1121  const SymbolNumber symbol) const;
1122  TransitionTableIndex next_e(const TransitionTableIndex i) const;
1123  bool has_transitions(const TransitionTableIndex i,
1124  const SymbolNumber symbol) const;
1125  bool has_epsilons_or_flags(const TransitionTableIndex i);
1126  bool is_final(const TransitionTableIndex i);
1127  Weight final_weight(const TransitionTableIndex i) const;
1128  bool is_flag(const SymbolNumber symbol)
1129  { return alphabet.is_flag(symbol); }
1130  bool is_weighted(void)
1131  { return header.probe_flag(Weighted);}
1132 
1133  };*/
1134 
1135 class TreeNode
1136 {
1137 public:
1138  SymbolNumberVector string;
1139  unsigned int input_state;
1140  TransitionTableIndex mutator_state;
1141  TransitionTableIndex lexicon_state;
1142  hfst::FdState<SymbolNumber> flag_state;
1143  Weight weight;
1144 
1145  TreeNode(SymbolNumberVector prev_string,
1146  unsigned int i,
1147  TransitionTableIndex mutator,
1148  TransitionTableIndex lexicon,
1149  hfst::FdState<SymbolNumber> state,
1150  Weight w):
1151  string(prev_string),
1152  input_state(i),
1153  mutator_state(mutator),
1154  lexicon_state(lexicon),
1155  flag_state(state),
1156  weight(w)
1157  { }
1158 
1159  TreeNode(hfst::FdState<SymbolNumber> start_state): // starting state node
1160  string(SymbolNumberVector()),
1161  input_state(0),
1162  mutator_state(0),
1163  lexicon_state(0),
1164  flag_state(start_state),
1165  weight(0.0)
1166  { }
1167 
1168  TreeNode update_lexicon(SymbolNumber next_symbol,
1169  TransitionTableIndex next_lexicon,
1170  Weight weight);
1171 
1172  TreeNode update_mutator(SymbolNumber next_symbol,
1173  TransitionTableIndex next_mutator,
1174  Weight weight);
1175 
1176  void increment_mutator(void);
1177 
1178  TreeNode update(SymbolNumber next_symbol,
1179  unsigned int next_input,
1180  TransitionTableIndex next_mutator,
1181  TransitionTableIndex next_lexicon,
1182  Weight weight);
1183 
1184  TreeNode update(SymbolNumber next_symbol,
1185  TransitionTableIndex next_mutator,
1186  TransitionTableIndex next_lexicon,
1187  Weight weight);
1188 
1189 
1190 };
1191 
1192 typedef std::deque<TreeNode> TreeNodeQueue;
1193 
1194 int nByte_utf8(unsigned char c);
1195 
1196 class InputString
1197 {
1198 
1199 private:
1200  SymbolNumberVector s;
1201 
1202 public:
1203  InputString():
1204  s(SymbolNumberVector())
1205  { }
1206 
1207  bool initialize(const Encoder & encoder, char * input, SymbolNumber other);
1208 
1209  unsigned int len(void)
1210  {
1211  return s.size();
1212  }
1213 
1214  SymbolNumber operator[](unsigned int i)
1215  {
1216  return s[i];
1217  }
1218 
1219 };
1220 
1221 class AlphabetTranslationException: public std::runtime_error
1222 { // "what" should hold the first untranslatable symbol
1223 public:
1224 
1225  AlphabetTranslationException(const std::string what):
1226  std::runtime_error(what)
1227  { }
1228 };
1229 
1233 class Speller
1234 {
1235 public:
1236  Transducer * mutator;
1237  Transducer * lexicon;
1238  InputString input;
1239  TreeNodeQueue queue;
1240  SymbolNumberVector alphabet_translator;
1241 // hfst::FdTable<SymbolNumber> operations;
1242  std::vector<std::string> symbol_table;
1243 
1244  Speller(Transducer * mutator_ptr, Transducer * lexicon_ptr):
1245  mutator(mutator_ptr),
1246  lexicon(lexicon_ptr),
1247  input(InputString()),
1248  queue(TreeNodeQueue()),
1249  alphabet_translator(SymbolNumberVector()),
1250 // operations(lexicon->get_fd_table()),
1251  symbol_table(lexicon->get_symbol_table())
1252  {
1253  build_alphabet_translator();
1254  }
1255 
1256  bool init_input(char * str, const Encoder & encoder, SymbolNumber other);
1257 
1258  void build_alphabet_translator(void);
1259  void lexicon_epsilons(void);
1260  void mutator_epsilons(void);
1261  void consume_input(void);
1262  void lexicon_consume(void);
1265  bool check(char * line);
1268  CorrectionQueue correct(char * line);
1269  std::string stringify(SymbolNumberVector symbol_vector);
1270 };
1271 
1272 }
1273 
1274 #endif
A compiled transducer format, suitable for fast lookup operations.
Definition: transducer.h:833
#define HFST_THROW(E)
Macro to throw an exception of type E. Use THROW instead of regular throw with subclasses of HfstExce...
Definition: HfstExceptionDefs.h:40
A spellchecker, constructed from two optimized-lookup transducer instances. An alphabet translator is...
Definition: transducer.h:1233
std::set< HfstOneLevelPath > HfstOneLevelPaths
A set of simple paths.
Definition: HfstDataTypes.h:100
std::pair< float, StringVector > HfstOneLevelPath
A path of one level of arcs with collected weight.
Definition: HfstDataTypes.h:96
CorrectionQueue correct(char *line)
Definition: ospell.cc:290
bool check(char *line)
Definition: ospell.cc:334
Transducer has wrong type.
Definition: HfstExceptionDefs.h:400