HFST - Helsinki Finite-State Transducer Technology - Python API
version 3.11.0
|
A tokenizer for creating transducers from UTF-8 strings. More...
Public Member Functions | |
def | __init__ |
Create a tokenizer that recognizes utf-8 symbols. More... | |
def | add_multichar_symbol |
Add the multicharacter symbol `symbol` to this tokenizer. More... | |
def | add_skip_symbol |
Add a symbol to be skipped to this tokenizer. More... | |
def | check_utf8_correctness |
If input_string is not valid utf-8, throw an IncorrectUtf8CodingException. More... | |
def | tokenize |
Tokenize the string input_string. More... | |
def | tokenize |
Tokenize the string pair input_string : output_string. More... | |
def | tokenize_one_level |
Tokenize the string input_string. More... | |
def | tokenize_space_separated |
Tokenize str and skip all spaces. More... | |
A tokenizer for creating transducers from UTF-8 strings.
With an HfstTokenizer, it is possible to split UTF-8 strings into tuples of symbols which can then in turn be used to create transducers:
>>> tok = hfst.HfstTokenizer()
>>> tok.add_multichar_symbol('foo')
>>> tok.add_skip_symbol('b')
>>> tok.tokenize('foobar')
(('foo', 'foo'), ('a', 'a'), ('r', 'r'))
>>> tok.tokenize_one_level('foobar')
('foo', 'a', 'r')
>>> tok.tokenize('foobar','barfoo')
(('foo', 'a'), ('a', 'r'), ('r', 'foo'))
Strings are tokenized from left to right using longest match tokenization.
def __init__(self)
Create a tokenizer that recognizes utf-8 symbols.
tok = hfst.HfstTokenizer()
def add_multichar_symbol(self, symbol)
Add the multicharacter symbol `symbol` to this tokenizer.
Strings are always tokenized from left to right using longest match tokenization.
>>> tok = hfst.HfstTokenizer()
>>> tok.add_multichar_symbol('fo')
>>> tok.add_multichar_symbol('foo')
>>> tok.add_multichar_symbol('of')
>>> tok.tokenize_one_level('fofoofooof')
('fo', 'foo', 'foo', 'of')
def add_skip_symbol(self, symbol)
Add a symbol to be skipped to this tokenizer.
>>> tok = hfst.HfstTokenizer()
>>> tok.add_skip_symbol('foo')
>>> tok.tokenize_one_level('foofofoobar')
('f', 'o', 'b', 'a', 'r')
Note that both multicharacter symbols and skip symbols are matched from left to right using longest match tokenization:
>>> tok = hfst.HfstTokenizer()
>>> tok.add_multichar_symbol('foo')
>>> tok.add_skip_symbol('fo')
>>> tok.tokenize_one_level('foofo')
('foo',)
>>> tok = hfst.HfstTokenizer()
>>> tok.add_multichar_symbol('fo')
>>> tok.add_skip_symbol('foo')
>>> tok.tokenize_one_level('foofo')
('fo',)
def check_utf8_correctness(input_string)
If input_string is not valid utf-8, throw an IncorrectUtf8CodingException.
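A string is not valid if:
- It contains one of the unsigned bytes 192, 193, 245, 246 or 247.
- It is not made up of sequences of one initial byte (0xxxxxxx, 110xxxxx, 1110xxxx or 11110xxx) followed by the appropriate number of continuation bytes (10xxxxxx):
  - Initial bytes 0xxxxxxx represent ASCII characters and may not be followed by a continuation byte.
  - Initial bytes 110xxxxx are followed by exactly one continuation byte.
  - Initial bytes 1110xxxx are followed by exactly two continuation bytes.
  - Initial bytes 11110xxx are followed by exactly three continuation bytes.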
try:
    hfst.HfstTokenizer.check_utf8_correctness('föö')
except hfst.exceptions.IncorrectUtf8CodingException as e:
    print('FAIL')
This is a static method: call it on the class itself, without creating a tokenizer instance.
def tokenize(self, input_string)
Tokenize the string input_string into a tuple of symbol pairs, where each symbol is paired with itself.
>>> tok = hfst.HfstTokenizer()
>>> t = tok.tokenize('foobar')
>>> print(t)
(('f', 'f'), ('o', 'o'), ('o', 'o'), ('b', 'b'), ('a', 'a'), ('r', 'r'))
def tokenize(self, input_string, output_string)
Tokenize the string pair input_string : output_string.
If one string has more tokens than the other, epsilons are appended to the end of the tokenized string with fewer tokens so that both tokenized strings have the same number of tokens.
>>> tok = hfst.HfstTokenizer()
>>> tok.add_multichar_symbol('foo')
>>> tok.add_skip_symbol('b')
>>> tok.tokenize('foobar','Foobar')
(('foo', 'F'), ('a', 'o'), ('r', 'o'), ('@_EPSILON_SYMBOL_@', 'a'), ('@_EPSILON_SYMBOL_@', 'r'))
def tokenize_one_level(self, input_string)
Tokenize the string input_string into a flat tuple of symbols (one level only, not symbol pairs).
>>> tok = hfst.HfstTokenizer()
>>> t = tok.tokenize_one_level('foobar')
>>> print(t)
('f', 'o', 'o', 'b', 'a', 'r')
def tokenize_space_separated(self, str)
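Tokenize str using spaces as token separators: each space-delimited substring becomes one symbol, and the spaces themselves are skipped.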
>>> tok = hfst.HfstTokenizer()
>>> tok.tokenize_space_separated('f o o b a r')
(('f', 'f'), ('o', 'o'), ('o', 'o'), ('b', 'b'), ('a', 'a'), ('r', 'r'))
>>> tok.tokenize_space_separated('foo b a r')
(('foo', 'foo'), ('b', 'b'), ('a', 'a'), ('r', 'r'))
>>> tok.tokenize_space_separated('f o o bar')
(('f', 'f'), ('o', 'o'), ('o', 'o'), ('bar', 'bar'))
>>> tok.tokenize_space_separated('foo bar')
(('foo', 'foo'), ('bar', 'bar'))
>>> tok.tokenize_space_separated('foobar')
(('foobar', 'foobar'),)
Note that skip symbols and multicharacter symbols defined with add_skip_symbol and add_multichar_symbol have no effect when tokenize_space_separated is called.