active_matcher.tokenizer package

Submodules

active_matcher.tokenizer.tokenizer module

class active_matcher.tokenizer.tokenizer.AlphaNumericTokenizer

Bases: Tokenizer

Methods

out_col_name(input_col)

the name of the output column from the tokenizer e.g. for a 3gram tokenizer, the tokens from the name columns could be "3gram(name)".

tokenize(s)

convert the string into a BAG of tokens (tokens should not be deduped)

tokenize_set(s)

tokenize the string and return a set or None if the tokenize returns None

tokenize_spark(input_col)

return a column expression that gives the same output as the tokenize method.

NAME = 'alnum_tokens'
tokenize(s)

convert the string into a BAG of tokens (tokens should not be deduped)

class active_matcher.tokenizer.tokenizer.NumericTokenizer

Bases: Tokenizer

Methods

out_col_name(input_col)

the name of the output column from the tokenizer e.g. for a 3gram tokenizer, the tokens from the name columns could be "3gram(name)".

tokenize(s)

convert the string into a BAG of tokens (tokens should not be deduped)

tokenize_set(s)

tokenize the string and return a set or None if the tokenize returns None

tokenize_spark(input_col)

return a column expression that gives the same output as the tokenize method.

NAME = 'num_tokens'
tokenize(s)

convert the string into a BAG of tokens (tokens should not be deduped)

class active_matcher.tokenizer.tokenizer.QGramTokenizer(n)

Bases: Tokenizer

Methods

out_col_name(input_col)

the name of the output column from the tokenizer e.g. for a 3gram tokenizer, the tokens from the name columns could be "3gram(name)".

tokenize(s)

convert the string into a BAG of tokens (tokens should not be deduped)

tokenize_set(s)

tokenize the string and return a set or None if the tokenize returns None

tokenize_spark(input_col)

return a column expression that gives the same output as the tokenize method.

tokenize(s: str) list

convert the string into a BAG of tokens (tokens should not be deduped)

class active_matcher.tokenizer.tokenizer.ShingleTokenizer(n)

Bases: Tokenizer

Methods

base_tokenize(s)

convert the string into a BAG of tokens (tokens should not be deduped)

out_col_name(input_col)

the name of the output column from the tokenizer e.g. for a 3gram tokenizer, the tokens from the name columns could be "3gram(name)".

tokenize(s)

convert the string into a BAG of tokens (tokens should not be deduped)

tokenize_set(s)

tokenize the string and return a set or None if the tokenize returns None

tokenize_spark(input_col)

return a column expression that gives the same output as the tokenize method.

base_tokenize(s)

convert the string into a BAG of tokens (tokens should not be deduped)

tokenize(s: str) list

convert the string into a BAG of tokens (tokens should not be deduped)

class active_matcher.tokenizer.tokenizer.StrippedQGramTokenizer(n)

Bases: Tokenizer

Methods

out_col_name(input_col)

the name of the output column from the tokenizer e.g. for a 3gram tokenizer, the tokens from the name columns could be "3gram(name)".

tokenize(s)

convert the string into a BAG of tokens (tokens should not be deduped)

tokenize_set(s)

tokenize the string and return a set or None if the tokenize returns None

tokenize_spark(input_col)

return a column expression that gives the same output as the tokenize method.

RE = re.compile('\\W+')
tokenize(s: str) list

convert the string into a BAG of tokens (tokens should not be deduped)

class active_matcher.tokenizer.tokenizer.StrippedWhiteSpaceTokenizer

Bases: Tokenizer

Methods

out_col_name(input_col)

the name of the output column from the tokenizer e.g. for a 3gram tokenizer, the tokens from the name columns could be "3gram(name)".

tokenize(s)

convert the string into a BAG of tokens (tokens should not be deduped)

tokenize_set(s)

tokenize the string and return a set or None if the tokenize returns None

tokenize_spark(input_col)

return a column expression that gives the same output as the tokenize method.

NAME = 'stripped_whitespace_tokens'
RE = re.compile('[^a-z0-9 ]+')
WHITESPACE_NORM = re.compile('\\s+')
tokenize(s)

convert the string into a BAG of tokens (tokens should not be deduped)

class active_matcher.tokenizer.tokenizer.Tokenizer

Bases: ABC

Methods

out_col_name(input_col)

the name of the output column from the tokenizer e.g. for a 3gram tokenizer, the tokens from the name columns could be "3gram(name)".

tokenize(s)

convert the string into a BAG of tokens (tokens should not be deduped)

tokenize_set(s)

tokenize the string and return a set or None if the tokenize returns None

tokenize_spark(input_col)

return a column expression that gives the same output as the tokenize method.

out_col_name(input_col)

the name of the output column from the tokenizer e.g. for a 3gram tokenizer, the tokens from the name columns could be "3gram(name)".

abstractmethod tokenize(s)

convert the string into a BAG of tokens (tokens should not be deduped)

tokenize_set(s)

tokenize the string and return a set or None if the tokenize returns None

tokenize_spark(input_col: Column)

return a column expression that gives the same output as the tokenize method. required for efficiency when building metadata for certain methods

class active_matcher.tokenizer.tokenizer.WhiteSpaceTokenizer

Bases: Tokenizer

Methods

out_col_name(input_col)

the name of the output column from the tokenizer e.g. for a 3gram tokenizer, the tokens from the name columns could be "3gram(name)".

tokenize(s)

convert the string into a BAG of tokens (tokens should not be deduped)

tokenize_set(s)

tokenize the string and return a set or None if the tokenize returns None

tokenize_spark(input_col)

return a column expression that gives the same output as the tokenize method.

NAME = 'whitespace_tokens'
tokenize(s)

convert the string into a BAG of tokens (tokens should not be deduped)

active_matcher.tokenizer.vectorizer module

class active_matcher.tokenizer.vectorizer.SIFVectorizer

Bases: object

Methods

build_from_doc_freqs

init

out_col_name

vectorize

build_from_doc_freqs(doc_freqs)
init()
out_col_name(base)
vectorize(tokens)
class active_matcher.tokenizer.vectorizer.TFIDFVectorizer

Bases: object

Methods

build_from_doc_freqs

init

out_col_name

vectorize

build_from_doc_freqs(doc_freqs)
init()
out_col_name(base)
vectorize(tokens)

Module contents