delex package

delex.tokenizer module

class delex.tokenizer.AlphaNumericTokenizer

Bases: Tokenizer

Attributes:
nunique_tokens

Methods

deinit()

deinitialize the object, closing resources (e.g. file handles).

init()

initialize the object to be used on a Spark worker.

to_spark()

send the object to the Spark cluster to be used on Spark workers.

tokenize_set(s)

tokenize the string and return a set of tokens, or None if tokenize returns None.

tokenize_set_spark(input_col)

return a column expression that gives the same output as the tokenize method.

tokenize_spark(input_col)

return a column expression that gives the same output as the tokenize method.

CacheKey

build

tokenize

NAME = 'alnum_tokens'
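
Example (a minimal local-usage sketch; the sample string, and calling init()/deinit() around driver-side calls, are assumptions not specified by this reference):

    from delex.tokenizer import AlphaNumericTokenizer

    tok = AlphaNumericTokenizer()
    tok.init()                                          # prepare the tokenizer for use
    tokens = tok.tokenize("Model X-200 rev 3")          # array of alphanumeric tokens
    token_set = tok.tokenize_set("Model X-200 rev 3")   # set form, or None
    tok.deinit()                                        # release resources (e.g. file handles)
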
class delex.tokenizer.NumericTokenizer

Bases: Tokenizer

Attributes:
nunique_tokens

Methods

deinit()

deinitialize the object, closing resources (e.g. file handles).

init()

initialize the object to be used on a Spark worker.

to_spark()

send the object to the Spark cluster to be used on Spark workers.

tokenize_set(s)

tokenize the string and return a set of tokens, or None if tokenize returns None.

tokenize_set_spark(input_col)

return a column expression that gives the same output as the tokenize method.

tokenize_spark(input_col)

return a column expression that gives the same output as the tokenize method.

CacheKey

build

tokenize

NAME = 'num_tokens'
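
Example (a sketch contrasting NumericTokenizer with AlphaNumericTokenizer on the same string; the exact tokens each returns are an assumption, only the method signatures come from this reference):

    from delex.tokenizer import AlphaNumericTokenizer, NumericTokenizer

    alnum = AlphaNumericTokenizer()
    num = NumericTokenizer()
    for t in (alnum, num):
        t.init()

    s = "ipad 4th generation 16gb"     # hypothetical record value
    alnum_tokens = alnum.tokenize(s)   # alphanumeric tokens
    num_tokens = num.tokenize(s)       # numeric tokens only (assumption)

    for t in (alnum, num):
        t.deinit()
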
class delex.tokenizer.QGramTokenizer(q, use_freqs: bool = True)

Bases: Tokenizer

Attributes:
nunique_tokens

Methods

deinit()

deinitialize the object, closing resources (e.g. file handles).

init()

initialize the object to be used on a Spark worker.

to_spark()

send the object to the Spark cluster to be used on Spark workers.

tokenize_set(s)

tokenize the string and return a set of tokens, or None if tokenize returns None.

tokenize_set_spark(input_col)

return a column expression that gives the same output as the tokenize method.

tokenize_spark(input_col)

return a column expression that gives the same output as the tokenize method.

CacheKey

build

tokenize
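
Example (a sketch of constructing a q-gram tokenizer; q=3 is arbitrary, and whether the q-grams are character-level or padded is not stated in this reference):

    from delex.tokenizer import QGramTokenizer

    tok = QGramTokenizer(q=3, use_freqs=True)  # use_freqs controls token-frequency tracking (per the signature)
    tok.init()
    grams = tok.tokenize("acme corp")          # array of overlapping q-grams (assumption)
    gram_set = tok.tokenize_set("acme corp")   # set form, or None
    tok.deinit()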

class delex.tokenizer.ShingleTokenizer(n)

Bases: Tokenizer

Attributes:
nunique_tokens

Methods

deinit()

deinitialize the object, closing resources (e.g. file handles).

init()

initialize the object to be used on a Spark worker.

to_spark()

send the object to the Spark cluster to be used on Spark workers.

tokenize_set(s)

tokenize the string and return a set of tokens, or None if tokenize returns None.

tokenize_set_spark(input_col)

return a column expression that gives the same output as the tokenize method.

tokenize_spark(input_col)

return a column expression that gives the same output as the tokenize method.

CacheKey

base_tokenize

build

tokenize

base_tokenize(s: str) → array
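
Example (a sketch assuming tokenize(s) builds shingles of n consecutive base tokens and base_tokenize(s) exposes the underlying token stream; that reading is inferred from the names, not stated in this reference):

    from delex.tokenizer import ShingleTokenizer

    tok = ShingleTokenizer(n=2)
    tok.init()
    base = tok.base_tokenize("new york city hall")  # underlying tokens (array)
    shingles = tok.tokenize("new york city hall")   # 2-token shingles (assumption)
    tok.deinit()
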
class delex.tokenizer.StrippedQGramTokenizer(q, use_freqs: bool = True)

Bases: Tokenizer

Attributes:
nunique_tokens

Methods

deinit()

deinitialize the object, closing resources (e.g. file handles).

init()

initialize the object to be used on a Spark worker.

to_spark()

send the object to the Spark cluster to be used on Spark workers.

tokenize_set(s)

tokenize the string and return a set of tokens, or None if tokenize returns None.

tokenize_set_spark(input_col)

return a column expression that gives the same output as the tokenize method.

tokenize_spark(input_col)

return a column expression that gives the same output as the tokenize method.

CacheKey

build

tokenize

RE = re.compile('\\W+')
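
Example (the RE pattern above suggests runs of non-word characters are stripped or normalized before q-grams are produced; a sketch under that assumption, comparing with the plain QGramTokenizer):

    from delex.tokenizer import QGramTokenizer, StrippedQGramTokenizer

    plain = QGramTokenizer(q=3)
    stripped = StrippedQGramTokenizer(q=3)
    for t in (plain, stripped):
        t.init()

    s = "A.B. Smith & Co."
    plain_grams = plain.tokenize(s)        # q-grams over the raw string
    stripped_grams = stripped.tokenize(s)  # q-grams after \W+ stripping (assumption)

    for t in (plain, stripped):
        t.deinit()
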
class delex.tokenizer.StrippedWhiteSpaceTokenizer(use_freqs: bool = True)

Bases: Tokenizer

Attributes:
nunique_tokens

Methods

deinit()

deinitialize the object, closing resources (e.g. file handles).

init()

initialize the object to be used on a Spark worker.

to_spark()

send the object to the Spark cluster to be used on Spark workers.

tokenize_set(s)

tokenize the string and return a set of tokens, or None if tokenize returns None.

tokenize_set_spark(input_col)

return a column expression that gives the same output as the tokenize method.

tokenize_spark(input_col)

return a column expression that gives the same output as the tokenize method.

CacheKey

build

tokenize

NAME = 'stripped_whitespace_tokens'
RE = re.compile('[^a-z0-9 ]+')
WHITESPACE_NORM = re.compile('\\s+')
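
Example (the RE and WHITESPACE_NORM patterns suggest the input is lowercased, reduced to characters in [a-z0-9 ], and whitespace-normalized before splitting; the regex-only sketch below approximates that normalization and is not the library's actual code path):

    import re

    RE = re.compile('[^a-z0-9 ]+')         # same pattern as StrippedWhiteSpaceTokenizer.RE
    WHITESPACE_NORM = re.compile('\\s+')   # same pattern as WHITESPACE_NORM

    def stripped_whitespace_tokens(s: str) -> list[str]:
        # approximation of the implied normalization (assumption about the order of steps)
        s = WHITESPACE_NORM.sub(' ', s.lower())
        s = RE.sub('', s)
        return s.split()

    print(stripped_whitespace_tokens("Widget-3000, Blue/Green"))
    # ['widget3000', 'bluegreen']  (illustrative only)
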
class delex.tokenizer.Tokenizer(use_freqs: bool = True)

Bases: SparkDistributable

Attributes:
nunique_tokens

Methods

deinit()

deinitialize the object, closing resources (e.g. file handles).

init()

initialize the object to be used on a Spark worker.

to_spark()

send the object to the Spark cluster to be used on Spark workers.

tokenize_set(s)

tokenize the string and return a set of tokens, or None if tokenize returns None.

tokenize_set_spark(input_col)

return a column expression that gives the same output as the tokenize method.

tokenize_spark(input_col)

return a column expression that gives the same output as the tokenize method.

CacheKey

build

tokenize

class CacheKey(index_col: str, search_col: str | None, tokenizer_type: str)

Bases: CachedObjectKey

index_col: str
search_col: str | None
tokenizer_type: str
build(df: DataFrame, col: str)
deinit()

deinitialize the object, closing resources (e.g. file handles).

init()

initialize the object to be used on a Spark worker.

property nunique_tokens
to_spark()

send the object to the Spark cluster to be used on Spark workers.

tokenize(s: str) → array
tokenize_set(s: str) → array

tokenize the string and return a set of tokens, or None if tokenize returns None.

tokenize_set_spark(input_col: Column)

return a column expression that gives the same output as the tokenize method. Required for efficiency when building metadata for certain methods.

tokenize_spark(input_col: Column)

return a column expression that gives the same output as the tokenize method. Required for efficiency when building metadata for certain methods.
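
Example (a sketch of the Spark-side workflow implied by build(), to_spark(), and the *_spark column-expression methods; the DataFrame contents, the column name, and the exact order of the build/to_spark calls are assumptions):

    import pyspark.sql.functions as F
    from pyspark.sql import SparkSession

    from delex.tokenizer import WhiteSpaceTokenizer

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame(
        [(1, "acme corp"), (2, "acme corporation")],
        ["_id", "name"],
    )

    tok = WhiteSpaceTokenizer()
    tok.build(df, "name")   # gather whatever metadata the tokenizer needs from the column
    tok.to_spark()          # ship the tokenizer to the Spark workers

    # column expressions mirroring tokenize / tokenize_set on the driver
    out = (
        df.withColumn("name_tokens", tok.tokenize_spark(F.col("name")))
          .withColumn("name_token_set", tok.tokenize_set_spark(F.col("name")))
    )
    out.show(truncate=False)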

class delex.tokenizer.WhiteSpaceTokenizer

Bases: Tokenizer

Attributes:
nunique_tokens

Methods

deinit()

deinitialize the object, closing resources (e.g. file handles).

init()

initialize the object to be used on a Spark worker.

to_spark()

send the object to the Spark cluster to be used on Spark workers.

tokenize_set(s)

tokenize the string and return a set of tokens, or None if tokenize returns None.

tokenize_set_spark(input_col)

return a column expression that gives the same output as the tokenize method.

tokenize_spark(input_col)

return a column expression that gives the same output as the tokenize method.

CacheKey

build

tokenize

NAME = 'whitespace_tokens'
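
Example (the NAME constants look like stable identifiers for each tokenizer type, and CacheKey takes a tokenizer_type string; pairing the two as below is an assumption, only the CacheKey signature comes from this reference):

    from delex.tokenizer import Tokenizer, WhiteSpaceTokenizer

    key = Tokenizer.CacheKey(
        index_col="name",                          # column the tokens are indexed over
        search_col=None,                           # optional, may be None per the signature
        tokenizer_type=WhiteSpaceTokenizer.NAME,   # 'whitespace_tokens' (assumption)
    )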