delex package

delex.tokenizer module

class delex.tokenizer.AlphaNumericTokenizer

Bases: Tokenizer

Attributes:
nunique_tokens

Methods

deinit()

deinitialize the object, closing resources (e.g. file handles).

init()

initialize the object to be used on a Spark worker.

to_spark()

send the object to the Spark cluster to be used on Spark workers.

tokenize_set(s)

tokenize the string and return a set of tokens, or None if tokenize returns None.

tokenize_set_spark(input_col)

return a column expression that gives the same output as the tokenize method.

tokenize_spark(input_col)

return a column expression that gives the same output as the tokenize method.

CacheKey

build

tokenize

NAME = 'alnum_tokens'
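
Example (a minimal local-usage sketch; the sample string, and calling init()/deinit() around driver-side calls, are assumptions not specified by this reference):

    from delex.tokenizer import AlphaNumericTokenizer

    tok = AlphaNumericTokenizer()
    tok.init()                                          # prepare the tokenizer for use
    tokens = tok.tokenize("Model X-200 rev 3")          # array of alphanumeric tokens
    token_set = tok.tokenize_set("Model X-200 rev 3")   # set form, or None
    tok.deinit()                                        # release resources (e.g. file handles)
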
class delex.tokenizer.NumericTokenizer

Bases: Tokenizer

Attributes:
nunique_tokens

Methods

deinit()

deinitialize the object, closing resources (e.g. file handles).

init()

initialize the object to be used on a Spark worker.

to_spark()

send the object to the Spark cluster to be used on Spark workers.

tokenize_set(s)

tokenize the string and return a set of tokens, or None if tokenize returns None.

tokenize_set_spark(input_col)

return a column expression that gives the same output as the tokenize method.

tokenize_spark(input_col)

return a column expression that gives the same output as the tokenize method.

CacheKey

build

tokenize

NAME = 'num_tokens'
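
Example (a sketch contrasting NumericTokenizer with AlphaNumericTokenizer on the same string; the exact tokens each returns are an assumption, only the method signatures come from this reference):

    from delex.tokenizer import AlphaNumericTokenizer, NumericTokenizer

    alnum = AlphaNumericTokenizer()
    num = NumericTokenizer()
    for t in (alnum, num):
        t.init()

    s = "ipad 4th generation 16gb"     # hypothetical record value
    alnum_tokens = alnum.tokenize(s)   # alphanumeric tokens
    num_tokens = num.tokenize(s)       # numeric tokens only (assumption)

    for t in (alnum, num):
        t.deinit()
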
class delex.tokenizer.QGramTokenizer(q, use_freqs: bool = True)

Bases: Tokenizer

Attributes:
nunique_tokens

Methods

deinit()

deinitialize the object, closing resources (e.g. file handles).

init()

initialize the object to be used on a Spark worker.

to_spark()

send the object to the Spark cluster to be used on Spark workers.

tokenize_set(s)

tokenize the string and return a set of tokens, or None if tokenize returns None.

tokenize_set_spark(input_col)

return a column expression that gives the same output as the tokenize method.

tokenize_spark(input_col)

return a column expression that gives the same output as the tokenize method.

CacheKey

build

tokenize
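
Example (a sketch of constructing a q-gram tokenizer; q=3 is arbitrary, and whether the q-grams are character-level or padded is not stated in this reference):

    from delex.tokenizer import QGramTokenizer

    tok = QGramTokenizer(q=3, use_freqs=True)  # use_freqs controls token-frequency tracking (per the signature)
    tok.init()
    grams = tok.tokenize("acme corp")          # array of overlapping q-grams (assumption)
    gram_set = tok.tokenize_set("acme corp")   # set form, or None
    tok.deinit()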

class delex.tokenizer.ShingleTokenizer(n)

Bases: Tokenizer

Attributes:
nunique_tokens

Methods

deinit()

deinitialize the object, closing resources (e.g. file handles).

init()

initialize the object to be used on a Spark worker.

to_spark()

send the object to the Spark cluster to be used on Spark workers.

tokenize_set(s)

tokenize the string and return a set of tokens, or None if tokenize returns None.

tokenize_set_spark(input_col)

return a column expression that gives the same output as the tokenize method.

tokenize_spark(input_col)

return a column expression that gives the same output as the tokenize method.

CacheKey

base_tokenize

build

tokenize

base_tokenize(s: str) → array
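
Example (a sketch assuming tokenize(s) builds shingles of n consecutive base tokens and base_tokenize(s) exposes the underlying token stream; that reading is inferred from the names, not stated in this reference):

    from delex.tokenizer import ShingleTokenizer

    tok = ShingleTokenizer(n=2)
    tok.init()
    base = tok.base_tokenize("new york city hall")  # underlying tokens (array)
    shingles = tok.tokenize("new york city hall")   # 2-token shingles (assumption)
    tok.deinit()
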
class delex.tokenizer.StrippedQGramTokenizer(q, use_freqs: bool = True)

Bases: Tokenizer

Attributes:
nunique_tokens

Methods

deinit()

deinitialize the object, closing resources (e.g. file handles).

init()

initialize the object to be used on a Spark worker.

to_spark()

send the object to the Spark cluster to be used on Spark workers.

tokenize_set(s)

tokenize the string and return a set of tokens, or None if tokenize returns None.

tokenize_set_spark(input_col)

return a column expression that gives the same output as the tokenize method.

tokenize_spark(input_col)

return a column expression that gives the same output as the tokenize method.

CacheKey

build

tokenize

RE = re.compile('\\W+')
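
Example (the RE pattern above suggests runs of non-word characters are stripped or normalized before q-grams are produced; a sketch under that assumption, comparing with the plain QGramTokenizer):

    from delex.tokenizer import QGramTokenizer, StrippedQGramTokenizer

    plain = QGramTokenizer(q=3)
    stripped = StrippedQGramTokenizer(q=3)
    for t in (plain, stripped):
        t.init()

    s = "A.B. Smith & Co."
    plain_grams = plain.tokenize(s)        # q-grams over the raw string
    stripped_grams = stripped.tokenize(s)  # q-grams after \W+ stripping (assumption)

    for t in (plain, stripped):
        t.deinit()
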
class delex.tokenizer.StrippedWhiteSpaceTokenizer(use_freqs: bool = True)

Bases: Tokenizer

Attributes:
nunique_tokens

Methods

deinit()

deinitialize the object, closing resources (e.g. file handles).

init()

initialize the object to be used on a Spark worker.

to_spark()

send the object to the Spark cluster to be used on Spark workers.

tokenize_set(s)

tokenize the string and return a set of tokens, or None if tokenize returns None.

tokenize_set_spark(input_col)

return a column expression that gives the same output as the tokenize method.

tokenize_spark(input_col)

return a column expression that gives the same output as the tokenize method.

CacheKey

build

tokenize

NAME = 'stripped_whitespace_tokens'
RE = re.compile('[^a-z0-9 ]+')
WHITESPACE_NORM = re.compile('\\s+')
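
Example (the RE and WHITESPACE_NORM patterns suggest the input is lowercased, reduced to characters in [a-z0-9 ], and whitespace-normalized before splitting; the regex-only sketch below approximates that normalization and is not the library's actual code path):

    import re

    RE = re.compile('[^a-z0-9 ]+')         # same pattern as StrippedWhiteSpaceTokenizer.RE
    WHITESPACE_NORM = re.compile('\\s+')   # same pattern as WHITESPACE_NORM

    def stripped_whitespace_tokens(s: str) -> list[str]:
        # approximation of the implied normalization (assumption about the order of steps)
        s = WHITESPACE_NORM.sub(' ', s.lower())
        s = RE.sub('', s)
        return s.split()

    print(stripped_whitespace_tokens("Widget-3000, Blue/Green"))
    # ['widget3000', 'bluegreen']  (illustrative only)
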
class delex.tokenizer.Tokenizer(use_freqs: bool = True)

Bases: SparkDistributable

Attributes:
nunique_tokens

Methods

deinit()

deinitialize the object, closing resources (e.g. file handles).

init()

initialize the object to be used on a Spark worker.

to_spark()

send the object to the Spark cluster to be used on Spark workers.

tokenize_set(s)

tokenize the string and return a set of tokens, or None if tokenize returns None.

tokenize_set_spark(input_col)

return a column expression that gives the same output as the tokenize method.

tokenize_spark(input_col)

return a column expression that gives the same output as the tokenize method.

CacheKey

build

tokenize

class CacheKey(index_col: str, search_col: str | None, tokenizer_type: str)

Bases: CachedObjectKey

index_col: str
search_col: str | None
tokenizer_type: str
build(df: DataFrame, col: str)
deinit()

deinitialize the object, closing resources (e.g. file handles).

init()

initialize the object to be used on a Spark worker.

property nunique_tokens
to_spark()

send the object to the Spark cluster to be used on Spark workers.

tokenize(s: str) → array
tokenize_set(s: str) → array

tokenize the string and return a set of tokens, or None if tokenize returns None.

tokenize_set_spark(input_col: Column)

return a column expression that gives the same output as the tokenize method. Required for efficiency when building metadata for certain methods.

tokenize_spark(input_col: Column)

return a column expression that gives the same output as the tokenize method. Required for efficiency when building metadata for certain methods.
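
Example (a sketch of the Spark-side workflow implied by build(), to_spark(), and the *_spark column-expression methods; the DataFrame contents, the column name, and the exact order of the build/to_spark calls are assumptions):

    import pyspark.sql.functions as F
    from pyspark.sql import SparkSession

    from delex.tokenizer import WhiteSpaceTokenizer

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame(
        [(1, "acme corp"), (2, "acme corporation")],
        ["_id", "name"],
    )

    tok = WhiteSpaceTokenizer()
    tok.build(df, "name")   # gather whatever metadata the tokenizer needs from the column
    tok.to_spark()          # ship the tokenizer to the Spark workers

    # column expressions mirroring tokenize / tokenize_set on the driver
    out = (
        df.withColumn("name_tokens", tok.tokenize_spark(F.col("name")))
          .withColumn("name_token_set", tok.tokenize_set_spark(F.col("name")))
    )
    out.show(truncate=False)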

class delex.tokenizer.WhiteSpaceTokenizer

Bases: Tokenizer

Attributes:
nunique_tokens

Methods

deinit()

deinitialize the object, closing resources (e.g. file handles).

init()

initialize the object to be used on a Spark worker.

to_spark()

send the object to the Spark cluster to be used on Spark workers.

tokenize_set(s)

tokenize the string and return a set of tokens, or None if tokenize returns None.

tokenize_set_spark(input_col)

return a column expression that gives the same output as the tokenize method.

tokenize_spark(input_col)

return a column expression that gives the same output as the tokenize method.

CacheKey

build

tokenize

NAME = 'whitespace_tokens'
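
Example (the NAME constants look like stable identifiers for each tokenizer type, and CacheKey takes a tokenizer_type string; pairing the two as below is an assumption, only the CacheKey signature comes from this reference):

    from delex.tokenizer import Tokenizer, WhiteSpaceTokenizer

    key = Tokenizer.CacheKey(
        index_col="name",                          # column the tokens are indexed over
        search_col=None,                           # optional, may be None per the signature
        tokenizer_type=WhiteSpaceTokenizer.NAME,   # 'whitespace_tokens' (assumption)
    )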