delex.index package

Submodules

delex.index.filtered_set_sim_index module

class delex.index.filtered_set_sim_index.FilteredSetSimIndex(sim, threshold, max_slice_size=16384)

Bases: SparkDistributable

an optimized memory mapped index for set similarity measures

Methods

deinit()

deinitialize the object, closing resources (e.g. file handles).

init()

initialize the object to be used in a spark worker

search(tokens, thres)

search the index with tokens and retrieve all ids with score > thres

to_spark()

send the obj to the spark cluster to be used on spark workers

CacheKey

build

from_sparse_mat

size_in_bytes

class CacheKey(index_col: str, search_col: str | None, tokenizer_type: str, sim: str, threshold: float)

Bases: CachedObjectKey

index_col: str
search_col: str | None
sim: str
threshold: float
tokenizer_type: str
SLICE_TYPES = {'cosine': <class 'delex.index.filtered_set_sim_index_slice.CosineSetSimIndexSlice'>, 'jaccard': <class 'delex.index.filtered_set_sim_index_slice.JaccardSetSimIndexSlice'>}
build(df, token_col, id_col='_id')
deinit()

deinitialize the object, closing resources (e.g. file handles)

classmethod from_sparse_mat(sparse_mat, sim, threshold, ids=None)
init()

initialize the object to be used in a spark worker

search(tokens: ndarray, thres: float) Tuple[ndarray, ndarray]

search the index with tokens and retrieve all ids with score > thres

Parameters:
tokens : np.ndarray of np.int32

the tokens for searching

thres : float

the minimum threshold to retrieve

Returns:
np.ndarray np.int64

the ids from the index with score that satisfies the threshold

size_in_bytes() int
to_spark()

send the obj to the spark cluster to be used on spark workers

delex.index.filtered_set_sim_index_slice module

class delex.index.filtered_set_sim_index_slice.CosineSetSimIndexSlice(*args, **kwargs)

Bases: CosineSetSimIndexSlice

Methods

class_type

search

class_type = jitclass.CosineSetSimIndexSlice#15ac50aa0<nrow:int32,thres:float32,data:array(int16, 1d, A),set_data:array(int32, 1d, A),size:array(int32, 1d, A),set_indptr:array(int32, 1d, A),span_map:unaligned array(Record(hash[type=int32;offset=0],offset[type=int32;offset=4],len[type=int16;offset=8];10;False), 1d, A),offset:int32>
class delex.index.filtered_set_sim_index_slice.FilteredSetSimIndexSlice(nrow, thres, set_data, set_indptr, offset, data=None, span_map=None)

Bases: object

Methods

search

search(indexes, thres, scores_out, indexes_out)
class delex.index.filtered_set_sim_index_slice.JaccardSetSimIndexSlice(*args, **kwargs)

Bases: JaccardSetSimIndexSlice

Methods

class_type

search

class_type = jitclass.JaccardSetSimIndexSlice#1597eb650<nrow:int32,thres:float32,data:array(int16, 1d, A),set_data:array(int32, 1d, A),size:array(int32, 1d, A),set_indptr:array(int32, 1d, A),span_map:unaligned array(Record(hash[type=int32;offset=0],offset[type=int32;offset=4],len[type=int16;offset=8];10;False), 1d, A),offset:int32>

delex.index.hash_index module

class delex.index.hash_index.HashIndex

Bases: SparkDistributable

a memory mapped hash index to be used on Spark

Methods

build(index_table, index_col[, id_col])

build the index over index_col of index_table using id_col as a unique id

deinit()

deinitialize the object, closing resources (e.g. file handles).

fetch(key)

fetch all records with key, return None if entry doesn't exist in index

init()

initialize the object to be used in a spark worker

to_spark()

send the obj to the spark cluster to be used on spark workers

CacheKey

size_in_bytes

class CacheKey(index_col: str, lowercase: bool)

Bases: CachedObjectKey

index_col: str
lowercase: bool
build(index_table: DataFrame, index_col: str, id_col: str = '_id')

build the index over index_col of index_table using id_col as a unique id

Parameters:
index_table : pyspark.sql.DataFrame

the dataframe that will be preprocessed / indexed

index_col : str

the name of the string column to be indexed

id_col : str

the name of the unique id column in index_table

deinit()

deinitialize the object, closing resources (e.g. file handles)

fetch(key: str) ndarray | None

fetch all records with key, return None if entry doesn’t exist in index

Parameters:
key : str

the key to retrieve

Returns:
a numpy array of ids if key is found else None
init()

initialize the object to be used in a spark worker

size_in_bytes()
to_spark()

send the obj to the spark cluster to be used on spark workers

delex.index.set_sim_index module

class delex.index.set_sim_index.SetSimIndex

Bases: object

Methods

cosine_threshold

from_sparse_mat

init

jaccard_threshold

overlap_coeff_threshold

to_spark

cosine_threshold(indexes, thres)
classmethod from_sparse_mat(sparse_mat)
init()
jaccard_threshold(indexes, thres)
overlap_coeff_threshold(indexes, thres)
to_spark()
class delex.index.set_sim_index.SetSimIndexSlice(*args, **kwargs)

Bases: SetSimIndexSlice

a reference class for set similarity metrics, DO NOT USE THIS

Methods

class_type

cosine_threshold

jaccard_threshold

overlap_coeff_threshold

class_type = jitclass.SetSimIndexSlice#15ac51850<nrow:int32,ncol:int32,data:array(int16, 1d, A),size:array(int32, 1d, A),indptr:array(int32, 1d, A),offset:int32>

Module contents