Source code for py_entitymatching.sampler.single_table

# coding=utf-8
"""
This module contains sampling.rst related routines for a single table.
"""
import logging

import pandas as pd
import numpy as np
import six

import py_entitymatching.catalog.catalog_manager as cm
import py_entitymatching.utils.catalog_helper as ch
from py_entitymatching.utils.validation_helper import validate_object_type

logger = logging.getLogger(__name__)


# sample one table using random sampling.rst
[docs]def sample_table(table, sample_size, replace=False, verbose=False): """ Samples a candidate set of tuple pairs (for labeling purposes). This function samples a DataFrame, typically used for labeling purposes. This function expects the input DataFrame containing the metadata of a candidate set (such as key, fk_ltable, fk_rtable, ltable, rtable). Specifically, this function creates a copy of the input DataFrame, samples the data using uniform random sampling (uses 'random' function from numpy to sample) and returns the sampled DataFrame. Further, also copies the properties from the input DataFrame to the output DataFrame. Args: table (DataFrame): The input DataFrame to be sampled. Specifically, a DataFrame containing the metadata of a candidate set (such as key, fk_ltable, fk_rtable, ltable, rtable) in the catalog. sample_size (int): The number of samples to be picked from the input DataFrame. replace (boolean): A flag to indicate whether sampling should be done with replacement or not (defaults to False). verbose (boolean): A flag to indicate whether more detailed information about the execution steps should be printed out (defaults to False). Returns: A new DataFrame with 'sample_size' number of rows. Further, this function sets the output DataFrame's properties same as input DataFrame. Raises: AssertionError: If `table` is not of type pandas DataFrame. AssertionError: If the size of `table` is 0. AssertionError: If the `sample_size` is greater than the input DataFrame size. Examples: >>> import py_entitymatching as em >>> S = em.sample_table(C, sample_size=450) # C is the candidate set to be sampled from. Note: As mentioned in the above description, the output DataFrame is updated (in the catalog) with the properties from the input DataFrame. A subtle point to note here is, when the replace flag is set to True, then the output DataFrame can contain duplicate keys. In that case, this function will not set the key and it is up to the user to fix it after the function returns. """ # Validate input parameters. # # The input DataFrame is expected to be of type pandas DataFrame. validate_object_type(table, pd.DataFrame) # # There should at least not-zero rows to sample from if len(table) == 0: logger.error('Size of the input table is 0') raise AssertionError('Size of the input table is 0') # # The sample size should be less than or equal to the number of rows in # the input DataFrame if len(table) < sample_size: logger.error('Sample size is larger than the input table size') raise AssertionError('Sample size is larger than the input table size') # Now, validate the metadata for the input DataFrame as we have to copy # these properties to the output DataFrame # # First, display what metadata is required for this function ch.log_info(logger, 'Required metadata: cand.set key, fk ltable, ' 'fk rtable, ltable, rtable, ltable key, rtable key', verbose) # # Second, get the metadata key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \ cm.get_metadata_for_candset(table, logger, verbose) # # Third, validate the metadata cm._validate_metadata_for_candset(table, key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key, logger, verbose) # Get the sample set for the output table sample_indices = np.random.choice(len(table), sample_size, replace=replace) # Sort the indices ordered by index value sample_indices = sorted(sample_indices) sampled_table = table.iloc[list(sample_indices)] # Copy the properties cm.init_properties(sampled_table) # # If the replace is set to True, then we should check for the validity # of key before setting it if replace: properties = cm.get_all_properties(table) for property_name, property_value in six.iteritems(properties): if property_name == 'key': # Check for the validity of key before setting it cm.set_key(sampled_table, property_value) else: # Copy the other properties as is cm.set_property(sampled_table, property_name, property_value) else: cm.copy_properties(table, sampled_table) # Return the sampled table return sampled_table