# Source code for py_entitymatching.catalog.catalog_manager

# coding=utf-8
"""
This module contains wrapper functions for the catalog.
"""
import logging

import pandas as pd
import six

import py_entitymatching.utils.catalog_helper as ch
from py_entitymatching.catalog.catalog import Catalog
from py_entitymatching.utils.validation_helper import validate_object_type

logger = logging.getLogger(__name__)


def get_property(data_frame, property_name):
    """
    Gets the value of a named property for a pandas DataFrame from the
    catalog.

    Args:
        data_frame (DataFrame): The DataFrame whose property should be
            retrieved.
        property_name (string): The name of the property to retrieve.

    Returns:
        The value stored in the catalog for the property; typically a string
        or a pandas DataFrame, depending on the property name.

    Raises:
        AssertionError: If `data_frame` is not of type pandas DataFrame.
        AssertionError: If `property_name` is not of type string.
        KeyError: If `data_frame` information is not present in the catalog.
        KeyError: If the requested property is not present for `data_frame`
            in the catalog.

    Examples:
        >>> import py_entitymatching as em
        >>> import pandas as pd
        >>> A = pd.DataFrame({'id' : [1, 2], 'colA':['a', 'b'], 'colB' : [10, 20]})
        >>> em.set_key(A, 'id')
        >>> em.get_property(A, 'key')
        # id
    """
    # Type-check both arguments before touching the catalog.
    validate_object_type(data_frame, pd.DataFrame)
    validate_object_type(property_name, six.string_types,
                         error_prefix='Property name')

    # The catalog singleton answers both presence checks and the lookup.
    catalog = Catalog.Instance()

    # The DataFrame must already have an entry in the catalog.
    if not catalog.is_df_info_present_in_catalog(data_frame):
        missing_df_msg = 'DataFrame information is not present in the catalog'
        logger.error(missing_df_msg)
        raise KeyError(missing_df_msg)

    # The requested property must exist for that DataFrame.
    if not catalog.is_property_present_for_df(data_frame, property_name):
        missing_property_msg = (
            'Requested metadata ( %s ) for the given DataFrame is not '
            'present in the catalog' % property_name)
        logger.error(missing_property_msg)
        raise KeyError(missing_property_msg)

    return catalog.get_property(data_frame, property_name)
def set_property(data_frame, property_name, property_value):
    """
    Sets the value of a named property for a pandas DataFrame in the catalog.

    Args:
        data_frame (DataFrame): The DataFrame for which the property must be
            set.
        property_name (string): The name of the property to be set.
        property_value (object): The value to store. This is typically a
            string (such as key) or a pandas DataFrame (such as ltable,
            rtable).

    Returns:
        The return value of the underlying catalog update; typically a
        Boolean value of True if the update was successful.

    Raises:
        AssertionError: If `data_frame` is not of type pandas DataFrame.
        AssertionError: If `property_name` is not of type string.

    Examples:
        >>> import py_entitymatching as em
        >>> import pandas as pd
        >>> A = pd.DataFrame({'id' : [1, 2], 'colA':['a', 'b'], 'colB' : [10, 20]})
        >>> em.set_property(A, 'key', 'id')
        >>> em.get_property(A, 'key')
        # id
        >>> em.get_key(A)
        # id

    Note:
        If the input DataFrame is not present in the catalog, this function
        creates an entry for it before setting the property.
    """
    # Type-check the DataFrame and the property name.
    validate_object_type(data_frame, pd.DataFrame)
    validate_object_type(property_name, six.string_types,
                         error_prefix='Property name')

    catalog = Catalog.Instance()

    # Unknown DataFrames get a fresh catalog entry so the update below
    # always has somewhere to land.
    already_registered = catalog.is_df_info_present_in_catalog(data_frame)
    if not already_registered:
        catalog.init_properties(data_frame)

    # Relay whatever the catalog reports for the update.
    return catalog.set_property(data_frame, property_name, property_value)
def init_properties(data_frame):
    """
    Initializes properties for a pandas DataFrame in the catalog.

    Specifically, this function creates an entry in the catalog for the
    DataFrame and sets its properties to empty.

    Args:
        data_frame (DataFrame): DataFrame for which the properties must be
            initialized.

    Returns:
        The return value of the underlying catalog call; typically a Boolean
        value of True if the initialization was successful.

    Raises:
        AssertionError: If `data_frame` is not of type pandas DataFrame.
    """
    # Only pandas DataFrames can be registered in the catalog.
    validate_object_type(data_frame, pd.DataFrame)

    # Delegate to the catalog singleton and relay its result.
    return Catalog.Instance().init_properties(data_frame)


def get_all_properties(data_frame):
    """
    Gets all the properties for a pandas DataFrame object from the catalog.

    Args:
        data_frame (DataFrame): DataFrame for which the properties must be
            retrieved.

    Returns:
        A dictionary containing properties for the input pandas DataFrame.

    Raises:
        AssertionError: If `data_frame` is not of type pandas DataFrame.
        KeyError: If the information about the DataFrame is not present in
            the catalog.
    """
    # Only pandas DataFrames can be looked up in the catalog.
    validate_object_type(data_frame, pd.DataFrame)

    catalog = Catalog.Instance()

    # Refuse to answer for DataFrames the catalog has never seen.
    if not catalog.is_df_info_present_in_catalog(data_frame):
        missing_df_msg = 'DataFrame information is not present in the catalog'
        logger.error(missing_df_msg)
        raise KeyError(missing_df_msg)

    # Hand the stored properties back to the caller.
    return catalog.get_all_properties(data_frame)
def del_property(data_frame, property_name):
    """
    Deletes a property for a pandas DataFrame from the catalog.

    Args:
        data_frame (DataFrame): The input DataFrame for which a property must
            be deleted from the catalog.
        property_name (string): The name of the property that should be
            deleted.

    Returns:
        The return value of the underlying catalog call; typically a Boolean
        value of True if the deletion was successful.

    Raises:
        AssertionError: If `data_frame` is not of type pandas DataFrame.
        AssertionError: If `property_name` is not of type string.
        KeyError: If `data_frame` information is not present in the catalog.
        KeyError: If the requested property for the DataFrame is not present
            in the catalog.

    Examples:
        >>> import py_entitymatching as em
        >>> import pandas as pd
        >>> A = pd.DataFrame({'id' : [1, 2], 'colA':['a', 'b'], 'colB' : [10, 20]})
        >>> em.set_property(A, 'key', 'id')
        >>> em.get_property(A, 'key')
        # id
        >>> em.del_property(A, 'key')
        >>> em.is_property_present_for_df(A, 'key')
        # False
    """
    # Type-check the DataFrame and the property name.
    validate_object_type(data_frame, pd.DataFrame)
    validate_object_type(property_name, six.string_types,
                         error_prefix='Property name')

    catalog = Catalog.Instance()

    # Nothing can be deleted for a DataFrame the catalog does not know.
    if not catalog.is_df_info_present_in_catalog(data_frame):
        missing_df_msg = 'DataFrame information is not present in the catalog'
        logger.error(missing_df_msg)
        raise KeyError(missing_df_msg)

    # Likewise the property itself has to exist before it can be removed.
    if not catalog.is_property_present_for_df(data_frame, property_name):
        missing_property_msg = (
            'Requested metadata ( %s ) for the given DataFrame is '
            'not present in the catalog' % property_name)
        logger.error(missing_property_msg)
        raise KeyError(missing_property_msg)

    # Relay the catalog's result for the deletion.
    return catalog.del_property(data_frame, property_name)
def del_all_properties(data_frame):
    """
    Deletes all properties for a DataFrame from the catalog.

    Args:
        data_frame (DataFrame): Input DataFrame for which all the properties
            must be deleted from the catalog.

    Returns:
        The return value of the underlying catalog call; typically a Boolean
        value of True if the deletion was successful.

    Raises:
        AssertionError: If `data_frame` is not of type pandas DataFrame.
        KeyError: If the DataFrame information is not present in the catalog.

    Note:
        This is not the same as init_properties. Here the DataFrame's entry
        is removed from the catalog, whereas init_properties adds the
        DataFrame (if it is not present in the catalog) and initializes its
        properties to an empty object (specifically, an empty Python
        dictionary).
    """
    # This function performs its own type check (and logs the failure)
    # rather than going through validate_object_type.
    if not isinstance(data_frame, pd.DataFrame):
        wrong_type_msg = 'Input object is not of type pandas data frame'
        logger.error(wrong_type_msg)
        raise AssertionError(wrong_type_msg)

    catalog = Catalog.Instance()

    # There is nothing to delete for a DataFrame the catalog does not know.
    if not catalog.is_df_info_present_in_catalog(data_frame):
        missing_df_msg = 'DataFrame information is not present in the catalog'
        logger.error(missing_df_msg)
        raise KeyError(missing_df_msg)

    # Delegate the deletion and relay the catalog's result.
    return catalog.del_all_properties(data_frame)
def get_catalog():
    """
    Gets the catalog information for the current session.

    Returns:
        A Python dictionary containing the catalog information.
        Specifically, the dictionary contains the Python identifier of a
        DataFrame (obtained by id(DataFrame object)) as the key and its
        properties as the value.

    Examples:
        >>> import py_entitymatching as em
        >>> catalog = em.get_catalog()
    """
    # Pure delegation to the catalog singleton.
    return Catalog.Instance().get_catalog()
def del_catalog():
    """
    Deletes the catalog for the current session.

    Returns:
        The return value of the underlying catalog call; typically a Boolean
        value of True if the deletion was successful.

    Examples:
        >>> import py_entitymatching as em
        >>> em.del_catalog()
    """
    # Pure delegation to the catalog singleton.
    return Catalog.Instance().del_catalog()
def is_catalog_empty():
    """
    Checks if the catalog is empty.

    Returns:
        A Boolean value of True is returned if the catalog is empty, else
        False is returned.

    Examples:
        >>> import py_entitymatching as em
        >>> import pandas as pd
        >>> A = pd.DataFrame({'id' : [1, 2], 'colA':['a', 'b'], 'colB' : [10, 20]})
        >>> em.set_key(A, 'id')
        >>> em.is_catalog_empty()
        # False
    """
    # Pure delegation to the catalog singleton.
    return Catalog.Instance().is_catalog_empty()
def is_dfinfo_present(data_frame):
    """
    Checks whether the DataFrame information is present in the catalog.

    Args:
        data_frame (DataFrame): The DataFrame that should be checked for its
            presence in the catalog.

    Returns:
        A Boolean value of True is returned if the DataFrame is present in
        the catalog, else False is returned.

    Raises:
        AssertionError: If `data_frame` is not of type pandas DataFrame.

    Examples:
        >>> import py_entitymatching as em
        >>> import pandas as pd
        >>> A = pd.DataFrame({'id' : [1, 2], 'colA':['a', 'b'], 'colB' : [10, 20]})
        >>> em.set_key(A, 'id')
        >>> em.is_dfinfo_present(A)
        # True
    """
    # Only pandas DataFrames can be looked up in the catalog.
    validate_object_type(data_frame, pd.DataFrame)

    # Ask the catalog singleton and relay its answer unchanged.
    return Catalog.Instance().is_df_info_present_in_catalog(data_frame)
def is_property_present_for_df(data_frame, property_name):
    """
    Checks if the given property is present for the given DataFrame in the
    catalog.

    Args:
        data_frame (DataFrame): The DataFrame for which the property must be
            checked.
        property_name (string): The name of the property whose presence
            should be checked for the DataFrame, in the catalog.

    Returns:
        A Boolean value of True is returned if the property is present for
        the given DataFrame, else False is returned.

    Raises:
        AssertionError: If `data_frame` is not of type pandas DataFrame.
        AssertionError: If `property_name` is not of type string.
        KeyError: If `data_frame` is not present in the catalog.

    Examples:
        >>> import py_entitymatching as em
        >>> import pandas as pd
        >>> A = pd.DataFrame({'id' : [1, 2], 'colA':['a', 'b'], 'colB' : [10, 20]})
        >>> em.set_key(A, 'id')
        >>> em.is_property_present_for_df(A, 'key')
        # True
        >>> em.is_property_present_for_df(A, 'fk_ltable')
        # False
    """
    # Type-check the DataFrame and the property name.
    validate_object_type(data_frame, pd.DataFrame)
    validate_object_type(property_name, six.string_types,
                         error_prefix='Property name')

    catalog = Catalog.Instance()

    # Use a truthiness test (as the sibling wrappers do) rather than an
    # identity comparison with False, so any falsy "not present" answer
    # from the catalog is treated as absence.
    if not catalog.is_df_info_present_in_catalog(data_frame):
        logger.error('DataFrame information is not present in the catalog')
        raise KeyError('DataFrame information is not present in the catalog')

    # Relay the catalog's answer for the property.
    return catalog.is_property_present_for_df(data_frame, property_name)
def get_catalog_len():
    """
    Gets the length (i.e. the number of entries) of the catalog.

    Returns:
        The number of entries in the catalog as an integer.

    Examples:
        >>> import py_entitymatching as em
        >>> catalog_len = em.get_catalog_len()
    """
    # Pure delegation to the catalog singleton.
    return Catalog.Instance().get_catalog_len()
def set_properties(data_frame, properties, replace=True):
    """
    Sets the properties for a DataFrame in the catalog.

    Args:
        data_frame (DataFrame): DataFrame for which the properties must be
            set.
        properties (dict): A Python dictionary with keys as property names
            and values as Python objects (typically strings or DataFrames).
        replace (Optional[bool]): Flag to indicate whether the input
            properties can replace the properties already in the catalog.
            The default value is True. If the DataFrame is already present
            in the catalog and the flag is True, the existing properties are
            reset and then the given properties are set. If the flag is
            False, the function returns without modifying the existing
            properties.

    Returns:
        A Boolean value of True is returned if the properties were set for
        the given DataFrame, else False is returned.

    Raises:
        AssertionError: If `data_frame` is not of type pandas DataFrame.
        AssertionError: If `properties` is not of type Python dictionary.
    """
    # Type-check the DataFrame and the properties mapping.
    validate_object_type(data_frame, pd.DataFrame)
    validate_object_type(properties, dict, error_prefix='The properties')

    catalog = Catalog.Instance()

    # Existing properties are only overwritten when the caller allows it.
    has_entry = catalog.is_df_info_present_in_catalog(data_frame)
    if has_entry and not replace:
        logger.warning(
            'Properties already exists for df ( %s ). Not replacing it'
            % str(id(data_frame)))
        return False

    # Either the entry is new or it is being replaced; in both cases start
    # from an empty property set.
    catalog.init_properties(data_frame)

    # Note: the correctness of the given properties is not checked here
    # (e.g. whether a 'key' property really is a key).
    for name, value in six.iteritems(properties):
        catalog.set_property(data_frame, name, value)

    return True
def copy_properties(source_data_frame, target_data_frame, replace=True):
    """
    Copies properties from a source DataFrame to a target DataFrame in the
    catalog.

    Args:
        source_data_frame (DataFrame): The DataFrame from which the
            properties are copied, in the catalog.
        target_data_frame (DataFrame): The DataFrame to which the properties
            are copied, in the catalog.
        replace (boolean): A flag to indicate whether the source DataFrame's
            properties can replace the target DataFrame's properties in the
            catalog. The default value is True. If the target DataFrame is
            already present in the catalog and the flag is True, its
            existing properties are reset and then set from the source. If
            the flag is False, the function returns without modifying the
            existing properties.

    Returns:
        The return value of set_properties for the target DataFrame: True if
        the properties were copied, False if the target already had
        properties and `replace` was False.

    Raises:
        AssertionError: If `source_data_frame` is not of type pandas
            DataFrame.
        AssertionError: If `target_data_frame` is not of type pandas
            DataFrame.
        KeyError: If the source DataFrame is not present in the catalog.

    Examples:
        >>> import py_entitymatching as em
        >>> import pandas as pd
        >>> A = pd.DataFrame({'id' : [1, 2], 'colA':['a', 'b'], 'colB' : [10, 20]})
        >>> em.set_key(A, 'id')
        >>> B = pd.DataFrame({'id' : [1, 2], 'colA':['c', 'd'], 'colB' : [30, 40]})
        >>> em.copy_properties(A, B)
        >>> em.get_key(B)
        # 'id'
    """
    # Both endpoints of the copy must be pandas DataFrames.
    validate_object_type(source_data_frame, pd.DataFrame,
                         error_prefix='Input object (source_data_frame)')
    validate_object_type(target_data_frame, pd.DataFrame,
                         error_prefix='Input object (target_data_frame)')

    catalog = Catalog.Instance()

    # Use a truthiness test (as the sibling wrappers do) rather than an
    # identity comparison with False when checking for the source entry.
    if not catalog.is_df_info_present_in_catalog(source_data_frame):
        logger.error(
            'DataFrame information (source_data_frame) is not present in the '
            'catalog')
        raise KeyError(
            'DataFrame information (source_data_frame) is not present in the '
            'catalog')

    # Fetch everything recorded for the source DataFrame.
    metadata = catalog.get_all_properties(source_data_frame)

    # set_properties re-validates its inputs; the redundancy is accepted
    # because this function is not expected to be called often. It also
    # creates the catalog entry for the target DataFrame when the target
    # does not have one yet.
    return set_properties(target_data_frame, metadata, replace)
# key related methods
def get_key(data_frame):
    """
    Gets the value of the 'key' property for a DataFrame from the catalog.

    Args:
        data_frame (DataFrame): The DataFrame for which the key must be
            retrieved from the catalog.

    Returns:
        A string value containing the key column name is returned (if
        present).

    Examples:
        >>> import py_entitymatching as em
        >>> import pandas as pd
        >>> A = pd.DataFrame({'id' : [1, 2], 'colA':['a', 'b'], 'colB' : [10, 20]})
        >>> em.set_key(A, 'id')
        >>> em.get_key(A)
        # 'id'

    See Also:
        :meth:`~py_entitymatching.get_property`
    """
    # Convenience wrapper: look up the 'key' property via get_property.
    key_attribute = get_property(data_frame, 'key')
    return key_attribute
def set_key(data_frame, key_attribute):
    """
    Sets the value of the 'key' property for a DataFrame in the catalog with
    the given attribute (i.e. column name).

    Specifically, this function sets the key attribute for the DataFrame if
    the given attribute satisfies the following two properties:

    The key attribute should have unique values.

    The key attribute should not have missing values. A missing value is
    represented as np.NaN.

    Args:
        data_frame (DataFrame): The DataFrame for which the key must be set
            in the catalog.
        key_attribute (string): The key attribute (column name) in the
            DataFrame.

    Returns:
        A Boolean value of True is returned if the given attribute satisfies
        the conditions for a key and the update was successful. False is
        returned (after logging a warning) if the attribute does not qualify
        to be a key.

    Raises:
        AssertionError: If `data_frame` is not of type pandas DataFrame.
        AssertionError: If `key_attribute` is not of type string.
        KeyError: If given `key_attribute` is not in the DataFrame columns.

    Examples:
        >>> import py_entitymatching as em
        >>> import pandas as pd
        >>> A = pd.DataFrame({'id' : [1, 2], 'colA':['a', 'b'], 'colB' : [10, 20]})
        >>> em.set_key(A, 'id')
        >>> em.get_key(A)
        # 'id'

    See Also:
        :meth:`~py_entitymatching.set_property`
    """
    # Type-check the DataFrame and the key attribute name.
    validate_object_type(data_frame, pd.DataFrame)
    validate_object_type(key_attribute, six.string_types,
                         error_prefix='Input key attribute')

    # The key attribute must be one of the DataFrame's columns.
    if not ch.check_attrs_present(data_frame, key_attribute):
        logger.error('Input key ( %s ) not in the DataFrame' % key_attribute)
        raise KeyError('Input key ( %s ) not in the DataFrame' % key_attribute)

    # Note: a non-qualifying attribute results in a False return rather than
    # an exception. This function is used by other computation-intensive
    # commands in py_entitymatching, and raising here could throw away the
    # work done in those commands unless they all caught the exception
    # correctly. This decision may need to be revisited later.
    #
    # A truthiness test is used (instead of an identity comparison with
    # False) so that any falsy result from the helper, not only the False
    # singleton, blocks the update.
    if not ch.is_key_attribute(data_frame, key_attribute):
        logger.warning('Attribute (%s ) does not qualify to be a key; Not '
                       'setting/replacing the key' % key_attribute)
        return False

    # Record the key property for the input DataFrame.
    return set_property(data_frame, 'key', key_attribute)
def get_fk_ltable(data_frame):
    """
    Gets the foreign key to the left table for a DataFrame from the catalog.

    Specifically, this is a sugar function that gets the foreign key to the
    left table using the underlying :meth:`~py_entitymatching.get_property`
    function. It is typically called on a DataFrame which contains metadata
    such as fk_ltable, fk_rtable, ltable, rtable.

    Args:
        data_frame (DataFrame): The input DataFrame for which the foreign key
            ltable property must be retrieved.

    Returns:
        A Python object, typically a string, is returned.

    Examples:
        >>> import py_entitymatching as em
        >>> A = pd.DataFrame({'id' : [1, 2], 'colA':['a', 'b'], 'colB' : [10, 20]})
        >>> B = pd.DataFrame({'id' : [1, 2], 'colA':['c', 'd'], 'colB' : [30, 40]})
        >>> em.set_key(A, 'id')
        >>> em.set_key(B, 'id')
        >>> C = pd.DataFrame({'id':[1, 2], 'ltable_id':[1, 2], 'rtable_id':[2, 1]})
        >>> em.set_key(C, 'id')
        >>> em.set_fk_ltable(C, 'ltable_id')
        >>> em.get_fk_ltable(C)
        # 'ltable_id'

    See Also:
        :meth:`~py_entitymatching.get_property`
    """
    # Convenience wrapper: look up 'fk_ltable' via get_property.
    fk_ltable = get_property(data_frame, 'fk_ltable')
    return fk_ltable
def get_fk_rtable(data_frame):
    """
    Gets the foreign key to the right table for a DataFrame from the catalog.

    Specifically, this is a sugar function that gets the foreign key to the
    right table using the :meth:`py_entitymatching.get_property` function.
    It is typically called on a DataFrame which contains metadata such as
    fk_ltable, fk_rtable, ltable, rtable.

    Args:
        data_frame (DataFrame): The input DataFrame for which the foreign key
            rtable property must be retrieved.

    Returns:
        A Python object (typically a string) is returned.

    Examples:
        >>> import py_entitymatching as em
        >>> A = pd.DataFrame({'id' : [1, 2], 'colA':['a', 'b'], 'colB' : [10, 20]})
        >>> B = pd.DataFrame({'id' : [1, 2], 'colA':['c', 'd'], 'colB' : [30, 40]})
        >>> em.set_key(A, 'id')
        >>> em.set_key(B, 'id')
        >>> C = pd.DataFrame({'id':[1, 2], 'ltable_id':[1, 2], 'rtable_id':[2, 1]})
        >>> em.set_key(C, 'id')
        >>> em.set_fk_rtable(C, 'rtable_id')
        >>> em.get_fk_rtable(C)
        # 'rtable_id'

    See Also:
        :meth:`~py_entitymatching.get_property`
    """
    # Convenience wrapper: look up 'fk_rtable' via get_property.
    fk_rtable = get_property(data_frame, 'fk_rtable')
    return fk_rtable
def set_fk_ltable(data_frame, fk_ltable):
    """
    Sets the foreign key to ltable for a DataFrame in the catalog.

    Specifically, this is a sugar function that sets the foreign key to the
    left table using the :meth:`py_entitymatching.set_property` function.
    It is typically called on a DataFrame which contains metadata such as
    fk_ltable, fk_rtable, ltable, rtable.

    Args:
        data_frame (DataFrame): The input DataFrame for which the foreign key
            ltable property must be set.
        fk_ltable (string): The attribute that must be set as the foreign key
            to the ltable in the catalog.

    Returns:
        A Boolean value of True is returned if the foreign key to ltable was
        set successfully.

    Raises:
        AssertionError: If `data_frame` is not of type pandas DataFrame.
        AssertionError: If `fk_ltable` is not of type string.
        KeyError: If `fk_ltable` is not in the input DataFrame.

    Examples:
        >>> import py_entitymatching as em
        >>> A = pd.DataFrame({'id' : [1, 2], 'colA':['a', 'b'], 'colB' : [10, 20]})
        >>> B = pd.DataFrame({'id' : [1, 2], 'colA':['c', 'd'], 'colB' : [30, 40]})
        >>> em.set_key(A, 'id')
        >>> em.set_key(B, 'id')
        >>> C = pd.DataFrame({'id':[1, 2], 'ltable_id':[1, 2], 'rtable_id':[2, 1]})
        >>> em.set_key(C, 'id')
        >>> em.set_fk_ltable(C, 'ltable_id')
        >>> em.get_fk_ltable(C)
        # 'ltable_id'

    See Also:
        :meth:`~py_entitymatching.set_property`
    """
    # Type-check the DataFrame and the attribute name.
    validate_object_type(data_frame, pd.DataFrame)
    validate_object_type(fk_ltable, six.string_types,
                         error_prefix='The input (fk_ltable)')

    # The attribute has to be an existing column of the DataFrame.
    attribute_exists = ch.check_attrs_present(data_frame, fk_ltable)
    if not attribute_exists:
        logger.error('Input attr. ( %s ) not in the DataFrame' % fk_ltable)
        raise KeyError('Input attr. ( %s ) not in the DataFrame' % fk_ltable)

    # Record the property and relay the result.
    return set_property(data_frame, 'fk_ltable', fk_ltable)
def validate_and_set_fk_ltable(foreign_data_frame, foreign_key_ltable, ltable,
                               ltable_key):
    """
    Validates and sets the foreign key ltable for a DataFrame in the catalog.

    Specifically, given a DataFrame and a foreign key attribute, the
    attribute is checked against the foreign key constraint before it is
    recorded: first, the foreign key ltable attribute should not have any
    missing values; second, the subset of foreign key values should have
    unique values in the primary (base) table. The checks themselves are
    delegated to the catalog helper's check_fk_constraint function.

    Args:
        foreign_data_frame (DataFrame): DataFrame containing the foreign key
            (typically a candidate set, for example output from blocking two
            tables).
        foreign_key_ltable (string): An attribute in the foreign DataFrame.
        ltable (DataFrame): Base DataFrame, in which the foreign key
            attribute would form the primary key.
        ltable_key (string): An attribute in the base table (typically a
            primary key attribute).

    Returns:
        A Boolean value of True is returned if the validation was successful
        and the update was successful in the catalog. False is returned
        (after logging a warning) if the constraint is not satisfied.

    Raises:
        AssertionError: If `foreign_data_frame` is not of type pandas
            DataFrame.
        AssertionError: If `foreign_key_ltable` is not of type string.
        AssertionError: If `ltable` is not of type pandas DataFrame.
        AssertionError: If `ltable_key` is not of type string.
    """
    # All input validation happens inside check_fk_constraint.
    constraint_holds = ch.check_fk_constraint(foreign_data_frame,
                                              foreign_key_ltable,
                                              ltable, ltable_key)

    # Constraint violated: warn and leave the catalog untouched.
    if not constraint_holds:
        logger.warning(
            'FK constraint for fk_ltable is not satisfied; '
            'Not setting the fk_ltable')
        return False

    # Constraint satisfied: record the foreign key.
    return set_property(foreign_data_frame, 'fk_ltable', foreign_key_ltable)


def validate_and_set_fk_rtable(foreign_data_frame, foreign_key_rtable,
                               rtable, rtable_key):
    """
    Validates and sets the foreign key rtable for a DataFrame in the catalog.

    Specifically, given a DataFrame and a foreign key attribute, the
    attribute is checked against the foreign key constraint before it is
    recorded: first, the foreign key rtable attribute should not have any
    missing values; second, the subset of foreign key values should have
    unique values in the primary (base) table. The checks themselves are
    delegated to the catalog helper's check_fk_constraint function.

    Args:
        foreign_data_frame (DataFrame): DataFrame containing the foreign key
            (typically a candidate set, for example output from blocking two
            tables).
        foreign_key_rtable (string): An attribute in the foreign DataFrame.
        rtable (DataFrame): Base DataFrame, in which the foreign key
            attribute would form the primary key.
        rtable_key (string): An attribute in the base table (typically a
            primary key attribute).

    Returns:
        A Boolean value of True is returned if the validation was successful
        and the update was successful in the catalog. False is returned
        (after logging a warning) if the constraint is not satisfied.

    Raises:
        AssertionError: If `foreign_data_frame` is not of type pandas
            DataFrame.
        AssertionError: If `foreign_key_rtable` is not of type string.
        AssertionError: If `rtable` is not of type pandas DataFrame.
        AssertionError: If `rtable_key` is not of type string.
    """
    # All basic input validation happens inside check_fk_constraint.
    constraint_holds = ch.check_fk_constraint(foreign_data_frame,
                                              foreign_key_rtable,
                                              rtable, rtable_key)

    # Constraint violated: warn and leave the catalog untouched.
    if not constraint_holds:
        logger.warning(
            'FK constraint for fk_rtable is not satisfied; Not '
            'setting the fk_rtable and rtable')
        return False

    # Constraint satisfied: record the foreign key.
    return set_property(foreign_data_frame, 'fk_rtable', foreign_key_rtable)
def set_fk_rtable(data_frame, foreign_key_rtable):
    """
    Sets the foreign key to rtable for a DataFrame in the catalog.

    Specifically, this is a sugar function that sets the foreign key to the
    right table using the set_property function. It is typically called on
    a DataFrame which contains metadata such as fk_ltable, fk_rtable,
    ltable, rtable.

    Args:
        data_frame (DataFrame): The input DataFrame for which the foreign key
            rtable property must be set.
        foreign_key_rtable (string): The attribute that must be set as the
            foreign key to rtable in the catalog.

    Returns:
        A Boolean value of True is returned if the foreign key to rtable was
        set successfully.

    Raises:
        AssertionError: If `data_frame` is not of type pandas DataFrame.
        AssertionError: If `foreign_key_rtable` is not of type string.
        KeyError: If `foreign_key_rtable` is not in the input DataFrame.

    Examples:
        >>> import py_entitymatching as em
        >>> A = pd.DataFrame({'id' : [1, 2], 'colA':['a', 'b'], 'colB' : [10, 20]})
        >>> B = pd.DataFrame({'id' : [1, 2], 'colA':['c', 'd'], 'colB' : [30, 40]})
        >>> em.set_key(A, 'id')
        >>> em.set_key(B, 'id')
        >>> C = pd.DataFrame({'id':[1, 2], 'ltable_id':[1, 2], 'rtable_id':[2, 1]})
        >>> em.set_key(C, 'id')
        >>> em.set_fk_rtable(C, 'rtable_id')
        >>> em.get_fk_rtable(C)
        # 'rtable_id'

    See Also:
        :meth:`~py_entitymatching.set_property`
    """
    # The input object is expected to be of type pandas DataFrame.
    validate_object_type(data_frame, pd.DataFrame)

    # The attribute name must be a string. The error prefix names the
    # rtable argument so a type error points at the right parameter.
    validate_object_type(foreign_key_rtable, six.string_types,
                         error_prefix='Input (foreign key rtable)')

    # The attribute has to be an existing column of the DataFrame.
    if not ch.check_attrs_present(data_frame, foreign_key_rtable):
        logger.error('Input attr. ( %s ) not in the DataFrame'
                     % foreign_key_rtable)
        raise KeyError('Input attr. ( %s ) not in the DataFrame'
                       % foreign_key_rtable)

    # Record the property and relay the result.
    return set_property(data_frame, 'fk_rtable', foreign_key_rtable)
def show_properties(data_frame):
    """
    Prints the properties for a DataFrame that is present in the catalog.

    If the DataFrame is not present in the catalog, an error is logged and
    nothing is printed.

    Args:
        data_frame (DataFrame): The input pandas DataFrame for which the
            properties must be displayed.

    Examples:
        >>> A = pd.DataFrame({'key_attr' : [1, 2], 'colA':['a', 'b'], 'colB' : [10, 20]})
        >>> em.set_key(A, 'key_attr')
        >>> em.show_properties(A)
        # id: 4572922488  # This will change dynamically
        # key: key_attr
    """
    if is_dfinfo_present(data_frame):
        # The catalog is keyed by object id, so delegate the printing to the
        # id-based variant.
        show_properties_for_id(id(data_frame))
    else:
        # Unknown DataFrame: report it and print nothing.
        logger.error('DataFrame information is not present in the catalog')
# # Get the properties for the DataFrame from the catalog # metadata = get_all_properties(data_frame) # # # First print the id for the DataFrame # print('id: ' + str(id(data_frame))) # # For each property name anf value, print the contents to the user # for property_name, property_value in six.iteritems(metadata): # # If the property value is string print it out # if isinstance(property_value, six.string_types): # print(property_name + ": " + property_value) # # else, print just the id. # else: # print(property_name + "(obj.id): " + str(id(property_value)))
def show_properties_for_id(object_id):
    """
    Shows the properties for an object id present in the catalog.

    Specifically, given an object id (typically obtained by executing
    id(<object>), where the object could be a DataFrame), this function
    displays the properties present for that object id in the catalog.

    Args:
        object_id (int): The Python identifier of an object (typically a
            pandas DataFrame).

    Examples:
        >>> A = pd.DataFrame({'key_attr' : [1, 2], 'colA':['a', 'b'], 'colB' : [10, 20]})
        >>> em.set_key(A, 'key_attr')
        >>> em.show_properties_for_id(id(A))
        # id: 4572922488  # This will change dynamically
        # key: key_attr
    """
    # Fetch everything the catalog has recorded under this object id.
    properties = Catalog.Instance().get_all_properties_for_id(object_id)

    # The id itself is always printed first.
    print('id: ' + str(object_id))

    for name, value in six.iteritems(properties):
        if isinstance(value, six.string_types):
            # String values are shown verbatim.
            line = name + ": " + value
        else:
            # Any other object is represented by its Python id only.
            line = name + "(obj.id): " + str(id(value))
        print(line)
def set_candset_properties(candset, key, foreign_key_ltable,
                           foreign_key_rtable, ltable, rtable):
    """
    Sets candidate set properties.

    Specifically, this is a sugar function that sets all the properties for a
    candidate set such as key, foreign key ltable, foreign key rtable,
    ltable and rtable. Further, this function does not check the integrity of
    input properties.

    Args:
        candset (DataFrame): Input DataFrame for which the properties must
            be set.
        key (string): Key attribute that must be set for the DataFrame in
            the catalog.
        foreign_key_ltable (string): Foreign key ltable attribute that must
            be set for the DataFrame in the catalog.
        foreign_key_rtable (string): Foreign key rtable attribute that must
            be set for the DataFrame in the catalog.
        ltable (DataFrame): DataFrame that must be set as ltable.
        rtable (DataFrame): DataFrame that must be set as rtable.

    Returns:
        A Boolean value of True is returned if the updates were successful.

    """
    # Set the key directly as a property.
    set_property(candset, 'key', key)
    # Set the foreign key attributes.
    set_fk_ltable(candset, foreign_key_ltable)
    set_fk_rtable(candset, foreign_key_rtable)
    # Set the ltable and rtable.
    set_property(candset, 'ltable', ltable)
    set_property(candset, 'rtable', rtable)
    return True


def _validate_metadata_for_table(table, key, output_string, lgr, verbose):
    """
    Validates metadata for a table (DataFrame).

    Args:
        table (DataFrame): The table whose key is validated.
        key (string): Name of the attribute expected to be the key.
        output_string (string): Label for the table (e.g. 'candset') used in
            log and error messages.
        lgr: Logger object forwarded to ``ch.log_info``.
        verbose (boolean): Forwarded to ``ch.log_info`` and
            ``ch.is_key_attribute``.

    Returns:
        True if every check passes.

    Raises:
        AssertionError: If `table` is not a pandas DataFrame, if `key` is
            not a string, or if `key` does not qualify as a key attribute.
        KeyError: If `key` is not present in `table`.
    """
    # Validate input parameters
    # # We expect the input table to be of type pandas DataFrame
    validate_object_type(table, pd.DataFrame)

    # Check the key column is present in the table
    if not ch.check_attrs_present(table, key):
        raise KeyError('Input key ( %s ) not in the DataFrame' % key)

    # Validate the key
    ch.log_info(lgr, 'Validating ' + output_string + ' key: ' + str(key),
                verbose)

    # We expect the key to be of type string
    validate_object_type(key, six.string_types, error_prefix='Key attribute')
    if not ch.is_key_attribute(table, key, verbose):
        raise AssertionError('Attribute %s in the %s table does not '
                             'qualify to be the key' % (str(key),
                                                        output_string))
    ch.log_info(lgr, '..... Done', verbose)
    return True


def _validate_metadata_for_candset(candset, key, foreign_key_ltable,
                                   foreign_key_rtable, ltable, rtable,
                                   ltable_key, rtable_key, lgr, verbose):
    """
    Validates metadata for a candidate set.

    The candidate set is first validated as a plain table (its key), then
    its two foreign key columns are checked against the keys of the left
    and right base tables.

    Args:
        candset (DataFrame): The candidate set to validate.
        key (string): Key attribute of the candidate set.
        foreign_key_ltable (string): Candset column referring to the ltable.
        foreign_key_rtable (string): Candset column referring to the rtable.
        ltable (DataFrame): The left base table.
        rtable (DataFrame): The right base table.
        ltable_key (string): Key attribute of `ltable`.
        rtable_key (string): Key attribute of `rtable`.
        lgr: Logger object forwarded to ``ch.log_info``.
        verbose (boolean): Forwarded to ``ch.log_info``.

    Returns:
        True if every check passes.

    Raises:
        AssertionError: If `candset`, `ltable` or `rtable` is not a pandas
            DataFrame, if the candset key is invalid, or if a foreign key
            constraint is not satisfied.
        KeyError: If any of the named columns is missing from its table.
    """
    # Validate input parameters
    # # We expect candset to be of type pandas DataFrame
    validate_object_type(candset, pd.DataFrame, error_prefix='Input candset')

    # Check if the key column is present in the candset
    if not ch.check_attrs_present(candset, key):
        raise KeyError('Input key ( %s ) not in the DataFrame' % key)

    # Check if the foreign key ltable column is present in the candset
    if not ch.check_attrs_present(candset, foreign_key_ltable):
        raise KeyError(
            'Input foreign_key_ltable ( %s ) not in the DataFrame'
            % foreign_key_ltable)

    # Check if the foreign key rtable column is present in the candset
    if not ch.check_attrs_present(candset, foreign_key_rtable):
        raise KeyError(
            'Input fk_rtable ( %s ) not in the DataFrame'
            % foreign_key_rtable)

    # We expect the ltable to be of type pandas DataFrame
    validate_object_type(ltable, pd.DataFrame, error_prefix='Input ltable')

    # We expect the rtable to be of type pandas DataFrame
    validate_object_type(rtable, pd.DataFrame, error_prefix='Input rtable')

    # We expect the ltable key to be present in the ltable
    if not ch.check_attrs_present(ltable, ltable_key):
        raise KeyError('ltable key ( %s ) not in ltable' % ltable_key)

    # We expect the rtable key to be present in the rtable
    if not ch.check_attrs_present(rtable, rtable_key):
        raise KeyError('rtable key ( %s ) not in rtable' % rtable_key)

    # First validate metadata for the candidate set (as a table)
    _validate_metadata_for_table(candset, key, 'candset', lgr, verbose)

    # Second check foreign key constraints. Each check is bracketed by its
    # own start/done messages so the verbose trace shows which table was
    # being validated when a failure occurs.
    ch.log_info(lgr, 'Validating foreign key constraint for left table',
                verbose)
    if not ch.check_fk_constraint(candset, foreign_key_ltable,
                                  ltable, ltable_key):
        raise AssertionError(
            'Candset does not satisfy foreign key constraint with '
            'the left table')
    ch.log_info(lgr, '..... Done', verbose)

    ch.log_info(lgr, 'Validating foreign key constraint for right table',
                verbose)
    if not ch.check_fk_constraint(candset, foreign_key_rtable,
                                  rtable, rtable_key):
        raise AssertionError(
            'Candset does not satisfy foreign key constraint with '
            'the right table')
    ch.log_info(lgr, '..... Done', verbose)

    return True


# noinspection PyIncorrectDocstring
def get_keys_for_ltable_rtable(ltable, rtable, lgr, verbose):
    """
    Gets keys for the ltable and rtable.

    Args:
        ltable (DataFrame): The left table.
        rtable (DataFrame): The right table.
        lgr: Logger object forwarded to ``ch.log_info``.
        verbose (boolean): Forwarded to ``ch.log_info``.

    Returns:
        A tuple ``(ltable_key, rtable_key)`` as returned by ``get_key`` for
        each table.

    Raises:
        AssertionError: If `ltable` or `rtable` is not a pandas DataFrame.
    """
    # We expect the ltable to be of type pandas DataFrame
    if not isinstance(ltable, pd.DataFrame):
        logger.error('Input ltable is not of type pandas data frame')
        raise AssertionError('Input ltable is not of type pandas data frame')

    # We expect the rtable to be of type pandas DataFrame
    if not isinstance(rtable, pd.DataFrame):
        logger.error('Input rtable is not of type pandas data frame')
        raise AssertionError('Input rtable is not of type pandas data frame')

    ch.log_info(lgr, 'Required metadata: ltable key, rtable key', verbose)
    ch.log_info(lgr, 'Getting metadata from the catalog', verbose)

    # Get the ltable key and rtable key from the catalog
    ltable_key = get_key(ltable)
    rtable_key = get_key(rtable)
    ch.log_info(lgr, '..... Done', verbose)

    # Return the ltable and rtable keys
    return ltable_key, rtable_key


# noinspection PyIncorrectDocstring
def get_metadata_for_candset(candset, lgr, verbose):
    """
    Gets metadata for the candset.

    Args:
        candset (DataFrame): The candidate set whose metadata is retrieved.
        lgr: Logger object forwarded to ``ch.log_info``.
        verbose (boolean): Forwarded to ``ch.log_info``.

    Returns:
        A tuple ``(key, fk_ltable, fk_rtable, ltable, rtable, l_key,
        r_key)`` read from the catalog.

    Raises:
        AssertionError: If `candset` is not a pandas DataFrame.
    """
    # Validate input parameters
    validate_object_type(candset, pd.DataFrame, error_prefix='Input candset')

    ch.log_info(lgr, 'Getting metadata from the catalog', verbose)

    # Get the key, foreign keys, ltable, rtable and their keys
    # # Get key
    key = get_key(candset)
    # # Get the foreign keys
    fk_ltable = get_fk_ltable(candset)
    fk_rtable = get_fk_rtable(candset)
    # # Get the base tables
    ltable = get_ltable(candset)
    rtable = get_rtable(candset)
    # Get the base table keys
    l_key = get_key(ltable)
    r_key = get_key(rtable)
    ch.log_info(lgr, '..... Done', verbose)

    # Return the metadata
    return key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key


# noinspection PyIncorrectDocstring
def get_ltable(candset):
    """
    Gets the ltable for a DataFrame from the catalog.

    Args:
        candset (DataFrame): The input table for which the ltable must be
            returned.

    Returns:
        A pandas DataFrame that is pointed by 'ltable' property of the input
        table.

    Raises:
        AssertionError: If `candset` is not of type pandas DataFrame.
        KeyError: If `candset` information, or its 'ltable' property, is not
            present in the catalog.

    Examples:
        >>> import py_entitymatching as em
        >>> import pandas as pd
        >>> A = pd.DataFrame({'id' : [1, 2], 'colA':['a', 'b'], 'colB' : [10, 20]})
        >>> B = pd.DataFrame({'id' : [1, 2], 'colA':['c', 'd'], 'colB' : [30, 40]})
        >>> em.set_key(A, 'id')
        >>> em.set_key(B, 'id')
        >>> C = pd.DataFrame({'id':[1, 2], 'ltable_id':[1, 2], 'rtable_id':[2, 1]})
        >>> em.set_key(C, 'id')
        >>> em.set_ltable(C, A)
        >>> id(em.get_ltable(C)) == id(A)
        # True

    See Also:
        :meth:`~py_entitymatching.get_property`

    """
    # This function is just sugar over get_property for the 'ltable' entry;
    # validation and errors come from get_property.
    return get_property(candset, 'ltable')
# noinspection PyIncorrectDocstring
def get_rtable(candset):
    """
    Gets the rtable for a DataFrame from the catalog.

    Args:
        candset (DataFrame): Input table for which the rtable must be
            returned.

    Returns:
        A pandas DataFrame that is pointed by 'rtable' property of the input
        table.

    Raises:
        AssertionError: If `candset` is not of type pandas DataFrame.
        KeyError: If `candset` information, or its 'rtable' property, is not
            present in the catalog.

    Examples:
        >>> import py_entitymatching as em
        >>> import pandas as pd
        >>> A = pd.DataFrame({'id' : [1, 2], 'colA':['a', 'b'], 'colB' : [10, 20]})
        >>> B = pd.DataFrame({'id' : [1, 2], 'colA':['c', 'd'], 'colB' : [30, 40]})
        >>> em.set_key(A, 'id')
        >>> em.set_key(B, 'id')
        >>> C = pd.DataFrame({'id':[1, 2], 'ltable_id':[1, 2], 'rtable_id':[2, 1]})
        >>> em.set_key(C, 'id')
        >>> em.set_rtable(C, B)
        >>> id(em.get_rtable(C)) == id(B)
        # True

    See Also:
        :meth:`~py_entitymatching.get_property`

    """
    # This function is just sugar over get_property for the 'rtable' entry;
    # validation and errors come from get_property.
    return get_property(candset, 'rtable')
def set_ltable(candset, table):
    """
    Sets the ltable for a DataFrame in the catalog.

    Args:
        candset (DataFrame): The input table for which the ltable must be
            set.
        table (DataFrame): The table (typically a pandas DataFrame) that must
            be set as ltable for the input DataFrame.

    Returns:
        A Boolean value of True is returned, if the update was successful.

    Examples:
        >>> import py_entitymatching as em
        >>> import pandas as pd
        >>> A = pd.DataFrame({'id' : [1, 2], 'colA':['a', 'b'], 'colB' : [10, 20]})
        >>> B = pd.DataFrame({'id' : [1, 2], 'colA':['c', 'd'], 'colB' : [30, 40]})
        >>> em.set_key(A, 'id')
        >>> em.set_key(B, 'id')
        >>> C = pd.DataFrame({'id':[1, 2], 'ltable_id':[1, 2], 'rtable_id':[2, 1]})
        >>> em.set_key(C, 'id')
        >>> em.set_ltable(C, A)
        >>> id(em.get_ltable(C)) == id(A)
        # True

    See Also:
        :meth:`~py_entitymatching.set_property`

    """
    # Set the ltable for a candidate set. This function is just sugar over
    # set_property, whose result is passed straight back to the caller.
    return set_property(candset, 'ltable', table)
# noinspection PyIncorrectDocstring
def set_rtable(candset, table):
    """
    Sets the rtable for a DataFrame in the catalog.

    Args:
        candset (DataFrame): The input table for which the rtable must be
            set.
        table (DataFrame): The table that must be set as rtable for the input
            DataFrame.

    Returns:
        A Boolean value of True is returned, if the update was successful.

    Examples:
        >>> import py_entitymatching as em
        >>> import pandas as pd
        >>> A = pd.DataFrame({'id' : [1, 2], 'colA':['a', 'b'], 'colB' : [10, 20]})
        >>> B = pd.DataFrame({'id' : [1, 2], 'colA':['c', 'd'], 'colB' : [30, 40]})
        >>> em.set_key(A, 'id')
        >>> em.set_key(B, 'id')
        >>> C = pd.DataFrame({'id':[1, 2], 'ltable_id':[1, 2], 'rtable_id':[2, 1]})
        >>> em.set_key(C, 'id')
        >>> em.set_rtable(C, B)
        >>> id(em.get_rtable(C)) == id(B)
        # True

    See Also:
        :meth:`~py_entitymatching.set_property`

    """
    # Set the rtable for a candidate set. This function is just sugar over
    # set_property, whose result is passed straight back to the caller.
    return set_property(candset, 'rtable', table)