Source code for py_entitymatching.labeler.labeler

"""
This module contains labeling related routines for a single table.
"""
import logging

import pandas as pd
import six

import py_entitymatching.catalog.catalog_manager as cm
import py_entitymatching.utils.catalog_helper as ch
from py_entitymatching.utils.validation_helper import validate_object_type

logger = logging.getLogger(__name__)


def label_table(table, label_column_name, verbose=False):
    """
    Label a pandas DataFrame (for supervised learning purposes).

    This function labels a DataFrame, typically used for supervised learning
    purposes. This function expects the input DataFrame containing the metadata
    of a candidate set (such as key, fk_ltable, fk_rtable, ltable, rtable).
    This function creates a copy of the input DataFrame, adds label column
    at the end of the DataFrame, fills the column values with 0, invokes a
    GUI for the user to enter labels (0/1, 0: non-match, 1: match) and finally
    returns the labeled DataFrame. Further, this function also copies the
    properties from the input DataFrame to the output DataFrame.

    Args:
        table (DataFrame): The input DataFrame to be labeled.
            Specifically,
            a DataFrame containing the metadata of a candidate set (such as
            key, fk_ltable, fk_rtable, ltable, rtable) in the catalog.
        label_column_name (string): The column name to be given for the labels
            entered by the user.
        verbose (boolean): A flag to indicate whether more detailed information
            about the execution steps should be printed out (default value is
            False).

    Returns:
        A new DataFrame with the labels entered by the user. Further,
        this function sets the output DataFrame's properties same as input
        DataFrame.

    Raises:
        AssertionError: If `table` is not of type pandas DataFrame.
        AssertionError: If `label_column_name` is not of type string.
        AssertionError: If the `label_column_name` is already present in the
            input table.
        AssertionError: If, after labeling, the label column contains values
            other than 0 and 1.
        ImportError: If PyQt5 is not installed.

    Examples:
        >>> import py_entitymatching as em
        >>> G = em.label_table(S, label_column_name='label') # S is the (sampled) table that has to be labeled.

    """
    # Validate the input parameters: check input types, check the metadata
    # for the input DataFrame as it will get copied to the labeled DataFrame
    _validate_inputs(table, label_column_name, verbose)

    # Initialize the table to be labeled: create a copy and set the column
    # values to be 0s
    labeled_table = _init_label_table(table, label_column_name)

    # Invoke the GUI. QtGui itself is not used here; importing it only probes
    # whether PyQt5 is available so that a helpful error can be raised.
    try:
        from PyQt5 import QtGui
    except ImportError:
        raise ImportError('PyQt5 is not installed. Please install PyQt5 to use '
                      'GUI related functions in py_entitymatching.')

    # Imported lazily so that the module can be loaded without PyQt5
    from py_entitymatching.gui.table_gui import edit_table
    edit_table(labeled_table)

    # Post process the labeled table: validate whether the labels contain
    # only 0/1s, copy the properties (in the catalog) of the input table to the
    # labeled table
    labeled_table = _post_process_labelled_table(table, labeled_table,
                                                 label_column_name)
    # Return the labeled table
    return labeled_table


def _validate_inputs(table, label_column_name, verbose):
    """
    This function validates the inputs for the label_table function.

    Args:
        table (DataFrame): The input table that is going to be labeled.
        label_column_name (string): The name of the label column to be added.
        verbose (boolean): Forwarded to the logging/metadata helpers to
            control whether extra information is reported.

    Returns:
        True, if all the checks pass.

    Raises:
        AssertionError: If the `label_column_name` is already present in the
            input table. The type checks and metadata checks are delegated
            to helper functions, which raise their own errors.
    """
    # Validate the input parameters

    # # The input table table is expected to be of type pandas DataFrame
    validate_object_type(table, pd.DataFrame)

    # # The label column name is expected to be of type string
    validate_object_type(label_column_name, six.string_types, error_prefix='Input attr.')

    # # Check if the label column name is already present in the input table
    if ch.check_attrs_present(table, label_column_name):
        logger.error('The label column name (%s) is already present in the '
                     'input table', label_column_name)
        # Unlike the logger, an exception does not interpolate extra
        # arguments into its message, so format the message explicitly.
        raise AssertionError('The label column name (%s) is already present '
                             'in the input table' % (label_column_name,))

    # Now, validate the metadata for the input DataFrame as we have to copy
    # these properties to the output DataFrame

    # # First, display what metadata is required for this function
    ch.log_info(logger, 'Required metadata: cand.set key, fk ltable, '
                        'fk rtable, ltable, rtable, ltable key, rtable key',
                verbose)

    # # Second, get the metadata
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(table, logger, verbose)

    # # Third, validate the metadata
    cm._validate_metadata_for_candset(table, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # Return True if everything was successful
    return True


def _init_label_table(table, label_column_name):
    """
    Prepare the DataFrame that will be handed to the labeling GUI.

    The input table is left untouched: a deep copy of it is made, and a
    column named `label_column_name` holding 0 (non-match) in every row is
    set on that copy.

    Args:
        table (DataFrame): The table to be labeled.
        label_column_name (string): The name of the label column.

    Returns:
        The copied DataFrame including the zero-filled label column.
    """
    # Work on an independent copy so that the caller's table is not modified
    table_to_label = table.copy(deep=True)

    # Every row starts out labeled as 0 (non-match)
    table_to_label[label_column_name] = 0

    return table_to_label


def _post_process_labelled_table(input_table, labeled_table, col_name):
    """
    Finalize the table returned by the labeling GUI and update the catalog.

    The label column is converted to integers, checked to hold only 0s and
    1s, and then the catalog properties of the input table are copied over
    to the labeled table.

    Args:
        input_table (DataFrame): The original table whose catalog properties
            are copied.
        labeled_table (DataFrame): The table that was edited in the GUI. Its
            label column is converted in place.
        col_name (string): The name of the label column.

    Returns:
        The labeled table, with the integer label column.

    Raises:
        AssertionError: If the label column contains values other than
            0 and 1.
    """
    # Labels come back from the GUI as strings, so turn them into integers
    # before looking at their values
    labeled_table[col_name] = labeled_table[col_name].astype(int)
    labels = labeled_table[col_name]

    # A label is invalid when it is neither 0 nor 1
    has_invalid_label = ((labels != 0) & (labels != 1)).any()

    if has_invalid_label:
        logger.error('The label column contains values other than 0 and 1')
        raise AssertionError(
            'The label column contains values other than 0 and 1')

    # Register the labeled table in the catalog and give it the same
    # properties as the input table. The integrity of 'key' is not
    # re-checked here because the key column is taken over unchanged from
    # the input table.
    cm.init_properties(labeled_table)
    cm.copy_properties(input_table, labeled_table)

    return labeled_table