# Source code for py_entitymatching.evaluation.evaluation

"""
This module contains evaluation related functions.
"""
import collections
import logging

import pandas as pd
import six

import py_entitymatching.catalog.catalog_manager as cm
import py_entitymatching.utils.catalog_helper as ch
from py_entitymatching.debugmatcher.debug_gui_utils import _get_dataframe, _get_metric
from py_entitymatching.utils.validation_helper import validate_object_type

logger = logging.getLogger(__name__)


def eval_matches(data_frame, gold_label_attr, predicted_label_attr):
    """
    Evaluates the matches from the matcher.

    Specifically, given a DataFrame containing golden labels and predicted
    labels, this function would evaluate the matches and return the accuracy
    results such as precision, recall and F1.

    Args:
        data_frame (DataFrame): The input pandas DataFrame containing "gold"
            labels and "predicted" labels.
        gold_label_attr (string): An attribute in the input DataFrame
            containing "gold" labels.
        predicted_label_attr (string): An attribute in the input DataFrame
            containing "predicted" labels.

    Returns:
        A Python dictionary containing the accuracy measures such as
        precision, recall, F1.

    Raises:
        AssertionError: If `data_frame` is not of type pandas DataFrame.
        AssertionError: If `gold_label_attr` is not of type string.
        AssertionError: If `predicted_label_attr` is not of type string.
        AssertionError: If the `gold_label_attr` is not in the input dataFrame.
        AssertionError: If the `predicted_label_attr` is not in the input
            dataFrame.

    Examples:
        >>> import py_entitymatching as em
        >>> # G is the labeled data used for development purposes, match_f is the feature table
        >>> H = em.extract_feat_vecs(G, feat_table=match_f, attrs_after='gold_labels')
        >>> dt = em.DTMatcher()
        >>> dt.fit(table=H, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold_labels'], target_attr='gold_labels')
        >>> pred_table = dt.predict(table=H,  exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold_labels'],  append=True, target_attr='predicted_labels')
        >>> eval_summary = em.eval_matches(pred_table, 'gold_labels', 'predicted_labels')
    """
    # Validate input parameters

    # # We expect the input object to be of type pandas DataFrame
    validate_object_type(data_frame, pd.DataFrame, 'The input table')

    # # We expect the input attribute (gold_label_attr) to be of type string
    validate_object_type(gold_label_attr, six.string_types,
                         'The input gold_label_attr')

    # # We expect the input attribute (predicted_label_attr) to be of type
    # string
    validate_object_type(predicted_label_attr, six.string_types,
                         'The input predicted_label_attr')

    # Check whether the gold label attribute is present in the input table
    if not ch.check_attrs_present(data_frame, gold_label_attr):
        logger.error(
            'The gold_label_attr is not present in the input DataFrame')
        raise AssertionError(
            'The gold_label_attr is not present in the input DataFrame')

    # Check whether the predicted label attribute is present in the input table
    if not ch.check_attrs_present(data_frame, predicted_label_attr):
        logger.error(
            'The predicted_label_attr is not present in the input DataFrame')
        raise AssertionError(
            'The predicted_label_attr is not present in the input DataFrame')

    # Reset the index to get the indices set as 0..len(table)
    new_data_frame = data_frame.reset_index(drop=False, inplace=False)

    # Project out the gold and label attributes.
    gold = new_data_frame[gold_label_attr]
    predicted = new_data_frame[predicted_label_attr]

    # Get gold negatives, positives
    gold_negative = gold[gold == 0].index.values
    gold_positive = gold[gold == 1].index.values

    # Get predicted negatives, positives
    predicted_negative = predicted[predicted == 0].index.values
    predicted_positive = predicted[predicted == 1].index.values

    # get false positive indices
    false_positive_indices = \
        list(set(gold_negative).intersection(predicted_positive))

    # get true positive indices
    true_positive_indices = \
        list(set(gold_positive).intersection(predicted_positive))

    # get false negative indices
    false_negative_indices = \
        list(set(gold_positive).intersection(predicted_negative))

    # get true negative indices
    true_negative_indices = \
        list(set(gold_negative).intersection(predicted_negative))

    # Get the number of TP, FP, FN, TN
    num_true_positives = float(len(true_positive_indices))
    num_false_positives = float(len(false_positive_indices))
    num_false_negatives = float(len(false_negative_indices))
    num_true_negatives = float(len(true_negative_indices))

    # Precision = num_tp / (num_tp + num_fp)
    # Get precision numerator, denominator
    precision_numerator = num_true_positives
    precision_denominator = num_true_positives + num_false_positives

    # Recall = num_tp / (num_tp + num_fn)
    # (the original comment here wrongly said "Precision")
    # Get recall numerator, denominator
    recall_numerator = num_true_positives
    recall_denominator = num_true_positives + num_false_negatives

    # Compute precision (guard against division by zero when there are no
    # predicted positives)
    if precision_denominator == 0.0:
        precision = 0.0
    else:
        precision = precision_numerator / precision_denominator

    # Compute recall (guard against division by zero when there are no
    # gold positives)
    if recall_denominator == 0.0:
        recall = 0.0
    else:
        recall = recall_numerator / recall_denominator

    # Compute F1 (harmonic mean of precision and recall; 0 when both are 0)
    if precision == 0.0 and recall == 0.0:
        F1 = 0.0
    else:
        F1 = (2.0 * precision * recall) / (precision + recall)

    # Get the fk_ltable and fk_rtable from the catalog
    fk_ltable = cm.get_property(data_frame, 'fk_ltable')
    fk_rtable = cm.get_property(data_frame, 'fk_rtable')

    # Check if the fk_ltable contain any missing values
    if ch.does_contain_missing_vals(data_frame, fk_ltable):
        logger.error('The fk_ltable (%s) contains missing values' %fk_ltable)
        raise AssertionError('The fk_ltable (%s) contains missing values' %
                             fk_ltable)

    # Check if the fk_rtable contain any missing values
    if ch.does_contain_missing_vals(data_frame, fk_rtable):
        logger.error('The fk_rtable (%s) contains missing values' %fk_rtable)
        raise AssertionError('The fk_rtable (%s) contains missing values' %
                             fk_rtable)

    # Set the index values to fk_ltable and fk_rtable so that the false
    # positive/negative lists are reported as (ltable key, rtable key) pairs
    new_data_frame.set_index([fk_ltable, fk_rtable], drop=False, inplace=True)

    # Get the list of false positives and false negatives.
    false_pos_ls = list(
        new_data_frame.iloc[false_positive_indices].index.values)
    false_neg_ls = list(
        new_data_frame.iloc[false_negative_indices].index.values)

    # Store and return the accuracy results.
    accuracy_results = collections.OrderedDict()
    accuracy_results['prec_numerator'] = precision_numerator
    accuracy_results['prec_denominator'] = precision_denominator
    accuracy_results['precision'] = precision
    accuracy_results['recall_numerator'] = recall_numerator
    accuracy_results['recall_denominator'] = recall_denominator
    accuracy_results['recall'] = recall
    accuracy_results['f1'] = F1
    accuracy_results['pred_pos_num'] = num_true_positives + \
                                       num_false_positives
    accuracy_results['false_pos_num'] = num_false_positives
    accuracy_results['false_pos_ls'] = false_pos_ls
    accuracy_results['pred_neg_num'] = num_false_negatives + \
                                       num_true_negatives
    accuracy_results['false_neg_num'] = num_false_negatives
    accuracy_results['false_neg_ls'] = false_neg_ls
    return accuracy_results
def get_false_positives_as_df(table, eval_summary, verbose=False):
    """
    Select only the false positives from the input table and return as a
    DataFrame based on the evaluation results.

    Args:
        table (DataFrame): The input table (pandas DataFrame) that was used
            for evaluation.
        eval_summary (dictionary): A Python dictionary containing evaluation
            results, typically from 'eval_matches' command.

    Returns:
        A pandas DataFrame containing only the False positives from
        the input table.

        Further, this function sets the output DataFrame's properties same as
        input DataFrame.

    Examples:
        >>> import py_entitymatching as em
        >>> # G is the labeled data used for development purposes, match_f is the feature table
        >>> H = em.extract_feat_vecs(G, feat_table=match_f, attrs_after='gold_labels')
        >>> dt = em.DTMatcher()
        >>> dt.fit(table=H, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold_labels'], target_attr='gold_labels')
        >>> pred_table = dt.predict(table=H,  exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold_labels'],  append=True, target_attr='predicted_labels')
        >>> eval_summary = em.eval_matches(pred_table, 'gold_labels', 'predicted_labels')
        >>> false_pos_df = em.get_false_positives_as_df(H, eval_summary)
    """
    # The input candset must be a pandas DataFrame.
    validate_object_type(table, pd.DataFrame, error_prefix='Input cand.set')

    # Metadata checking: first tell the user what metadata this command needs.
    ch.log_info(logger, 'Required metadata: cand.set key, fk ltable, '
                        'fk rtable, '
                        'ltable, rtable, ltable key, rtable key', verbose)

    # Pull the candset metadata out of the catalog ...
    ch.log_info(logger, 'Getting metadata from catalog', verbose)
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(table, logger, verbose)

    # ... and make sure it is valid for this table.
    ch.log_info(logger, 'Validating metadata', verbose)
    cm._validate_metadata_for_candset(table, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # Project the rows flagged as false positives during evaluation.
    false_pos_frame = _get_dataframe(table, eval_summary['false_pos_ls'])

    # Register the result in the catalog, copying the input's properties.
    ch.log_info(logger, 'Updating catalog', verbose)
    cm.init_properties(false_pos_frame)
    cm.copy_properties(table, false_pos_frame)

    return false_pos_frame
def get_false_negatives_as_df(table, eval_summary, verbose=False):
    """
    Select only the false negatives from the input table and return as a
    DataFrame based on the evaluation results.

    Args:
        table (DataFrame): The input table (pandas DataFrame) that was used
            for evaluation.
        eval_summary (dictionary): A Python dictionary containing evaluation
            results, typically from 'eval_matches' command.

    Returns:
        A pandas DataFrame containing only the false negatives from
        the input table.

        Further, this function sets the output DataFrame's properties same as
        input DataFrame.

    Examples:
        >>> import py_entitymatching as em
        >>> # G is the labeled data used for development purposes, match_f is the feature table
        >>> H = em.extract_feat_vecs(G, feat_table=match_f, attrs_after='gold_labels')
        >>> dt = em.DTMatcher()
        >>> dt.fit(table=H, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold_labels'], target_attr='gold_labels')
        >>> pred_table = dt.predict(table=H,  exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold_labels'],  append=True, target_attr='predicted_labels')
        >>> eval_summary = em.eval_matches(pred_table, 'gold_labels', 'predicted_labels')
        >>> false_neg_df = em.get_false_negatives_as_df(H, eval_summary)
    """
    # Validate input parameters
    # # We expect the input candset to be of type pandas DataFrame.
    validate_object_type(table, pd.DataFrame, error_prefix='Input cand.set')

    # Do metadata checking
    # # Mention what metadata is required to the user
    ch.log_info(logger, 'Required metadata: cand.set key, fk ltable, '
                        'fk rtable, '
                        'ltable, rtable, ltable key, rtable key', verbose)

    # # Get metadata
    ch.log_info(logger, 'Getting metadata from the catalog', verbose)
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(table, logger, verbose)

    # # Validate metadata
    ch.log_info(logger, 'Validating metadata', verbose)
    cm._validate_metadata_for_candset(table, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # Select only the rows that were marked as false negatives.
    data_frame = _get_dataframe(table, eval_summary['false_neg_ls'])

    # # Update catalog
    ch.log_info(logger, 'Updating catalog', verbose)
    cm.init_properties(data_frame)
    cm.copy_properties(table, data_frame)

    # # Log the return (the original comment here wrongly repeated
    # "Update catalog")
    ch.log_info(logger, 'Returning the dataframe', verbose)

    return data_frame