Source code for py_stringsimjoin.profiler.profiler

"""Profiling tools"""

import pandas as pd

from py_stringsimjoin.utils.validation import validate_attr, \
                                              validate_input_table


[docs]def profile_table_for_join(input_table, profile_attrs=None): """Profiles the attributes in the table to suggest implications for join. Args: input_table (DataFrame): input table to profile. profile_attrs (list): list of attribute names from the input table to be profiled (defaults to None). If not provided, all attributes in the input table will be profiled. Returns: A dataframe consisting of profile output. Specifically, the dataframe contains three columns, 1) 'Unique values' column, which shows the number of unique values in each attribute, 2) 'Missing values' column, which shows the number of missing values in each attribute, and 3) 'Comments' column, which contains comments about each attribute. The output dataframe is indexed by attribute name, so that the statistics for each attribute can be easily accessed using the attribute name. """ # check if the input table is a dataframe validate_input_table(input_table, 'input table') profile_output = [] if profile_attrs is None: profile_attrs = list(input_table.columns.values) else: # check if the profile attributes exist for attr in profile_attrs: validate_attr(attr, input_table.columns, 'profile attribute', 'input table') num_rows = len(input_table) for attr in profile_attrs: # compute number of unique values in the column unique_values = len(input_table[attr].unique()) # compute number of missing values in the column missing_values = sum(pd.isnull(input_table[attr])) # compute percentage of unique values in the column unique_percent = round((float(unique_values) / float(num_rows)) * 100, 2) # compute percentage of missing values in the column missing_percent = round((float(missing_values) / float(num_rows)) * 100, 2) # format stats for better display formatted_unique_stat = _format_statistic(unique_values, unique_percent) formatted_missing_stat = _format_statistic(missing_values, missing_percent) comments = '' # if there are missing values in the column, add a comment. if missing_percent > 0: comments = ''.join(['Joining on this attribute will ignore ', formatted_missing_stat, ' rows.']) # if the column consists of unique values, add a comment. if unique_percent == 100.0 and missing_values == 0: comments = 'This attribute can be used as a key attribute.' profile_output.append((attr, formatted_unique_stat, formatted_missing_stat, comments)) # compose output dataframe containing the profiling results. output_header = ['Attribute', 'Unique values', 'Missing values', 'Comments'] output_df = pd.DataFrame(profile_output, columns=output_header) return output_df.set_index('Attribute')
def _format_statistic(stat, stat_percent): return ''.join([str(stat), ' (', str(stat_percent), '%)'])