Source code for py_stringsimjoin.utils.converter

import pandas as pd

def dataframe_column_to_str(dataframe, col_name, inplace=False,
                            return_col=False):
    """Convert a column of the dataframe into string type, preserving NaNs.

    This method is useful when performing join over numeric columns.
    Currently, the join methods expect the join columns to be of string
    type. Hence, the numeric columns need to be converted to string type
    before performing the join.

    Args:
        dataframe (DataFrame): Input pandas dataframe.
        col_name (string): Name of the column in the dataframe to be
            converted.
        inplace (boolean): A flag indicating whether the input dataframe
            should be modified inplace or in a copy of it.
        return_col (boolean): A flag indicating whether a copy of the
            converted column should be returned. When this flag is set to
            True, the method will not modify the original dataframe and will
            return a new column of string type. Only one of inplace and
            return_col can be set to True.

    Returns:
        A Boolean value (True) when inplace is set to True.

        A new dataframe when inplace is set to False and return_col is set
        to False.

        A series when inplace is set to False and return_col is set to True.

    Raises:
        AssertionError: If `dataframe` is not a pandas DataFrame, if
            `col_name` is not one of its columns, if `inplace` or
            `return_col` is not a bool, or if both flags are True.
        TypeError: Propagated from `series_to_str` when the column has a
            type that cannot be converted.
    """
    if not isinstance(dataframe, pd.DataFrame):
        raise AssertionError('First argument is not of type pandas dataframe')

    if col_name not in dataframe.columns:
        # str.format (rather than '+') so that a non-string column label
        # still produces the intended AssertionError.
        raise AssertionError(
            'Column \'{0}\' not found in the input dataframe'.format(col_name))

    if not isinstance(inplace, bool):
        raise AssertionError('Parameter \'inplace\' is not of type bool')

    if not isinstance(return_col, bool):
        raise AssertionError('Parameter \'return_col\' is not of type bool')

    if inplace and return_col:
        raise AssertionError('Both \'inplace\' and \'return_col\' parameters '
                             'cannot be set to True')

    column = dataframe[col_name]

    if return_col:
        # series_to_str with inplace=False never touches its input, so the
        # original dataframe is left unmodified.
        return series_to_str(column, False)

    if len(column) == 0 or column.isnull().all():
        # Nothing to stringify: an empty or all-NaN column is simply cast to
        # object type. This is handled here for both the inplace and the copy
        # mode so that the two modes agree.
        converted = column.astype(object)
    else:
        converted = series_to_str(column, False)

    # Assign the converted column explicitly instead of relying on an
    # in-place update of a series extracted from the dataframe being
    # propagated back to its parent.
    target = dataframe if inplace else dataframe.copy()
    target[col_name] = converted
    return True if inplace else target
def series_to_str(series, inplace=False):
    """Convert series into string type while preserving NaN values.

    Args:
        series (Series): Input pandas series.
        inplace (boolean): A flag indicating whether the input series should
            be modified inplace or in a copy of it. This flag is ignored when
            the input series consists of only NaN values or the series is
            empty (with int or float type). In these two cases, we always
            return a copy irrespective of the inplace flag.

    Returns:
        A Boolean value (True) when inplace is set to True.

        A series when inplace is set to False (or when the inplace flag is
        ignored, see above).

    Raises:
        AssertionError: If `series` is not a pandas Series or `inplace` is
            not a bool.
        TypeError: If the series is non-empty and its type is not string,
            object, integer or float.
    """
    # numpy is a hard dependency of pandas. It is imported here directly
    # because the `pandas.np` alias is not available in recent pandas.
    import numpy as np

    if not isinstance(series, pd.Series):
        raise AssertionError('First argument is not of type pandas series')

    if not isinstance(inplace, bool):
        raise AssertionError('Parameter \'inplace\' is not of type bool')

    col_type = series.dtype

    # Columns of object type, or of the dedicated pandas string type (when
    # the installed pandas provides it), are considered to be strings already.
    string_dtype = getattr(pd, 'StringDtype', None)
    already_str = (col_type == object or
                   (string_dtype is not None and
                    isinstance(col_type, string_dtype)))

    # Currently, we ignore the inplace flag when the series is empty and is of
    # type int or float. In this case, we will always return a copy.
    if len(series) == 0:
        if already_str and inplace:
            return True
        return series.astype(object)

    if already_str:
        # If column is already of string type, do not perform any conversion.
        return True if inplace else series.copy()

    if not isinstance(col_type, np.dtype):
        # pandas extension types cannot be interpreted by numpy; reject them
        # with the same error as any other unsupported type.
        raise TypeError('Invalid column type. ' +
                        'Cannot convert the column to string.')

    if np.issubdtype(col_type, np.integer):
        # If the column is of type int, then there are no missing values in
        # the column and hence we can directly convert it to string using
        # the astype method.
        col_str = series.astype(str)
    elif np.issubdtype(col_type, np.floating):
        # If the column is of type float, then there are two cases:
        # (1) column only contains integer values along with NaN.
        # (2) column actually contains floating point values.
        # For case 1, we preserve the NaN values as such and convert the float
        # values to string by first converting them to int and then to string.
        # For case 2, we preserve the NaN values as such and convert the float
        # values directly to string.

        # get the column values that are not NaN
        col_non_nan_values = series.dropna()

        # Currently, we ignore the inplace flag when all values in the column
        # are NaN and will always return a copy of the column cast into
        # object type.
        if len(col_non_nan_values) == 0:
            return series.astype(object)

        # check whether every non-NaN value is an integer value cast into
        # float (case 1); otherwise we proceed by case 2.
        only_int_values = col_non_nan_values.apply(
            lambda val: val.is_integer()).all()

        if only_int_values:
            col_str = series.apply(
                lambda val: np.nan if pd.isnull(val) else str(int(val)))
        else:
            col_str = series.apply(
                lambda val: np.nan if pd.isnull(val) else str(val))
    else:
        raise TypeError('Invalid column type. ' +
                        'Cannot convert the column to string.')

    if inplace:
        # NOTE(review): this relies on Series.update being able to store
        # string values into a numeric series; recent pandas versions may
        # warn about or refuse such an upcast -- confirm against the pandas
        # versions that need to be supported.
        series.update(col_str)
        return True

    return col_str