Use case: sometimes files are processed were nulls are represented with text like “NULL” meaning the field is not actually empty or null.
Below are some python functions and a test demonstrating functionality.
def getListOfMissingValues(): """ desc: List of common words used to represent null that are often found in files as text """ lst = ['NaN', 'NAN', 'nan', 'null', 'NULL', 'nul', 'NUL', 'none', 'NONE', '', ' ', ' '] return lst def advanceMissingValues(df): """ desc: Count nulls and hardcoded text that represents nulls param p1: DataFrame name return: DataFrame of field names and count values """ lstMissingVals = getListOfMissingValues() col_list = getListOfFieldNames(df) output = pd.DataFrame(col_list) output.rename(columns = {0:'FieldName'}, inplace = True) output['Count'] = '' #For each field name count nulls and other null type values for col in col_list: nullCnt = df[col].isnull().sum(axis=0) #For each missing value perform count on column missValCnt = 0 for missVal in lstMissingVals: missValCnt = missValCnt + len(df[(df[col]==missVal)]) cntTotal = nullCnt + missValCnt output.loc[output['FieldName'] == col, 'Count'] = cntTotal return output #Test Setup lst = ['NaN', 'NAN', 'nan', 'null', 'NULL', 'nul', 'NUL', 'none', 'NONE', '', ' ', ' ' ,None] mdf = pd.DataFrame(lst) mdf.rename(columns = {0:'NullTypes'}, inplace = True) print(mdf) #Run Test chk = advanceMissingValues(mdf) chk
Sample output:
