[Kaggle Data Cleaning] Python basic code

heave_17 2021. 4. 28. 21:54
# Data Cleaning
import pandas as pd 
import numpy as np 

df = pd.DataFrame() # placeholder: in practice, load a dataset here, e.g. df = pd.read_csv(...)

# 1. Handling Missing Values

# Check missing values count
missing_values_count = df.isnull().sum()
total_cells = np.prod(df.shape) # np.product is deprecated; use np.prod
missing_cells = missing_values_count.sum()

percent_missing = missing_cells / total_cells * 100
print(percent_missing)
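
# A minimal sketch of the same check per column (isnull().mean() gives the
# fraction of missing values in each column):
percent_missing_by_column = df.isnull().mean() * 100
print(percent_missing_by_column.sort_values(ascending=False))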


# Drop missing values
#  Row
df.dropna() # drop rows that have at least one missing value

#  Column
df.dropna(axis=1) # drop columns that have at least one missing value

# Filling in missing values automatically
df.fillna(0) # replace missing values with zero
df.fillna(method='bfill', axis=0) # replace each NA with the value that comes directly after it in the same column
## method: {'backfill', 'bfill', 'pad', 'ffill', None}, default None
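
# A minimal sketch (toy values, not from the original post) showing the difference
# between fillna(0) and backfilling on a single column:
toy_col = pd.DataFrame({"x": [1.0, np.nan, np.nan, 4.0]})
print(toy_col["x"].fillna(0).tolist())              # [1.0, 0.0, 0.0, 4.0]
print(toy_col["x"].fillna(method="bfill").tolist()) # [1.0, 4.0, 4.0, 4.0]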

## imputer usage example
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values = np.nan, strategy="median")
# strategy = {'mean', 'median', 'most_frequent', 'constant'}
imputer.fit(df)
X = imputer.transform(df) # returned as a numpy array
df_tr = pd.DataFrame(X, columns = df.columns, index = list(df.index.values))
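
# Quick usage check of the imputer round trip above (toy values, assumed for illustration):
toy_df = pd.DataFrame({"a": [1.0, np.nan, 3.0], "b": [np.nan, 10.0, 30.0]})
toy_tr = pd.DataFrame(SimpleImputer(strategy="median").fit_transform(toy_df), columns=toy_df.columns)
print(toy_tr) # NaN in 'a' becomes 2.0 (median of 1 and 3); NaN in 'b' becomes 20.0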


# 2. Scaling and Normalization
#     - Scaling: changing the range of data
#     - Normalization: changing the shape of the distribution of data

original_data = np.random.exponential(size=100)

# min-max scale the data between 0 and 1
from mlxtend.preprocessing import minmax_scaling
scaled_data = minmax_scaling(original_data, columns=[0])
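
# The same min-max rescaling written out by hand (a sketch, equivalent in spirit to the call above):
# x_scaled = (x - min(x)) / (max(x) - min(x))
manual_scaled = (original_data - original_data.min()) / (original_data.max() - original_data.min())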

# Scikit-Learn Scaler
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler
## - StandardScaler (mean and std.)
scaler = StandardScaler()
## - MinMaxScaler (0-1)
scaler = MinMaxScaler()
## - MaxAbsScaler (0: 0, Abs(X): 1)
scaler = MaxAbsScaler()
## - RobustScaler (median, IQR) -> robust to outliers
scaler = RobustScaler()

# scikit-learn scalers expect 2-D input, so reshape the 1-D array into a single column
original_2d = original_data.reshape(-1, 1)
scaler.fit(original_2d)
data_scaled = scaler.transform(original_2d)
#data_scaled = scaler.fit_transform(original_2d)
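
# Quick sanity check (a sketch): after StandardScaler, the column has mean ~0 and std ~1
std_scaled = StandardScaler().fit_transform(original_2d)
print(std_scaled.mean(), std_scaled.std()) # approximately 0.0 and 1.0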

# normalize the exponential data with a Box-Cox transformation
from scipy import stats
normalized_data, fitted_lambda = stats.boxcox(original_data) # boxcox returns (transformed data, fitted lambda)


# 3. Parsing Dates

# Check the data type
print(df['Date'].dtype)

# Convert to datetime
df['date_parsed'] = pd.to_datetime(df['Date'], format="%m/%d/%y")

# Select the day of the month
day_of_month_df = df['date_parsed'].dt.day 

# Plot the day of the month to check the date parsing
import seaborn as sns
day_of_month_df = day_of_month_df.dropna()
sns.distplot(day_of_month_df, kde=False, bins=31) # note: distplot is deprecated in recent seaborn; sns.histplot is the replacement

# Check whether all dates share the same string length (i.e. the same format)
date_lengths = df['Date'].str.len()
date_lengths.value_counts()
# inspect the rows whose dates have an unexpected length (here, 24 characters)
indices = np.where([date_lengths == 24])[1]
df.loc[indices]
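
# A minimal sketch (assuming the same 'Date' column) of one way to handle mixed formats:
# drop the fixed format string and let pandas parse each value, coercing failures to NaT.
df['date_parsed'] = pd.to_datetime(df['Date'], errors='coerce')
print(df['date_parsed'].isnull().sum()) # rows that still failed to parse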


# 4. Character Encodings

# start with a string
before = "This is the euro symbol: €"

# check to see what datatype it is
type(before)

# encode the string to bytes as UTF-8, replacing any characters that can't be encoded
after = before.encode("utf-8", errors="replace")

# check the type
type(after)
print(after)

# decode the bytes back into a string
print(after.decode("utf-8"))
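
# A quick illustration (sketch): decoding the same UTF-8 bytes with the wrong codec
# does not raise here, it silently produces mojibake instead.
print(after.decode("windows-1252")) # 'This is the euro symbol: â‚¬'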

# read in files with encoding problems

# look at the first hundred thousand bytes to guess the character encoding
import chardet
with open("../input/kickstarter-projects/ks-projects-201801.csv", 'rb') as rawdata:
    result = chardet.detect(rawdata.read(100000))

# check what the character encoding might be
print(result)

# read in the file with the encoding detected by chardet
kickstarter_2016 = pd.read_csv("../input/kickstarter-projects/ks-projects-201612.csv", encoding='Windows-1252')

# look at the first few lines
kickstarter_2016.head()

# save our file (will be saved as UTF-8 by default!)
kickstarter_2016.to_csv("ks-projects-201801-utf8.csv")
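
# The saved file can be read back without an explicit encoding argument,
# because UTF-8 is the default encoding for pd.read_csv:
kickstarter_utf8 = pd.read_csv("ks-projects-201801-utf8.csv")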


# 5. Inconsistent Data Entry ("South Korea" vs "southkorea")

countries = professors['Country'].unique()
countries.sort()

# convert to lower case
professors['Country'] = professors['Country'].str.lower()
# remove leading and trailing white space
professors['Country'] = professors['Country'].str.strip()

# There is still an inconsistency: 'southkorea' and 'south korea' should be the same.
# We'll use the fuzzywuzzy package to identify which strings are closest to each other.
import fuzzywuzzy
from fuzzywuzzy import fuzz, process
countries = professors['Country'].unique() # refresh the list after lower-casing and stripping
matches = fuzzywuzzy.process.extract("south korea", countries, limit=10,
                                     scorer=fuzzywuzzy.fuzz.token_sort_ratio)

# take a look at them (to set a min_ratio)
print(matches)

# replace values in `column` of `df` whose fuzzy-match ratio against
# `string_to_match` is at least `min_ratio` with `string_to_match` itself
def replace_matches_in_column(df, column, string_to_match, min_ratio = 47):
    # get a list of unique strings
    strings = df[column].unique()
    
    # get the top 10 closest matches to our input string
    matches = fuzzywuzzy.process.extract(string_to_match, strings, 
                                         limit=10, 
                                         scorer=fuzzywuzzy.fuzz.token_sort_ratio)

    # only keep matches with a ratio of at least min_ratio
    close_matches = [match[0] for match in matches if match[1] >= min_ratio]

    # get the rows of all the close matches in our dataframe
    rows_with_matches = df[column].isin(close_matches)

    # replace the close matches with the canonical input string
    df.loc[rows_with_matches, column] = string_to_match
    
    # let us know the function's done
    print("All done!")

# use the function we just wrote to replace close matches to "south korea" with "south korea"
replace_matches_in_column(df=professors, column='Country', string_to_match="south korea")
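
# verify the replacement worked: the cleaned column should now contain only one spelling
print(professors['Country'].unique())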