Data Science Series
[Kaggle Intro to Machine Learning] Python basic code
heave_17
2021. 4. 28. 21:47
# Basic Data Exploration
import pandas as pd
# Load the CSV into a DataFrame, then print three overviews of it in order:
# describe() summary statistics, per-column dtypes, and the head() rows.
data = pd.read_csv('melb_data.csv')
print(data.describe(), data.dtypes, data.head(), sep='\n')
# Selecting Data for Modeling
print(data.columns)

# Keep only the rows that have no missing value in any column.
data = data.dropna(axis=0)

# Feature columns used as model inputs.
# NOTE: 'Lattitude' / 'Longtitude' are kept exactly as written because they
# are used as column labels below and must match the CSV header.
cand_features = ['Rooms', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude']

# Prediction target: an independent copy of the 'Price' column.
y = data['Price'].copy()

# Model inputs: the candidate feature columns, taken from a copy of the
# cleaned frame so that `data` itself is left untouched.
X = data.copy()[cand_features]
#Building your model
from sklearn.tree import DecisionTreeRegressor
# Fit a decision tree on the full feature table; the fixed random_state makes
# the fitted tree reproducible. fit() returns the estimator itself.
home_model = DecisionTreeRegressor(random_state=1).fit(X, y)

# Show the first actual prices, then the model's predictions for the same rows.
print(y.head(), home_model.predict(X.head()), sep='\n')
# Model Validation
# There are many metrics for summarizing model quality, but we'll start with one called Mean Absolute Error (also called MAE).
# error = actual - predicted; MAE is the average of the absolute values of these errors.
from sklearn.metrics import mean_absolute_error
# In-sample score: the predictions are made for X, the same table the model
# was fitted on, so this error says nothing about unseen data.
predicted_home_prices = home_model.predict(X)
print(mean_absolute_error(y_true=y, y_pred=predicted_home_prices))
#Split validation data
from sklearn.model_selection import train_test_split
# Hold back part of the data for validation. random_state fixes the shuffle so
# the same rows land in each split on every run.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)

# Refit on the training split only. The tree is seeded (as everywhere else in
# this script) so that the printed validation error is reproducible.
home_model = DecisionTreeRegressor(random_state=1)
home_model.fit(train_X, train_y)

# Score on rows the model has not seen during fitting.
val_predictions = home_model.predict(val_X)
print(mean_absolute_error(val_y, val_predictions))
# Underfitting and Overfitting
#Experimenting with Different Models
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Return the validation MAE of a decision tree limited to *max_leaf_nodes*.

    A ``DecisionTreeRegressor`` (``random_state=0``) is fitted on
    ``train_X``/``train_y``; its predictions for ``val_X`` are then compared
    with ``val_y`` using ``mean_absolute_error``.

    Args:
        max_leaf_nodes: Passed through as the tree's ``max_leaf_nodes``.
        train_X: Training features.
        val_X: Validation features.
        train_y: Training target values.
        val_y: Validation target values.

    Returns:
        The mean absolute error on the validation data.
    """
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    return mean_absolute_error(val_y, preds_val)
# Compare validation error across tree sizes, from very small to very large.
for max_leaf_nodes in [5, 50, 500, 5000]:
    my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    # %d truncates the float MAE to an integer for display.
    print("Max leaf nodes: %d \t\t Mean Absolute Error: %d" % (max_leaf_nodes, my_mae))
# Random Forests
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
# Fit a seeded random forest on the training split (fit() returns the
# estimator itself), then report its error on the validation split.
forest_model = RandomForestRegressor(random_state=1).fit(train_X, train_y)
preds = forest_model.predict(val_X)
print(mean_absolute_error(y_true=val_y, y_pred=preds))