[C02 머신러닝 프로젝트 AtoZ]09 파이프라인 만들기
import pandas as pd
import numpy as np
기존 파일들에서 했던 작업
데이터 불러오기
housing = pd.read_csv('housing.csv')
income category 특성을 기준으로, 계층적 샘플링
from sklearn.model_selection import StratifiedShuffleSplit
# 소득 카테고리 특성을 만들기
housing["income_cat"] = pd.cut(housing["median_income"],
bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
labels=[1, 2, 3, 4, 5])
# 각 income별로 비율을 맞춰서 샘플링
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_idx, test_idx in split.split(housing, housing["income_cat"]):
train_set = housing.loc[train_idx]
test_set = housing.loc[test_idx]
# income_cat 특성을 삭제해서 데이터를 원래 상태로 되돌림
for set_ in (train_set, test_set):
set_.drop("income_cat", axis=1, inplace=True)
변환기 만들기
from sklearn.base import BaseEstimator, TransformerMixin
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
def __init__(self, add_bedrooms_per_room=True):
self.add_bedrooms_per_room = add_bedrooms_per_room
def fit(self, X, y=None):
return self
def transform(self, X):
rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
population_per_household = X[:, population_ix] / X[:, households_ix]
if self.add_bedrooms_per_room:
bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
else:
return np.c_[X, rooms_per_household, population_per_household]
파이프라인 만들기
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# 숫자형 특성에 적용할 파이프라인
num_pipeline = Pipeline([
('imputer', SimpleImputer(strategy="median")), # 결측치를 중간값으로 대체
('attribs_adder', CombinedAttributesAdder()), # 새로운 특성 추가
('std_scaler', StandardScaler()), # 표준화
])
# 범주형과 숫자형 각각을 하나의 파이프라인에서 처리
num_attribs = list(train_set.drop('ocean_proximity', axis=1))
cat_attribs = ['ocean_proximity']
full_pipeline = ColumnTransformer([
('num', num_pipeline, num_attribs), # 숫자형 파이프라인
('cat', OneHotEncoder(), cat_attribs), # 범주형 파이프라인
])
housing_prepared = full_pipeline.fit_transform(housing)
# housing_prepared to csv
pd.DataFrame(housing_prepared).to_csv('housing_prepared.csv', index=False)
Leave a comment