1 minute read

import pandas as pd
import numpy as np

기존 파일들에서 했던 작업

데이터 불러오기

housing = pd.read_csv('housing.csv')

income category 특성을 기준으로, 계층적 샘플링

from sklearn.model_selection import StratifiedShuffleSplit

# 소득 카테고리 특성을 만들기
housing["income_cat"] = pd.cut(housing["median_income"],
                                bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                                labels=[1, 2, 3, 4, 5])

# 각 income별로 비율을 맞춰서 샘플링
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_idx, test_idx in split.split(housing, housing["income_cat"]):
    train_set = housing.loc[train_idx]
    test_set = housing.loc[test_idx]

# income_cat 특성을 삭제해서 데이터를 원래 상태로 되돌림
for set_ in (train_set, test_set):
    set_.drop("income_cat", axis=1, inplace=True)

변환기 만들기

from sklearn.base import BaseEstimator, TransformerMixin

rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
  def __init__(self, add_bedrooms_per_room=True):
    self.add_bedrooms_per_room = add_bedrooms_per_room
  def fit(self, X, y=None):
    return self
  def transform(self, X):
    rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
    population_per_household = X[:, population_ix] / X[:, households_ix]
    if self.add_bedrooms_per_room:
      bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
      return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
    else:
      return np.c_[X, rooms_per_household, population_per_household]

파이프라인 만들기

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# 숫자형 특성에 적용할 파이프라인
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")), # 결측치를 중간값으로 대체
    ('attribs_adder', CombinedAttributesAdder()), # 새로운 특성 추가
    ('std_scaler', StandardScaler()), # 표준화
])

# 범주형과 숫자형 각각을 하나의 파이프라인에서 처리
num_attribs = list(train_set.drop('ocean_proximity', axis=1))
cat_attribs = ['ocean_proximity']

full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attribs), # 숫자형 파이프라인
    ('cat', OneHotEncoder(), cat_attribs), # 범주형 파이프라인
])

housing_prepared = full_pipeline.fit_transform(housing)

# housing_prepared to csv
pd.DataFrame(housing_prepared).to_csv('housing_prepared.csv', index=False)

Leave a comment