데이콘 경진대회: 전력소비량 AI 예측 경진대회. 모델: LightGBM 사용. 파라미터 튜닝: 패키지에서 제공하는 기본값만 사용하며, 사업장별로 동일한 파라미터를 적용. 방법: 사업장 60개를 개별적으로 학습.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_absolute_error # 정확도 평가
import lightgbm as lgb
import math

# Cap the number of columns pandas prints when a DataFrame is displayed.
pd.set_option('display.max_columns', 15)

def data_read(name):
    """Load ``{name}.xlsx`` and return a cleaned frame with calendar features.

    The sheet must contain a ``date_time`` column (``YYYY-MM-DD HH``) followed
    by nine data columns, which are renamed to ``num``, ``pcon``, ``temp``,
    ``wind``, ``humidity``, ``rain``, ``sunshine``, ``non-elec`` and ``pv``.
    ``dayofweek``, ``month`` and ``hour`` are derived from the timestamp.

    Cleaning depends on ``name``:

    * ``'train'``: missing ``non-elec`` / ``pv`` values become 0.
    * ``'test'``: ``temp``, ``wind`` and ``humidity`` are interpolated;
      ``rain`` and ``sunshine`` are divided by 6 and 3 (the original author
      treats them as 6-hour / 3-hour accumulated readings), back-filled and
      then zero-filled; the ``non-elec`` / ``pv`` facility flags observed at
      hour 0 are propagated to the following rows until the next hour-0 row.

    Any NaN that is still left afterwards is replaced by 0.

    Args:
        name: File stem; ``'train'`` and ``'test'`` trigger the cleaning
            steps above, any other value only gets the final zero fill.

    Returns:
        pandas.DataFrame indexed by ``date_time``.
    """
    df = pd.read_excel(f'{name}.xlsx')
    df = df.set_index('date_time')
    df.columns = ['num', 'pcon', 'temp', 'wind', 'humidity', 'rain', 'sunshine', 'non-elec', 'pv']

    # Parse the index once instead of round-tripping through a helper column.
    stamps = pd.to_datetime(df.index, format='%Y-%m-%d %H')
    df['dayofweek'] = stamps.dayofweek
    df['month'] = stamps.month
    df['hour'] = stamps.hour

    if name == 'train':
        for col in ('non-elec', 'pv'):
            df[col] = df[col].fillna(0)

    if name == 'test':
        # Weather readings are sparse in the test set: interpolate the gaps.
        for col in ('temp', 'wind', 'humidity'):
            df[col] = df[col].interpolate()

        # Spread each accumulated reading over the hours that precede it.
        # .bfill() replaces the deprecated fillna(method='bfill').
        for col, hours in (('rain', 6), ('sunshine', 3)):
            df[col] = (df[col] / hours).bfill().fillna(0)

        # Facility flags: every hour-0 row decides the flag (1 if the cell is
        # exactly 1, otherwise 0) for the rows that follow it.  Hour-0 rows
        # keep their own value; rows before the first hour-0 row get 0.
        # Done positionally with whole-column operations, so it works with the
        # duplicated timestamps of a multi-site file and avoids the chained
        # assignment that raised SettingWithCopyWarning.
        # NOTE: as before, the flag is carried across site boundaries; this
        # relies on every site's rows starting at hour 0.
        midnight = df['hour'] == 0
        for col in ('non-elec', 'pv'):
            owned = df[col].eq(1).astype(float)
            daily_flag = owned.where(midnight).ffill().fillna(0.0)
            df[col] = df[col].where(midnight, daily_flag)

    # Whatever is still missing is treated as 0.
    return df.fillna(0)

def method_LightGBM(train_X, train_Y, test_X, test_Y):
    """Fit a LightGBM regressor on the training pair and predict ``test_X``.

    Args:
        train_X: Training features.
        train_Y: Training target.
        test_X: Features to predict for.
        test_Y: Only its index is used, to label the returned predictions.

    Returns:
        Single-column pandas.DataFrame of predictions indexed like ``test_Y``.
    """
    regressor = lgb.LGBMRegressor(
        objective='regression',
        boosting_type='gbdt',
        learning_rate=0.05,
        metric='mse',
    )
    # Fit on copies so the caller's objects are never handed to the model directly.
    regressor.fit(train_X.copy(), train_Y.copy())

    forecast = regressor.predict(test_X)
    return pd.DataFrame(forecast, index=test_Y.index)

# Load the data; data_read also fills in / interpolates missing values.
train_data = data_read('train') # load the training data
train_data

test_data = data_read('test') # load the test data
test_data

<ipython-input-6-c909b963a8ec>:46: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfc['non-elec'][idx] = cur_nonelec
<ipython-input-6-c909b963a8ec>:47: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfc['pv'][idx] = cur_pv

# Train one model per site and collect every site's predictions.

pred_total = []

# Walk the site ids that are actually present rather than assuming 1..N.
for site in sorted(train_data['num'].unique()):
    # .copy() makes an independent frame, so scaling 'pcon' below does not
    # write into a slice of train_data (the SettingWithCopyWarning source).
    train = train_data[train_data['num'] == site].copy()

    # Min-max scale the target using this site's own range.
    train_pcon_max = train['pcon'].max()
    train_pcon_min = train['pcon'].min()
    pcon_range = train_pcon_max - train_pcon_min
    if pcon_range == 0:
        # Constant consumption: avoid 0/0 -> NaN targets; predictions then
        # map back to the constant value.
        pcon_range = 1.0
    train['pcon'] = (train['pcon'] - train_pcon_min) / pcon_range

    train_X = train.drop(['num', 'pcon'], axis=1) # features: everything except site id and target
    train_Y = train['pcon'] # scaled power consumption is the target

    test = test_data[test_data['num'] == site] # rows of the same site in the test set
    test_X = test.drop(['num', 'pcon'], axis=1)
    test_Y = test['pcon'] # only used by method_LightGBM for its index

    site_pred = method_LightGBM(train_X, train_Y, test_X, test_Y) # predictions for this site
    cal_pred = site_pred.copy()
    cal_pred.columns = ['pred'] # name the single prediction column

    # Undo the min-max scaling to get back to the original units.
    cal_pred['pred'] = cal_pred['pred'] * pcon_range + train_pcon_min

    pred_total.append(cal_pred) # one prediction frame per site

<ipython-input-10-f2564096af62>:10: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['pcon'] = (train['pcon'] - train_pcon_min)/(train_pcon_max-train_pcon_min) # 정규화

comp = pd.concat(pred_total) # stack the per-site prediction frames into one
comp

	num	pcon	temp	wind	humidity	rain	sunshine	non-elec	pv	dayofweek	month	hour
date_time
2020-06-01 00	1	8179.056	17.6	2.5	92.0	0.8	0.0	0.0	0.0	0	6	0
2020-06-01 01	1	8135.640	17.7	2.9	91.0	0.3	0.0	0.0	0.0	0	6	1
2020-06-01 02	1	8107.128	17.5	3.2	91.0	0.0	0.0	0.0	0.0	0	6	2
2020-06-01 03	1	8048.808	17.1	3.2	91.0	0.0	0.0	0.0	0.0	0	6	3
2020-06-01 04	1	8043.624	17.0	3.3	92.0	0.0	0.0	0.0	0.0	0	6	4
...	...	...	...	...	...	...	...	...	...	...	...	...
2020-08-24 19	60	4114.368	27.8	2.3	68.0	0.0	0.7	1.0	1.0	0	8	19
2020-08-24 20	60	3975.696	27.3	1.2	71.0	0.0	0.0	1.0	1.0	0	8	20
2020-08-24 21	60	3572.208	27.3	1.8	71.0	0.0	0.0	1.0	1.0	0	8	21
2020-08-24 22	60	3299.184	27.1	1.8	74.0	0.0	0.0	1.0	1.0	0	8	22
2020-08-24 23	60	3204.576	27.1	2.6	75.0	0.0	0.0	1.0	1.0	0	8	23

	num	pcon	temp	wind	humidity	rain	sunshine	non-elec	pv	dayofweek	month	hour
date_time
2020-08-25 00	1	8696.160	27.800000	1.500000	74.000000	0.0	0.0	0.0	0.0	1	8	0
2020-08-25 01	1	8674.128	27.633333	1.366667	75.333333	0.0	0.0	0.0	0.0	1	8	1
2020-08-25 02	1	8644.320	27.466667	1.233333	76.666667	0.0	0.0	0.0	0.0	1	8	2
2020-08-25 03	1	8624.232	27.300000	1.100000	78.000000	0.0	0.0	0.0	0.0	1	8	3
2020-08-25 04	1	8621.640	26.900000	1.166667	79.666667	0.0	0.0	0.0	0.0	1	8	4
...	...	...	...	...	...	...	...	...	...	...	...	...
2020-08-31 19	60	3891.456	28.633333	3.566667	66.000000	0.0	0.0	1.0	1.0	0	8	19
2020-08-31 20	60	3886.272	28.266667	3.833333	67.000000	0.0	0.0	1.0	1.0	0	8	20
2020-08-31 21	60	3678.480	27.900000	4.100000	68.000000	0.0	0.0	1.0	1.0	0	8	21
2020-08-31 22	60	3269.808	27.900000	4.100000	68.000000	0.0	0.0	1.0	1.0	0	8	22
2020-08-31 23	60	3096.144	27.900000	4.100000	68.000000	0.0	0.0	1.0	1.0	0	8	23

	pred
date_time
2020-08-25 00	8670.936871
2020-08-25 01	8661.803152
2020-08-25 02	8651.990545
2020-08-25 03	8640.240818
2020-08-25 04	8633.519894
...	...
2020-08-31 19	4067.160610
2020-08-31 20	3969.056922
2020-08-31 21	3784.600560
2020-08-31 22	3604.907669
2020-08-31 23	3290.364479