0netw0m1ra

[054] 웹사이트 방문자 예측 실습 정리 본문

빅데이터분석기사 실기

[054] 웹사이트 방문자 예측 실습 정리

M1RA 2022. 6. 24. 20:00

# 필요한 라이브러리 import

import pandas as pd

import numpy as np

# Notebook display settings: show up to 500 rows / 20 columns and print
# floats with four decimals. The fully qualified 'display.*' keys are used
# because the short names match more than one option in recent pandas.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 20)
# The formatter must contain a '{}' replacement field; without one every
# float would be rendered as the same literal text.
pd.set_option('display.float_format', '{:.4f}'.format)

 

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_squared_log_error as msle
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

 

# The exam task is not known in advance, so every common metric is computed.

# Helper that computes r2, mae, mse, rmse, msle and rmsle.

def get_scores2(model, xtrain, xtest, ytrain, ytest):
    """Score a fitted regressor on the train and test splits.

    Args:
        model: fitted estimator exposing ``predict``.
        xtrain, xtest: feature sets passed to ``model.predict``.
        ytrain, ytest: true targets matching ``xtrain`` / ``xtest``.

    Returns:
        pandas.Series indexed by ``r2_train``, ``r2_test``, ``mae``, ``mse``,
        ``msle``, ``rmse`` and ``rmsle``, each rounded to four decimals.
        Only ``r2_train`` uses the training split; all other metrics are
        computed on the test split.
    """
    train_pred = model.predict(xtrain)
    test_pred = model.predict(xtest)

    # These metrics accept negative predictions as they are.
    metrics = {
        'r2_train': r2_score(ytrain, train_pred),
        'r2_test': r2_score(ytest, test_pred),
        'mae': mae(ytest, test_pred),
        'mse': mse(ytest, test_pred),
    }

    # The log-based metrics cannot take negative values, so negative
    # predictions are replaced with 0 before computing them.
    non_negative_pred = np.where(test_pred < 0, 0, test_pred)
    metrics['msle'] = msle(ytest, non_negative_pred)

    # Root versions are derived from the squared errors computed above.
    metrics['rmse'] = np.sqrt(metrics['mse'])
    metrics['rmsle'] = np.sqrt(metrics['msle'])

    return pd.Series({label: round(value, 4) for label, value in metrics.items()})

 

def make_models(xtrain, xtest, ytrain, ytest, n=300):
    """Fit a set of candidate regressors and tabulate their scores.

    The candidates are: linear regression (``model1``), an unrestricted
    decision tree (``model2``) plus depth-limited trees ``model2_3`` ..
    ``model2_8``, a random forest (``model3``) plus depth-limited forests
    ``model3_3`` .. ``model3_8``, and an XGBoost regressor (``model4``).

    Args:
        xtrain, xtest: feature sets for fitting and evaluation.
        ytrain, ytest: targets matching ``xtrain`` / ``xtest``.
        n: number of trees used by every random forest.

    Returns:
        pandas.DataFrame with one column per model and one row per metric
        produced by ``get_scores2``.
    """
    def fit_and_score(estimator):
        # ``fit`` returns the fitted estimator, which is then scored.
        fitted = estimator.fit(xtrain, ytrain)
        return get_scores2(fitted, xtrain, xtest, ytrain, ytest)

    scores = {}

    # Every model is scored with get_scores2 so all columns share one index.
    scores['model1'] = fit_and_score(LinearRegression())

    scores['model2'] = fit_and_score(DecisionTreeRegressor(random_state=0))
    for d in range(3, 9):
        scores[f'model2_{d}'] = fit_and_score(
            DecisionTreeRegressor(max_depth=d, random_state=0)
        )

    scores['model3'] = fit_and_score(RandomForestRegressor(n, random_state=0))
    for d in range(3, 9):
        # max_depth must be passed here; otherwise each iteration refits
        # the same unrestricted forest as 'model3'.
        scores[f'model3_{d}'] = fit_and_score(
            RandomForestRegressor(n, max_depth=d, random_state=0)
        )

    scores['model4'] = fit_and_score(XGBRegressor(objective='reg:squarederror'))

    # Return the table so the caller can compare the models.
    return pd.DataFrame(scores)

 

# xtrain, xtest, ytrain

# Load the training features, the submission features and the training target.
X_use = pd.read_csv('x_train.csv')
X_submission = pd.read_csv('x_test.csv')
Y = pd.read_csv('y_train.csv')

# Stack train and submission rows so both receive identical preprocessing.
dfX = pd.concat([X_use, X_submission], ignore_index=True, axis=0)
names = ['page_loads', 'first_time_visits', 'returning_visits']
# NOTE(review): this format expects dates without separators
# (e.g. 09142014) - confirm against the actual CSV contents.
dfX['date'] = pd.to_datetime(dfX['date'], format='%m%d%Y')
dfX2 = dfX.drop(columns='day')

# Split the parsed date into numeric year / month / day features.
date = pd.DataFrame()
temp = dfX2['date'].dt
date['year'] = temp.year
date['month'] = temp.month
date['day'] = temp.day
dfX3 = dfX2.drop(columns='date')
dfX3 = pd.concat([dfX3, date], axis=1)

# Keep only the columns used as model inputs.
Xfeatures = ['day_of_week', 'returning_visits', 'year', 'month', 'day']
dfX4 = dfX3[Xfeatures]

# Undo the stacking: the first len(X_use) rows are training data,
# the remainder are the rows to predict for the submission.
train_size = len(X_use)
XF = dfX4[:train_size]
X_submission = dfX4[train_size:]
YF = Y['unique_visits']

xtrain, xtest, ytrain, ytest = train_test_split(XF, YF, test_size=0.3, random_state=1234)

models = make_models(xtrain, xtest, ytrain, ytest)

# Fit the model used for the submission and report its scores.
model = XGBRegressor(objective='reg:squarederror').fit(xtrain, ytrain)
print(get_scores2(model, xtrain, xtest, ytrain, ytest))

pred = model.predict(X_submission)
# X_submission now holds only the Xfeatures columns, so the 'row'
# identifier is read from the stacked frame at the same index labels.
submission_rows = dfX.loc[X_submission.index, 'row']
submission = pd.DataFrame({'row': submission_rows, 'unique_visits': pred})
submission.to_csv('00000000.csv', index=False)

 

 

 

 

 

 

 

 

 

<출처>

인프런 - [EduAtoZ] 빅데이터분석기사 실기 대비 Part3. 웹사이트 방문자 예측 실습 - 시험보러 가서는 이렇게 하세요!

https://www.inflearn.com/course/%EB%B9%85%EB%8D%B0%EC%9D%B4%ED%84%B0-%EB%B6%84%EC%84%9D%EA%B8%B0%EC%82%AC-%EC%8B%A4%EA%B8%B0-%ED%8C%8C%EC%9D%B4%EC%8D%AC