본문 바로가기

R

[R] Regression - lm()를 사용한 linear regression

반응형

lm()을 사용한 linear regression의 큰 틀은 아래와 같다.

 

1. Data load

  - load()

  - read.csv()

2. Data preparation and cleaning

  - colSums(is.na())

  - str() / dim() / summary()

  - as.factor() / as.Date() / ... etc.

  - cut()

3. Model build

  - lm()

4. Predict

  - predict()

5. Model Evaluation

  - R^2

 

 

 

각 단계별 예제코드는 아래와 같다.

예제코드는 box_off_num을 목적변수로 하여 영화관객수를 예측하는 코드이다.

 

# Library

library(ROCR)
library(plyr)
library(dplyr)

library(lubridate)

 

 

# Data load

# Restore the objects stored in the .Rdata file into the workspace.
# NOTE(review): presumably this file provides movie_train and movie_test,
# which every later statement uses — confirm against the file contents.
load('kormovie.Rdata')
# Number of rows and columns of each data set.
dim(movie_test)
dim(movie_train)
# Column names, types and leading values.
str(movie_train)
str(movie_test)
# Per-column summary statistics.
summary(movie_train)
summary(movie_test)
# Count of missing values in every column; shows which columns need cleaning.
colSums(is.na(movie_train))
colSums(is.na(movie_test))

 

# Data preparation and cleaning

# Missing dir_prev_bfnum values are replaced with 0.
# NOTE(review): presumably NA means "no previous film by this director" —
# confirm that 0 is the right fill value.
movie_train$dir_prev_bfnum[is.na(movie_train$dir_prev_bfnum)] <- 0
movie_test$dir_prev_bfnum[is.na(movie_test$dir_prev_bfnum)] <- 0

# Mean-impute the two rating scores. The means are computed on the training
# set only and reused for the test set, so both sets go through the same
# preprocessing and no test-set statistic enters the pipeline.
naver_mean <- mean(movie_train$naver_score, na.rm = TRUE)
watcha_mean <- mean(movie_train$watcha_score, na.rm = TRUE)
movie_train$naver_score[is.na(movie_train$naver_score)] <- naver_mean
movie_test$naver_score[is.na(movie_test$naver_score)] <- naver_mean
movie_train$watcha_score[is.na(movie_train$watcha_score)] <- watcha_mean
movie_test$watcha_score[is.na(movie_test$watcha_score)] <- watcha_mean

# Drop the first two columns of each data set.
movie_train <- movie_train[, -(1:2)]
movie_test <- movie_test[, -(1:2)]
summary(movie_train)

str(movie_train$release_time)
# Parse the release date and keep only its month number (1-12).
movie_train$release_time <- month(
  as.Date(movie_train$release_time, "%Y-%m-%d")
)
movie_test$release_time <- month(
  as.Date(movie_test$release_time, "%Y-%m-%d")
)
head(movie_test$release_time)

# Bin the month into quarters: (0,3] -> 1, (3,6] -> 2, (6,9] -> 3, (9,12] -> 4.
# cut() already returns a factor, so no as.factor() call is needed afterwards.
quarter_breaks <- c(0, 3, 6, 9, 12)
quarter_labels <- c(1, 2, 3, 4)
movie_test$release_time <- cut(
  movie_test$release_time,
  breaks = quarter_breaks, right = TRUE, labels = quarter_labels
)
movie_train$release_time <- cut(
  movie_train$release_time,
  breaks = quarter_breaks, right = TRUE, labels = quarter_labels
)

# Treat the categorical columns as factors.
movie_train$genre <- as.factor(movie_train$genre)
movie_test$genre <- as.factor(movie_test$genre)
movie_train$screening_rat <- as.factor(movie_train$screening_rat)
movie_test$screening_rat <- as.factor(movie_test$screening_rat)

# genre_dd is 1 for documentary, drama, noir and horror films, 0 otherwise.
# %in% never yields NA, so a missing genre is coded 0 rather than NA.
dd_genres <- c("다큐멘터리", "드라마", "느와르", "공포")
movie_train$genre_dd <- ifelse(movie_train$genre %in% dd_genres, 1, 0)
movie_test$genre_dd <- ifelse(movie_test$genre %in% dd_genres, 1, 0)

 

# Building the model
# Linear regression of box_off_num on the prepared predictors. In an R
# formula `a * b` expands to both main effects plus their interaction, so
# each starred group below contributes its interaction terms as well.
m_model <- lm(
  box_off_num ~ genre_dd + release_time + screening_rat +
    dir_prev_bfnum * dir_prev_num +
    num_actor * num_staff * time +
    naver_score * watcha_score,
  data = movie_train
)
# Show the fitted coefficients, the training data structure, and the full
# regression summary.
m_model
str(movie_train)
summary(m_model)

# Predict
# Store the model's predictions as a new `pred` column on each data set.
movie_train$pred <- predict(object = m_model, newdata = movie_train)
movie_test$pred <- predict(object = m_model, newdata = movie_test)

 

#calculate R2
# Coefficient of determination: 1 - RSS / SStot.
#
# label:      numeric vector of observed values.
# estimation: numeric vector of predicted values; must have the same length
#             as `label`.
# Returns a single numeric value. It is 1 when every prediction equals its
# observation and 0 when the predictions do no better than mean(label).
# Stops with an error when the two vectors differ in length.
calcR2 <- function(label, estimation) {
  # Without this guard a shorter vector would be silently recycled and the
  # result would be meaningless.
  stopifnot(length(label) == length(estimation))
  rss <- sum((label - estimation)^2)
  ss_tot <- sum((label - mean(label))^2)
  1 - rss / ss_tot
}

# R^2 of the fitted values against the observed box_off_num on the training
# data.
calcR2(label = movie_train$box_off_num, estimation = movie_train$pred)
# Keep the test-set predictions in a standalone vector as well.
pred_boxoffice_test <- predict(object = m_model, newdata = movie_test)

728x90
반응형