Notice
Recent Posts
Recent Comments
Link
«   2025/06   »
1 2 3 4 5 6 7
8 9 10 11 12 13 14
15 16 17 18 19 20 21
22 23 24 25 26 27 28
29 30
Archives
Today
Total
관리 메뉴

Ubermensch

EDA 본문

카테고리 없음

EDA

now.ubermensch 2019. 7. 15. 11:21

# Numeric variable overview and distribution of OVERAGE ----

# Summary statistics for every numeric column of df.
# The column mask is built with vapply() instead of sapply(): it is always a
# logical vector (even for a data frame with no columns), so the subsetting
# can never receive a list-typed index.
sapply(df[vapply(df, is.numeric, logical(1))], summary)

# Distribution of OVERAGE: histogram (20 breaks), box plot, kernel density.
hist(df$OVERAGE, breaks = 20)
boxplot(df$OVERAGE)
plot(density(df$OVERAGE))

# OVERAGE
# Summary statistics recorded from the author's run:
#   Min. 1st Qu.  Median    Mean  3rd Qu.    Max.
#  -2.00    0.00   59.00   86.01  179.00  335.00
# 1. There are three main inflection points (around 0, 50 and 200).

# STEP2-2. Numeric variable distribution charts: $LEFTOVER ----
# Histogram (20 breaks), box plot and kernel density estimate of LEFTOVER.
hist(x = df$LEFTOVER, breaks = 20)
boxplot(x = df$LEFTOVER)
plot(x = density(x = df$LEFTOVER))

# Summary statistics recorded from the author's run:
#   Min. 1st Qu.  Median    Mean  3rd Qu.    Max.
#    0.0     0.0    14.5    24.0    42.0    89.0
# LEFTOVER: past roughly the 25% mark the data drops off sharply and then
# stays flat.

 

# Remove missing values: drop every row of df that contains at least one NA.
df <- na.omit(df)

# Per-column NA counts; all zeros confirm the removal worked (the original
# note says the missing values were in HOUSE).
colSums(is.na(df))

# Correlations among the independent (numeric) variables.
# List-style indexing with a vapply() mask always yields a data frame;
# `df[, mask]` would collapse to a plain vector when only one numeric column
# exists, which makes cor() fail.
cor(df[vapply(df, is.numeric, logical(1))])

# Highly correlated variable pairs ----
# Each pair is shown as a scatter plot. The pair labels below are comments:
# written as bare `- A & B` lines they would be evaluated as R expressions
# and stop the script with an "object not found" error.
# NOTE(review): qplot() comes from ggplot2, which must already be attached
# (no library() call is visible in this part of the file). qplot() is
# deprecated in ggplot2 >= 3.4.0 -- consider migrating to ggplot() + geom_point().

# - HANDSET_PRICE & INCOME
qplot(HANDSET_PRICE, INCOME, data = df)

# - OVER_15MINS_CALLS_PER_MONTH & OVERAGE
qplot(OVER_15MINS_CALLS_PER_MONTH, OVERAGE, data = df)

# - AVERAGE_CALL_DURATION & LEFTOVER
qplot(AVERAGE_CALL_DURATION, LEFTOVER, data = df)
summary(df$AVERAGE_CALL_DURATION)

# Relationship with the dependent variable ----
# The dependent variable (CHURN) is categorical, so continuous predictors are
# checked with logistic regression and categorical predictors with a
# chi-squared test.

# Continuous predictors: one single-predictor binomial GLM per variable.
a <- glm(formula = CHURN ~ OVERAGE, family = binomial, data = df)
summary(a)  # Recorded p-value (< 2e-16): significant.

b <- glm(formula = CHURN ~ LEFTOVER, family = binomial, data = df)
summary(b)  # Recorded p-value (8.99e-16): significant.

c <- glm(formula = CHURN ~ HOUSE, family = binomial, data = df)
summary(c)  # Recorded p-value (< 2e-16): significant.

# Categorical predictors: chi-squared test of each variable against CHURN.
# The p-values quoted below were recorded from the author's run.

# p-value = 0.1472: not significant.
chisq.test(x = df$REPORTED_SATISFACTION, y = df$CHURN)

# p-value = 0.9249: not significant.
chisq.test(x = df$REPORTED_USAGE_LEVEL, y = df$CHURN)

# p-value = 0.02561: borderline (below 0.05 but above 0.01).
chisq.test(x = df$COLLEGE, y = df$CHURN)

# p-value = 0.2467: the original note calls this borderline, but it is well
# above 0.05, i.e. not significant at the usual level.
chisq.test(x = df$CONSIDERING_CHANGE_OF_PLAN, y = df$CHURN)