오픈데이터 분석 실습 : Tidyverse 패키지
설명
- 6개의 핵심 패키지 포함 23개의 패키지로 이뤄진 메타 패키지
- ggplot2
- dplyr
- tidyr
- readr
- purrr
- tibble
- stringr
- forcats
패키지 불러오기
#install.packages("tidyverse")
library(tidyverse)
#> ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
#> ✔ dplyr 1.1.0 ✔ readr 2.1.4
#> ✔ forcats 1.0.0 ✔ stringr 1.5.0
#> ✔ ggplot2 3.4.1 ✔ tibble 3.1.8
#> ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
#> ✔ purrr 1.0.1
#> ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
#> ✖ dplyr::filter() masks stats::filter()
#> ✖ dplyr::lag() masks stats::lag()
#> ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
tidyverse 실습
- 항공편 데이터 nycflights13
#install.packages("nycflights13")
library(nycflights13)
파이프 연산자를 통한 코드 직관화 예시
- 순차적, 간결함
rnorm(10000, mean=10, sd=1) %>% # 평균10, 표준편차1인 정규분포에서 10000개 난수 생성
sample(size = 100, replace = F) %>% # 샘플 100개를 랜덤으로 비복원 추출
log() %>% # 로그 함수 적용
diff() %>% # 차분 계산
plot(col="red", type="l") # 그래프 그리기
행의 수
%>% nrow()
flights #> [1] 336776
앞부분 훑어보기
%>% head()
flights #> # A tibble: 6 × 19
#> year month day dep_time sched_dep…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
#> <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr>
#> 1 2013 1 1 517 515 2 830 819 11 UA
#> 2 2013 1 1 533 529 4 850 830 20 UA
#> 3 2013 1 1 542 540 2 923 850 33 AA
#> 4 2013 1 1 544 545 -1 1004 1022 -18 B6
#> 5 2013 1 1 554 600 -6 812 837 -25 DL
#> 6 2013 1 1 554 558 -4 740 728 12 UA
#> # … with 9 more variables: flight <int>, tailnum <chr>, origin <chr>,
#> # dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
#> # time_hour <dttm>, and abbreviated variable names ¹sched_dep_time,
#> # ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay
그룹화 + 집계값 요약
### 날짜별 평균 출발지연 시간
<- flights %>%
mean_delay_by_day group_by(year, month, day) %>%
summarise(delay = mean(dep_delay, na.rm = T))
#> `summarise()` has grouped output by 'year', 'month'. You can override using the
#> `.groups` argument.
mean_delay_by_day#> # A tibble: 365 × 4
#> # Groups: year, month [12]
#> year month day delay
#> <int> <int> <int> <dbl>
#> 1 2013 1 1 11.5
#> 2 2013 1 2 13.9
#> 3 2013 1 3 11.0
#> 4 2013 1 4 8.95
#> 5 2013 1 5 5.73
#> 6 2013 1 6 7.15
#> 7 2013 1 7 5.42
#> 8 2013 1 8 2.55
#> 9 2013 1 9 2.28
#> 10 2013 1 10 2.84
#> # … with 355 more rows
select()
- 컬럼 선택 (순서 지정 가능)
### p.s 컬럼이름 되도록 띄어쓰기 사용×
%>% select(year,month,day)
flights #> # A tibble: 336,776 × 3
#> year month day
#> <int> <int> <int>
#> 1 2013 1 1
#> 2 2013 1 1
#> 3 2013 1 1
#> 4 2013 1 1
#> 5 2013 1 1
#> 6 2013 1 1
#> 7 2013 1 1
#> 8 2013 1 1
#> 9 2013 1 1
#> 10 2013 1 1
#> # … with 336,766 more rows
컬럼 선택
### year ~ day, delay로 끝나는 컬럼 외 2개 컬럼
<- flights %>%
flights_sample select(year:day, ends_with("delay"), distance, air_time)
flights_sample#> # A tibble: 336,776 × 7
#> year month day dep_delay arr_delay distance air_time
#> <int> <int> <int> <dbl> <dbl> <dbl> <dbl>
#> 1 2013 1 1 2 11 1400 227
#> 2 2013 1 1 4 20 1416 227
#> 3 2013 1 1 2 33 1089 160
#> 4 2013 1 1 -1 -18 1576 183
#> 5 2013 1 1 -6 -25 762 116
#> 6 2013 1 1 -4 12 719 150
#> 7 2013 1 1 -5 19 1065 158
#> 8 2013 1 1 -3 -14 229 53
#> 9 2013 1 1 -3 -8 944 140
#> 10 2013 1 1 -2 8 733 138
#> # … with 336,766 more rows
mutate()
- 새로운 컬럼 생성
%>%
flights_sample mutate(net_delay = arr_delay - dep_delay,
speed = distance/air_time*60)
#> # A tibble: 336,776 × 9
#> year month day dep_delay arr_delay distance air_time net_delay speed
#> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 2013 1 1 2 11 1400 227 9 370.
#> 2 2013 1 1 4 20 1416 227 16 374.
#> 3 2013 1 1 2 33 1089 160 31 408.
#> 4 2013 1 1 -1 -18 1576 183 -17 517.
#> 5 2013 1 1 -6 -25 762 116 -19 394.
#> 6 2013 1 1 -4 12 719 150 16 288.
#> 7 2013 1 1 -5 19 1065 158 24 404.
#> 8 2013 1 1 -3 -14 229 53 -11 259.
#> 9 2013 1 1 -3 -8 944 140 -5 405.
#> 10 2013 1 1 -2 8 733 138 10 319.
#> # … with 336,766 more rows
filter()
- 필터링 (조건 설정)
%>% filter(month==1)
flights #> # A tibble: 27,004 × 19
#> year month day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
#> <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr>
#> 1 2013 1 1 517 515 2 830 819 11 UA
#> 2 2013 1 1 533 529 4 850 830 20 UA
#> 3 2013 1 1 542 540 2 923 850 33 AA
#> 4 2013 1 1 544 545 -1 1004 1022 -18 B6
#> 5 2013 1 1 554 600 -6 812 837 -25 DL
#> 6 2013 1 1 554 558 -4 740 728 12 UA
#> 7 2013 1 1 555 600 -5 913 854 19 B6
#> 8 2013 1 1 557 600 -3 709 723 -14 EV
#> 9 2013 1 1 557 600 -3 838 846 -8 B6
#> 10 2013 1 1 558 600 -2 753 745 8 AA
#> # … with 26,994 more rows, 9 more variables: flight <int>, tailnum <chr>,
#> # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
#> # minute <dbl>, time_hour <dttm>, and abbreviated variable names
#> # ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay
bind_rows()
- 두 데이터 묶기
<- flights %>% filter(month==1)
jan <- flights %>% filter(month==2)
feb <- bind_rows(jan,feb)
janfeb
%>% head()
janfeb #> # A tibble: 6 × 19
#> year month day dep_time sched_dep…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
#> <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr>
#> 1 2013 1 1 517 515 2 830 819 11 UA
#> 2 2013 1 1 533 529 4 850 830 20 UA
#> 3 2013 1 1 542 540 2 923 850 33 AA
#> 4 2013 1 1 544 545 -1 1004 1022 -18 B6
#> 5 2013 1 1 554 600 -6 812 837 -25 DL
#> 6 2013 1 1 554 558 -4 740 728 12 UA
#> # … with 9 more variables: flight <int>, tailnum <chr>, origin <chr>,
#> # dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
#> # time_hour <dttm>, and abbreviated variable names ¹sched_dep_time,
#> # ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay
%>% tail()
janfeb #> # A tibble: 6 × 19
#> year month day dep_time sched_dep…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
#> <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr>
#> 1 2013 2 28 NA 605 NA NA 805 NA MQ
#> 2 2013 2 28 NA 850 NA NA 1035 NA MQ
#> 3 2013 2 28 NA 905 NA NA 1115 NA MQ
#> 4 2013 2 28 NA 1115 NA NA 1310 NA MQ
#> 5 2013 2 28 NA 830 NA NA 1205 NA UA
#> 6 2013 2 28 NA 840 NA NA 1147 NA UA
#> # … with 9 more variables: flight <int>, tailnum <chr>, origin <chr>,
#> # dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
#> # time_hour <dttm>, and abbreviated variable names ¹sched_dep_time,
#> # ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay
### filter로 간결하게 사용
%>% filter(month %in% c(1,2))
flights #> # A tibble: 51,955 × 19
#> year month day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
#> <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr>
#> 1 2013 1 1 517 515 2 830 819 11 UA
#> 2 2013 1 1 533 529 4 850 830 20 UA
#> 3 2013 1 1 542 540 2 923 850 33 AA
#> 4 2013 1 1 544 545 -1 1004 1022 -18 B6
#> 5 2013 1 1 554 600 -6 812 837 -25 DL
#> 6 2013 1 1 554 558 -4 740 728 12 UA
#> 7 2013 1 1 555 600 -5 913 854 19 B6
#> 8 2013 1 1 557 600 -3 709 723 -14 EV
#> 9 2013 1 1 557 600 -3 838 846 -8 B6
#> 10 2013 1 1 558 600 -2 753 745 8 AA
#> # … with 51,945 more rows, 9 more variables: flight <int>, tailnum <chr>,
#> # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
#> # minute <dbl>, time_hour <dttm>, and abbreviated variable names
#> # ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay
arrange()
- 데이터 정렬 (오름/내림차순)
%>% arrange(dep_delay)
flights #> # A tibble: 336,776 × 19
#> year month day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
#> <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr>
#> 1 2013 12 7 2040 2123 -43 40 2352 48 B6
#> 2 2013 2 3 2022 2055 -33 2240 2338 -58 DL
#> 3 2013 11 10 1408 1440 -32 1549 1559 -10 EV
#> 4 2013 1 11 1900 1930 -30 2233 2243 -10 DL
#> 5 2013 1 29 1703 1730 -27 1947 1957 -10 F9
#> 6 2013 8 9 729 755 -26 1002 955 7 MQ
#> 7 2013 10 23 1907 1932 -25 2143 2143 0 EV
#> 8 2013 3 30 2030 2055 -25 2213 2250 -37 MQ
#> 9 2013 3 2 1431 1455 -24 1601 1631 -30 9E
#> 10 2013 5 5 934 958 -24 1225 1309 -44 B6
#> # … with 336,766 more rows, 9 more variables: flight <int>, tailnum <chr>,
#> # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
#> # minute <dbl>, time_hour <dttm>, and abbreviated variable names
#> # ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay
%>% arrange(-dep_delay)
flights #> # A tibble: 336,776 × 19
#> year month day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
#> <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr>
#> 1 2013 1 9 641 900 1301 1242 1530 1272 HA
#> 2 2013 6 15 1432 1935 1137 1607 2120 1127 MQ
#> 3 2013 1 10 1121 1635 1126 1239 1810 1109 MQ
#> 4 2013 9 20 1139 1845 1014 1457 2210 1007 AA
#> 5 2013 7 22 845 1600 1005 1044 1815 989 MQ
#> 6 2013 4 10 1100 1900 960 1342 2211 931 DL
#> 7 2013 3 17 2321 810 911 135 1020 915 DL
#> 8 2013 6 27 959 1900 899 1236 2226 850 DL
#> 9 2013 7 22 2257 759 898 121 1026 895 DL
#> 10 2013 12 5 756 1700 896 1058 2020 878 AA
#> # … with 336,766 more rows, 9 more variables: flight <int>, tailnum <chr>,
#> # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
#> # minute <dbl>, time_hour <dttm>, and abbreviated variable names
#> # ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay
그룹화 + 집계값 요약
- 월별 평균 지연시간, 개수
%>%
flights group_by(month) %>%
summarise(mean_dep_delay = mean(dep_delay, na.rm = T),
count = n())
#> # A tibble: 12 × 3
#> month mean_dep_delay count
#> <int> <dbl> <int>
#> 1 1 10.0 27004
#> 2 2 10.8 24951
#> 3 3 13.2 28834
#> 4 4 13.9 28330
#> 5 5 13.0 28796
#> 6 6 20.8 28243
#> 7 7 21.7 29425
#> 8 8 12.6 29327
#> 9 9 6.72 27574
#> 10 10 6.24 28889
#> 11 11 5.44 27268
#> 12 12 16.6 28135
left_join
### 컬럼 선택
<- flights %>%
flights_sample2 select(year:day, origin, carrier)
flights_sample2#> # A tibble: 336,776 × 5
#> year month day origin carrier
#> <int> <int> <int> <chr> <chr>
#> 1 2013 1 1 EWR UA
#> 2 2013 1 1 LGA UA
#> 3 2013 1 1 JFK AA
#> 4 2013 1 1 JFK B6
#> 5 2013 1 1 LGA DL
#> 6 2013 1 1 EWR UA
#> 7 2013 1 1 EWR B6
#> 8 2013 1 1 LGA EV
#> 9 2013 1 1 JFK B6
#> 10 2013 1 1 LGA AA
#> # … with 336,766 more rows
### join 시킬 데이터셋 - 'airlines'
airlines#> # A tibble: 16 × 2
#> carrier name
#> <chr> <chr>
#> 1 9E Endeavor Air Inc.
#> 2 AA American Airlines Inc.
#> 3 AS Alaska Airlines Inc.
#> 4 B6 JetBlue Airways
#> 5 DL Delta Air Lines Inc.
#> 6 EV ExpressJet Airlines Inc.
#> 7 F9 Frontier Airlines Inc.
#> 8 FL AirTran Airways Corporation
#> 9 HA Hawaiian Airlines Inc.
#> 10 MQ Envoy Air
#> 11 OO SkyWest Airlines Inc.
#> 12 UA United Air Lines Inc.
#> 13 US US Airways Inc.
#> 14 VX Virgin America
#> 15 WN Southwest Airlines Co.
#> 16 YV Mesa Airlines Inc.
### key값 기준으로 왼쪽에 join
%>%
flights_sample2 left_join(airlines, by="carrier")
#> # A tibble: 336,776 × 6
#> year month day origin carrier name
#> <int> <int> <int> <chr> <chr> <chr>
#> 1 2013 1 1 EWR UA United Air Lines Inc.
#> 2 2013 1 1 LGA UA United Air Lines Inc.
#> 3 2013 1 1 JFK AA American Airlines Inc.
#> 4 2013 1 1 JFK B6 JetBlue Airways
#> 5 2013 1 1 LGA DL Delta Air Lines Inc.
#> 6 2013 1 1 EWR UA United Air Lines Inc.
#> 7 2013 1 1 EWR B6 JetBlue Airways
#> 8 2013 1 1 LGA EV ExpressJet Airlines Inc.
#> 9 2013 1 1 JFK B6 JetBlue Airways
#> 10 2013 1 1 LGA AA American Airlines Inc.
#> # … with 336,766 more rows
### key값 컬럼 이름이 다를 때
colnames(airlines)[1] <- 'different'
%>%
flights_sample2 left_join(airlines, by=c("carrier" = "different"))
#> # A tibble: 336,776 × 6
#> year month day origin carrier name
#> <int> <int> <int> <chr> <chr> <chr>
#> 1 2013 1 1 EWR UA United Air Lines Inc.
#> 2 2013 1 1 LGA UA United Air Lines Inc.
#> 3 2013 1 1 JFK AA American Airlines Inc.
#> 4 2013 1 1 JFK B6 JetBlue Airways
#> 5 2013 1 1 LGA DL Delta Air Lines Inc.
#> 6 2013 1 1 EWR UA United Air Lines Inc.
#> 7 2013 1 1 EWR B6 JetBlue Airways
#> 8 2013 1 1 LGA EV ExpressJet Airlines Inc.
#> 9 2013 1 1 JFK B6 JetBlue Airways
#> 10 2013 1 1 LGA AA American Airlines Inc.
#> # … with 336,766 more rows
Quiz
- 월 마다 가장 연착이 긴 시간 톺아보기
- 월 마다 가장 연착이 많이된 비행기는 해당월의 평균보다 몇배가 높나?
### 1
%>%
flights group_by(month) %>%
summarise(max_dep_delay = max(dep_delay, na.rm = T)) %>%
arrange(-max_dep_delay)
#> # A tibble: 12 × 2
#> month max_dep_delay
#> <int> <dbl>
#> 1 1 1301
#> 2 6 1137
#> 3 9 1014
#> 4 7 1005
#> 5 4 960
#> 6 3 911
#> 7 12 896
#> 8 5 878
#> 9 2 853
#> 10 11 798
#> 11 10 702
#> 12 8 520
### 2
%>%
flights group_by(month) %>%
summarise(mean_dep_delay = mean(dep_delay, na.rm = T),
max_dep_delay = max(dep_delay, na.rm = T),
compare = max_dep_delay/mean_dep_delay)
#> # A tibble: 12 × 4
#> month mean_dep_delay max_dep_delay compare
#> <int> <dbl> <dbl> <dbl>
#> 1 1 10.0 1301 130.
#> 2 2 10.8 853 78.9
#> 3 3 13.2 911 68.9
#> 4 4 13.9 960 68.9
#> 5 5 13.0 878 67.6
#> 6 6 20.8 1137 54.5
#> 7 7 21.7 1005 46.3
#> 8 8 12.6 520 41.2
#> 9 9 6.72 1014 151.
#> 10 10 6.24 702 112.
#> 11 11 5.44 798 147.
#> 12 12 16.6 896 54.1