오픈데이터 분석 실습 : Tidyverse 패키지

설명

  • 6개의 핵심 패키지 포함 23개의 패키지로 이뤄진 메타 패키지
    • ggplot2
    • dplyr
    • tidyr
    • readr
    • purrr
    • tibble
    • stringr
    • forcats

패키지 불러오기

#install.packages("tidyverse")
library(tidyverse)
#> ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
#> ✔ dplyr     1.1.0     ✔ readr     2.1.4
#> ✔ forcats   1.0.0     ✔ stringr   1.5.0
#> ✔ ggplot2   3.4.1     ✔ tibble    3.1.8
#> ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
#> ✔ purrr     1.0.1     
#> ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
#> ✖ dplyr::filter() masks stats::filter()
#> ✖ dplyr::lag()    masks stats::lag()
#> ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors

tidyverse 실습

#install.packages("nycflights13")
library(nycflights13)

파이프 연산자를 통한 코드 직관화 예시

  • 순차적, 간결함
rnorm(10000, mean=10, sd=1) %>%        # 평균10, 표준편차1인 정규분포에서 10000개 난수 생성
  sample(size = 100, replace = F) %>%  # 샘플 100개를 랜덤으로 비복원 추출
  log() %>%                            # 로그 함수 적용
  diff() %>%                           # 차분 계산 
  plot(col="red", type="l")            # 그래프 그리기

행의 수

flights %>% nrow()
#> [1] 336776

앞부분 훑어보기

flights %>% head()
#> # A tibble: 6 × 19
#>    year month   day dep_time sched_dep…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
#>   <int> <int> <int>    <int>       <int>   <dbl>   <int>   <int>   <dbl> <chr>  
#> 1  2013     1     1      517         515       2     830     819      11 UA     
#> 2  2013     1     1      533         529       4     850     830      20 UA     
#> 3  2013     1     1      542         540       2     923     850      33 AA     
#> 4  2013     1     1      544         545      -1    1004    1022     -18 B6     
#> 5  2013     1     1      554         600      -6     812     837     -25 DL     
#> 6  2013     1     1      554         558      -4     740     728      12 UA     
#> # … with 9 more variables: flight <int>, tailnum <chr>, origin <chr>,
#> #   dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
#> #   time_hour <dttm>, and abbreviated variable names ¹​sched_dep_time,
#> #   ²​dep_delay, ³​arr_time, ⁴​sched_arr_time, ⁵​arr_delay

그룹화 + 집계값 요약

### 날짜별 평균 출발지연 시간
mean_delay_by_day <- flights %>% 
  group_by(year, month, day) %>% 
  summarise(delay = mean(dep_delay, na.rm = T))
#> `summarise()` has grouped output by 'year', 'month'. You can override using the
#> `.groups` argument.

mean_delay_by_day
#> # A tibble: 365 × 4
#> # Groups:   year, month [12]
#>     year month   day delay
#>    <int> <int> <int> <dbl>
#>  1  2013     1     1 11.5 
#>  2  2013     1     2 13.9 
#>  3  2013     1     3 11.0 
#>  4  2013     1     4  8.95
#>  5  2013     1     5  5.73
#>  6  2013     1     6  7.15
#>  7  2013     1     7  5.42
#>  8  2013     1     8  2.55
#>  9  2013     1     9  2.28
#> 10  2013     1    10  2.84
#> # … with 355 more rows

select()

  • 컬럼 선택 (순서 지정 가능)
### p.s 컬럼이름 되도록 띄어쓰기 사용×
flights %>% select(year,month,day)
#> # A tibble: 336,776 × 3
#>     year month   day
#>    <int> <int> <int>
#>  1  2013     1     1
#>  2  2013     1     1
#>  3  2013     1     1
#>  4  2013     1     1
#>  5  2013     1     1
#>  6  2013     1     1
#>  7  2013     1     1
#>  8  2013     1     1
#>  9  2013     1     1
#> 10  2013     1     1
#> # … with 336,766 more rows

컬럼 선택

### year ~ day, delay로 끝나는 컬럼 외 2개 컬럼
flights_sample <- flights %>% 
  select(year:day, ends_with("delay"), distance, air_time)

flights_sample
#> # A tibble: 336,776 × 7
#>     year month   day dep_delay arr_delay distance air_time
#>    <int> <int> <int>     <dbl>     <dbl>    <dbl>    <dbl>
#>  1  2013     1     1         2        11     1400      227
#>  2  2013     1     1         4        20     1416      227
#>  3  2013     1     1         2        33     1089      160
#>  4  2013     1     1        -1       -18     1576      183
#>  5  2013     1     1        -6       -25      762      116
#>  6  2013     1     1        -4        12      719      150
#>  7  2013     1     1        -5        19     1065      158
#>  8  2013     1     1        -3       -14      229       53
#>  9  2013     1     1        -3        -8      944      140
#> 10  2013     1     1        -2         8      733      138
#> # … with 336,766 more rows

mutate()

  • 새로운 컬럼 생성
flights_sample %>% 
  mutate(net_delay = arr_delay - dep_delay,
         speed = distance/air_time*60)
#> # A tibble: 336,776 × 9
#>     year month   day dep_delay arr_delay distance air_time net_delay speed
#>    <int> <int> <int>     <dbl>     <dbl>    <dbl>    <dbl>     <dbl> <dbl>
#>  1  2013     1     1         2        11     1400      227         9  370.
#>  2  2013     1     1         4        20     1416      227        16  374.
#>  3  2013     1     1         2        33     1089      160        31  408.
#>  4  2013     1     1        -1       -18     1576      183       -17  517.
#>  5  2013     1     1        -6       -25      762      116       -19  394.
#>  6  2013     1     1        -4        12      719      150        16  288.
#>  7  2013     1     1        -5        19     1065      158        24  404.
#>  8  2013     1     1        -3       -14      229       53       -11  259.
#>  9  2013     1     1        -3        -8      944      140        -5  405.
#> 10  2013     1     1        -2         8      733      138        10  319.
#> # … with 336,766 more rows

filter()

  • 필터링 (조건 설정)
flights %>% filter(month==1)
#> # A tibble: 27,004 × 19
#>     year month   day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
#>    <int> <int> <int>    <int>      <int>   <dbl>   <int>   <int>   <dbl> <chr>  
#>  1  2013     1     1      517        515       2     830     819      11 UA     
#>  2  2013     1     1      533        529       4     850     830      20 UA     
#>  3  2013     1     1      542        540       2     923     850      33 AA     
#>  4  2013     1     1      544        545      -1    1004    1022     -18 B6     
#>  5  2013     1     1      554        600      -6     812     837     -25 DL     
#>  6  2013     1     1      554        558      -4     740     728      12 UA     
#>  7  2013     1     1      555        600      -5     913     854      19 B6     
#>  8  2013     1     1      557        600      -3     709     723     -14 EV     
#>  9  2013     1     1      557        600      -3     838     846      -8 B6     
#> 10  2013     1     1      558        600      -2     753     745       8 AA     
#> # … with 26,994 more rows, 9 more variables: flight <int>, tailnum <chr>,
#> #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
#> #   minute <dbl>, time_hour <dttm>, and abbreviated variable names
#> #   ¹​sched_dep_time, ²​dep_delay, ³​arr_time, ⁴​sched_arr_time, ⁵​arr_delay

bind_rows()

  • 두 데이터 묶기
jan <- flights %>% filter(month==1)
feb <- flights %>% filter(month==2)
janfeb <- bind_rows(jan,feb)

janfeb %>% head()
#> # A tibble: 6 × 19
#>    year month   day dep_time sched_dep…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
#>   <int> <int> <int>    <int>       <int>   <dbl>   <int>   <int>   <dbl> <chr>  
#> 1  2013     1     1      517         515       2     830     819      11 UA     
#> 2  2013     1     1      533         529       4     850     830      20 UA     
#> 3  2013     1     1      542         540       2     923     850      33 AA     
#> 4  2013     1     1      544         545      -1    1004    1022     -18 B6     
#> 5  2013     1     1      554         600      -6     812     837     -25 DL     
#> 6  2013     1     1      554         558      -4     740     728      12 UA     
#> # … with 9 more variables: flight <int>, tailnum <chr>, origin <chr>,
#> #   dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
#> #   time_hour <dttm>, and abbreviated variable names ¹​sched_dep_time,
#> #   ²​dep_delay, ³​arr_time, ⁴​sched_arr_time, ⁵​arr_delay

janfeb %>% tail()
#> # A tibble: 6 × 19
#>    year month   day dep_time sched_dep…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
#>   <int> <int> <int>    <int>       <int>   <dbl>   <int>   <int>   <dbl> <chr>  
#> 1  2013     2    28       NA         605      NA      NA     805      NA MQ     
#> 2  2013     2    28       NA         850      NA      NA    1035      NA MQ     
#> 3  2013     2    28       NA         905      NA      NA    1115      NA MQ     
#> 4  2013     2    28       NA        1115      NA      NA    1310      NA MQ     
#> 5  2013     2    28       NA         830      NA      NA    1205      NA UA     
#> 6  2013     2    28       NA         840      NA      NA    1147      NA UA     
#> # … with 9 more variables: flight <int>, tailnum <chr>, origin <chr>,
#> #   dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
#> #   time_hour <dttm>, and abbreviated variable names ¹​sched_dep_time,
#> #   ²​dep_delay, ³​arr_time, ⁴​sched_arr_time, ⁵​arr_delay

### filter로 간결하게 사용
flights %>% filter(month %in% c(1,2))
#> # A tibble: 51,955 × 19
#>     year month   day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
#>    <int> <int> <int>    <int>      <int>   <dbl>   <int>   <int>   <dbl> <chr>  
#>  1  2013     1     1      517        515       2     830     819      11 UA     
#>  2  2013     1     1      533        529       4     850     830      20 UA     
#>  3  2013     1     1      542        540       2     923     850      33 AA     
#>  4  2013     1     1      544        545      -1    1004    1022     -18 B6     
#>  5  2013     1     1      554        600      -6     812     837     -25 DL     
#>  6  2013     1     1      554        558      -4     740     728      12 UA     
#>  7  2013     1     1      555        600      -5     913     854      19 B6     
#>  8  2013     1     1      557        600      -3     709     723     -14 EV     
#>  9  2013     1     1      557        600      -3     838     846      -8 B6     
#> 10  2013     1     1      558        600      -2     753     745       8 AA     
#> # … with 51,945 more rows, 9 more variables: flight <int>, tailnum <chr>,
#> #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
#> #   minute <dbl>, time_hour <dttm>, and abbreviated variable names
#> #   ¹​sched_dep_time, ²​dep_delay, ³​arr_time, ⁴​sched_arr_time, ⁵​arr_delay

arrange()

  • 데이터 정렬 (오름/내림차순)
flights %>% arrange(dep_delay)
#> # A tibble: 336,776 × 19
#>     year month   day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
#>    <int> <int> <int>    <int>      <int>   <dbl>   <int>   <int>   <dbl> <chr>  
#>  1  2013    12     7     2040       2123     -43      40    2352      48 B6     
#>  2  2013     2     3     2022       2055     -33    2240    2338     -58 DL     
#>  3  2013    11    10     1408       1440     -32    1549    1559     -10 EV     
#>  4  2013     1    11     1900       1930     -30    2233    2243     -10 DL     
#>  5  2013     1    29     1703       1730     -27    1947    1957     -10 F9     
#>  6  2013     8     9      729        755     -26    1002     955       7 MQ     
#>  7  2013    10    23     1907       1932     -25    2143    2143       0 EV     
#>  8  2013     3    30     2030       2055     -25    2213    2250     -37 MQ     
#>  9  2013     3     2     1431       1455     -24    1601    1631     -30 9E     
#> 10  2013     5     5      934        958     -24    1225    1309     -44 B6     
#> # … with 336,766 more rows, 9 more variables: flight <int>, tailnum <chr>,
#> #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
#> #   minute <dbl>, time_hour <dttm>, and abbreviated variable names
#> #   ¹​sched_dep_time, ²​dep_delay, ³​arr_time, ⁴​sched_arr_time, ⁵​arr_delay

flights %>% arrange(-dep_delay)
#> # A tibble: 336,776 × 19
#>     year month   day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
#>    <int> <int> <int>    <int>      <int>   <dbl>   <int>   <int>   <dbl> <chr>  
#>  1  2013     1     9      641        900    1301    1242    1530    1272 HA     
#>  2  2013     6    15     1432       1935    1137    1607    2120    1127 MQ     
#>  3  2013     1    10     1121       1635    1126    1239    1810    1109 MQ     
#>  4  2013     9    20     1139       1845    1014    1457    2210    1007 AA     
#>  5  2013     7    22      845       1600    1005    1044    1815     989 MQ     
#>  6  2013     4    10     1100       1900     960    1342    2211     931 DL     
#>  7  2013     3    17     2321        810     911     135    1020     915 DL     
#>  8  2013     6    27      959       1900     899    1236    2226     850 DL     
#>  9  2013     7    22     2257        759     898     121    1026     895 DL     
#> 10  2013    12     5      756       1700     896    1058    2020     878 AA     
#> # … with 336,766 more rows, 9 more variables: flight <int>, tailnum <chr>,
#> #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
#> #   minute <dbl>, time_hour <dttm>, and abbreviated variable names
#> #   ¹​sched_dep_time, ²​dep_delay, ³​arr_time, ⁴​sched_arr_time, ⁵​arr_delay

그룹화 + 집계값 요약

  • 월별 평균 지연시간, 개수
flights %>% 
  group_by(month) %>% 
  summarise(mean_dep_delay = mean(dep_delay, na.rm = T),
            count = n())
#> # A tibble: 12 × 3
#>    month mean_dep_delay count
#>    <int>          <dbl> <int>
#>  1     1          10.0  27004
#>  2     2          10.8  24951
#>  3     3          13.2  28834
#>  4     4          13.9  28330
#>  5     5          13.0  28796
#>  6     6          20.8  28243
#>  7     7          21.7  29425
#>  8     8          12.6  29327
#>  9     9           6.72 27574
#> 10    10           6.24 28889
#> 11    11           5.44 27268
#> 12    12          16.6  28135

left_join

### 컬럼 선택
flights_sample2 <- flights %>% 
  select(year:day, origin, carrier)

flights_sample2
#> # A tibble: 336,776 × 5
#>     year month   day origin carrier
#>    <int> <int> <int> <chr>  <chr>  
#>  1  2013     1     1 EWR    UA     
#>  2  2013     1     1 LGA    UA     
#>  3  2013     1     1 JFK    AA     
#>  4  2013     1     1 JFK    B6     
#>  5  2013     1     1 LGA    DL     
#>  6  2013     1     1 EWR    UA     
#>  7  2013     1     1 EWR    B6     
#>  8  2013     1     1 LGA    EV     
#>  9  2013     1     1 JFK    B6     
#> 10  2013     1     1 LGA    AA     
#> # … with 336,766 more rows

### join 시킬 데이터셋 - 'airlines'
airlines
#> # A tibble: 16 × 2
#>    carrier name                       
#>    <chr>   <chr>                      
#>  1 9E      Endeavor Air Inc.          
#>  2 AA      American Airlines Inc.     
#>  3 AS      Alaska Airlines Inc.       
#>  4 B6      JetBlue Airways            
#>  5 DL      Delta Air Lines Inc.       
#>  6 EV      ExpressJet Airlines Inc.   
#>  7 F9      Frontier Airlines Inc.     
#>  8 FL      AirTran Airways Corporation
#>  9 HA      Hawaiian Airlines Inc.     
#> 10 MQ      Envoy Air                  
#> 11 OO      SkyWest Airlines Inc.      
#> 12 UA      United Air Lines Inc.      
#> 13 US      US Airways Inc.            
#> 14 VX      Virgin America             
#> 15 WN      Southwest Airlines Co.     
#> 16 YV      Mesa Airlines Inc.

### key값 기준으로 왼쪽에 join
flights_sample2 %>% 
  left_join(airlines, by="carrier")
#> # A tibble: 336,776 × 6
#>     year month   day origin carrier name                    
#>    <int> <int> <int> <chr>  <chr>   <chr>                   
#>  1  2013     1     1 EWR    UA      United Air Lines Inc.   
#>  2  2013     1     1 LGA    UA      United Air Lines Inc.   
#>  3  2013     1     1 JFK    AA      American Airlines Inc.  
#>  4  2013     1     1 JFK    B6      JetBlue Airways         
#>  5  2013     1     1 LGA    DL      Delta Air Lines Inc.    
#>  6  2013     1     1 EWR    UA      United Air Lines Inc.   
#>  7  2013     1     1 EWR    B6      JetBlue Airways         
#>  8  2013     1     1 LGA    EV      ExpressJet Airlines Inc.
#>  9  2013     1     1 JFK    B6      JetBlue Airways         
#> 10  2013     1     1 LGA    AA      American Airlines Inc.  
#> # … with 336,766 more rows

### key값 컬럼 이름이 다를 때
colnames(airlines)[1] <- 'different'

flights_sample2 %>% 
  left_join(airlines, by=c("carrier" = "different"))
#> # A tibble: 336,776 × 6
#>     year month   day origin carrier name                    
#>    <int> <int> <int> <chr>  <chr>   <chr>                   
#>  1  2013     1     1 EWR    UA      United Air Lines Inc.   
#>  2  2013     1     1 LGA    UA      United Air Lines Inc.   
#>  3  2013     1     1 JFK    AA      American Airlines Inc.  
#>  4  2013     1     1 JFK    B6      JetBlue Airways         
#>  5  2013     1     1 LGA    DL      Delta Air Lines Inc.    
#>  6  2013     1     1 EWR    UA      United Air Lines Inc.   
#>  7  2013     1     1 EWR    B6      JetBlue Airways         
#>  8  2013     1     1 LGA    EV      ExpressJet Airlines Inc.
#>  9  2013     1     1 JFK    B6      JetBlue Airways         
#> 10  2013     1     1 LGA    AA      American Airlines Inc.  
#> # … with 336,766 more rows

Quiz

### 1
flights %>%
  group_by(month) %>% 
  summarise(max_dep_delay = max(dep_delay, na.rm = T)) %>% 
  arrange(-max_dep_delay)
#> # A tibble: 12 × 2
#>    month max_dep_delay
#>    <int>         <dbl>
#>  1     1          1301
#>  2     6          1137
#>  3     9          1014
#>  4     7          1005
#>  5     4           960
#>  6     3           911
#>  7    12           896
#>  8     5           878
#>  9     2           853
#> 10    11           798
#> 11    10           702
#> 12     8           520

### 2
flights %>%
  group_by(month) %>% 
  summarise(mean_dep_delay = mean(dep_delay, na.rm = T),
            max_dep_delay = max(dep_delay, na.rm = T),
            compare = max_dep_delay/mean_dep_delay)
#> # A tibble: 12 × 4
#>    month mean_dep_delay max_dep_delay compare
#>    <int>          <dbl>         <dbl>   <dbl>
#>  1     1          10.0           1301   130. 
#>  2     2          10.8            853    78.9
#>  3     3          13.2            911    68.9
#>  4     4          13.9            960    68.9
#>  5     5          13.0            878    67.6
#>  6     6          20.8           1137    54.5
#>  7     7          21.7           1005    46.3
#>  8     8          12.6            520    41.2
#>  9     9           6.72          1014   151. 
#> 10    10           6.24           702   112. 
#> 11    11           5.44           798   147. 
#> 12    12          16.6            896    54.1