Data Transformation

오픈데이터 분석 실습 : Data Transformation

패키지 불러오기

library(tidyverse)
library(nycflights13)

간단한 데이터셋 파악

nycflights13

### 도움말
#?flights

### 요약
flights %>% str()
#> tibble [336,776 × 19] (S3: tbl_df/tbl/data.frame)
#>  $ year          : int [1:336776] 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
#>  $ month         : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
#>  $ day           : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
#>  $ dep_time      : int [1:336776] 517 533 542 544 554 554 555 557 557 558 ...
#>  $ sched_dep_time: int [1:336776] 515 529 540 545 600 558 600 600 600 600 ...
#>  $ dep_delay     : num [1:336776] 2 4 2 -1 -6 -4 -5 -3 -3 -2 ...
#>  $ arr_time      : int [1:336776] 830 850 923 1004 812 740 913 709 838 753 ...
#>  $ sched_arr_time: int [1:336776] 819 830 850 1022 837 728 854 723 846 745 ...
#>  $ arr_delay     : num [1:336776] 11 20 33 -18 -25 12 19 -14 -8 8 ...
#>  $ carrier       : chr [1:336776] "UA" "UA" "AA" "B6" ...
#>  $ flight        : int [1:336776] 1545 1714 1141 725 461 1696 507 5708 79 301 ...
#>  $ tailnum       : chr [1:336776] "N14228" "N24211" "N619AA" "N804JB" ...
#>  $ origin        : chr [1:336776] "EWR" "LGA" "JFK" "JFK" ...
#>  $ dest          : chr [1:336776] "IAH" "IAH" "MIA" "BQN" ...
#>  $ air_time      : num [1:336776] 227 227 160 183 116 150 158 53 140 138 ...
#>  $ distance      : num [1:336776] 1400 1416 1089 1576 762 ...
#>  $ hour          : num [1:336776] 5 5 5 5 6 5 6 6 6 6 ...
#>  $ minute        : num [1:336776] 15 29 40 45 0 58 0 0 0 0 ...
#>  $ time_hour     : POSIXct[1:336776], format: "2013-01-01 05:00:00" "2013-01-01 05:00:00" ...

### 컬럼 이름
flights %>% colnames()
#>  [1] "year"           "month"          "day"            "dep_time"      
#>  [5] "sched_dep_time" "dep_delay"      "arr_time"       "sched_arr_time"
#>  [9] "arr_delay"      "carrier"        "flight"         "tailnum"       
#> [13] "origin"         "dest"           "air_time"       "distance"      
#> [17] "hour"           "minute"         "time_hour"

dplyr 기초

filter

조건 필터링

### table - 빈도수 파악
flights$month %>% table()
#> .
#>     1     2     3     4     5     6     7     8     9    10    11    12 
#> 27004 24951 28834 28330 28796 28243 29425 29327 27574 28889 27268 28135

### 월별 필터링
flights %>%
  filter(month==12 | month==11)
#> # A tibble: 55,403 × 19
#>     year month   day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
#>    <int> <int> <int>    <int>      <int>   <dbl>   <int>   <int>   <dbl> <chr>  
#>  1  2013    11     1        5       2359       6     352     345       7 B6     
#>  2  2013    11     1       35       2250     105     123    2356      87 B6     
#>  3  2013    11     1      455        500      -5     641     651     -10 US     
#>  4  2013    11     1      539        545      -6     856     827      29 UA     
#>  5  2013    11     1      542        545      -3     831     855     -24 AA     
#>  6  2013    11     1      549        600     -11     912     923     -11 UA     
#>  7  2013    11     1      550        600     -10     705     659       6 US     
#>  8  2013    11     1      554        600      -6     659     701      -2 US     
#>  9  2013    11     1      554        600      -6     826     827      -1 DL     
#> 10  2013    11     1      554        600      -6     749     751      -2 DL     
#> # … with 55,393 more rows, 9 more variables: flight <int>, tailnum <chr>,
#> #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
#> #   minute <dbl>, time_hour <dttm>, and abbreviated variable names
#> #   ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay

### 월별 필터링 - %in% 사용
flights %>% 
  filter(month %in% c(11,12))
#> # A tibble: 55,403 × 19
#>     year month   day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
#>    <int> <int> <int>    <int>      <int>   <dbl>   <int>   <int>   <dbl> <chr>  
#>  1  2013    11     1        5       2359       6     352     345       7 B6     
#>  2  2013    11     1       35       2250     105     123    2356      87 B6     
#>  3  2013    11     1      455        500      -5     641     651     -10 US     
#>  4  2013    11     1      539        545      -6     856     827      29 UA     
#>  5  2013    11     1      542        545      -3     831     855     -24 AA     
#>  6  2013    11     1      549        600     -11     912     923     -11 UA     
#>  7  2013    11     1      550        600     -10     705     659       6 US     
#>  8  2013    11     1      554        600      -6     659     701      -2 US     
#>  9  2013    11     1      554        600      -6     826     827      -1 DL     
#> 10  2013    11     1      554        600      -6     749     751      -2 DL     
#> # … with 55,393 more rows, 9 more variables: flight <int>, tailnum <chr>,
#> #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
#> #   minute <dbl>, time_hour <dttm>, and abbreviated variable names
#> #   ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay

missing values

결측값 NA

### 결측값은 연산 불가능
NA
#> [1] NA

NA > 5
#> [1] NA

10 == NA
#> [1] NA

NA + 19
#> [1] NA

NA / 2
#> [1] NA

### 결측값 개수
flights$dep_time %>%
  is.na() %>% 
  sum()
#> [1] 8255

### 결측값 있는 df 생성
df <- data.frame(x = c(1, NA, 3))
df
#>    x
#> 1  1
#> 2 NA
#> 3  3

### 타입 확인
df %>% class()
#> [1] "data.frame"

### tibble : tidyverse의 df 클래스
tibble_df <- tibble(x = c(1, NA, 3))
tibble_df
#> # A tibble: 3 × 1
#>       x
#>   <dbl>
#> 1     1
#> 2    NA
#> 3     3

tibble_df %>% filter(x > 1)
#> # A tibble: 1 × 1
#>       x
#>   <dbl>
#> 1     3

tibble_df %>% filter(is.na(x) | x>1)
#> # A tibble: 2 × 1
#>       x
#>   <dbl>
#> 1    NA
#> 2     3

arrange

정렬

### 오름차순 정렬
flights %>% arrange(dep_time)
#> # A tibble: 336,776 × 19
#>     year month   day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
#>    <int> <int> <int>    <int>      <int>   <dbl>   <int>   <int>   <dbl> <chr>  
#>  1  2013     1    13        1       2249      72     108    2357      71 B6     
#>  2  2013     1    31        1       2100     181     124    2225     179 WN     
#>  3  2013    11    13        1       2359       2     442     440       2 B6     
#>  4  2013    12    16        1       2359       2     447     437      10 B6     
#>  5  2013    12    20        1       2359       2     430     440     -10 B6     
#>  6  2013    12    26        1       2359       2     437     440      -3 B6     
#>  7  2013    12    30        1       2359       2     441     437       4 B6     
#>  8  2013     2    11        1       2100     181     111    2225     166 WN     
#>  9  2013     2    24        1       2245      76     121    2354      87 B6     
#> 10  2013     3     8        1       2355       6     431     440      -9 B6     
#> # … with 336,766 more rows, 9 more variables: flight <int>, tailnum <chr>,
#> #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
#> #   minute <dbl>, time_hour <dttm>, and abbreviated variable names
#> #   ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay

### 결측값은 가장 하단 배치
df <- tibble(x = c(5,2,NA))
arrange(df, x)
#> # A tibble: 3 × 1
#>       x
#>   <dbl>
#> 1     2
#> 2     5
#> 3    NA

arrange(df, -x)
#> # A tibble: 3 × 1
#>       x
#>   <dbl>
#> 1     5
#> 2     2
#> 3    NA

select

컬럼 선택

### 컬럼 선택 (순서 지정 가능)
flights %>% select(year, month, day)
#> # A tibble: 336,776 × 3
#>     year month   day
#>    <int> <int> <int>
#>  1  2013     1     1
#>  2  2013     1     1
#>  3  2013     1     1
#>  4  2013     1     1
#>  5  2013     1     1
#>  6  2013     1     1
#>  7  2013     1     1
#>  8  2013     1     1
#>  9  2013     1     1
#> 10  2013     1     1
#> # … with 336,766 more rows

flights %>% select(year:day)
#> # A tibble: 336,776 × 3
#>     year month   day
#>    <int> <int> <int>
#>  1  2013     1     1
#>  2  2013     1     1
#>  3  2013     1     1
#>  4  2013     1     1
#>  5  2013     1     1
#>  6  2013     1     1
#>  7  2013     1     1
#>  8  2013     1     1
#>  9  2013     1     1
#> 10  2013     1     1
#> # … with 336,766 more rows

### 컬럼 제외
flights %>% select(-(year:day))
#> # A tibble: 336,776 × 16
#>    dep_t…¹ sched…² dep_d…³ arr_t…⁴ sched…⁵ arr_d…⁶ carrier flight tailnum origin
#>      <int>   <int>   <dbl>   <int>   <int>   <dbl> <chr>    <int> <chr>   <chr> 
#>  1     517     515       2     830     819      11 UA        1545 N14228  EWR   
#>  2     533     529       4     850     830      20 UA        1714 N24211  LGA   
#>  3     542     540       2     923     850      33 AA        1141 N619AA  JFK   
#>  4     544     545      -1    1004    1022     -18 B6         725 N804JB  JFK   
#>  5     554     600      -6     812     837     -25 DL         461 N668DN  LGA   
#>  6     554     558      -4     740     728      12 UA        1696 N39463  EWR   
#>  7     555     600      -5     913     854      19 B6         507 N516JB  EWR   
#>  8     557     600      -3     709     723     -14 EV        5708 N829AS  LGA   
#>  9     557     600      -3     838     846      -8 B6          79 N593JB  JFK   
#> 10     558     600      -2     753     745       8 AA         301 N3ALAA  LGA   
#> # … with 336,766 more rows, 6 more variables: dest <chr>, air_time <dbl>,
#> #   distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>, and abbreviated
#> #   variable names ¹dep_time, ²sched_dep_time, ³dep_delay, ⁴arr_time,
#> #   ⁵sched_arr_time, ⁶arr_delay

### 시작 문자열 지정
flights %>% select(starts_with('dep'))
#> # A tibble: 336,776 × 2
#>    dep_time dep_delay
#>       <int>     <dbl>
#>  1      517         2
#>  2      533         4
#>  3      542         2
#>  4      544        -1
#>  5      554        -6
#>  6      554        -4
#>  7      555        -5
#>  8      557        -3
#>  9      557        -3
#> 10      558        -2
#> # … with 336,766 more rows

### 끝 문자열 지정
flights %>% select(ends_with("time"))
#> # A tibble: 336,776 × 5
#>    dep_time sched_dep_time arr_time sched_arr_time air_time
#>       <int>          <int>    <int>          <int>    <dbl>
#>  1      517            515      830            819      227
#>  2      533            529      850            830      227
#>  3      542            540      923            850      160
#>  4      544            545     1004           1022      183
#>  5      554            600      812            837      116
#>  6      554            558      740            728      150
#>  7      555            600      913            854      158
#>  8      557            600      709            723       53
#>  9      557            600      838            846      140
#> 10      558            600      753            745      138
#> # … with 336,766 more rows

### 포함 문자열 지정
flights %>% select(contains("time"))
#> # A tibble: 336,776 × 6
#>    dep_time sched_dep_time arr_time sched_arr_time air_time time_hour          
#>       <int>          <int>    <int>          <int>    <dbl> <dttm>             
#>  1      517            515      830            819      227 2013-01-01 05:00:00
#>  2      533            529      850            830      227 2013-01-01 05:00:00
#>  3      542            540      923            850      160 2013-01-01 05:00:00
#>  4      544            545     1004           1022      183 2013-01-01 05:00:00
#>  5      554            600      812            837      116 2013-01-01 06:00:00
#>  6      554            558      740            728      150 2013-01-01 05:00:00
#>  7      555            600      913            854      158 2013-01-01 06:00:00
#>  8      557            600      709            723       53 2013-01-01 06:00:00
#>  9      557            600      838            846      140 2013-01-01 06:00:00
#> 10      558            600      753            745      138 2013-01-01 06:00:00
#> # … with 336,766 more rows

### 컬럼 이름 변경 new = old
flights %>% rename(tail_num = tailnum)
#> # A tibble: 336,776 × 19
#>     year month   day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
#>    <int> <int> <int>    <int>      <int>   <dbl>   <int>   <int>   <dbl> <chr>  
#>  1  2013     1     1      517        515       2     830     819      11 UA     
#>  2  2013     1     1      533        529       4     850     830      20 UA     
#>  3  2013     1     1      542        540       2     923     850      33 AA     
#>  4  2013     1     1      544        545      -1    1004    1022     -18 B6     
#>  5  2013     1     1      554        600      -6     812     837     -25 DL     
#>  6  2013     1     1      554        558      -4     740     728      12 UA     
#>  7  2013     1     1      555        600      -5     913     854      19 B6     
#>  8  2013     1     1      557        600      -3     709     723     -14 EV     
#>  9  2013     1     1      557        600      -3     838     846      -8 B6     
#> 10  2013     1     1      558        600      -2     753     745       8 AA     
#> # … with 336,766 more rows, 9 more variables: flight <int>, tail_num <chr>,
#> #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
#> #   minute <dbl>, time_hour <dttm>, and abbreviated variable names
#> #   ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay

### 순서 배치 + 나머지
flights %>% select(time_hour, air_time, everything())
#> # A tibble: 336,776 × 19
#>    time_hour           air_t…¹  year month   day dep_t…² sched…³ dep_d…⁴ arr_t…⁵
#>    <dttm>                <dbl> <int> <int> <int>   <int>   <int>   <dbl>   <int>
#>  1 2013-01-01 05:00:00     227  2013     1     1     517     515       2     830
#>  2 2013-01-01 05:00:00     227  2013     1     1     533     529       4     850
#>  3 2013-01-01 05:00:00     160  2013     1     1     542     540       2     923
#>  4 2013-01-01 05:00:00     183  2013     1     1     544     545      -1    1004
#>  5 2013-01-01 06:00:00     116  2013     1     1     554     600      -6     812
#>  6 2013-01-01 05:00:00     150  2013     1     1     554     558      -4     740
#>  7 2013-01-01 06:00:00     158  2013     1     1     555     600      -5     913
#>  8 2013-01-01 06:00:00      53  2013     1     1     557     600      -3     709
#>  9 2013-01-01 06:00:00     140  2013     1     1     557     600      -3     838
#> 10 2013-01-01 06:00:00     138  2013     1     1     558     600      -2     753
#> # … with 336,766 more rows, 10 more variables: sched_arr_time <int>,
#> #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>, origin <chr>,
#> #   dest <chr>, distance <dbl>, hour <dbl>, minute <dbl>, and abbreviated
#> #   variable names ¹air_time, ²dep_time, ³sched_dep_time, ⁴dep_delay, ⁵arr_time

mutate

새로운 컬럼 생성

flights %>% mutate(
  gain = dep_delay - arr_delay,
  hours = air_time / 60,
  gain_per_hour = gain / hours
)
#> # A tibble: 336,776 × 22
#>     year month   day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
#>    <int> <int> <int>    <int>      <int>   <dbl>   <int>   <int>   <dbl> <chr>  
#>  1  2013     1     1      517        515       2     830     819      11 UA     
#>  2  2013     1     1      533        529       4     850     830      20 UA     
#>  3  2013     1     1      542        540       2     923     850      33 AA     
#>  4  2013     1     1      544        545      -1    1004    1022     -18 B6     
#>  5  2013     1     1      554        600      -6     812     837     -25 DL     
#>  6  2013     1     1      554        558      -4     740     728      12 UA     
#>  7  2013     1     1      555        600      -5     913     854      19 B6     
#>  8  2013     1     1      557        600      -3     709     723     -14 EV     
#>  9  2013     1     1      557        600      -3     838     846      -8 B6     
#> 10  2013     1     1      558        600      -2     753     745       8 AA     
#> # … with 336,766 more rows, 12 more variables: flight <int>, tailnum <chr>,
#> #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
#> #   minute <dbl>, time_hour <dttm>, gain <dbl>, hours <dbl>,
#> #   gain_per_hour <dbl>, and abbreviated variable names ¹sched_dep_time,
#> #   ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay

### transmute : mutate와 다르게 반환된 값만 출력 
flights %>% transmute(
  gain = dep_delay - arr_delay,
  hours = air_time / 60,
  gain_per_hour = gain / hours
)
#> # A tibble: 336,776 × 3
#>     gain hours gain_per_hour
#>    <dbl> <dbl>         <dbl>
#>  1    -9 3.78          -2.38
#>  2   -16 3.78          -4.23
#>  3   -31 2.67         -11.6 
#>  4    17 3.05           5.57
#>  5    19 1.93           9.83
#>  6   -16 2.5           -6.4 
#>  7   -24 2.63          -9.11
#>  8    11 0.883         12.5 
#>  9     5 2.33           2.14
#> 10   -10 2.3           -4.35
#> # … with 336,766 more rows

유용한 기능

### 몫
5 %/% 3
#> [1] 1

### 나머지
5 %% 3
#> [1] 2

### 그룹화 + 집계값 요약
flights %>% 
  group_by(year, month, day) %>% 
  summarise(mean = mean(dep_delay))
#> `summarise()` has grouped output by 'year', 'month'. You can override using the
#> `.groups` argument.
#> # A tibble: 365 × 4
#> # Groups:   year, month [12]
#>     year month   day  mean
#>    <int> <int> <int> <dbl>
#>  1  2013     1     1    NA
#>  2  2013     1     2    NA
#>  3  2013     1     3    NA
#>  4  2013     1     4    NA
#>  5  2013     1     5    NA
#>  6  2013     1     6    NA
#>  7  2013     1     7    NA
#>  8  2013     1     8    NA
#>  9  2013     1     9    NA
#> 10  2013     1    10    NA
#> # … with 355 more rows

### 그룹화 + 집계값 요약 + NA 제거
flights %>% 
  group_by(year, month, day) %>% 
  summarise(mean = mean(dep_delay, na.rm = T))
#> `summarise()` has grouped output by 'year', 'month'. You can override using the
#> `.groups` argument.
#> # A tibble: 365 × 4
#> # Groups:   year, month [12]
#>     year month   day  mean
#>    <int> <int> <int> <dbl>
#>  1  2013     1     1 11.5 
#>  2  2013     1     2 13.9 
#>  3  2013     1     3 11.0 
#>  4  2013     1     4  8.95
#>  5  2013     1     5  5.73
#>  6  2013     1     6  7.15
#>  7  2013     1     7  5.42
#>  8  2013     1     8  2.55
#>  9  2013     1     9  2.28
#> 10  2013     1    10  2.84
#> # … with 355 more rows

### !is.na : 결측값 아닌 값들만 출력
flights %>% 
  filter(!is.na(dep_delay), !is.na(arr_delay))
#> # A tibble: 327,346 × 19
#>     year month   day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
#>    <int> <int> <int>    <int>      <int>   <dbl>   <int>   <int>   <dbl> <chr>  
#>  1  2013     1     1      517        515       2     830     819      11 UA     
#>  2  2013     1     1      533        529       4     850     830      20 UA     
#>  3  2013     1     1      542        540       2     923     850      33 AA     
#>  4  2013     1     1      544        545      -1    1004    1022     -18 B6     
#>  5  2013     1     1      554        600      -6     812     837     -25 DL     
#>  6  2013     1     1      554        558      -4     740     728      12 UA     
#>  7  2013     1     1      555        600      -5     913     854      19 B6     
#>  8  2013     1     1      557        600      -3     709     723     -14 EV     
#>  9  2013     1     1      557        600      -3     838     846      -8 B6     
#> 10  2013     1     1      558        600      -2     753     745       8 AA     
#> # … with 327,336 more rows, 9 more variables: flight <int>, tailnum <chr>,
#> #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
#> #   minute <dbl>, time_hour <dttm>, and abbreviated variable names
#> #   ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay

### 그룹화 해제 후 집계값 요약
group <- flights %>%
  group_by(year, month, day)

group %>% 
  ungroup() %>% 
  summarise(flights = n())
#> # A tibble: 1 × 1
#>   flights
#>     <int>
#> 1  336776

### drop_na
tibble_df %>% drop_na()
#> # A tibble: 2 × 1
#>       x
#>   <dbl>
#> 1     1
#> 2     3

### mutate 컬럼 위치 지정
flights %>%
  mutate(mean_arr_time = mean(arr_time, na.rm = T), .after = arr_time)
#> # A tibble: 336,776 × 20
#>     year month   day dep_time sched_de…¹ dep_d…² arr_t…³ mean_…⁴ sched…⁵ arr_d…⁶
#>    <int> <int> <int>    <int>      <int>   <dbl>   <int>   <dbl>   <int>   <dbl>
#>  1  2013     1     1      517        515       2     830   1502.     819      11
#>  2  2013     1     1      533        529       4     850   1502.     830      20
#>  3  2013     1     1      542        540       2     923   1502.     850      33
#>  4  2013     1     1      544        545      -1    1004   1502.    1022     -18
#>  5  2013     1     1      554        600      -6     812   1502.     837     -25
#>  6  2013     1     1      554        558      -4     740   1502.     728      12
#>  7  2013     1     1      555        600      -5     913   1502.     854      19
#>  8  2013     1     1      557        600      -3     709   1502.     723     -14
#>  9  2013     1     1      557        600      -3     838   1502.     846      -8
#> 10  2013     1     1      558        600      -2     753   1502.     745       8
#> # … with 336,766 more rows, 10 more variables: carrier <chr>, flight <int>,
#> #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#> #   hour <dbl>, minute <dbl>, time_hour <dttm>, and abbreviated variable names
#> #   ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴mean_arr_time, ⁵sched_arr_time,
#> #   ⁶arr_delay

rank, slice

행의 범위 지정

### 년/월/일자별 top10 'arr_delay' 출력

### rank 사용
flights %>% 
  group_by(year,month,day) %>% 
  filter(rank(desc(arr_delay)) < 11) %>% 
  select(year,month,day, arr_delay)
#> # A tibble: 3,691 × 4
#> # Groups:   year, month, day [365]
#>     year month   day arr_delay
#>    <int> <int> <int>     <dbl>
#>  1  2013     1     1       851
#>  2  2013     1     1       338
#>  3  2013     1     1       263
#>  4  2013     1     1       166
#>  5  2013     1     1       174
#>  6  2013     1     1       222
#>  7  2013     1     1       250
#>  8  2013     1     1       246
#>  9  2013     1     1       191
#> 10  2013     1     1       456
#> # … with 3,681 more rows

### slice 사용
flights %>% 
  group_by(year,month,day) %>% 
  slice_max(arr_delay,n=10) %>% 
  select(year,month,day, arr_delay)
#> # A tibble: 3,697 × 4
#> # Groups:   year, month, day [365]
#>     year month   day arr_delay
#>    <int> <int> <int>     <dbl>
#>  1  2013     1     1       851
#>  2  2013     1     1       456
#>  3  2013     1     1       338
#>  4  2013     1     1       263
#>  5  2013     1     1       250
#>  6  2013     1     1       246
#>  7  2013     1     1       222
#>  8  2013     1     1       191
#>  9  2013     1     1       174
#> 10  2013     1     1       166
#> # … with 3,687 more rows

### 순위 책정
flights %>% 
  select(year,month,day,arr_delay) %>% 
  arrange(year,month,day,desc(arr_delay)) %>% 
  group_by(year,month,day) %>% 
  mutate(rank = rank(desc(arr_delay)))
#> # A tibble: 336,776 × 5
#> # Groups:   year, month, day [365]
#>     year month   day arr_delay  rank
#>    <int> <int> <int>     <dbl> <dbl>
#>  1  2013     1     1       851     1
#>  2  2013     1     1       456     2
#>  3  2013     1     1       338     3
#>  4  2013     1     1       263     4
#>  5  2013     1     1       250     5
#>  6  2013     1     1       246     6
#>  7  2013     1     1       222     7
#>  8  2013     1     1       191     8
#>  9  2013     1     1       174     9
#> 10  2013     1     1       166    10
#> # … with 336,766 more rows

Quiz 1

dep_time의 결측지를 dep_time의 평균 값으로 교체하시오

### NA 제외한 평균값
mean_dep_time <- mean(flights$dep_time,na.rm = T) %>% as.integer()
mean_dep_time
#> [1] 1349

### repalce_na 
flights$dep_time <- flights$dep_time %>% 
  replace_na(mean_dep_time)

flights$dep_time %>% is.na() %>% sum()
#> [1] 0

### mutate + ifelse
flights %>% 
  mutate(dep_time = ifelse(is.na(dep_time),
                           mean_dep_time,dep_time))
#> # A tibble: 336,776 × 19
#>     year month   day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
#>    <int> <int> <int>    <int>      <int>   <dbl>   <int>   <int>   <dbl> <chr>  
#>  1  2013     1     1      517        515       2     830     819      11 UA     
#>  2  2013     1     1      533        529       4     850     830      20 UA     
#>  3  2013     1     1      542        540       2     923     850      33 AA     
#>  4  2013     1     1      544        545      -1    1004    1022     -18 B6     
#>  5  2013     1     1      554        600      -6     812     837     -25 DL     
#>  6  2013     1     1      554        558      -4     740     728      12 UA     
#>  7  2013     1     1      555        600      -5     913     854      19 B6     
#>  8  2013     1     1      557        600      -3     709     723     -14 EV     
#>  9  2013     1     1      557        600      -3     838     846      -8 B6     
#> 10  2013     1     1      558        600      -2     753     745       8 AA     
#> # … with 336,766 more rows, 9 more variables: flight <int>, tailnum <chr>,
#> #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
#> #   minute <dbl>, time_hour <dttm>, and abbreviated variable names
#> #   ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay

Quiz 2

1. 월별 비행기 개수를 구하시오
1. dest(도착공항)별로 비행기가 10000대 이상 착륙한 공항으로 도착한 비행정보만 추출

### 1
flights %>%
  group_by(month) %>% 
  summarise(fly_count = n())
#> # A tibble: 12 × 2
#>    month fly_count
#>    <int>     <int>
#>  1     1     27004
#>  2     2     24951
#>  3     3     28834
#>  4     4     28330
#>  5     5     28796
#>  6     6     28243
#>  7     7     29425
#>  8     8     29327
#>  9     9     27574
#> 10    10     28889
#> 11    11     27268
#> 12    12     28135

### 2
air <- flights %>% 
  group_by(dest) %>%
  filter(n() >= 10000)

air %>% nrow()
#> [1] 131440

table(air$dest)
#> 
#>   ATL   BOS   CLT   FLL   LAX   MCO   MIA   ORD   SFO 
#> 17215 15508 14064 12055 16174 14082 11728 17283 13331

Quiz 3

다음은 미국 NBA 농구리그의 농구선수별 게임당 경기통계(stats)이다.

Pos: 농구에서 선수의 포지션
Age: 나이
Tm: 팀이름
3P: 3점슛 성공횟수
3PA: 3점슛 시도횟수
3P%: 3점 성공률
PTS: 평균득점

일 때 다음 물음에 답하시오.

stats <-  read_csv("C:/Users/seong taek/Desktop/3-1 Opendata_Analysis/opendata/nba2021_per_game.csv")
#> Rows: 497 Columns: 29
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: ","
#> chr  (3): Player, Pos, Tm
#> dbl (26): Age, G, GS, MP, FG, FGA, FG%, 3P, 3PA, 3P%, 2P, 2PA, 2P%, eFG%, FT...
#> 
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
stats
#> # A tibble: 497 × 29
#>    Player      Pos     Age Tm        G    GS    MP    FG   FGA `FG%`  `3P` `3PA`
#>    <chr>       <chr> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#>  1 Precious A… PF       21 MIA      28     2  14.6   2.6   4.4 0.59    0     0  
#>  2 Jaylen Ada… PG       24 MIL       6     0   2.8   0.2   1.3 0.125   0     0.3
#>  3 Steven Ada… C        27 NOP      27    27  28.1   3.5   5.8 0.603   0     0  
#>  4 Bam Adebayo C        23 MIA      26    26  33.6   7.4  12.9 0.573   0.1   0.2
#>  5 LaMarcus A… C        35 SAS      18    18  26.7   5.9  12.5 0.476   1.3   3.7
#>  6 Ty-Shon Al… SG       22 PHO       3     0   2.7   0     1   0       0     0.3
#>  7 Nickeil Al… SG       22 NOP      23     3  19.2   3.3   8.2 0.41    1     3.8
#>  8 Grayson Al… SG       25 MEM      19     8  23.9   3.2   7.4 0.429   2.3   5.3
#>  9 Jarrett Al… C        22 TOT      28    10  26.2   4.4   6.8 0.642   0     0.1
#> 10 Jarrett Al… C        22 BRK      12     5  26.7   3.7   5.4 0.677   0     0  
#> # … with 487 more rows, and 17 more variables: `3P%` <dbl>, `2P` <dbl>,
#> #   `2PA` <dbl>, `2P%` <dbl>, `eFG%` <dbl>, FT <dbl>, FTA <dbl>, `FT%` <dbl>,
#> #   ORB <dbl>, DRB <dbl>, TRB <dbl>, AST <dbl>, STL <dbl>, BLK <dbl>,
#> #   TOV <dbl>, PF <dbl>, PTS <dbl>

1. 위의 dataframe을 가지고 group 별 통계값을 계산하시오.

NBA 농구팀별(Tm)로 가장 평균득점(PTS)이 높은 사람과 낮은 사람을 추출하는 코드를 작성하시오.
또한, 팀별로 평균득점 최대값과 최소값의 차이를 gap이라는 컬럼을 새로 만들어 나타내시오.

stats %>% 
  group_by(Tm) %>% 
  summarise(max_score = max(PTS),
         min_score = min(PTS),
         gap = max_score - min_score,
         max_player = Player[which.max(PTS)],
         min_player = Player[which.min(PTS)])
#> # A tibble: 31 × 6
#>    Tm    max_score min_score   gap max_player     min_player       
#>    <chr>     <dbl>     <dbl> <dbl> <chr>          <chr>            
#>  1 ATL        26.5       1.8  24.7 Trae Young     Bruno Fernando   
#>  2 BOS        25.9       2.4  23.5 Jaylen Brown   Tremont Waters   
#>  3 BRK        29         0    29   Kevin Durant   Noah Vonleh      
#>  4 CHI        28.5       1.5  27   Zach LaVine    Luke Kornet      
#>  5 CHO        22.3       1    21.3 Gordon Hayward Vernon Carey Jr. 
#>  6 CLE        22.8       1.2  21.6 Collin Sexton  Marques Bolden   
#>  7 DAL        29.1       1    28.1 Luka Dončić    Tyrell Terry     
#>  8 DEN        27.4       0.6  26.8 Nikola Jokić   Vlatko Čančar    
#>  9 DET        23.8       0    23.8 Jerami Grant   Deividas Sirvydis
#> 10 GSW        30         1.5  28.5 Stephen Curry  Nico Mannion     
#> # … with 21 more rows

2. 각 포지션별로(Pos) 평균나이 대비 해당 선수의 나이가 몇배 높거나 낮은지 비율을 계산하시오

예를들어 PG 포지션의 평균나이는 27세이고, Chris Paul의 나이는 35세이므로, 비율은 35/27이 된다.
모든 선수에 대해 이 비율을 age_ratio_by_position이라는 새로운 컬럼으로 저장하는 코드를 작성하시오.
단 , 포지션 별로 비율이 가장 높은 한명만 추출

stats %>% 
  group_by(Pos) %>% 
  mutate(mean_age = mean(Age),
         age_ratio_by_position = Age/mean_age) %>% 
  select(Player, Pos, Age, mean_age, age_ratio_by_position) %>% 
  arrange(-age_ratio_by_position) %>% 
  slice_max(age_ratio_by_position, n=1)
#> # A tibble: 9 × 5
#> # Groups:   Pos [9]
#>   Player          Pos     Age mean_age age_ratio_by_position
#>   <chr>           <chr> <dbl>    <dbl>                 <dbl>
#> 1 Marc Gasol      C        36     26.1                  1.38
#> 2 Noah Vonleh     F        25     24                    1.04
#> 3 Norvel Pelle    F-C      27     27                    1   
#> 4 Derrick Rose    G        32     27                    1.19
#> 5 Carmelo Anthony PF       36     25.9                  1.39
#> 6 LeBron James    PG       36     25.6                  1.41
#> 7 Andre Iguodala  SF       37     25.4                  1.46
#> 8 Rodions Kurucs  SF-PF    22     22                    1   
#> 9 J.J. Redick     SG       36     25.3                  1.43

Data Transformation

Seongtaek

2023-04-15

오픈데이터 분석 실습 : Data Transformation

패키지 불러오기

간단한 데이터셋 파악

dplyr 기초

filter

missing values

arrange

select

mutate

유용한 기능

rank, slice

Quiz 1

Quiz 2

Quiz 3

1. 위의 dataframe을 가지고 group 별 통계값을 계산하시오.

2. 각 포지션별로(Pos) 평균나이 대비 해당 선수의 나이가 몇배 높거나 낮은지 비율을 계산하시오