오픈데이터 분석 실습 : Data Transformation
패키지 불러오기
library(tidyverse)
library(nycflights13)
간단한 데이터셋 파악
- nycflights13
### 도움말
#?flights
### 요약
%>% str()
flights #> tibble [336,776 × 19] (S3: tbl_df/tbl/data.frame)
#> $ year : int [1:336776] 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
#> $ month : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
#> $ day : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
#> $ dep_time : int [1:336776] 517 533 542 544 554 554 555 557 557 558 ...
#> $ sched_dep_time: int [1:336776] 515 529 540 545 600 558 600 600 600 600 ...
#> $ dep_delay : num [1:336776] 2 4 2 -1 -6 -4 -5 -3 -3 -2 ...
#> $ arr_time : int [1:336776] 830 850 923 1004 812 740 913 709 838 753 ...
#> $ sched_arr_time: int [1:336776] 819 830 850 1022 837 728 854 723 846 745 ...
#> $ arr_delay : num [1:336776] 11 20 33 -18 -25 12 19 -14 -8 8 ...
#> $ carrier : chr [1:336776] "UA" "UA" "AA" "B6" ...
#> $ flight : int [1:336776] 1545 1714 1141 725 461 1696 507 5708 79 301 ...
#> $ tailnum : chr [1:336776] "N14228" "N24211" "N619AA" "N804JB" ...
#> $ origin : chr [1:336776] "EWR" "LGA" "JFK" "JFK" ...
#> $ dest : chr [1:336776] "IAH" "IAH" "MIA" "BQN" ...
#> $ air_time : num [1:336776] 227 227 160 183 116 150 158 53 140 138 ...
#> $ distance : num [1:336776] 1400 1416 1089 1576 762 ...
#> $ hour : num [1:336776] 5 5 5 5 6 5 6 6 6 6 ...
#> $ minute : num [1:336776] 15 29 40 45 0 58 0 0 0 0 ...
#> $ time_hour : POSIXct[1:336776], format: "2013-01-01 05:00:00" "2013-01-01 05:00:00" ...
### 컬럼 이름
%>% colnames()
flights #> [1] "year" "month" "day" "dep_time"
#> [5] "sched_dep_time" "dep_delay" "arr_time" "sched_arr_time"
#> [9] "arr_delay" "carrier" "flight" "tailnum"
#> [13] "origin" "dest" "air_time" "distance"
#> [17] "hour" "minute" "time_hour"
dplyr 기초
filter
- 조건 필터링
### table - 빈도수 파악
$month %>% table()
flights#> .
#> 1 2 3 4 5 6 7 8 9 10 11 12
#> 27004 24951 28834 28330 28796 28243 29425 29327 27574 28889 27268 28135
### 월별 필터링
%>%
flights filter(month==12 | month==11)
#> # A tibble: 55,403 × 19
#> year month day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
#> <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr>
#> 1 2013 11 1 5 2359 6 352 345 7 B6
#> 2 2013 11 1 35 2250 105 123 2356 87 B6
#> 3 2013 11 1 455 500 -5 641 651 -10 US
#> 4 2013 11 1 539 545 -6 856 827 29 UA
#> 5 2013 11 1 542 545 -3 831 855 -24 AA
#> 6 2013 11 1 549 600 -11 912 923 -11 UA
#> 7 2013 11 1 550 600 -10 705 659 6 US
#> 8 2013 11 1 554 600 -6 659 701 -2 US
#> 9 2013 11 1 554 600 -6 826 827 -1 DL
#> 10 2013 11 1 554 600 -6 749 751 -2 DL
#> # … with 55,393 more rows, 9 more variables: flight <int>, tailnum <chr>,
#> # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
#> # minute <dbl>, time_hour <dttm>, and abbreviated variable names
#> # ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay
### 월별 필터링 - %in% 사용
%>%
flights filter(month %in% c(11,12))
#> # A tibble: 55,403 × 19
#> year month day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
#> <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr>
#> 1 2013 11 1 5 2359 6 352 345 7 B6
#> 2 2013 11 1 35 2250 105 123 2356 87 B6
#> 3 2013 11 1 455 500 -5 641 651 -10 US
#> 4 2013 11 1 539 545 -6 856 827 29 UA
#> 5 2013 11 1 542 545 -3 831 855 -24 AA
#> 6 2013 11 1 549 600 -11 912 923 -11 UA
#> 7 2013 11 1 550 600 -10 705 659 6 US
#> 8 2013 11 1 554 600 -6 659 701 -2 US
#> 9 2013 11 1 554 600 -6 826 827 -1 DL
#> 10 2013 11 1 554 600 -6 749 751 -2 DL
#> # … with 55,393 more rows, 9 more variables: flight <int>, tailnum <chr>,
#> # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
#> # minute <dbl>, time_hour <dttm>, and abbreviated variable names
#> # ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay
missing values
- 결측값 NA
### 결측값은 연산 불가능
NA
#> [1] NA
NA > 5
#> [1] NA
10 == NA
#> [1] NA
NA + 19
#> [1] NA
NA / 2
#> [1] NA
### 결측값 개수
$dep_time %>%
flightsis.na() %>%
sum()
#> [1] 8255
### 결측값 있는 df 생성
<- data.frame(x = c(1, NA, 3))
df
df#> x
#> 1 1
#> 2 NA
#> 3 3
### 타입 확인
%>% class()
df #> [1] "data.frame"
### tibble : tidyverse의 df 클래스
<- tibble(x = c(1, NA, 3))
tibble_df
tibble_df#> # A tibble: 3 × 1
#> x
#> <dbl>
#> 1 1
#> 2 NA
#> 3 3
%>% filter(x > 1)
tibble_df #> # A tibble: 1 × 1
#> x
#> <dbl>
#> 1 3
%>% filter(is.na(x) | x>1)
tibble_df #> # A tibble: 2 × 1
#> x
#> <dbl>
#> 1 NA
#> 2 3
arrange
- 정렬
### 오름차순 정렬
%>% arrange(dep_time)
flights #> # A tibble: 336,776 × 19
#> year month day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
#> <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr>
#> 1 2013 1 13 1 2249 72 108 2357 71 B6
#> 2 2013 1 31 1 2100 181 124 2225 179 WN
#> 3 2013 11 13 1 2359 2 442 440 2 B6
#> 4 2013 12 16 1 2359 2 447 437 10 B6
#> 5 2013 12 20 1 2359 2 430 440 -10 B6
#> 6 2013 12 26 1 2359 2 437 440 -3 B6
#> 7 2013 12 30 1 2359 2 441 437 4 B6
#> 8 2013 2 11 1 2100 181 111 2225 166 WN
#> 9 2013 2 24 1 2245 76 121 2354 87 B6
#> 10 2013 3 8 1 2355 6 431 440 -9 B6
#> # … with 336,766 more rows, 9 more variables: flight <int>, tailnum <chr>,
#> # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
#> # minute <dbl>, time_hour <dttm>, and abbreviated variable names
#> # ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay
### 결측값은 가장 하단 배치
<- tibble(x = c(5,2,NA))
df arrange(df, x)
#> # A tibble: 3 × 1
#> x
#> <dbl>
#> 1 2
#> 2 5
#> 3 NA
arrange(df, -x)
#> # A tibble: 3 × 1
#> x
#> <dbl>
#> 1 5
#> 2 2
#> 3 NA
select
- 컬럼 선택
### 컬럼 선택 (순서 지정 가능)
%>% select(year, month, day)
flights #> # A tibble: 336,776 × 3
#> year month day
#> <int> <int> <int>
#> 1 2013 1 1
#> 2 2013 1 1
#> 3 2013 1 1
#> 4 2013 1 1
#> 5 2013 1 1
#> 6 2013 1 1
#> 7 2013 1 1
#> 8 2013 1 1
#> 9 2013 1 1
#> 10 2013 1 1
#> # … with 336,766 more rows
%>% select(year:day)
flights #> # A tibble: 336,776 × 3
#> year month day
#> <int> <int> <int>
#> 1 2013 1 1
#> 2 2013 1 1
#> 3 2013 1 1
#> 4 2013 1 1
#> 5 2013 1 1
#> 6 2013 1 1
#> 7 2013 1 1
#> 8 2013 1 1
#> 9 2013 1 1
#> 10 2013 1 1
#> # … with 336,766 more rows
### 컬럼 제외
%>% select(-(year:day))
flights #> # A tibble: 336,776 × 16
#> dep_t…¹ sched…² dep_d…³ arr_t…⁴ sched…⁵ arr_d…⁶ carrier flight tailnum origin
#> <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr> <chr>
#> 1 517 515 2 830 819 11 UA 1545 N14228 EWR
#> 2 533 529 4 850 830 20 UA 1714 N24211 LGA
#> 3 542 540 2 923 850 33 AA 1141 N619AA JFK
#> 4 544 545 -1 1004 1022 -18 B6 725 N804JB JFK
#> 5 554 600 -6 812 837 -25 DL 461 N668DN LGA
#> 6 554 558 -4 740 728 12 UA 1696 N39463 EWR
#> 7 555 600 -5 913 854 19 B6 507 N516JB EWR
#> 8 557 600 -3 709 723 -14 EV 5708 N829AS LGA
#> 9 557 600 -3 838 846 -8 B6 79 N593JB JFK
#> 10 558 600 -2 753 745 8 AA 301 N3ALAA LGA
#> # … with 336,766 more rows, 6 more variables: dest <chr>, air_time <dbl>,
#> # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>, and abbreviated
#> # variable names ¹dep_time, ²sched_dep_time, ³dep_delay, ⁴arr_time,
#> # ⁵sched_arr_time, ⁶arr_delay
### 시작 문자열 지정
%>% select(starts_with('dep'))
flights #> # A tibble: 336,776 × 2
#> dep_time dep_delay
#> <int> <dbl>
#> 1 517 2
#> 2 533 4
#> 3 542 2
#> 4 544 -1
#> 5 554 -6
#> 6 554 -4
#> 7 555 -5
#> 8 557 -3
#> 9 557 -3
#> 10 558 -2
#> # … with 336,766 more rows
### 끝 문자열 지정
%>% select(ends_with("time"))
flights #> # A tibble: 336,776 × 5
#> dep_time sched_dep_time arr_time sched_arr_time air_time
#> <int> <int> <int> <int> <dbl>
#> 1 517 515 830 819 227
#> 2 533 529 850 830 227
#> 3 542 540 923 850 160
#> 4 544 545 1004 1022 183
#> 5 554 600 812 837 116
#> 6 554 558 740 728 150
#> 7 555 600 913 854 158
#> 8 557 600 709 723 53
#> 9 557 600 838 846 140
#> 10 558 600 753 745 138
#> # … with 336,766 more rows
### 포함 문자열 지정
%>% select(contains("time"))
flights #> # A tibble: 336,776 × 6
#> dep_time sched_dep_time arr_time sched_arr_time air_time time_hour
#> <int> <int> <int> <int> <dbl> <dttm>
#> 1 517 515 830 819 227 2013-01-01 05:00:00
#> 2 533 529 850 830 227 2013-01-01 05:00:00
#> 3 542 540 923 850 160 2013-01-01 05:00:00
#> 4 544 545 1004 1022 183 2013-01-01 05:00:00
#> 5 554 600 812 837 116 2013-01-01 06:00:00
#> 6 554 558 740 728 150 2013-01-01 05:00:00
#> 7 555 600 913 854 158 2013-01-01 06:00:00
#> 8 557 600 709 723 53 2013-01-01 06:00:00
#> 9 557 600 838 846 140 2013-01-01 06:00:00
#> 10 558 600 753 745 138 2013-01-01 06:00:00
#> # … with 336,766 more rows
### 컬럼 이름 변경 new = old
%>% rename(tail_num = tailnum)
flights #> # A tibble: 336,776 × 19
#> year month day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
#> <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr>
#> 1 2013 1 1 517 515 2 830 819 11 UA
#> 2 2013 1 1 533 529 4 850 830 20 UA
#> 3 2013 1 1 542 540 2 923 850 33 AA
#> 4 2013 1 1 544 545 -1 1004 1022 -18 B6
#> 5 2013 1 1 554 600 -6 812 837 -25 DL
#> 6 2013 1 1 554 558 -4 740 728 12 UA
#> 7 2013 1 1 555 600 -5 913 854 19 B6
#> 8 2013 1 1 557 600 -3 709 723 -14 EV
#> 9 2013 1 1 557 600 -3 838 846 -8 B6
#> 10 2013 1 1 558 600 -2 753 745 8 AA
#> # … with 336,766 more rows, 9 more variables: flight <int>, tail_num <chr>,
#> # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
#> # minute <dbl>, time_hour <dttm>, and abbreviated variable names
#> # ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay
### 순서 배치 + 나머지
%>% select(time_hour, air_time, everything())
flights #> # A tibble: 336,776 × 19
#> time_hour air_t…¹ year month day dep_t…² sched…³ dep_d…⁴ arr_t…⁵
#> <dttm> <dbl> <int> <int> <int> <int> <int> <dbl> <int>
#> 1 2013-01-01 05:00:00 227 2013 1 1 517 515 2 830
#> 2 2013-01-01 05:00:00 227 2013 1 1 533 529 4 850
#> 3 2013-01-01 05:00:00 160 2013 1 1 542 540 2 923
#> 4 2013-01-01 05:00:00 183 2013 1 1 544 545 -1 1004
#> 5 2013-01-01 06:00:00 116 2013 1 1 554 600 -6 812
#> 6 2013-01-01 05:00:00 150 2013 1 1 554 558 -4 740
#> 7 2013-01-01 06:00:00 158 2013 1 1 555 600 -5 913
#> 8 2013-01-01 06:00:00 53 2013 1 1 557 600 -3 709
#> 9 2013-01-01 06:00:00 140 2013 1 1 557 600 -3 838
#> 10 2013-01-01 06:00:00 138 2013 1 1 558 600 -2 753
#> # … with 336,766 more rows, 10 more variables: sched_arr_time <int>,
#> # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>, origin <chr>,
#> # dest <chr>, distance <dbl>, hour <dbl>, minute <dbl>, and abbreviated
#> # variable names ¹air_time, ²dep_time, ³sched_dep_time, ⁴dep_delay, ⁵arr_time
mutate
- 새로운 컬럼 생성
%>% mutate(
flights gain = dep_delay - arr_delay,
hours = air_time / 60,
gain_per_hour = gain / hours
)#> # A tibble: 336,776 × 22
#> year month day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
#> <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr>
#> 1 2013 1 1 517 515 2 830 819 11 UA
#> 2 2013 1 1 533 529 4 850 830 20 UA
#> 3 2013 1 1 542 540 2 923 850 33 AA
#> 4 2013 1 1 544 545 -1 1004 1022 -18 B6
#> 5 2013 1 1 554 600 -6 812 837 -25 DL
#> 6 2013 1 1 554 558 -4 740 728 12 UA
#> 7 2013 1 1 555 600 -5 913 854 19 B6
#> 8 2013 1 1 557 600 -3 709 723 -14 EV
#> 9 2013 1 1 557 600 -3 838 846 -8 B6
#> 10 2013 1 1 558 600 -2 753 745 8 AA
#> # … with 336,766 more rows, 12 more variables: flight <int>, tailnum <chr>,
#> # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
#> # minute <dbl>, time_hour <dttm>, gain <dbl>, hours <dbl>,
#> # gain_per_hour <dbl>, and abbreviated variable names ¹sched_dep_time,
#> # ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay
### transmute : mutate와 다르게 반환된 값만 출력
%>% transmute(
flights gain = dep_delay - arr_delay,
hours = air_time / 60,
gain_per_hour = gain / hours
)#> # A tibble: 336,776 × 3
#> gain hours gain_per_hour
#> <dbl> <dbl> <dbl>
#> 1 -9 3.78 -2.38
#> 2 -16 3.78 -4.23
#> 3 -31 2.67 -11.6
#> 4 17 3.05 5.57
#> 5 19 1.93 9.83
#> 6 -16 2.5 -6.4
#> 7 -24 2.63 -9.11
#> 8 11 0.883 12.5
#> 9 5 2.33 2.14
#> 10 -10 2.3 -4.35
#> # … with 336,766 more rows
유용한 기능
### 몫
5 %/% 3
#> [1] 1
### 나머지
5 %% 3
#> [1] 2
### 그룹화 + 집계값 요약
%>%
flights group_by(year, month, day) %>%
summarise(mean = mean(dep_delay))
#> `summarise()` has grouped output by 'year', 'month'. You can override using the
#> `.groups` argument.
#> # A tibble: 365 × 4
#> # Groups: year, month [12]
#> year month day mean
#> <int> <int> <int> <dbl>
#> 1 2013 1 1 NA
#> 2 2013 1 2 NA
#> 3 2013 1 3 NA
#> 4 2013 1 4 NA
#> 5 2013 1 5 NA
#> 6 2013 1 6 NA
#> 7 2013 1 7 NA
#> 8 2013 1 8 NA
#> 9 2013 1 9 NA
#> 10 2013 1 10 NA
#> # … with 355 more rows
### 그룹화 + 집계값 요약 + NA 제거
%>%
flights group_by(year, month, day) %>%
summarise(mean = mean(dep_delay, na.rm = T))
#> `summarise()` has grouped output by 'year', 'month'. You can override using the
#> `.groups` argument.
#> # A tibble: 365 × 4
#> # Groups: year, month [12]
#> year month day mean
#> <int> <int> <int> <dbl>
#> 1 2013 1 1 11.5
#> 2 2013 1 2 13.9
#> 3 2013 1 3 11.0
#> 4 2013 1 4 8.95
#> 5 2013 1 5 5.73
#> 6 2013 1 6 7.15
#> 7 2013 1 7 5.42
#> 8 2013 1 8 2.55
#> 9 2013 1 9 2.28
#> 10 2013 1 10 2.84
#> # … with 355 more rows
### !is.na : 결측값 아닌 값들만 출력
%>%
flights filter(!is.na(dep_delay), !is.na(arr_delay))
#> # A tibble: 327,346 × 19
#> year month day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
#> <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr>
#> 1 2013 1 1 517 515 2 830 819 11 UA
#> 2 2013 1 1 533 529 4 850 830 20 UA
#> 3 2013 1 1 542 540 2 923 850 33 AA
#> 4 2013 1 1 544 545 -1 1004 1022 -18 B6
#> 5 2013 1 1 554 600 -6 812 837 -25 DL
#> 6 2013 1 1 554 558 -4 740 728 12 UA
#> 7 2013 1 1 555 600 -5 913 854 19 B6
#> 8 2013 1 1 557 600 -3 709 723 -14 EV
#> 9 2013 1 1 557 600 -3 838 846 -8 B6
#> 10 2013 1 1 558 600 -2 753 745 8 AA
#> # … with 327,336 more rows, 9 more variables: flight <int>, tailnum <chr>,
#> # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
#> # minute <dbl>, time_hour <dttm>, and abbreviated variable names
#> # ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay
### 그룹화 해제 후 집계값 요약
<- flights %>%
group group_by(year, month, day)
%>%
group ungroup() %>%
summarise(flights = n())
#> # A tibble: 1 × 1
#> flights
#> <int>
#> 1 336776
### drop_na
%>% drop_na()
tibble_df #> # A tibble: 2 × 1
#> x
#> <dbl>
#> 1 1
#> 2 3
### mutate 컬럼 위치 지정
%>%
flights mutate(mean_arr_time = mean(arr_time, na.rm = T), .after = arr_time)
#> # A tibble: 336,776 × 20
#> year month day dep_time sched_de…¹ dep_d…² arr_t…³ mean_…⁴ sched…⁵ arr_d…⁶
#> <int> <int> <int> <int> <int> <dbl> <int> <dbl> <int> <dbl>
#> 1 2013 1 1 517 515 2 830 1502. 819 11
#> 2 2013 1 1 533 529 4 850 1502. 830 20
#> 3 2013 1 1 542 540 2 923 1502. 850 33
#> 4 2013 1 1 544 545 -1 1004 1502. 1022 -18
#> 5 2013 1 1 554 600 -6 812 1502. 837 -25
#> 6 2013 1 1 554 558 -4 740 1502. 728 12
#> 7 2013 1 1 555 600 -5 913 1502. 854 19
#> 8 2013 1 1 557 600 -3 709 1502. 723 -14
#> 9 2013 1 1 557 600 -3 838 1502. 846 -8
#> 10 2013 1 1 558 600 -2 753 1502. 745 8
#> # … with 336,766 more rows, 10 more variables: carrier <chr>, flight <int>,
#> # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#> # hour <dbl>, minute <dbl>, time_hour <dttm>, and abbreviated variable names
#> # ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴mean_arr_time, ⁵sched_arr_time,
#> # ⁶arr_delay
rank, slice
- 행의 범위 지정
### 년/월/일자별 top10 'arr_delay' 출력
### rank 사용
%>%
flights group_by(year,month,day) %>%
filter(rank(desc(arr_delay)) < 11) %>%
select(year,month,day, arr_delay)
#> # A tibble: 3,691 × 4
#> # Groups: year, month, day [365]
#> year month day arr_delay
#> <int> <int> <int> <dbl>
#> 1 2013 1 1 851
#> 2 2013 1 1 338
#> 3 2013 1 1 263
#> 4 2013 1 1 166
#> 5 2013 1 1 174
#> 6 2013 1 1 222
#> 7 2013 1 1 250
#> 8 2013 1 1 246
#> 9 2013 1 1 191
#> 10 2013 1 1 456
#> # … with 3,681 more rows
### slice 사용
%>%
flights group_by(year,month,day) %>%
slice_max(arr_delay,n=10) %>%
select(year,month,day, arr_delay)
#> # A tibble: 3,697 × 4
#> # Groups: year, month, day [365]
#> year month day arr_delay
#> <int> <int> <int> <dbl>
#> 1 2013 1 1 851
#> 2 2013 1 1 456
#> 3 2013 1 1 338
#> 4 2013 1 1 263
#> 5 2013 1 1 250
#> 6 2013 1 1 246
#> 7 2013 1 1 222
#> 8 2013 1 1 191
#> 9 2013 1 1 174
#> 10 2013 1 1 166
#> # … with 3,687 more rows
### 순위 책정
%>%
flights select(year,month,day,arr_delay) %>%
arrange(year,month,day,desc(arr_delay)) %>%
group_by(year,month,day) %>%
mutate(rank = rank(desc(arr_delay)))
#> # A tibble: 336,776 × 5
#> # Groups: year, month, day [365]
#> year month day arr_delay rank
#> <int> <int> <int> <dbl> <dbl>
#> 1 2013 1 1 851 1
#> 2 2013 1 1 456 2
#> 3 2013 1 1 338 3
#> 4 2013 1 1 263 4
#> 5 2013 1 1 250 5
#> 6 2013 1 1 246 6
#> 7 2013 1 1 222 7
#> 8 2013 1 1 191 8
#> 9 2013 1 1 174 9
#> 10 2013 1 1 166 10
#> # … with 336,766 more rows
Quiz 1
- dep_time의 결측지를 dep_time의 평균 값으로 교체하시오
### NA 제외한 평균값
<- mean(flights$dep_time,na.rm = T) %>% as.integer()
mean_dep_time
mean_dep_time#> [1] 1349
### repalce_na
$dep_time <- flights$dep_time %>%
flightsreplace_na(mean_dep_time)
$dep_time %>% is.na() %>% sum()
flights#> [1] 0
### mutate + ifelse
%>%
flights mutate(dep_time = ifelse(is.na(dep_time),
mean_dep_time,dep_time))#> # A tibble: 336,776 × 19
#> year month day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
#> <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr>
#> 1 2013 1 1 517 515 2 830 819 11 UA
#> 2 2013 1 1 533 529 4 850 830 20 UA
#> 3 2013 1 1 542 540 2 923 850 33 AA
#> 4 2013 1 1 544 545 -1 1004 1022 -18 B6
#> 5 2013 1 1 554 600 -6 812 837 -25 DL
#> 6 2013 1 1 554 558 -4 740 728 12 UA
#> 7 2013 1 1 555 600 -5 913 854 19 B6
#> 8 2013 1 1 557 600 -3 709 723 -14 EV
#> 9 2013 1 1 557 600 -3 838 846 -8 B6
#> 10 2013 1 1 558 600 -2 753 745 8 AA
#> # … with 336,766 more rows, 9 more variables: flight <int>, tailnum <chr>,
#> # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
#> # minute <dbl>, time_hour <dttm>, and abbreviated variable names
#> # ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay
Quiz 2
- 월별 비행기 개수를 구하시오
- dest(도착공항)별로 비행기가 10000대 이상 착륙한 공항으로 도착한 비행정보만 추출
### 1
%>%
flights group_by(month) %>%
summarise(fly_count = n())
#> # A tibble: 12 × 2
#> month fly_count
#> <int> <int>
#> 1 1 27004
#> 2 2 24951
#> 3 3 28834
#> 4 4 28330
#> 5 5 28796
#> 6 6 28243
#> 7 7 29425
#> 8 8 29327
#> 9 9 27574
#> 10 10 28889
#> 11 11 27268
#> 12 12 28135
### 2
<- flights %>%
air group_by(dest) %>%
filter(n() >= 10000)
%>% nrow()
air #> [1] 131440
table(air$dest)
#>
#> ATL BOS CLT FLL LAX MCO MIA ORD SFO
#> 17215 15508 14064 12055 16174 14082 11728 17283 13331
Quiz 3
다음은 미국 NBA 농구리그의 농구선수별 게임당 경기통계(stats)이다.
- Pos: 농구에서 선수의 포지션
- Age: 나이
- Tm: 팀이름
- 3P: 3점슛 성공횟수
- 3PA: 3점슛 시도횟수
- 3P%: 3점 성공률
- PTS: 평균득점
일 때 다음 물음에 답하시오.
<- read_csv("C:/Users/seong taek/Desktop/3-1 Opendata_Analysis/opendata/nba2021_per_game.csv")
stats #> Rows: 497 Columns: 29
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: ","
#> chr (3): Player, Pos, Tm
#> dbl (26): Age, G, GS, MP, FG, FGA, FG%, 3P, 3PA, 3P%, 2P, 2PA, 2P%, eFG%, FT...
#>
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
stats#> # A tibble: 497 × 29
#> Player Pos Age Tm G GS MP FG FGA `FG%` `3P` `3PA`
#> <chr> <chr> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 Precious A… PF 21 MIA 28 2 14.6 2.6 4.4 0.59 0 0
#> 2 Jaylen Ada… PG 24 MIL 6 0 2.8 0.2 1.3 0.125 0 0.3
#> 3 Steven Ada… C 27 NOP 27 27 28.1 3.5 5.8 0.603 0 0
#> 4 Bam Adebayo C 23 MIA 26 26 33.6 7.4 12.9 0.573 0.1 0.2
#> 5 LaMarcus A… C 35 SAS 18 18 26.7 5.9 12.5 0.476 1.3 3.7
#> 6 Ty-Shon Al… SG 22 PHO 3 0 2.7 0 1 0 0 0.3
#> 7 Nickeil Al… SG 22 NOP 23 3 19.2 3.3 8.2 0.41 1 3.8
#> 8 Grayson Al… SG 25 MEM 19 8 23.9 3.2 7.4 0.429 2.3 5.3
#> 9 Jarrett Al… C 22 TOT 28 10 26.2 4.4 6.8 0.642 0 0.1
#> 10 Jarrett Al… C 22 BRK 12 5 26.7 3.7 5.4 0.677 0 0
#> # … with 487 more rows, and 17 more variables: `3P%` <dbl>, `2P` <dbl>,
#> # `2PA` <dbl>, `2P%` <dbl>, `eFG%` <dbl>, FT <dbl>, FTA <dbl>, `FT%` <dbl>,
#> # ORB <dbl>, DRB <dbl>, TRB <dbl>, AST <dbl>, STL <dbl>, BLK <dbl>,
#> # TOV <dbl>, PF <dbl>, PTS <dbl>
1. 위의 dataframe을 가지고 group 별 통계값을 계산하시오.
- NBA 농구팀별(Tm)로 가장 평균득점(PTS)이 높은 사람과 낮은 사람을 추출하는 코드를 작성하시오.
- 또한, 팀별로 평균득점 최대값과 최소값의 차이를
gap
이라는 컬럼을 새로 만들어 나타내시오.
%>%
stats group_by(Tm) %>%
summarise(max_score = max(PTS),
min_score = min(PTS),
gap = max_score - min_score,
max_player = Player[which.max(PTS)],
min_player = Player[which.min(PTS)])
#> # A tibble: 31 × 6
#> Tm max_score min_score gap max_player min_player
#> <chr> <dbl> <dbl> <dbl> <chr> <chr>
#> 1 ATL 26.5 1.8 24.7 Trae Young Bruno Fernando
#> 2 BOS 25.9 2.4 23.5 Jaylen Brown Tremont Waters
#> 3 BRK 29 0 29 Kevin Durant Noah Vonleh
#> 4 CHI 28.5 1.5 27 Zach LaVine Luke Kornet
#> 5 CHO 22.3 1 21.3 Gordon Hayward Vernon Carey Jr.
#> 6 CLE 22.8 1.2 21.6 Collin Sexton Marques Bolden
#> 7 DAL 29.1 1 28.1 Luka Dončić Tyrell Terry
#> 8 DEN 27.4 0.6 26.8 Nikola Jokić Vlatko Čančar
#> 9 DET 23.8 0 23.8 Jerami Grant Deividas Sirvydis
#> 10 GSW 30 1.5 28.5 Stephen Curry Nico Mannion
#> # … with 21 more rows
2. 각 포지션별로(Pos) 평균나이 대비 해당 선수의 나이가 몇배 높거나 낮은지 비율을 계산하시오
- 예를들어 PG 포지션의 평균나이는 27세이고, Chris Paul의 나이는 35세이므로, 비율은 35/27이 된다.
- 모든 선수에 대해 이 비율을
age_ratio_by_position
이라는 새로운 컬럼으로 저장하는 코드를 작성하시오. - 단 , 포지션 별로 비율이 가장 높은 한명만 추출
%>%
stats group_by(Pos) %>%
mutate(mean_age = mean(Age),
age_ratio_by_position = Age/mean_age) %>%
select(Player, Pos, Age, mean_age, age_ratio_by_position) %>%
arrange(-age_ratio_by_position) %>%
slice_max(age_ratio_by_position, n=1)
#> # A tibble: 9 × 5
#> # Groups: Pos [9]
#> Player Pos Age mean_age age_ratio_by_position
#> <chr> <chr> <dbl> <dbl> <dbl>
#> 1 Marc Gasol C 36 26.1 1.38
#> 2 Noah Vonleh F 25 24 1.04
#> 3 Norvel Pelle F-C 27 27 1
#> 4 Derrick Rose G 32 27 1.19
#> 5 Carmelo Anthony PF 36 25.9 1.39
#> 6 LeBron James PG 36 25.6 1.41
#> 7 Andre Iguodala SF 37 25.4 1.46
#> 8 Rodions Kurucs SF-PF 22 22 1
#> 9 J.J. Redick SG 36 25.3 1.43