library(tidyverse)
library(nycflights13)
![]() |
Data Transformaion
1 오픈데이터 분석 실습 : Data Transformation
1.1 패키지 불러오기
1.2 간단한 데이터셋 파악
- nycflights13
### 도움말
#?flights
### 요약
%>% str() flights
tibble [336,776 × 19] (S3: tbl_df/tbl/data.frame)
$ year : int [1:336776] 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
$ month : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
$ day : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
$ dep_time : int [1:336776] 517 533 542 544 554 554 555 557 557 558 ...
$ sched_dep_time: int [1:336776] 515 529 540 545 600 558 600 600 600 600 ...
$ dep_delay : num [1:336776] 2 4 2 -1 -6 -4 -5 -3 -3 -2 ...
$ arr_time : int [1:336776] 830 850 923 1004 812 740 913 709 838 753 ...
$ sched_arr_time: int [1:336776] 819 830 850 1022 837 728 854 723 846 745 ...
$ arr_delay : num [1:336776] 11 20 33 -18 -25 12 19 -14 -8 8 ...
$ carrier : chr [1:336776] "UA" "UA" "AA" "B6" ...
$ flight : int [1:336776] 1545 1714 1141 725 461 1696 507 5708 79 301 ...
$ tailnum : chr [1:336776] "N14228" "N24211" "N619AA" "N804JB" ...
$ origin : chr [1:336776] "EWR" "LGA" "JFK" "JFK" ...
$ dest : chr [1:336776] "IAH" "IAH" "MIA" "BQN" ...
$ air_time : num [1:336776] 227 227 160 183 116 150 158 53 140 138 ...
$ distance : num [1:336776] 1400 1416 1089 1576 762 ...
$ hour : num [1:336776] 5 5 5 5 6 5 6 6 6 6 ...
$ minute : num [1:336776] 15 29 40 45 0 58 0 0 0 0 ...
$ time_hour : POSIXct[1:336776], format: "2013-01-01 05:00:00" "2013-01-01 05:00:00" ...
### 컬럼 이름
%>% colnames() flights
[1] "year" "month" "day" "dep_time"
[5] "sched_dep_time" "dep_delay" "arr_time" "sched_arr_time"
[9] "arr_delay" "carrier" "flight" "tailnum"
[13] "origin" "dest" "air_time" "distance"
[17] "hour" "minute" "time_hour"
2 dplyr 기초
2.1 filter
- 조건 필터링
### table - 빈도수 파악
$month %>% table() flights
.
1 2 3 4 5 6 7 8 9 10 11 12
27004 24951 28834 28330 28796 28243 29425 29327 27574 28889 27268 28135
### 월별 필터링
%>%
flights filter(month==12 | month==11)
# A tibble: 55,403 × 19
year month day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
<int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr>
1 2013 11 1 5 2359 6 352 345 7 B6
2 2013 11 1 35 2250 105 123 2356 87 B6
3 2013 11 1 455 500 -5 641 651 -10 US
4 2013 11 1 539 545 -6 856 827 29 UA
5 2013 11 1 542 545 -3 831 855 -24 AA
6 2013 11 1 549 600 -11 912 923 -11 UA
7 2013 11 1 550 600 -10 705 659 6 US
8 2013 11 1 554 600 -6 659 701 -2 US
9 2013 11 1 554 600 -6 826 827 -1 DL
10 2013 11 1 554 600 -6 749 751 -2 DL
# … with 55,393 more rows, 9 more variables: flight <int>, tailnum <chr>,
# origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
# minute <dbl>, time_hour <dttm>, and abbreviated variable names
# ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay
### 월별 필터링 - %in% 사용
%>%
flights filter(month %in% c(11,12))
# A tibble: 55,403 × 19
year month day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
<int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr>
1 2013 11 1 5 2359 6 352 345 7 B6
2 2013 11 1 35 2250 105 123 2356 87 B6
3 2013 11 1 455 500 -5 641 651 -10 US
4 2013 11 1 539 545 -6 856 827 29 UA
5 2013 11 1 542 545 -3 831 855 -24 AA
6 2013 11 1 549 600 -11 912 923 -11 UA
7 2013 11 1 550 600 -10 705 659 6 US
8 2013 11 1 554 600 -6 659 701 -2 US
9 2013 11 1 554 600 -6 826 827 -1 DL
10 2013 11 1 554 600 -6 749 751 -2 DL
# … with 55,393 more rows, 9 more variables: flight <int>, tailnum <chr>,
# origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
# minute <dbl>, time_hour <dttm>, and abbreviated variable names
# ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay
2.2 missing values
- 결측값 NA
### 결측값은 연산 불가능
NA
[1] NA
NA > 5
[1] NA
10 == NA
[1] NA
NA + 19
[1] NA
NA / 2
[1] NA
### 결측값 개수
$dep_time %>%
flightsis.na() %>%
sum()
[1] 8255
### 결측값 있는 df 생성
<- data.frame(x = c(1, NA, 3))
df df
x
1 1
2 NA
3 3
### 타입 확인
%>% class() df
[1] "data.frame"
### tibble : tidyverse의 df 클래스
<- tibble(x = c(1, NA, 3))
tibble_df tibble_df
# A tibble: 3 × 1
x
<dbl>
1 1
2 NA
3 3
%>% filter(x > 1) tibble_df
# A tibble: 1 × 1
x
<dbl>
1 3
%>% filter(is.na(x) | x>1) tibble_df
# A tibble: 2 × 1
x
<dbl>
1 NA
2 3
2.3 arrange
- 정렬
### 오름차순 정렬
%>% arrange(dep_time) flights
# A tibble: 336,776 × 19
year month day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
<int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr>
1 2013 1 13 1 2249 72 108 2357 71 B6
2 2013 1 31 1 2100 181 124 2225 179 WN
3 2013 11 13 1 2359 2 442 440 2 B6
4 2013 12 16 1 2359 2 447 437 10 B6
5 2013 12 20 1 2359 2 430 440 -10 B6
6 2013 12 26 1 2359 2 437 440 -3 B6
7 2013 12 30 1 2359 2 441 437 4 B6
8 2013 2 11 1 2100 181 111 2225 166 WN
9 2013 2 24 1 2245 76 121 2354 87 B6
10 2013 3 8 1 2355 6 431 440 -9 B6
# … with 336,766 more rows, 9 more variables: flight <int>, tailnum <chr>,
# origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
# minute <dbl>, time_hour <dttm>, and abbreviated variable names
# ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay
### 결측값은 가장 하단 배치
<- tibble(x = c(5,2,NA))
df arrange(df, x)
# A tibble: 3 × 1
x
<dbl>
1 2
2 5
3 NA
arrange(df, -x)
# A tibble: 3 × 1
x
<dbl>
1 5
2 2
3 NA
2.4 select
- 컬럼 선택
### 컬럼 선택 (순서 지정 가능)
%>% select(year, month, day) flights
# A tibble: 336,776 × 3
year month day
<int> <int> <int>
1 2013 1 1
2 2013 1 1
3 2013 1 1
4 2013 1 1
5 2013 1 1
6 2013 1 1
7 2013 1 1
8 2013 1 1
9 2013 1 1
10 2013 1 1
# … with 336,766 more rows
%>% select(year:day) flights
# A tibble: 336,776 × 3
year month day
<int> <int> <int>
1 2013 1 1
2 2013 1 1
3 2013 1 1
4 2013 1 1
5 2013 1 1
6 2013 1 1
7 2013 1 1
8 2013 1 1
9 2013 1 1
10 2013 1 1
# … with 336,766 more rows
### 컬럼 제외
%>% select(-(year:day)) flights
# A tibble: 336,776 × 16
dep_t…¹ sched…² dep_d…³ arr_t…⁴ sched…⁵ arr_d…⁶ carrier flight tailnum origin
<int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr> <chr>
1 517 515 2 830 819 11 UA 1545 N14228 EWR
2 533 529 4 850 830 20 UA 1714 N24211 LGA
3 542 540 2 923 850 33 AA 1141 N619AA JFK
4 544 545 -1 1004 1022 -18 B6 725 N804JB JFK
5 554 600 -6 812 837 -25 DL 461 N668DN LGA
6 554 558 -4 740 728 12 UA 1696 N39463 EWR
7 555 600 -5 913 854 19 B6 507 N516JB EWR
8 557 600 -3 709 723 -14 EV 5708 N829AS LGA
9 557 600 -3 838 846 -8 B6 79 N593JB JFK
10 558 600 -2 753 745 8 AA 301 N3ALAA LGA
# … with 336,766 more rows, 6 more variables: dest <chr>, air_time <dbl>,
# distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>, and abbreviated
# variable names ¹dep_time, ²sched_dep_time, ³dep_delay, ⁴arr_time,
# ⁵sched_arr_time, ⁶arr_delay
### 시작 문자열 지정
%>% select(starts_with('dep')) flights
# A tibble: 336,776 × 2
dep_time dep_delay
<int> <dbl>
1 517 2
2 533 4
3 542 2
4 544 -1
5 554 -6
6 554 -4
7 555 -5
8 557 -3
9 557 -3
10 558 -2
# … with 336,766 more rows
### 끝 문자열 지정
%>% select(ends_with("time")) flights
# A tibble: 336,776 × 5
dep_time sched_dep_time arr_time sched_arr_time air_time
<int> <int> <int> <int> <dbl>
1 517 515 830 819 227
2 533 529 850 830 227
3 542 540 923 850 160
4 544 545 1004 1022 183
5 554 600 812 837 116
6 554 558 740 728 150
7 555 600 913 854 158
8 557 600 709 723 53
9 557 600 838 846 140
10 558 600 753 745 138
# … with 336,766 more rows
### 포함 문자열 지정
%>% select(contains("time")) flights
# A tibble: 336,776 × 6
dep_time sched_dep_time arr_time sched_arr_time air_time time_hour
<int> <int> <int> <int> <dbl> <dttm>
1 517 515 830 819 227 2013-01-01 05:00:00
2 533 529 850 830 227 2013-01-01 05:00:00
3 542 540 923 850 160 2013-01-01 05:00:00
4 544 545 1004 1022 183 2013-01-01 05:00:00
5 554 600 812 837 116 2013-01-01 06:00:00
6 554 558 740 728 150 2013-01-01 05:00:00
7 555 600 913 854 158 2013-01-01 06:00:00
8 557 600 709 723 53 2013-01-01 06:00:00
9 557 600 838 846 140 2013-01-01 06:00:00
10 558 600 753 745 138 2013-01-01 06:00:00
# … with 336,766 more rows
### 컬럼 이름 변경 new = old
%>% rename(tail_num = tailnum) flights
# A tibble: 336,776 × 19
year month day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
<int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr>
1 2013 1 1 517 515 2 830 819 11 UA
2 2013 1 1 533 529 4 850 830 20 UA
3 2013 1 1 542 540 2 923 850 33 AA
4 2013 1 1 544 545 -1 1004 1022 -18 B6
5 2013 1 1 554 600 -6 812 837 -25 DL
6 2013 1 1 554 558 -4 740 728 12 UA
7 2013 1 1 555 600 -5 913 854 19 B6
8 2013 1 1 557 600 -3 709 723 -14 EV
9 2013 1 1 557 600 -3 838 846 -8 B6
10 2013 1 1 558 600 -2 753 745 8 AA
# … with 336,766 more rows, 9 more variables: flight <int>, tail_num <chr>,
# origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
# minute <dbl>, time_hour <dttm>, and abbreviated variable names
# ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay
### 순서 배치 + 나머지
%>% select(time_hour, air_time, everything()) flights
# A tibble: 336,776 × 19
time_hour air_t…¹ year month day dep_t…² sched…³ dep_d…⁴ arr_t…⁵
<dttm> <dbl> <int> <int> <int> <int> <int> <dbl> <int>
1 2013-01-01 05:00:00 227 2013 1 1 517 515 2 830
2 2013-01-01 05:00:00 227 2013 1 1 533 529 4 850
3 2013-01-01 05:00:00 160 2013 1 1 542 540 2 923
4 2013-01-01 05:00:00 183 2013 1 1 544 545 -1 1004
5 2013-01-01 06:00:00 116 2013 1 1 554 600 -6 812
6 2013-01-01 05:00:00 150 2013 1 1 554 558 -4 740
7 2013-01-01 06:00:00 158 2013 1 1 555 600 -5 913
8 2013-01-01 06:00:00 53 2013 1 1 557 600 -3 709
9 2013-01-01 06:00:00 140 2013 1 1 557 600 -3 838
10 2013-01-01 06:00:00 138 2013 1 1 558 600 -2 753
# … with 336,766 more rows, 10 more variables: sched_arr_time <int>,
# arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>, origin <chr>,
# dest <chr>, distance <dbl>, hour <dbl>, minute <dbl>, and abbreviated
# variable names ¹air_time, ²dep_time, ³sched_dep_time, ⁴dep_delay, ⁵arr_time
2.5 mutate
- 새로운 컬럼 생성
%>% mutate(
flights gain = dep_delay - arr_delay,
hours = air_time / 60,
gain_per_hour = gain / hours
)
# A tibble: 336,776 × 22
year month day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
<int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr>
1 2013 1 1 517 515 2 830 819 11 UA
2 2013 1 1 533 529 4 850 830 20 UA
3 2013 1 1 542 540 2 923 850 33 AA
4 2013 1 1 544 545 -1 1004 1022 -18 B6
5 2013 1 1 554 600 -6 812 837 -25 DL
6 2013 1 1 554 558 -4 740 728 12 UA
7 2013 1 1 555 600 -5 913 854 19 B6
8 2013 1 1 557 600 -3 709 723 -14 EV
9 2013 1 1 557 600 -3 838 846 -8 B6
10 2013 1 1 558 600 -2 753 745 8 AA
# … with 336,766 more rows, 12 more variables: flight <int>, tailnum <chr>,
# origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
# minute <dbl>, time_hour <dttm>, gain <dbl>, hours <dbl>,
# gain_per_hour <dbl>, and abbreviated variable names ¹sched_dep_time,
# ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay
### transmute : mutate와 다르게 반환된 값만 출력
%>% transmute(
flights gain = dep_delay - arr_delay,
hours = air_time / 60,
gain_per_hour = gain / hours
)
# A tibble: 336,776 × 3
gain hours gain_per_hour
<dbl> <dbl> <dbl>
1 -9 3.78 -2.38
2 -16 3.78 -4.23
3 -31 2.67 -11.6
4 17 3.05 5.57
5 19 1.93 9.83
6 -16 2.5 -6.4
7 -24 2.63 -9.11
8 11 0.883 12.5
9 5 2.33 2.14
10 -10 2.3 -4.35
# … with 336,766 more rows
2.6 유용한 기능
### 몫
5 %/% 3
[1] 1
### 나머지
5 %% 3
[1] 2
### 그룹화 + 집계값 요약
%>%
flights group_by(year, month, day) %>%
summarise(mean = mean(dep_delay))
`summarise()` has grouped output by 'year', 'month'. You can override using the
`.groups` argument.
# A tibble: 365 × 4
# Groups: year, month [12]
year month day mean
<int> <int> <int> <dbl>
1 2013 1 1 NA
2 2013 1 2 NA
3 2013 1 3 NA
4 2013 1 4 NA
5 2013 1 5 NA
6 2013 1 6 NA
7 2013 1 7 NA
8 2013 1 8 NA
9 2013 1 9 NA
10 2013 1 10 NA
# … with 355 more rows
### 그룹화 + 집계값 요약 + NA 제거
%>%
flights group_by(year, month, day) %>%
summarise(mean = mean(dep_delay, na.rm = T))
`summarise()` has grouped output by 'year', 'month'. You can override using the
`.groups` argument.
# A tibble: 365 × 4
# Groups: year, month [12]
year month day mean
<int> <int> <int> <dbl>
1 2013 1 1 11.5
2 2013 1 2 13.9
3 2013 1 3 11.0
4 2013 1 4 8.95
5 2013 1 5 5.73
6 2013 1 6 7.15
7 2013 1 7 5.42
8 2013 1 8 2.55
9 2013 1 9 2.28
10 2013 1 10 2.84
# … with 355 more rows
### !is.na : 결측값 아닌 값들만 출력
%>%
flights filter(!is.na(dep_delay), !is.na(arr_delay))
# A tibble: 327,346 × 19
year month day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
<int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr>
1 2013 1 1 517 515 2 830 819 11 UA
2 2013 1 1 533 529 4 850 830 20 UA
3 2013 1 1 542 540 2 923 850 33 AA
4 2013 1 1 544 545 -1 1004 1022 -18 B6
5 2013 1 1 554 600 -6 812 837 -25 DL
6 2013 1 1 554 558 -4 740 728 12 UA
7 2013 1 1 555 600 -5 913 854 19 B6
8 2013 1 1 557 600 -3 709 723 -14 EV
9 2013 1 1 557 600 -3 838 846 -8 B6
10 2013 1 1 558 600 -2 753 745 8 AA
# … with 327,336 more rows, 9 more variables: flight <int>, tailnum <chr>,
# origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
# minute <dbl>, time_hour <dttm>, and abbreviated variable names
# ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay
### 그룹화 해제 후 집계값 요약
<- flights %>%
group group_by(year, month, day)
%>%
group ungroup() %>%
summarise(flights = n())
# A tibble: 1 × 1
flights
<int>
1 336776
### drop_na
%>% drop_na() tibble_df
# A tibble: 2 × 1
x
<dbl>
1 1
2 3
### mutate 컬럼 위치 지정
%>%
flights mutate(mean_arr_time = mean(arr_time, na.rm = T), .after = arr_time)
# A tibble: 336,776 × 20
year month day dep_time sched_de…¹ dep_d…² arr_t…³ mean_…⁴ sched…⁵ arr_d…⁶
<int> <int> <int> <int> <int> <dbl> <int> <dbl> <int> <dbl>
1 2013 1 1 517 515 2 830 1502. 819 11
2 2013 1 1 533 529 4 850 1502. 830 20
3 2013 1 1 542 540 2 923 1502. 850 33
4 2013 1 1 544 545 -1 1004 1502. 1022 -18
5 2013 1 1 554 600 -6 812 1502. 837 -25
6 2013 1 1 554 558 -4 740 1502. 728 12
7 2013 1 1 555 600 -5 913 1502. 854 19
8 2013 1 1 557 600 -3 709 1502. 723 -14
9 2013 1 1 557 600 -3 838 1502. 846 -8
10 2013 1 1 558 600 -2 753 1502. 745 8
# … with 336,766 more rows, 10 more variables: carrier <chr>, flight <int>,
# tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
# hour <dbl>, minute <dbl>, time_hour <dttm>, and abbreviated variable names
# ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴mean_arr_time, ⁵sched_arr_time,
# ⁶arr_delay
2.7 rank, slice
- 행의 범위 지정
### 년/월/일자별 top10 'arr_delay' 출력
### rank 사용
%>%
flights group_by(year,month,day) %>%
filter(rank(desc(arr_delay)) < 11) %>%
select(year,month,day, arr_delay)
# A tibble: 3,691 × 4
# Groups: year, month, day [365]
year month day arr_delay
<int> <int> <int> <dbl>
1 2013 1 1 851
2 2013 1 1 338
3 2013 1 1 263
4 2013 1 1 166
5 2013 1 1 174
6 2013 1 1 222
7 2013 1 1 250
8 2013 1 1 246
9 2013 1 1 191
10 2013 1 1 456
# … with 3,681 more rows
### slice 사용
%>%
flights group_by(year,month,day) %>%
slice_max(arr_delay,n=10) %>%
select(year,month,day, arr_delay)
# A tibble: 3,697 × 4
# Groups: year, month, day [365]
year month day arr_delay
<int> <int> <int> <dbl>
1 2013 1 1 851
2 2013 1 1 456
3 2013 1 1 338
4 2013 1 1 263
5 2013 1 1 250
6 2013 1 1 246
7 2013 1 1 222
8 2013 1 1 191
9 2013 1 1 174
10 2013 1 1 166
# … with 3,687 more rows
### 순위 책정
%>%
flights select(year,month,day,arr_delay) %>%
arrange(year,month,day,desc(arr_delay)) %>%
group_by(year,month,day) %>%
mutate(rank = rank(desc(arr_delay)))
# A tibble: 336,776 × 5
# Groups: year, month, day [365]
year month day arr_delay rank
<int> <int> <int> <dbl> <dbl>
1 2013 1 1 851 1
2 2013 1 1 456 2
3 2013 1 1 338 3
4 2013 1 1 263 4
5 2013 1 1 250 5
6 2013 1 1 246 6
7 2013 1 1 222 7
8 2013 1 1 191 8
9 2013 1 1 174 9
10 2013 1 1 166 10
# … with 336,766 more rows
3 Quiz 1
- dep_time의 결측지를 dep_time의 평균 값으로 교체하시오
### NA 제외한 평균값
<- mean(flights$dep_time,na.rm = T) %>% as.integer()
mean_dep_time mean_dep_time
[1] 1349
### repalce_na
$dep_time <- flights$dep_time %>%
flightsreplace_na(mean_dep_time)
$dep_time %>% is.na() %>% sum() flights
[1] 0
### mutate + ifelse
%>%
flights mutate(dep_time = ifelse(is.na(dep_time),
mean_dep_time,dep_time))
# A tibble: 336,776 × 19
year month day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
<int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr>
1 2013 1 1 517 515 2 830 819 11 UA
2 2013 1 1 533 529 4 850 830 20 UA
3 2013 1 1 542 540 2 923 850 33 AA
4 2013 1 1 544 545 -1 1004 1022 -18 B6
5 2013 1 1 554 600 -6 812 837 -25 DL
6 2013 1 1 554 558 -4 740 728 12 UA
7 2013 1 1 555 600 -5 913 854 19 B6
8 2013 1 1 557 600 -3 709 723 -14 EV
9 2013 1 1 557 600 -3 838 846 -8 B6
10 2013 1 1 558 600 -2 753 745 8 AA
# … with 336,766 more rows, 9 more variables: flight <int>, tailnum <chr>,
# origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
# minute <dbl>, time_hour <dttm>, and abbreviated variable names
# ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay
4 Quiz 2
- 월별 비행기 개수를 구하시오
- dest(도착공항)별로 비행기가 10000대 이상 착륙한 공항으로 도착한 비행정보만 추출
### 1
%>%
flights group_by(month) %>%
summarise(fly_count = n())
# A tibble: 12 × 2
month fly_count
<int> <int>
1 1 27004
2 2 24951
3 3 28834
4 4 28330
5 5 28796
6 6 28243
7 7 29425
8 8 29327
9 9 27574
10 10 28889
11 11 27268
12 12 28135
### 2
<- flights %>%
air group_by(dest) %>%
filter(n() >= 10000)
%>% nrow() air
[1] 131440
table(air$dest)
ATL BOS CLT FLL LAX MCO MIA ORD SFO
17215 15508 14064 12055 16174 14082 11728 17283 13331
5 Quiz 3
다음은 미국 NBA 농구리그의 농구선수별 게임당 경기통계(stats)이다.
- Pos: 농구에서 선수의 포지션
- Age: 나이
- Tm: 팀이름
- 3P: 3점슛 성공횟수
- 3PA: 3점슛 시도횟수
- 3P%: 3점 성공률
- PTS: 평균득점
일 때 다음 물음에 답하시오.
<- read_csv("C:/Users/seong taek/Desktop/3-1 Opendata_Analysis/opendata/nba2021_per_game.csv") stats
Rows: 497 Columns: 29
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (3): Player, Pos, Tm
dbl (26): Age, G, GS, MP, FG, FGA, FG%, 3P, 3PA, 3P%, 2P, 2PA, 2P%, eFG%, FT...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
stats
# A tibble: 497 × 29
Player Pos Age Tm G GS MP FG FGA `FG%` `3P` `3PA`
<chr> <chr> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Precious A… PF 21 MIA 28 2 14.6 2.6 4.4 0.59 0 0
2 Jaylen Ada… PG 24 MIL 6 0 2.8 0.2 1.3 0.125 0 0.3
3 Steven Ada… C 27 NOP 27 27 28.1 3.5 5.8 0.603 0 0
4 Bam Adebayo C 23 MIA 26 26 33.6 7.4 12.9 0.573 0.1 0.2
5 LaMarcus A… C 35 SAS 18 18 26.7 5.9 12.5 0.476 1.3 3.7
6 Ty-Shon Al… SG 22 PHO 3 0 2.7 0 1 0 0 0.3
7 Nickeil Al… SG 22 NOP 23 3 19.2 3.3 8.2 0.41 1 3.8
8 Grayson Al… SG 25 MEM 19 8 23.9 3.2 7.4 0.429 2.3 5.3
9 Jarrett Al… C 22 TOT 28 10 26.2 4.4 6.8 0.642 0 0.1
10 Jarrett Al… C 22 BRK 12 5 26.7 3.7 5.4 0.677 0 0
# … with 487 more rows, and 17 more variables: `3P%` <dbl>, `2P` <dbl>,
# `2PA` <dbl>, `2P%` <dbl>, `eFG%` <dbl>, FT <dbl>, FTA <dbl>, `FT%` <dbl>,
# ORB <dbl>, DRB <dbl>, TRB <dbl>, AST <dbl>, STL <dbl>, BLK <dbl>,
# TOV <dbl>, PF <dbl>, PTS <dbl>
5.1 1. 위의 dataframe을 가지고 group 별 통계값을 계산하시오.
- NBA 농구팀별(Tm)로 가장 평균득점(PTS)이 높은 사람과 낮은 사람을 추출하는 코드를 작성하시오.
- 또한, 팀별로 평균득점 최대값과 최소값의 차이를
gap
이라는 컬럼을 새로 만들어 나타내시오.
%>%
stats group_by(Tm) %>%
summarise(max_score = max(PTS),
min_score = min(PTS),
gap = max_score - min_score,
max_player = Player[which.max(PTS)],
min_player = Player[which.min(PTS)])
# A tibble: 31 × 6
Tm max_score min_score gap max_player min_player
<chr> <dbl> <dbl> <dbl> <chr> <chr>
1 ATL 26.5 1.8 24.7 Trae Young Bruno Fernando
2 BOS 25.9 2.4 23.5 Jaylen Brown Tremont Waters
3 BRK 29 0 29 Kevin Durant Noah Vonleh
4 CHI 28.5 1.5 27 Zach LaVine Luke Kornet
5 CHO 22.3 1 21.3 Gordon Hayward Vernon Carey Jr.
6 CLE 22.8 1.2 21.6 Collin Sexton Marques Bolden
7 DAL 29.1 1 28.1 Luka Dončić Tyrell Terry
8 DEN 27.4 0.6 26.8 Nikola Jokić Vlatko Čančar
9 DET 23.8 0 23.8 Jerami Grant Deividas Sirvydis
10 GSW 30 1.5 28.5 Stephen Curry Nico Mannion
# … with 21 more rows
5.2 2. 각 포지션별로(Pos) 평균나이 대비 해당 선수의 나이가 몇배 높거나 낮은지 비율을 계산하시오
- 예를들어 PG 포지션의 평균나이는 27세이고, Chris Paul의 나이는 35세이므로, 비율은 35/27이 된다.
- 모든 선수에 대해 이 비율을
age_ratio_by_position
이라는 새로운 컬럼으로 저장하는 코드를 작성하시오. - 단 , 포지션 별로 비율이 가장 높은 한명만 추출
%>%
stats group_by(Pos) %>%
mutate(mean_age = mean(Age),
age_ratio_by_position = Age/mean_age) %>%
select(Player, Pos, Age, mean_age, age_ratio_by_position) %>%
arrange(-age_ratio_by_position) %>%
slice_max(age_ratio_by_position, n=1)
# A tibble: 9 × 5
# Groups: Pos [9]
Player Pos Age mean_age age_ratio_by_position
<chr> <chr> <dbl> <dbl> <dbl>
1 Marc Gasol C 36 26.1 1.38
2 Noah Vonleh F 25 24 1.04
3 Norvel Pelle F-C 27 27 1
4 Derrick Rose G 32 27 1.19
5 Carmelo Anthony PF 36 25.9 1.39
6 LeBron James PG 36 25.6 1.41
7 Andre Iguodala SF 37 25.4 1.46
8 Rodions Kurucs SF-PF 22 22 1
9 J.J. Redick SG 36 25.3 1.43