오픈데이터 분석 실습 : Data Tyding
패키지 불러오기
library(tidyverse)
library(ggplot2)
Tidy data
- 원칙
- 각 변수는 각각의 열을 가져야한다
- 각 변수는 각각의 행을 가져야한다
- 각 셀은 하나의 값을 가져야한다
### 확진자수 데이터셋
table1#> # A tibble: 6 × 4
#> country year cases population
#> <chr> <dbl> <dbl> <dbl>
#> 1 Afghanistan 1999 745 19987071
#> 2 Afghanistan 2000 2666 20595360
#> 3 Brazil 1999 37737 172006362
#> 4 Brazil 2000 80488 174504898
#> 5 China 1999 212258 1272915272
#> 6 China 2000 213766 1280428583
### rate 컬럼 생성
%>%
table1 mutate(rate = cases/population*10000)
#> # A tibble: 6 × 5
#> country year cases population rate
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 Afghanistan 1999 745 19987071 0.373
#> 2 Afghanistan 2000 2666 20595360 1.29
#> 3 Brazil 1999 37737 172006362 2.19
#> 4 Brazil 2000 80488 174504898 4.61
#> 5 China 1999 212258 1272915272 1.67
#> 6 China 2000 213766 1280428583 1.67
### 연도별 cases(확진자) 수
%>%
table1 count(year, wt=cases) # wt=weight=가중치
#> # A tibble: 2 × 2
#> year n
#> <dbl> <dbl>
#> 1 1999 250740
#> 2 2000 296920
### 국가별 확진자 수 추이
ggplot(table1, aes(x=year, y=cases)) +
geom_line(aes(group = country), colour = "grey50") + # 국가별 선 구분, 색상 통일
geom_point(aes(colour = country, shape = country)) + # 국가별 점 색상/모양 구분
scale_x_continuous(breaks = c(1999,2000)) + # x축 눈금 지정
facet_wrap(vars(country), scales = "free") # 국가별 패널 구분, 눈금-독립조정
### 2000년도 국가별 ratio
<- table1 %>%
datagroup_by(country) %>%
mutate(ratio = cases/min(cases)) %>% # ratio = 확진자 수 최소 정규화
filter(year==2000)
data#> # A tibble: 3 × 5
#> # Groups: country [3]
#> country year cases population ratio
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 Afghanistan 2000 2666 20595360 3.58
#> 2 Brazil 2000 80488 174504898 2.13
#> 3 China 2000 213766 1280428583 1.01
### 국가별 2000년도 확진자 수 최소 정규화
ggplot(data, aes(x = country, y = ratio, fill = country)) + # 색상:국가별
geom_bar(stat = "identity", width = 0.5) + # 막대너비 지정
labs(title = "Ratio by Country",
x = "Country", y = "Ratio") +
theme_minimal() +
theme(legend.position = "none") # 범례 미지정
### 1999 - 2000 증가량 계산
<- table1
data
<- table1 %>%
datagroup_by(country) %>%
mutate(increase = cases - cases[year == 1999])
data#> # A tibble: 6 × 5
#> # Groups: country [3]
#> country year cases population increase
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 Afghanistan 1999 745 19987071 0
#> 2 Afghanistan 2000 2666 20595360 1921
#> 3 Brazil 1999 37737 172006362 0
#> 4 Brazil 2000 80488 174504898 42751
#> 5 China 1999 212258 1272915272 0
#> 6 China 2000 213766 1280428583 1508
### 국가별 확진자 수 증가량 추이
ggplot(data %>% filter(year==2000),
aes(x = country, y = increase, fill = country)) +
geom_bar(stat = "identity", width = 0.5) +
labs(title = "Cases Increase from 1999 to 2000 by Country",
x = "Country", y = "Cases Increase") +
theme_minimal() +
theme(legend.position = "none")
Pivoting
pivot_longer
- 데이터를 긴 형태에서 넓은 형태로 변환
- 여러 컬럼을 하나의 컬럼으로 정리
- 주로 정규화 작업을 수행
### 국가/연도별 확진자 수
table4a#> # A tibble: 3 × 3
#> country `1999` `2000`
#> <chr> <dbl> <dbl>
#> 1 Afghanistan 745 2666
#> 2 Brazil 37737 80488
#> 3 China 212258 213766
### pivot_longer
<- table4a %>%
table4a_pivot_longer pivot_longer(
cols = c(`1999`, `2000`), # 변환할 컬럼 지정
names_to = "year", # 변환될 컬럼 이름 지정
values_to = "cases" # 변환된 값들이 저장될 컬럼 이름 지정
%>%
) mutate(year = parse_integer(year)) # 정수형 타입 변환
table4a_pivot_longer#> # A tibble: 6 × 3
#> country year cases
#> <chr> <int> <dbl>
#> 1 Afghanistan 1999 745
#> 2 Afghanistan 2000 2666
#> 3 Brazil 1999 37737
#> 4 Brazil 2000 80488
#> 5 China 1999 212258
#> 6 China 2000 213766
### ggplot 그래프
%>%
table4a_pivot_longer ggplot(aes(x = year, y = cases)) +
geom_line(aes(group = country), colour = "grey50") +
geom_point(aes(colour = country, shape = country)) +
scale_x_continuous(breaks = c(1999, 2000))
pivot_longer + left_join
### 국가/연도별 인구수
table4b#> # A tibble: 3 × 3
#> country `1999` `2000`
#> <chr> <dbl> <dbl>
#> 1 Afghanistan 19987071 20595360
#> 2 Brazil 172006362 174504898
#> 3 China 1272915272 1280428583
### pivot_longer
<- table4b %>%
table4b_pivot_longer pivot_longer(
cols = c(`1999`, `2000`),
names_to = "year",
values_to = "population"
%>%
) mutate(year = parse_integer(year))
table4b_pivot_longer#> # A tibble: 6 × 3
#> country year population
#> <chr> <int> <dbl>
#> 1 Afghanistan 1999 19987071
#> 2 Afghanistan 2000 20595360
#> 3 Brazil 1999 172006362
#> 4 Brazil 2000 174504898
#> 5 China 1999 1272915272
#> 6 China 2000 1280428583
### 확진자 수 + 인구수 left_join
%>% left_join(table4b_pivot_longer, by = c("country", "year"))
table4a_pivot_longer #> # A tibble: 6 × 4
#> country year cases population
#> <chr> <int> <dbl> <dbl>
#> 1 Afghanistan 1999 745 19987071
#> 2 Afghanistan 2000 2666 20595360
#> 3 Brazil 1999 37737 172006362
#> 4 Brazil 2000 80488 174504898
#> 5 China 1999 212258 1272915272
#> 6 China 2000 213766 1280428583
pivot_wider
- 데이터를 넓은 형태에서 긴 형태로 변환
- 하나의 컬럼을 여러개 컬럼으로 확장
- 주로 비정규화 작업을 수행
table2#> # A tibble: 12 × 4
#> country year type count
#> <chr> <dbl> <chr> <dbl>
#> 1 Afghanistan 1999 cases 745
#> 2 Afghanistan 1999 population 19987071
#> 3 Afghanistan 2000 cases 2666
#> 4 Afghanistan 2000 population 20595360
#> 5 Brazil 1999 cases 37737
#> 6 Brazil 1999 population 172006362
#> 7 Brazil 2000 cases 80488
#> 8 Brazil 2000 population 174504898
#> 9 China 1999 cases 212258
#> 10 China 1999 population 1272915272
#> 11 China 2000 cases 213766
#> 12 China 2000 population 1280428583
### pivot_wider
<- table2 %>%
table2_pivot_wider pivot_wider(names_from = type, # 여러 컬럼으로 확장할 컬럼 지정
values_from = count) %>% # 확장할 컬럼에 해당하는 값
mutate(rate = cases / population)
table2_pivot_wider#> # A tibble: 6 × 5
#> country year cases population rate
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 Afghanistan 1999 745 19987071 0.0000373
#> 2 Afghanistan 2000 2666 20595360 0.000129
#> 3 Brazil 1999 37737 172006362 0.000219
#> 4 Brazil 2000 80488 174504898 0.000461
#> 5 China 1999 212258 1272915272 0.000167
#> 6 China 2000 213766 1280428583 0.000167
### ggplot 그래프
%>% ggplot(aes(x = year, y = rate)) +
table2_pivot_wider geom_line(aes(group = country), colour = "grey50") +
geom_point(aes(colour = country, shape = country)) +
scale_x_continuous(breaks = c(1999, 2000))
Advanced Pivoting (Missing values)
NA값이 있을 때의 pivot_longer
#remotes::install_github("dcl-docs/dcldata")
library(dcldata)
example_migration#> # A tibble: 3 × 6
#> dest Afghanistan Canada India Japan `South Africa`
#> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 Albania <NA> 913 <NA> <NA> <NA>
#> 2 Bulgaria 483 713 281 213 260
#> 3 Romania <NA> <NA> 102 <NA> <NA>
### drop_na로 제거
%>%
example_migration pivot_longer(cols = !dest,
names_to = "origin",
values_to = "migrants") %>%
drop_na(migrants)
#> # A tibble: 7 × 3
#> dest origin migrants
#> <chr> <chr> <chr>
#> 1 Albania Canada 913
#> 2 Bulgaria Afghanistan 483
#> 3 Bulgaria Canada 713
#> 4 Bulgaria India 281
#> 5 Bulgaria Japan 213
#> 6 Bulgaria South Africa 260
#> 7 Romania India 102