library(ggplot2)
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(ggrepel)
library(cowplot) # plot_grid
![]() |
Figure 3.2
1 데이터 시각화 실습 : 그래프 합치기 Figure 3.2
1.1 패키지 불러오기
1.2 데이터 불러오기, 파악
- ncdc_normals.csv (날짜별 온도 등 데이터셋)
# Load ncdc_normals.csv (per-date temperature data) into a data frame.
# NOTE(review): absolute, machine-specific path - point it at your own copy of the CSV.
ncdc_normals <- read.csv('C:/Users/seong taek/Desktop/3-1 DataVisualize/data_visualize/ncdc_normals.csv')
### Check the dimensions (rows, columns)
ncdc_normals %>% dim()
#> [1] 2745366 6
### Look at the first rows
ncdc_normals %>% head()
#> station_id month day temperature flag date
#> 1 AQW00061705 1 1 82.4 C 0000-01-01
#> 2 AQW00061705 1 2 82.4 C 0000-01-02
#> 3 AQW00061705 1 3 82.4 C 0000-01-03
#> 4 AQW00061705 1 4 82.4 C 0000-01-04
#> 5 AQW00061705 1 5 82.4 C 0000-01-05
#> 6 AQW00061705 1 6 82.4 C 0000-01-06
### Summary statistics for every column
ncdc_normals %>% summary()
#> station_id month day temperature
#> Length:2745366 Min. : 1.000 Min. : 1.00 Min. :-21.80
#> Class :character 1st Qu.: 4.000 1st Qu.: 8.00 1st Qu.: 39.70
#> Mode :character Median : 7.000 Median :16.00 Median : 54.60
#> Mean : 6.514 Mean :15.76 Mean : 53.17
#> 3rd Qu.:10.000 3rd Qu.:23.00 3rd Qu.: 68.10
#> Max. :12.000 Max. :31.00 Max. :103.20
#> flag date
#> Length:2745366 Length:2745366
#> Class :character Class :character
#> Mode :character Mode :character
#>
#>
#>
### Check the class of each column
ncdc_normals %>% sapply(class)
#> station_id month day temperature flag date
#> "character" "integer" "integer" "numeric" "character" "character"
### Check the storage type of each column
ncdc_normals %>% sapply(typeof)
#> station_id month day temperature flag date
#> "character" "integer" "integer" "double" "character" "character"
1.3 전처리
### Build a lookup data frame that pairs each selected station id with its location name
station_loc <- data.frame(station_id = c("USW00014819","USC00042319","USW00093107","USW00012918"),
                          location = c("Chicago","Death Valley","San Diego","Houston"))
# Print the lookup table to verify it
station_loc %>% head()
#> station_id location
#> 1 USW00014819 Chicago
#> 2 USC00042319 Death Valley
#> 3 USW00093107 San Diego
#> 4 USW00012918 Houston
### inner_join ncdc_normals with station_loc on station_id;
### only rows for the stations listed in station_loc are kept, and `location` is added
temps_long <- ncdc_normals %>% inner_join(station_loc, by = "station_id")
temps_long %>% head()
#> station_id month day temperature flag date location
#> 1 USC00042319 1 1 51.0 S 0000-01-01 Death Valley
#> 2 USC00042319 1 2 51.2 S 0000-01-02 Death Valley
#> 3 USC00042319 1 3 51.3 S 0000-01-03 Death Valley
#> 4 USC00042319 1 4 51.4 S 0000-01-04 Death Valley
#> 5 USC00042319 1 5 51.6 S 0000-01-05 Death Valley
#> 6 USC00042319 1 6 51.7 S 0000-01-06 Death Valley
# Column classes before the date conversion (`date` is still character)
temps_long %>% sapply(class)
#> station_id month day temperature flag date
#> "character" "integer" "integer" "numeric" "character" "character"
#> location
#> "character"
### Convert temps_long$date from character to Date
temps_long$date <- temps_long$date %>% as.Date('%Y-%m-%d')
# Re-check the classes: `date` should now be "Date"
temps_long %>% sapply(class)
#> station_id month day temperature flag date
#> "character" "integer" "integer" "numeric" "character" "Date"
#> location
#> "character"
### Keep only the Houston rows
data_Houston <- temps_long %>% filter(location == 'Houston')
data_Houston %>% head()
#> station_id month day temperature flag date location
#> 1 USW00012918 1 1 53.9 S 0000-01-01 Houston
#> 2 USW00012918 1 2 53.8 S 0000-01-02 Houston
#> 3 USW00012918 1 3 53.8 S 0000-01-03 Houston
#> 4 USW00012918 1 4 53.8 S 0000-01-04 Houston
#> 5 USW00012918 1 5 53.8 S 0000-01-05 Houston
#> 6 USW00012918 1 6 53.7 S 0000-01-06 Houston
# Look at the last rows as well
data_Houston %>% tail()
#> station_id month day temperature flag date location
#> 361 USW00012918 12 26 54.2 C 0000-12-26 Houston
#> 362 USW00012918 12 27 54.1 C 0000-12-27 Houston
#> 363 USW00012918 12 28 54.1 C 0000-12-28 Houston
#> 364 USW00012918 12 29 54.0 C 0000-12-29 Houston
#> 365 USW00012918 12 30 53.9 C 0000-12-30 Houston
#> 366 USW00012918 12 31 53.9 C 0000-12-31 Houston
1.4 x축 눈금 설정
# Start and end of the x axis: the data use year 0000, so the axis runs
# from 0000-01-01 to 0001-01-01.
date_s <- as.Date('0000-01-01', format = '%Y-%m-%d')
date_e <- as.Date('0001-01-01', format = '%Y-%m-%d')
# Tick positions every three months from date_s to date_e (both ends included)
break_date <- seq(date_s, date_e, by = '3 month')
2 ggplot + 축 설정
- 사용 데이터셋 : data_Houston
- x=date, y=temperature
- scale_x_date
- 이름 : month
- 간격 : break_date (3개월)
- 간격 라벨 (Jan ~ Jan)
- scale_y_continuous
- 이름 : temp
- 테마 : 밝게
# Line chart of data_Houston: date on the x axis, temperature on the y axis.
# The x axis is ticked at break_date (every three months) and labelled Jan ... Jan.
temp_plot <- ggplot(data_Houston, aes(x = date, y = temperature)) +
  geom_line(linewidth = 1, color = 'royalblue') +
  scale_x_date(name = 'month',
               breaks = break_date,
               # one label per break; must stay the same length as break_date
               labels = c('Jan', 'Apr', 'Jul', 'Oct', 'Jan')) +
  scale_y_continuous(name = 'temp') +
  theme_light()
# Print the plot
temp_plot
2.1 plot_grid
- 여러 개의 그래프를 그리드로 결합하여 하나의 그래프로 만들어주는 함수
### Two copies of temp_plot placed side by side
plot_ab <- plot_grid(temp_plot,
                     temp_plot,
                     nrow = 1,               # number of rows
                     rel_widths = c(1, 2),   # relative width of each plot
                     labels = c('a', 'b'))   # panel labels a, b
# Print the combined plot
plot_ab
### plot_ab stacked on top of another temp_plot
plot_abc <- plot_grid(plot_ab,
                      temp_plot,
                      ncol = 1,                 # number of columns
                      rel_heights = c(1.5, 1),  # relative height of each row
                      labels = c('', 'c'))      # no new label for plot_ab; 'c' for the bottom plot
# Print the combined plot
plot_abc