Data_Visualization CH3

code
data_visualization
rstudio
Author

Seongtaek

Published

April 4, 2023

HTML파일로 보기

Figure 3.2

1 데이터 시각화 실습 : 그래프 합치기 Figure 3.2

1.1 패키지 불러오기

library(ggplot2)
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(ggrepel)
library(cowplot) # plot_grid

1.2 데이터 불러오기, 파악

  • ncdc_normals.csv (날짜별 온도 등 데이터셋)
### Load the temperature data set (columns: station_id, month, day,
### temperature, flag, date).
### NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
ncdc_normals <- read.csv(
  "C:/Users/seong taek/Desktop/3-1 DataVisualize/data_visualize/ncdc_normals.csv"
)

### Dimensions (rows, columns)
dim(ncdc_normals)
#> [1] 2745366       6

### Peek at the first rows
head(ncdc_normals)
#>    station_id month day temperature flag       date
#> 1 AQW00061705     1   1        82.4    C 0000-01-01
#> 2 AQW00061705     1   2        82.4    C 0000-01-02
#> 3 AQW00061705     1   3        82.4    C 0000-01-03
#> 4 AQW00061705     1   4        82.4    C 0000-01-04
#> 5 AQW00061705     1   5        82.4    C 0000-01-05
#> 6 AQW00061705     1   6        82.4    C 0000-01-06

### Summary statistics per column
summary(ncdc_normals)
#>   station_id            month             day         temperature    
#>  Length:2745366     Min.   : 1.000   Min.   : 1.00   Min.   :-21.80  
#>  Class :character   1st Qu.: 4.000   1st Qu.: 8.00   1st Qu.: 39.70  
#>  Mode  :character   Median : 7.000   Median :16.00   Median : 54.60  
#>                     Mean   : 6.514   Mean   :15.76   Mean   : 53.17  
#>                     3rd Qu.:10.000   3rd Qu.:23.00   3rd Qu.: 68.10  
#>                     Max.   :12.000   Max.   :31.00   Max.   :103.20  
#>      flag               date          
#>  Length:2745366     Length:2745366    
#>  Class :character   Class :character  
#>  Mode  :character   Mode  :character  
#>                                       
#>                                       
#> 

### Class of each column
sapply(ncdc_normals, class)
#>  station_id       month         day temperature        flag        date 
#> "character"   "integer"   "integer"   "numeric" "character" "character"

### Storage type of each column
sapply(ncdc_normals, typeof)
#>  station_id       month         day temperature        flag        date 
#> "character"   "integer"   "integer"    "double" "character" "character"

1.3 전처리

### Lookup table pairing the four selected station ids with a location name
station_loc <- data.frame(
  station_id = c("USW00014819", "USC00042319", "USW00093107", "USW00012918"),
  location = c("Chicago", "Death Valley", "San Diego", "Houston")
)

head(station_loc)
#>    station_id     location
#> 1 USW00014819      Chicago
#> 2 USC00042319 Death Valley
#> 3 USW00093107    San Diego
#> 4 USW00012918      Houston
### Inner-join the two data frames ncdc_normals and station_loc on `station_id`;
### only rows whose station appears in station_loc are kept
temps_long <- inner_join(ncdc_normals, station_loc, by = "station_id")
head(temps_long)
#>    station_id month day temperature flag       date     location
#> 1 USC00042319     1   1        51.0    S 0000-01-01 Death Valley
#> 2 USC00042319     1   2        51.2    S 0000-01-02 Death Valley
#> 3 USC00042319     1   3        51.3    S 0000-01-03 Death Valley
#> 4 USC00042319     1   4        51.4    S 0000-01-04 Death Valley
#> 5 USC00042319     1   5        51.6    S 0000-01-05 Death Valley
#> 6 USC00042319     1   6        51.7    S 0000-01-06 Death Valley
sapply(temps_long, class)
#>  station_id       month         day temperature        flag        date 
#> "character"   "integer"   "integer"   "numeric" "character" "character" 
#>    location 
#> "character"

### Convert the `date` column of temps_long from character to Date
temps_long$date <- as.Date(temps_long$date, format = "%Y-%m-%d")
sapply(temps_long, class)
#>  station_id       month         day temperature        flag        date 
#> "character"   "integer"   "integer"   "numeric" "character"      "Date" 
#>    location 
#> "character"
### Keep only the Houston rows
data_Houston <- filter(temps_long, location == "Houston")
head(data_Houston)
#>    station_id month day temperature flag       date location
#> 1 USW00012918     1   1        53.9    S 0000-01-01  Houston
#> 2 USW00012918     1   2        53.8    S 0000-01-02  Houston
#> 3 USW00012918     1   3        53.8    S 0000-01-03  Houston
#> 4 USW00012918     1   4        53.8    S 0000-01-04  Houston
#> 5 USW00012918     1   5        53.8    S 0000-01-05  Houston
#> 6 USW00012918     1   6        53.7    S 0000-01-06  Houston
tail(data_Houston)
#>      station_id month day temperature flag       date location
#> 361 USW00012918    12  26        54.2    C 0000-12-26  Houston
#> 362 USW00012918    12  27        54.1    C 0000-12-27  Houston
#> 363 USW00012918    12  28        54.1    C 0000-12-28  Houston
#> 364 USW00012918    12  29        54.0    C 0000-12-29  Houston
#> 365 USW00012918    12  30        53.9    C 0000-12-30  Houston
#> 366 USW00012918    12  31        53.9    C 0000-12-31  Houston

1.4 x축 눈금 설정

### x-axis tick positions: every three months from 0000-01-01 through
### 0001-01-01, matching the year-0000 dates in data_Houston
date_s <- as.Date("0000-01-01", format = "%Y-%m-%d")
date_e <- as.Date("0001-01-01", format = "%Y-%m-%d")
break_date <- seq(from = date_s, to = date_e, by = "3 month")

2 ggplot + 축 설정

  • 사용 데이터셋 : data_Houston
  • x=date, y=temperature
  • scale_x_date
    • 이름 : month
    • 간격 : break_date (3개월)
    • 눈금 라벨 : Jan, Apr, Jul, Oct, Jan
  • scale_y_continuous
    • 이름 : temp
  • 테마 : 밝게
### Line chart of the Houston temperatures by date
temp_plot <- ggplot(
  data = data_Houston,
  mapping = aes(x = date, y = temperature)
) +
  geom_line(color = "royalblue", linewidth = 1) +
  scale_x_date(
    name = "month",
    breaks = break_date,
    # One label per element of break_date (five quarterly ticks)
    labels = c("Jan", "Apr", "Jul", "Oct", "Jan")
  ) +
  scale_y_continuous(name = "temp") +
  theme_light()

temp_plot

2.1 plot_grid

  • 여러 개의 그래프를 그리드로 결합하여 하나의 그래프로 만들어주는 함수
### Two copies of temp_plot placed side by side in a single row
plot_ab <- plot_grid(
  plotlist = list(temp_plot, temp_plot),
  labels = c("a", "b"),  # panel labels
  rel_widths = c(1, 2),  # panel b is twice as wide as panel a
  nrow = 1               # number of rows
)

plot_ab

### Stack plot_ab (top) on another copy of temp_plot (bottom) in one column
plot_abc <- plot_grid(
  plotlist = list(plot_ab, temp_plot),
  labels = c("", "c"),      # top row keeps its own a/b labels; bottom panel is "c"
  rel_heights = c(1.5, 1),  # top row is 1.5 times as tall as the bottom panel
  ncol = 1                  # number of columns
)

plot_abc