오픈데이터 분석 실습 : Import/Write Data

패키지 불러오기

  • tidyverse에 포함된 readr 패키지가 read_csv 함수를 제공
library(tidyverse)

Import Data

read_csv

  • read.csv(R 4.0 미만의 기본 설정)와 달리 문자열을 factor로 자동 변환하지 않고 문자형(chr)으로 읽음
### Show the current working directory
getwd()
#> [1] "C:/Users/seong taek/Desktop/3-1 Opendata_Analysis/opendata"

### Change the working directory (left disabled)
# setwd()

### Load the CSV file using a path relative to the working directory
heights <- read_csv(file = "heights.csv")
print(heights)
#> # A tibble: 1,192 × 6
#>     earn height sex       ed   age race    
#>    <dbl>  <dbl> <chr>  <dbl> <dbl> <chr>   
#>  1 50000   74.4 male      16    45 white   
#>  2 60000   65.5 female    16    58 white   
#>  3 30000   63.6 female    16    29 white   
#>  4 50000   63.1 female    16    91 other   
#>  5 51000   63.4 female    17    39 white   
#>  6  9000   64.4 female    15    26 white   
#>  7 29000   61.7 female    12    49 white   
#>  8 32000   72.7 male      17    46 white   
#>  9  2000   72.0 male      15    21 hispanic
#> 10 27000   72.2 male      12    26 white   
#> # … with 1,182 more rows

### Read the same file through an absolute path.
### The path is built from the current working directory instead of being
### hard-coded, so the example also runs on machines other than the author's.
heights <- read_csv(file.path(getwd(), "heights.csv"))
heights
#> # A tibble: 1,192 × 6
#>     earn height sex       ed   age race    
#>    <dbl>  <dbl> <chr>  <dbl> <dbl> <chr>   
#>  1 50000   74.4 male      16    45 white   
#>  2 60000   65.5 female    16    58 white   
#>  3 30000   63.6 female    16    29 white   
#>  4 50000   63.1 female    16    91 other   
#>  5 51000   63.4 female    17    39 white   
#>  6  9000   64.4 female    15    26 white   
#>  7 29000   61.7 female    12    49 white   
#>  8 32000   72.7 male      17    46 white   
#>  9  2000   72.0 male      15    21 hispanic
#> 10 27000   72.2 male      12    26 white   
#> # … with 1,182 more rows

### Build a tibble directly from inline CSV text
read_csv(
  file = "a,b,c
         1,2,3
         4,5,6"
)
#> # A tibble: 2 × 3
#>       a     b     c
#>   <dbl> <dbl> <dbl>
#> 1     1     2     3
#> 2     4     5     6

### Skip the first two lines before reading the header
read_csv(
  file = "The first line of metadata
         The second line of metadata
         x,y,z
         1,2,3",
  skip = 2
)
#> # A tibble: 1 × 3
#>       x     y     z
#>   <dbl> <dbl> <dbl>
#> 1     1     2     3

### Ignore lines that start with the comment character
read_csv(
  file = "#A comment I want to skip
         x,y,z
         1,2,3",
  comment = "#"
)
#> # A tibble: 1 × 3
#>       x     y     z
#>   <dbl> <dbl> <dbl>
#> 1     1     2     3

### Data only, no header row: the columns are named X1, X2, X3.
### FALSE is spelled out because `F` is an ordinary variable that can be
### reassigned, while FALSE is a reserved word.
read_csv("1,2,3
         4,5,6", col_names = FALSE)
#> # A tibble: 2 × 3
#>      X1    X2    X3
#>   <dbl> <dbl> <dbl>
#> 1     1     2     3
#> 2     4     5     6

### "\n" inside the string starts a new row.
### The second row has no trailing comma: a trailing comma would add an
### empty fourth field that does not match the three-column first row.
read_csv("1,2,3 \n 4,5,6", col_names = FALSE)
#> # A tibble: 2 × 3
#>      X1    X2    X3
#>   <dbl> <dbl> <dbl>
#> 1     1     2     3
#> 2     4     5     6

### Supply the column names explicitly.
### The second row has no trailing comma, so both rows have exactly the
### three fields named here.
read_csv("1,2,3 \n 4,5,6", col_names = c("A", "B", "C"))
#> # A tibble: 2 × 3
#>       A     B     C
#>   <dbl> <dbl> <dbl>
#> 1     1     2     3
#> 2     4     5     6

### Treat "." as a missing value (NA)
read_csv(
  file = "a,b,c \n 1,2,.",
  na = "."
)
#> # A tibble: 1 × 3
#>       a     b c    
#>   <dbl> <dbl> <lgl>
#> 1     1     2 NA

Locale 설정/확인

### Report the locale settings for all categories
Sys.getlocale(category = "LC_ALL")
#> [1] "LC_COLLATE=Korean_Korea.utf8;LC_CTYPE=Korean_Korea.utf8;LC_MONETARY=Korean_Korea.utf8;LC_NUMERIC=C;LC_TIME=Korean_Korea.utf8"

### Switch the locale to English (left disabled)
# Sys.setlocale("LC_ALL", "English")

### Drop the forced language setting by resetting to the "C" locale (left disabled)
# Sys.setlocale("LC_ALL", "C")

한글 파일 읽기

### Guess the file's encoding
guess_encoding(file = "exercise.csv")
#> # A tibble: 2 × 2
#>   encoding   confidence
#>   <chr>           <dbl>
#> 1 EUC-KR           1   
#> 2 IBM420_ltr       0.25

### Resolve the read error by passing the encoding explicitly
exercise <- read_csv(
  file = "exercise.csv",
  locale = locale(encoding = "EUC-KR")
)
print(exercise)
#> # A tibble: 5 × 2
#>   이름  선호도
#>   <chr>  <dbl>
#> 1 하민       5
#> 2 하준       4
#> 3 하진       4
#> 4 태산       3
#> 5 태민       2

### Alternative: open the CSV file beforehand and change its encoding,
### then read the converted copy without a locale argument
exercise <- read_csv(file = "exercise_utf_8.csv")
print(exercise)
#> # A tibble: 5 × 2
#>   이름  선호도
#>   <chr>  <dbl>
#> 1 하민       5
#> 2 하준       4
#> 3 하진       4
#> 4 태산       3
#> 5 태민       2

### Guess the encoding of the converted file
guess_encoding(file = "exercise_utf_8.csv")
#> # A tibble: 3 × 2
#>   encoding     confidence
#>   <chr>             <dbl>
#> 1 UTF-8              1   
#> 2 windows-1255       0.38
#> 3 windows-1255       0.29

Write Data

파일 저장/삭제

### Load the heights data used by the write examples
heights <- read_csv(file = "heights.csv")
print(heights)
#> # A tibble: 1,192 × 6
#>     earn height sex       ed   age race    
#>    <dbl>  <dbl> <chr>  <dbl> <dbl> <chr>   
#>  1 50000   74.4 male      16    45 white   
#>  2 60000   65.5 female    16    58 white   
#>  3 30000   63.6 female    16    29 white   
#>  4 50000   63.1 female    16    91 other   
#>  5 51000   63.4 female    17    39 white   
#>  6  9000   64.4 female    15    26 white   
#>  7 29000   61.7 female    12    49 white   
#>  8 32000   72.7 male      17    46 white   
#>  9  2000   72.0 male      15    21 hispanic
#> 10 27000   72.2 male      12    26 white   
#> # … with 1,182 more rows

### Save the data as a CSV file in the current working directory
write_csv(x = heights, file = "만들 파일 이름.csv")

### Save with the .rds extension, then read the file back
write_rds(x = heights, file = "만들 파일 이름.rds")
read_rds(file = "만들 파일 이름.rds")
#> # A tibble: 1,192 × 6
#>     earn height sex       ed   age race    
#>    <dbl>  <dbl> <chr>  <dbl> <dbl> <chr>   
#>  1 50000   74.4 male      16    45 white   
#>  2 60000   65.5 female    16    58 white   
#>  3 30000   63.6 female    16    29 white   
#>  4 50000   63.1 female    16    91 other   
#>  5 51000   63.4 female    17    39 white   
#>  6  9000   64.4 female    15    26 white   
#>  7 29000   61.7 female    12    49 white   
#>  8 32000   72.7 male      17    46 white   
#>  9  2000   72.0 male      15    21 hispanic
#> 10 27000   72.2 male      12    26 white   
#> # … with 1,182 more rows

### Delete the CSV file that was written earlier.
### Only the .csv file is removed here; the .rds file written earlier
### is not deleted by this call.
file.remove("만들 파일 이름.csv")
#> [1] TRUE

feather 패키지

#install.packages("feather")
library(feather)

### Write the data to a feather file, then read it back
write_feather(x = heights, path = "heights.feather")
read_feather(path = "heights.feather")
#> # A tibble: 1,192 × 6
#>     earn height sex       ed   age race    
#>    <dbl>  <dbl> <chr>  <dbl> <dbl> <chr>   
#>  1 50000   74.4 male      16    45 white   
#>  2 60000   65.5 female    16    58 white   
#>  3 30000   63.6 female    16    29 white   
#>  4 50000   63.1 female    16    91 other   
#>  5 51000   63.4 female    17    39 white   
#>  6  9000   64.4 female    15    26 white   
#>  7 29000   61.7 female    12    49 white   
#>  8 32000   72.7 male      17    46 white   
#>  9  2000   72.0 male      15    21 hispanic
#> 10 27000   72.2 male      12    26 white   
#> # … with 1,182 more rows