오픈데이터 분석 실습 : Import/Write Data
패키지 불러오기
- read_csv 기능 존재
library(tidyverse)
Import Data
read_csv
- read.csv(R 4.0 미만의 기본 설정)와 다르게 문자열을 factor로 변환하지 않음
### Check the current working directory
getwd()
#> [1] "C:/Users/seong taek/Desktop/3-1 Opendata_Analysis/opendata"
### 경로 설정
#setwd()
### Load the csv file (the path is resolved relative to the working directory)
heights <- read_csv("heights.csv")
# Print the tibble to inspect the parsed column types
heights
#> # A tibble: 1,192 × 6
#> earn height sex ed age race
#> <dbl> <dbl> <chr> <dbl> <dbl> <chr>
#> 1 50000 74.4 male 16 45 white
#> 2 60000 65.5 female 16 58 white
#> 3 30000 63.6 female 16 29 white
#> 4 50000 63.1 female 16 91 other
#> 5 51000 63.4 female 17 39 white
#> 6 9000 64.4 female 15 26 white
#> 7 29000 61.7 female 12 49 white
#> 8 32000 72.7 male 17 46 white
#> 9 2000 72.0 male 15 21 hispanic
#> 10 27000 72.2 male 12 26 white
#> # … with 1,182 more rows
### Load the same file through an absolute path
heights <- read_csv("C:/Users/seong taek/Desktop/3-1 Opendata_Analysis/opendata/heights.csv")
# Print the tibble to inspect the parsed column types
heights
#> # A tibble: 1,192 × 6
#> earn height sex ed age race
#> <dbl> <dbl> <chr> <dbl> <dbl> <chr>
#> 1 50000 74.4 male 16 45 white
#> 2 60000 65.5 female 16 58 white
#> 3 30000 63.6 female 16 29 white
#> 4 50000 63.1 female 16 91 other
#> 5 51000 63.4 female 17 39 white
#> 6 9000 64.4 female 15 26 white
#> 7 29000 61.7 female 12 49 white
#> 8 32000 72.7 male 17 46 white
#> 9 2000 72.0 male 15 21 hispanic
#> 10 27000 72.2 male 12 26 white
#> # … with 1,182 more rows
### Build a tibble from inline csv text; the first line becomes the header
read_csv("a,b,c
1,2,3
4,5,6")
#> # A tibble: 2 × 3
#> a b c
#> <dbl> <dbl> <dbl>
#> 1 1 2 3
#> 2 4 5 6
### Skip leading lines: the first two lines are metadata, not data
read_csv(
  "The first line of metadata
The second line of metadata
x,y,z
1,2,3",
  skip = 2
)
#> # A tibble: 1 × 3
#> x y z
#> <dbl> <dbl> <dbl>
#> 1 1 2 3
### Skip comments: everything after the comment marker is ignored
read_csv(
  "#A comment I want to skip
x,y,z
1,2,3",
  comment = "#"
)
#> # A tibble: 1 × 3
#> x y z
#> <dbl> <dbl> <dbl>
#> 1 1 2 3
### Data only, no header row: columns receive default names
# Use FALSE rather than F; F is an ordinary variable that can be reassigned.
read_csv(
  "1,2,3
4,5,6",
  col_names = FALSE
)
#> # A tibble: 2 × 3
#> X1 X2 X3
#> <dbl> <dbl> <dbl>
#> 1 1 2 3
#> 2 4 5 6
### '\n' : line break written as an escape inside the string
# NOTE(review): the literal ends with a trailing comma after 6, which gives
# row 2 an extra empty field — confirm this is intended.
# Use FALSE rather than F; F is an ordinary variable that can be reassigned.
read_csv("1,2,3 \n 4,5,6,", col_names = FALSE)
#> # A tibble: 2 × 3
#> X1 X2 X3
#> <dbl> <dbl> <dbl>
#> 1 1 2 3
#> 2 4 5 6
### Supply the column names explicitly
# NOTE(review): the literal ends with a trailing comma after 6, which gives
# row 2 an extra empty field — confirm this is intended.
read_csv("1,2,3 \n 4,5,6,", col_names = c("A","B","C"))
#> # A tibble: 2 × 3
#> A B C
#> <dbl> <dbl> <dbl>
#> 1 1 2 3
#> 2 4 5 6
### Treat "." in the data as a missing value (NA)
read_csv("a,b,c \n 1,2,.", na = ".")
#> # A tibble: 1 × 3
#> a b c
#> <dbl> <dbl> <lgl>
#> 1 1 2 NA
Locale 설정/확인
# Show the locale settings of the current R session
Sys.getlocale()
#> [1] "LC_COLLATE=Korean_Korea.utf8;LC_CTYPE=Korean_Korea.utf8;LC_MONETARY=Korean_Korea.utf8;LC_NUMERIC=C;LC_TIME=Korean_Korea.utf8"
### 언어 영어로
#Sys.setlocale("LC_ALL", "English")
### 강제 언어 삭제
#Sys.setlocale("LC_ALL", "C")
한글 파일 읽기
### Guess the file's encoding; returns candidate encodings with a confidence score
guess_encoding("exercise.csv")
#> # A tibble: 2 × 2
#> encoding confidence
#> <chr> <dbl>
#> 1 EUC-KR 1
#> 2 IBM420_ltr 0.25
### Fix the read error by passing the file's encoding explicitly
exercise <- read_csv("exercise.csv", locale = locale(encoding = "EUC-KR"))
# Print the tibble to check that the Korean text was decoded correctly
exercise
#> # A tibble: 5 × 2
#> 이름 선호도
#> <chr> <dbl>
#> 1 하민 5
#> 2 하준 4
#> 3 하진 4
#> 4 태산 3
#> 5 태민 2
### Alternative: open the csv beforehand and re-save it with a different encoding
# No locale argument is passed for this copy of the file.
exercise <- read_csv("exercise_utf_8.csv")
# Print the tibble to check that the Korean text was decoded correctly
exercise
#> # A tibble: 5 × 2
#> 이름 선호도
#> <chr> <dbl>
#> 1 하민 5
#> 2 하준 4
#> 3 하진 4
#> 4 태산 3
#> 5 태민 2
# Guess the encoding of the re-saved file
guess_encoding("exercise_utf_8.csv")
#> # A tibble: 3 × 2
#> encoding confidence
#> <chr> <dbl>
#> 1 UTF-8 1
#> 2 windows-1255 0.38
#> 3 windows-1255 0.29
Write Data
파일 저장/삭제
# Load the data set that the write examples below save to disk
heights <- read_csv("heights.csv")
heights
#> # A tibble: 1,192 × 6
#> earn height sex ed age race
#> <dbl> <dbl> <chr> <dbl> <dbl> <chr>
#> 1 50000 74.4 male 16 45 white
#> 2 60000 65.5 female 16 58 white
#> 3 30000 63.6 female 16 29 white
#> 4 50000 63.1 female 16 91 other
#> 5 51000 63.4 female 17 39 white
#> 6 9000 64.4 female 15 26 white
#> 7 29000 61.7 female 12 49 white
#> 8 32000 72.7 male 17 46 white
#> 9 2000 72.0 male 15 21 hispanic
#> 10 27000 72.2 male 12 26 white
#> # … with 1,182 more rows
### Save the tibble as a csv file; the relative path puts it in the current working directory
write_csv(heights, "만들 파일 이름.csv")
### rds format: write the tibble to an .rds file, then read it back
write_rds(heights, "만들 파일 이름.rds")
read_rds("만들 파일 이름.rds")
#> # A tibble: 1,192 × 6
#> earn height sex ed age race
#> <dbl> <dbl> <chr> <dbl> <dbl> <chr>
#> 1 50000 74.4 male 16 45 white
#> 2 60000 65.5 female 16 58 white
#> 3 30000 63.6 female 16 29 white
#> 4 50000 63.1 female 16 91 other
#> 5 51000 63.4 female 17 39 white
#> 6 9000 64.4 female 15 26 white
#> 7 29000 61.7 female 12 49 white
#> 8 32000 72.7 male 17 46 white
#> 9 2000 72.0 male 15 21 hispanic
#> 10 27000 72.2 male 12 26 white
#> # … with 1,182 more rows
### Delete the csv file written earlier; file.remove() returns TRUE on success
file.remove("만들 파일 이름.csv")
#> [1] TRUE
feather 패키지
# install.packages("feather")  # uncomment to install the package first
library(feather)
# Write the tibble to a feather file, then read it back
write_feather(heights, "heights.feather")
read_feather("heights.feather")
#> # A tibble: 1,192 × 6
#> earn height sex ed age race
#> <dbl> <dbl> <chr> <dbl> <dbl> <chr>
#> 1 50000 74.4 male 16 45 white
#> 2 60000 65.5 female 16 58 white
#> 3 30000 63.6 female 16 29 white
#> 4 50000 63.1 female 16 91 other
#> 5 51000 63.4 female 17 39 white
#> 6 9000 64.4 female 15 26 white
#> 7 29000 61.7 female 12 49 white
#> 8 32000 72.7 male 17 46 white
#> 9 2000 72.0 male 15 21 hispanic
#> 10 27000 72.2 male 12 26 white
#> # … with 1,182 more rows